Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

leaderboard / app /app_utils.py

forrestbao

display plot; use funix 0.6.2

696341e 7 months ago

raw

history blame

6.86 kB

	# %%
	import os
	import json
	from huggingface_hub import Repository
	import pandas as pd
	import matplotlib.pyplot as plt
	import matplotlib.figure
	from datetime import datetime
	from sklearn.preprocessing import MinMaxScaler

	# import dotenv
	# dotenv.load_dotenv()

	# Module-level scaler shared across calls; `visualize_leaderboard` refits it on
	# every call to rescale hallucination rates before mapping them to bar colors.
	min_max_scaler = MinMaxScaler()

	# %%
	def pull_results(results_dir: str):
	    """Sync the local working copy of the ``vectara/results`` dataset repository.

	    A ``Repository`` handle is created for ``results_dir`` with
	    ``clone_from="vectara/results"`` and ``repo_type="dataset"``, and a
	    ``git pull`` is then issued on it.

	    Args:
	        results_dir: Local directory that holds (or will hold) the working copy.
	    """
	    results_repo = Repository(
	        local_dir=results_dir,
	        clone_from="vectara/results",
	        repo_type="dataset",
	    )
	    results_repo.git_pull()

	def extract_info_from_result_file(result_file):
	    """Read one result JSON file and flatten it into a single leaderboard row.

	    The file is expected to have this shape::

	        {
	          "config": {
	            "model_dtype": "float16",
	            "model_name": "databricks/dbrx-instruct",
	            "model_sha": "main"
	          },
	          "results": {
	            "hallucination_rate": {"hallucination_rate": 8.34990059642147},
	            "factual_consistency_rate": {"factual_consistency_rate": 91.65009940357854},
	            "answer_rate": {"answer_rate": 100.0},
	            "average_summary_length": {"average_summary_length": 85.9}
	          }
	        }

	    Args:
	        result_file: Path of the JSON file to read.

	    Returns:
	        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``
	        and ``"Avg Summary Words"``. ``factual_consistency_rate`` is not
	        included in the row.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	        KeyError: If one of the expected keys is missing.
	    """
	    # Use a context manager so the handle is closed even when parsing fails.
	    with open(result_file, "r", encoding="utf-8") as f:
	        info = json.load(f)

	    metrics = info["results"]
	    return {
	        "LLM": info["config"]["model_name"],
	        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
	        "Answer %": metrics["answer_rate"]["answer_rate"],
	        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
	    }

	def get_latest_result_file(dir: str):
	    """Return the most recently modified ``.json`` file directly inside ``dir``.

	    "Latest" is decided by the file's modification time; when two files share
	    the same modification time, the lexicographically greatest file name wins.

	    Args:
	        dir: Directory to inspect (not searched recursively).

	    Returns:
	        The path ``os.path.join(dir, <file name>)`` of the latest JSON file, or
	        ``None`` when ``dir`` is not a directory or contains no ``.json`` file.
	    """
	    if not os.path.isdir(dir):
	        return None

	    json_files = [name for name in os.listdir(dir) if name.endswith(".json")]
	    if not json_files:
	        return None

	    # The previous implementation sorted ascending by mtime and took the first
	    # element, which is the OLDEST file. max() selects the newest one instead.
	    latest = max(
	        json_files,
	        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
	    )
	    return os.path.join(dir, latest)

	def scan_and_extract(dir: str):
	    """Walk ``dir`` and build one leaderboard row per sub-directory with results.

	    For every directory found below ``dir`` (at any depth), the file selected
	    by `get_latest_result_file` is passed to `extract_info_from_result_file`.
	    JSON files located directly in ``dir`` itself are not considered, and only
	    one file per sub-directory is loaded.

	    Args:
	        dir: Root directory to walk.

	    Returns:
	        A list of row dicts, in sorted traversal order. Empty when ``dir`` has
	        no sub-directories or does not exist.
	    """
	    results = []
	    for root, subdirs, _files in os.walk(dir):
	        # Sorting in place makes the traversal (and so the output order)
	        # deterministic; a top-down os.walk honors in-place edits of this list.
	        subdirs.sort()
	        # A distinct loop name avoids shadowing the `dir` parameter.
	        for subdir in subdirs:
	            result_file = get_latest_result_file(os.path.join(root, subdir))
	            if result_file is not None:
	                results.append(extract_info_from_result_file(result_file))
	    return results

	def load_results(
	    results_dir: str = "./results",
	    results_json: str = "./results.json"
	):
	    """Refresh the results snapshot if possible and load it as a DataFrame.

	    Steps:
	      1. Best effort: pull the results repository into ``results_dir``.
	      2. Best effort: scan ``results_dir`` and, if any rows were found,
	         overwrite ``results_json`` with them.
	      3. Load ``results_json`` (fresh or pre-dumped) and post-process it.

	    Failures in steps 1 and 2 are printed and otherwise ignored so that a
	    previously dumped ``results_json`` can still be served.

	    Args:
	        results_dir: Local directory holding the result files.
	        results_json: Path of the JSON snapshot to write and read.

	    Returns:
	        A DataFrame sorted ascending by ``"Hallucination %"``, with the numeric
	        columns rounded to 3 decimals and an extra ``"LLM_lower_case"`` column.

	    Raises:
	        OSError: If ``results_json`` cannot be opened in step 3.
	        json.JSONDecodeError: If ``results_json`` is not valid JSON.
	    """
	    try:
	        pull_results(results_dir)
	        print(f"Successfully pulled results from {results_dir}")
	    except Exception as e:
	        print(f"Failed to pull and/or extract latest results: {e}")

	    try:
	        results = scan_and_extract(results_dir)
	        if len(results) > 0:
	            with open(results_json, "w") as f:
	                json.dump(results, f, indent=2)
	            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
	        else:
	            print(f"No results found in {results_dir}")
	    except Exception as e:
	        print(f"Failed to scan and extract results from {results_dir}: {e}")
	        print(f"Using pre-dumped results from {results_json}")

	    # Context manager so the snapshot's file handle is always closed.
	    with open(results_json, "r") as f:
	        results = json.load(f)

	    results_df = pd.DataFrame(results)

	    # Replace the "TBD" placeholder with 100 in every column. This has to run
	    # before sorting: a leftover string in "Hallucination %" cannot be ordered
	    # against floats and would make sort_values raise TypeError.
	    results_df = results_df.replace("TBD", 100)
	    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

	    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
	        results_df[column] = results_df[column].apply(lambda value: round(value, 3))

	    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

	    return results_df

	# %%
	def determine_font_size(LLM: str, hallucination_percent: float) -> float:
	    """Pick a label font size from the LLM name length and its hallucination rate.

	    Args:
	        LLM: Name of the model; only its length is used.
	        hallucination_percent: Hallucination rate of the model.

	    Returns:
	        ``8.5`` when the rate is below 0.25 and the name is longer than 10
	        characters, otherwise ``9``. The return annotation is ``float``
	        because ``8.5`` is not an ``int``.
	    """
	    # Only a long name next to a very short bar needs the smaller font.
	    if hallucination_percent < 0.25 and len(LLM) > 10:
	        return 8.5
	    return 9

	def determine_font_color(hallucination_percent: float) -> str:
	    """Choose the label color for a given hallucination rate.

	    Returns ``'black'`` when the rate lies strictly between 0.25 and 0.65,
	    and ``'white'`` for every other value (both bounds included).

	    NOTE(review): the thresholds look like they target a 0-1 normalized rate
	    rather than a raw percentage - confirm against the caller.
	    """
	    in_mid_range = hallucination_percent > 0.25 and hallucination_percent < 0.65
	    return 'black' if in_mid_range else 'white'

	def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> tuple:
	    """Decide where to draw an LLM's name relative to its bar, and in which color.

	    The bar length is estimated as ``5 * hallucination_percent`` and compared
	    with the number of characters in the name.

	    Args:
	        LLM: Name of the model.
	        hallucination_percent: Hallucination rate of the model.

	    Returns:
	        A ``(x_position, font_color)`` tuple. When the name is shorter than the
	        estimated bar length this is ``(0.01, determine_font_color(...))``;
	        otherwise it is ``(hallucination_percent, 'black')``. The return
	        annotation is ``tuple`` because two values are returned, not a float.
	    """
	    name_length = len(LLM)
	    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

	    hallu_rate_to_bar_length_ratio = 5
	    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent
	    if name_length < bar_length:
	        # Name fits inside the bar: start at the left edge, color chosen by rate.
	        return 0.01, determine_font_color(hallucination_percent)
	    # Name goes to the right of the bar, where black is always used.
	    return hallucination_percent, 'black'

	def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
	    """Draw a horizontal bar chart of the first 10 rows of ``df``.

	    Each bar shows one LLM (y-axis) and its ``"Hallucination %"`` (x-axis).
	    Bars are colored with the ``jet`` colormap from the min-max normalized
	    rate and labelled with the rate just right of the bar end.

	    Args:
	        df: Leaderboard frame with at least the columns ``"LLM"`` and
	            ``"Hallucination %"``. Only ``df.head(10)`` is plotted, so the
	            caller decides which rows count as "best" through its ordering.
	            ``df`` itself is not modified.

	    Returns:
	        The matplotlib figure that was drawn on.
	    """
	    fig = plt.figure(figsize=(8, 4))

	    # Work on an explicit copy: adding a column to the bare `head()` slice can
	    # trigger pandas' SettingWithCopyWarning and leaves it unclear whether the
	    # caller's frame is touched.
	    plot_df = df.head(10).copy()
	    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

	    # Horizontal bars: LLM names on the y-axis, Hallucination % on the x-axis.
	    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))

	    # NOTE: placing the LLM name inside/next to the bar via
	    # `determine_llm_x_position_and_font_color` is currently not enabled.
	    for _, row in plot_df.iterrows():
	        plt.text(
	            row["Hallucination %"] + 0.025,  # x: slightly right of the bar end
	            row["LLM"],  # y: the bar's category
	            row["Hallucination %"],  # label text: the rate itself
	            ha='left',
	            va='center',
	            fontsize=9,
	        )
	    plt.tight_layout()

	    # add margin to the right of the plot
	    plt.subplots_adjust(right=0.95)

	    plt.xticks(fontsize=9)
	    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
	    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

	    axes = plt.gca()
	    for side in ('top', 'right', 'left'):
	        axes.spines[side].set_visible(False)
	    axes.invert_yaxis()  # Invert the y-axis to display bars top-down

	    return fig

	# %%

	if __name__ == "__main__":
	    # Script mode: rebuild the ./results.json snapshot from the local ./results
	    # directory (no pull is performed here).
	    extracted_rows = scan_and_extract("./results")
	    with open("./results.json", "w") as snapshot:
	        json.dump(extracted_rows, snapshot, indent=2)

	# %%