Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| # %% | |
| import os | |
| import json | |
| from huggingface_hub import Repository | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import matplotlib.figure | |
| from datetime import datetime | |
| from sklearn.preprocessing import MinMaxScaler | |
| # import dotenv | |
| # dotenv.load_dotenv() | |
# Module-level scaler shared by visualize_leaderboard, which calls
# fit_transform on it (i.e. re-fits it) each time a plot is drawn.
min_max_scaler = MinMaxScaler()
| # %% | |
def pull_results(results_dir: str):
    """Bind *results_dir* to the ``vectara/results`` dataset repo and pull it.

    Args:
        results_dir: Local directory passed to ``Repository`` as ``local_dir``.
    """
    results_repo = Repository(
        clone_from="vectara/results",
        repo_type="dataset",
        local_dir=results_dir,
    )
    results_repo.git_pull()
def extract_info_from_result_file(result_file):
    """Read one result JSON file and flatten it into a leaderboard row.

    The file is expected to have this shape::

        {
            "config": {
                "model_dtype": "float16",
                "model_name": "databricks/dbrx-instruct",
                "model_sha": "main"
            },
            "results": {
                "hallucination_rate": {
                    "hallucination_rate": 8.34990059642147
                },
                "factual_consistency_rate": {
                    "factual_consistency_rate": 91.65009940357854
                },
                "answer_rate": {
                    "answer_rate": 100.0
                },
                "average_summary_length": {
                    "average_summary_length": 85.9
                }
            }
        }

    Args:
        result_file: Path of the JSON file to read.

    Returns:
        A dict with the keys ``"LLM"``, ``"Hallucination %"``, ``"Answer %"``
        and ``"Avg Summary Words"``. The factual consistency rate present in
        the file is intentionally not exported.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(result_file, "r", encoding="utf-8") as f:
        info = json.load(f)

    metrics = info["results"]
    return {
        "LLM": info["config"]["model_name"],
        "Hallucination %": metrics["hallucination_rate"]["hallucination_rate"],
        "Answer %": metrics["answer_rate"]["answer_rate"],
        "Avg Summary Words": metrics["average_summary_length"]["average_summary_length"],
    }
def get_latest_result_file(dir: str):
    """Return the path of the most recently modified ``.json`` file in *dir*.

    "Latest" is decided by the file's modification time on disk
    (``os.path.getmtime``), not by anything encoded in the file name. If two
    files share the same modification time, the one whose name sorts last is
    chosen so the result is deterministic.

    Args:
        dir: Directory to look in (not searched recursively).

    Returns:
        ``os.path.join(dir, name)`` for the newest ``.json`` file, or ``None``
        when *dir* is not a directory or contains no ``.json`` files.
    """
    if not os.path.isdir(dir):
        return None

    json_files = [name for name in os.listdir(dir) if name.endswith(".json")]
    if not json_files:
        return None

    # The previous implementation sorted ascending by mtime and took the first
    # element, which is the *oldest* file; pick the maximum instead.
    newest = max(
        json_files,
        key=lambda name: (os.path.getmtime(os.path.join(dir, name)), name),
    )
    return os.path.join(dir, newest)
def scan_and_extract(dir: str):
    """Walk *dir* and collect one extracted result per sub-directory.

    For every directory that ``os.walk`` reports beneath *dir* (the top-level
    directory itself is never examined), the file chosen by
    ``get_latest_result_file`` is handed to ``extract_info_from_result_file``.
    Sub-directories for which no file is chosen are skipped.

    Returns:
        A list of the extracted dicts, in walk order.
    """
    results = []
    for parent, subdir_names, _ in os.walk(dir):
        # Lazy generator: each sub-directory is resolved and extracted in turn.
        chosen_files = (
            get_latest_result_file(os.path.join(parent, subdir))
            for subdir in subdir_names
        )
        results.extend(
            extract_info_from_result_file(path)
            for path in chosen_files
            if path is not None
        )
    return results
def load_results(
    results_dir: str = "./results",
    results_json: str = "./results.json"
):
    """Refresh the leaderboard data and return it as a sorted DataFrame.

    Steps, each of which is best-effort and only reports failures via
    ``print``:

    1. Pull the latest results into *results_dir*.
    2. Scan *results_dir* and, if anything was found, overwrite
       *results_json* with the extracted rows.

    The rows are then always read back from *results_json*, so a previously
    dumped file serves as the fallback when pulling or scanning fails or
    finds nothing.

    Args:
        results_dir: Directory holding the raw per-model result files.
        results_json: Path of the flattened JSON dump to write and read.

    Returns:
        A DataFrame sorted ascending by ``"Hallucination %"``, with the three
        numeric columns rounded to 3 decimals and an extra
        ``"LLM_lower_case"`` column.

    Raises:
        OSError: If *results_json* cannot be read.
    """
    try:
        pull_results(results_dir)
        print (f"Successfully pulled results from {results_dir}")
    except Exception as e:
        print(f"Failed to pull and/or extract latest results: {e}")

    try:
        results = scan_and_extract(results_dir)
        if results:
            with open(results_json, "w") as f:
                json.dump(results, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")
        print(f"Using pre-dumped results from {results_json}")

    # Close the handle explicitly rather than relying on garbage collection.
    with open(results_json, "r") as f:
        results = json.load(f)

    results_df = pd.DataFrame(results)

    # Replace any "TBD" placeholder with 100 *before* sorting: sorting a
    # column that mixes strings and numbers raises TypeError, and 100 pushes
    # such rows to the bottom of the ascending hallucination ranking.
    results_df = results_df.replace("TBD", 100)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    return results_df
| # %% | |
def determine_font_size(LLM: str, hallucination_percent: float) -> float:
    """Pick a label font size from the model name and its hallucination value.

    A slightly smaller font (8.5) is used only when the hallucination value
    is below 0.25 *and* the name is longer than 10 characters; every other
    combination gets 9.

    Args:
        LLM: Model name whose length is considered.
        hallucination_percent: Hallucination value compared against 0.25.

    Returns:
        The font size, ``8.5`` or ``9`` (hence ``float``, not ``int``).
    """
    is_low_rate = hallucination_percent < 0.25
    is_long_name = len(LLM) > 10
    return 8.5 if is_low_rate and is_long_name else 9
def determine_font_color(hallucination_percent: float) -> str:
    """Return the label colour for a given hallucination value.

    Values strictly between 0.25 and 0.65 get ``'black'``; everything else
    (including the two boundary values) gets ``'white'``.
    """
    in_mid_band = 0.25 < hallucination_percent < 0.65
    return 'black' if in_mid_band else 'white'
def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: float) -> "tuple[float, str]":
    """Decide where to draw a model's label and in which colour.

    The bar length is estimated as 5 times the hallucination value and
    compared with the number of characters in the name. If the name is
    shorter than that estimate, the label goes at x = 0.01 with the colour
    chosen by ``determine_font_color``; otherwise it goes at the
    hallucination value itself, in black.

    Args:
        LLM: Model name to place.
        hallucination_percent: Hallucination value of that model.

    Returns:
        A ``(x_position, font_color)`` tuple.
    """
    name_length = len(LLM)
    print ("LLM: ", LLM, "hallu_rate: ", hallucination_percent, "name_length: ", name_length)

    hallu_rate_to_bar_length_ratio = 5
    bar_length = hallu_rate_to_bar_length_ratio * hallucination_percent

    if name_length < bar_length:
        return 0.01, determine_font_color(hallucination_percent)
    # To the right of the bar, so always black.
    return hallucination_percent, 'black'
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart of the first 10 rows of *df*.

    Each bar shows a model's ``"Hallucination %"``, coloured with the ``jet``
    colormap over the min-max-normalised values of the plotted rows, with the
    value printed just to the right of the bar. The y-axis is inverted so the
    first row is drawn at the top.

    Args:
        df: Frame with at least the columns ``"LLM"`` and
            ``"Hallucination %"``. It is plotted in the order given
            (presumably already sorted by the caller) and is not modified.

    Returns:
        The matplotlib figure that was drawn.
    """
    fig = plt.figure(figsize=(8, 4))

    # Copy the slice so adding the helper column below neither writes into
    # the caller's frame nor triggers pandas' chained-assignment warning.
    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))

    for _, row in plot_df.iterrows():
        plt.text(
            row["Hallucination %"] + 0.025,  # small gap to the right of the bar end
            row["LLM"],
            row["Hallucination %"],
            ha='left',
            va='center',
            fontsize=9,
        )

    plt.tight_layout()
    # Add margin to the right of the plot.
    plt.subplots_adjust(right=0.95)
    plt.xticks(fontsize=9)
    plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
    plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

    ax = plt.gca()
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    ax.invert_yaxis()  # Invert the y-axis to display bars top-down
    return fig
| # %% | |
if __name__ == "__main__":
    # Script entry point: rebuild ./results.json from whatever is already on
    # disk under ./results (nothing is pulled here).
    results = scan_and_extract("./results")
    with open("./results.json", "w") as f:
        f.write(json.dumps(results, indent=2))
| # %% | |