Spaces:

KeenWoo
/

AD_Multimodal_Chatbot

Sleeping

App Files Community

KeenWoo committed on Sep 14

Commit

1ca74e3

verified ·

1 Parent(s): 928c449

Update evaluate.py

Browse files

Files changed (1) hide show

evaluate.py +7 -35

evaluate.py CHANGED Viewed

@@ -395,10 +395,7 @@ def run_comprehensive_evaluation(
         df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
-    else:
-        summary_text = "No valid test fixtures found to evaluate."
-        table_rows, headers = [], []
         # --- NEW: per-category averages ---
         try:
             cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
@@ -415,37 +412,12 @@ def run_comprehensive_evaluation(
             print(confusion.to_string())
         except Exception as e:
             print(f"WARNING: Could not build confusion matrix: {e}")
-        output_path = "evaluation_results.csv"
-        df.to_csv(output_path, index=False, encoding="utf-8")
-        print(f"Evaluation results saved to {output_path}")
-        log_path = storage_path / "evaluation_log.txt"
-        with open(log_path, "w", encoding="utf-8") as logf:
-            logf.write("===== Detailed Evaluation Run =====\n")
-            df_string = df.to_string(index=False)
-            logf.write(df_string)
-            logf.write("\n\n")
-            try:
-                cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
-                print("\n📊 Correctness by Category:")
-                print(cat_means.to_string(index=False))
-                logf.write("\n📊 Correctness by Category:\n")
-                logf.write(cat_means.to_string(index=False))
-                logf.write("\n")
-            except Exception as e:
-                print(f"WARNING: Could not compute category breakdown: {e}")
-            try:
-                confusion = pd.crosstab(df["category"], df["error_class"], rownames=["Category"], colnames=["Error Class"], dropna=False)
-                print("\n📊 Error Class Distribution by Category:")
-                print(confusion.to_string())
-                logf.write("\n📊 Error Class Distribution by Category:\n")
-                logf.write(confusion.to_string())
-                logf.write("\n")
-            except Exception as e:
-                print(f"WARNING: Could not build confusion matrix: {e}")
     return summary_text, table_rows, headers
     # return summary_text, table_rows

         df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
         # --- NEW: per-category averages ---
         try:
             cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
             print(confusion.to_string())
         except Exception as e:
             print(f"WARNING: Could not build confusion matrix: {e}")
+        # END
+    else:
+        summary_text = "No valid test fixtures found to evaluate."
+        table_rows, headers = [], []
     return summary_text, table_rows, headers
     # return summary_text, table_rows