KeenWoo committed on
Commit
1ca74e3
·
verified ·
1 Parent(s): 928c449

Update evaluate.py

Browse files
Files changed (1) hide show
  1. evaluate.py +7 -35
evaluate.py CHANGED
@@ -395,10 +395,7 @@ def run_comprehensive_evaluation(
395
  df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
396
  table_rows = df_display.values.tolist()
397
  headers = df_display.columns.tolist()
398
- else:
399
- summary_text = "No valid test fixtures found to evaluate."
400
- table_rows, headers = [], []
401
-
402
  # --- NEW: per-category averages ---
403
  try:
404
  cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
@@ -415,37 +412,12 @@ def run_comprehensive_evaluation(
415
  print(confusion.to_string())
416
  except Exception as e:
417
  print(f"WARNING: Could not build confusion matrix: {e}")
418
-
419
- output_path = "evaluation_results.csv"
420
- df.to_csv(output_path, index=False, encoding="utf-8")
421
- print(f"Evaluation results saved to {output_path}")
422
-
423
- log_path = storage_path / "evaluation_log.txt"
424
- with open(log_path, "w", encoding="utf-8") as logf:
425
- logf.write("===== Detailed Evaluation Run =====\n")
426
- df_string = df.to_string(index=False)
427
- logf.write(df_string)
428
- logf.write("\n\n")
429
-
430
- try:
431
- cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
432
- print("\n📊 Correctness by Category:")
433
- print(cat_means.to_string(index=False))
434
- logf.write("\n📊 Correctness by Category:\n")
435
- logf.write(cat_means.to_string(index=False))
436
- logf.write("\n")
437
- except Exception as e:
438
- print(f"WARNING: Could not compute category breakdown: {e}")
439
-
440
- try:
441
- confusion = pd.crosstab(df["category"], df["error_class"], rownames=["Category"], colnames=["Error Class"], dropna=False)
442
- print("\n📊 Error Class Distribution by Category:")
443
- print(confusion.to_string())
444
- logf.write("\n📊 Error Class Distribution by Category:\n")
445
- logf.write(confusion.to_string())
446
- logf.write("\n")
447
- except Exception as e:
448
- print(f"WARNING: Could not build confusion matrix: {e}")
449
 
450
  return summary_text, table_rows, headers
451
  # return summary_text, table_rows
 
395
  df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
396
  table_rows = df_display.values.tolist()
397
  headers = df_display.columns.tolist()
398
+
 
 
 
399
  # --- NEW: per-category averages ---
400
  try:
401
  cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
 
412
  print(confusion.to_string())
413
  except Exception as e:
414
  print(f"WARNING: Could not build confusion matrix: {e}")
415
+ # END
416
+
417
+ else:
418
+ summary_text = "No valid test fixtures found to evaluate."
419
+ table_rows, headers = [], []
420
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
422
  return summary_text, table_rows, headers
423
  # return summary_text, table_rows