Spaces:
Sleeping
Sleeping
Update evaluate.py
Browse files - evaluate.py +7 -35
evaluate.py
CHANGED
|
@@ -395,10 +395,7 @@ def run_comprehensive_evaluation(
|
|
| 395 |
df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
|
| 396 |
table_rows = df_display.values.tolist()
|
| 397 |
headers = df_display.columns.tolist()
|
| 398 |
-
|
| 399 |
-
summary_text = "No valid test fixtures found to evaluate."
|
| 400 |
-
table_rows, headers = [], []
|
| 401 |
-
|
| 402 |
# --- NEW: per-category averages ---
|
| 403 |
try:
|
| 404 |
cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
|
|
@@ -415,37 +412,12 @@ def run_comprehensive_evaluation(
|
|
| 415 |
print(confusion.to_string())
|
| 416 |
except Exception as e:
|
| 417 |
print(f"WARNING: Could not build confusion matrix: {e}")
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
with open(log_path, "w", encoding="utf-8") as logf:
|
| 425 |
-
logf.write("===== Detailed Evaluation Run =====\n")
|
| 426 |
-
df_string = df.to_string(index=False)
|
| 427 |
-
logf.write(df_string)
|
| 428 |
-
logf.write("\n\n")
|
| 429 |
-
|
| 430 |
-
try:
|
| 431 |
-
cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
|
| 432 |
-
print("\nπ Correctness by Category:")
|
| 433 |
-
print(cat_means.to_string(index=False))
|
| 434 |
-
logf.write("\nπ Correctness by Category:\n")
|
| 435 |
-
logf.write(cat_means.to_string(index=False))
|
| 436 |
-
logf.write("\n")
|
| 437 |
-
except Exception as e:
|
| 438 |
-
print(f"WARNING: Could not compute category breakdown: {e}")
|
| 439 |
-
|
| 440 |
-
try:
|
| 441 |
-
confusion = pd.crosstab(df["category"], df["error_class"], rownames=["Category"], colnames=["Error Class"], dropna=False)
|
| 442 |
-
print("\nπ Error Class Distribution by Category:")
|
| 443 |
-
print(confusion.to_string())
|
| 444 |
-
logf.write("\nπ Error Class Distribution by Category:\n")
|
| 445 |
-
logf.write(confusion.to_string())
|
| 446 |
-
logf.write("\n")
|
| 447 |
-
except Exception as e:
|
| 448 |
-
print(f"WARNING: Could not build confusion matrix: {e}")
|
| 449 |
|
| 450 |
return summary_text, table_rows, headers
|
| 451 |
# return summary_text, table_rows
|
|
|
|
| 395 |
df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
|
| 396 |
table_rows = df_display.values.tolist()
|
| 397 |
headers = df_display.columns.tolist()
|
| 398 |
+
|
|
|
|
|
|
|
|
|
|
| 399 |
# --- NEW: per-category averages ---
|
| 400 |
try:
|
| 401 |
cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
|
|
|
|
| 412 |
print(confusion.to_string())
|
| 413 |
except Exception as e:
|
| 414 |
print(f"WARNING: Could not build confusion matrix: {e}")
|
| 415 |
+
# END
|
| 416 |
+
|
| 417 |
+
else:
|
| 418 |
+
summary_text = "No valid test fixtures found to evaluate."
|
| 419 |
+
table_rows, headers = [], []
|
| 420 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
return summary_text, table_rows, headers
|
| 423 |
# return summary_text, table_rows
|