Spaces:
Running
Running
"""
evaluator.py
────────────
Quality metrics for the corrections produced by the RAG system.

The three comparisons performed by the evaluator:
  1. GT vs HTR        → STARTING error (how bad the HTR was)
  2. GT vs Corrected  → FINAL error (how good the RAG is)
  3. HTR vs Corrected → MODERNISMS (what the LLM changed; it should not modernize)

A positive cer_improvement means the RAG improved the text.
A negative cer_improvement means the LLM made the text worse.

Metrics:
  - CER (Character Error Rate) : Levenshtein distance at character level
  - WER (Word Error Rate)      : Levenshtein distance at word level
  - Modernism score            : penalty for modernized 16th-century spellings
  - Regression score           : detects whether the LLM made the text worse than the HTR

Usage:
    from evaluator import Evaluator
    ev = Evaluator()

    # A single pair
    metrics = ev.evaluate_pair(htr="...", corrected="...", gt="...")
    print(ev.format_pair_report(metrics))

    # Batch
    report = ev.batch_evaluate(corrector, pairs[:50])
"""
import re
from typing import Dict, List, Optional

from knowledge_base import GRAFIA_PATTERNS
class Evaluator:
    """Quality metrics for LLM/RAG corrections of HTR transcriptions.

    Three comparisons are made for every (HTR, corrected, ground truth) triple:

    1. GT vs HTR: the starting error.
    2. GT vs corrected: the final error.
    3. HTR vs corrected: modern spellings introduced by the correction.

    All helpers that do not need instance state are static methods, so they
    can be called either on the class or on an instance.
    """

    # Absolute CER change below which a correction counts as "no change".
    IMPROVEMENT_THRESHOLD = 0.02
    # Score deducted for every modernism that is detected.
    MODERNISM_STEP = 0.1

    # Verdict labels; batch_evaluate counts results by comparing against these.
    VERDICT_IMPROVED = "✓ MEJORADO"
    VERDICT_WORSENED = "✗ EMPEORADO"
    VERDICT_UNCHANGED = "~ SIN CAMBIO SIGNIFICATIVO"

    # ── Edit-distance metrics ────────────────────────────────────────────────

    @staticmethod
    def cer(reference: str, hypothesis: str) -> float:
        """Return the Character Error Rate of *hypothesis* against *reference*.

        The character-level edit distance is divided by the reference length
        (or by 1 when the reference is empty). 0.0 means identical; the value
        can exceed 1.0 when the hypothesis is much longer than the reference.
        """
        ref_chars, hyp_chars = list(reference), list(hypothesis)
        return Evaluator._levenshtein(ref_chars, hyp_chars) / max(len(ref_chars), 1)

    @staticmethod
    def wer(reference: str, hypothesis: str) -> float:
        """Return the Word Error Rate of *hypothesis* against *reference*.

        Both texts are split on whitespace; the word-level edit distance is
        divided by the number of reference words (or by 1 when there are none).
        """
        ref_words = reference.split()
        hyp_words = hypothesis.split()
        return Evaluator._levenshtein(ref_words, hyp_words) / max(len(ref_words), 1)

    @staticmethod
    def _levenshtein(seq1: list, seq2: list) -> int:
        """Return the minimum edit distance between two sequences.

        Insertions, deletions and substitutions all cost 1. Only two rows of
        the dynamic-programming table are kept at any time.
        """
        previous = list(range(len(seq2) + 1))
        for i, item1 in enumerate(seq1, start=1):
            current = [i]
            for j, item2 in enumerate(seq2, start=1):
                if item1 == item2:
                    current.append(previous[j - 1])
                else:
                    current.append(
                        1 + min(previous[j], current[j - 1], previous[j - 1])
                    )
            previous = current
        return previous[-1]

    # ── Modernism detector ───────────────────────────────────────────────────

    @staticmethod
    def modernism_penalty(
        htr: str, corrected: str, patterns: Optional[List[Dict]] = None
    ) -> Dict:
        """Detect modern spellings present in *corrected* but absent from *htr*.

        Matching is case-insensitive substring matching over the whole text.

        Args:
            htr: Raw HTR text.
            corrected: Text produced by the correction step.
            patterns: Spelling patterns to check. Each entry must provide
                "modern", "ancient" (alternatives separated by "/") and
                "rule"; "category" is optional. Defaults to GRAFIA_PATTERNS.

        Returns:
            A dict with "count", "issues" (one entry per offending pattern)
            and "score": 1.0 with no modernisms, reduced by MODERNISM_STEP
            per issue and never below 0.0.
        """
        if patterns is None:
            patterns = GRAFIA_PATTERNS

        htr_lower = htr.lower()
        corr_lower = corrected.lower()
        issues = []

        for pattern in patterns:
            modern = pattern["modern"].lower()
            # Only modern forms that the correction introduced are of interest.
            if modern not in corr_lower or modern in htr_lower:
                continue

            # Empty alternatives are dropped: "" is a substring of every
            # string and would silently disable the pattern.
            ancient_forms = [
                form.strip().lower()
                for form in pattern["ancient"].split("/")
                if form.strip()
            ]
            # NOTE(review): a pattern is skipped when the HTR already contains
            # one of its ancient forms (treated as a valid abbreviation
            # expansion), so a direct ancient -> modern substitution is not
            # reported here. Confirm this is the intended policy.
            if any(form in htr_lower for form in ancient_forms):
                continue

            issues.append({
                "modern": pattern["modern"],
                "ancient": pattern["ancient"],
                "rule": pattern["rule"],
                "category": pattern.get("category", ""),
            })

        score = max(0.0, 1.0 - len(issues) * Evaluator.MODERNISM_STEP)
        return {
            "count": len(issues),
            "issues": issues,
            "score": round(score, 4),
        }

    # ── Regression detector ──────────────────────────────────────────────────

    @staticmethod
    def regression_check(htr: str, corrected: str, gt: str) -> Dict:
        """List words that were right in the HTR and wrong after correction.

        The three texts are split on whitespace and compared position by
        position over every ground-truth word; a text that is shorter than
        the ground truth contributes "" for the missing positions. Because
        the comparison is purely positional, an inserted or dropped word
        shifts every later position.

        Returns:
            A dict with "count" and "regressions", each regression holding
            "position", "gt", "htr" and "corrected".
        """
        htr_words = htr.split()
        corr_words = corrected.split()
        regressions = []

        for position, gt_word in enumerate(gt.split()):
            htr_word = htr_words[position] if position < len(htr_words) else ""
            corr_word = corr_words[position] if position < len(corr_words) else ""
            # The HTR matched the ground truth, the correction no longer does.
            if htr_word == gt_word and corr_word != gt_word:
                regressions.append({
                    "position": position,
                    "gt": gt_word,
                    "htr": htr_word,
                    "corrected": corr_word,
                })

        return {
            "count": len(regressions),
            "regressions": regressions,
        }

    # ── Verdict ──────────────────────────────────────────────────────────────

    @classmethod
    def _verdict(cls, cer_improvement: float) -> str:
        """Map a CER improvement to one of the three verdict labels."""
        if cer_improvement > cls.IMPROVEMENT_THRESHOLD:
            return cls.VERDICT_IMPROVED
        if cer_improvement < -cls.IMPROVEMENT_THRESHOLD:
            return cls.VERDICT_WORSENED
        return cls.VERDICT_UNCHANGED

    # ── Single-pair evaluation ───────────────────────────────────────────────

    def evaluate_pair(self, htr: str, corrected: str, gt: str) -> Dict:
        """Evaluate one correction with the three comparisons.

        Args:
            htr: Raw HTR text.
            corrected: Text produced by the correction step.
            gt: Ground-truth transcription.

        Returns:
            A dict with CER/WER before and after the correction, their
            improvements (positive means the correction got closer to the
            ground truth), a verdict label, the modernism report and the
            regression report. Numeric values are rounded to 4 decimals.
        """
        # Comparison 1: GT vs HTR (starting error).
        cer_htr = self.cer(gt, htr)
        wer_htr = self.wer(gt, htr)

        # Comparison 2: GT vs corrected (final error).
        cer_corr = self.cer(gt, corrected)
        wer_corr = self.wer(gt, corrected)

        cer_improvement = cer_htr - cer_corr
        wer_improvement = wer_htr - wer_corr

        # Comparison 3: HTR vs corrected (modernisms), plus regressions.
        modernism = self.modernism_penalty(htr, corrected)
        regression = self.regression_check(htr, corrected, gt)

        return {
            # Starting error (GT vs HTR)
            "cer_before": round(cer_htr, 4),
            "wer_before": round(wer_htr, 4),
            # Final error (GT vs corrected)
            "cer_after": round(cer_corr, 4),
            "wer_after": round(wer_corr, 4),
            # Net improvement
            "cer_improvement": round(cer_improvement, 4),
            "wer_improvement": round(wer_improvement, 4),
            "verdict": self._verdict(cer_improvement),
            # Modernisms (HTR vs corrected)
            "modernism_score": modernism["score"],
            "modernism": modernism,
            # Regressions
            "regression": regression,
        }

    # ── Human-readable report for one pair ───────────────────────────────────

    @staticmethod
    def format_pair_report(metrics: Dict) -> str:
        """Format the dict returned by evaluate_pair as readable text.

        At most the first five regressions are listed.
        """
        rule = "─" * 50
        lines = [rule, " EVALUACIÓN DE LA CORRECCIÓN", rule]

        # Comparison 1: starting error.
        lines.append("\n① Error de partida (GT vs HTR original)")
        lines.append(
            f" CER: {metrics['cer_before']:.2%} WER: {metrics['wer_before']:.2%}"
        )

        # Comparison 2: final error.
        lines.append("\n② Error final (GT vs Texto corregido)")
        lines.append(
            f" CER: {metrics['cer_after']:.2%} WER: {metrics['wer_after']:.2%}"
        )

        # Net improvement. The "+" format flag always emits exactly one sign,
        # including for a rounded -0.0.
        lines.append(
            f"\n Mejora CER: {metrics['cer_improvement']:+.2%} "
            f"Mejora WER: {metrics['wer_improvement']:+.2%}"
        )
        lines.append(f" {metrics['verdict']}")

        # Comparison 3: modernisms.
        mod = metrics["modernism"]
        lines.append("\n③ Modernismos introducidos (HTR vs Corregido)")
        if mod["count"] == 0:
            lines.append(" ✓ Ninguno — el LLM respetó las grafías del s.XVI")
        else:
            lines.append(f" ⚠ {mod['count']} modernismo(s) detectado(s):")
            for issue in mod["issues"]:
                lines.append(
                    f" • '{issue['modern']}' introducido "
                    f"(debería ser '{issue['ancient']}'): {issue['rule']}"
                )

        # Regressions.
        reg = metrics["regression"]
        if reg["count"] > 0:
            lines.append(
                f"\n⚠ Regresiones: {reg['count']} palabra(s) correctas en HTR "
                f"empeoradas por el LLM:"
            )
            for item in reg["regressions"][:5]:
                lines.append(
                    f" • posición {item['position']}: "
                    f"HTR='{item['htr']}' → Corregido='{item['corrected']}' "
                    f"(GT='{item['gt']}')"
                )

        lines.append(rule)
        return "\n".join(lines)

    # ── Batch evaluation ─────────────────────────────────────────────────────

    def batch_evaluate(
        self, corrector, pairs: List[Dict], verbose: bool = True
    ) -> Dict:
        """Run *corrector* over *pairs* and aggregate the metrics.

        Args:
            corrector: Object whose ``correct(htr)`` returns a mapping with a
                "corrected" entry.
            pairs: Dicts with "id", "htr" and "gt" entries.
            verbose: Print progress and a final summary when true.

        Returns:
            Averaged metrics, verdict counts, per-pair results under
            "detail", and the pairs that raised under "failures" /
            "n_failed". When no pair could be evaluated the dict has an
            "error" entry instead of the aggregated metrics.
        """
        results = []
        failures = []
        total = len(pairs)

        for index, pair in enumerate(pairs, start=1):
            if verbose:
                print(f" Evaluando {index}/{total}: {pair['id']}")
            try:
                corrected = corrector.correct(pair["htr"])["corrected"]
                metrics = self.evaluate_pair(
                    htr=pair["htr"],
                    corrected=corrected,
                    gt=pair["gt"],
                )
            except Exception as e:
                # Best effort: one failing pair must not abort the batch, but
                # the failure is kept so callers can inspect it.
                print(f" Error en {pair['id']}: {e}")
                failures.append({"id": pair["id"], "error": str(e)})
                continue

            metrics["id"] = pair["id"]
            metrics["htr"] = pair["htr"]
            metrics["corrected"] = corrected
            metrics["gt"] = pair["gt"]
            results.append(metrics)

        if not results:
            return {
                "error": "Sin resultados",
                "n_failed": len(failures),
                "failures": failures,
            }

        n = len(results)

        def avg(key):
            return round(sum(r[key] for r in results) / n, 4)

        # Count by the stored verdict so the totals always agree with the
        # per-pair reports.
        mejoras = sum(1 for r in results if r["verdict"] == self.VERDICT_IMPROVED)
        empeorados = sum(1 for r in results if r["verdict"] == self.VERDICT_WORSENED)
        sin_cambio = n - mejoras - empeorados

        summary = {
            "n_evaluated": n,
            # Comparison 1: GT vs HTR
            "avg_cer_before": avg("cer_before"),
            "avg_wer_before": avg("wer_before"),
            # Comparison 2: GT vs corrected
            "avg_cer_after": avg("cer_after"),
            "avg_wer_after": avg("wer_after"),
            # Net improvement
            "avg_cer_improvement": avg("cer_improvement"),
            "avg_wer_improvement": avg("wer_improvement"),
            "n_mejorados": mejoras,
            "n_empeorados": empeorados,
            "n_sin_cambio": sin_cambio,
            # Comparison 3: modernisms
            "avg_modernism_score": avg("modernism_score"),
            "detail": results,
            # Pairs that raised during correction or evaluation
            "n_failed": len(failures),
            "failures": failures,
        }

        if verbose:
            rule = "─" * 55
            print(f"\n{rule}")
            print(f" RESUMEN EVALUACIÓN ({n} pares)")
            print(rule)
            print("\n① Error de partida (GT vs HTR):")
            print(
                f" CER: {summary['avg_cer_before']:.2%} "
                f"WER: {summary['avg_wer_before']:.2%}"
            )
            print("\n② Error final (GT vs Corregido):")
            print(
                f" CER: {summary['avg_cer_after']:.2%} "
                f"WER: {summary['avg_wer_after']:.2%}"
            )
            print(
                f"\n Mejora CER: {summary['avg_cer_improvement']:+.2%} "
                f"Mejora WER: {summary['avg_wer_improvement']:+.2%}"
            )
            print(f"\n Mejorados : {mejoras}/{n} ({mejoras/n:.0%})")
            print(f" Empeorados : {empeorados}/{n} ({empeorados/n:.0%})")
            print(f" Sin cambio : {sin_cambio}/{n} ({sin_cambio/n:.0%})")
            print("\n③ Modernismos (HTR vs Corregido):")
            print(
                f" Score promedio: {summary['avg_modernism_score']:.2%} "
                f"(1.0 = sin modernismos)"
            )
            print(rule)

        return summary