import os, glob, re
import gradio as gr
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# --- QA (transformers) ---
from transformers import pipeline
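# Assumed requirements.txt for this Space (torch is one possible backend for the
# transformers pipeline; TensorFlow would also work):
#   gradio
#   python-docx
#   scikit-learn
#   transformers
#   torch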
# ------------------ Config ------------------
DOCS_DIR = "."  # .docx files in the root of the Space
CHUNK_SIZE = 900  # chunk length (characters)
OVERLAP = 150  # overlap between consecutive chunks (characters)
TOP_K_RETRIEVE = 5  # candidate chunks passed to QA
TOP_K_SHOW = 3  # chunks to show in "fragments" mode
QA_MODEL = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
QA_THRESHOLD = 0.25  # minimum confidence threshold for the QA model
# Stopwords (short Spanish list)
SPANISH_STOPWORDS = [
"de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
"no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
"ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también",
"me","hasta","hay","donde","quien","desde","todo","nos","durante","todos","uno",
"les","ni","contra","otros","ese","eso","ante","ellos","e","esto","mí","antes",
"algunos","qué","unos","yo","otro","otras","otra","él","tanto","esa","estos",
"mucho","quienes","nada","muchos","cual","poco","ella","estar","estas","algunas",
"algo","nosotros","mi","mis","tú","te","ti","tu","tus","ellas","nosotras","vosotros",
"vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
"suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
"vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
"estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
]
# ------------------ Utilities ------------------
def _read_docx(path: str) -> str:
    doc = Document(path)
    # Note: doc.paragraphs only covers body paragraphs; text inside tables is skipped.
    parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
    return "\n".join(parts)
def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
text = re.sub(r"\s+", " ", text).strip()
if not text:
return []
chunks, i = [], 0
step = max(1, size - overlap)
while i < len(text):
chunks.append(text[i:i+size])
i += step
return chunks
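# A quick check of the sliding-window arithmetic above (comments only, not executed):
# with CHUNK_SIZE=900 and OVERLAP=150 the step is 750, so a 2,000-character text yields
# chunks covering [0:900], [750:1650] and [1500:2000]. Each consecutive pair shares
# 150 characters, so a sentence cut at one boundary is still intact in the next chunk.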
# ------------------ Indexing ------------------
corpus, sources = [], []
indexed_files, skipped_files = [], []
def build_index():
    global corpus, sources, indexed_files, skipped_files, vectorizer, X
    corpus, sources = [], []
    indexed_files, skipped_files = [], []
    for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx"))):
        try:
            txt = _read_docx(path)
            chs = _chunk(txt)
            if chs:
                corpus.extend(chs)
                sources.extend([path] * len(chs))
                indexed_files.append(os.path.basename(path))
            else:
                skipped_files.append((os.path.basename(path), "No usable text"))
        except Exception as e:
            skipped_files.append((os.path.basename(path), f"Read error: {e}"))
    if not corpus:
        corpus[:] = ["(No indexed text: add .docx files with content)"]
        sources[:] = [""]
    vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
    X = vectorizer.fit_transform(corpus)
build_index()
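# The index is built once at import time. If the .docx files change while the Space is
# running, calling build_index() again (e.g. from a hypothetical "Reindex" button) would
# refresh vectorizer and X, since both are module-level globals reassigned inside it.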
# ------------------ QA ------------------
qa = pipeline("question-answering", model=QA_MODEL)
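# This checkpoint is SQuAD2-style ("squad2-es"), so the pipeline also accepts
# handle_impossible_answer=True to let the model abstain on its own; here low-confidence
# answers are filtered afterwards with QA_THRESHOLD instead.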
def answer_qa(question: str):
    """Run QA over the TOP_K_RETRIEVE most similar chunks and return the best answer."""
    q = vectorizer.transform([question])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:TOP_K_RETRIEVE]
    best = None
    for i in top_idx:
        context = corpus[i]
        res = qa(question=question, context=context)
        # res: {'score': float, 'start': int, 'end': int, 'answer': str}
        candidate = {
            "text": res.get("answer", "").strip(),
            "score": float(res.get("score", 0.0)),
            "source": os.path.basename(sources[i]),
            "context": context
        }
        if not best or candidate["score"] > best["score"]:
            best = candidate
    return best
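# Hypothetical usage (file name and values are illustrative, not real output):
#   best = answer_qa("¿Cuál es el plazo de entrega?")
#   best -> {"text": "30 días", "score": 0.41, "source": "contrato.docx", "context": "..."}
# chat_fn below only trusts the answer when best["score"] >= QA_THRESHOLD.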
# ------------------ UI functions ------------------
def _fragment_bullets(message, k=TOP_K_SHOW):
    """Return the k chunks most similar to the message as markdown bullets."""
    q = vectorizer.transform([message])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:k]
    bullets = []
    for i in top_idx:
        frag = corpus[i]
        src = os.path.basename(sources[i])
        bullets.append(f"**{src}** · …{frag[:420]}…")
    return bullets

def chat_fn(message, history, modo_qa):
    if "(No indexed text" in corpus[0]:
        return "No text indexed yet. Check that the .docx files have content."
    if modo_qa:
        best = answer_qa(message)
        if best and best["text"] and best["score"] >= QA_THRESHOLD:
            return (
                f"**Answer:** {best['text']}\n\n"
                f"**Source:** {best['source']}  \n*(confidence: {best['score']:.2f})*"
            )
        # Fall back to fragments when QA confidence is low
        return (
            "I can't answer with enough confidence. Here are the closest fragments:\n\n- "
            + "\n- ".join(_fragment_bullets(message))
        )
    # Fragments mode: just show the most similar chunks
    return "Relevant fragments:\n\n- " + "\n- ".join(_fragment_bullets(message))
def status_fn():
    lines = []
    if indexed_files:
        lines.append("**Indexed files:**")
        for f in indexed_files:
            lines.append(f"- {f}")
    if skipped_files:
        lines.append("\n**Skipped files:**")
        for f, why in skipped_files:
            lines.append(f"- {f}: {why}")
    if not lines:
        lines.append("No .docx files found in the directory.")
    return "\n".join(lines)
# ------------------ Gradio interface ------------------
with gr.Blocks() as demo:
    gr.Markdown("## Document chat (DOCX) with natural answers (QA)")
    gr.Markdown(
        "Enable **Natural answer (QA)** to have the system try to answer in Spanish "
        "from the most relevant chunk; when confidence is low, it shows fragments instead."
    )
    with gr.Tabs():
        with gr.Tab("Chat"):
            modo_qa = gr.Checkbox(label="Natural answer (QA)", value=True)
            # Pass the checkbox via additional_inputs so chat_fn receives its *current*
            # value with every message; reading modo_qa.value inside a lambda would only
            # capture the initial value, making the toggle a no-op.
            chat = gr.ChatInterface(
                fn=chat_fn,
                additional_inputs=[modo_qa],
                title=None, description=None
            )
        with gr.Tab("Status"):
            btn = gr.Button("Refresh status")
            out = gr.Markdown(status_fn())
            btn.click(fn=status_fn, outputs=out)
if __name__ == "__main__":
demo.launch()