import os, glob, re import gradio as gr from docx import Document from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity # --- QA (transformers) --- from transformers import pipeline # ------------------ Config ------------------ DOCS_DIR = "." # .docx en la raíz del Space CHUNK_SIZE = 900 # longitud del fragmento (caracteres) OVERLAP = 150 # solapamiento TOP_K_RETRIEVE = 5 # fragmentos candidatos para QA TOP_K_SHOW = 3 # fragmentos a mostrar en modo "fragmentos" QA_MODEL = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es" QA_THRESHOLD = 0.25 # umbral mínimo de confianza del modelo QA # Stopwords (lista breve en español) SPANISH_STOPWORDS = [ "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con", "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este", "ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también", "me","hasta","hay","donde","quien","desde","todo","nos","durante","todos","uno", "les","ni","contra","otros","ese","eso","ante","ellos","e","esto","mí","antes", "algunos","qué","unos","yo","otro","otras","otra","él","tanto","esa","estos", "mucho","quienes","nada","muchos","cual","poco","ella","estar","estas","algunas", "algo","nosotros","mi","mis","tú","te","ti","tu","tus","ellas","nosotras","vosotros", "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo", "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro", "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos", "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran" ] # ------------------ Utilidades ------------------ def _read_docx(path: str) -> str: doc = Document(path) parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()] return "\n".join(parts) def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP): text = re.sub(r"\s+", " ", text).strip() if not text: return [] chunks, i = [], 0 step = max(1, size - overlap) while i < len(text): chunks.append(text[i:i+size]) i += step return chunks # ------------------ Indexación ------------------ corpus, sources = [], [] indexed_files, skipped_files = [], [] def build_index(): global corpus, sources, indexed_files, skipped_files, vectorizer, X corpus, sources = [], [] indexed_files, skipped_files = [], [] for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx"))): try: txt = _read_docx(path) chs = _chunk(txt) if chs: corpus.extend(chs) sources.extend([path] * len(chs)) indexed_files.append(os.path.basename(path)) else: skipped_files.append((os.path.basename(path), "Sin texto utilizable")) except Exception as e: skipped_files.append((os.path.basename(path), f"Error al leer: {e}")) if not corpus: corpus[:] = ["(No hay texto indexado: agregá .docx con contenido)"] sources[:] = [""] vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True) X = vectorizer.fit_transform(corpus) build_index() # ------------------ QA ------------------ qa = pipeline("question-answering", model=QA_MODEL) def answer_qa(question: str): """Corre QA sobre los TOP_K_RETRIEVE fragmentos y devuelve mejor respuesta.""" q = vectorizer.transform([question]) sims = cosine_similarity(q, X).ravel() top_idx = sims.argsort()[::-1][:TOP_K_RETRIEVE] best = None for i in top_idx: context = corpus[i] res = qa(question=question, context=context) # res: {'score': float, 'start': int, 'end': int, 'answer': text} candidate = { "text": res.get("answer", "").strip(), "score": float(res.get("score", 0.0)), "source": os.path.basename(sources[i]), "context": context } if not best or candidate["score"] > best["score"]: best = candidate return best # ------------------ Funciones UI ------------------ def chat_fn(message, history, modo_qa): if "(No hay texto indexado" in corpus[0]: return "No hay texto indexado aún. Verificá que los .docx tengan contenido." if modo_qa: best = answer_qa(message) if best and best["text"] and best["score"] >= QA_THRESHOLD: return f"**Respuesta:** {best['text']}\n\n**Fuente:** {best['source']} \n*(confianza: {best['score']:.2f})*" else: # fallback a fragmentos cuando la confianza es baja q = vectorizer.transform([message]) sims = cosine_similarity(q, X).ravel() top_idx = sims.argsort()[::-1][:TOP_K_SHOW] bullets = [] for i in top_idx: frag = corpus[i] src = os.path.basename(sources[i]) bullets.append(f"**{src}** · …{frag[:420]}…") return ( "No puedo responder con suficiente confianza. Te dejo los fragmentos más cercanos:\n\n- " + "\n- ".join(bullets) ) else: # modo fragmentos (como ahora) q = vectorizer.transform([message]) sims = cosine_similarity(q, X).ravel() top_idx = sims.argsort()[::-1][:TOP_K_SHOW] bullets = [] for i in top_idx: frag = corpus[i] src = os.path.basename(sources[i]) bullets.append(f"**{src}** · …{frag[:420]}…") return "Fragmentos relevantes:\n\n- " + "\n- ".join(bullets) def status_fn(): lines = [] if indexed_files: lines.append("**Archivos indexados:**") for f in indexed_files: lines.append(f"- " + f) if skipped_files: lines.append("\n**Archivos saltados:**") for f, why in skipped_files: lines.append(f"- {f}: {why}") if not lines: lines.append("No se encontró ningún .docx en el directorio.") return "\n".join(lines) # ------------------ Interfaz Gradio ------------------ with gr.Blocks() as demo: gr.Markdown("## Chat de documentos (DOCX) — con respuesta natural (QA)") gr.Markdown( "Activá **Respuesta natural (QA)** para que el sistema intente contestar en español " "a partir del fragmento más relevante; si la confianza es baja, mostrará fragmentos." ) with gr.Tabs(): with gr.Tab("Chat"): modo_qa = gr.Checkbox(label="Respuesta natural (QA)", value=True) chat = gr.ChatInterface( fn=lambda msg, hist: chat_fn(msg, hist, modo_qa.value), title=None, description=None ) # Vincular el checkbox al chat (simple workaround) modo_qa.change(fn=lambda x: None, inputs=modo_qa, outputs=[]) with gr.Tab("Estado"): btn = gr.Button("Actualizar estado") out = gr.Markdown(status_fn()) btn.click(fn=lambda: status_fn(), outputs=out) if __name__ == "__main__": demo.launch()