Update app.py

app.py CHANGED

@@ -1,14 +1,16 @@
-import os, glob, re
 import gradio as gr
 from docx import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
-# Basic Spanish stopwords
 SPANISH_STOPWORDS = {
     "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
     "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
@@ -21,66 +23,107 @@ SPANISH_STOPWORDS = {
     "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
     "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
     "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
-    "estáis","están","
-    "ha","hemos","habéis","han","haya","hayas","hayamos","hayáis","hayan","sea","seas",
-    "seamos","seáis","sean","ser","son","era","eras","éramos","erais","eran"
 }
 
     doc = Document(path)
-        if para.text.strip():
-            full_text.append(para.text)
-    return "\n".join(full_text)
 
-def _chunk(text, size=CHUNK_SIZE, overlap=OVERLAP):
     text = re.sub(r"\s+", " ", text).strip()
     chunks, i = [], 0
     while i < len(text):
         chunks.append(text[i:i+size])
-        i +=
     return chunks
 
-#
-for path in glob.glob(os.path.join(DOCS_DIR, "*.docx")):
-    docs.append((path, _read_docx(path)))
 corpus, sources = [], []
-    for ch in _chunk(fulltext):
-        corpus.append(ch)
-        sources.append(src)
 
-corpus
-sources = [
 
-#
-X = vectorizer.fit_transform(corpus)
 
-#
 def answer_fn(message, history):
-    if "(
-        return "No hay
 
-#
 
 if __name__ == "__main__":
     demo.launch()

+import os, glob, re, traceback
 import gradio as gr
 from docx import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
+# ------------------ Config ------------------
+DOCS_DIR = "."    # folder containing the .docx files
+CHUNK_SIZE = 900  # fragment length (characters)
+OVERLAP = 150     # overlap between fragments
+TOP_K = 3         # number of fragments to return
 
+# Basic Spanish stopwords (short list, sufficient for an MVP)
 SPANISH_STOPWORDS = {
     "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
     "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
 
     "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
     "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
     "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
+    "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
 }
 
+# ------------------ Utilities ------------------
+def _read_docx(path: str) -> str:
+    """Read the text of a .docx. If it fails, the exception propagates to the caller."""
     doc = Document(path)
+    parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
+    return "\n".join(parts)
 
+def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
+    """Split the text into overlapping fragments for later ranking."""
     text = re.sub(r"\s+", " ", text).strip()
+    if not text:
+        return []
     chunks, i = [], 0
+    step = max(1, size - overlap)
     while i < len(text):
         chunks.append(text[i:i+size])
+        i += step
     return chunks
 
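Note on _chunk: the index advances by step = size - overlap characters, so consecutive fragments share an overlap-sized window and text cut at a fragment boundary still appears whole in the neighbouring fragment. An illustrative sanity check with toy values (not part of the commit):

    >>> _chunk("abcdefghij", size=4, overlap=2)   # step = max(1, 4 - 2) = 2
    ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
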
+# ------------------ Indexing ------------------
+INDEX_READY = False
 corpus, sources = [], []
+indexed_files, skipped_files = [], []
 
+def build_index():
+    global corpus, sources, INDEX_READY, indexed_files, skipped_files, vectorizer, X
+    corpus, sources = [], []
+    indexed_files, skipped_files = [], []
+    paths = sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx")))
+    for path in paths:
+        try:
+            txt = _read_docx(path)
+            chs = _chunk(txt)
+            if chs:
+                corpus.extend(chs)
+                sources.extend([path] * len(chs))
+                indexed_files.append(os.path.basename(path))
+            else:
+                skipped_files.append((os.path.basename(path), "Sin texto utilizable"))
+        except Exception as e:
+            skipped_files.append((os.path.basename(path), f"Error al leer: {e}"))
+    if not corpus:
+        corpus = ["(No hay texto indexado: agregá .docx con contenido)"]
+        sources = [""]
+    # Vectorizer without 'spanish' stop words (scikit-learn does not ship them); use our own list
+    vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
+    X = vectorizer.fit_transform(corpus)
+    INDEX_READY = True
 
+# Build the index at load time
+build_index()
 
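Retrieval here is plain TF-IDF plus cosine similarity: build_index fits the vectorizer on the chunk list once, and answer_fn below reuses it for each query. A self-contained sketch of the same technique on toy data (the strings are illustrative, not taken from the Space's documents):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    toy_corpus = ["el contrato de alquiler", "la factura de luz", "el contrato de trabajo"]
    vec = TfidfVectorizer(lowercase=True)
    M = vec.fit_transform(toy_corpus)            # one TF-IDF row per fragment
    q = vec.transform(["contrato de trabajo"])   # query vectorized with the same vocabulary
    sims = cosine_similarity(q, M).ravel()       # one similarity score per fragment
    best = sims.argsort()[::-1][:2]              # indices of the two best fragments
    print([toy_corpus[i] for i in best])         # ['el contrato de trabajo', 'el contrato de alquiler']
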
+# ------------------ UI functions ------------------
 def answer_fn(message, history):
+    if not INDEX_READY or "(No hay texto indexado" in corpus[0]:
+        return "No hay texto indexado aún. Verificá que los .docx tengan contenido."
+    try:
+        q = vectorizer.transform([message])
+        sims = cosine_similarity(q, X).ravel()
+        top_idx = sims.argsort()[::-1][:TOP_K]
+        bullets = []
+        for i in top_idx:
+            frag = corpus[i]
+            src = os.path.basename(sources[i])
+            bullets.append(f"**{src}** · …{frag[:420]}…")
+        return "Fragmentos relevantes:\n\n- " + "\n- ".join(bullets)
+    except Exception as e:
+        return f"Se produjo un error al responder: {e}"
+
+def status_fn():
+    """Return a small status report for debugging without opening the Logs."""
+    lines = []
+    if indexed_files:
+        lines.append("**Archivos indexados:**")
+        for f in indexed_files:
+            lines.append(f"- {f}")
+    if skipped_files:
+        lines.append("\n**Archivos saltados:**")
+        for f, why in skipped_files:
+            lines.append(f"- {f}: {why}")
+    if not lines:
+        lines.append("No se encontró ningún .docx en el directorio.")
+    return "\n".join(lines)
 
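Because answer_fn and status_fn are ordinary functions, they can be exercised from a local Python session before launching the interface; the query string below is only an example:

    >>> print(status_fn())                            # lists indexed and skipped .docx files
    >>> print(answer_fn("consulta de ejemplo", []))   # top-K fragments, or the "no index" notice
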
+# ------------------ Gradio interface ------------------
+with gr.Blocks() as demo:
+    gr.Markdown("## Chat de documentos (DOCX)")
+    gr.Markdown(
+        "Escribí tu consulta y te devuelvo fragmentos relevantes. "
+        "Usá la pestaña **Estado** para ver qué archivos se indexaron."
+    )
+    with gr.Tabs():
+        with gr.Tab("Chat"):
+            chat = gr.ChatInterface(fn=answer_fn, title=None, description=None)
+        with gr.Tab("Estado"):
+            btn = gr.Button("Actualizar estado")
+            out = gr.Markdown(status_fn())
+            btn.click(fn=lambda: status_fn(), outputs=out)
 
 if __name__ == "__main__":
     demo.launch()
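The code depends on three third-party packages whose pip names differ from the import names; a quick local check, illustrative only (the Space's actual requirements file is not part of this commit):

    # pip install gradio python-docx scikit-learn
    import gradio, docx, sklearn
    print("deps ok:", gradio.__version__, sklearn.__version__)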