Update app.py

app.py CHANGED

@@ -1,14 +1,16 @@
-import os, glob, re
 import gradio as gr
 from docx import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
-# Basic Spanish stopwords
 SPANISH_STOPWORDS = {
     "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
     "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
@@ -21,66 +23,107 @@ SPANISH_STOPWORDS = {
     "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
     "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
     "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
-    "estáis","están","
-    "ha","hemos","habéis","han","haya","hayas","hayamos","hayáis","hayan","sea","seas",
-    "seamos","seáis","sean","ser","son","era","eras","éramos","erais","eran"
 }
 
     doc = Document(path)
-        if para.text.strip():
-            full_text.append(para.text)
-    return "\n".join(full_text)
 
-def _chunk(text, size=CHUNK_SIZE, overlap=OVERLAP):
     text = re.sub(r"\s+", " ", text).strip()
     chunks, i = [], 0
     while i < len(text):
         chunks.append(text[i:i+size])
-        i +=
     return chunks
 
-#
-for path in glob.glob(os.path.join(DOCS_DIR, "*.docx")):
-    docs.append((path, _read_docx(path)))
 corpus, sources = [], []
-    for ch in _chunk(fulltext):
-        corpus.append(ch)
-        sources.append(src)
 
-corpus
-sources = [
 
-#
-X = vectorizer.fit_transform(corpus)
 
-#
 def answer_fn(message, history):
-    if "(
-        return "No hay
 
-#
 
 if __name__ == "__main__":
     demo.launch()

+import os, glob, re, traceback
 import gradio as gr
 from docx import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
+# ------------------ Config ------------------
+DOCS_DIR = "."    # folder containing the .docx files
+CHUNK_SIZE = 900  # fragment length (characters)
+OVERLAP = 150     # overlap between fragments
+TOP_K = 3         # number of fragments to return
 
+# Basic Spanish stopwords (short list, sufficient for an MVP)
 SPANISH_STOPWORDS = {
     "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
     "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
 
     "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
     "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
     "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
+    "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
 }
 
+# ------------------ Utilities ------------------
+def _read_docx(path: str) -> str:
+    """Read the text of a .docx. If it fails, the exception propagates to the caller."""
     doc = Document(path)
+    parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
+    return "\n".join(parts)
 
+def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
+    """Split the text into overlapping fragments for later ranking."""
     text = re.sub(r"\s+", " ", text).strip()
+    if not text:
+        return []
     chunks, i = [], 0
+    step = max(1, size - overlap)
     while i < len(text):
         chunks.append(text[i:i+size])
+        i += step
     return chunks
 
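Note on _chunk: the index advances by step = size - overlap characters, so consecutive fragments share an overlap-sized window and text cut at a fragment boundary still appears whole in the neighbouring fragment. An illustrative sanity check with toy values (not part of the commit):

    >>> _chunk("abcdefghij", size=4, overlap=2)   # step = max(1, 4 - 2) = 2
    ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
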
+# ------------------ Indexing ------------------
+INDEX_READY = False
 corpus, sources = [], []
+indexed_files, skipped_files = [], []
 
+def build_index():
+    global corpus, sources, INDEX_READY, indexed_files, skipped_files, vectorizer, X
+    corpus, sources = [], []
+    indexed_files, skipped_files = [], []
+    paths = sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx")))
+    for path in paths:
+        try:
+            txt = _read_docx(path)
+            chs = _chunk(txt)
+            if chs:
+                corpus.extend(chs)
+                sources.extend([path] * len(chs))
+                indexed_files.append(os.path.basename(path))
+            else:
+                skipped_files.append((os.path.basename(path), "Sin texto utilizable"))
+        except Exception as e:
+            skipped_files.append((os.path.basename(path), f"Error al leer: {e}"))
+    if not corpus:
+        corpus = ["(No hay texto indexado: agregá .docx con contenido)"]
+        sources = [""]
+    # Vectorizer without 'spanish' stop words (scikit-learn does not ship them); use our own list
+    vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
+    X = vectorizer.fit_transform(corpus)
+    INDEX_READY = True
 
+# Build the index at load time
+build_index()
 
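Retrieval here is plain TF-IDF plus cosine similarity: build_index fits the vectorizer on the chunk list once, and answer_fn below reuses it for each query. A self-contained sketch of the same technique on toy data (the strings are illustrative, not taken from the Space's documents):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    toy_corpus = ["el contrato de alquiler", "la factura de luz", "el contrato de trabajo"]
    vec = TfidfVectorizer(lowercase=True)
    M = vec.fit_transform(toy_corpus)            # one TF-IDF row per fragment
    q = vec.transform(["contrato de trabajo"])   # query vectorized with the same vocabulary
    sims = cosine_similarity(q, M).ravel()       # one similarity score per fragment
    best = sims.argsort()[::-1][:2]              # indices of the two best fragments
    print([toy_corpus[i] for i in best])         # ['el contrato de trabajo', 'el contrato de alquiler']
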
+# ------------------ UI functions ------------------
 def answer_fn(message, history):
+    if not INDEX_READY or "(No hay texto indexado" in corpus[0]:
+        return "No hay texto indexado aún. Verificá que los .docx tengan contenido."
+    try:
+        q = vectorizer.transform([message])
+        sims = cosine_similarity(q, X).ravel()
+        top_idx = sims.argsort()[::-1][:TOP_K]
+        bullets = []
+        for i in top_idx:
+            frag = corpus[i]
+            src = os.path.basename(sources[i])
+            bullets.append(f"**{src}** · …{frag[:420]}…")
+        return "Fragmentos relevantes:\n\n- " + "\n- ".join(bullets)
+    except Exception as e:
+        return f"Se produjo un error al responder: {e}"
+
+def status_fn():
+    """Return a small status report for debugging without opening the Logs."""
+    lines = []
+    if indexed_files:
+        lines.append("**Archivos indexados:**")
+        for f in indexed_files:
+            lines.append(f"- {f}")
+    if skipped_files:
+        lines.append("\n**Archivos saltados:**")
+        for f, why in skipped_files:
+            lines.append(f"- {f}: {why}")
+    if not lines:
+        lines.append("No se encontró ningún .docx en el directorio.")
+    return "\n".join(lines)
 
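Because answer_fn and status_fn are ordinary functions, they can be exercised from a local Python session before launching the interface; the query string below is only an example:

    >>> print(status_fn())                            # lists indexed and skipped .docx files
    >>> print(answer_fn("consulta de ejemplo", []))   # top-K fragments, or the "no index" notice
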
+# ------------------ Gradio interface ------------------
+with gr.Blocks() as demo:
+    gr.Markdown("## Chat de documentos (DOCX)")
+    gr.Markdown(
+        "Escribí tu consulta y te devuelvo fragmentos relevantes. "
+        "Usá la pestaña **Estado** para ver qué archivos se indexaron."
+    )
+    with gr.Tabs():
+        with gr.Tab("Chat"):
+            chat = gr.ChatInterface(fn=answer_fn, title=None, description=None)
+        with gr.Tab("Estado"):
+            btn = gr.Button("Actualizar estado")
+            out = gr.Markdown(status_fn())
+            btn.click(fn=lambda: status_fn(), outputs=out)
 
 if __name__ == "__main__":
     demo.launch()
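The code depends on three third-party packages whose pip names differ from the import names; a quick local check, illustrative only (the Space's actual requirements file is not part of this commit):

    # pip install gradio python-docx scikit-learn
    import gradio, docx, sklearn
    print("deps ok:", gradio.__version__, sklearn.__version__)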