SurFuturo committed
Commit 8667b65 · verified · 1 Parent(s): 5162b8e

Update app.py

Files changed (1):
  1. app.py +92 -49
app.py CHANGED
@@ -1,14 +1,16 @@
- import os, glob, re
+ import os, glob, re, traceback
import gradio as gr
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

- DOCS_DIR = "."
- CHUNK_SIZE = 900
- OVERLAP = 150
+ # ------------------ Config ------------------
+ DOCS_DIR = "."    # folder containing the .docx files
+ CHUNK_SIZE = 900  # chunk length (characters)
+ OVERLAP = 150     # overlap between chunks
+ TOP_K = 3         # number of chunks to return

- # Basic Spanish stopwords
+ # Basic Spanish stopwords (short list, enough for an MVP)
SPANISH_STOPWORDS = {
    "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
    "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
@@ -21,66 +23,107 @@ SPANISH_STOPWORDS = {
    "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
    "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
    "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
-     "estáis","están","esté","estés","estemos","estéis","estén","estar","he","has",
-     "ha","hemos","habéis","han","haya","hayas","hayamos","hayáis","hayan","sea","seas",
-     "seamos","seáis","sean","ser","son","era","eras","éramos","erais","eran"
+     "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
}

- def _read_docx(path):
+ # ------------------ Utilities ------------------
+ def _read_docx(path: str) -> str:
+     """Read the text of a .docx; on failure, raise so the caller can handle it."""
    doc = Document(path)
-     full_text = []
-     for para in doc.paragraphs:
-         if para.text.strip():
-             full_text.append(para.text)
-     return "\n".join(full_text)
+     parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
+     return "\n".join(parts)

- def _chunk(text, size=CHUNK_SIZE, overlap=OVERLAP):
+ def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
+     """Split text into overlapping chunks for later ranking."""
    text = re.sub(r"\s+", " ", text).strip()
+     if not text:
+         return []
    chunks, i = [], 0
+     step = max(1, size - overlap)
    while i < len(text):
        chunks.append(text[i:i+size])
-         i += (size - overlap)
+         i += step
    return chunks

- # 1) Load the DOCX documents
- docs = []
- for path in glob.glob(os.path.join(DOCS_DIR, "*.docx")):
-     docs.append((path, _read_docx(path)))
-
+ # ------------------ Indexing ------------------
+ INDEX_READY = False
corpus, sources = [], []
- for src, fulltext in docs:
-     for ch in _chunk(fulltext):
-         corpus.append(ch)
-         sources.append(src)
+ indexed_files, skipped_files = [], []

- if not corpus:
-     corpus = ["(Aún no subiste documentos .docx)"]
-     sources = [""]
+ def build_index():
+     global corpus, sources, INDEX_READY, indexed_files, skipped_files, vectorizer, X
+     corpus, sources = [], []
+     indexed_files, skipped_files = [], []
+     paths = sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx")))
+     for path in paths:
+         try:
+             txt = _read_docx(path)
+             chs = _chunk(txt)
+             if chs:
+                 corpus.extend(chs)
+                 sources.extend([path] * len(chs))
+                 indexed_files.append(os.path.basename(path))
+             else:
+                 skipped_files.append((os.path.basename(path), "Sin texto utilizable"))
+         except Exception as e:
+             skipped_files.append((os.path.basename(path), f"Error al leer: {e}"))
+     if not corpus:
+         corpus = ["(No hay texto indexado: agregá .docx con contenido)"]
+         sources = [""]
+     # scikit-learn ships no 'spanish' stop-word list, so we pass our own
+     vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
+     X = vectorizer.fit_transform(corpus)
+     INDEX_READY = True

- # 2) Vectorize with basic Spanish stopwords
- vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
- X = vectorizer.fit_transform(corpus)
+ # Build the index at startup
+ build_index()

- # 3) Answer function
+ # ------------------ UI functions ------------------
def answer_fn(message, history):
-     if "(Aún no" in corpus[0]:
-         return "No hay documentos .docx disponibles."
-     q = vectorizer.transform([message])
-     sims = cosine_similarity(q, X).ravel()
-     top_idx = sims.argsort()[::-1][:3]
-     bullets = []
-     for i in top_idx:
-         frag = corpus[i]
-         src = os.path.basename(sources[i])
-         bullets.append(f"**{src}** · …{frag[:420]}…")
-     return "Fragmentos relevantes:\n\n- " + "\n- ".join(bullets)
+     if not INDEX_READY or "(No hay texto indexado" in corpus[0]:
+         return "No hay texto indexado aún. Verificá que los .docx tengan contenido."
+     try:
+         q = vectorizer.transform([message])
+         sims = cosine_similarity(q, X).ravel()
+         top_idx = sims.argsort()[::-1][:TOP_K]
+         bullets = []
+         for i in top_idx:
+             frag = corpus[i]
+             src = os.path.basename(sources[i])
+             bullets.append(f"**{src}** · …{frag[:420]}…")
+         return "Fragmentos relevantes:\n\n- " + "\n- ".join(bullets)
+     except Exception as e:
+         return f"Se produjo un error al responder: {e}"
+
+ def status_fn():
+     """Return a small status report for debugging without opening the Logs tab."""
+     lines = []
+     if indexed_files:
+         lines.append("**Archivos indexados:**")
+         for f in indexed_files:
+             lines.append(f"- {f}")
+     if skipped_files:
+         lines.append("\n**Archivos saltados:**")
+         for f, why in skipped_files:
+             lines.append(f"- {f}: {why}")
+     if not lines:
+         lines.append("No se encontró ningún .docx en el directorio.")
+     return "\n".join(lines)

- # 4) Interface
- demo = gr.ChatInterface(
-     fn=answer_fn,
-     title="Chat de documentos (DOCX)",
-     description="Consultá los informes y obtené fragmentos relevantes."
- )
+ # ------------------ Gradio interface ------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("## Chat de documentos (DOCX)")
+     gr.Markdown(
+         "Escribí tu consulta y te devuelvo fragmentos relevantes. "
+         "Usá la pestaña **Estado** para ver qué archivos se indexaron."
+     )
+     with gr.Tabs():
+         with gr.Tab("Chat"):
+             chat = gr.ChatInterface(fn=answer_fn, title=None, description=None)
+         with gr.Tab("Estado"):
+             btn = gr.Button("Actualizar estado")
+             out = gr.Markdown(status_fn())
+             btn.click(fn=lambda: status_fn(), outputs=out)

if __name__ == "__main__":
    demo.launch()
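
For readers tracing the new retrieval path, here is a minimal standalone sketch of what build_index() and answer_fn() do with TF-IDF and cosine similarity; the sample chunks, file names, and query below are illustrative and not part of this Space:

```python
# Minimal sketch of the TF-IDF retrieval step (illustrative data, not from the repo).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Stand-ins for the chunks produced by _chunk() and their source files.
corpus = [
    "El informe anual describe los resultados del programa de becas.",
    "La encuesta muestra un aumento en la participación estudiantil.",
    "El presupuesto asigna fondos a infraestructura escolar.",
]
sources = ["informe.docx", "encuesta.docx", "presupuesto.docx"]

# A custom stop-word list is passed because scikit-learn only bundles an English one.
vectorizer = TfidfVectorizer(stop_words=["de", "la", "el", "los", "del", "en", "un"],
                             lowercase=True)
X = vectorizer.fit_transform(corpus)

# Rank chunks by cosine similarity to the query and keep the best matches (TOP_K in the app).
query = "participación de estudiantes"
sims = cosine_similarity(vectorizer.transform([query]), X).ravel()
for i in sims.argsort()[::-1][:2]:
    print(f"{sources[i]} ({sims[i]:.2f}): {corpus[i]}")
```

The same idea scales to the real corpus: fit once over all chunks in build_index(), then transform each incoming question in answer_fn() and rank.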