SurFuturo committed on
Commit
d727f97
·
verified ·
1 Parent(s): fe85f61

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -24
app.py CHANGED
@@ -4,14 +4,20 @@ from docx import Document
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
 
 
 
7
  # ------------------ Config ------------------
8
- DOCS_DIR = "." # carpeta con los .docx (raíz del Space)
9
- CHUNK_SIZE = 900 # longitud de fragmento (caracteres)
10
- OVERLAP = 150 # solapamiento entre fragmentos
11
- TOP_K = 3 # cantidad de fragmentos a devolver
 
 
 
12
 
13
- # Stopwords básicas en español (conjunto -> luego lo convertimos a list())
14
- SPANISH_STOPWORDS = {
15
  "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
16
  "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
17
  "ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también",
@@ -24,7 +30,7 @@ SPANISH_STOPWORDS = {
24
  "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
25
  "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
26
  "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
27
- }
28
 
29
  # ------------------ Utilidades ------------------
30
  def _read_docx(path: str) -> str:
@@ -66,28 +72,73 @@ def build_index():
66
  skipped_files.append((os.path.basename(path), f"Error al leer: {e}"))
67
 
68
  if not corpus:
69
- corpus = ["(No hay texto indexado: agregá .docx con contenido)"]
70
- sources = [""]
71
 
72
- # IMPORTANTE: pasar una LISTA, no set
73
- vectorizer = TfidfVectorizer(stop_words=list(SPANISH_STOPWORDS), lowercase=True)
74
  X = vectorizer.fit_transform(corpus)
75
 
76
  build_index()
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  # ------------------ Funciones UI ------------------
79
def answer_fn(message, history):
    """Return the TOP_K indexed chunks most similar to *message*.

    Args:
        message: user query string.
        history: chat history (unused; required by gr.ChatInterface).

    Returns:
        A Markdown string listing the closest fragments with their source
        file names, or a notice when nothing has been indexed yet.
    """
    # Sentinel chunk inserted by build_index() when no .docx had content.
    if "(No hay texto indexado" in corpus[0]:
        return "No hay texto indexado aún. Verificá que los .docx tengan contenido."
    query_vec = vectorizer.transform([message])
    similarity = cosine_similarity(query_vec, X).ravel()
    # Indices of the TOP_K most similar chunks, best first.
    ranked = similarity.argsort()[::-1][:TOP_K]
    bullets = [
        f"**{os.path.basename(sources[idx])}** · …{corpus[idx][:420]}…"
        for idx in ranked
    ]
    return "Fragmentos relevantes:\n\n- " + "\n- ".join(bullets)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def status_fn():
93
  lines = []
@@ -105,11 +156,22 @@ def status_fn():
105
 
106
  # ------------------ Interfaz Gradio ------------------
107
  with gr.Blocks() as demo:
108
- gr.Markdown("## Chat de documentos (DOCX)")
109
- gr.Markdown("Escribí tu consulta y te devuelvo fragmentos relevantes. Revisá **Estado** para ver indexación.")
 
 
 
 
110
  with gr.Tabs():
111
  with gr.Tab("Chat"):
112
- gr.ChatInterface(fn=answer_fn)
 
 
 
 
 
 
 
113
  with gr.Tab("Estado"):
114
  btn = gr.Button("Actualizar estado")
115
  out = gr.Markdown(status_fn())
 
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
7
+ # --- QA (transformers) ---
8
+ from transformers import pipeline
9
+
10
  # ------------------ Config ------------------
11
+ DOCS_DIR = "." # .docx en la raíz del Space
12
+ CHUNK_SIZE = 900 # longitud del fragmento (caracteres)
13
+ OVERLAP = 150 # solapamiento
14
+ TOP_K_RETRIEVE = 5 # fragmentos candidatos para QA
15
+ TOP_K_SHOW = 3 # fragmentos a mostrar en modo "fragmentos"
16
+ QA_MODEL = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
17
+ QA_THRESHOLD = 0.25 # umbral mínimo de confianza del modelo QA
18
 
19
+ # Stopwords (lista breve en español)
20
+ SPANISH_STOPWORDS = [
21
  "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
22
  "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
23
  "ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también",
 
30
  "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
31
  "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
32
  "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
33
+ ]
34
 
35
  # ------------------ Utilidades ------------------
36
  def _read_docx(path: str) -> str:
 
72
  skipped_files.append((os.path.basename(path), f"Error al leer: {e}"))
73
 
74
  if not corpus:
75
+ corpus[:] = ["(No hay texto indexado: agregá .docx con contenido)"]
76
+ sources[:] = [""]
77
 
78
+ vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
 
79
  X = vectorizer.fit_transform(corpus)
80
 
81
  build_index()
82
 
83
+ # ------------------ QA ------------------
84
+ qa = pipeline("question-answering", model=QA_MODEL)
85
+
86
def answer_qa(question: str):
    """Run extractive QA over the TOP_K_RETRIEVE most relevant chunks.

    Retrieves candidate chunks by TF-IDF cosine similarity, runs the QA
    pipeline on each one, and keeps the highest-scoring answer.

    Args:
        question: user question string.

    Returns:
        A dict with keys "text", "score", "source" and "context" for the
        best candidate, or None when there are no candidates at all.
    """
    query_vec = vectorizer.transform([question])
    similarity = cosine_similarity(query_vec, X).ravel()
    # Candidate chunk indices, most similar first.
    candidate_idx = similarity.argsort()[::-1][:TOP_K_RETRIEVE]

    def _evaluate(idx):
        # QA pipeline result: {'score': float, 'start': int, 'end': int, 'answer': str}
        chunk = corpus[idx]
        result = qa(question=question, context=chunk)
        return {
            "text": result.get("answer", "").strip(),
            "score": float(result.get("score", 0.0)),
            "source": os.path.basename(sources[idx]),
            "context": chunk,
        }

    scored = [_evaluate(idx) for idx in candidate_idx]
    # max() returns the first maximum, matching the original strict-> comparison.
    return max(scored, key=lambda c: c["score"]) if scored else None
107
+
108
  # ------------------ Funciones UI ------------------
109
def chat_fn(message, history, modo_qa):
    """Chat handler: extractive QA with a fragment-retrieval fallback.

    Args:
        message: user question string.
        history: chat history (unused; required by gr.ChatInterface).
        modo_qa: when truthy, try the QA model first; if its confidence is
            below QA_THRESHOLD, fall back to showing raw fragments.

    Returns:
        A Markdown string with either the QA answer (plus source and
        confidence) or the TOP_K_SHOW closest fragments.
    """
    # Sentinel chunk inserted by build_index() when no .docx had content.
    if "(No hay texto indexado" in corpus[0]:
        return "No hay texto indexado aún. Verificá que los .docx tengan contenido."

    if modo_qa:
        best = answer_qa(message)
        if best and best["text"] and best["score"] >= QA_THRESHOLD:
            return f"**Respuesta:** {best['text']}\n\n**Fuente:** {best['source']} \n*(confianza: {best['score']:.2f})*"
        # Low QA confidence: fall back to the closest fragments.
        return (
            "No puedo responder con suficiente confianza. Te dejo los fragmentos más cercanos:\n\n- "
            + "\n- ".join(_top_fragments(message))
        )

    # Fragment-only mode (QA disabled).
    return "Fragmentos relevantes:\n\n- " + "\n- ".join(_top_fragments(message))


def _top_fragments(message):
    """Return Markdown bullets for the TOP_K_SHOW chunks most similar to *message*.

    Extracted helper: this retrieval snippet was duplicated verbatim in both
    branches of chat_fn.
    """
    query_vec = vectorizer.transform([message])
    similarity = cosine_similarity(query_vec, X).ravel()
    top_idx = similarity.argsort()[::-1][:TOP_K_SHOW]
    return [
        f"**{os.path.basename(sources[i])}** · …{corpus[i][:420]}…"
        for i in top_idx
    ]
142
 
143
  def status_fn():
144
  lines = []
 
156
 
157
  # ------------------ Interfaz Gradio ------------------
158
  with gr.Blocks() as demo:
159
+ gr.Markdown("## Chat de documentos (DOCX) — con respuesta natural (QA)")
160
+ gr.Markdown(
161
+ "Activá **Respuesta natural (QA)** para que el sistema intente contestar en español "
162
+ "a partir del fragmento más relevante; si la confianza es baja, mostrará fragmentos."
163
+ )
164
+
165
  with gr.Tabs():
166
  with gr.Tab("Chat"):
167
+ modo_qa = gr.Checkbox(label="Respuesta natural (QA)", value=True)
168
+ chat = gr.ChatInterface(
169
+ fn=lambda msg, hist: chat_fn(msg, hist, modo_qa.value),
170
+ title=None, description=None
171
+ )
172
+ # Vincular el checkbox al chat (simple workaround)
173
+ modo_qa.change(fn=lambda x: None, inputs=modo_qa, outputs=[])
174
+
175
  with gr.Tab("Estado"):
176
  btn = gr.Button("Actualizar estado")
177
  out = gr.Markdown(status_fn())