import glob
import os
import re
import gradio as gr
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- QA (transformers) ---
from transformers import pipeline

# ------------------ Config ------------------
DOCS_DIR = "."          # .docx en la raíz del Space
CHUNK_SIZE = 900        # longitud del fragmento (caracteres)
OVERLAP = 150           # solapamiento
TOP_K_RETRIEVE = 5      # fragmentos candidatos para QA
TOP_K_SHOW = 3          # fragmentos a mostrar en modo "fragmentos"
QA_MODEL = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
QA_THRESHOLD = 0.25     # umbral mínimo de confianza del modelo QA
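
# Note: with CHUNK_SIZE=900 and OVERLAP=150 the window advances 900 - 150 = 750
# characters per chunk, so a 7,500-character document yields exactly ten chunks.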

# Stopwords (short Spanish list)
SPANISH_STOPWORDS = [
    "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
    "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
    "ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también",
    "me","hasta","hay","donde","quien","desde","todo","nos","durante","todos","uno",
    "les","ni","contra","otros","ese","eso","ante","ellos","e","esto","mí","antes",
    "algunos","qué","unos","yo","otro","otras","otra","él","tanto","esa","estos",
    "mucho","quienes","nada","muchos","cual","poco","ella","estar","estas","algunas",
    "algo","nosotros","mi","mis","tú","te","ti","tu","tus","ellas","nosotras","vosotros",
    "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
    "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
    "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
    "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
]

# ------------------ Utilities ------------------
def _read_docx(path: str) -> str:
    doc = Document(path)
    parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
    return "\n".join(parts)

def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    chunks, i = [], 0
    step = max(1, size - overlap)
    while i < len(text):
        chunks.append(text[i:i+size])
        i += step
    return chunks
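# Worked example with small hypothetical values (the app itself uses CHUNK_SIZE/OVERLAP):
#   _chunk("abcdefgh", size=5, overlap=2) -> ["abcde", "defgh", "gh"]
# The step is size - overlap = 3, so consecutive chunks share their boundary characters.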

# ------------------ Indexing ------------------
corpus, sources = [], []
indexed_files, skipped_files = [], []

def build_index():
    global corpus, sources, indexed_files, skipped_files, vectorizer, X
    corpus, sources = [], []
    indexed_files, skipped_files = [], []

    for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx"))):
        try:
            txt = _read_docx(path)
            chs = _chunk(txt)
            if chs:
                corpus.extend(chs)
                sources.extend([path] * len(chs))
                indexed_files.append(os.path.basename(path))
            else:
                skipped_files.append((os.path.basename(path), "No usable text"))
        except Exception as e:
            skipped_files.append((os.path.basename(path), f"Read error: {e}"))

    if not corpus:
        corpus[:] = ["(No hay texto indexado: agregá .docx con contenido)"]
        sources[:] = [""]

    vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
    X = vectorizer.fit_transform(corpus)

build_index()
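# The index is built once at import time. If new .docx files are added while the
# app is running, call build_index() again (e.g. from a refresh button) to pick them up.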

# ------------------ QA ------------------
qa = pipeline("question-answering", model=QA_MODEL)
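# Note: this is an extractive QA model (SQuAD 2.0 style), so each answer is a literal
# span copied from the retrieved chunk, not generated text. Weights download on first use.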

def answer_qa(question: str):
    """Corre QA sobre los TOP_K_RETRIEVE fragmentos y devuelve mejor respuesta."""
    q = vectorizer.transform([question])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:TOP_K_RETRIEVE]

    best = None
    for i in top_idx:
        context = corpus[i]
        res = qa(question=question, context=context)
        # res: {'score': float, 'start': int, 'end': int, 'answer': str}
        candidate = {
            "text": res.get("answer", "").strip(),
            "score": float(res.get("score", 0.0)),
            "source": os.path.basename(sources[i]),
            "context": context
        }
        if not best or candidate["score"] > best["score"]:
            best = candidate

    return best
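# Example call (hypothetical question; assumes the indexed documents can answer it):
#   best = answer_qa("¿Quién firma el informe?")
#   if best and best["score"] >= QA_THRESHOLD:
#       print(best["text"], best["source"], best["score"])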

# ------------------ UI functions ------------------
def _top_fragments(message: str, k: int):
    """Return the k most similar chunks, formatted as markdown bullet items."""
    q = vectorizer.transform([message])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:k]
    bullets = []
    for i in top_idx:
        frag = corpus[i]
        src = os.path.basename(sources[i])
        bullets.append(f"**{src}** · …{frag[:420]}…")
    return bullets

def chat_fn(message, history, modo_qa):
    if not indexed_files:
        return "No text indexed yet. Check that the .docx files have content."

    if modo_qa:
        best = answer_qa(message)
        if best and best["text"] and best["score"] >= QA_THRESHOLD:
            return (
                f"**Answer:** {best['text']}\n\n"
                f"**Source:** {best['source']}  \n*(confidence: {best['score']:.2f})*"
            )
        # fall back to plain fragments when QA confidence is low
        bullets = _top_fragments(message, TOP_K_SHOW)
        return (
            "I can't answer with enough confidence. Here are the closest fragments:\n\n- "
            + "\n- ".join(bullets)
        )

    # fragments mode
    bullets = _top_fragments(message, TOP_K_SHOW)
    return "Relevant fragments:\n\n- " + "\n- ".join(bullets)

def status_fn():
    lines = []
    if indexed_files:
        lines.append("**Indexed files:**")
        for f in indexed_files:
            lines.append(f"- {f}")
    if skipped_files:
        lines.append("\n**Skipped files:**")
        for f, why in skipped_files:
            lines.append(f"- {f}: {why}")
    if not lines:
        lines.append("No .docx files were found in the directory.")
    return "\n".join(lines)

# ------------------ Gradio UI ------------------
with gr.Blocks() as demo:
    gr.Markdown("## Document chat (DOCX) with natural answers (QA)")
    gr.Markdown(
        "Enable **Natural answer (QA)** to let the system try to answer in Spanish "
        "from the most relevant fragment; when confidence is low, it shows fragments instead."
    )

    with gr.Tabs():
        with gr.Tab("Chat"):
            # render=False keeps the checkbox out of the Blocks layout so that
            # ChatInterface can wire it up as a live additional input. (Reading
            # modo_qa.value inside a lambda, as before, only ever returns the
            # initial value, never the current UI state.)
            modo_qa = gr.Checkbox(label="Natural answer (QA)", value=True, render=False)
            chat = gr.ChatInterface(
                fn=chat_fn,
                additional_inputs=[modo_qa],
                title=None, description=None
            )

        with gr.Tab("Status"):
            btn = gr.Button("Refresh status")
            out = gr.Markdown(status_fn())
            btn.click(fn=status_fn, outputs=out)

if __name__ == "__main__":
    demo.launch()