Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -3,85 +3,137 @@ import gradio as gr
|
|
| 3 |
import chromadb
|
| 4 |
import pandas as pd
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
-
import re
|
|
|
|
| 7 |
|
| 8 |
# --- 1. SETUP MODELS AND DATABASE ---
|
| 9 |
|
| 10 |
print("Loading embedding model...")
|
| 11 |
-
# Using the recommended Portuguese model
|
| 12 |
embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
|
| 13 |
|
| 14 |
client = chromadb.Client()
|
| 15 |
collection = client.get_or_create_collection(
|
| 16 |
-
name="
|
| 17 |
metadata={"hnsw:space": "cosine"}
|
| 18 |
)
|
| 19 |
print("ChromaDB collection ready.")
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def index_transcript(transcript_text):
    """Split the pasted transcript into sentence chunks and index them in ChromaDB.

    Returns a (status_message, preview_dataframe) pair for the Gradio outputs.
    """
    if not transcript_text.strip():
        return "Please paste a transcript before indexing.", pd.DataFrame()

    # --- THE FIX: Remove NLTK and use a reliable Regex to split sentences ---
    # This splits the text after any period, question mark, or exclamation point.
    chunks = re.split(r'(?<=[.!?])\s+', transcript_text)

    # Clean up any empty strings or very short fragments that might result
    chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]

    # Debugging logs to confirm the chunking
    print("--- CHUNKING DEBUG ---")
    print(f"Number of chunks created: {len(chunks)}")
    print(f"Chunks found: {chunks}")
    print("--- END DEBUGGING ---")

    ids = [f"chunk_{i}" for i in range(len(chunks))]

    # Clear previous entries before adding new ones
    if collection.count() > 0:
        collection.delete(ids=collection.get()['ids'])

    # NOTE(review): no explicit embeddings passed here — Chroma will embed the
    # documents with its own default embedding function, not the Portuguese
    # SentenceTransformer loaded above. Verify this is intended.
    collection.add(documents=chunks, ids=ids)
    indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
    return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df
|
| 49 |
|
| 50 |
-
def
|
| 51 |
-
|
| 52 |
if not query.strip():
|
| 53 |
-
return pd.DataFrame()
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
documents = results['documents'][0]
|
| 58 |
distances = results['distances'][0]
|
| 59 |
similarities = [f"{1 - dist:.2f}" for dist in distances]
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
"Matching Chunk": documents
|
| 63 |
-
})
|
| 64 |
-
return df, "Search complete."
|
| 65 |
|
| 66 |
-
# ---
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
|
| 70 |
-
gr.Markdown("# 🤖 Guideline Compliance Prototype")
|
| 71 |
with gr.Row():
|
|
|
|
| 72 |
with gr.Column(scale=1):
|
| 73 |
-
gr.
|
| 74 |
-
transcript_input = gr.Textbox(lines=15, label="Paste Full Transcript Here", value=sample_transcript)
|
| 75 |
index_button = gr.Button("Index Transcript", variant="primary")
|
| 76 |
-
index_status = gr.Label(
|
| 77 |
-
indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview"
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
|
| 85 |
-
|
|
|
|
| 86 |
|
| 87 |
demo.launch()
|
|
|
|
| 3 |
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import numpy

# --- 1. SETUP MODELS AND DATABASE ---

print("Loading embedding model...")
# Portuguese STS-tuned BERT-large sentence-embedding model.
embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')

# NOTE(review): chromadb.Client() is the ephemeral in-memory client — the index
# is lost on restart; confirm persistence is not required for this Space.
client = chromadb.Client()
collection = client.get_or_create_collection(
    name="transcript_comparison_app",
    # Cosine distance, so the search functions can report 1 - distance as similarity.
    metadata={"hnsw:space": "cosine"}
)
print("ChromaDB collection ready.")
|
| 20 |
|
| 21 |
+
|
| 22 |
+
# --- 2. NEW: DEFINE AND PRE-COMPUTE GUIDELINE PROFILES ---

# Positive example phrases (Portuguese) for each guideline under test.
GUIDELINE_PROFILES = {
    "Agent Empathy": [
        "Sinto muito por esse transtorno.",
        "Eu entendo completamente sua frustração.",
        "Imagino como isso deve ser chato, vamos resolver.",
        "Lamento que você tenha passado por isso.",
        "Compreendo sua situação e peço desculpas pelo ocorrido.",
    ],
    "Problem Resolution Offer": [
        "Para resolver isso, posso te oferecer duas opções.",
        "Temos algumas alternativas para solucionar seu problema.",
        "A solução que posso propor é a seguinte.",
        "Vamos encontrar uma forma de resolver isso para você.",
    ],
    "Polite Closing": [
        "Obrigado por sua ligação, tenha um ótimo dia.",
        "Agradecemos seu contato.",
        "Se precisar de mais alguma coisa, é só ligar.",
        "Tenha uma excelente semana.",
    ],
}

# Pre-compute one averaged ("centroid") embedding per guideline at app startup,
# so each profile search is a single vector query.
print("Computing guideline profile embeddings...")
profile_embeddings = {
    profile: numpy.mean(embedding_model.encode(examples), axis=0)
    for profile, examples in GUIDELINE_PROFILES.items()
}
print("✅ Guideline profiles are ready.")


# --- 3. CORE FUNCTIONS ---
|
| 57 |
+
|
| 58 |
def index_transcript(transcript_text):
    """Split the transcript into sentence chunks, embed them, and (re)index them.

    Args:
        transcript_text: Raw transcript text pasted by the user.

    Returns:
        A (status_message, preview_dataframe) pair for the Gradio outputs.
    """
    if not transcript_text.strip():
        return "Please paste a transcript before indexing.", pd.DataFrame()

    # Split after sentence-ending punctuation (., !, ?) followed by whitespace.
    chunks = re.split(r'(?<=[.!?])\s+', transcript_text)
    # Drop empty strings and very short fragments left over from splitting.
    chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]

    # Guard: if the filter removed everything, there is nothing to index and
    # Chroma's add() rejects empty id/document lists.
    if not chunks:
        return "No indexable sentences found in the transcript.", pd.DataFrame()

    ids = [f"chunk_{i}" for i in range(len(chunks))]

    # Clear previous entries so each indexing run replaces the old transcript.
    if collection.count() > 0:
        collection.delete(ids=collection.get()['ids'])

    # Embed explicitly with the Portuguese model so indexing and search share
    # the same embedding space (Chroma's default embedder would differ).
    collection.add(embeddings=embedding_model.encode(chunks).tolist(), documents=chunks, ids=ids)
    indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
    return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df
|
| 70 |
|
| 71 |
+
def search_with_single_query(query):
    """Method 1: search the indexed chunks with a single free-text query.

    Args:
        query: Free-text query typed by the user.

    Returns:
        A DataFrame of top matches with cosine-similarity scores, or an empty
        DataFrame when the query is blank or nothing has been indexed yet.
    """
    if not query.strip():
        return pd.DataFrame()

    # Guard: with an empty collection there is nothing to match, and Chroma
    # complains when n_results exceeds the number of stored items.
    count = collection.count()
    if count == 0:
        return pd.DataFrame()

    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=min(3, count))
    documents = results['documents'][0]
    distances = results['distances'][0]
    # The collection uses cosine distance, so similarity = 1 - distance.
    similarities = [f"{1 - dist:.2f}" for dist in distances]
    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})
|
| 81 |
+
|
| 82 |
+
def search_with_profile(guideline_name):
    """Method 2: search the indexed chunks with a pre-computed guideline profile.

    Args:
        guideline_name: A key of GUIDELINE_PROFILES selected in the dropdown.

    Returns:
        A DataFrame of top matches with cosine-similarity scores, or an empty
        DataFrame when no guideline is selected or nothing has been indexed.
    """
    if not guideline_name:
        return pd.DataFrame()

    # Guard: mirror search_with_single_query — an empty collection has nothing
    # to match, and n_results must not exceed the number of stored items.
    count = collection.count()
    if count == 0:
        return pd.DataFrame()

    # Averaged profile embedding computed once at startup.
    profile_embedding = profile_embeddings[guideline_name].tolist()
    results = collection.query(query_embeddings=[profile_embedding], n_results=min(3, count))
    documents = results['documents'][0]
    distances = results['distances'][0]
    # The collection uses cosine distance, so similarity = 1 - distance.
    similarities = [f"{1 - dist:.2f}" for dist in distances]
    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})
|
| 92 |
+
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
# --- 4. GRADIO INTERFACE FOR COMPARISON ---

# Pre-filled demo transcript (Portuguese) exercising all three guideline profiles.
sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
Atendente: Puxa, que chato isso. Lamento que você tenha passado por isso. Pode me informar o número do pedido para eu localizar sua compra?
Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
Atendente: Entendo perfeitamente sua frustração. Para resolver isso, posso te oferecer duas opções.
Cliente: Prefiro receber um novo.
Atendente: Combinado. Obrigado por sua ligação, tenha um ótimo dia.
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 Search Method Comparison")
    gr.Markdown("Index a transcript once, then search using both methods to compare the results.")

    with gr.Row():
        # Indexing column is the same
        with gr.Column(scale=1):
            transcript_input = gr.Textbox(lines=15, label="Paste Transcript Here", value=sample_transcript)
            index_button = gr.Button("Index Transcript", variant="primary")
            index_status = gr.Label()
            indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview")

    gr.HTML("<hr>")  # Add a horizontal line for separation

    with gr.Row():
        # Column for the simple, single query search
        with gr.Column():
            gr.Markdown("### Method 1: Single Query Search")
            query_input = gr.Textbox(label="Enter a Simple Query", placeholder="Ex: o agente foi empático?")
            search_button_single = gr.Button("Search Single Query")
            results_output_single = gr.DataFrame(label="Single Query Results")

        # Column for the new, profile-based search
        with gr.Column():
            gr.Markdown("### Method 2: Guideline Profile Search")
            profile_input = gr.Dropdown(choices=list(GUIDELINE_PROFILES.keys()), label="Select a Guideline Profile")
            search_button_profile = gr.Button("Search with Profile", variant="primary")
            results_output_profile = gr.DataFrame(label="Profile Search Results")

    # Wire up the components
    index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
    search_button_single.click(fn=search_with_single_query, inputs=[query_input], outputs=[results_output_single])
    search_button_profile.click(fn=search_with_profile, inputs=[profile_input], outputs=[results_output_profile])

demo.launch()
|