ibombonato commited on
Commit
d44b7ca
·
verified ·
1 Parent(s): caf6350

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +99 -47
app.py CHANGED
@@ -3,85 +3,137 @@ import gradio as gr
3
  import chromadb
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
6
- import re # Import the regular expression library
 
7
 
8
  # --- 1. SETUP MODELS AND DATABASE ---
9
 
10
  print("Loading embedding model...")
11
- # Using the recommended Portuguese model
12
  embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
13
 
14
  client = chromadb.Client()
15
  collection = client.get_or_create_collection(
16
- name="transcript_demo_final_v3",
17
  metadata={"hnsw:space": "cosine"}
18
  )
19
  print("ChromaDB collection ready.")
20
 
21
- # --- 2. CORE FUNCTIONS ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def index_transcript(transcript_text):
23
- """Chunks and indexes a full transcript into ChromaDB."""
24
  if not transcript_text.strip():
25
  return "Please paste a transcript before indexing.", pd.DataFrame()
26
-
27
- # --- THE FIX: Remove NLTK and use a reliable Regex to split sentences ---
28
- # This splits the text after any period, question mark, or exclamation point.
29
  chunks = re.split(r'(?<=[.!?])\s+', transcript_text)
30
-
31
- # Clean up any empty strings or very short fragments that might result
32
  chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
33
-
34
- # Debugging logs to confirm the chunking
35
- print("--- CHUNKING DEBUG ---")
36
- print(f"Number of chunks created: {len(chunks)}")
37
- print(f"Chunks found: {chunks}")
38
- print("--- END DEBUGGING ---")
39
-
40
  ids = [f"chunk_{i}" for i in range(len(chunks))]
41
-
42
- # Clear previous entries before adding new ones
43
  if collection.count() > 0:
44
  collection.delete(ids=collection.get()['ids'])
45
-
46
- collection.add(documents=chunks, ids=ids)
47
  indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
48
  return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df
49
 
50
- def search_transcript(query):
51
- """Searches the indexed transcript for a given query."""
52
  if not query.strip():
53
- return pd.DataFrame(), "Please enter a query."
54
- results = collection.query(query_texts=[query], n_results=3)
55
- if not results or not results['documents'][0]:
56
- return pd.DataFrame(), "No similar chunks found."
 
 
 
 
 
 
 
 
 
 
57
  documents = results['documents'][0]
58
  distances = results['distances'][0]
59
  similarities = [f"{1 - dist:.2f}" for dist in distances]
60
- df = pd.DataFrame({
61
- "Similarity Score": similarities,
62
- "Matching Chunk": documents
63
- })
64
- return df, "Search complete."
65
 
66
- # --- 3. GRADIO INTERFACE ---
67
- sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar? Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar. Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra? Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes."""
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
70
- gr.Markdown("# 🤖 Guideline Compliance Prototype")
71
  with gr.Row():
 
72
  with gr.Column(scale=1):
73
- gr.Markdown("### 1. Index a Transcript")
74
- transcript_input = gr.Textbox(lines=15, label="Paste Full Transcript Here", value=sample_transcript)
75
  index_button = gr.Button("Index Transcript", variant="primary")
76
- index_status = gr.Label(value="Status: Waiting for transcript.")
77
- indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview", interactive=False)
78
- with gr.Column(scale=1):
79
- gr.Markdown("### 2. Search for Compliance")
80
- query_input = gr.Textbox(label="Guideline Query", placeholder="Ex: O operador ofereceu duas opções?")
81
- search_button = gr.Button("Search", variant="primary")
82
- search_status = gr.Label(value="Status: Waiting for query.")
83
- results_output = gr.DataFrame(headers=["Similarity Score", "Matching Chunk"], label="Search Results (Top 3)")
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
85
- search_button.click(fn=search_transcript, inputs=[query_input], outputs=[results_output, search_status])
 
86
 
87
  demo.launch()
 
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import numpy

# --- 1. SETUP MODELS AND DATABASE ---

print("Loading embedding model...")
# Portuguese STS model; chunk and query embeddings must come from the SAME
# model so that cosine distances are comparable.
embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')

# In-memory ChromaDB client — the index is rebuilt on every app restart.
client = chromadb.Client()
collection = client.get_or_create_collection(
    name="transcript_comparison_app",
    # Cosine space so similarity can be reported as 1 - distance.
    metadata={"hnsw:space": "cosine"}
)
print("ChromaDB collection ready.")
 
21
+
22
+ # --- 2. NEW: DEFINE AND PRE-COMPUTE GUIDELINE PROFILES ---
23
+
24
+ # Define the positive phrases for each guideline you want to test
25
+ GUIDELINE_PROFILES = {
26
+ "Agent Empathy": [
27
+ "Sinto muito por esse transtorno.",
28
+ "Eu entendo completamente sua frustração.",
29
+ "Imagino como isso deve ser chato, vamos resolver.",
30
+ "Lamento que você tenha passado por isso.",
31
+ "Compreendo sua situação e peço desculpas pelo ocorrido."
32
+ ],
33
+ "Problem Resolution Offer": [
34
+ "Para resolver isso, posso te oferecer duas opções.",
35
+ "Temos algumas alternativas para solucionar seu problema.",
36
+ "A solução que posso propor é a seguinte.",
37
+ "Vamos encontrar uma forma de resolver isso para você."
38
+ ],
39
+ "Polite Closing": [
40
+ "Obrigado por sua ligação, tenha um ótimo dia.",
41
+ "Agradecemos seu contato.",
42
+ "Se precisar de mais alguma coisa, é só ligar.",
43
+ "Tenha uma excelente semana."
44
+ ]
45
+ }
46
+
47
+ # Pre-compute the averaged profile embeddings when the app starts
48
+ print("Computing guideline profile embeddings...")
49
+ profile_embeddings = {}
50
+ for guideline_name, phrases in GUIDELINE_PROFILES.items():
51
+ phrase_embeddings = embedding_model.encode(phrases)
52
+ profile_embeddings[guideline_name] = numpy.mean(phrase_embeddings, axis=0)
53
+ print("✅ Guideline profiles are ready.")
54
+
55
+
56
+ # --- 3. CORE FUNCTIONS ---
57
+
58
def index_transcript(transcript_text):
    """Split a transcript into sentence chunks and index them in ChromaDB.

    Args:
        transcript_text: The full transcript as a single string.

    Returns:
        A ``(status_message, preview_dataframe)`` tuple matching the two
        Gradio outputs wired to the "Index Transcript" button.
    """
    if not transcript_text.strip():
        return "Please paste a transcript before indexing.", pd.DataFrame()

    # Split after sentence-ending punctuation; the lookbehind keeps the
    # punctuation attached to its sentence.
    chunks = re.split(r'(?<=[.!?])\s+', transcript_text)
    # Drop empty strings and tiny fragments (stray punctuation, etc.).
    chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]

    # FIX: guard against zero usable chunks — encoding/adding empty lists
    # would otherwise fail inside the embedding model / ChromaDB.
    if not chunks:
        return "No usable sentences found in the transcript.", pd.DataFrame()

    ids = [f"chunk_{i}" for i in range(len(chunks))]

    # Clear any previously indexed transcript so search results only ever
    # reflect the transcript indexed most recently.
    if collection.count() > 0:
        collection.delete(ids=collection.get()['ids'])

    # Embed with the same model used at query time so distances are valid.
    collection.add(
        embeddings=embedding_model.encode(chunks).tolist(),
        documents=chunks,
        ids=ids,
    )

    indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
    return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df
70
 
71
def search_with_single_query(query):
    """Embed *query* and return the top-3 most similar indexed chunks.

    Args:
        query: Free-text query typed by the user.

    Returns:
        A DataFrame with ``Similarity`` (formatted as ``1 - cosine distance``)
        and ``Matching Chunk`` columns; empty DataFrame when the query is
        blank or nothing has been indexed yet.
    """
    if not query.strip():
        return pd.DataFrame()

    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=3)

    # FIX: restore the empty-results guard (present in the previous version)
    # so querying before indexing returns an empty table instead of erroring.
    if not results or not results['documents'] or not results['documents'][0]:
        return pd.DataFrame()

    documents = results['documents'][0]
    distances = results['distances'][0]
    # Cosine space: similarity = 1 - distance.
    similarities = [f"{1 - dist:.2f}" for dist in distances]
    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})
81
+
82
def search_with_profile(guideline_name):
    """Query the index using a pre-computed averaged guideline embedding.

    Args:
        guideline_name: A key of ``GUIDELINE_PROFILES`` selected in the
            dropdown.

    Returns:
        A DataFrame with ``Similarity`` and ``Matching Chunk`` columns;
        empty DataFrame for a blank/unknown guideline or when nothing has
        been indexed yet.
    """
    if not guideline_name:
        return pd.DataFrame()

    # FIX: use .get() so an unknown/stale dropdown value cannot raise a
    # KeyError inside the Gradio callback.
    profile_embedding = profile_embeddings.get(guideline_name)
    if profile_embedding is None:
        return pd.DataFrame()

    results = collection.query(
        query_embeddings=[profile_embedding.tolist()], n_results=3
    )

    # FIX: guard against empty results (e.g. searching before indexing).
    if not results or not results['documents'] or not results['documents'][0]:
        return pd.DataFrame()

    documents = results['documents'][0]
    distances = results['distances'][0]
    # Cosine space: similarity = 1 - distance.
    similarities = [f"{1 - dist:.2f}" for dist in distances]
    return pd.DataFrame({"Similarity": similarities, "Matching Chunk": documents})
92
+
 
 
 
93
 
94
+ # --- 4. GRADIO INTERFACE FOR COMPARISON ---
95
+
96
+ sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
97
+ Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
98
+ Atendente: Puxa, que chato isso. Lamento que você tenha passado por isso. Pode me informar o número do pedido para eu localizar sua compra?
99
+ Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
100
+ Atendente: Entendo perfeitamente sua frustração. Para resolver isso, posso te oferecer duas opções.
101
+ Cliente: Prefiro receber um novo.
102
+ Atendente: Combinado. Obrigado por sua ligação, tenha um ótimo dia.
103
+ """
104
+
105
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
106
+ gr.Markdown("# 🔍 Search Method Comparison")
107
+ gr.Markdown("Index a transcript once, then search using both methods to compare the results.")
108
 
 
 
109
  with gr.Row():
110
+ # Indexing column is the same
111
  with gr.Column(scale=1):
112
+ transcript_input = gr.Textbox(lines=15, label="Paste Transcript Here", value=sample_transcript)
 
113
  index_button = gr.Button("Index Transcript", variant="primary")
114
+ index_status = gr.Label()
115
+ indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview")
116
+
117
+ gr.HTML("<hr>") # Add a horizontal line for separation
118
+
119
+ with gr.Row():
120
+ # Column for the simple, single query search
121
+ with gr.Column():
122
+ gr.Markdown("### Method 1: Single Query Search")
123
+ query_input = gr.Textbox(label="Enter a Simple Query", placeholder="Ex: o agente foi empático?")
124
+ search_button_single = gr.Button("Search Single Query")
125
+ results_output_single = gr.DataFrame(label="Single Query Results")
126
+
127
+ # Column for the new, profile-based search
128
+ with gr.Column():
129
+ gr.Markdown("### Method 2: Guideline Profile Search")
130
+ profile_input = gr.Dropdown(choices=list(GUIDELINE_PROFILES.keys()), label="Select a Guideline Profile")
131
+ search_button_profile = gr.Button("Search with Profile", variant="primary")
132
+ results_output_profile = gr.DataFrame(label="Profile Search Results")
133
+
134
+ # Wire up the components
135
  index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
136
+ search_button_single.click(fn=search_with_single_query, inputs=[query_input], outputs=[results_output_single])
137
+ search_button_profile.click(fn=search_with_profile, inputs=[profile_input], outputs=[results_output_profile])
138
 
139
  demo.launch()