tuliodisanto committed
Commit 998b898 · verified · 1 Parent(s): ce9c0d7

Upload 2 files

Files changed (2):
  1. app.py +250 -0
  2. enhanced_search_v2.py +471 -0
app.py ADDED
@@ -0,0 +1,250 @@
# app.py (FINAL version - fixed the 401 error by renaming the Secret)

import pandas as pd
from flask import Flask, render_template, request, jsonify
import os
import sys
import traceback
import subprocess
from sentence_transformers import SentenceTransformer
import csv
from collections import defaultdict
import datetime
import re
from huggingface_hub import InferenceClient

# --- Feedback variables and constants ---
USER_FEEDBACK_FILE = 'user_feedback.csv'
USER_BEST_MATCHES_COUNTS = {}
USER_FEEDBACK_THRESHOLD = 3
FEEDBACK_CSV_COLUMNS = ['timestamp', 'query_original', 'query_normalized', 'tuss_code_submitted', 'tuss_code_raw_input', 'tuss_description_associated', 'rol_names_associated', 'feedback_type']
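# Sketch of the in-memory feedback structure rebuilt from user_feedback.csv
# (illustrative values only; the real contents come from the CSV):
#
#   USER_BEST_MATCHES_COUNTS = {
#       'hemograma completo': {'40304361': 5, '40304370': 1},
#   }
#
# A TUSS code counts as community-validated for a query once its vote count
# reaches USER_FEEDBACK_THRESHOLD.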

# --- START OF FIX: reading the Secret under its correct name ---
# Look up the Secret that was renamed to 'NOVITA_API_KEY'.
api_key = os.environ.get("NOVITA_API_KEY")
if not api_key:
    print("--- [AVISO CRÍTICO] Secret 'NOVITA_API_KEY' não encontrado. As chamadas para a IA irão falhar. ---")
    client_ia = None
else:
    client_ia = InferenceClient(
        provider="novita",
        api_key=api_key,
    )
    print("--- [SUCESSO] Cliente de Inferência da IA configurado com a chave correta. ---")
# --- END OF FIX ---


# --- Feedback functions (unchanged) ---
def normalize_text_for_feedback(text):
    if pd.isna(text):
        return ""
    from enhanced_search_v2 import normalize_text as es_normalize_text
    return es_normalize_text(str(text).strip())

def load_user_feedback():
    global USER_BEST_MATCHES_COUNTS
    USER_BEST_MATCHES_COUNTS = {}
    feedback_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), USER_FEEDBACK_FILE)
    if not os.path.exists(feedback_file_path):
        # Create the file with the expected header on first run.
        with open(feedback_file_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(FEEDBACK_CSV_COLUMNS)
        return
    try:
        with open(feedback_file_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            header = next(reader)
            if [col.strip() for col in header] != FEEDBACK_CSV_COLUMNS:
                raise ValueError("Cabeçalho inválido")
            for row in reader:
                if len(row) == len(FEEDBACK_CSV_COLUMNS):
                    row_dict = dict(zip(FEEDBACK_CSV_COLUMNS, row))
                    query_norm = row_dict.get('query_normalized', '')
                    tuss_code = row_dict.get('tuss_code_submitted', '')
                    if query_norm and tuss_code:
                        if query_norm not in USER_BEST_MATCHES_COUNTS:
                            USER_BEST_MATCHES_COUNTS[query_norm] = {}
                        USER_BEST_MATCHES_COUNTS[query_norm][tuss_code] = USER_BEST_MATCHES_COUNTS[query_norm].get(tuss_code, 0) + 1
        print(f"--- [SUCESSO] Feedback de usuário carregado. ---")
    except Exception as e:
        print(f"--- [ERRO] Falha ao carregar feedback: {e} ---")
        traceback.print_exc()

def append_user_feedback(query_original, tuss_code_submitted, feedback_type, tuss_code_raw_input=''):
    feedback_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), USER_FEEDBACK_FILE)
    query_normalized = normalize_text_for_feedback(query_original)
    tuss_descriptions, rol_names = [], []
    if DF_ORIGINAL is not None and not DF_ORIGINAL.empty:
        matching_rows = DF_ORIGINAL[DF_ORIGINAL['Codigo_TUSS'].astype(str) == tuss_code_submitted]
        if not matching_rows.empty:
            tuss_descriptions = matching_rows['Descricao_TUSS'].unique().tolist()
            rol_names = matching_rows['Procedimento_Rol'].unique().tolist()
    tuss_desc_assoc = " | ".join(filter(None, tuss_descriptions)) or 'Não encontrado'
    rol_names_assoc = " | ".join(filter(None, rol_names)) or 'Não encontrado'
    try:
        with open(feedback_file_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([datetime.datetime.now().isoformat(), query_original, query_normalized, tuss_code_submitted, tuss_code_raw_input, tuss_desc_assoc, rol_names_assoc, feedback_type])
        global USER_BEST_MATCHES_COUNTS
        if query_normalized not in USER_BEST_MATCHES_COUNTS:
            USER_BEST_MATCHES_COUNTS[query_normalized] = {}
        USER_BEST_MATCHES_COUNTS[query_normalized][tuss_code_submitted] = USER_BEST_MATCHES_COUNTS[query_normalized].get(tuss_code_submitted, 0) + 1
    except Exception as e:
        print(f"--- [ERRO] Falha ao adicionar feedback: {e} ---")
        traceback.print_exc()

# --- Script execution and imports (unchanged) ---
# ... (same code as before)

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
try:
    from enhanced_search_v2 import load_and_prepare_database, load_correction_corpus, load_general_dictionary, search_procedure_with_log
    print("--- [SUCESSO] Módulo 'enhanced_search_v2.py' importado. ---")
except Exception as e:
    print(f"--- [ERRO CRÍTICO] Não foi possível importar 'enhanced_search_v2.py': {e} ---")
    traceback.print_exc()
    sys.exit(1)

app = Flask(__name__)

# --- Data loading ---
DF_ORIGINAL, DF_NORMALIZED, FUZZY_CORPUS, IDF_SCORES, DB_WORD_SET = (None, None, None, None, set())
CORRECTION_CORPUS, NORMALIZED_CORRECTION_CORPUS = [], []
PORTUGUESE_WORD_SET = set()
SEMANTIC_MODEL = None

try:
    db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'rol_procedures_database.csv')
    DF_ORIGINAL, DF_NORMALIZED, FUZZY_CORPUS, IDF_SCORES, DB_WORD_SET = load_and_prepare_database(db_path)

    dict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Dic.csv')
    CORRECTION_CORPUS, NORMALIZED_CORRECTION_CORPUS = load_correction_corpus(dict_path, column_name='Termo_Correto')

    general_dict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dicionario_ptbr.txt')
    PORTUGUESE_WORD_SET = load_general_dictionary(general_dict_path)

    load_user_feedback()

    print("\n--- [SETUP] Carregando modelo semântico... ---")
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'

    # --- FIX: the call is now clean, with no extra parameters. ---
    # Since there is no longer an HF_TOKEN in the environment, the library will not try to authenticate.
    SEMANTIC_MODEL = SentenceTransformer(model_name, device='cpu')

    print(f"--- [SUCESSO] Modelo semântico '{model_name}' carregado. ---")

except Exception as e:
    print(f"--- [ERRO CRÍTICO] Falha fatal durante o setup: {e} ---")
    traceback.print_exc()
    sys.exit(1)


# --- Application routes (the rest of the file is unchanged) ---
@app.route('/')
def index():
    return render_template('index.html')

@app.route('/favicon.ico')
def favicon():
    return '', 204

@app.route('/search', methods=['POST'])
def search():
    try:
        data = request.get_json()
        query = data.get('query', '').strip()

        results = search_procedure_with_log(
            query=query,
            df_original=DF_ORIGINAL,
            df_normalized=DF_NORMALIZED,
            fuzzy_search_corpus=FUZZY_CORPUS,
            correction_corpus=(CORRECTION_CORPUS, NORMALIZED_CORRECTION_CORPUS),
            portuguese_word_set=PORTUGUESE_WORD_SET,
            idf_scores=IDF_SCORES,
            db_word_set=DB_WORD_SET,
            limit_per_layer=10,
            semantic_model=SEMANTIC_MODEL,
            user_best_matches_counts=USER_BEST_MATCHES_COUNTS,
            user_feedback_threshold=USER_FEEDBACK_THRESHOLD
        )
        return jsonify(results)
    except Exception as e:
        print("--- [ERRO FATAL DURANTE A BUSCA] ---")
        traceback.print_exc()
        return jsonify({"error": "Ocorreu um erro interno no motor de busca."}), 500
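# Example request against this route (illustrative; the endpoint shape comes
# from the code above, the host/port from the __main__ block at the bottom):
#
#   curl -X POST http://localhost:7860/search \
#        -H "Content-Type: application/json" \
#        -d '{"query": "hemograma completo"}'
#
# The response is the full dict built by search_procedure_with_log, including
# "search_log", "results_by_layer" and "final_semantic_results".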

@app.route('/submit_feedback', methods=['POST'])
def submit_feedback_route():
    try:
        data = request.get_json()
        query = data.get('query')
        tuss_code_submitted = data.get('tuss_code')
        feedback_type = data.get('feedback_type', 'unknown')
        tuss_code_raw_input = data.get('tuss_code_raw_input', '')
        if not query or not tuss_code_submitted:
            return jsonify({"status": "error", "message": "Query e TUSS Code são obrigatórios."}), 400
        append_user_feedback(query, tuss_code_submitted, feedback_type, tuss_code_raw_input)
        return jsonify({"status": "success", "message": "Feedback recebido!"}), 200
    except Exception as e:
        print("--- [ERRO NO SUBMIT_FEEDBACK] ---")
        traceback.print_exc()
        return jsonify({"status": "error", "message": "Erro interno."}), 500

@app.route('/get_tuss_info', methods=['GET'])
def get_tuss_info():
    tuss_code_prefix = request.args.get('tuss_prefix', '').strip()
    if not tuss_code_prefix:
        return jsonify([])
    suggestions = []
    if DF_ORIGINAL is not None and not DF_ORIGINAL.empty:
        filtered_df = DF_ORIGINAL[DF_ORIGINAL['Codigo_TUSS'].astype(str).str.startswith(tuss_code_prefix)]
        tuss_grouped = filtered_df.groupby('Codigo_TUSS').agg(
            tuss_descriptions=('Descricao_TUSS', lambda x: list(x.unique())),
            rol_names=('Procedimento_Rol', lambda x: list(x.unique()))
        ).reset_index()
        for _, row in tuss_grouped.head(10).iterrows():
            tuss_desc = " | ".join(filter(None, row['tuss_descriptions'])) or 'Sem descrição TUSS'
            rol_name = " | ".join(filter(None, row['rol_names'])) or 'Sem procedimento Rol'
            suggestions.append({'tuss_code': str(row['Codigo_TUSS']), 'tuss_description': tuss_desc, 'rol_name': rol_name})
    return jsonify(suggestions)
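# Example autocomplete request (illustrative):
#
#   curl "http://localhost:7860/get_tuss_info?tuss_prefix=4030"
#
# Returns up to 10 TUSS codes starting with the prefix, each with its merged
# TUSS descriptions and Rol procedure names.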


@app.route('/get_ai_suggestion', methods=['POST'])
def get_ai_suggestion():
    if not client_ia:
        return jsonify({"error": "O serviço de IA não está configurado no servidor (sem chave de API)."}), 503

    try:
        data = request.get_json()
        query = data.get('query')
        results = data.get('results', [])

        if not query or not results:
            return jsonify({"error": "A consulta e os resultados são necessários."}), 400

        formatted_results = []
        for r in results:
            tuss_code = r.get('Codigo_TUSS', 'N/A')
            tuss_desc = r.get('Descricao_TUSS', 'N/A')
            rol_desc = r.get('Procedimento_Rol', 'N/A')
            sinonimos = ", ".join(filter(None, [
                r.get('Sinonimo_1'), r.get('Sinonimo_2'), r.get('Sinonimo_3'), r.get('Sinonimo_4'), r.get('Semantico')
            ]))
            formatted_results.append(
                f"TUSS: {tuss_code}\nDescrição TUSS: {tuss_desc}\nDescrição Rol: {rol_desc}\nContexto/Sinônimos: {sinonimos}\n"
            )

        results_string = "\n".join(formatted_results)
        system_prompt = (
            "Você é um especialista em terminologia médica e na tabela TUSS brasileira. "
            "Sua tarefa é analisar uma consulta de busca e uma lista de procedimentos médicos retornados por um sistema. "
            "Você deve escolher o ÚNICO procedimento que melhor corresponde à intenção da consulta. "
            "Responda APENAS com o código TUSS do procedimento escolhido (ex: 40301741). Não inclua nenhuma outra palavra, explicação ou pontuação."
        )
        user_prompt = (
            f"Consulta do usuário: \"{query}\"\n\n"
            "Resultados da busca:\n"
            "----------------------\n"
            f"{results_string}"
            "\n----------------------\n\n"
            "Qual destes é a melhor correspondência para a consulta? Responda apenas com o código TUSS."
        )

        completion = client_ia.chat.completions.create(
            model="baidu/ERNIE-4.5-21B-A3B-PT",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=20,
            temperature=0.1,
        )

        suggested_content = completion.choices[0].message.content.strip()
        # Keep only the digits, e.g. "TUSS: 40301741." becomes "40301741".
        suggested_tuss = re.sub(r'\D', '', suggested_content)

        if not suggested_tuss:
            return jsonify({"error": "A IA não conseguiu determinar um melhor resultado.", "details": suggested_content}), 422

        return jsonify({"suggested_tuss_code": suggested_tuss})

    except Exception as e:
        print("--- [ERRO FATAL NA SUGESTÃO DA IA] ---")
        traceback.print_exc()
        error_message = f"Ocorreu um erro interno ao processar a sugestão da IA: {str(e)}"
        return jsonify({"error": error_message}), 500


if __name__ == '__main__':
    port = int(os.environ.get("PORT", 7860))
    app.run(host='0.0.0.0', port=port, debug=False)

enhanced_search_v2.py ADDED
@@ -0,0 +1,471 @@
# enhanced_search_v2.py
# ---------------------
# Hybrid, layered search engine for medical procedures.
# Final consolidated version with all optimizations.
#
# Main features:
# 1. PRE-SEARCH spelling correction: fixes invalid terms BEFORE any search runs.
# 2. Layer 0 for robust literal search: finds exact phrase matches while
#    ignoring case, accents, punctuation and spacing.
# 3. Optimized early exit: stops the search with a clear, correct log message.
# 4. Multi-layer search: from the most restrictive (literal) to the broadest (weighted).
# 5. Relevance scoring (IDF): rare words carry more weight.
# 6. Data cleanup: blanks Rol fields for procedures that are not in the Rol.
# 7. Semantic re-ranking: uses MiniLM-L6-v2 to capture meaning and reorder results.
# 8. User feedback: prioritizes results validated by the community.
# 9. Handles multiple procedures (DB rows) that share the same TUSS code.
# A minimal sketch of the layered early-exit pattern follows below.
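# Illustrative sketch (not used by the engine): the layered strategy in
# miniature. Each layer is tried in order and a high-confidence hit
# short-circuits the rest, mirroring the early-exit behaviour implemented
# in _run_search_layers below.
def _layered_search_sketch(query, layers):
    """Run `layers` (name, search_fn) pairs in order; stop at the first hit."""
    for name, search_fn in layers:
        hits = search_fn(query)
        if hits:
            return name, hits  # early exit: skip the broader, noisier layers
    return None, []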

import pandas as pd
import re
from thefuzz import process, fuzz
from unidecode import unidecode
import time
from sentence_transformers import util
import torch
import math
from collections import defaultdict


# --- NORMALIZATION HELPERS ---

def literal_normalize_text(text):
    """
    Normalizes text for the literal search (Layer 0): lowercase, no accents,
    no punctuation, and standardized whitespace.
    """
    if pd.isna(text):
        return ""
    normalized = unidecode(str(text).lower())
    normalized = re.sub(r'[^\w\s]', '', normalized)
    return re.sub(r'\s+', ' ', normalized).strip()

def normalize_text(text):
    """Normalizes text for token (word) search: lowercase, no accents, no surrounding whitespace."""
    if pd.isna(text):
        return ""
    return unidecode(str(text).lower().strip())
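# Worked examples (illustrative inputs; the outputs follow from the code above):
#
#   literal_normalize_text("Ressonância Magnética (Crânio)!")  -> "ressonancia magnetica cranio"
#   normalize_text("  Ressonância Magnética (Crânio)! ")       -> "ressonancia magnetica (cranio)!"
#
# The literal variant also drops punctuation and collapses runs of spaces,
# which is what lets Layer 0 match phrases regardless of formatting.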

def get_longest_word(query_text):
    """Extracts the longest word from a query (used by the fallback layer)."""
    words = re.findall(r'\b\w{4,}\b', query_text)
    if not words:
        return ""
    return max(words, key=len)


# --- FORMATTING AND HIGHLIGHTING HELPERS ---

def format_result(row_data, match_type="", score=0):
    """
    Formats a DataFrame row into a standard result dictionary.
    Applies the business rule that blanks Rol fields when the procedure is not in the Rol.
    """
    data = row_data.copy()
    if data.get('Correlacao_Rol', '').strip().lower() != 'sim':
        # Keys use the same casing as standard_columns ('GRUPO', 'SUBGRUPO')
        # so the cleared values actually reach the output dictionary.
        data['GRUPO'], data['SUBGRUPO'], data['Vigencia'], data['Resolucao_Normativa'] = '', '', '', ''
        data['PAC'], data['DUT'] = '---', '---'
    else:
        data['PAC'] = 'Sim' if data.get('PAC', '').strip().lower() == 'pac' else 'Não'

        # DUT logic, fixed here
        original_dut_value = data.get('DUT', '').strip()
        # FIX: the check now accepts numbers with a decimal point (e.g. "65.1").
        # If the value, after removing the first '.', is made up only of digits, it is valid.
        if original_dut_value and original_dut_value.replace('.', '', 1).isdigit():
            data['DUT'] = f'Sim, DUT nº {original_dut_value}'
        else:
            data['DUT'] = 'Não'
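        # Worked examples for the check above (illustrative values):
        #   "65".replace('.', '', 1).isdigit()   -> True   => 'Sim, DUT nº 65'
        #   "65.1".replace('.', '', 1).isdigit() -> True   => 'Sim, DUT nº 65.1'
        #   "65.1.2", "N/A", ""                  -> False  => 'Não'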

    standard_columns = [
        'Codigo_TUSS', 'Descricao_TUSS', 'Correlacao_Rol', 'Procedimento_Rol',
        'Resolucao_Normativa', 'Vigencia', 'OD', 'AMB', 'HCO', 'HSO', 'PAC',
        'DUT', 'SUBGRUPO', 'GRUPO', 'CAPITULO', 'Sinonimo_1', 'Sinonimo_2',
        'Sinonimo_3', 'Sinonimo_4', 'Semantico'
    ]
    formatted_data = {col: data.get(col, '') for col in standard_columns}
    result = {"score": round(score), "match_type": match_type}
    result.update(formatted_data)
    return result

def _highlight_matches(results, query):
    """Wraps query words found in the results in <b></b> tags."""
    if not query or not results:
        return results
    stopwords = {'de', 'do', 'da', 'dos', 'das', 'a', 'o', 'e', 'em', 'um', 'uma', 'para', 'com'}
    query_words = {word for word in normalize_text(query).split() if len(word) > 2 and word not in stopwords}
    cols_to_highlight = ['Descricao_TUSS', 'Procedimento_Rol', 'Sinonimo_1', 'Sinonimo_2', 'Sinonimo_3', 'Sinonimo_4', 'Semantico']

    for result in results:
        for col in cols_to_highlight:
            original_text = result.get(col, '')
            highlighted_text = original_text
            if original_text and query_words:
                for word in sorted(list(query_words), key=len, reverse=True):
                    pattern = r'\b(' + re.escape(word) + r')\b'
                    highlighted_text = re.sub(pattern, r'<b>\1</b>', highlighted_text, flags=re.IGNORECASE)
            result[f"{col}_highlighted"] = highlighted_text
    return results
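# Illustrative example (assuming a result whose Descricao_TUSS is "Hemograma completo"):
#
#   _highlight_matches([{'Descricao_TUSS': 'Hemograma completo'}], 'hemograma')
#   -> the result gains 'Descricao_TUSS_highlighted': '<b>Hemograma</b> completo'
#      (the other *_highlighted keys are added as empty strings)
#
# Words are matched case-insensitively, longest first, so shorter query words
# cannot split tags already inserted by longer ones.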


# --- DATA LOADING FUNCTIONS ---

def load_and_prepare_database(db_path):
    """
    Loads the CSV and builds normalized columns, a single combined text field,
    IDF weights, and a set of every unique word in the database.
    """
    try:
        print(f"Carregando e preparando a base de dados de: {db_path}...")
        df_original = pd.read_csv(db_path, dtype=str).fillna('')
        search_cols = ['Descricao_TUSS', 'Procedimento_Rol', 'Sinonimo_1', 'Sinonimo_2', 'Sinonimo_3', 'Sinonimo_4', 'Semantico']
        df_normalized = df_original.copy()

        df_normalized['Codigo_TUSS_literal'] = df_normalized['Codigo_TUSS'].apply(literal_normalize_text)
        df_normalized['Codigo_TUSS_norm'] = df_normalized['Codigo_TUSS'].apply(normalize_text)

        df_normalized['full_text_norm'] = ""
        for col in search_cols:
            if col in df_normalized.columns:
                df_normalized[f'{col}_literal'] = df_normalized[col].apply(literal_normalize_text)
                df_normalized[f'{col}_norm'] = df_normalized[col].apply(normalize_text)
                df_normalized['full_text_norm'] += ' ' + df_normalized[f'{col}_norm']

        print("Calculando pesos IDF e dicionário da base...")
        num_documents = len(df_normalized)
        doc_freq = defaultdict(int)
        db_word_set = set()
        for text in df_normalized['full_text_norm']:
            words = set(text.split())
            db_word_set.update(words)
            for word in words:
                if word:
                    doc_freq[word] += 1
        db_word_set.discard('')
        print(f"Dicionário da base de dados criado com {len(db_word_set)} palavras únicas.")

        idf_scores = {word: math.log(num_documents / (freq + 1)) for word, freq in doc_freq.items()}
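        # Worked example of the formula above (illustrative numbers): with
        # num_documents = 1000, a word appearing in 9 rows gets
        # log(1000 / 10) ≈ 4.61, while a word in 999 rows gets
        # log(1000 / 1000) = 0; rare words weigh more in Layer 5.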
        print(f"Pesos IDF calculados para {len(idf_scores)} palavras.")

        print("Criando corpus para busca fuzzy...")
        fuzzy_search_corpus = []
        for index, row in df_normalized.iterrows():
            for col in search_cols:
                if col in df_original.columns and f'{col}_norm' in row and pd.notna(row[f'{col}_norm']):
                    val = row[f'{col}_norm']
                    if val:
                        fuzzy_search_corpus.append((val, index, f'{col}_norm'))

        print(f"Base de dados pronta com {len(df_original)} procedimentos.")
        return df_original, df_normalized, fuzzy_search_corpus, idf_scores, db_word_set
    except Exception as e:
        print(f"Erro crítico ao carregar/preparar a base de dados: {e}")
        raise

def load_general_dictionary(path):
    try:
        print(f"Carregando dicionário geral de português de: {path}...")
        with open(path, 'r', encoding='utf-8') as f:
            words = {normalize_text(line.strip()) for line in f if line.strip()}
        print(f"Dicionário geral carregado com {len(words)} palavras.")
        return words
    except FileNotFoundError:
        return set()
    except Exception:
        return set()

def load_correction_corpus(dict_path, column_name='Termo_Correto'):
    try:
        print(f"Carregando corpus de correção de: {dict_path}...")
        df_dict = pd.read_csv(dict_path, dtype=str).fillna('')
        if column_name not in df_dict.columns:
            return [], []

        original_corpus = df_dict[column_name].dropna().astype(str).tolist()
        normalized_corpus = [normalize_text(term) for term in original_corpus]

        print(f"Corpus de correção carregado com {len(original_corpus)} termos.")
        return original_corpus, normalized_corpus
    except FileNotFoundError:
        return [], []
    except Exception:
        return [], []


# --- SEMANTIC RE-RANKING ---

def rerank_with_semantic_model(original_query, results_list, model):
    if not model or not results_list:
        return results_list
    semantic_columns = ['Descricao_TUSS', 'Procedimento_Rol', 'SUBGRUPO', 'Sinonimo_1', 'Sinonimo_2', 'Sinonimo_3', 'Sinonimo_4', 'Semantico']
    corpus_texts = [". ".join(sorted({res.get(col) for col in semantic_columns if res.get(col) and isinstance(res.get(col), str)})) for res in results_list]
    try:
        query_embedding = model.encode(original_query, convert_to_tensor=True, show_progress_bar=False)
        corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=False)
        cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)
    except Exception:
        return results_list
    for i, result in enumerate(results_list):
        result['semantic_score'] = round(max(0, cosine_scores[0][i].item()) * 100)
        result['hybrid_score'] = result['semantic_score'] + result.get('score', 0)
    return sorted(results_list, key=lambda x: (x.get('score', 0) == 100, x.get('hybrid_score', 0)), reverse=True)
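# Scoring example (illustrative numbers): a candidate with cosine similarity
# 0.83 gets semantic_score 83; if its layer score was 90, hybrid_score is 173.
# The sort key keeps exact matches (score == 100) ahead of everything else,
# then orders by hybrid_score.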


# --- INTERNAL LAYERED SEARCH ---

def _run_search_layers(literal_query, normalized_query, response, df_original, df_normalized, fuzzy_search_corpus, idf_scores, limit_per_layer):
    """Runs the search layers and returns the name of the exiting layer on an early exit."""
    matched_indices = set()
    stopwords = {'de', 'do', 'da', 'dos', 'das', 'a', 'o', 'e', 'em', 'um', 'uma', 'para', 'com'}
    query_words = [word for word in normalized_query.split() if word not in stopwords and len(word) > 1]

    # --- LAYER 0: literal search ---
    if literal_query:
        temp_results = []
        literal_cols = ['Codigo_TUSS_literal', 'Descricao_TUSS_literal', 'Procedimento_Rol_literal']
        for col in literal_cols:
            if col in df_normalized.columns:
                mask = df_normalized[col].str.contains(r'\b' + re.escape(literal_query) + r'\b', na=False)
                matches = df_normalized[mask]
                for index, _ in matches.iterrows():
                    if index not in matched_indices:
                        match_type = "Código Literal" if "Codigo" in col else "Texto Literal"
                        temp_results.append(format_result(df_original.loc[index], match_type, 100))
                        matched_indices.add(index)
        if temp_results:
            response["results_by_layer"]["literal_matches"] = sorted(temp_results, key=lambda x: x['Codigo_TUSS'])[:limit_per_layer]
            return "Busca Literal"

    # --- LAYER 1: exact normalized search ---
    temp_results = []
    if normalized_query:
        exact_code_matches = df_normalized[df_normalized['Codigo_TUSS_norm'] == normalized_query]
        for index, _ in exact_code_matches.iterrows():
            if index not in matched_indices:
                temp_results.append(format_result(df_original.loc[index], "Código Exato (Normalizado)", 100))
                matched_indices.add(index)
        for col in ['Descricao_TUSS_norm', 'Procedimento_Rol_norm']:
            if col in df_normalized.columns:
                exact_text_matches = df_normalized[df_normalized[col] == normalized_query]
                for index, _ in exact_text_matches.iterrows():
                    if index not in matched_indices:
                        temp_results.append(format_result(df_original.loc[index], "Exato (Normalizado)", 100))
                        matched_indices.add(index)
    if temp_results:
        response["results_by_layer"]["exact_matches"] = sorted(temp_results, key=lambda x: x['Codigo_TUSS'])[:limit_per_layer]
        return "Normalizada Exata"

    # --- LAYER 2: logical 'AND' search ---
    temp_results = []
    if query_words:
        mask = pd.Series(True, index=df_normalized.index)
        for word in query_words:
            mask &= df_normalized['full_text_norm'].str.contains(r'\b' + re.escape(word) + r'\b', na=False)
        for index, row in df_normalized[mask & ~df_normalized.index.isin(matched_indices)].iterrows():
            score = fuzz.WRatio(normalized_query, row.get('full_text_norm', ''))
            if score > 85:
                temp_results.append(format_result(df_original.loc[index], "Busca Lógica (E)", score))
                matched_indices.add(index)
    response["results_by_layer"]["logical_matches"] = sorted(temp_results, key=lambda x: x.get('score', 0), reverse=True)[:limit_per_layer]

    # --- LAYER 3: 'almost exact' (fuzzy) search ---
    temp_results = []
    processed_indices_layer3 = set()
    if fuzzy_search_corpus and normalized_query:
        almost_exact_matches = process.extractBests(normalized_query, [item[0] for item in fuzzy_search_corpus], scorer=fuzz.token_set_ratio, limit=limit_per_layer * 3, score_cutoff=90)
        for match_text, score in almost_exact_matches:
            if score == 100 and match_text == normalized_query:
                continue
            corpus_items = [item for item in fuzzy_search_corpus if item[0] == match_text]
            for _, original_index, _ in corpus_items:
                if original_index not in matched_indices and original_index not in processed_indices_layer3:
                    temp_results.append(format_result(df_original.loc[original_index], "Quase Exato", 98))
                    matched_indices.add(original_index)
                    processed_indices_layer3.add(original_index)
    response["results_by_layer"]["almost_exact_matches"] = sorted(temp_results, key=lambda x: x['Codigo_TUSS'])[:limit_per_layer]

    # --- LAYER 4: validated-terms search ---
    temp_results = []
    if query_words:
        mask_c4 = pd.Series(True, index=df_normalized.index)
        for word in query_words:
            mask_c4 &= df_normalized['full_text_norm'].str.contains(r'\b' + re.escape(word) + r'\b', na=False)
        for index, row in df_normalized[mask_c4 & ~df_normalized.index.isin(matched_indices)].iterrows():
            score = fuzz.WRatio(normalized_query, row.get('Descricao_TUSS_norm', ''))
            if score > 75:
                temp_results.append(format_result(df_original.loc[index], "Termos Validados (E)", score))
                matched_indices.add(index)
    response["results_by_layer"]["contains_matches"] = sorted(temp_results, key=lambda x: x.get('score', 0), reverse=True)[:limit_per_layer]

    # --- LAYER 5: weighted (IDF) search ---
    temp_results = []
    if query_words and idf_scores:
        max_idf = max(idf_scores.values()) if idf_scores else 1.0
        total_query_idf = sum(idf_scores.get(word, max_idf) for word in query_words)
        regex_pattern = '|'.join(re.escape(word) for word in query_words)
        mask = df_normalized['full_text_norm'].str.contains(regex_pattern, na=False)
        candidate_df = df_normalized[mask & ~df_normalized.index.isin(matched_indices)]
        for index, row in candidate_df.iterrows():
            weighted_score = sum(idf_scores.get(word, 0) for word in query_words if word in row['full_text_norm'].split())
            normalized_score = (weighted_score / total_query_idf) * 90 if total_query_idf > 0 else 0
            if query_words and row.get('full_text_norm', '').strip().startswith(query_words[0]):
                normalized_score = min(normalized_score + 10, 95)
            temp_results.append(format_result(df_original.loc[index], "Busca Ponderada (IDF)", normalized_score))
            matched_indices.add(index)
    response["results_by_layer"]["term_matches"] = sorted(temp_results, key=lambda x: x.get('score', 0), reverse=True)[:limit_per_layer * 4]
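    # Worked example for Layer 5 (illustrative numbers): for the query words
    # ['hemograma', 'completo'] with IDF weights 4.6 and 1.2, a row containing
    # only 'hemograma' scores (4.6 / 5.8) * 90 ≈ 71; a row whose text starts
    # with 'hemograma' gets +10, capped at 95.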

    # --- LAYER 6: fallback ---
    total_found_primary = sum(len(v) for v in response["results_by_layer"].values())
    if total_found_primary == 0 and normalized_query:
        temp_results = []
        longest_word = get_longest_word(normalized_query)
        if longest_word:
            mask_c6 = df_normalized['full_text_norm'].str.contains(r'\b' + re.escape(longest_word) + r'\b', na=False)
            for index, row in df_normalized[mask_c6 & ~df_normalized.index.isin(matched_indices)].iterrows():
                temp_results.append(format_result(df_original.loc[index], f"Palavra-Chave '{longest_word}'", 80))
            response["results_by_layer"]["keyword_matches"] = sorted(temp_results, key=lambda x: x['Codigo_TUSS'])[:limit_per_layer]

    return None


# --- MAIN FUNCTION ORCHESTRATING THE SEARCH ---

def search_procedure_with_log(query, df_original, df_normalized, fuzzy_search_corpus, correction_corpus,
                              portuguese_word_set, idf_scores, db_word_set,
                              limit_per_layer=10, semantic_model=None,
                              user_best_matches_counts=None, user_feedback_threshold=10):
    SEMANTIC_RERANK_LIMIT = 40
    start_time = time.time()
    original_query = str(query).strip()

    response = {"search_log": [],
                "results_by_layer": {"literal_matches": [], "exact_matches": [], "logical_matches": [],
                                     "almost_exact_matches": [], "contains_matches": [],
                                     "term_matches": [], "keyword_matches": []},
                "final_semantic_results": [], "was_corrected": False, "original_query": original_query,
                "corrected_query": ""}

    if not original_query:
        response["search_log"].append("Query vazia, busca não realizada.")
        return response

    response["search_log"].append(f"Buscando por: '{original_query}'")

    # STEP 1: PRE-SEARCH SPELLING CORRECTION
    stopwords = {'de', 'do', 'da', 'dos', 'das', 'a', 'o', 'e', 'em', 'um', 'uma', 'para', 'com'}
    query_after_correction = original_query
    original_correction_corpus, normalized_correction_corpus = correction_corpus
    valid_words = portuguese_word_set.union(db_word_set)

    if valid_words and original_correction_corpus:
        words_from_query, corrected_words, made_correction = original_query.split(), [], False
        for word in words_from_query:
            norm_word = normalize_text(word)
            if norm_word in stopwords or len(norm_word) < 4:
                corrected_words.append(word)
                continue

            if norm_word not in valid_words:
                match_norm, score = process.extractOne(norm_word, normalized_correction_corpus, scorer=fuzz.token_set_ratio)
                if score >= 85:
                    match_index = normalized_correction_corpus.index(match_norm)
                    corrected_word_original = original_correction_corpus[match_index]

                    # Preserve the original word's casing style.
                    corrected_word = corrected_word_original
                    if word.istitle():
                        corrected_word = corrected_word.title()
                    elif word.isupper():
                        corrected_word = corrected_word.upper()

                    corrected_words.append(corrected_word)
                    made_correction = True
                else:
                    corrected_words.append(word)
            else:
                corrected_words.append(word)
        if made_correction:
            query_after_correction = " ".join(corrected_words)
            response["was_corrected"] = True
            response["corrected_query"] = query_after_correction
            response["search_log"].append(f"Query corrigida para: '{query_after_correction}'.")
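    # Correction example (hypothetical, assuming 'ressonancia' appears in
    # Dic.csv's Termo_Correto column): a query word like 'resonancia' is not in
    # valid_words, fuzzy-matches 'ressonancia' with a score >= 85, and the
    # query is rewritten before any search layer runs.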

    # STEP 2: PREPARE THE QUERIES FOR THE LAYERS
    literal_query = literal_normalize_text(query_after_correction)
    cleaned_query = " ".join([word for word in query_after_correction.split() if normalize_text(word) not in stopwords])
    normalized_query = normalize_text(cleaned_query)

    if not cleaned_query.strip() and not literal_query.strip():
        response["search_log"].append("Query resultante é vazia. Busca não realizada.")
        return response

    if cleaned_query != query_after_correction:
        response["search_log"].append(f"Query limpa (sem stop words): '{cleaned_query}'")

    # Reject searches with a single character.
    if len(cleaned_query.strip()) <= 1:
        # Capture the term that would have been searched, for the log. It may be empty.
        term_to_log = cleaned_query.strip()
        response["search_log"].append(f"Busca por '{term_to_log}' ignorada. A busca deve conter no mínimo 2 caracteres.")
        response["final_semantic_results"] = []
        return response

    # STEP 3: RUN THE SEARCH
    exit_layer_name = _run_search_layers(literal_query, normalized_query, response, df_original, df_normalized, fuzzy_search_corpus,
                                         idf_scores, limit_per_layer)

    # STEP 4: AGGREGATE AND RE-RANK THE RESULTS
    all_candidates = []
    layer_order = ["literal_matches", "exact_matches", "logical_matches", "almost_exact_matches", "contains_matches", "term_matches", "keyword_matches"]
    layer_names_map = {"literal_matches": "0. Busca Literal", "exact_matches": "1. Normalizada Exata", "logical_matches": "2. Lógica 'E'",
                       "almost_exact_matches": "3. Quase Exatos (Fuzzy)", "contains_matches": "4. Termos Validados",
                       "term_matches": "5. Busca Ponderada (IDF)", "keyword_matches": "6. Fallback (Palavra-Chave)"}

    if exit_layer_name:
        response["search_log"].append(f"--- [OTIMIZAÇÃO] Resultado de alta confiança encontrado na camada '{exit_layer_name}'. Busca interrompida. ---")

    response["search_log"].append("\n--- Detalhamento da Busca por Camadas ---")
    for layer_key in layer_order:
        layer_results = response["results_by_layer"].get(layer_key, [])
        num_results = len(layer_results)
        response["search_log"].append(f"[{layer_names_map.get(layer_key, layer_key)}]: {num_results} resultado(s)")
        all_candidates.extend(layer_results)

    # User feedback logic
    feedback_prioritized_tuss_votes = {}
    if user_best_matches_counts and all_candidates:
        query_norm_for_feedback = normalize_text(response.get("corrected_query") or original_query)
        feedback_for_query = user_best_matches_counts.get(query_norm_for_feedback, {})
        for tuss_code, votes in feedback_for_query.items():
            if votes >= user_feedback_threshold:
                feedback_prioritized_tuss_votes[tuss_code] = votes
        if feedback_prioritized_tuss_votes:
            response["search_log"].append("\nFeedback de usuários qualificado encontrado.")
            for result in all_candidates:
                if result.get('Codigo_TUSS') in feedback_prioritized_tuss_votes:
                    result['is_user_best_match'] = True
                    result['feedback_votes'] = feedback_prioritized_tuss_votes[result.get('Codigo_TUSS')]

    response["search_log"].append(f"\n--- Análise e Reordenação ---\nTotal de candidatos encontrados: {len(all_candidates)}")

    query_for_highlight = response.get("corrected_query") or cleaned_query
    all_candidates = _highlight_matches(all_candidates, query_for_highlight)
    final_list = []

    if all_candidates:
        query_for_semantic = response.get("corrected_query") or cleaned_query
        prioritized_candidates = [res for res in all_candidates if res.get('is_user_best_match')]
        non_prioritized_candidates = [res for res in all_candidates if not res.get('is_user_best_match')]

        if semantic_model and prioritized_candidates:
            reranked_prioritized = rerank_with_semantic_model(query_for_semantic, prioritized_candidates, semantic_model)
            prioritized_results_sorted = sorted(reranked_prioritized, key=lambda x: (x.get('feedback_votes', 0), x.get('semantic_score', 0)), reverse=True)
        else:
            prioritized_results_sorted = sorted(prioritized_candidates, key=lambda x: (x.get('feedback_votes', 0), x.get('score', 0)), reverse=True)
        final_list.extend(prioritized_results_sorted)

        if semantic_model and non_prioritized_candidates:
            candidates_for_rerank = non_prioritized_candidates[:SEMANTIC_RERANK_LIMIT]
            reranked_non_prioritized = rerank_with_semantic_model(query_for_semantic, candidates_for_rerank, semantic_model)
            final_list.extend(reranked_non_prioritized)
            seen_reranked_codes = {r.get('Codigo_TUSS') for r in reranked_non_prioritized}
            for candidate in non_prioritized_candidates:
                if candidate.get('Codigo_TUSS') not in seen_reranked_codes:
                    final_list.append(candidate)
        else:
            final_list.extend(sorted(non_prioritized_candidates, key=lambda x: x.get('score', 0), reverse=True))

        response["search_log"].append(f"Lista final de resultados combinada: {len(final_list)} itens antes do limite.")
        response["final_semantic_results"] = final_list[:10]
    else:
        response["search_log"].append("Nenhum resultado final para exibir.")
        response["final_semantic_results"] = []

    end_time = time.time()
    response["search_duration_seconds"] = round(end_time - start_time, 4)
    response["search_log"].append(f"\nBusca completa em {response['search_duration_seconds']} segundos.")
    print(f"\n\n==================== LOG DE DEPURAÇÃO (QUERY: '{original_query}') ====================")
    return response
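
# Minimal usage sketch (illustrative; the file names follow app.py's setup and
# must exist locally: 'rol_procedures_database.csv', 'Dic.csv', 'dicionario_ptbr.txt'):
#
#   df_o, df_n, corpus, idf, words = load_and_prepare_database('rol_procedures_database.csv')
#   corr = load_correction_corpus('Dic.csv', column_name='Termo_Correto')
#   pt_words = load_general_dictionary('dicionario_ptbr.txt')
#   out = search_procedure_with_log('hemograma completo', df_o, df_n, corpus, corr,
#                                   pt_words, idf, words, semantic_model=None)
#   print(out['final_semantic_results'])
#
# Passing semantic_model=None skips the MiniLM re-ranking and falls back to the
# layer scores, which is handy for quick tests without downloading the model.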