Spaces:

asilvamaia
/

Ident_br

Sleeping

App Files Files Community

Ident_br / app.py

asilvamaia

Create app.py

fa24fb3 verified 24 days ago

raw

history blame contribute delete

4.24 kB

	import streamlit as st
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	from urllib.parse import urlparse
	import pandas as pd
	import io

	# --- CONFIGURATION ---
	# ⚠️ Replace with your own Hugging Face user/model id.
	# This id is passed to `from_pretrained()` for both the tokenizer and the
	# sequence-classification model in `load_model()`.
	MODEL_ID = "asilvamaia/ident_br"

	# Page configuration: sets the page title and the page icon.
	# NOTE(review): Streamlit has historically required this to be the first
	# Streamlit command in the script — keep it above other `st.*` calls.
	st.set_page_config(page_title="Validador de Domínios .BR", page_icon="🇧🇷")

	# --- FUNÇÃO DE LIMPEZA (V11) ---
	def limpar_entrada(texto: str) -> str:
	    """Normalize one raw input line down to a bare, lower-case host name.

	    The value is stringified, stripped and lower-cased. Then:

	    * an empty value yields ``""``;
	    * anything containing ``"@"`` is returned as-is (still lower-cased and
	      stripped) so that the caller can reject it as an e-mail address;
	    * otherwise the scheme, port, path/query and a leading ``"www."`` are
	      removed and the remaining host is returned.

	    Args:
	        texto: Raw line (URL, host, e-mail, ...). Non-string values are
	            converted with ``str()``.

	    Returns:
	        The cleaned host, or the normalized input when it cannot be parsed
	        as a URL.
	    """
	    texto = str(texto).strip().lower()
	    if not texto:
	        return ""
	    if "@" in texto:
	        # Keep e-mail-like values untouched so the caller can reject them.
	        return texto

	    # Only "://" reliably signals that a scheme is present. Checking for the
	    # substring "http" misfires on hosts such as "httpbin.org/get", which
	    # would then be parsed without a netloc and keep their path.
	    texto_temp = texto if "://" in texto else "http://" + texto

	    try:
	        parsed = urlparse(texto_temp)
	    except ValueError:
	        # urlparse rejects malformed authorities (e.g. an unbalanced "[").
	        return texto

	    dominio_limpo = parsed.netloc or texto

	    # Drop the port (everything from the first ":").
	    dominio_limpo = dominio_limpo.split(":")[0]

	    if dominio_limpo.startswith("www."):
	        dominio_limpo = dominio_limpo[4:]

	    return dominio_limpo

	# --- CARREGAMENTO DO MODELO (COM CACHE) ---
	# O cache impede que o modelo seja recarregado a cada clique, deixando o app rápido
	@st.cache_resource
	def _load_model_cached():
	    """Load the tokenizer and classifier for ``MODEL_ID`` and cache them.

	    Any loading error propagates to the caller. Because only a successful
	    return value is stored by ``st.cache_resource``, a failed attempt is
	    retried on the next call instead of being cached.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
	    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
	    # The app runs inference on CPU.
	    model.to("cpu")
	    model.eval()
	    return tokenizer, model


	def load_model():
	    """Return ``(tokenizer, model)``, or ``(None, None)`` if loading failed.

	    Error handling lives outside the cached loader on purpose: returning
	    ``(None, None)`` from inside the cached function would cache the failure
	    itself, so one transient error would disable the app until the process
	    restarts.

	    Returns:
	        A ``(tokenizer, model)`` tuple with the model moved to CPU and set to
	        eval mode, or ``(None, None)`` after showing the error with
	        ``st.error``.
	    """
	    try:
	        return _load_model_cached()
	    except Exception as e:  # UI boundary: report the failure, keep the app up
	        st.error(f"Erro ao carregar o modelo: {e}")
	        return None, None

	tokenizer, model = load_model()

	# --- INTERFACE ---
	st.title("🇧🇷 Validador de Domínios .BR com IA")
	st.write("Faça upload de uma lista suja (.txt) e a IA extrairá apenas os domínios .br válidos.")

	uploaded_file = st.file_uploader("Escolha um arquivo .txt", type="txt")

	if uploaded_file is not None and tokenizer is not None:
	    # Decode tolerantly: "utf-8-sig" drops a leading BOM (otherwise it stays
	    # glued to the first entry), and errors="replace" keeps a file in another
	    # encoding from aborting the whole run with UnicodeDecodeError.
	    conteudo = uploaded_file.getvalue().decode("utf-8-sig", errors="replace")
	    linhas = io.StringIO(conteudo).readlines()
	    total = len(linhas)

	    st.info(f"Arquivo carregado com {total} linhas. Processando...")

	    if st.button("Iniciar Limpeza"):
	        validos = []
	        rejeitados = []

	        progress_bar = st.progress(0)

	        # Lines are classified one at a time so the progress bar can be
	        # updated while the loop runs.
	        for i, linha in enumerate(linhas):
	            original = linha.strip()
	            limpo = limpar_entrada(original)

	            if not limpo:
	                continue

	            # Hard rules first: an entry without "." or with "@" is rejected
	            # whatever the model says, so inference is skipped for it.
	            aprovado = False
	            if "." in limpo and "@" not in limpo:
	                inputs = tokenizer(limpo, return_tensors="pt", truncation=True, max_length=128)
	                with torch.no_grad():
	                    outputs = model(**inputs)
	                # Class index 1 is the one treated as "approved".
	                aprovado = torch.argmax(outputs.logits, dim=1).item() == 1

	            if aprovado:
	                validos.append(limpo)
	            else:
	                rejeitados.append(original)

	            # Refresh the bar only every 10 items to keep the UI responsive.
	            if i % 10 == 0:
	                progress_bar.progress((i + 1) / total)

	        progress_bar.progress(100)

	        # --- RESULTS ---
	        # NOTE(review): clicking the download button below triggers a script
	        # rerun in which st.button() returns False, so this whole section is
	        # likely to disappear afterwards. Consider persisting the results in
	        # st.session_state — TODO confirm against the deployed Streamlit version.
	        st.success("Processamento Concluído!")

	        col1, col2 = st.columns(2)
	        with col1:
	            st.metric("✅ Aprovados", len(validos))
	        with col2:
	            st.metric("🔴 Rejeitados", len(rejeitados))

	        # --- DOWNLOAD ---
	        # De-duplicate and sort the approved hosts for the output file.
	        validos_unicos = sorted(set(validos))
	        res_text = "\n".join(validos_unicos)

	        st.download_button(
	            label="⬇️ Baixar Lista Limpa (.txt)",
	            data=res_text,
	            file_name="dominios_limpos.txt",
	            mime="text/plain"
	        )

	        # Optional: show a sample of the approved hosts.
	        with st.expander("Ver amostra dos aprovados"):
	            st.write(validos_unicos[:20])