"""Manual smoke test for retrieval over a tiny, hand-written news corpus.

Run as a script: it stores four sample articles through
``VectorNewsRetriever.store_articles_in_vector_db`` (clearing first), runs one
``semantic_search`` query, and prints the hits plus basic counts.
"""
import os
import time

# Keep this test lightweight and isolated.
# The variables are set *before* the project imports so they are in place
# whether the package reads them at import time or at construction time.
# NOTE(review): when cve_factchecker actually reads these is not visible
# from this file -- confirm against the package.
os.environ["USE_DUMMY_EMBEDDINGS"] = "true"
os.environ["VECTOR_PERSIST_DIR"] = os.path.abspath("./vector_db_hybrid_test")

from cve_factchecker.models import NewsArticle  # noqa: E402
from cve_factchecker.retriever import VectorNewsRetriever  # noqa: E402

# Number of results requested from the retriever (also shown in the header).
TOP_K = 5

# One timestamp for the whole fixture so every article carries the same
# scraped_date, even if construction straddles a second boundary.
_SCRAPED_AT = str(int(time.time()))

# Three related Bannu / KP security stories plus one unrelated sports story.
articles = [
    NewsArticle(
        title="Militants storm FC lines in Bannu",
        content=(
            "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa. "
            "Security forces responded swiftly, and the situation is under control."
        ),
        url="https://tribune.com.pk/story/2564614/militants-storm-fc-lines-in-bannu",
        source="The Express Tribune",
        published_date="2025-09-15",
        scraped_date=_SCRAPED_AT,
        article_id="a1",
        language="English",
    ),
    NewsArticle(
        title="Six soldiers martyred; five terrorists killed in Bannu FC compound attack",
        content=(
            # Trailing space matters: adjacent literals are joined verbatim,
            # and without it the text read "soldiers.Reports".
            "An attack on the FC compound in Bannu resulted in the martyrdom of six soldiers. "
            "Reports indicate five terrorists were killed in the exchange."
        ),
        url="https://dailytimes.com.pk/1363459/six-soldiers-martyred-five-terrorists-killed-in-attack-on-bannu-fc-compound/",
        source="Daily Times",
        published_date="2025-09-15",
        scraped_date=_SCRAPED_AT,
        article_id="a2",
        language="English",
    ),
    NewsArticle(
        title="KP operations update: militants neutralized",
        content=(
            "Security operations in Khyber-Pakhtunkhwa neutralized multiple militants. The Frontier Corps participated "
            "in the operations across the province."
        ),
        url="https://dailytimes.com.pk/1368975/31-indian-backed-militants-killed-in-kp-operations/",
        source="Daily Times",
        published_date="2025-09-16",
        scraped_date=_SCRAPED_AT,
        article_id="a3",
        language="English",
    ),
    NewsArticle(
        title="Sports: Cricket series announced",
        content="Pakistan Cricket Board announced a new bilateral series in Lahore next month.",
        url="https://example.com/sports/cricket-series",
        source="Example Sports",
        published_date="2025-09-10",
        scraped_date=_SCRAPED_AT,
        article_id="a4",
        language="English",
    ),
]


def main() -> None:
    """Index the sample articles, run one query, and print the results."""
    retriever = VectorNewsRetriever(persist_directory=os.environ["VECTOR_PERSIST_DIR"])
    # clear_first=True is passed so each run starts from a freshly cleared store.
    retriever.store_articles_in_vector_db(articles, clear_first=True)

    # The query repeats the opening sentence of article "a1".
    query = (
        "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa"
    )

    print(f"\n=== Hybrid Retrieval Results (k={TOP_K}) ===")
    results = retriever.semantic_search(query, k=TOP_K)
    for i, r in enumerate(results, 1):
        print(f"{i}. {r.get('title')} | {r.get('url')} | source={r.get('source')}")
        # `or ''` guards against an explicit None stored under 'content'.
        snippet = (r.get('content', '') or '')[:120].replace('\n', ' ')
        print(f" Snippet: {snippet}...")

    # Basic sanity checks
    print("\nCounts:")
    print("vector_count:", retriever.get_vector_count())
    print("results_count:", len(results))


if __name__ == "__main__":
    main()