# CVE-FactChecker / test_hybrid_retriever.py
# Last commit (NLPGenius, a7270f3): Hybrid retrieval: semantic + BM25-style
# keyword fusion, lazy index, dedupe, robust fallbacks
import os
import time
from cve_factchecker.retriever import VectorNewsRetriever
from cve_factchecker.models import NewsArticle
# Environment overrides intended to keep this test lightweight and isolated.
# VECTOR_PERSIST_DIR is resolved to an absolute path against the current
# working directory at the moment this module runs.
# NOTE(review): these are applied after the cve_factchecker imports above; if
# that package reads either variable at import time the override comes too
# late -- confirm against the package.
os.environ.update(
    {
        "USE_DUMMY_EMBEDDINGS": "true",
        "VECTOR_PERSIST_DIR": os.path.abspath("./vector_db_hybrid_test"),
    }
)
# One timestamp shared by every fixture article, so the corpus cannot end up
# with differing scraped_date values if construction straddles a second tick.
_SCRAPED_AT = str(int(time.time()))

# Fixture corpus for the retrieval run below:
#   a1, a2 -- two reports about an attack on the Frontier Corps site in Bannu
#   a3     -- a related Khyber-Pakhtunkhwa operations update (no Bannu mention)
#   a4     -- an unrelated cricket story (presumably a distractor -- confirm)
articles = [
    NewsArticle(
        title="Militants storm FC lines in Bannu",
        content=(
            "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa. "
            "Security forces responded swiftly, and the situation is under control."
        ),
        url="https://tribune.com.pk/story/2564614/militants-storm-fc-lines-in-bannu",
        source="The Express Tribune",
        published_date="2025-09-15",
        scraped_date=_SCRAPED_AT,
        article_id="a1",
        language="English",
    ),
    NewsArticle(
        title="Six soldiers martyred; five terrorists killed in Bannu FC compound attack",
        content=(
            # The trailing space matters: adjacent literals are concatenated
            # verbatim, and without it the text reads "soldiers.Reports".
            "An attack on the FC compound in Bannu resulted in the martyrdom of six soldiers. "
            "Reports indicate five terrorists were killed in the exchange."
        ),
        url="https://dailytimes.com.pk/1363459/six-soldiers-martyred-five-terrorists-killed-in-attack-on-bannu-fc-compound/",
        source="Daily Times",
        published_date="2025-09-15",
        scraped_date=_SCRAPED_AT,
        article_id="a2",
        language="English",
    ),
    NewsArticle(
        title="KP operations update: militants neutralized",
        content=(
            "Security operations in Khyber-Pakhtunkhwa neutralized multiple militants. The Frontier Corps participated "
            "in the operations across the province."
        ),
        url="https://dailytimes.com.pk/1368975/31-indian-backed-militants-killed-in-kp-operations/",
        source="Daily Times",
        published_date="2025-09-16",
        scraped_date=_SCRAPED_AT,
        article_id="a3",
        language="English",
    ),
    NewsArticle(
        title="Sports: Cricket series announced",
        content="Pakistan Cricket Board announced a new bilateral series in Lahore next month.",
        url="https://example.com/sports/cricket-series",
        source="Example Sports",
        published_date="2025-09-10",
        scraped_date=_SCRAPED_AT,
        article_id="a4",
        language="English",
    ),
]
def format_snippet(text, limit=120):
    """Return a single-line preview of *text* at most *limit* characters long.

    Args:
        text: The text to preview; ``None`` or an empty string yields ``""``.
        limit: Maximum number of characters of *text* to keep.

    Returns:
        The first *limit* characters of *text* with newlines replaced by
        spaces. ``"..."`` is appended only when *text* was actually cut
        short, so short content is never presented as truncated.
    """
    text = text or ""
    preview = text[:limit].replace("\n", " ")
    return f"{preview}..." if len(text) > limit else preview


if __name__ == "__main__":
    # Manual smoke run: index the fixture articles, issue one query, and print
    # whatever comes back for inspection. Nothing below is asserted.
    retriever = VectorNewsRetriever(persist_directory=os.environ["VECTOR_PERSIST_DIR"])
    retriever.store_articles_in_vector_db(articles, clear_first=True)

    # The query repeats the opening sentence of article "a1" without its
    # final period (presumably making that article the expected top hit).
    query = (
        "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa"
    )

    print("\n=== Hybrid Retrieval Results (k=5) ===")
    results = retriever.semantic_search(query, k=5)
    for i, r in enumerate(results, 1):
        print(f"{i}. {r.get('title')} | {r.get('url')} | source={r.get('source')}")
        print(f" Snippet: {format_snippet(r.get('content', ''))}")

    # Basic sanity checks (printed only): stored vector count vs. hits returned.
    print("\nCounts:")
    print("vector_count:", retriever.get_vector_count())
    print("results_count:", len(results))