# File size: 3,218 Bytes
# a7270f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import time
from cve_factchecker.retriever import VectorNewsRetriever
from cve_factchecker.models import NewsArticle

# Keep this test lightweight and isolated: request dummy embeddings and point
# the vector store at a dedicated directory resolved against the current
# working directory.
# NOTE(review): these variables are set after the cve_factchecker imports
# above; if that package reads them at import time the values arrive too
# late — confirm against the package.
os.environ.update(
    {
        "USE_DUMMY_EMBEDDINGS": "true",
        "VECTOR_PERSIST_DIR": os.path.abspath("./vector_db_hybrid_test"),
    }
)

# One timestamp shared by every fixture so all articles carry the same
# scraped_date (epoch seconds, as a string) even if the clock ticks while the
# list is being built.
_SCRAPED_AT = str(int(time.time()))

# Fixture corpus for the retrieval smoke test: three articles about the
# Bannu / Khyber-Pakhtunkhwa security incidents plus one unrelated sports
# article. The first sentence of article "a1" is the text used as the search
# query in the __main__ section below.
articles = [
    NewsArticle(
        title="Militants storm FC lines in Bannu",
        content=(
            "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa. "
            "Security forces responded swiftly, and the situation is under control."
        ),
        url="https://tribune.com.pk/story/2564614/militants-storm-fc-lines-in-bannu",
        source="The Express Tribune",
        published_date="2025-09-15",
        scraped_date=_SCRAPED_AT,
        article_id="a1",
        language="English",
    ),
    NewsArticle(
        title="Six soldiers martyred; five terrorists killed in Bannu FC compound attack",
        content=(
            # Adjacent string literals are concatenated verbatim, so the
            # trailing space is required to keep the two sentences apart.
            "An attack on the FC compound in Bannu resulted in the martyrdom of six soldiers. "
            "Reports indicate five terrorists were killed in the exchange."
        ),
        url="https://dailytimes.com.pk/1363459/six-soldiers-martyred-five-terrorists-killed-in-attack-on-bannu-fc-compound/",
        source="Daily Times",
        published_date="2025-09-15",
        scraped_date=_SCRAPED_AT,
        article_id="a2",
        language="English",
    ),
    NewsArticle(
        title="KP operations update: militants neutralized",
        content=(
            "Security operations in Khyber-Pakhtunkhwa neutralized multiple militants. The Frontier Corps participated "
            "in the operations across the province."
        ),
        url="https://dailytimes.com.pk/1368975/31-indian-backed-militants-killed-in-kp-operations/",
        source="Daily Times",
        published_date="2025-09-16",
        scraped_date=_SCRAPED_AT,
        article_id="a3",
        language="English",
    ),
    # Off-topic article: shares no subject matter with the query.
    NewsArticle(
        title="Sports: Cricket series announced",
        content="Pakistan Cricket Board announced a new bilateral series in Lahore next month.",
        url="https://example.com/sports/cricket-series",
        source="Example Sports",
        published_date="2025-09-10",
        scraped_date=_SCRAPED_AT,
        article_id="a4",
        language="English",
    ),
]

if __name__ == "__main__":
    # Index the fixture articles in the directory named by VECTOR_PERSIST_DIR,
    # passing clear_first=True to the store call.
    store = VectorNewsRetriever(persist_directory=os.environ["VECTOR_PERSIST_DIR"])
    store.store_articles_in_vector_db(articles, clear_first=True)

    # The claim is the opening sentence of fixture article "a1" (without the
    # trailing period).
    claim = "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa"

    print("\n=== Hybrid Retrieval Results (k=5) ===")
    hits = store.semantic_search(claim, k=5)
    for rank, hit in enumerate(hits, start=1):
        title = hit.get('title')
        link = hit.get('url')
        origin = hit.get('source')
        print(f"{rank}. {title} | {link} | source={origin}")

        # A missing or None content falls back to an empty string; the preview
        # is the first 120 characters with newlines flattened to spaces.
        body = hit.get('content', '') or ''
        preview = body[:120].replace('\n', ' ')
        print(f"   Snippet: {preview}...")

    # Basic sanity checks: these are printed for manual inspection only,
    # nothing is asserted.
    print("\nCounts:")
    print("vector_count:", store.get_vector_count())
    print("results_count:", len(hits))