NLPGenius committed
Commit a7270f3 · 1 Parent(s): a97117b

Hybrid retrieval: semantic + BM25-style keyword fusion, lazy index, dedupe, robust fallbacks

cve_factchecker/retriever.py CHANGED
@@ -1,6 +1,9 @@
 from __future__ import annotations
 import os
-from typing import List, Dict, Any
+import math
+import re
+import time
+from typing import List, Dict, Any, Tuple, Optional
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema import Document
 try:
@@ -44,6 +47,18 @@ class VectorNewsRetriever:
         self.persist_directory = env_dir or persist_directory
         self.embeddings = build_embeddings()
         self.vector_store = self._initialize_vector_store()
+        # Lightweight in-memory keyword index (lazy-built)
+        self._index_ready: bool = False
+        self._index_built_at: float = 0.0
+        self._N: int = 0 # number of docs
+        self._avgdl: float = 0.0
+        self._df: Dict[str, int] = {}
+        self._postings: Dict[str, Dict[str, int]] = {} # term -> {doc_id: tf}
+        self._doc_len: Dict[str, int] = {}
+        self._doc_meta: Dict[str, Dict[str, Any]] = {} # id -> {content, metadata}
+        self._stopwords = set(
+            "the a an and or of to in on for from by with without at as is are was were be been being this that those these it its their his her you your we our not no over under into about across more most least few many much may might should would could will can https http www com pk net org www.".split()
+        )
     def _initialize_vector_store(self) -> Chroma:
         """Initialize vector store with proper error handling for permission issues."""
         # If no persist directory (failed all write tests), use in-memory
@@ -97,6 +112,10 @@ class VectorNewsRetriever:
             print(f"⚠️ Could not clear vector store: {e}")
             # Fallback: create new in-memory store
             self.vector_store = Chroma(embedding_function=self.embeddings, collection_name="news_articles_fresh")
+            # Invalidate keyword index after clear
+            self._index_ready = False
+            self._df.clear(); self._postings.clear(); self._doc_len.clear(); self._doc_meta.clear()
+            self._N = 0; self._avgdl = 0.0
 
     def store_articles_in_vector_db(self, articles: List[NewsArticle], clear_first: bool = False) -> None:
         if not articles:
@@ -204,14 +223,216 @@ class VectorNewsRetriever:
         except Exception as e:
             print(f"⚠️ Could not persist vector store: {e}")
         print(f"✅ Stored {len(docs)} chunks from {len(articles)} articles")
-    def semantic_search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
+        # Invalidate index so it is rebuilt on next query
+        self._index_ready = False
+        self._df.clear(); self._postings.clear(); self._doc_len.clear(); self._doc_meta.clear()
+        self._N = 0; self._avgdl = 0.0
+    # -----------------------------
+    # Hybrid Retrieval Implementation
+    # -----------------------------
+    def _tokenize(self, text: str) -> List[str]:
+        text = text.lower()
+        # Keep alphanumerics as tokens
+        tokens = re.split(r"[^a-z0-9]+", text)
+        return [t for t in tokens if t and t not in self._stopwords and not t.isdigit()]
+
+    def _ensure_index(self) -> None:
+        if self._index_ready:
+            return
         try:
-            # Guardrails on k to avoid heavy loads
-            k = max(1, min(int(k or 5), 10))
-            docs = self.vector_store.similarity_search(query, k=k)
+            # Prefer direct collection access for efficiency
+            docs_data: Optional[Dict[str, Any]] = None
+            if hasattr(self.vector_store, "_collection") and self.vector_store._collection is not None: # type: ignore[attr-defined]
+                try:
+                    docs_data = self.vector_store._collection.get(include=["ids", "documents", "metadatas"]) # type: ignore[attr-defined]
+                except Exception as e:
+                    print(f"⚠️ Could not read collection directly: {e}")
+            if docs_data is None:
+                try:
+                    docs_data = self.vector_store.get()
+                except Exception as e:
+                    print(f"⚠️ Could not fetch documents for index: {e}")
+                    self._index_ready = False
+                    return
+            ids = docs_data.get("ids", []) or []
+            documents = docs_data.get("documents", []) or []
+            metadatas = docs_data.get("metadatas", []) or []
+            N = len(ids)
+            if N == 0:
+                self._index_ready = True
+                self._N = 0
+                self._avgdl = 0.0
+                return
+            df: Dict[str, int] = {}
+            postings: Dict[str, Dict[str, int]] = {}
+            doc_len: Dict[str, int] = {}
+            doc_meta: Dict[str, Dict[str, Any]] = {}
+            total_len = 0
+            for doc_id, content, meta in zip(ids, documents, metadatas):
+                content = content or ""
+                tokens = self._tokenize(content)
+                total_len += len(tokens)
+                doc_len[doc_id] = len(tokens)
+                # compute term frequencies
+                tf: Dict[str, int] = {}
+                for tok in tokens:
+                    tf[tok] = tf.get(tok, 0) + 1
+                # update postings and df
+                for tok, freq in tf.items():
+                    if tok not in postings:
+                        postings[tok] = {doc_id: freq}
+                        df[tok] = 1
+                    else:
+                        postings[tok][doc_id] = freq
+                        df[tok] = df.get(tok, 0) + 1
+                # store meta for reconstruction
+                doc_meta[doc_id] = {
+                    "content": content,
+                    "metadata": meta or {},
+                }
+            self._N = N
+            self._avgdl = (total_len / N) if N else 0.0
+            self._df = df
+            self._postings = postings
+            self._doc_len = doc_len
+            self._doc_meta = doc_meta
+            self._index_ready = True
+            self._index_built_at = time.time()
+            # print(f"🔎 Keyword index built for {N} docs (avgdl={self._avgdl:.1f})")
+        except Exception as e:
+            print(f"⚠️ Failed building keyword index: {e}")
+            self._index_ready = False
+
+    def _bm25_scores(self, query: str) -> Dict[str, float]:
+        self._ensure_index()
+        if not self._index_ready or self._N == 0:
+            return {}
+        q_tokens = self._tokenize(query)
+        if not q_tokens:
+            return {}
+        # collect candidate docs (union of postings for query tokens)
+        candidate_docs: Dict[str, float] = {}
+        k1, b = 1.5, 0.75
+        for tok in q_tokens:
+            df = self._df.get(tok, 0)
+            postings = self._postings.get(tok)
+            if not postings or df == 0:
+                continue
+            # IDF with +1 stabilizer
+            idf = math.log((self._N - df + 0.5) / (df + 0.5) + 1.0)
+            for doc_id, tf in postings.items():
+                dl = self._doc_len.get(doc_id, 0) or 1
+                denom = tf + k1 * (1 - b + b * (dl / (self._avgdl or 1.0)))
+                score = idf * (tf * (k1 + 1)) / denom
+                candidate_docs[doc_id] = candidate_docs.get(doc_id, 0.0) + score
+        return candidate_docs
+
+    def _semantic_candidates(self, query: str, n: int) -> List[Tuple[Any, float]]:
+        """Return list of (doc, score) for semantic candidates; fallback if scores not available."""
+        try:
+            if hasattr(self.vector_store, "similarity_search_with_score"):
+                docs_scores = self.vector_store.similarity_search_with_score(query, k=n)
+                # docs_scores -> List[Tuple[Document, float]] where lower score is closer for some stores; normalize later
+                return docs_scores
+            # fallback: without scores, get docs and synthesize decreasing scores
+            docs = self.vector_store.similarity_search(query, k=n)
+            return list(zip(docs, [1.0 - (i / max(1, n)) for i in range(len(docs))]))
         except Exception as e:
             print(f"❌ Vector search failed: {e}")
             return []
+
+    def _normalize_scores(self, scores: Dict[str, float]) -> Dict[str, float]:
+        if not scores:
+            return {}
+        vals = list(scores.values())
+        mx = max(vals)
+        mn = min(vals)
+        if mx == mn:
+            return {k: 1.0 for k in scores}
+        return {k: (v - mn) / (mx - mn) for k, v in scores.items()}
+
+    def semantic_search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
+        """Hybrid retrieval: fuse semantic and keyword (BM25-like) signals and return top-k results.
+        Maintains original signature and return shape for compatibility.
+        """
+        # Guardrails
+        k = max(1, min(int(k or 5), 10))
+        # Collect candidates
+        n_sem = max(k * 2, 10)
+        n_kw = max(k * 3, 20)
+
+        sem_pairs = self._semantic_candidates(query, n_sem)
+        # Build semantic score map keyed by (url or id)
+        sem_scores: Dict[str, float] = {}
+        sem_docs_map: Dict[str, Any] = {}
+        for d, score in sem_pairs:
+            meta = getattr(d, "metadata", {}) or {}
+            url = (meta.get("url") or "").strip()
+            key = url or getattr(d, "id", None) or id(d)
+            sem_scores[key] = float(score if score is not None else 0.0)
+            sem_docs_map[key] = d
+        # Normalize semantic scores to ascending relevance (higher better)
+        # For some stores, lower distance is better; invert appropriately
+        if sem_scores:
+            # Try to detect if lower is better (distance) and invert
+            vals = list(sem_scores.values())
+            lower_is_better = True if len(vals) > 1 and vals[0] > vals[-1] else False
+            if lower_is_better:
+                maxv = max(vals)
+                sem_scores = {k: (maxv - v) for k, v in sem_scores.items()}
+            sem_scores = self._normalize_scores(sem_scores)
+
+        # Keyword BM25 candidates
+        kw_raw_scores = self._bm25_scores(query)
+        # Keep top n_kw keyword docs
+        if kw_raw_scores:
+            kw_items = sorted(kw_raw_scores.items(), key=lambda x: x[1], reverse=True)[:n_kw]
+            kw_raw_scores = dict(kw_items)
+        kw_scores = self._normalize_scores(kw_raw_scores)

+        # Fusion: weighted sum
+        alpha = 0.6 # semantic weight
+        beta = 0.4 # keyword weight
+        fused: Dict[str, float] = {}
+
+        # Include all keys from both sets
+        keys = set(sem_scores.keys()) | set(kw_scores.keys())
+        for key in keys:
+            s = sem_scores.get(key, 0.0)
+            w = kw_scores.get(key, 0.0)
+            fused[key] = alpha * s + beta * w
+
+        if not fused and sem_docs_map:
+            # If keyword index not ready, fallback to semantic docs order
+            ordered = sorted(sem_docs_map.items(), key=lambda kv: sem_scores.get(kv[0], 0.0), reverse=True)
+            docs = [d for _, d in ordered[:k]]
+        elif not fused and kw_scores:
+            # If semantic failed, reconstruct docs from index metadata
+            ordered = sorted(kw_scores.items(), key=lambda kv: kv[1], reverse=True)[:k]
+            docs = []
+            for doc_id, _ in ordered:
+                meta_entry = self._doc_meta.get(doc_id) or {}
+                content = meta_entry.get("content", "")
+                meta = meta_entry.get("metadata", {})
+                docs.append(Document(page_content=content, metadata=meta))
+        else:
+            ordered = sorted(fused.items(), key=lambda kv: kv[1], reverse=True)[:max(k*2, 20)]
+            docs = []
+            seen_keys = set()
+            for key, _ in ordered:
+                if key in seen_keys:
+                    continue
+                seen_keys.add(key)
+                if key in sem_docs_map:
+                    docs.append(sem_docs_map[key])
+                else:
+                    # reconstruct from keyword index
+                    meta_entry = self._doc_meta.get(key) or {}
+                    content = meta_entry.get("content", "")
+                    meta = meta_entry.get("metadata", {})
+                    docs.append(Document(page_content=content, metadata=meta))
+
+        # Convert to results shape and dedupe by URL
         results: List[Dict[str, Any]] = []
         seen_urls = set()
         for d in docs:
@@ -221,9 +442,17 @@ class VectorNewsRetriever:
             if content.startswith("Title: "):
                 line = content.splitlines()[0]
                 title = line.replace("Title: ", "").strip() or title
-            url = meta.get("url", "")
+            url = (meta.get("url", "") or "").strip()
            if url and url in seen_urls:
                 continue
             seen_urls.add(url)
-            results.append({"title": title, "content": content, "url": url, "source": meta.get("source", "Unknown"), "metadata": meta})
+            results.append({
+                "title": title,
+                "content": content,
+                "url": url,
+                "source": meta.get("source", "Unknown"),
+                "metadata": meta,
+            })
+            if len(results) >= k:
+                break
         return results
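
For readers skimming the diff, the scoring that _bm25_scores, _normalize_scores, and semantic_search apply can be reproduced in isolation. The sketch below uses hypothetical toy numbers (none of them come from the repository) to walk one query term through the BM25 formula used above, and then fuses a normalized semantic score with a normalized keyword score using the same alpha=0.6 / beta=0.4 weights.

import math

# Toy index statistics (hypothetical values, for illustration only)
N = 4                 # documents in the keyword index
df = 2                # documents containing the query term
tf = 3                # occurrences of the term in one candidate chunk
dl, avgdl = 120, 100  # candidate length vs. average length, in tokens
k1, b = 1.5, 0.75     # same constants as _bm25_scores

# Per-term BM25 contribution, matching the diff:
# idf = log((N - df + 0.5) / (df + 0.5) + 1)
idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
denom = tf + k1 * (1 - b + b * (dl / avgdl))
bm25 = idf * (tf * (k1 + 1)) / denom
print(f"BM25 contribution: {bm25:.3f}")  # ~1.10 with these numbers

# Fusion: both signals are min-max normalized to [0, 1] first,
# then combined with alpha=0.6 (semantic) and beta=0.4 (keyword).
sem_norm, kw_norm = 0.8, 0.5  # hypothetical normalized scores
fused = 0.6 * sem_norm + 0.4 * kw_norm
print(f"Fused score: {fused:.2f}")  # 0.6*0.8 + 0.4*0.5 = 0.68
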
test_hybrid_retriever.py ADDED
@@ -0,0 +1,80 @@
+import os
+import time
+from cve_factchecker.retriever import VectorNewsRetriever
+from cve_factchecker.models import NewsArticle
+
+# Keep this test lightweight and isolated
+os.environ["USE_DUMMY_EMBEDDINGS"] = "true"
+os.environ["VECTOR_PERSIST_DIR"] = os.path.abspath("./vector_db_hybrid_test")
+
+articles = [
+    NewsArticle(
+        title="Militants storm FC lines in Bannu",
+        content=(
+            "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa. "
+            "Security forces responded swiftly, and the situation is under control."
+        ),
+        url="https://tribune.com.pk/story/2564614/militants-storm-fc-lines-in-bannu",
+        source="The Express Tribune",
+        published_date="2025-09-15",
+        scraped_date=str(int(time.time())),
+        article_id="a1",
+        language="English",
+    ),
+    NewsArticle(
+        title="Six soldiers martyred; five terrorists killed in Bannu FC compound attack",
+        content=(
+            "An attack on the FC compound in Bannu resulted in the martyrdom of six soldiers."
+            "Reports indicate five terrorists were killed in the exchange."
+        ),
+        url="https://dailytimes.com.pk/1363459/six-soldiers-martyred-five-terrorists-killed-in-attack-on-bannu-fc-compound/",
+        source="Daily Times",
+        published_date="2025-09-15",
+        scraped_date=str(int(time.time())),
+        article_id="a2",
+        language="English",
+    ),
+    NewsArticle(
+        title="KP operations update: militants neutralized",
+        content=(
+            "Security operations in Khyber-Pakhtunkhwa neutralized multiple militants. The Frontier Corps participated "
+            "in the operations across the province."
+        ),
+        url="https://dailytimes.com.pk/1368975/31-indian-backed-militants-killed-in-kp-operations/",
+        source="Daily Times",
+        published_date="2025-09-16",
+        scraped_date=str(int(time.time())),
+        article_id="a3",
+        language="English",
+    ),
+    NewsArticle(
+        title="Sports: Cricket series announced",
+        content="Pakistan Cricket Board announced a new bilateral series in Lahore next month.",
+        url="https://example.com/sports/cricket-series",
+        source="Example Sports",
+        published_date="2025-09-10",
+        scraped_date=str(int(time.time())),
+        article_id="a4",
+        language="English",
+    ),
+]
+
+if __name__ == "__main__":
+    retriever = VectorNewsRetriever(persist_directory=os.environ["VECTOR_PERSIST_DIR"])
+    retriever.store_articles_in_vector_db(articles, clear_first=True)
+
+    query = (
+        "At least five militants attacked the Frontier Corps (FC) Lines in Bannu, Khyber-Pakhtunkhwa"
+    )
+
+    print("\n=== Hybrid Retrieval Results (k=5) ===")
+    results = retriever.semantic_search(query, k=5)
+    for i, r in enumerate(results, 1):
+        print(f"{i}. {r.get('title')} | {r.get('url')} | source={r.get('source')}")
+        snippet = (r.get('content','') or '')[:120].replace('\n', ' ')
+        print(f" Snippet: {snippet}...")
+
+    # Basic sanity checks
+    print("\nCounts:")
+    print("vector_count:", retriever.get_vector_count())
+    print("results_count:", len(results))