NLPGenius committed on
Commit
a97117b
·
1 Parent(s): 1dd0906

Reliability: enforce JSON output via response_format, robust JSON fallback, dedupe retrieved articles, lightweight health

Browse files
cve_factchecker/analyzer.py CHANGED
@@ -29,13 +29,23 @@ class ClaimAnalyzer:
29
  self.cfg = cfg
30
  self.client = build_openrouter_client(cfg)
31
  def analyze(self, claim: str, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
32
  if not self.client:
33
  # Heuristic fallback: simple keyword overlap scoring.
34
  claim_lc = claim.lower()
35
  keywords = {w for w in claim_lc.split() if len(w) > 4}
36
  supporting: List[str] = []
37
  score = 0
38
- for a in articles:
39
  text = (a.get('content','') or '').lower()
40
  overlap = sum(1 for k in keywords if k in text)
41
  if overlap:
@@ -51,12 +61,24 @@ class ClaimAnalyzer:
51
  "contradicting_evidence": [],
52
  "context_quality": "medium" if supporting else "low",
53
  }
54
- context = "\n\n".join([f"Article {i+1}:\nTitle: {a.get('title','Unknown')}\nSource: {a.get('source','Unknown')}\nURL: {a.get('url','')}\nContent: {a.get('content','')[:500]}..." for i,a in enumerate(articles)])
55
- prompt = ("You are an expert Pakistani fact-checker. Analyze the claim against the retrieved context and return JSON only.\n\n"
56
- f"NEWS CLAIM: {claim}\n\nRETRIEVED CONTEXT:\n{context}\n\n"
57
- "Return strictly valid JSON with keys: verdict, confidence, reasoning, supporting_evidence, contradicting_evidence, context_quality.")
 
 
 
 
58
  try:
59
- content = chat_complete(self.client, self.cfg.model, prompt, temperature=self.cfg.temperature, max_tokens=self.cfg.max_tokens).strip()
 
 
 
 
 
 
 
 
60
  if content.startswith("```"):
61
  content = content.strip("`")
62
  if "\n" in content:
@@ -66,5 +88,26 @@ class ClaimAnalyzer:
66
  content = m.group(0)
67
  data = json.loads(content)
68
  except Exception as e:
69
- return {"verdict": "ERROR", "confidence": 0.0, "reasoning": f"Analysis failed: {e}", "supporting_evidence": [], "contradicting_evidence": [], "context_quality": "low"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  return normalize_result(data)
 
29
  self.cfg = cfg
30
  self.client = build_openrouter_client(cfg)
31
  def analyze(self, claim: str, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
32
+ # Deduplicate articles by URL to reduce duplicates and noise
33
+ deduped: List[Dict[str, Any]] = []
34
+ seen = set()
35
+ for a in articles:
36
+ u = (a.get('url') or '').strip()
37
+ if u and u in seen:
38
+ continue
39
+ seen.add(u)
40
+ deduped.append(a)
41
+
42
  if not self.client:
43
  # Heuristic fallback: simple keyword overlap scoring.
44
  claim_lc = claim.lower()
45
  keywords = {w for w in claim_lc.split() if len(w) > 4}
46
  supporting: List[str] = []
47
  score = 0
48
+ for a in deduped:
49
  text = (a.get('content','') or '').lower()
50
  overlap = sum(1 for k in keywords if k in text)
51
  if overlap:
 
61
  "contradicting_evidence": [],
62
  "context_quality": "medium" if supporting else "low",
63
  }
64
+ context = "\n\n".join([f"Article {i+1}:\nTitle: {a.get('title','Unknown')}\nSource: {a.get('source','Unknown')}\nURL: {a.get('url','')}\nContent: {a.get('content','')[:500]}..." for i,a in enumerate(deduped)])
65
+ prompt = (
66
+ "You are an expert Pakistani fact-checker. Analyze the claim against the retrieved context.\n"
67
+ "Return JSON ONLY. No prose. Use this exact schema keys: \n"
68
+ "{verdict: string, confidence: number between 0 and 1, reasoning: string, supporting_evidence: string[], contradicting_evidence: string[], context_quality: string}.\n"
69
+ "Do not include code fences. Do not include comments."
70
+ f"\n\nNEWS CLAIM: {claim}\n\nRETRIEVED CONTEXT:\n{context}\n"
71
+ )
72
  try:
73
+ # Request structured JSON when supported
74
+ content = chat_complete(
75
+ self.client,
76
+ self.cfg.model,
77
+ prompt,
78
+ temperature=self.cfg.temperature,
79
+ max_tokens=self.cfg.max_tokens,
80
+ response_format={"type": "json_object"}
81
+ ).strip()
82
  if content.startswith("```"):
83
  content = content.strip("`")
84
  if "\n" in content:
 
88
  content = m.group(0)
89
  data = json.loads(content)
90
  except Exception as e:
91
+ # Robust fallback instead of returning ERROR: use heuristic pathway on failure
92
+ print(f"⚠️ JSON parse failed, falling back to heuristic: {e}")
93
+ claim_lc = claim.lower()
94
+ keywords = {w for w in claim_lc.split() if len(w) > 4}
95
+ supporting: List[str] = []
96
+ score = 0
97
+ for a in deduped:
98
+ text = (a.get('content','') or '').lower()
99
+ overlap = sum(1 for k in keywords if k in text)
100
+ if overlap:
101
+ supporting.append(f"Match ({overlap}) in {a.get('url','')}")
102
+ score += overlap
103
+ confidence = min(0.6, 0.1 * score) if supporting else 0.05
104
+ verdict = "POSSIBLY TRUE" if confidence > 0.3 else "UNVERIFIED"
105
+ data = {
106
+ "verdict": verdict,
107
+ "confidence": confidence,
108
+ "reasoning": "LLM output invalid; heuristic fallback used.",
109
+ "supporting_evidence": supporting[:5],
110
+ "contradicting_evidence": [],
111
+ "context_quality": "medium" if supporting else "low",
112
+ }
113
  return normalize_result(data)
cve_factchecker/llm.py CHANGED
@@ -13,7 +13,16 @@ def build_openrouter_client(cfg: OpenRouterConfig) -> Optional[OpenAI]:
13
  print(f"⚠️ LLM initialization failed: {e}")
14
  return None
15
 
16
- def chat_complete(client: OpenAI, model: str, prompt: str, temperature: float = 0.0, max_tokens: int = 800) -> str:
17
- resp = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}], temperature=temperature, max_tokens=max_tokens)
 
 
 
 
 
 
 
 
 
18
  choice = resp.choices[0]
19
  return getattr(getattr(choice, "message", None), "content", None) or getattr(choice, "text", "") or ""
 
13
  print(f"⚠️ LLM initialization failed: {e}")
14
  return None
15
 
16
+ def chat_complete(client: OpenAI, model: str, prompt: str, temperature: float = 0.0, max_tokens: int = 800, response_format: dict | None = None) -> str:
17
+ # Pass response_format when supported by the model/provider (OpenAI/OpenRouter)
18
+ kwargs = {
19
+ "model": model,
20
+ "messages": [{"role": "user", "content": prompt}],
21
+ "temperature": temperature,
22
+ "max_tokens": max_tokens,
23
+ }
24
+ if response_format:
25
+ kwargs["response_format"] = response_format
26
+ resp = client.chat.completions.create(**kwargs)
27
  choice = resp.choices[0]
28
  return getattr(getattr(choice, "message", None), "content", None) or getattr(choice, "text", "") or ""
cve_factchecker/retriever.py CHANGED
@@ -213,6 +213,7 @@ class VectorNewsRetriever:
213
  print(f"❌ Vector search failed: {e}")
214
  return []
215
  results: List[Dict[str, Any]] = []
 
216
  for d in docs:
217
  meta = getattr(d, "metadata", {}) or {}
218
  content = getattr(d, "page_content", "") or ""
@@ -220,5 +221,9 @@ class VectorNewsRetriever:
220
  if content.startswith("Title: "):
221
  line = content.splitlines()[0]
222
  title = line.replace("Title: ", "").strip() or title
223
- results.append({"title": title, "content": content, "url": meta.get("url", ""), "source": meta.get("source", "Unknown"), "metadata": meta})
 
 
 
 
224
  return results
 
213
  print(f"❌ Vector search failed: {e}")
214
  return []
215
  results: List[Dict[str, Any]] = []
216
+ seen_urls = set()
217
  for d in docs:
218
  meta = getattr(d, "metadata", {}) or {}
219
  content = getattr(d, "page_content", "") or ""
 
221
  if content.startswith("Title: "):
222
  line = content.splitlines()[0]
223
  title = line.replace("Title: ", "").strip() or title
224
+ url = meta.get("url", "")
225
+ if url and url in seen_urls:
226
+ continue
227
+ seen_urls.add(url)
228
+ results.append({"title": title, "content": content, "url": url, "source": meta.get("source", "Unknown"), "metadata": meta})
229
  return results