Spaces:

NLPGenius
/

CVE-FactChecker

Running

App Files Files Community

NLPGenius committed on Sep 12

Commit

16779bb

1 Parent(s): d004d70

vector store & Firebase API error

Browse files

Files changed (1) hide show

cve_factchecker/app.py +179 -23

cve_factchecker/app.py CHANGED Viewed

@@ -3,6 +3,12 @@ from flask import Flask, jsonify, request
 from typing import Any, Dict
 import time
 import threading
 from .orchestrator import FactCheckSystem
 from .firebase_service import FirebaseVectorSync
 try:
@@ -10,44 +16,179 @@ try:
 except Exception:  # pragma: no cover
     CORS = None  # type: ignore
-system = FactCheckSystem()
-firebase_sync = FirebaseVectorSync()
-import os
-AUTO_INGEST = True  # Always ingest on startup for seamless experience
 INGEST_STATUS: Dict[str, Any] = {"started": time.time(), "finished": False, "synced": 0}
 def _background_ingest() -> None:
     try:
-        print("🚀 Refreshing vector store with latest 5000 Firebase articles...")
-        ingest_res = system.ingest_firebase(limit=5000)  # Use orchestrator method with 5000 limit
-        INGEST_STATUS.update({"finished": True, **ingest_res})
-        if not ingest_res.get("success"):
-            print("⚠️ Startup ingestion did not succeed:", ingest_res.get("error"))
-        else:
-            print(f"✅ Startup ingestion complete: {ingest_res.get('synced')} articles")
         # Log LLM availability
-        if system.analyzer.client:
-            print(f"🤖 LLM active: model={system.cfg.model} max_tokens={system.cfg.max_tokens}")
-        else:
-            print("⚠️ No LLM API key detected. Using heuristic fallback.")
     except Exception as e:
         INGEST_STATUS.update({"finished": True, "error": str(e)})
         print(f"❌ Startup ingestion failed: {e}")
 def _start_ingest_thread() -> None:
     if not AUTO_INGEST:
         return
     t = threading.Thread(target=_background_ingest, name="firebase-ingest", daemon=True)
     t.start()
 app = Flask(__name__)
 if CORS:
     CORS(app, resources={r"/*": {"origins": "*"}})
 start_time = time.time()
-# Start ingestion in background as soon as the module is imported / app is created
-_start_ingest_thread()
 @app.route('/health')
 def health() -> Any:
@@ -56,13 +197,28 @@ def health() -> Any:
 ## Simplified API: only /health and /fact-check provided. Data ingestion occurs automatically on startup.
 def _run_fact_check(claim: str):  # internal helper
     if not INGEST_STATUS.get("finished"):
         return {"verdict": "INITIALIZING", "reasoning": "Ingestion still in progress. Try again soon.", "confidence": 0.0}, 503
-    result = system.fact_check(claim)
-    if result.get('verdict') == 'ERROR' and '402' in result.get('reasoning',''):
-        result['verdict'] = 'UNVERIFIED'
-        result['reasoning'] = 'LLM quota/credits insufficient. Retrieval performed; provide API key to enable full analysis.'
-    return result, 200
 @app.route('/fact-check', methods=['POST','GET'])
 def fact_check() -> Any:

 from typing import Any, Dict
 import time
 import threading
+import os
+try:
+    import fcntl
+except ImportError:
+    # Windows doesn't have fcntl, use alternative locking
+    fcntl = None
 from .orchestrator import FactCheckSystem
 from .firebase_service import FirebaseVectorSync
 try:
 except Exception:  # pragma: no cover
     CORS = None  # type: ignore
+# Global initialization with proper error handling
+# Both start as None and are assigned by _safe_initialize_system(); callers
+# treat None as "not initialized".
+system = None
+firebase_sync = None
+# Enabled unless the AUTO_INGEST environment variable is set to something
+# other than "true", "1" or "yes" (case-insensitive).
+AUTO_INGEST = os.environ.get("AUTO_INGEST", "true").lower() in ("true", "1", "yes")
 INGEST_STATUS: Dict[str, Any] = {"started": time.time(), "finished": False, "synced": 0}
+# Held by _safe_initialize_system() for the duration of initialization.
+INIT_LOCK = threading.Lock()
+# File whose existence marks an ingestion in progress.
+INGEST_LOCK_FILE = "/tmp/ingest.lock" if os.name != 'nt' else "ingest.lock"
+def _safe_initialize_system():
+    """Create the module-level ``system`` and ``firebase_sync`` objects once.
+
+    Holds ``INIT_LOCK`` for the whole attempt and returns immediately when
+    ``system`` is already set. The vector store directory is
+    ``/data/vector_db`` when ``/data`` exists, otherwise ``/tmp/vector_db``.
+    If the primary construction raises, a fallback ``FactCheckSystem`` is
+    built in ``/tmp/vector_db_fallback`` with ``firebase_sync`` set to
+    ``None``; if that also raises, both globals are left as ``None``.
+    Exceptions are caught and printed rather than propagated.
+    """
+    global system, firebase_sync
+    with INIT_LOCK:
+        if system is not None:
+            return
+        try:
+            print("🔧 Initializing fact-check system...")
+            # Use safe directory for vector store
+            vector_dir = "/tmp/vector_db"
+            if os.path.exists("/data"):
+                vector_dir = "/data/vector_db"
+            system = FactCheckSystem(vector_dir=vector_dir)
+            firebase_sync = FirebaseVectorSync()
+            print("✅ System initialized successfully")
+        except Exception as e:
+            print(f"❌ System initialization failed: {e}")
+            # Create minimal fallback system
+            try:
+                system = FactCheckSystem(vector_dir="/tmp/vector_db_fallback")
+                firebase_sync = None
+                print("⚠️ Using fallback system")
+            except Exception as fallback_error:
+                print(f"❌ Even fallback failed: {fallback_error}")
+                system = None
+                firebase_sync = None
+def _is_ingest_locked():
+    """Return True when the ingestion lock file exists.
+
+    Only the presence of ``INGEST_LOCK_FILE`` is tested; the PID stored in it
+    is not inspected.
+    """
+    try:
+        return os.path.exists(INGEST_LOCK_FILE)
+    except (OSError, ValueError):
+        # Treat an unreadable or invalid path as "not locked" instead of
+        # swallowing every exception (including KeyboardInterrupt).
+        return False
+def _acquire_ingest_lock():
+    """Atomically create the ingestion lock file.
+
+    The lock is the file's existence, which is what ``_is_ingest_locked``
+    tests and ``_release_ingest_lock`` removes. ``O_CREAT | O_EXCL`` makes
+    creation fail when the file already exists, so two workers cannot both
+    succeed, and the same code path works with or without ``fcntl``. An
+    advisory ``flock`` is deliberately not used: it would be dropped as soon
+    as the file handle is closed.
+
+    Returns:
+        True if this process created the lock file, False if it already
+        exists or could not be created.
+    """
+    try:
+        fd = os.open(INGEST_LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
+    except OSError:
+        return False
+    try:
+        with os.fdopen(fd, 'w') as f:
+            f.write(str(os.getpid()))
+    except OSError:
+        # The PID is informational only; the file's existence is the lock.
+        pass
+    return True
+def _release_ingest_lock():
+    """Remove the ingestion lock file; a missing file is not an error.
+
+    Never raises for filesystem errors, so it is safe to call from a
+    ``finally`` block.
+    """
+    try:
+        os.remove(INGEST_LOCK_FILE)
+    except FileNotFoundError:
+        # Already released, or never acquired.
+        pass
+    except OSError as e:
+        # Best-effort cleanup: report the failure instead of hiding it.
+        print(f"⚠️ Could not remove ingestion lock file: {e}")
 def _background_ingest() -> None:
+    """Run the startup Firebase ingestion, guarded by the ingestion lock file.
+
+    Marks ``INGEST_STATUS`` as finished and skipped when the lock file already
+    exists or cannot be acquired. Otherwise initializes the system and calls
+    ``system.ingest_firebase`` up to three times, sleeping 5s and then 10s
+    between attempts when the error text contains "429". The outcome (result
+    fields or an ``error`` message) is stored in ``INGEST_STATUS``, and the
+    lock acquired here is released when the function exits.
+    """
+    global system
+    # Check if another worker is already doing ingestion
+    if _is_ingest_locked():
+        print("⏳ Another process is handling ingestion, skipping...")
+        INGEST_STATUS.update({"finished": True, "skipped": True})
+        return
+    # Try to acquire lock
+    if not _acquire_ingest_lock():
+        print("⏳ Could not acquire ingestion lock, skipping...")
+        INGEST_STATUS.update({"finished": True, "skipped": True})
+        return
     try:
+        _safe_initialize_system()
+        if system is None:
+            print("❌ System not initialized, cannot perform ingestion")
+            INGEST_STATUS.update({"finished": True, "error": "System initialization failed"})
+            return
+        print("🚀 Refreshing vector store with latest Firebase articles...")
+        # Add retry logic with exponential backoff for Firebase API
+        max_retries = 3
+        base_delay = 5  # Start with 5 seconds
+        # Use smaller limit to avoid rate limiting
+        limit = 1000
+        for attempt in range(max_retries):
+            try:
+                ingest_res = system.ingest_firebase(limit=limit)
+                if ingest_res.get("success"):
+                    INGEST_STATUS.update({"finished": True, **ingest_res})
+                    print(f"✅ Startup ingestion complete: {ingest_res.get('synced')} articles")
+                    break
+                else:
+                    error_msg = ingest_res.get("error", "Unknown error")
+                    if "429" in str(error_msg) and attempt < max_retries - 1:
+                        delay = base_delay * (2 ** attempt)  # Exponential backoff
+                        print(f"⏳ Rate limited, waiting {delay}s before retry {attempt + 1}/{max_retries}")
+                        time.sleep(delay)
+                        continue
+                    else:
+                        print(f"⚠️ Startup ingestion failed: {error_msg}")
+                        INGEST_STATUS.update({"finished": True, "error": error_msg})
+                        break
+            except Exception as e:
+                if "429" in str(e) and attempt < max_retries - 1:
+                    delay = base_delay * (2 ** attempt)
+                    print(f"⏳ Rate limited (exception), waiting {delay}s before retry {attempt + 1}/{max_retries}")
+                    time.sleep(delay)
+                    continue
+                else:
+                    # Bare raise keeps the original traceback for the outer handler.
+                    raise
         # Log LLM availability
+        try:
+            if system and system.analyzer and system.analyzer.client:
+                print(f"🤖 LLM active: model={system.cfg.model} max_tokens={system.cfg.max_tokens}")
+            else:
+                print("⚠️ No LLM API key detected. Using heuristic fallback.")
+        except Exception as e:
+            print(f"⚠️ Could not check LLM status: {e}")
     except Exception as e:
         INGEST_STATUS.update({"finished": True, "error": str(e)})
         print(f"❌ Startup ingestion failed: {e}")
+    finally:
+        _release_ingest_lock()
 def _start_ingest_thread() -> None:
+    """Start ingestion thread with proper conditions.
+
+    Does nothing when AUTO_INGEST is false or INGEST_STATUS is already marked
+    finished; otherwise runs _background_ingest on a daemon thread named
+    "firebase-ingest".
+    """
     if not AUTO_INGEST:
+        print("⏭️ Auto-ingestion disabled")
         return
+    # Only start if we're not already finished
+    if INGEST_STATUS.get("finished"):
+        return
     t = threading.Thread(target=_background_ingest, name="firebase-ingest", daemon=True)
     t.start()
+    print("🔄 Started background ingestion thread")
 app = Flask(__name__)
 if CORS:
     CORS(app, resources={r"/*": {"origins": "*"}})
 start_time = time.time()
+# Initialize system safely
+_safe_initialize_system()
+# Start ingestion in background only for the main process
+# NOTE(review): Werkzeug's reloader sets WERKZEUG_RUN_MAIN='true' in the child
+# process that serves requests, so where os.fork exists this condition skips
+# ingestion in that child and INGEST_STATUS would never finish there —
+# confirm this is the intended process.
+if os.environ.get('WERKZEUG_RUN_MAIN') != 'true' or not hasattr(os, 'fork'):
+    _start_ingest_thread()
 @app.route('/health')
 def health() -> Any:
 ## Simplified API: only /health and /fact-check provided. Data ingestion occurs automatically on startup.
 def _run_fact_check(claim: str):  # internal helper
+    """Run a fact check for ``claim`` and return ``(payload, status_code)``.
+
+    Returns 503 with an ERROR verdict when the system cannot be initialized,
+    503 with an INITIALIZING verdict while ingestion has not finished, 200
+    with the result of ``system.fact_check`` otherwise, and 500 with an ERROR
+    verdict if that call raises. An ERROR result whose reasoning contains
+    "402" is rewritten to UNVERIFIED before being returned.
+    """
+    global system
+    # Ensure system is initialized
+    if system is None:
+        _safe_initialize_system()
+    if system is None:
+        return {"verdict": "ERROR", "reasoning": "System initialization failed. Please try again later.", "confidence": 0.0}, 503
     if not INGEST_STATUS.get("finished"):
         return {"verdict": "INITIALIZING", "reasoning": "Ingestion still in progress. Try again soon.", "confidence": 0.0}, 503
+    try:
+        result = system.fact_check(claim)
+        if result.get('verdict') == 'ERROR' and '402' in result.get('reasoning',''):
+            result['verdict'] = 'UNVERIFIED'
+            result['reasoning'] = 'LLM quota/credits insufficient. Retrieval performed; provide API key to enable full analysis.'
+        return result, 200
+    except Exception as e:
+        print(f"❌ Fact check error: {e}")
+        return {"verdict": "ERROR", "reasoning": f"Analysis failed: {str(e)}", "confidence": 0.0}, 500
 @app.route('/fact-check', methods=['POST','GET'])
 def fact_check() -> Any: