Spaces:

NLPGenius
/

CVE-FactChecker

Sleeping

File size: 9,056 Bytes

186fe46

#!/usr/bin/env python3
"""
Firebase collection explorer to understand the actual database structure.
"""

import os
import sys
import requests

# Put the directory containing this script at the front of sys.path.
# NOTE: this is the script's own directory, not its parent. Presumably this
# lets the `cve_factchecker` package imported inside the functions below
# resolve when the script is run directly — TODO confirm against repo layout.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

def _string_value(value):
    """Return the ``stringValue`` payload of a Firestore REST value, or None.

    Anything that is not a dict carrying a ``stringValue`` key yields None,
    so callers can treat "missing" and "not a string field" the same way.
    """
    if isinstance(value, dict):
        return value.get("stringValue")
    return None


def _print_document_summary(fields):
    """Print field names, language and a short content sample for one document.

    Args:
        fields: The ``fields`` mapping of a Firestore REST document.
    """
    field_names = list(fields)
    print(f"   📊 Fields: {field_names}")

    language = _string_value(fields.get("language"))
    if language is not None:
        print(f"   🌐 Language: '{language}'")

    # Any field whose name hints at article body text counts as content.
    content_fields = [
        name
        for name in field_names
        if any(term in name.lower() for term in ("content", "text", "article"))
    ]
    if not content_fields:
        return
    print(f"   📝 Content fields: {content_fields}")

    # Only the first content field is sampled, to keep the output short.
    first_content_field = content_fields[0]
    sample = _string_value(fields[first_content_field])
    if sample is not None:
        print(f"   📖 {first_content_field} sample: {sample[:100]}...")


def explore_firebase_collections():
    """Probe a fixed list of candidate Firestore collections and report on them.

    For every candidate name a small page of documents is requested through
    the Firestore REST API. Collections that return documents are recorded
    under their own name; collections that return none are recorded as
    ``"<name> (empty)"``. If ``"articles"`` returned documents, its language
    distribution is printed as well.

    Returns:
        list[str]: The recorded collection labels, or ``[]`` when the loader
        cannot be imported/created or any other unexpected error occurs.
    """
    print("🔍 Firebase Collections Explorer")
    print("=" * 60)

    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        loader = FirebaseNewsLoader()
        project_id = loader.project_id
        api_key = loader.config.api_key

        print(f"📡 Project ID: {project_id}")

        # Try different collection names
        collection_candidates = [
            "articles",
            "english_articles",
            "Articles",
            "English_articles",
            "news_articles",
            "cve_articles",
            "documents",
        ]

        found_collections = []

        for collection_name in collection_candidates:
            print(f"\n🔍 Checking collection: '{collection_name}'")

            # A failure on one candidate must not abort the remaining probes.
            try:
                base_url = f"https://firestore.googleapis.com/v1/projects/{project_id}/databases/(default)/documents/{collection_name}"
                params = {
                    "key": api_key,
                    "pageSize": 3,  # Small sample
                }
                resp = requests.get(base_url, params=params, timeout=15)

                if resp.status_code == 404:
                    print("   ❌ Collection does not exist")
                    continue
                if resp.status_code != 200:
                    print(f"   ⚠️ Error {resp.status_code}: {resp.text[:100]}")
                    continue

                docs = resp.json().get("documents", [])
                if not docs:
                    # NOTE(review): Firestore may answer 200 with no documents
                    # for a collection that was never created — confirm
                    # whether "empty" here can also mean "missing".
                    print("   📭 Collection exists but is empty")
                    found_collections.append(f"{collection_name} (empty)")
                    continue

                print(f"   ✅ Found {len(docs)} documents")
                # Recorded before inspection so a malformed sample document
                # does not hide a collection that demonstrably has data.
                found_collections.append(collection_name)

                first_doc = docs[0]
                if "fields" in first_doc:
                    _print_document_summary(first_doc["fields"])

            except Exception as e:
                print(f"   ❌ Error checking collection: {e}")

        print("\n📋 Summary:")
        print(f"   Found collections: {found_collections}")

        # If main articles collection exists, explore language distribution
        if "articles" in found_collections:
            print("\n🌐 Analyzing language distribution in 'articles' collection...")
            explore_language_distribution(loader, "articles")

        return found_collections

    except Exception as e:
        print(f"❌ Firebase exploration failed: {e}")
        import traceback
        traceback.print_exc()
        return []

def _tally_sample_documents(docs):
    """Count languages and collect content lengths from Firestore documents.

    Args:
        docs: Iterable of Firestore REST document dicts.

    Returns:
        tuple[dict, list]: ``(language_counts, content_lengths)`` where
        ``language_counts`` maps each language string (``"unknown"`` when the
        document has no string ``language`` field) to its number of documents,
        and ``content_lengths`` holds the character length of the first
        recognised content field of each document that has one.
    """
    content_field_names = ("content", "Content", "article_text", "Article_text", "text")
    language_counts = {}
    content_lengths = []

    for doc in docs:
        if "fields" not in doc:
            continue
        fields = doc["fields"]

        lang = "unknown"
        if "language" in fields and "stringValue" in fields["language"]:
            lang = fields["language"]["stringValue"]
        language_counts[lang] = language_counts.get(lang, 0) + 1

        # Only the first matching content field contributes a length.
        length = next(
            (
                len(fields[name]["stringValue"])
                for name in content_field_names
                if name in fields and "stringValue" in fields[name]
            ),
            None,
        )
        if length is not None:
            content_lengths.append(length)

    return language_counts, content_lengths


def explore_language_distribution(loader, collection_name, sample_size=10):
    """Print language and content-length statistics for a collection sample.

    Args:
        loader: Object exposing ``project_id`` and ``config.api_key``.
        collection_name: Name of the Firestore collection to sample.
        sample_size: Number of documents requested (sent as ``pageSize``).

    Returns:
        None. Results, HTTP errors and unexpected exceptions are all reported
        on stdout; nothing is raised to the caller.
    """
    try:
        base_url = f"https://firestore.googleapis.com/v1/projects/{loader.project_id}/databases/(default)/documents/{collection_name}"
        params = {
            "key": loader.config.api_key,
            "pageSize": sample_size,
        }

        resp = requests.get(base_url, params=params, timeout=15)

        if resp.status_code != 200:
            # Previously a non-200 answer produced no output at all, which
            # made a failed analysis look the same as one that never ran.
            print(f"   ⚠️ Error {resp.status_code}: {resp.text[:100]}")
            return

        docs = resp.json().get("documents", [])
        language_counts, content_lengths = _tally_sample_documents(docs)

        print(f"   Language distribution: {language_counts}")
        if content_lengths:
            avg_length = sum(content_lengths) / len(content_lengths)
            print(f"   Average content length: {avg_length:.0f} characters")
            print(f"   Content range: {min(content_lengths)} - {max(content_lengths)} characters")

    except Exception as e:
        print(f"   ❌ Error analyzing language distribution: {e}")

def create_test_collection_strategy(found_collections):
    """Print a recommended testing strategy for the discovered collections.

    Preference order: a populated ``"articles"`` collection (which also
    triggers a live language-filter test), then the first populated collection
    whose name contains "english", otherwise a generic fallback suggestion.

    Args:
        found_collections: Collection labels as produced by
            ``explore_firebase_collections``; empty collections carry a
            ``" (empty)"`` suffix.

    Returns:
        The ``found_collections`` argument, unchanged.
    """
    print(f"\n💡 Recommended Testing Strategy")
    print("=" * 60)

    if "articles" in found_collections:
        print("✅ Use 'articles' collection with language filtering")
        print("   - This appears to be the main collection")
        print("   - Filter by language='English' or similar")

        # Test language filtering
        print(f"\n🧪 Testing language filtering on 'articles' collection...")
        test_language_filtering()
        return found_collections

    # Entries labelled "<name> (empty)" hold no documents; recommending one
    # (or printing its label as if it were a collection name) would point the
    # user at a collection with nothing to test against.
    english_collections = [
        col
        for col in found_collections
        if "english" in col.lower() and not col.endswith(" (empty)")
    ]

    if english_collections:
        print(f"✅ Use English-specific collection: {english_collections[0]}")
    else:
        print("⚠️ No obvious English collection found")
        print("💡 Recommended approach:")
        print("   1. Use the largest available collection")
        print("   2. Apply content-based English detection")

    return found_collections

def test_language_filtering():
    """Try several spellings of the English language filter against the loader.

    Each candidate value is passed to ``fetch_articles`` in turn; the first
    one that yields any articles has a title sample printed and ends the
    search. Any failure (including the loader being unavailable) is reported
    on stdout rather than raised.
    """
    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        news_loader = FirebaseNewsLoader()

        # Candidate spellings, tried in this order until one returns data.
        for candidate in ("English", "english", "en", "EN", "eng"):
            print(f"   Testing language='{candidate}'...")
            fetched = news_loader.fetch_articles(limit=5, language=candidate)
            print(f"     Result: {len(fetched)} articles")

            if not fetched:
                continue

            # First hit wins: show a title preview and stop probing.
            first_article = fetched[0]
            print(f"     Sample: {first_article.title[:50]}...")
            break

    except Exception as e:
        print(f"   ❌ Language filtering test failed: {e}")

def main():
    """Run the collection exploration and, if anything was found, the strategy step.

    Returns:
        bool: True when the exploration reported at least one collection,
        False otherwise.
    """
    print("🔍 CVE Fact Checker - Firebase Database Explorer")
    print("=" * 80)

    collections = explore_firebase_collections()

    # Nothing discovered: point the user at configuration and signal failure.
    if not collections:
        print("❌ No collections found. Check Firebase configuration.")
        return False

    create_test_collection_strategy(collections)
    return True

if __name__ == "__main__":
    # Process exit status mirrors main(): 0 when it returns a truthy value, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)