#!/usr/bin/env python3
"""
Firebase collection explorer to understand the actual database structure.

Probes a fixed list of candidate Firestore collection names through the
Firestore REST API, prints what each one contains, and then prints a
suggested testing strategy based on what was found.
"""

import os
import sys
import traceback
from collections import Counter

import requests

# Put this script's own directory first on the import path. The functions
# below import ``cve_factchecker`` lazily and presumably rely on this.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

# NOTE(review): the status glyphs in the messages below look like UTF-8 emoji
# that were decoded with the wrong codec. They are left untouched because some
# bytes appear to be lost - restore them from the original file if available.

# Timeout, in seconds, applied to every Firestore REST request.
_REQUEST_TIMEOUT_SECONDS = 15

# Suffix appended to a collection name that answered 200 without documents.
_EMPTY_SUFFIX = " (empty)"

# Field names probed, in order, when measuring an article's content length.
_CONTENT_FIELD_NAMES = ("content", "Content", "article_text", "Article_text", "text")


def _list_documents(project_id, api_key, collection_name, page_size):
    """Request up to *page_size* documents of a collection via Firestore REST.

    Returns the raw ``requests.Response``; callers inspect ``status_code``.
    Exceptions raised by ``requests.get`` propagate to the caller.
    """
    base_url = (
        f"https://firestore.googleapis.com/v1/projects/{project_id}"
        f"/databases/(default)/documents/{collection_name}"
    )
    params = {"key": api_key, "pageSize": page_size}
    return requests.get(base_url, params=params, timeout=_REQUEST_TIMEOUT_SECONDS)


def _string_value(fields, name):
    """Return the ``stringValue`` stored under field *name*, or ``None``."""
    value = fields.get(name)
    if isinstance(value, dict) and "stringValue" in value:
        return value["stringValue"]
    return None


def _describe_document(doc):
    """Print the field names, language and a content sample of one document."""
    if "fields" not in doc:
        return

    fields = doc["fields"]
    field_names = list(fields.keys())
    print(f" ๐Ÿ“Š Fields: {field_names}")

    language = _string_value(fields, "language")
    if language is not None:
        print(f" ๐ŸŒ Language: '{language}'")

    # Any field whose name hints at article text counts as a content field.
    content_fields = [
        f for f in field_names
        if any(term in f.lower() for term in ("content", "text", "article"))
    ]
    if content_fields:
        print(f" ๐Ÿ“ Content fields: {content_fields}")
        # Only the first content field is sampled, truncated to 100 chars.
        cf = content_fields[0]
        content = _string_value(fields, cf)
        if content is not None:
            content_sample = content[:100]
            print(f" ๐Ÿ“– {cf} sample: {content_sample}...")


def explore_firebase_collections():
    """Explore available Firebase collections and their structure.

    Returns:
        A list of collection names that answered with documents, plus
        ``"<name> (empty)"`` entries for collections that answered 200 with
        no documents. An empty list is returned if exploration fails.
    """
    print("๐Ÿ” Firebase Collections Explorer")
    print("=" * 60)

    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        loader = FirebaseNewsLoader()
        project_id = loader.project_id
        api_key = loader.config.api_key

        print(f"๐Ÿ“ก Project ID: {project_id}")

        # Try different collection names
        collection_candidates = [
            "articles",
            "english_articles",
            "Articles",
            "English_articles",
            "news_articles",
            "cve_articles",
            "documents",
        ]

        found_collections = []

        for collection_name in collection_candidates:
            print(f"\n๐Ÿ” Checking collection: '{collection_name}'")

            # A failure on one candidate must not stop the remaining probes.
            try:
                # Small sample: three documents are enough to see the schema.
                resp = _list_documents(project_id, api_key, collection_name, 3)

                if resp.status_code == 200:
                    docs = resp.json().get("documents", [])

                    if docs:
                        print(f" โœ… Found {len(docs)} documents")
                        found_collections.append(collection_name)
                        _describe_document(docs[0])
                    else:
                        # NOTE(review): Firestore appears to answer 200 with
                        # no documents for collection IDs that were never
                        # written, so "empty" may also mean "missing" -
                        # confirm against the REST API documentation.
                        print(" ๐Ÿ“ญ Collection exists but is empty")
                        found_collections.append(f"{collection_name}{_EMPTY_SUFFIX}")

                elif resp.status_code == 404:
                    print(" โŒ Collection does not exist")
                else:
                    print(f" โš ๏ธ Error {resp.status_code}: {resp.text[:100]}")

            except Exception as e:
                print(f" โŒ Error checking collection: {e}")

        print("\n๐Ÿ“‹ Summary:")
        print(f" Found collections: {found_collections}")

        # If main articles collection exists, explore language distribution
        if "articles" in found_collections:
            print("\n๐ŸŒ Analyzing language distribution in 'articles' collection...")
            explore_language_distribution(loader, "articles")

        return found_collections

    except Exception as e:
        print(f"โŒ Firebase exploration failed: {e}")
        traceback.print_exc()
        return []


def explore_language_distribution(loader, collection_name, sample_size=10):
    """Explore language distribution in a collection.

    Fetches up to *sample_size* documents and prints how many carry each
    ``language`` value, together with content-length statistics.

    Args:
        loader: Object exposing ``project_id`` and ``config.api_key``.
        collection_name: Firestore collection to sample.
        sample_size: Maximum number of documents to request.
    """
    try:
        resp = _list_documents(
            loader.project_id, loader.config.api_key, collection_name, sample_size
        )

        if resp.status_code != 200:
            # Previously a non-200 answer was dropped without any output.
            print(f" โš ๏ธ Error {resp.status_code}: {resp.text[:100]}")
            return

        docs = resp.json().get("documents", [])

        language_counts = Counter()
        content_lengths = []

        for doc in docs:
            if "fields" not in doc:
                continue
            fields = doc["fields"]

            # Check language
            lang = _string_value(fields, "language")
            language_counts["unknown" if lang is None else lang] += 1

            # Check content length; the first matching field wins.
            for cf in _CONTENT_FIELD_NAMES:
                content = _string_value(fields, cf)
                if content is not None:
                    content_lengths.append(len(content))
                    break

        # dict(...) keeps the printed form identical to a plain dict.
        print(f" Language distribution: {dict(language_counts)}")

        if content_lengths:
            avg_length = sum(content_lengths) / len(content_lengths)
            print(f" Average content length: {avg_length:.0f} characters")
            print(f" Content range: {min(content_lengths)} - {max(content_lengths)} characters")

    except Exception as e:
        print(f" โŒ Error analyzing language distribution: {e}")


def create_test_collection_strategy(found_collections):
    """Create a strategy for testing based on found collections.

    Args:
        found_collections: Names as returned by
            ``explore_firebase_collections`` (may include "(empty)" entries).

    Returns:
        The same *found_collections* list, unchanged.
    """
    print("\n๐Ÿ’ก Recommended Testing Strategy")
    print("=" * 60)

    # Placeholders for empty collections are excluded: recommending a
    # collection that holds no documents would be useless for testing.
    english_collections = [
        col for col in found_collections
        if "english" in col.lower() and not col.endswith(_EMPTY_SUFFIX)
    ]

    if "articles" in found_collections:
        print("โœ… Use 'articles' collection with language filtering")
        print(" - This appears to be the main collection")
        print(" - Filter by language='English' or similar")

        # Test language filtering
        print("\n๐Ÿงช Testing language filtering on 'articles' collection...")
        test_language_filtering()

    elif english_collections:
        print(f"โœ… Use English-specific collection: {english_collections[0]}")

    else:
        print("โš ๏ธ No obvious English collection found")
        print("๐Ÿ’ก Recommended approach:")
        print(" 1. Use the largest available collection")
        print(" 2. Apply content-based English detection")

    return found_collections


def test_language_filtering():
    """Test different language filter values.

    Tries each spelling in turn through ``FirebaseNewsLoader.fetch_articles``
    and stops at the first one that yields articles.
    """
    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        loader = FirebaseNewsLoader()

        # Test different language values
        language_variants = ["English", "english", "en", "EN", "eng"]

        for lang in language_variants:
            print(f" Testing language='{lang}'...")
            articles = loader.fetch_articles(limit=5, language=lang)
            print(f" Result: {len(articles)} articles")

            if articles:
                # Show sample
                sample = articles[0]
                print(f" Sample: {sample.title[:50]}...")
                break

    except Exception as e:
        print(f" โŒ Language filtering test failed: {e}")


def main():
    """Main exploration function.

    Returns:
        ``True`` if at least one collection entry was found, else ``False``.
    """
    print("๐Ÿ” CVE Fact Checker - Firebase Database Explorer")
    print("=" * 80)

    found_collections = explore_firebase_collections()

    if found_collections:
        create_test_collection_strategy(found_collections)
    else:
        print("โŒ No collections found. Check Firebase configuration.")

    return bool(found_collections)


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)