Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Firebase collection explorer to understand the actual database structure. | |
| """ | |
| import os | |
| import sys | |
| import requests | |
# Put the directory that contains this script at the front of the import
# search path, so modules located next to it win over installed copies.
# NOTE(review): presumably this is what makes the `cve_factchecker` package
# importable when the script is run directly - confirm against the repo layout.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)
def explore_firebase_collections():
    """Probe a fixed list of candidate Firestore collections and report on them.

    For every candidate name a small sample (up to 3 documents) is requested
    through the Firestore REST API, using the project id and API key taken
    from ``FirebaseNewsLoader``. For each collection that returns documents,
    the first document's field names, language and a content preview are
    printed. If a collection named ``"articles"`` is found, its language
    distribution is analysed as well.

    Returns:
        list[str]: Names of the candidate collections that returned at least
        one document, in probing order. An empty list is returned when the
        loader cannot be imported/constructed or any other unexpected error
        aborts the exploration (the error and traceback are printed).
    """
    # NOTE(review): the leading glyphs in the messages look like mis-decoded
    # emoji; they are runtime output and are deliberately left untouched.
    print("π Firebase Collections Explorer")
    print("=" * 60)
    try:
        # Imported lazily so that a missing or broken project package is
        # reported by the handler below instead of failing at import time.
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        loader = FirebaseNewsLoader()
        project_id = loader.project_id
        api_key = loader.config.api_key
        print(f"π‘ Project ID: {project_id}")

        # Collection names worth trying, in the order they are probed.
        collection_candidates = (
            "articles",
            "english_articles",
            "Articles",
            "English_articles",
            "news_articles",
            "cve_articles",
            "documents",
        )
        documents_url = (
            "https://firestore.googleapis.com/v1/projects/"
            f"{project_id}/databases/(default)/documents"
        )

        found_collections = []
        for collection_name in collection_candidates:
            print(f"\nπ Checking collection: '{collection_name}'")
            try:
                resp = requests.get(
                    f"{documents_url}/{collection_name}",
                    params={"key": api_key, "pageSize": 3},  # small sample
                    timeout=15,
                )
                if resp.status_code == 404:
                    print(" β Collection does not exist")
                    continue
                if resp.status_code != 200:
                    print(f" β οΈ Error {resp.status_code}: {resp.text[:100]}")
                    continue

                docs = resp.json().get("documents", [])
                if not docs:
                    # Firestore has no persistent empty collections: listing a
                    # name that was never written also answers 200 with no
                    # documents. Such a name must not count as "found",
                    # otherwise every candidate would be reported as existing.
                    print(" π No documents returned (collection is missing or empty)")
                    continue

                print(f" β Found {len(docs)} documents")
                # Record the hit before inspecting the sample so that a
                # malformed document cannot hide an existing collection.
                found_collections.append(collection_name)
                _describe_sample_document(docs[0])
            except Exception as e:
                # Best effort: one failing candidate must not stop the scan.
                print(f" β Error checking collection: {e}")

        print("\nπ Summary:")
        print(f" Found collections: {found_collections}")

        # If the main articles collection exists, explore its languages too.
        if "articles" in found_collections:
            print("\nπ Analyzing language distribution in 'articles' collection...")
            explore_language_distribution(loader, "articles")
        return found_collections
    except Exception as e:
        print(f"β Firebase exploration failed: {e}")
        import traceback

        traceback.print_exc()
        return []


def _describe_sample_document(document):
    """Print field names, language and a content preview of one Firestore document."""
    if "fields" not in document:
        return
    fields = document["fields"]
    field_names = list(fields)
    print(f" π Fields: {field_names}")

    # Firestore REST values are typed wrappers, e.g. {"stringValue": "..."}.
    language_value = fields.get("language", {})
    if "stringValue" in language_value:
        print(f" π Language: '{language_value['stringValue']}'")

    # Any field whose name hints at article text counts as a content field.
    content_terms = ("content", "text", "article")
    content_fields = [
        name
        for name in field_names
        if any(term in name.lower() for term in content_terms)
    ]
    if not content_fields:
        return
    print(f" π Content fields: {content_fields}")

    # Preview only the first content field, and only if it holds a string.
    first_name = content_fields[0]
    first_value = fields[first_name]
    if "stringValue" in first_value:
        content_sample = first_value["stringValue"][:100]
        print(f" π {first_name} sample: {content_sample}...")
def explore_language_distribution(loader, collection_name, sample_size=10):
    """Print language counts and content-length statistics for a document sample.

    Requests up to ``sample_size`` documents of ``collection_name`` through
    the Firestore REST API and prints how often each ``language`` value
    occurs (documents without a string ``language`` field count as
    ``"unknown"``), followed by the average, minimum and maximum length of
    the first recognised content field of each document.

    Args:
        loader: Object exposing ``project_id`` and ``config.api_key``.
        collection_name: Name of the collection to sample.
        sample_size: Maximum number of documents to request (``pageSize``).

    Returns:
        None. Results and errors are only printed; any exception raised while
        fetching or analysing is caught and reported.
    """
    from collections import Counter

    # Field names that may hold the article body, in order of preference.
    content_field_names = ("content", "Content", "article_text", "Article_text", "text")
    try:
        base_url = (
            "https://firestore.googleapis.com/v1/projects/"
            f"{loader.project_id}/databases/(default)/documents/{collection_name}"
        )
        params = {"key": loader.config.api_key, "pageSize": sample_size}
        resp = requests.get(base_url, params=params, timeout=15)
        if resp.status_code != 200:
            # Report the failure instead of silently printing nothing.
            print(f" β οΈ Error {resp.status_code}: {resp.text[:100]}")
            return

        docs = resp.json().get("documents", [])
        language_counts = Counter()
        content_lengths = []
        for doc in docs:
            if "fields" not in doc:
                continue
            fields = doc["fields"]

            # Firestore REST values are typed wrappers ({"stringValue": ...}).
            lang = fields.get("language", {}).get("stringValue", "unknown")
            language_counts[lang] += 1

            # Only the first matching content field of a document is measured.
            for name in content_field_names:
                value = fields.get(name, {})
                if "stringValue" in value:
                    content_lengths.append(len(value["stringValue"]))
                    break

        # dict() keeps the printed form a plain mapping rather than Counter(...).
        print(f" Language distribution: {dict(language_counts)}")
        if content_lengths:
            avg_length = sum(content_lengths) / len(content_lengths)
            print(f" Average content length: {avg_length:.0f} characters")
            print(f" Content range: {min(content_lengths)} - {max(content_lengths)} characters")
    except Exception as e:
        # Best-effort diagnostic: never let the analysis abort the caller.
        print(f" β Error analyzing language distribution: {e}")
def create_test_collection_strategy(found_collections):
    """Print a recommended testing strategy for the discovered collections.

    The recommendation is chosen in this order:

    1. ``"articles"`` is present: use it with language filtering and run
       ``test_language_filtering()`` right away.
    2. Some collection name contains ``"english"`` (case-insensitive): use
       the first such collection.
    3. Otherwise: suggest falling back to content-based English detection.

    Entries of the form ``"<name> (empty)"`` mark collections that returned
    no documents; they are ignored when choosing a recommendation, so an
    empty collection is never recommended.

    Args:
        found_collections: List of collection names (strings).

    Returns:
        The ``found_collections`` argument, unchanged.
    """
    print("\nπ‘ Recommended Testing Strategy")
    print("=" * 60)

    # Placeholders for empty collections carry no data to test against.
    usable = [name for name in found_collections if not name.endswith(" (empty)")]

    if "articles" in usable:
        print("β Use 'articles' collection with language filtering")
        print(" - This appears to be the main collection")
        print(" - Filter by language='English' or similar")
        # Test language filtering
        print("\nπ§ͺ Testing language filtering on 'articles' collection...")
        test_language_filtering()
        return found_collections

    english_collections = [name for name in usable if "english" in name.lower()]
    if english_collections:
        print(f"β Use English-specific collection: {english_collections[0]}")
    else:
        print("β οΈ No obvious English collection found")
        print("π‘ Recommended approach:")
        print(" 1. Use the largest available collection")
        print(" 2. Apply content-based English detection")
    return found_collections
def test_language_filtering():
    """Try several spellings of the English language filter against the loader.

    Each variant is passed to ``FirebaseNewsLoader.fetch_articles`` with a
    limit of 5. The number of returned articles is printed for every attempt;
    the first variant that yields any article also prints a title preview and
    ends the search. Any exception is caught and reported; nothing is returned.
    """
    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        news_loader = FirebaseNewsLoader()
        # Candidate spellings, tried in this order.
        for variant in ("English", "english", "en", "EN", "eng"):
            print(f" Testing language='{variant}'...")
            fetched = news_loader.fetch_articles(limit=5, language=variant)
            print(f" Result: {len(fetched)} articles")
            if not fetched:
                continue
            # First hit: show a sample title and stop trying other variants.
            print(f" Sample: {fetched[0].title[:50]}...")
            break
    except Exception as err:
        print(f" β Language filtering test failed: {err}")
def main():
    """Run the exploration and, if anything was found, print a test strategy.

    Returns:
        bool: True when ``explore_firebase_collections()`` returned a
        non-empty list, False otherwise.
    """
    print("π CVE Fact Checker - Firebase Database Explorer")
    print("=" * 80)

    collections = explore_firebase_collections()
    if not collections:
        print("β No collections found. Check Firebase configuration.")
        return False

    create_test_collection_strategy(collections)
    return True
if __name__ == "__main__":
    # Exit status 0 when main() reports success, 1 otherwise.
    sys.exit(0 if main() else 1)