CVE-FactChecker / explore_firebase.py
NLPGenius's picture
fix firebase issues
186fe46
raw
history blame
9.06 kB
#!/usr/bin/env python3
"""
Firebase collection explorer to understand the actual database structure.
"""
import os
import sys
import requests
# Put this script's own directory (not its parent) at the front of sys.path.
# NOTE(review): presumably this lets the function-local
# `from cve_factchecker.firebase_loader import ...` statements resolve when the
# script is launched from another working directory — confirm the package
# actually lives next to this file.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)
def explore_firebase_collections():
    """Probe a fixed list of candidate Firestore collections and describe each.

    For every candidate name a small page (3 documents) is requested through
    the Firestore REST endpoint.  For collections that return documents, the
    field names, the ``language`` string value and a 100-character sample of
    the first content-like field of the first document are printed.

    Returns:
        A list of collection names that answered with HTTP 200.  Collections
        that returned no documents are recorded as ``"<name> (empty)"``.
        An empty list is returned when the setup itself fails (for example
        when the loader cannot be imported or constructed).
    """
    print("πŸ” Firebase Collections Explorer")
    print("=" * 60)

    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        loader = FirebaseNewsLoader()
        project_id = loader.project_id
        api_key = loader.config.api_key
        print(f"πŸ“‘ Project ID: {project_id}")

        # Names worth trying; the real collection name is not known up front.
        candidate_names = (
            "articles",
            "english_articles",
            "Articles",
            "English_articles",
            "news_articles",
            "cve_articles",
            "documents",
        )
        found_collections = []

        for name in candidate_names:
            print(f"\nπŸ” Checking collection: '{name}'")
            # A failure on one candidate must not abort the remaining probes.
            try:
                url = f"https://firestore.googleapis.com/v1/projects/{project_id}/databases/(default)/documents/{name}"
                # pageSize=3: a small sample is enough to inspect the schema.
                response = requests.get(
                    url,
                    params={"key": api_key, "pageSize": 3},
                    timeout=15,
                )
                status = response.status_code

                if status == 404:
                    print(f" ❌ Collection does not exist")
                    continue
                if status != 200:
                    print(f" ⚠️ Error {status}: {response.text[:100]}")
                    continue

                documents = response.json().get("documents", [])
                if not documents:
                    print(f" πŸ“­ Collection exists but is empty")
                    found_collections.append(f"{name} (empty)")
                    continue

                print(f" βœ… Found {len(documents)} documents")
                found_collections.append(name)

                # Only the first document is inspected for structure.
                head = documents[0]
                if "fields" not in head:
                    continue
                fields = head["fields"]
                field_names = list(fields.keys())
                print(f" πŸ“Š Fields: {field_names}")

                if "language" in fields:
                    language_entry = fields["language"]
                    if "stringValue" in language_entry:
                        print(f" 🌐 Language: '{language_entry['stringValue']}'")

                # Field names that look like they carry the article body.
                content_fields = [
                    field_name
                    for field_name in field_names
                    if any(
                        term in field_name.lower()
                        for term in ['content', 'text', 'article']
                    )
                ]
                if not content_fields:
                    continue
                print(f" πŸ“ Content fields: {content_fields}")

                # Show a sample from the first content-like field only.
                first_content = content_fields[0]
                if first_content in fields and "stringValue" in fields[first_content]:
                    snippet = fields[first_content]["stringValue"][:100]
                    print(f" πŸ“– {first_content} sample: {snippet}...")
            except Exception as e:
                print(f" ❌ Error checking collection: {e}")

        print(f"\nπŸ“‹ Summary:")
        print(f" Found collections: {found_collections}")

        # A non-empty main collection gets an extra language breakdown.
        if "articles" in found_collections:
            print(f"\n🌐 Analyzing language distribution in 'articles' collection...")
            explore_language_distribution(loader, "articles")

        return found_collections
    except Exception as e:
        print(f"❌ Firebase exploration failed: {e}")
        import traceback
        traceback.print_exc()
        return []
def explore_language_distribution(loader, collection_name, sample_size=10):
    """Sample a collection and print its language and content-length statistics.

    Args:
        loader: Object exposing ``project_id`` and ``config.api_key``; both are
            used to build the Firestore REST request.
        collection_name: Name of the collection to sample.
        sample_size: Number of documents requested (sent as ``pageSize``).

    Returns:
        None.  Results are printed.  Any exception is caught and reported so
        that this best-effort analysis never aborts the caller.
    """
    try:
        base_url = f"https://firestore.googleapis.com/v1/projects/{loader.project_id}/databases/(default)/documents/{collection_name}"
        params = {
            "key": loader.config.api_key,
            "pageSize": sample_size,
        }
        resp = requests.get(base_url, params=params, timeout=15)

        # Previously a non-200 answer was dropped without any output, leaving
        # the user with a header and no explanation.  Report it instead.
        if resp.status_code != 200:
            print(
                f" ❌ Error analyzing language distribution: "
                f"HTTP {resp.status_code}: {resp.text[:100]}"
            )
            return

        docs = resp.json().get("documents", [])

        # Field names checked, in priority order, for the article body.
        content_field_candidates = ("content", "Content", "article_text", "Article_text", "text")
        language_counts = {}
        content_lengths = []

        for doc in docs:
            if "fields" not in doc:
                continue
            fields = doc["fields"]

            # Documents without a string ``language`` field count as "unknown".
            lang = "unknown"
            if "language" in fields and "stringValue" in fields["language"]:
                lang = fields["language"]["stringValue"]
            language_counts[lang] = language_counts.get(lang, 0) + 1

            # Only the first matching content field contributes a length.
            for cf in content_field_candidates:
                if cf in fields and "stringValue" in fields[cf]:
                    content_lengths.append(len(fields[cf]["stringValue"]))
                    break

        print(f" Language distribution: {language_counts}")
        if content_lengths:
            avg_length = sum(content_lengths) / len(content_lengths)
            print(f" Average content length: {avg_length:.0f} characters")
            print(f" Content range: {min(content_lengths)} - {max(content_lengths)} characters")
    except Exception as e:
        print(f" ❌ Error analyzing language distribution: {e}")
def create_test_collection_strategy(found_collections):
    """Print a recommended testing strategy for the discovered collections.

    Args:
        found_collections: List of collection-name strings.  Entries ending in
            ``" (empty)"`` denote collections that returned no documents.

    Returns:
        The same ``found_collections`` list, unchanged.
    """
    print(f"\nπŸ’‘ Recommended Testing Strategy")
    print("=" * 60)

    # Entries tagged " (empty)" hold no documents, so they must never be
    # recommended; without this filter "english_articles (empty)" would match
    # the "english" check below.
    empty_suffix = " (empty)"
    english_collections = [
        col
        for col in found_collections
        if "english" in col.lower() and not col.endswith(empty_suffix)
    ]

    if "articles" in found_collections:
        print("βœ… Use 'articles' collection with language filtering")
        print(" - This appears to be the main collection")
        print(" - Filter by language='English' or similar")
        # Probe which spelling of the language value the data actually uses.
        print(f"\nπŸ§ͺ Testing language filtering on 'articles' collection...")
        test_language_filtering()
    elif english_collections:
        print(f"βœ… Use English-specific collection: {english_collections[0]}")
    else:
        print("⚠️ No obvious English collection found")
        print("πŸ’‘ Recommended approach:")
        print(" 1. Use the largest available collection")
        print(" 2. Apply content-based English detection")
    return found_collections
def test_language_filtering():
    """Try several spellings of the English language filter against the loader.

    Each variant is passed to ``fetch_articles(limit=5, language=...)``; the
    number of returned articles is printed.  The probe stops at the first
    variant that yields at least one article, after printing the first 50
    characters of that article's title.  Any exception is caught and reported.
    """
    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        news_loader = FirebaseNewsLoader()

        # Candidate spellings of "English" that the stored data might use.
        for variant in ("English", "english", "en", "EN", "eng"):
            print(f" Testing language='{variant}'...")
            fetched = news_loader.fetch_articles(limit=5, language=variant)
            print(f" Result: {len(fetched)} articles")

            if not fetched:
                continue

            # First hit wins: show a sample and stop probing.
            first_article = fetched[0]
            print(f" Sample: {first_article.title[:50]}...")
            break
    except Exception as e:
        print(f" ❌ Language filtering test failed: {e}")
def main():
    """Run the collection exploration and, when anything was found, the strategy step.

    Returns:
        True when ``explore_firebase_collections`` returned a non-empty list,
        False otherwise.
    """
    print("πŸ” CVE Fact Checker - Firebase Database Explorer")
    print("=" * 80)

    discovered = explore_firebase_collections()
    if not discovered:
        print("❌ No collections found. Check Firebase configuration.")
        return False

    create_test_collection_strategy(discovered)
    return True
if __name__ == "__main__":
    # Exit status 0 when main() reports success, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)