Spaces:

NLPGenius
/

CVE-FactChecker

Running

App Files Files Community

CVE-FactChecker / explore_firebase.py

NLPGenius

fix firebase issues

186fe46 3 months ago

raw

history blame

9.06 kB

	#!/usr/bin/env python3
	"""
	Firebase collection explorer to understand the actual database structure.
	"""

	import os
	import sys
	import requests

	# Put the directory that contains this script at the front of sys.path
	# (not its parent). Presumably this lets the `cve_factchecker` package
	# imported further down resolve when the script is run from another
	# working directory -- TODO confirm the package lives next to this file.
	current_dir = os.path.dirname(os.path.abspath(__file__))
	sys.path.insert(0, current_dir)

	def _print_document_overview(document):
	    """Print field names, language and a content preview for one document.

	    ``document`` is a single entry of the ``documents`` list found in the
	    JSON body of the collection listing request. Documents without a
	    ``fields`` key are skipped silently.
	    """
	    if "fields" not in document:
	        return

	    fields = document["fields"]
	    field_names = list(fields)
	    print(f" 📊 Fields: {field_names}")

	    # Only string-typed values are shown; other value types are ignored.
	    language = fields.get("language", {})
	    if "stringValue" in language:
	        print(f" 🌐 Language: '{language['stringValue']}'")

	    # Any field whose name hints at article text counts as a content field.
	    content_fields = [
	        name
	        for name in field_names
	        if any(term in name.lower() for term in ("content", "text", "article"))
	    ]
	    if not content_fields:
	        return
	    print(f" 📝 Content fields: {content_fields}")

	    # Preview the first content field only, truncated to 100 characters.
	    first_field = content_fields[0]
	    if "stringValue" in fields[first_field]:
	        content_sample = fields[first_field]["stringValue"][:100]
	        print(f" 📖 {first_field} sample: {content_sample}...")


	def explore_firebase_collections():
	    """Probe a fixed list of candidate collection names and report on them.

	    For every candidate a small page (3 documents) is requested from the
	    Firestore REST endpoint using the project id and API key of a
	    ``FirebaseNewsLoader``. The first returned document of each hit is
	    summarised on stdout. If the ``articles`` collection returned
	    documents, its language distribution is analysed as well.

	    Returns:
	        list[str]: names of the candidates that returned at least one
	        document. An empty list is returned when nothing was found or when
	        the exploration failed as a whole (the error and a traceback are
	        printed in that case).
	    """
	    print("🔍 Firebase Collections Explorer")
	    print("=" * 60)

	    try:
	        from cve_factchecker.firebase_loader import FirebaseNewsLoader

	        loader = FirebaseNewsLoader()
	        project_id = loader.project_id
	        api_key = loader.config.api_key

	        print(f"📡 Project ID: {project_id}")

	        # Try different collection names
	        collection_candidates = [
	            "articles",
	            "english_articles",
	            "Articles",
	            "English_articles",
	            "news_articles",
	            "cve_articles",
	            "documents",
	        ]

	        found_collections = []

	        for collection_name in collection_candidates:
	            print(f"\n🔍 Checking collection: '{collection_name}'")

	            # A failure on one candidate must not abort the remaining probes.
	            try:
	                base_url = f"https://firestore.googleapis.com/v1/projects/{project_id}/databases/(default)/documents/{collection_name}"
	                params = {
	                    "key": api_key,
	                    "pageSize": 3,  # Small sample
	                }

	                resp = requests.get(base_url, params=params, timeout=15)

	                if resp.status_code == 200:
	                    docs = resp.json().get("documents", [])
	                    if docs:
	                        print(f" ✅ Found {len(docs)} documents")
	                        found_collections.append(collection_name)
	                        _print_document_overview(docs[0])
	                    else:
	                        # Firestore creates collections implicitly with their
	                        # first document, so a 200 response without documents
	                        # does not prove the collection exists. Do not count
	                        # it as found, otherwise the caller would report
	                        # success without any usable data.
	                        print(" 📭 No documents returned (collection is empty or does not exist)")
	                elif resp.status_code == 404:
	                    print(" ❌ Collection does not exist")
	                else:
	                    print(f" ⚠️ Error {resp.status_code}: {resp.text[:100]}")

	            except Exception as e:
	                print(f" ❌ Error checking collection: {e}")

	        print("\n📋 Summary:")
	        print(f" Found collections: {found_collections}")

	        # If main articles collection exists, explore language distribution
	        if "articles" in found_collections:
	            print("\n🌐 Analyzing language distribution in 'articles' collection...")
	            explore_language_distribution(loader, "articles")

	        return found_collections

	    except Exception as e:
	        # Top-level boundary of a diagnostic script: report and return the
	        # "nothing found" value instead of propagating.
	        print(f"❌ Firebase exploration failed: {e}")
	        import traceback
	        traceback.print_exc()
	        return []

	def explore_language_distribution(loader, collection_name, sample_size=10):
	    """Print language counts and content-length statistics for a sample.

	    Requests up to ``sample_size`` documents of ``collection_name`` from the
	    Firestore REST endpoint and prints how often each ``language`` string
	    value occurs, plus average / min / max length of the first string-valued
	    content field found in each document.

	    Args:
	        loader: object exposing ``project_id`` and ``config.api_key``.
	        collection_name: name of the collection to sample.
	        sample_size: page size sent with the request.

	    Returns:
	        None. All results and errors are printed; no exception propagates.
	    """
	    # Field names probed, in this order, for the article body. The first one
	    # holding a string value is the one that is measured.
	    content_field_names = ("content", "Content", "article_text", "Article_text", "text")

	    try:
	        base_url = f"https://firestore.googleapis.com/v1/projects/{loader.project_id}/databases/(default)/documents/{collection_name}"
	        params = {
	            "key": loader.config.api_key,
	            "pageSize": sample_size,
	        }

	        resp = requests.get(base_url, params=params, timeout=15)

	        if resp.status_code != 200:
	            # Report the failure instead of ending silently, so that a
	            # missing analysis can be told apart from a rejected request.
	            print(f" ⚠️ Error {resp.status_code}: {resp.text[:100]}")
	            return

	        docs = resp.json().get("documents", [])

	        language_counts = {}
	        content_lengths = []

	        for doc in docs:
	            if "fields" not in doc:
	                continue
	            fields = doc["fields"]

	            # Documents without a string-valued language are counted as "unknown".
	            lang = "unknown"
	            if "language" in fields and "stringValue" in fields["language"]:
	                lang = fields["language"]["stringValue"]
	            language_counts[lang] = language_counts.get(lang, 0) + 1

	            # Measure only the first matching content field per document.
	            for cf in content_field_names:
	                if cf in fields and "stringValue" in fields[cf]:
	                    content_lengths.append(len(fields[cf]["stringValue"]))
	                    break

	        print(f" Language distribution: {language_counts}")
	        if content_lengths:
	            avg_length = sum(content_lengths) / len(content_lengths)
	            print(f" Average content length: {avg_length:.0f} characters")
	            print(f" Content range: {min(content_lengths)} - {max(content_lengths)} characters")

	    except Exception as e:
	        print(f" ❌ Error analyzing language distribution: {e}")

	def create_test_collection_strategy(found_collections):
	    """Print a recommended testing strategy for the given collections.

	    Preference order: the ``articles`` collection (followed by a language
	    filtering test), then the first collection whose name contains
	    "english", otherwise a generic fallback recommendation. Entries ending
	    in " (empty)" denote collections without documents and are never
	    recommended.

	    Args:
	        found_collections: list of collection name strings.

	    Returns:
	        The ``found_collections`` argument, unchanged.
	    """
	    print("\n💡 Recommended Testing Strategy")
	    print("=" * 60)

	    # A collection flagged as empty holds nothing to test against, so it
	    # must not win the "english" match below.
	    usable = [col for col in found_collections if not col.endswith(" (empty)")]
	    english_collections = [col for col in usable if "english" in col.lower()]

	    if "articles" in usable:
	        print("✅ Use 'articles' collection with language filtering")
	        print(" - This appears to be the main collection")
	        print(" - Filter by language='English' or similar")

	        # Test language filtering
	        print("\n🧪 Testing language filtering on 'articles' collection...")
	        test_language_filtering()

	    elif english_collections:
	        print(f"✅ Use English-specific collection: {english_collections[0]}")

	    else:
	        print("⚠️ No obvious English collection found")
	        print("💡 Recommended approach:")
	        print(" 1. Use the largest available collection")
	        print(" 2. Apply content-based English detection")

	    return found_collections

	def test_language_filtering():
	    """Try several spellings of the English language filter.

	    Each spelling is passed to ``FirebaseNewsLoader.fetch_articles`` with a
	    limit of 5. The number of returned articles is printed, and the probe
	    stops at the first spelling that yields at least one article, after
	    showing the first 50 characters of that article's title. Any failure
	    is printed rather than raised.
	    """
	    try:
	        from cve_factchecker.firebase_loader import FirebaseNewsLoader

	        news_loader = FirebaseNewsLoader()

	        # Candidate spellings, tried in this order.
	        for candidate in ("English", "english", "en", "EN", "eng"):
	            print(f" Testing language='{candidate}'...")
	            fetched = news_loader.fetch_articles(limit=5, language=candidate)
	            print(f" Result: {len(fetched)} articles")

	            if not fetched:
	                continue

	            # First spelling with results: show a sample and stop probing.
	            first_article = fetched[0]
	            print(f" Sample: {first_article.title[:50]}...")
	            break

	    except Exception as err:
	        print(f" ❌ Language filtering test failed: {err}")

	def main():
	    """Run the collection exploration and, on success, the strategy report.

	    Returns:
	        bool: True when the exploration returned at least one collection,
	        False otherwise.
	    """
	    print("🔍 CVE Fact Checker - Firebase Database Explorer")
	    print("=" * 80)

	    collections = explore_firebase_collections()

	    # Nothing to build a strategy on: tell the user and signal failure.
	    if not collections:
	        print("❌ No collections found. Check Firebase configuration.")
	        return False

	    create_test_collection_strategy(collections)
	    return True

	if __name__ == "__main__":
	    # Exit status 0 when main() reports success, 1 otherwise.
	    raise SystemExit(0 if main() else 1)