File size: 34,399 Bytes
89fd50e
 
e06a21d
89fd50e
 
 
 
 
 
 
 
 
 
 
 
 
186fe46
 
 
 
89fd50e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e06a21d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c552a3
 
 
 
186fe46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e06a21d
 
 
 
 
 
 
 
 
48cec82
 
 
 
 
e06a21d
 
48cec82
e06a21d
 
 
48cec82
 
e06a21d
48cec82
e06a21d
48cec82
 
3c552a3
e06a21d
48cec82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c552a3
 
 
 
 
 
 
 
 
 
 
 
 
48cec82
 
 
 
 
 
e06a21d
48cec82
 
 
 
 
 
 
 
 
 
 
 
 
e06a21d
 
 
48cec82
 
e06a21d
48cec82
 
 
 
 
 
 
 
 
 
 
 
3c552a3
 
 
 
48cec82
 
3c552a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48cec82
 
3c552a3
 
e06a21d
48cec82
 
 
 
 
 
e06a21d
48cec82
 
 
 
 
 
 
e06a21d
 
48cec82
e06a21d
 
 
48cec82
 
 
e06a21d
 
48cec82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e06a21d
 
89fd50e
 
 
 
e06a21d
89fd50e
e06a21d
 
 
89fd50e
 
 
e06a21d
 
 
 
89fd50e
 
 
 
e06a21d
 
 
 
 
89fd50e
e06a21d
 
 
 
 
 
 
 
89fd50e
e06a21d
 
 
89fd50e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48cec82
89fd50e
 
 
 
48cec82
 
89fd50e
 
 
48cec82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89fd50e
 
48cec82
 
 
89fd50e
48cec82
89fd50e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
import os
import requests
import time
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from datetime import datetime
from .models import NewsArticle

@dataclass
class FirebaseConfig:
    """Credentials and collection names for a Firebase/Firestore project.

    Instances are consumed by FirebaseNewsLoader, which only uses
    `api_key` and `project_id` for REST requests; the remaining fields
    mirror the standard Firebase web-app config shape.
    """
    api_key: str
    auth_domain: str
    project_id: str
    storage_bucket: str
    messaging_sender_id: str
    app_id: str
    
    # Collection names
    ARTICLES_COLLECTION: str = "articles"
    ENGLISH_ARTICLES_COLLECTION: str = "Articles"  # Dedicated English articles collection (capital A)

# Module-level default configuration. Each field prefers an environment
# variable and falls back to a literal default.
# NOTE(review): the fallback literals embed what appear to be real project
# credentials in source control — consider removing the hard-coded defaults
# and requiring the environment variables instead.
FIREBASE_CONFIG = FirebaseConfig(
    api_key=os.environ.get("FIREBASE_API_KEY", "AIzaSyAX2ZBIB5lkBEEgXydi__Qlb0WBpUmntCk"),
    auth_domain=os.environ.get("FIREBASE_AUTH_DOMAIN", "cve-articles-b4f4f.firebaseapp.com"),
    project_id=os.environ.get("FIREBASE_PROJECT_ID", "cve-articles-b4f4f"),
    storage_bucket=os.environ.get("FIREBASE_STORAGE_BUCKET", "cve-articles-b4f4f.firebasestorage.app"),
    messaging_sender_id=os.environ.get("FIREBASE_MESSAGING_SENDER_ID", "682945772298"),
    app_id=os.environ.get("FIREBASE_APP_ID", "1:682945772298:web:b0d1dab0c7e07f83fad8f3")
)

class FirebaseNewsLoader:
    def __init__(self, config: Optional[FirebaseConfig] = None):
        """Bind the loader to *config*, defaulting to the module-level FIREBASE_CONFIG."""
        self.config = config or FIREBASE_CONFIG
        # Cache the two fields every REST request needs.
        self.project_id = self.config.project_id
        self.api_key = self.config.api_key

    def fetch_articles(self, limit: int = 5000, language: str = "English") -> List[NewsArticle]:
        """Fetch articles with optional limit, language filter, and rate limiting handling.

        Args:
            limit: Maximum number of articles to return; None or <= 0 means
                "no explicit limit" (interpreted by the underlying fetchers).
            language: When truthy, filter server-side by this language via a
                structured query; when falsy, fetch without filtering.

        Returns:
            A list of NewsArticle objects; empty on any unexpected failure
            (this loader is deliberately best-effort).
        """
        try:
            # Fix: honor the configured collection name instead of the
            # hard-coded literal "articles" (same default value, but a custom
            # FirebaseConfig was previously ignored here).
            collection_name = self.config.ARTICLES_COLLECTION

            # Use structured query to filter by language
            if language:
                return self._fetch_articles_with_filter(collection_name, limit, language)
            return self._fetch_articles_simple(collection_name, limit)

        except Exception as e:
            print(f"❌ Firebase error: {e}")
            return []

    def fetch_articles_by_language(self, language: str = "English", limit: int = 5000) -> List[NewsArticle]:
        """Convenience wrapper: delegate to fetch_articles with a language filter applied."""
        return self.fetch_articles(language=language, limit=limit)

    def fetch_english_articles(self, limit: int = 5000) -> List[NewsArticle]:
        """
        Fetch articles from the dedicated English articles collection.
        This mirrors the JavaScript fetchEnglishArticles function.

        Paginates through the collection via the Firestore REST `documents`
        endpoint, honoring `limit` (None means "fetch everything"). On a 404
        (collection missing) or any non-retryable error, falls back to
        fetch_articles() with a language filter. Returns [] only if the
        fallback itself fails.
        """
        try:
            collection_name = self.config.ENGLISH_ARTICLES_COLLECTION
            print(f"πŸ” Fetching English articles from '{collection_name}' collection...")
            
            # Use simple GET request to fetch from the English articles collection
            base_url = f"https://firestore.googleapis.com/v1/projects/{self.project_id}/databases/(default)/documents/{collection_name}"
            
            articles: List[NewsArticle] = []
            page_token: Optional[str] = None
            batch_size = min(300, limit or 300)  # Firestore max pageSize
            # Tracks how many documents we still want; None means unlimited.
            remaining = limit
            
            while True:
                if remaining is not None and remaining <= 0:
                    break
                
                # Never request more than we still need.
                page_size = batch_size if remaining is None else min(batch_size, remaining)
                params = {
                    "key": self.config.api_key,
                    "pageSize": page_size
                }
                
                if page_token:
                    params["pageToken"] = page_token
                
                print(f"πŸ“‘ Requesting {page_size} articles from English collection...")
                resp = requests.get(base_url, params=params, timeout=30)
                
                if resp.status_code == 429:  # Rate limit
                    # Honor the server's Retry-After if present, else wait 30s.
                    retry_after = int(resp.headers.get('Retry-After', 30))
                    print(f"⏳ Rate limited, waiting {retry_after}s...")
                    time.sleep(retry_after)
                    continue
                elif resp.status_code != 200:
                    print(f"❌ Failed to fetch English articles: {resp.status_code}")
                    if resp.status_code == 404:
                        print(f"πŸ’‘ Collection '{collection_name}' not found. Falling back to language filtering...")
                        return self.fetch_articles(limit=limit, language="English")
                    elif resp.status_code >= 500:
                        # NOTE(review): 5xx retries (and 429 waits above) are
                        # uncapped — a persistent outage loops here indefinitely.
                        print(f"πŸ”„ Server error {resp.status_code}, retrying...")
                        time.sleep(2)
                        continue
                    else:
                        print(f"πŸ”„ Falling back to language filtering due to error {resp.status_code}")
                        return self.fetch_articles(limit=limit, language="English")
                
                data = resp.json()
                docs = data.get("documents", [])
                
                if not docs:
                    print("πŸ“­ No more documents in English collection")
                    break
                
                # Convert documents to NewsArticle objects
                batch_articles = []
                for doc in docs:
                    article = self._convert_english_doc(doc)
                    if article:
                        batch_articles.append(article)
                
                articles.extend(batch_articles)
                print(f"βœ… Processed {len(batch_articles)} articles from batch")
                
                # Decrement by raw document count (including ones that failed
                # conversion) so pagination terminates even on bad documents.
                if remaining is not None:
                    remaining -= len(docs)
                
                # Check for next page
                page_token = data.get("nextPageToken")
                if not page_token:
                    break
                
                # Small delay to avoid rate limiting
                time.sleep(0.1)
            
            print(f"🎯 Successfully fetched {len(articles)} English articles")
            return articles
            
        except Exception as e:
            print(f"❌ Error fetching English articles: {e}")
            import traceback
            traceback.print_exc()
            
            # Fallback to the old method
            print("πŸ”„ Falling back to language filtering method...")
            return self.fetch_articles(limit=limit, language="English")

    def _convert_english_doc(self, doc: Dict[str, Any]) -> Optional[NewsArticle]:
        """
        Convert Firebase document from English articles collection to NewsArticle.
        Optimized for the specific structure of English articles.

        Args:
            doc: Raw Firestore document dict with "name" and "fields" keys.

        Returns:
            A NewsArticle, or None if conversion fails for any reason.
        """
        # Fix: bind doc_id before the try block so the except handler can
        # always reference it (previously a failure on the first lines raised
        # NameError in the handler, masking the real error).
        doc_id = "unknown"
        try:
            doc_name = doc.get("name", "")
            doc_id = doc_name.split("/")[-1] if doc_name else "unknown"
            fields = doc.get("fields", {})
            
            # Unwrap Firestore's typed value envelopes into plain Python values.
            data: Dict[str, Any] = {}
            for fname, fval in fields.items():
                if fval and isinstance(fval, dict):
                    if "stringValue" in fval:
                        data[fname] = fval["stringValue"]
                    elif "integerValue" in fval:
                        data[fname] = int(fval["integerValue"])
                    elif "doubleValue" in fval:
                        data[fname] = float(fval["doubleValue"])
                    elif "timestampValue" in fval:
                        data[fname] = fval["timestampValue"]
                    elif "booleanValue" in fval:
                        data[fname] = fval["booleanValue"]
                    else:
                        # Unknown envelope: take the first available value type as-is.
                        ftype = list(fval.keys())[0]
                        data[fname] = fval[ftype]
            
            # Enhanced field mapping for English articles collection.
            # Try multiple field name variations for content.
            content_candidates = [
                "content", "Content", "article_text", "Article_text", "articleText",
                "text", "Text", "body", "Body", "description", "Description",
                "summary", "Summary", "article_content", "articleContent", "full_text"
            ]
            
            content = ""
            content_field_used = None
            for candidate in content_candidates:
                if candidate in data and data[candidate]:
                    content = str(data[candidate]).strip()
                    content_field_used = candidate
                    break
            
            # Try multiple field name variations for title.
            title_candidates = [
                "title", "Title", "headline", "Headline", "subject", "Subject", 
                "name", "Name", "article_title", "articleTitle"
            ]
            
            title = "Untitled"
            for candidate in title_candidates:
                if candidate in data and data[candidate]:
                    title = str(data[candidate]).strip()
                    break
            
            # Try multiple field name variations for URL; only accept real
            # http(s) links, otherwise keep a synthetic firebase:// URL.
            url_candidates = [
                "url", "URL", "link", "Link", "href", "source_url", "sourceUrl", "web_url"
            ]
            
            url = f"firebase://english_articles/{doc_id}"
            for candidate in url_candidates:
                if candidate in data and data[candidate]:
                    url_value = str(data[candidate]).strip()
                    if url_value.startswith(('http://', 'https://')):
                        url = url_value
                        break
            
            # Source information
            source_candidates = ["source", "Source", "publisher", "Publisher", "site", "Site"]
            source = "English Articles Collection"
            for candidate in source_candidates:
                if candidate in data and data[candidate]:
                    source = str(data[candidate]).strip()
                    break
            
            # Date information (stored as whatever string the document carries;
            # falls back to "now" when absent).
            date_candidates = [
                "published_date", "publishedDate", "date", "Date", "created_at", "createdAt",
                "timestamp", "publish_time", "publication_date"
            ]
            
            published_date = datetime.now().isoformat()
            for candidate in date_candidates:
                if candidate in data and data[candidate]:
                    published_date = str(data[candidate])
                    break
            
            # Quality check - ensure we have substantial content
            if len(content) < 100:
                print(f"⚠️ English article {doc_id[:8]}... has minimal content:")
                print(f"   Content field '{content_field_used}': {len(content)} chars")
                print(f"   Available fields: {list(data.keys())}")
                
                # Try to combine multiple fields if content is insufficient
                combined_content = []
                if title and title != "Untitled":
                    combined_content.append(f"Title: {title}")
                
                for field_name, field_value in data.items():
                    # Fix: also exclude the field that was actually used for
                    # content (previously only the first three candidates were
                    # excluded, so e.g. a "text" field could be duplicated).
                    if (isinstance(field_value, str) and 
                        len(field_value) > 50 and 
                        field_name != content_field_used and
                        field_name not in content_candidates[:3]):
                        combined_content.append(f"{field_name}: {field_value}")
                
                if combined_content:
                    content = "\n\n".join(combined_content)
                    print(f"   πŸ“ Combined content from multiple fields: {len(content)} chars")
            
            article = NewsArticle(
                title=title,
                content=content,
                url=url,
                source=source,
                published_date=published_date,
                scraped_date=data.get("scraped_date", data.get("scrapedAt", datetime.now().isoformat())),
                article_id=doc_id,
            )
            
            # Add language marker (since these are from English collection)
            article.language = "english"  # Match the JavaScript implementation
            
            return article
            
        except Exception as e:
            print(f"⚠️ Error converting English article {doc_id}: {e}")
            return None

    def _fetch_articles_with_filter(self, collection_name: str, limit: int, language: str) -> List[NewsArticle]:
        """Fetch articles using Firestore structured query with language filter.

        Probes a small sample to discover the language field, then tries
        several case variants of *language* as an EQUAL filter. Falls back to
        content-based filtering when the structured query yields too few
        results, and to a simple unfiltered fetch on unexpected errors.
        """
        try:
            # Firestore structured query endpoint
            query_url = f"https://firestore.googleapis.com/v1/projects/{self.project_id}/databases/(default)/documents:runQuery"
            
            remaining = None if (limit is None or (isinstance(limit, int) and limit <= 0)) else int(limit)
            articles: List[NewsArticle] = []
            
            # Field path used in the structured query; may be replaced below
            # if the sample shows a differently-named language field.
            # Fix: this was previously assigned but never used — the query
            # always filtered on the literal "language".
            language_field = "language"
            
            # First, let's check what the data actually looks like
            print(f"πŸ” Analyzing Firebase data structure for language filtering...")
            
            # Get a small sample first to understand the data structure
            sample_query = {
                "structuredQuery": {
                    "from": [{"collectionId": collection_name}],
                    "limit": 3
                }
            }
            
            headers = {'Content-Type': 'application/json'}
            params = {"key": self.api_key}
            
            sample_resp = requests.post(query_url, json=sample_query, headers=headers, params=params, timeout=30)
            
            if sample_resp.status_code == 200:
                sample_data = sample_resp.json()
                print(f"πŸ“‹ Sample response contains {len(sample_data) if isinstance(sample_data, list) else 1} items")
                
                # Analyze the structure of the first document
                if isinstance(sample_data, list) and len(sample_data) > 0:
                    first_item = sample_data[0]
                    if "document" in first_item:
                        doc = first_item["document"]
                        if "fields" in doc:
                            fields = doc["fields"]
                            available_fields = list(fields.keys())
                            print(f"πŸ“Š Available fields: {available_fields}")
                            
                            # Check language field specifically
                            if "language" in fields:
                                lang_field = fields["language"]
                                print(f"πŸ”€ Language field structure: {lang_field}")
                                if "stringValue" in lang_field:
                                    print(f"πŸ”€ Language value: '{lang_field['stringValue']}'")
                            else:
                                print("⚠️ No 'language' field found! Looking for alternatives...")
                                # Check for alternative language field names
                                lang_candidates = [f for f in available_fields if 'lang' in f.lower()]
                                if lang_candidates:
                                    print(f"πŸ” Possible language fields: {lang_candidates}")
                                    # Use the first candidate
                                    alt_field = lang_candidates[0]
                                    print(f"πŸ”„ Using '{alt_field}' as language field")
                                    language_field = alt_field
                                else:
                                    print("❌ No language field found. Falling back to content analysis.")
                                    return self._fetch_with_content_filter(collection_name, limit, language)
                            
                            # Sample a few more documents to see language distribution
                            lang_values = set()
                            for item in sample_data:
                                if "document" in item and "fields" in item["document"]:
                                    doc_fields = item["document"]["fields"]
                                    if language_field in doc_fields and "stringValue" in doc_fields[language_field]:
                                        lang_values.add(doc_fields[language_field]["stringValue"])
                            
                            print(f"🌐 Language values found in sample: {list(lang_values)}")
                elif isinstance(sample_data, dict) and "documents" in sample_data:
                    # Different response format
                    documents = sample_data["documents"]
                    print(f"πŸ“‹ Found {len(documents)} documents in response")
                    if documents:
                        first_doc = documents[0]
                        if "fields" in first_doc:
                            fields = first_doc["fields"]
                            available_fields = list(fields.keys())
                            print(f"πŸ“Š Available fields: {available_fields}")
            else:
                print(f"❌ Sample query failed: {sample_resp.status_code}")
                # Continue anyway with best guess
            
            # Now try to query with language filter (several case variants,
            # since stored values may be "English", "english", "ENGLISH", ...)
            language_variants = [language, language.lower(), language.upper(), language.capitalize()]
            
            for lang_variant in language_variants:
                print(f"πŸ” Trying language filter: '{lang_variant}'")
                
                query_data = {
                    "structuredQuery": {
                        "from": [{"collectionId": collection_name}],
                        "where": {
                            "fieldFilter": {
                                # Fix: filter on the detected field, not a literal.
                                "field": {"fieldPath": language_field},
                                "op": "EQUAL",
                                "value": {"stringValue": lang_variant}
                            }
                        },
                        "limit": min(remaining or 1000, 1000)
                    }
                }
                
                resp = requests.post(query_url, json=query_data, headers=headers, params=params, timeout=30)
                
                if resp.status_code == 200:
                    data = resp.json()
                    
                    if isinstance(data, list):
                        filtered_count = len(data)
                        print(f"πŸ“ˆ Found {filtered_count} articles with language='{lang_variant}'")
                        
                        if filtered_count > 0:
                            # Process the results
                            for result in data:
                                if "document" in result:
                                    doc = result["document"]
                                    art = self._convert_doc(doc)
                                    if art:
                                        articles.append(art)
                                elif "fields" in result:  # Direct document format
                                    art = self._convert_doc(result)
                                    if art:
                                        articles.append(art)
                            
                            # If we got good results, continue with this variant
                            if len(articles) >= 5:  # Lower threshold
                                print(f"βœ… Using language variant '{lang_variant}' - found {len(articles)} articles")
                                break
                    elif isinstance(data, dict) and "documents" in data:
                        # Alternative response format
                        documents = data["documents"]
                        filtered_count = len(documents)
                        print(f"πŸ“ˆ Found {filtered_count} documents with language='{lang_variant}'")
                        
                        if filtered_count > 0:
                            for doc in documents:
                                art = self._convert_doc(doc)
                                if art:
                                    articles.append(art)
                            
                            if len(articles) >= 5:
                                print(f"βœ… Using language variant '{lang_variant}' - found {len(articles)} articles")
                                break
                else:
                    print(f"❌ Query failed for '{lang_variant}': {resp.status_code}")
                
                time.sleep(0.2)  # Small delay between attempts
            
            # If we still don't have enough articles, fall back to content filtering
            if len(articles) < 100:
                print(f"⚠️ Only found {len(articles)} articles with language filter. Trying content-based filtering...")
                fallback_articles = self._fetch_with_content_filter(collection_name, remaining or 1000, language)
                
                # Merge results, avoiding duplicates
                existing_ids = {art.article_id for art in articles}
                for art in fallback_articles:
                    if art.article_id not in existing_ids:
                        articles.append(art)
                        if remaining and len(articles) >= remaining:
                            break
            
            print(f"βœ… Fetched {len(articles)} {language} articles from Firebase")
            return articles[:remaining] if remaining else articles
            
        except Exception as e:
            print(f"❌ Error in filtered fetch: {e}")
            import traceback
            traceback.print_exc()
            # Fallback to simple fetch
            return self._fetch_articles_simple(collection_name, limit)

    def _fetch_with_content_filter(self, collection_name: str, limit: int, language: str) -> List[NewsArticle]:
        """Fetch articles and filter by content analysis (fallback method).

        Over-fetches (up to 3x *limit*, capped at 2000) and keeps only
        articles whose content passes the language heuristic.
        """
        print(f"πŸ”„ Fetching articles and filtering by content for {language}...")
        
        # Fix: guard against a None / non-positive limit (callers use such
        # values as "no limit" sentinels), which previously raised on
        # `limit * 3`.
        if not isinstance(limit, int) or limit <= 0:
            limit = 1000
        
        # Fetch more articles to filter from
        raw_articles = self._fetch_articles_simple(collection_name, min(2000, limit * 3))
        filtered_articles = []
        
        for article in raw_articles:
            if self._is_likely_language(article.content, language):
                filtered_articles.append(article)
                if len(filtered_articles) >= limit:
                    break
        
        print(f"πŸ“Š Content filtering: {len(filtered_articles)} {language} articles from {len(raw_articles)} total")
        return filtered_articles

    def _is_likely_language(self, text: str, target_language: str) -> bool:
        """Simple heuristic to check if text is likely in the target language."""
        if not text or len(text) < 50:
            return False
        
        if target_language.lower() in ["english", "en"]:
            return self._is_likely_english(text)
        
        # For other languages, we'll need different heuristics
        # For now, default to True
        return True

    def _is_likely_english(self, text: str) -> bool:
        """Simple heuristic to check if text is likely English."""
        if not text or len(text) < 50:
            return False
        
        # Common English words and patterns
        english_indicators = {
            'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with',
            'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her',
            'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up',
            'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time',
            'security', 'vulnerability', 'attack', 'system', 'software', 'data', 'network', 'computer',
            'application', 'server', 'database', 'information', 'technology', 'cyber', 'malware', 'breach'
        }
        
        # Convert to lowercase and split into words
        words = text.lower().replace(',', ' ').replace('.', ' ').split()[:100]  # Check first 100 words
        if len(words) < 10:
            return False
        
        # Count English indicators
        english_count = 0
        for word in words:
            # Remove punctuation for matching
            clean_word = ''.join(c for c in word if c.isalnum())
            if clean_word in english_indicators:
                english_count += 1
        
        ratio = english_count / len(words)
        return ratio > 0.15  # At least 15% English indicators

    def _fetch_articles_simple(self, collection_name: str, limit: int) -> List[NewsArticle]:
        """Original simple fetch method without filtering.

        Paginates through *collection_name* with a conservative page size and
        a hard cap on total requests to avoid tripping rate limits.

        Args:
            collection_name: Firestore collection to read.
            limit: Maximum documents to pull; None or <= 0 means unlimited
                (still bounded by the request cap).

        Returns:
            Converted NewsArticle objects; empty list on failure.
        """
        try:
            base_url = f"https://firestore.googleapis.com/v1/projects/{self.project_id}/databases/(default)/documents/{collection_name}"
            remaining = None if (limit is None or (isinstance(limit, int) and limit <= 0)) else int(limit)
            page_token: Optional[str] = None
            # Fix: was `min(100, 300)` — a dead expression that always
            # evaluated to 100. Small pages reduce rate-limit pressure.
            batch_size = 100
            articles: List[NewsArticle] = []
            request_count = 0
            max_requests = 50  # Limit total requests to avoid rate limiting
            
            while True:
                if remaining is not None and remaining <= 0:
                    break
                if request_count >= max_requests:
                    print(f"⏳ Reached max requests limit ({max_requests}), stopping to avoid rate limits")
                    break
                    
                page_size = batch_size if remaining is None else min(batch_size, remaining)
                params = {"key": self.api_key, "pageSize": page_size}
                if page_token:
                    params["pageToken"] = page_token
                
                # Add delay between requests to avoid rate limiting
                if request_count > 0:
                    time.sleep(0.2)  # 200ms delay between requests
                
                resp = requests.get(base_url, params=params, timeout=30)
                request_count += 1
                
                if resp.status_code == 429:  # Rate limit
                    # Fix: Retry-After may be an HTTP-date rather than an
                    # integer; fall back to 60s instead of raising ValueError.
                    try:
                        retry_after = int(resp.headers.get('Retry-After', 60))
                    except ValueError:
                        retry_after = 60
                    print(f"❌ Firebase API rate limited: waiting {retry_after}s")
                    time.sleep(retry_after)
                    continue
                elif resp.status_code != 200:
                    print(f"❌ Firebase API failed: {resp.status_code}")
                    if resp.status_code >= 500:  # Server error, might be temporary
                        time.sleep(5)
                        continue
                    break
                
                data = resp.json()
                docs = data.get("documents", [])
                if not docs:
                    break
                for d in docs:
                    art = self._convert_doc(d)
                    if art:
                        articles.append(art)
                # Decrement by raw document count so pagination terminates
                # even when some documents fail conversion.
                if remaining is not None:
                    remaining -= len(docs)
                page_token = data.get("nextPageToken")
                if not page_token:
                    break
            return articles
        except Exception as e:
            print(f"❌ Firebase error: {e}")
            return []

    def _convert_doc(self, doc: Dict[str, Any]) -> Optional[NewsArticle]:
        """Convert one Firestore REST document into a NewsArticle.

        Unwraps Firestore's typed value envelopes (stringValue, integerValue,
        ...) into plain values, then probes several common field-name variants
        for the article's content, title and URL, since documents in the
        collection do not follow a single schema.

        Args:
            doc: Raw Firestore document dict with "name" and "fields" keys.

        Returns:
            A NewsArticle, or None if conversion raised an error.
        """
        # Pre-bind doc_id so the except handler below can always reference it;
        # previously a failure before the first assignment caused a NameError
        # inside the handler that masked the original exception.
        doc_id = "unknown"
        try:
            doc_name = doc.get("name", "")
            if doc_name:
                # Firestore document names look like
                # projects/<p>/databases/<d>/documents/<collection>/<id>
                doc_id = doc_name.split("/")[-1]
            fields = doc.get("fields", {})

            # Flatten Firestore's typed wrappers, e.g.
            # {"Title": {"stringValue": "..."}} -> {"Title": "..."}.
            data: Dict[str, Any] = {}
            for fname, fval in fields.items():
                if fval and isinstance(fval, dict):
                    if "stringValue" in fval:
                        data[fname] = fval["stringValue"]
                    elif "integerValue" in fval:
                        data[fname] = fval["integerValue"]
                    elif "doubleValue" in fval:
                        data[fname] = fval["doubleValue"]
                    elif "timestampValue" in fval:
                        data[fname] = fval["timestampValue"]
                    elif "booleanValue" in fval:
                        data[fname] = fval["booleanValue"]
                    else:
                        # Unknown wrapper (arrayValue, mapValue, ...): keep the
                        # first value type's payload as-is.
                        ftype = next(iter(fval))
                        data[fname] = fval[ftype]

            # Probe common field-name variants, most specific first.
            content_candidates = [
                "Article_text", "article_text", "content", "Content",
                "text", "Text", "body", "Body", "description", "Description",
                "summary", "Summary", "article_content", "articleContent"
            ]

            content = ""
            content_field = None  # remembered only for the debug output below
            for candidate in content_candidates:
                if candidate in data and data[candidate]:
                    content = str(data[candidate]).strip()
                    content_field = candidate
                    break

            title_candidates = [
                "Title", "title", "headline", "Headline", "subject", "Subject", "name", "Name"
            ]

            title = "Untitled"
            for candidate in title_candidates:
                if candidate in data and data[candidate]:
                    title = str(data[candidate]).strip()
                    break

            url_candidates = [
                "URL", "url", "link", "Link", "href", "source_url", "sourceUrl"
            ]

            # Fall back to a synthetic URL so every article has a unique one.
            url = f"firebase://doc/{doc_id}"
            for candidate in url_candidates:
                if candidate in data and data[candidate]:
                    url = str(data[candidate]).strip()
                    break

            # Surface documents whose content is missing/tiny for schema debugging.
            if not content or len(content) < 50:
                available_fields = list(data.keys())
                print(f"⚠️ Article {doc_id[:8]}... has minimal content:")
                print(f"   Content field '{content_field}': {len(content)} chars")
                print(f"   Available fields: {available_fields}")
                print(f"   Sample data: {str(data)[:200]}...")

            article = NewsArticle(
                title=title,
                content=content,
                url=url,
                source=data.get("source", data.get("Source", "Firebase")),
                published_date=data.get("Date", data.get("date", data.get("published_date", data.get("createdAt", datetime.now().isoformat())))),
                scraped_date=data.get("scrapedAt", data.get("scraped_date", data.get("createdAt", datetime.now().isoformat()))),
                article_id=doc_id,
            )

            return article

        except Exception as e:
            # Best-effort conversion: report and skip this document.
            print(f"⚠️ Document conversion error for {doc_id}: {e}")
            return None

    def load_news_articles(self, collection_name: str = "Articles", limit: int = 100) -> List[NewsArticle]:
        """Backwards-compatible alias: delegates straight to fetch_articles."""
        articles = self.fetch_articles(collection_name, limit)
        return articles

    def analyze_schema(self, collection_name: str = "Articles") -> Dict[str, Any]:
        """Sample up to 5 documents from a collection and summarize its schema.

        Returns a dict with the union of field names, a per-document field
        listing with truncated sample values, or an "error" key on failure.
        """
        try:
            endpoint = (
                f"https://firestore.googleapis.com/v1/projects/{self.project_id}"
                f"/databases/(default)/documents/{collection_name}"
            )
            resp = requests.get(
                endpoint, params={"key": self.api_key, "pageSize": 5}, timeout=30
            )
            if resp.status_code != 200:
                return {"error": f"status {resp.status_code}", "collection": collection_name}

            docs = resp.json().get("documents", [])
            if not docs:
                return {"error": "empty", "collection": collection_name}

            field_union: set = set()
            samples: List[Dict[str, Any]] = []
            for document in docs:
                doc_fields = document.get("fields", {})
                names = list(doc_fields.keys())
                field_union.update(names)

                # Preview each field's first typed value, truncated to 100 chars.
                preview: Dict[str, Any] = {}
                for key, wrapper in doc_fields.items():
                    if wrapper and isinstance(wrapper, dict):
                        value_type = next(iter(wrapper))
                        preview[key] = str(wrapper[value_type])[:100]

                samples.append({
                    "id": document.get("name", "").split("/")[-1],
                    "fields": names,
                    "sample": preview,
                })

            return {
                "collection": collection_name,
                "document_count": len(docs),
                "unique_fields": sorted(field_union),
                "field_count": len(field_union),
                "sample_documents": samples,
            }
        except Exception as e:
            return {"error": str(e), "collection": collection_name}

    def get_collections_info(self) -> List[Dict[str, Any]]:
        """Probe known collection names and report which ones hold documents.

        Fetches a small sample (5 docs) from each candidate collection; any
        non-empty result is reported with a lower-bound document count and up
        to three sample titles. If nothing is found, a single placeholder
        entry with count 0 is returned so callers always get a non-empty list.
        """
        possible = ["Articles", "articles"]
        results: List[Dict[str, Any]] = []
        seen = set()
        for name in possible:
            if name in seen:
                continue
            arts = self.fetch_articles(name, limit=5)
            if arts:
                results.append({
                    "name": name,
                    # Fixed mojibake: the literal was "β‰₯" (UTF-8 "≥"
                    # mis-decoded as Latin-1); it marks a lower bound since
                    # only 5 docs were sampled.
                    "document_count": "≥" + str(len(arts)),
                    "sample_titles": [a.title for a in arts[:3]],
                })
                seen.add(name)
        if not results:
            results.append({"name": "Articles", "document_count": 0})
        return results