#!/usr/bin/env python3
"""
Analyze the current chunking strategy by examining actual chunks created from English articles.
"""
import os
import sys

# Put this script's directory on sys.path so the local cve_factchecker
# package can be imported
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

def analyze_chunking_strategy():
    """Analyze how articles are being chunked."""
    print("🔍 Chunking Strategy Analysis")
    print("=" * 60)

    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader
        from cve_factchecker.retriever import VectorNewsRetriever
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        # 1. Fetch a few English articles
        loader = FirebaseNewsLoader()
        print("📰 Fetching sample English articles...")
        articles = loader.fetch_english_articles(limit=3)

        if not articles:
            print("❌ No articles found")
            return False

        print(f"✅ Got {len(articles)} articles for analysis")

        # 2. Show article content before chunking
        print("\n📄 Article Content Analysis:")
        for i, article in enumerate(articles, 1):
            print(f"\n  Article {i}: {article.title[:80]}...")
            print(f"  Content Length: {len(article.content)} characters")
            print(f"  URL: {article.url}")
            print(f"  Content Preview: {article.content[:200]}...")

        # 3. Demonstrate the chunking process
        print("\n✂️ Chunking Process Analysis:")
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
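        # Note: RecursiveCharacterTextSplitter tries its default separators
        # ("\n\n", "\n", " ", "") in order, so splits prefer paragraph and
        # line boundaries before falling back to hard character cuts;
        # chunk_size and chunk_overlap are measured in characters, not tokens.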
        for i, article in enumerate(articles, 1):
            print(f"\n--- Article {i} Chunking ---")
            print(f"Title: {article.title}")
            print(f"Original Length: {len(article.content)} chars")

            # Create chunks
            chunks = splitter.split_text(article.content)
            print(f"Number of Chunks: {len(chunks)}")

            # Analyze each chunk
            for j, chunk in enumerate(chunks):
                print(f"\n  Chunk {j+1}:")
                print(f"    Length: {len(chunk)} characters")
                print(f"    Content: {chunk[:150]}...")

                if j < len(chunks) - 1:
                    # Check overlap with the next chunk
                    next_chunk = chunks[j+1]
                    overlap = find_overlap(chunk, next_chunk)
                    print(f"    Overlap with next: {len(overlap)} chars")
                    if overlap:
                        print(f"    Overlap text: '{overlap[:50]}...'")
        # 4. Test the complete vector storage process
        print("\n🗄️ Vector Storage Process:")
        # The retriever is only instantiated here; the document creation
        # below is simulated rather than calling into it
        retriever = VectorNewsRetriever()

        # Process one article to see the complete document creation
        test_article = articles[0]
        print(f"\nProcessing: {test_article.title[:50]}...")

        # Simulate the document creation process
        chunks = splitter.split_text(test_article.content)
        documents = []
        for i, chunk in enumerate(chunks):
            # Skip fragments too short to carry retrievable meaning
            if len(chunk.strip()) < 30:
                continue

            # Prepend the title (and append the source) so each chunk
            # embeds together with its article context
            page_content = f"Title: {test_article.title}\n\n{chunk}"
            if test_article.source and test_article.source not in chunk:
                page_content += f"\n\nSource: {test_article.source}"

            metadata = {
                "url": test_article.url,
                "source": test_article.source,
                "published_date": test_article.published_date,
                "scraped_date": test_article.scraped_date,
                "id": test_article.article_id,
                "chunk_id": f"{test_article.article_id}_{i}",
                "title": test_article.title
            }

            documents.append({
                "page_content": page_content,
                "metadata": metadata
            })
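        # The dicts above mirror the page_content + metadata shape of
        # LangChain Document objects without constructing the class, which
        # is enough to inspect exactly what would be embedded.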
| print(f"Created {len(documents)} document objects") | |
| # Show sample document structure | |
| if documents: | |
| print(f"\nπ Sample Document Structure:") | |
| sample_doc = documents[0] | |
| print(f"Page Content Length: {len(sample_doc['page_content'])} chars") | |
| print(f"Page Content Preview:") | |
| print(f" {sample_doc['page_content'][:300]}...") | |
| print(f"\nMetadata:") | |
| for key, value in sample_doc['metadata'].items(): | |
| print(f" {key}: {value}") | |
| return True | |
| except Exception as e: | |
| print(f"β Analysis failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return False | |
def find_overlap(text1, text2, max_overlap=200):
    """Find overlapping text between two chunks."""
    # Look for the longest suffix of text1 that is also a prefix of text2.
    # max_overlap defaults to 200 to match the production chunk_overlap,
    # but callers testing other settings can raise it.
    limit = min(max_overlap, len(text1), len(text2))
    for i in range(limit, 0, -1):
        if text1[-i:] == text2[:i]:
            return text1[-i:]
    return ""
def test_chunking_parameters():
    """Test different chunking parameters to understand the strategy."""
    print("\n🧪 Chunking Parameters Test")
    print("=" * 60)

    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        # Get a test article
        loader = FirebaseNewsLoader()
        articles = loader.fetch_english_articles(limit=1)
        if not articles:
            print("❌ No test article available")
            return False

        test_content = articles[0].content
        print(f"Test Content Length: {len(test_content)} characters")

        # Test different chunk sizes
        test_configs = [
            {"chunk_size": 500, "chunk_overlap": 100},
            {"chunk_size": 1000, "chunk_overlap": 200},  # Current setting
            {"chunk_size": 1500, "chunk_overlap": 300},
            {"chunk_size": 2000, "chunk_overlap": 400},
        ]
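        # Tradeoff being probed: larger chunks keep more context in each
        # embedding but dilute topical focus; smaller chunks retrieve more
        # precisely but can strand a fact across a boundary. Every config
        # keeps overlap at 20% of chunk_size, which is what guards against
        # that stranding.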
        for config in test_configs:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=config["chunk_size"],
                chunk_overlap=config["chunk_overlap"]
            )
            chunks = splitter.split_text(test_content)

            print(f"\n📊 Config: chunk_size={config['chunk_size']}, overlap={config['chunk_overlap']}")
            print(f"  Chunks created: {len(chunks)}")

            if chunks:
                chunk_lengths = [len(chunk) for chunk in chunks]
                avg_length = sum(chunk_lengths) / len(chunk_lengths)
                print(f"  Average chunk length: {avg_length:.0f} chars")
                print(f"  Chunk length range: {min(chunk_lengths)} - {max(chunk_lengths)} chars")

                # Show the first chunk
                print(f"  First chunk preview: {chunks[0][:100]}...")

                # Pass this config's overlap so the measurement isn't
                # capped at find_overlap's 200-char default
                if len(chunks) > 1:
                    overlap = find_overlap(chunks[0], chunks[1], max_overlap=config["chunk_overlap"])
                    print(f"  Actual overlap: {len(overlap)} chars")

        return True

    except Exception as e:
        print(f"❌ Parameter test failed: {e}")
        return False

def analyze_current_vector_db():
    """Analyze what's currently in the vector database."""
    print("\n🗄️ Current Vector Database Analysis")
    print("=" * 60)

    try:
        from cve_factchecker.retriever import VectorNewsRetriever

        retriever = VectorNewsRetriever()

        # Try a few different search queries to see what the stored chunks look like
        test_queries = [
            "security vulnerability",
            "cyberattack",
            "data breach",
            "malware",
            "terrorism"
        ]

        for query in test_queries:
            print(f"\n🔍 Search: '{query}'")
            results = retriever.semantic_search(query, k=2)

            if results:
                for i, result in enumerate(results, 1):
                    print(f"\n  Result {i}:")
                    print(f"    Title: {result['title'][:60]}...")
                    print(f"    Content Length: {len(result['content'])} chars")
                    print(f"    Content Preview: {result['content'][:200]}...")
                    print(f"    URL: {result['url']}")
                    print(f"    Source: {result['source']}")

                    # Check chunk metadata
                    metadata = result.get('metadata', {})
                    if 'chunk_id' in metadata:
                        print(f"    Chunk ID: {metadata['chunk_id']}")
                break  # Stop after the first successful query
            else:
                print("  No results found")

        return True

    except Exception as e:
        print(f"❌ Vector DB analysis failed: {e}")
        return False

def main():
    """Main analysis function."""
    print("🚀 CVE Fact Checker - Chunking Strategy Analysis")
    print("=" * 80)

    # Run all analyses
    success1 = analyze_chunking_strategy()
    success2 = test_chunking_parameters() if success1 else False
    success3 = analyze_current_vector_db() if success1 else False

    print("\n📊 Analysis Summary:")
    print(f"  Chunking Process: {'✅ Analyzed' if success1 else '❌ Failed'}")
    print(f"  Parameter Testing: {'✅ Completed' if success2 else '❌ Failed'}")
    print(f"  Vector DB Content: {'✅ Analyzed' if success3 else '❌ Failed'}")

    if success1:
        print("\n💡 Current Chunking Strategy:")
        print("  📏 Chunk Size: 1000 characters")
        print("  🔄 Overlap: 200 characters")
        print("  ✂️ Splitter: RecursiveCharacterTextSplitter")
        print("  📄 Format: Title + Content + Source")
        print("  🏷️ Metadata: URL, source, dates, chunk_id")

    return success1


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)