#!/usr/bin/env python3
"""
Analyze the current chunking strategy by examining actual chunks created
from English articles.
"""

import os
import sys

# Add this script's directory to the Python path so local packages resolve.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)


def analyze_chunking_strategy():
    """Analyze how articles are being chunked."""
    print("🔍 Chunking Strategy Analysis")
    print("=" * 60)

    try:
        from cve_factchecker.firebase_loader import FirebaseNewsLoader
        from cve_factchecker.retriever import VectorNewsRetriever
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        # 1. Fetch a few English articles
        loader = FirebaseNewsLoader()
        print("📊 Fetching sample English articles...")
        articles = loader.fetch_english_articles(limit=3)

        if not articles:
            print("❌ No articles found")
            return False

        print(f"✅ Got {len(articles)} articles for analysis")

        # 2. Show article content before chunking
        print("\n📄 Article Content Analysis:")
        for i, article in enumerate(articles, 1):
            print(f"\n  Article {i}: {article.title[:80]}...")
            print(f"  Content Length: {len(article.content)} characters")
            print(f"  URL: {article.url}")
            print(f"  Content Preview: {article.content[:200]}...")

        # 3. Demonstrate the chunking process
        print("\n🔪 Chunking Process Analysis:")
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        for i, article in enumerate(articles, 1):
            print(f"\n--- Article {i} Chunking ---")
            print(f"Title: {article.title}")
            print(f"Original Length: {len(article.content)} chars")

            # Create chunks
            chunks = splitter.split_text(article.content)
            print(f"Number of Chunks: {len(chunks)}")

            # Analyze each chunk
            for j, chunk in enumerate(chunks):
                print(f"\n  Chunk {j + 1}:")
                print(f"  Length: {len(chunk)} characters")
                print(f"  Content: {chunk[:150]}...")

                if j < len(chunks) - 1:
                    # Check overlap with the next chunk
                    next_chunk = chunks[j + 1]
                    overlap = find_overlap(chunk, next_chunk)
                    print(f"  Overlap with next: {len(overlap)} chars")
                    if overlap:
                        print(f"  Overlap text: '{overlap[:50]}...'")
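
        # Rough storage math as a sanity check: with chunk_overlap=200, adjacent
        # chunks can share up to 200 characters, so the chunked total may exceed
        # the original length by up to 200 * (len(chunks) - 1) characters. This
        # is an illustrative check on the last article from the loop above; it
        # can undershoot because the splitter strips whitespace at boundaries.
        total_chunked = sum(len(c) for c in chunks)
        max_duplication = 200 * max(len(chunks) - 1, 0)
        print(f"\n  Chunked total vs. original (last article): "
              f"{total_chunked} vs. {len(article.content)} chars "
              f"(duplication bound: ~{max_duplication})")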

        # 4. Test the complete vector storage process
        print("\n🗄️ Vector Storage Process:")
        # Instantiate the retriever (not used below; kept to confirm it initializes)
        retriever = VectorNewsRetriever()

        # Process one article to see the complete document creation
        test_article = articles[0]
        print(f"\nProcessing: {test_article.title[:50]}...")

        # Simulate the document creation process
        chunks = splitter.split_text(test_article.content)
        documents = []

        for i, chunk in enumerate(chunks):
            # Skip near-empty chunks
            if len(chunk.strip()) < 30:
                continue

            # Show how page_content is constructed
            page_content = f"Title: {test_article.title}\n\n{chunk}"
            if test_article.source and test_article.source not in chunk:
                page_content += f"\n\nSource: {test_article.source}"

            metadata = {
                "url": test_article.url,
                "source": test_article.source,
                "published_date": test_article.published_date,
                "scraped_date": test_article.scraped_date,
                "id": test_article.article_id,
                "chunk_id": f"{test_article.article_id}_{i}",
                "title": test_article.title,
            }

            documents.append({
                "page_content": page_content,
                "metadata": metadata,
            })

        print(f"Created {len(documents)} document objects")

        # Show a sample document structure
        if documents:
            print("\n📋 Sample Document Structure:")
            sample_doc = documents[0]
            print(f"Page Content Length: {len(sample_doc['page_content'])} chars")
            print("Page Content Preview:")
            print(f"  {sample_doc['page_content'][:300]}...")
            print("\nMetadata:")
            for key, value in sample_doc['metadata'].items():
                print(f"  {key}: {value}")

        return True

    except Exception as e:
        print(f"❌ Analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def find_overlap(text1, text2):
    """Find overlapping text between two chunks."""
    # Look for overlap from the end of text1 to the beginning of text2
    max_overlap = min(200, len(text1), len(text2))  # Match chunk_overlap=200
    for i in range(max_overlap, 0, -1):
        if text1[-i:] == text2[:i]:
            return text1[-i:]
    return ""


def test_chunking_parameters():
    """Test different chunking parameters to understand the strategy."""
    print("\n🧪 Chunking Parameters Test")
    print("=" * 60)

    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        # Get a test article
        loader = FirebaseNewsLoader()
        articles = loader.fetch_english_articles(limit=1)
        if not articles:
            print("❌ No test article available")
            return False

        test_content = articles[0].content
        print(f"Test Content Length: {len(test_content)} characters")

        # Test different chunk sizes
        test_configs = [
            {"chunk_size": 500, "chunk_overlap": 100},
            {"chunk_size": 1000, "chunk_overlap": 200},  # Current setting
            {"chunk_size": 1500, "chunk_overlap": 300},
            {"chunk_size": 2000, "chunk_overlap": 400},
        ]

        for config in test_configs:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=config["chunk_size"],
                chunk_overlap=config["chunk_overlap"],
            )
            chunks = splitter.split_text(test_content)

            print(f"\n📊 Config: chunk_size={config['chunk_size']}, overlap={config['chunk_overlap']}")
            print(f"  Chunks created: {len(chunks)}")

            if chunks:
                chunk_lengths = [len(chunk) for chunk in chunks]
                avg_length = sum(chunk_lengths) / len(chunk_lengths)
                print(f"  Average chunk length: {avg_length:.0f} chars")
                print(f"  Chunk length range: {min(chunk_lengths)} - {max(chunk_lengths)} chars")

                # Show the first chunk
                print(f"  First chunk preview: {chunks[0][:100]}...")

                # Test overlap
                if len(chunks) > 1:
                    overlap = find_overlap(chunks[0], chunks[1])
                    print(f"  Actual overlap: {len(overlap)} chars")

        return True

    except Exception as e:
        print(f"❌ Parameter test failed: {e}")
        return False
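

# Rough analytical cross-check for the parameter sweep above. This helper is an
# illustration, not part of the pipeline: it assumes fixed-stride chunking at
# the full chunk_size, while RecursiveCharacterTextSplitter prefers to break on
# separators and rarely fills chunks completely, so treat the result as a
# ballpark figure rather than an exact count.
def estimate_chunk_count(text_length: int, chunk_size: int, chunk_overlap: int) -> int:
    """Estimate how many chunks a text of the given length should produce."""
    import math

    if text_length <= 0:
        return 0
    if text_length <= chunk_size:
        return 1
    # Each chunk after the first consumes (chunk_size - chunk_overlap) new chars.
    stride = chunk_size - chunk_overlap
    # e.g. estimate_chunk_count(5000, 1000, 200) -> 6
    return 1 + math.ceil((text_length - chunk_size) / stride)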


def analyze_current_vector_db():
    """Analyze what's currently in the vector database."""
    print("\n🗄️ Current Vector Database Analysis")
    print("=" * 60)

    try:
        from cve_factchecker.retriever import VectorNewsRetriever

        retriever = VectorNewsRetriever()

        # Try a few different search queries to see what chunks look like
        test_queries = [
            "security vulnerability",
            "cyberattack",
            "data breach",
            "malware",
            "terrorism",
        ]

        for query in test_queries:
            print(f"\n🔍 Search: '{query}'")
            results = retriever.semantic_search(query, k=2)

            if results:
                for i, result in enumerate(results, 1):
                    print(f"\n  Result {i}:")
                    print(f"  Title: {result['title'][:60]}...")
                    print(f"  Content Length: {len(result['content'])} chars")
                    print(f"  Content Preview: {result['content'][:200]}...")
                    print(f"  URL: {result['url']}")
                    print(f"  Source: {result['source']}")

                    # Check chunk metadata
                    metadata = result.get('metadata', {})
                    if 'chunk_id' in metadata:
                        print(f"  Chunk ID: {metadata['chunk_id']}")
            else:
                print("  No results found")

            if results:
                break  # Stop after the first successful query

        return True

    except Exception as e:
        print(f"❌ Vector DB analysis failed: {e}")
        return False


def main():
    """Main analysis function."""
    print("📊 CVE Fact Checker - Chunking Strategy Analysis")
    print("=" * 80)

    # Run all analyses
    success1 = analyze_chunking_strategy()
    success2 = test_chunking_parameters() if success1 else False
    success3 = analyze_current_vector_db() if success1 else False

    print("\n📋 Analysis Summary:")
    print(f"  Chunking Process: {'✅ Analyzed' if success1 else '❌ Failed'}")
    print(f"  Parameter Testing: {'✅ Completed' if success2 else '❌ Failed'}")
    print(f"  Vector DB Content: {'✅ Analyzed' if success3 else '❌ Failed'}")

    if success1:
        print("\n💡 Current Chunking Strategy:")
        print("  📏 Chunk Size: 1000 characters")
        print("  🔄 Overlap: 200 characters")
        print("  🔪 Splitter: RecursiveCharacterTextSplitter")
        print("  📝 Format: Title + Content + Source")
        print("  🏷️ Metadata: URL, source, dates, chunk_id")

    return success1


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)