CVE-FactChecker / analyze_chunking.py
NLPGenius's picture
Fix deployment issues: enhanced environment config, robust background ingestion, improved health checks, production-ready
aa69d4c
raw
history blame
10.3 kB
#!/usr/bin/env python3
"""
Analyze the current chunking strategy by examining actual chunks created from English articles.
"""
import os
import sys
# Add the parent directory to Python path so the local ``cve_factchecker``
# package resolves when this file is run directly as a script (rather than
# via an installed package or from the project root).
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)
def analyze_chunking_strategy():
    """Analyze how articles are being chunked.

    Fetches sample English articles, demonstrates how the configured
    RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=200)
    splits them, and simulates the document objects that would be stored
    in the vector database.

    Returns:
        True on success, False when no articles are available or an
        exception occurs (the traceback is printed).
    """
    print("πŸ” Chunking Strategy Analysis")
    print("=" * 60)
    try:
        # Imported lazily so an environment/config problem surfaces as a
        # reported failure below instead of an import-time crash.
        from cve_factchecker.firebase_loader import FirebaseNewsLoader
        from cve_factchecker.retriever import VectorNewsRetriever
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        # 1. Fetch a few English articles
        loader = FirebaseNewsLoader()
        print("πŸ“Š Fetching sample English articles...")
        articles = loader.fetch_english_articles(limit=3)
        if not articles:
            print("❌ No articles found")
            # Fix: was a bare `return` (None); return False so this path is
            # consistent with the except path and main()'s summary logic.
            return False
        print(f"βœ… Got {len(articles)} articles for analysis")

        # 2. Show article content before chunking
        print(f"\nπŸ“„ Article Content Analysis:")
        for i, article in enumerate(articles, 1):
            print(f"\n Article {i}: {article.title[:80]}...")
            print(f" Content Length: {len(article.content)} characters")
            print(f" URL: {article.url}")
            print(f" Content Preview: {article.content[:200]}...")

        # 3. Demonstrate chunking process (same parameters the pipeline uses)
        print(f"\nπŸ”ͺ Chunking Process Analysis:")
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        for i, article in enumerate(articles, 1):
            print(f"\n--- Article {i} Chunking ---")
            print(f"Title: {article.title}")
            print(f"Original Length: {len(article.content)} chars")
            # Create chunks
            chunks = splitter.split_text(article.content)
            print(f"Number of Chunks: {len(chunks)}")
            # Analyze each chunk
            for j, chunk in enumerate(chunks):
                print(f"\n Chunk {j+1}:")
                print(f" Length: {len(chunk)} characters")
                print(f" Content: {chunk[:150]}...")
                if j < len(chunks) - 1:
                    # Check the actual overlap with the next chunk
                    next_chunk = chunks[j+1]
                    overlap = find_overlap(chunk, next_chunk)
                    print(f" Overlap with next: {len(overlap)} chars")
                    if overlap:
                        print(f" Overlap text: '{overlap[:50]}...'")

        # 4. Test the complete vector storage process
        print(f"\nπŸ—„οΈ Vector Storage Process:")
        # NOTE(review): instantiated, presumably for its setup side effects
        # (embeddings / DB connection) — the instance is not used below;
        # confirm this is intentional before removing.
        retriever = VectorNewsRetriever()
        # Process one article to see the complete document creation
        test_article = articles[0]
        print(f"\nProcessing: {test_article.title[:50]}...")
        # Simulate the document creation process
        chunks = splitter.split_text(test_article.content)
        documents = []
        # Renamed loop index from `i` (shadowed the article index above) to
        # `chunk_idx`; produced chunk_id values are unchanged.
        for chunk_idx, chunk in enumerate(chunks):
            # Skip near-empty chunks, mirroring the ingestion pipeline.
            if len(chunk.strip()) < 30:
                continue
            # Show how page_content is constructed
            page_content = f"Title: {test_article.title}\n\n{chunk}"
            if test_article.source and test_article.source not in chunk:
                page_content += f"\n\nSource: {test_article.source}"
            metadata = {
                "url": test_article.url,
                "source": test_article.source,
                "published_date": test_article.published_date,
                "scraped_date": test_article.scraped_date,
                "id": test_article.article_id,
                "chunk_id": f"{test_article.article_id}_{chunk_idx}",
                "title": test_article.title
            }
            documents.append({
                "page_content": page_content,
                "metadata": metadata
            })
        print(f"Created {len(documents)} document objects")

        # Show sample document structure
        if documents:
            print(f"\nπŸ“‹ Sample Document Structure:")
            sample_doc = documents[0]
            print(f"Page Content Length: {len(sample_doc['page_content'])} chars")
            print(f"Page Content Preview:")
            print(f" {sample_doc['page_content'][:300]}...")
            print(f"\nMetadata:")
            for key, value in sample_doc['metadata'].items():
                print(f" {key}: {value}")
        return True
    except Exception as e:
        print(f"❌ Analysis failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def find_overlap(text1, text2, max_check=200):
    """Return the longest string that is both a suffix of ``text1`` and a
    prefix of ``text2``.

    Used to measure the actual character overlap the text splitter produced
    between two consecutive chunks.

    Args:
        text1: Earlier chunk.
        text2: Following chunk.
        max_check: Upper bound on the overlap length to search for.
            Defaults to 200 to match the production ``chunk_overlap`` setting
            (previously hard-coded).

    Returns:
        The overlapping text, or ``""`` when there is none.
    """
    limit = min(max_check, len(text1), len(text2))
    # Search longest-first so the first match found is the maximal overlap.
    for size in range(limit, 0, -1):
        if text1[-size:] == text2[:size]:
            return text1[-size:]
    return ""
def test_chunking_parameters():
    """Compare several chunk_size / chunk_overlap settings on one real article.

    Prints chunk counts, length statistics, and the measured overlap for each
    configuration so the current production setting can be sanity-checked.

    Returns:
        True on success, False when no test article is available or an
        exception occurs.
    """
    print(f"\nπŸ§ͺ Chunking Parameters Test")
    print("=" * 60)
    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        from cve_factchecker.firebase_loader import FirebaseNewsLoader

        # Get a test article
        loader = FirebaseNewsLoader()
        articles = loader.fetch_english_articles(limit=1)
        if not articles:
            print("❌ No test article available")
            # Fix: was a bare `return` (None); return False for consistency
            # with the exception path below.
            return False
        test_content = articles[0].content
        print(f"Test Content Length: {len(test_content)} characters")

        # Test different chunk sizes (overlap kept at 20% of size)
        test_configs = [
            {"chunk_size": 500, "chunk_overlap": 100},
            {"chunk_size": 1000, "chunk_overlap": 200},  # Current setting
            {"chunk_size": 1500, "chunk_overlap": 300},
            {"chunk_size": 2000, "chunk_overlap": 400},
        ]
        for config in test_configs:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=config["chunk_size"],
                chunk_overlap=config["chunk_overlap"]
            )
            chunks = splitter.split_text(test_content)
            print(f"\nπŸ“Š Config: chunk_size={config['chunk_size']}, overlap={config['chunk_overlap']}")
            print(f" Chunks created: {len(chunks)}")
            if chunks:
                chunk_lengths = [len(chunk) for chunk in chunks]
                avg_length = sum(chunk_lengths) / len(chunk_lengths)
                print(f" Average chunk length: {avg_length:.0f} chars")
                print(f" Chunk length range: {min(chunk_lengths)} - {max(chunk_lengths)} chars")
                # Show first chunk
                print(f" First chunk preview: {chunks[0][:100]}...")
                # Measure the overlap actually produced (configured overlap
                # is an upper bound, not a guarantee).
                if len(chunks) > 1:
                    overlap = find_overlap(chunks[0], chunks[1])
                    print(f" Actual overlap: {len(overlap)} chars")
        return True
    except Exception as e:
        print(f"❌ Parameter test failed: {e}")
        return False
def analyze_current_vector_db():
    """Probe the vector database with sample queries and show what comes back.

    Tries a handful of topical queries and prints the stored chunk content
    and metadata for the first query that returns results.

    Returns:
        True when the probe completed, False when an exception occurred.
    """
    print(f"\nπŸ—„οΈ Current Vector Database Analysis")
    print("=" * 60)
    try:
        from cve_factchecker.retriever import VectorNewsRetriever

        retriever = VectorNewsRetriever()
        # Try a few different search queries; stop at the first that hits.
        for query in ("security vulnerability", "cyberattack", "data breach",
                      "malware", "terrorism"):
            print(f"\nπŸ” Search: '{query}'")
            hits = retriever.semantic_search(query, k=2)
            if not hits:
                print(f" No results found")
                continue
            rank = 0
            for hit in hits:
                rank += 1
                print(f"\n Result {rank}:")
                print(f" Title: {hit['title'][:60]}...")
                print(f" Content Length: {len(hit['content'])} chars")
                print(f" Content Preview: {hit['content'][:200]}...")
                print(f" URL: {hit['url']}")
                print(f" Source: {hit['source']}")
                # Chunk metadata is optional on older entries
                meta = hit.get('metadata', {})
                if 'chunk_id' in meta:
                    print(f" Chunk ID: {meta['chunk_id']}")
            break  # Stop after first successful query
        return True
    except Exception as e:
        print(f"❌ Vector DB analysis failed: {e}")
        return False
def main():
    """Run every chunking analysis and print a summary.

    The parameter and vector-DB checks only run when the core chunking
    analysis succeeded; the core result is what gets returned.
    """
    print("πŸ“Š CVE Fact Checker - Chunking Strategy Analysis")
    print("=" * 80)
    # Run all analyses; the follow-up checks depend on the first one.
    core_ok = analyze_chunking_strategy()
    if core_ok:
        params_ok = test_chunking_parameters()
        vectordb_ok = analyze_current_vector_db()
    else:
        params_ok = False
        vectordb_ok = False
    print(f"\nπŸ“‹ Analysis Summary:")
    print(f" Chunking Process: {'βœ… Analyzed' if core_ok else '❌ Failed'}")
    print(f" Parameter Testing: {'βœ… Completed' if params_ok else '❌ Failed'}")
    print(f" Vector DB Content: {'βœ… Analyzed' if vectordb_ok else '❌ Failed'}")
    if core_ok:
        print(f"\nπŸ’‘ Current Chunking Strategy:")
        print(f" πŸ“ Chunk Size: 1000 characters")
        print(f" πŸ”„ Overlap: 200 characters")
        print(f" πŸ”ͺ Splitter: RecursiveCharacterTextSplitter")
        print(f" πŸ“ Format: Title + Content + Source")
        print(f" 🏷️ Metadata: URL, source, dates, chunk_id")
    return core_ok
if __name__ == "__main__":
    # Exit status mirrors the core analysis result (0 = success, 1 = failure).
    sys.exit(0 if main() else 1)