"""Multimodal, multilingual PDF embedding pipeline.

Extracts text, tables, and images from PDFs, generates French descriptions of
tables and images with Gemini, and builds a text embedding for every chunk.
"""
import os
import sys
import json
import traceback
from pathlib import Path

import tiktoken
import google.generativeai as genai  # needed for the text-only fallback description of tables
# Import functions from your src directory
from src.pdf_processor import extract_page_data_pymupdf, clean_text
from src.embedding_utils import initialize_clients, token_chunking, generate_multimodal_description, generate_text_embedding, ENCODING_NAME, MAX_TOKENS_NORMAL
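# NOTE: the helpers imported from src/ are not shown here; the contracts below are
# assumptions inferred from how this script uses them.
#   - extract_page_data_pymupdf(pdf_path, img_dir, tbl_dir, img_subdir, tbl_subdir):
#       expected to return a list of per-page dicts (keys unpacked in the main loop below).
#   - generate_multimodal_description(image_bytes, prompt) -> str | None
#   - generate_text_embedding(text) -> list[float] | None (retries handled internally)
#   - token_chunking(text, max_tokens, encoding) -> list[str]
#   - initialize_clients(project_id, location, api_key): expected to set up Vertex AI and genai.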
# --- Configuration ---
# You can set these directly or get them from environment variables (recommended)
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
LOCATION = os.getenv("VERTEX_AI_LOCATION")
GENAI_API_KEY = os.getenv("GENAI_API_KEY") # For Gemini API
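# Example environment setup before running (values below are placeholders):
#   export GOOGLE_CLOUD_PROJECT="my-gcp-project"
#   export VERTEX_AI_LOCATION="us-central1"
#   export GENAI_API_KEY="your-gemini-api-key"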
# Path configuration
BASE_DIR = Path.cwd()  # Current working directory of the process (not necessarily the script's location)
PDF_DIRECTORY = BASE_DIR / "docs"
OUTPUT_DIR = BASE_DIR / "output" # New output directory for generated files
EMBEDDINGS_FILE_PATH = OUTPUT_DIR / "embeddings_statistiques_multimodal.json"
# Directory to save extracted images and tables HTML (within output)
IMAGE_SAVE_SUBDIR = "extracted_graphs"
TABLE_SAVE_SUBDIR = "extracted_tables"
# Absolute paths for saving
IMAGE_SAVE_DIR = OUTPUT_DIR / IMAGE_SAVE_SUBDIR
TABLE_SAVE_DIR = OUTPUT_DIR / TABLE_SAVE_SUBDIR
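# Expected on-disk layout after a run (relative to the working directory):
#   output/
#     embeddings_statistiques_multimodal.json   <- all chunks with their embeddings
#     extracted_graphs/                         <- page images that are not tables
#     extracted_tables/                         <- extracted tables rendered as HTML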
# --- Main Processing Function ---
def process_pdfs_in_directory(directory):
"""Main processing pipeline for all PDFs in a directory."""
all_embeddings_data = []
processed_files = 0
pdf_files = list(directory.glob("*.pdf"))
total_files = len(pdf_files)
if total_files == 0:
print(f"Aucun fichier PDF trouvé dans le répertoire : {directory}")
return []
for pdf_file_path in pdf_files:
processed_files += 1
print(f"\nTraitement de {pdf_file_path.name} ({processed_files}/{total_files})...")
page_data_list = extract_page_data_pymupdf(pdf_file_path, IMAGE_SAVE_DIR, TABLE_SAVE_DIR, IMAGE_SAVE_SUBDIR, TABLE_SAVE_SUBDIR)
if not page_data_list:
print(f" Aucune donnée extraite de {pdf_file_path.name}.")
continue
for page_data in page_data_list:
pdf_file = page_data['pdf_file']
page_num = page_data['page_number']
page_text = page_data['text']
images = page_data['images'] # List of non-table image dicts
tables = page_data['tables'] # List of table dicts
pdf_title = page_data.get('pdf_title')
pdf_subject = page_data.get('pdf_subject')
pdf_keywords = page_data.get('pdf_keywords')
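            # Expected page_data shape (inferred from the keys accessed in this loop):
            #   {'pdf_file': str, 'page_number': int, 'text': str,
            #    'images': [{'image_bytes': bytes, 'image_url': str}, ...],
            #    'tables': [{'image_bytes': bytes, 'table_text_representation': str,
            #                'table_html_url': str, 'image_url': str}, ...],
            #    'pdf_title': ..., 'pdf_subject': ..., 'pdf_keywords': ...}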
print(f" Génération des descriptions et embeddings pour la page {page_num}...")
# Process tables: Generate description and then embedding
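            # Strategy: prefer a multimodal description from the table snapshot; if no image is
            # available, fall back to a text-only Gemini call on the table's text representation.
            # Either path may yield None, in which case the table is skipped.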
for table_idx, table in enumerate(tables):
table_image_bytes = table.get('image_bytes')
table_text_repr = table.get('table_text_representation', '')
table_html_url = table.get('table_html_url')
description = None
if table_image_bytes:
prompt = "Décrivez en français le contenu et la structure de ce tableau. Mettez l'accent sur les données principales et les tendances si visibles."
print(f" Page {page_num}: Génération de la description multimodale pour le tableau {table_idx}...")
description = generate_multimodal_description(table_image_bytes, prompt)
elif table_text_repr:
prompt = f"Décrivez en français le contenu et la structure de ce tableau basé sur sa représentation textuelle:\n{table_text_repr[:1000]}..."
print(f" Page {page_num}: Génération de la description textuelle pour le tableau {table_idx} (fallback)...")
# Use the multimodal model with text-only input (via google.generativeai)
if GENAI_API_KEY:
try:
                            model = genai.GenerativeModel("models/gemini-1.5-flash-latest")  # text-only call; no table snapshot available
response = model.generate_content(prompt)
description = response.text.strip()
except Exception as e:
print(f" Erreur lors de la génération de description textuelle pour le tableau {table_idx}: {e}")
description = None
else:
print(" Skipping text description generation for table: GEMINI_API_KEY is not set.")
description = None
if description:
print(f" Page {page_num}: Description générée pour le tableau {table_idx}.")
embedding_vector = generate_text_embedding(description) # max_retries, delay are defaults
if embedding_vector is not None:
chunk_data = {
"pdf_file": pdf_file,
"page_number": page_num,
"chunk_id": f"table_{table_idx}",
"content_type": "table",
"text_content": description,
"embedding": embedding_vector,
"table_html_url": table_html_url,
"image_url": table.get('image_url'),
"pdf_title": pdf_title,
"pdf_subject": pdf_subject,
"pdf_keywords": pdf_keywords
}
all_embeddings_data.append(chunk_data)
print(f" Page {page_num}: Embedding généré pour la description du tableau {table_idx}.")
else:
print(f" Page {page_num}: Échec de la génération de l'embedding pour la description du tableau {table_idx}. Chunk ignoré.")
else:
print(f" Page {page_num}: Aucune description générée pour le tableau {table_idx}. Chunk ignoré.")
# Process images (non-table): Generate description and then embedding
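            # As with tables, the generated French description (not the raw image) is what gets
            # embedded, so visual content is retrievable through the same text-embedding space
            # as regular text chunks.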
for img_idx, image in enumerate(images):
image_bytes = image.get('image_bytes')
image_url = image.get('image_url')
if image_bytes:
prompt = "Décrivez en français le contenu de cette image. S'il s'agit d'un graphique, décrivez le type de graphique (histogramme, courbe, etc.), les axes, les légendes et les principales informations ou tendances visibles."
print(f" Page {page_num}: Génération de la description multimodale pour l'image {img_idx}...")
description = generate_multimodal_description(image_bytes, prompt)
if description:
print(f" Page {page_num}: Description générée pour l'image {img_idx}.")
embedding_vector = generate_text_embedding(description) # max_retries, delay are defaults
if embedding_vector is not None:
chunk_data = {
"pdf_file": pdf_file,
"page_number": page_num,
"chunk_id": f"image_{img_idx}",
"content_type": "image",
"text_content": description,
"embedding": embedding_vector,
"image_url": image_url,
"pdf_title": pdf_title,
"pdf_subject": pdf_subject,
"pdf_keywords": pdf_keywords
}
all_embeddings_data.append(chunk_data)
print(f" Page {page_num}: Embedding généré pour la description de l'image {img_idx}.")
else:
print(f" Page {page_num}: Échec de la génération de l'embedding pour la description de l'image {img_idx}. Chunk ignoré.")
else:
print(f" Page {page_num}: Aucune description générée pour l'image {img_idx}. Chunk ignoré.")
            # Process regular text: chunk and then generate embeddings
            text_chunks = []  # keep defined even when the page has no text (used in the per-page summary below)
            if page_text:
try:
encoding = tiktoken.get_encoding(ENCODING_NAME)
text_chunks = token_chunking(page_text, MAX_TOKENS_NORMAL, encoding)
except Exception as e:
print(f"Erreur lors du chunking du texte de la page {page_num} : {e}. Utilisation du chunking simple.")
text_chunks = [page_text]
for chunk_idx, chunk_content in enumerate(text_chunks):
print(f" Page {page_num}: Génération de l'embedding pour le chunk de texte {chunk_idx}...")
embedding_vector = generate_text_embedding(chunk_content) # max_retries, delay are defaults
if embedding_vector is not None:
chunk_data = {
"pdf_file": pdf_file,
"page_number": page_num,
"chunk_id": f"text_{chunk_idx}",
"content_type": "text",
"text_content": chunk_content,
"embedding": embedding_vector,
"pdf_title": pdf_title,
"pdf_subject": pdf_subject,
"pdf_keywords": pdf_keywords
}
all_embeddings_data.append(chunk_data)
print(f" Page {page_num}: Chunk de texte {chunk_idx} traité avec succès.")
else:
print(f" Page {page_num}: Échec de la génération de l'embedding pour le chunk de texte {chunk_idx}. Chunk ignoré.")
print(f" Page {page_num} terminée. Éléments traités : {len(tables)} tableaux, {len(images)} images, {len(text_chunks)} chunks de texte.")
return all_embeddings_data
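# Each record appended above and written to EMBEDDINGS_FILE_PATH looks roughly like this
# (illustrative values; optional keys depend on content_type):
# {
#   "pdf_file": "example.pdf",
#   "page_number": 4,
#   "chunk_id": "table_0",                # or "image_0", "text_0"
#   "content_type": "table",              # "table" | "image" | "text"
#   "text_content": "...",                # generated description or raw text chunk
#   "embedding": [0.012, -0.034, ...],    # vector returned by generate_text_embedding
#   "table_html_url": "...",              # tables only
#   "image_url": "...",                   # tables and images only
#   "pdf_title": "...", "pdf_subject": "...", "pdf_keywords": "..."
# }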
# --- Main Execution ---
if __name__ == "__main__":
print("Démarrage du traitement PDF multimodal avec génération de descriptions et embeddings textuels multilingues...")
# Validate and create directories
if not PDF_DIRECTORY.is_dir():
print(f"❌ ERREUR: Répertoire PDF non trouvé ou n'est pas un répertoire : {PDF_DIRECTORY}")
        sys.exit(1)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
TABLE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
print(f"Répertoire de sortie : {OUTPUT_DIR}")
print(f"Répertoire de sauvegarde des images : {IMAGE_SAVE_DIR}")
print(f"Répertoire de sauvegarde des tableaux (HTML) : {TABLE_SAVE_DIR}")
# Initialize clients for Vertex AI and GenAI
initialize_clients(PROJECT_ID, LOCATION, GENAI_API_KEY)
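    # NOTE: initialize_clients() is assumed to run vertexai.init(...) and
    # genai.configure(api_key=GENAI_API_KEY); the genai.GenerativeModel fallback used for
    # tables above relies on that configuration having already happened.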
final_embeddings = process_pdfs_in_directory(PDF_DIRECTORY)
if final_embeddings:
print(f"\nTotal d'embeddings générés : {len(final_embeddings)}.")
try:
with EMBEDDINGS_FILE_PATH.open('w', encoding='utf-8') as f:
json.dump(final_embeddings, f, indent=2, ensure_ascii=False)
print(f"Embeddings sauvegardés avec succès dans : {EMBEDDINGS_FILE_PATH}")
except Exception as e:
print(f"\nErreur lors de la sauvegarde du fichier JSON d'embeddings : {e}")
traceback.print_exc()
else:
print("\nAucun embedding n'a été généré.")