import os
import json
import traceback
from pathlib import Path

import tiktoken
import google.generativeai as genai  # Needed for the text-only fallback used for tables below

# Import functions from your src directory
from src.pdf_processor import extract_page_data_pymupdf, clean_text
from src.embedding_utils import (
    initialize_clients,
    token_chunking,
    generate_multimodal_description,
    generate_text_embedding,
    ENCODING_NAME,
    MAX_TOKENS_NORMAL,
)

# --- Configuration ---
# You can set these directly or get them from environment variables (recommended)
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
LOCATION = os.getenv("VERTEX_AI_LOCATION")
GENAI_API_KEY = os.getenv("GENAI_API_KEY")  # For the Gemini API

# Path configuration
BASE_DIR = Path.cwd()  # Current working directory of the script
PDF_DIRECTORY = BASE_DIR / "docs"
OUTPUT_DIR = BASE_DIR / "output"  # Output directory for generated files
EMBEDDINGS_FILE_PATH = OUTPUT_DIR / "embeddings_statistiques_multimodal.json"

# Subdirectories (within output) for extracted images and table HTML
IMAGE_SAVE_SUBDIR = "extracted_graphs"
TABLE_SAVE_SUBDIR = "extracted_tables"

# Absolute paths for saving
IMAGE_SAVE_DIR = OUTPUT_DIR / IMAGE_SAVE_SUBDIR
TABLE_SAVE_DIR = OUTPUT_DIR / TABLE_SAVE_SUBDIR


# --- Main Processing Function ---
def process_pdfs_in_directory(directory):
    """Main processing pipeline for all PDFs in a directory."""
    all_embeddings_data = []
    processed_files = 0

    pdf_files = list(directory.glob("*.pdf"))
    total_files = len(pdf_files)

    if total_files == 0:
        print(f"Aucun fichier PDF trouvé dans le répertoire : {directory}")
        return []

    for pdf_file_path in pdf_files:
        processed_files += 1
        print(f"\nTraitement de {pdf_file_path.name} ({processed_files}/{total_files})...")

        page_data_list = extract_page_data_pymupdf(
            pdf_file_path, IMAGE_SAVE_DIR, TABLE_SAVE_DIR, IMAGE_SAVE_SUBDIR, TABLE_SAVE_SUBDIR
        )

        if not page_data_list:
            print(f"  Aucune donnée extraite de {pdf_file_path.name}.")
            continue

        for page_data in page_data_list:
            pdf_file = page_data['pdf_file']
            page_num = page_data['page_number']
            page_text = page_data['text']
            images = page_data['images']  # List of non-table image dicts
            tables = page_data['tables']  # List of table dicts
            pdf_title = page_data.get('pdf_title')
            pdf_subject = page_data.get('pdf_subject')
            pdf_keywords = page_data.get('pdf_keywords')

            print(f"  Génération des descriptions et embeddings pour la page {page_num}...")

            # Process tables: generate a description, then an embedding
            for table_idx, table in enumerate(tables):
                table_image_bytes = table.get('image_bytes')
                table_text_repr = table.get('table_text_representation', '')
                table_html_url = table.get('table_html_url')

                description = None
                if table_image_bytes:
                    prompt = "Décrivez en français le contenu et la structure de ce tableau. Mettez l'accent sur les données principales et les tendances si visibles."
                    print(f"    Page {page_num}: Génération de la description multimodale pour le tableau {table_idx}...")
                    description = generate_multimodal_description(table_image_bytes, prompt)
                elif table_text_repr:
                    prompt = f"Décrivez en français le contenu et la structure de ce tableau basé sur sa représentation textuelle:\n{table_text_repr[:1000]}..."
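                    # Note: the textual representation is truncated to ~1000 characters
                    # above to keep the fallback prompt small; adjust if your tables are larger.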
print(f" Page {page_num}: Génération de la description textuelle pour le tableau {table_idx} (fallback)...") # Use the multimodal model with text-only input (via google.generativeai) if GENAI_API_KEY: try: model = genai.GenerativeModel("models/gemini-1.5-flash-latest") # Explicitly use the model response = model.generate_content(prompt) description = response.text.strip() except Exception as e: print(f" Erreur lors de la génération de description textuelle pour le tableau {table_idx}: {e}") description = None else: print(" Skipping text description generation for table: GEMINI_API_KEY is not set.") description = None if description: print(f" Page {page_num}: Description générée pour le tableau {table_idx}.") embedding_vector = generate_text_embedding(description) # max_retries, delay are defaults if embedding_vector is not None: chunk_data = { "pdf_file": pdf_file, "page_number": page_num, "chunk_id": f"table_{table_idx}", "content_type": "table", "text_content": description, "embedding": embedding_vector, "table_html_url": table_html_url, "image_url": table.get('image_url'), "pdf_title": pdf_title, "pdf_subject": pdf_subject, "pdf_keywords": pdf_keywords } all_embeddings_data.append(chunk_data) print(f" Page {page_num}: Embedding généré pour la description du tableau {table_idx}.") else: print(f" Page {page_num}: Échec de la génération de l'embedding pour la description du tableau {table_idx}. Chunk ignoré.") else: print(f" Page {page_num}: Aucune description générée pour le tableau {table_idx}. Chunk ignoré.") # Process images (non-table): Generate description and then embedding for img_idx, image in enumerate(images): image_bytes = image.get('image_bytes') image_url = image.get('image_url') if image_bytes: prompt = "Décrivez en français le contenu de cette image. S'il s'agit d'un graphique, décrivez le type de graphique (histogramme, courbe, etc.), les axes, les légendes et les principales informations ou tendances visibles." print(f" Page {page_num}: Génération de la description multimodale pour l'image {img_idx}...") description = generate_multimodal_description(image_bytes, prompt) if description: print(f" Page {page_num}: Description générée pour l'image {img_idx}.") embedding_vector = generate_text_embedding(description) # max_retries, delay are defaults if embedding_vector is not None: chunk_data = { "pdf_file": pdf_file, "page_number": page_num, "chunk_id": f"image_{img_idx}", "content_type": "image", "text_content": description, "embedding": embedding_vector, "image_url": image_url, "pdf_title": pdf_title, "pdf_subject": pdf_subject, "pdf_keywords": pdf_keywords } all_embeddings_data.append(chunk_data) print(f" Page {page_num}: Embedding généré pour la description de l'image {img_idx}.") else: print(f" Page {page_num}: Échec de la génération de l'embedding pour la description de l'image {img_idx}. Chunk ignoré.") else: print(f" Page {page_num}: Aucune description générée pour l'image {img_idx}. Chunk ignoré.") # Process regular text: Chunk and then generate embeddings if page_text: try: encoding = tiktoken.get_encoding(ENCODING_NAME) text_chunks = token_chunking(page_text, MAX_TOKENS_NORMAL, encoding) except Exception as e: print(f"Erreur lors du chunking du texte de la page {page_num} : {e}. 
Utilisation du chunking simple.") text_chunks = [page_text] for chunk_idx, chunk_content in enumerate(text_chunks): print(f" Page {page_num}: Génération de l'embedding pour le chunk de texte {chunk_idx}...") embedding_vector = generate_text_embedding(chunk_content) # max_retries, delay are defaults if embedding_vector is not None: chunk_data = { "pdf_file": pdf_file, "page_number": page_num, "chunk_id": f"text_{chunk_idx}", "content_type": "text", "text_content": chunk_content, "embedding": embedding_vector, "pdf_title": pdf_title, "pdf_subject": pdf_subject, "pdf_keywords": pdf_keywords } all_embeddings_data.append(chunk_data) print(f" Page {page_num}: Chunk de texte {chunk_idx} traité avec succès.") else: print(f" Page {page_num}: Échec de la génération de l'embedding pour le chunk de texte {chunk_idx}. Chunk ignoré.") print(f" Page {page_num} terminée. Éléments traités : {len(tables)} tableaux, {len(images)} images, {len(text_chunks)} chunks de texte.") return all_embeddings_data # --- Main Execution --- if __name__ == "__main__": print("Démarrage du traitement PDF multimodal avec génération de descriptions et embeddings textuels multilingues...") # Validate and create directories if not PDF_DIRECTORY.is_dir(): print(f"❌ ERREUR: Répertoire PDF non trouvé ou n'est pas un répertoire : {PDF_DIRECTORY}") exit(1) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) IMAGE_SAVE_DIR.mkdir(parents=True, exist_ok=True) TABLE_SAVE_DIR.mkdir(parents=True, exist_ok=True) print(f"Répertoire de sortie : {OUTPUT_DIR}") print(f"Répertoire de sauvegarde des images : {IMAGE_SAVE_DIR}") print(f"Répertoire de sauvegarde des tableaux (HTML) : {TABLE_SAVE_DIR}") # Initialize clients for Vertex AI and GenAI initialize_clients(PROJECT_ID, LOCATION, GENAI_API_KEY) final_embeddings = process_pdfs_in_directory(PDF_DIRECTORY) if final_embeddings: print(f"\nTotal d'embeddings générés : {len(final_embeddings)}.") try: with EMBEDDINGS_FILE_PATH.open('w', encoding='utf-8') as f: json.dump(final_embeddings, f, indent=2, ensure_ascii=False) print(f"Embeddings sauvegardés avec succès dans : {EMBEDDINGS_FILE_PATH}") except Exception as e: print(f"\nErreur lors de la sauvegarde du fichier JSON d'embeddings : {e}") traceback.print_exc() else: print("\nAucun embedding n'a été généré.")