"""Multimodal PDF processing pipeline: extracts per-page text, tables and graphs from the
PDFs in ./docs, generates French descriptions of the visual elements, embeds every
description and text chunk, and saves the results as JSON."""

import os
import json
import traceback
from pathlib import Path

import tiktoken
import google.generativeai as genai  # used below for the text-only table description fallback

from src.pdf_processor import extract_page_data_pymupdf, clean_text
from src.embedding_utils import initialize_clients, token_chunking, generate_multimodal_description, generate_text_embedding, ENCODING_NAME, MAX_TOKENS_NORMAL

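# Configuration is read from environment variables. Example values (illustrative only, not defaults):
#   export GOOGLE_CLOUD_PROJECT=my-gcp-project
#   export VERTEX_AI_LOCATION=us-central1
#   export GENAI_API_KEY=<your Gemini API key>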
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
LOCATION = os.getenv("VERTEX_AI_LOCATION")
GENAI_API_KEY = os.getenv("GENAI_API_KEY")

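# Input PDFs are read from ./docs (relative to the current working directory); all generated
# artifacts (embeddings JSON, extracted images, table HTML) are written under ./output.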
BASE_DIR = Path.cwd()
PDF_DIRECTORY = BASE_DIR / "docs"
OUTPUT_DIR = BASE_DIR / "output"
EMBEDDINGS_FILE_PATH = OUTPUT_DIR / "embeddings_statistiques_multimodal.json"

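# The sub-directory names are passed to the extractor alongside the absolute paths,
# presumably so it can build the relative image/table URLs stored with each chunk.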
IMAGE_SAVE_SUBDIR = "extracted_graphs"
TABLE_SAVE_SUBDIR = "extracted_tables"

IMAGE_SAVE_DIR = OUTPUT_DIR / IMAGE_SAVE_SUBDIR
TABLE_SAVE_DIR = OUTPUT_DIR / TABLE_SAVE_SUBDIR


def process_pdfs_in_directory(directory):
    """Main processing pipeline for all PDFs in a directory."""
    all_embeddings_data = []
    processed_files = 0
    pdf_files = list(directory.glob("*.pdf"))
    total_files = len(pdf_files)

    if total_files == 0:
        print(f"Aucun fichier PDF trouvé dans le répertoire : {directory}")
        return []

    for pdf_file_path in pdf_files:
        processed_files += 1
        print(f"\nTraitement de {pdf_file_path.name} ({processed_files}/{total_files})...")

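        # Per-page extraction: text, embedded images (saved under IMAGE_SAVE_DIR) and
        # tables (saved as HTML under TABLE_SAVE_DIR), plus the PDF metadata.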
        page_data_list = extract_page_data_pymupdf(pdf_file_path, IMAGE_SAVE_DIR, TABLE_SAVE_DIR, IMAGE_SAVE_SUBDIR, TABLE_SAVE_SUBDIR)

        if not page_data_list:
            print(f"  Aucune donnée extraite de {pdf_file_path.name}.")
            continue

        for page_data in page_data_list:
            pdf_file = page_data['pdf_file']
            page_num = page_data['page_number']
            page_text = page_data['text']
            images = page_data['images']
            tables = page_data['tables']
            pdf_title = page_data.get('pdf_title')
            pdf_subject = page_data.get('pdf_subject')
            pdf_keywords = page_data.get('pdf_keywords')

            print(f"  Génération des descriptions et embeddings pour la page {page_num}...")

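            # --- Tables: describe each table (multimodal model when a rendered image of the
            # table exists, text-only Gemini fallback otherwise), then embed the description ---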
            for table_idx, table in enumerate(tables):
                table_image_bytes = table.get('image_bytes')
                table_text_repr = table.get('table_text_representation', '')
                table_html_url = table.get('table_html_url')

                description = None
                if table_image_bytes:
                    prompt = "Décrivez en français le contenu et la structure de ce tableau. Mettez l'accent sur les données principales et les tendances si visibles."
                    print(f"    Page {page_num}: Génération de la description multimodale pour le tableau {table_idx}...")
                    description = generate_multimodal_description(table_image_bytes, prompt)
                elif table_text_repr:
                    prompt = f"Décrivez en français le contenu et la structure de ce tableau basé sur sa représentation textuelle:\n{table_text_repr[:1000]}..."
                    print(f"    Page {page_num}: Génération de la description textuelle pour le tableau {table_idx} (fallback)...")

                    if GENAI_API_KEY:
                        try:
                            model = genai.GenerativeModel("models/gemini-1.5-flash-latest")
                            response = model.generate_content(prompt)
                            description = response.text.strip()
                        except Exception as e:
                            print(f"    Erreur lors de la génération de description textuelle pour le tableau {table_idx}: {e}")
                            description = None
                    else:
                        print("    Skipping text description generation for table: GENAI_API_KEY is not set.")
                        description = None

                if description:
                    print(f"    Page {page_num}: Description générée pour le tableau {table_idx}.")
                    embedding_vector = generate_text_embedding(description)

                    if embedding_vector is not None:
                        chunk_data = {
                            "pdf_file": pdf_file,
                            "page_number": page_num,
                            "chunk_id": f"table_{table_idx}",
                            "content_type": "table",
                            "text_content": description,
                            "embedding": embedding_vector,
                            "table_html_url": table_html_url,
                            "image_url": table.get('image_url'),
                            "pdf_title": pdf_title,
                            "pdf_subject": pdf_subject,
                            "pdf_keywords": pdf_keywords
                        }
                        all_embeddings_data.append(chunk_data)
                        print(f"    Page {page_num}: Embedding généré pour la description du tableau {table_idx}.")
                    else:
                        print(f"    Page {page_num}: Échec de la génération de l'embedding pour la description du tableau {table_idx}. Chunk ignoré.")
                else:
                    print(f"    Page {page_num}: Aucune description générée pour le tableau {table_idx}. Chunk ignoré.")

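            # --- Images / graphs: multimodal description of each extracted image, then
            # embedding of the generated description ---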
            for img_idx, image in enumerate(images):
                image_bytes = image.get('image_bytes')
                image_url = image.get('image_url')

                if image_bytes:
                    prompt = "Décrivez en français le contenu de cette image. S'il s'agit d'un graphique, décrivez le type de graphique (histogramme, courbe, etc.), les axes, les légendes et les principales informations ou tendances visibles."
                    print(f"    Page {page_num}: Génération de la description multimodale pour l'image {img_idx}...")
                    description = generate_multimodal_description(image_bytes, prompt)

                    if description:
                        print(f"    Page {page_num}: Description générée pour l'image {img_idx}.")
                        embedding_vector = generate_text_embedding(description)

                        if embedding_vector is not None:
                            chunk_data = {
                                "pdf_file": pdf_file,
                                "page_number": page_num,
                                "chunk_id": f"image_{img_idx}",
                                "content_type": "image",
                                "text_content": description,
                                "embedding": embedding_vector,
                                "image_url": image_url,
                                "pdf_title": pdf_title,
                                "pdf_subject": pdf_subject,
                                "pdf_keywords": pdf_keywords
                            }
                            all_embeddings_data.append(chunk_data)
                            print(f"    Page {page_num}: Embedding généré pour la description de l'image {img_idx}.")
                        else:
                            print(f"    Page {page_num}: Échec de la génération de l'embedding pour la description de l'image {img_idx}. Chunk ignoré.")
                    else:
                        print(f"    Page {page_num}: Aucune description générée pour l'image {img_idx}. Chunk ignoré.")

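            # --- Page text: split into token-based chunks (with a whole-page fallback on error),
            # then embed each chunk directly ---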
            text_chunks = []  # initialised here so the per-page summary below is safe even when the page has no text
            if page_text:
                try:
                    encoding = tiktoken.get_encoding(ENCODING_NAME)
                    text_chunks = token_chunking(page_text, MAX_TOKENS_NORMAL, encoding)
                except Exception as e:
                    print(f"Erreur lors du chunking du texte de la page {page_num} : {e}. Utilisation du chunking simple.")
                    text_chunks = [page_text]

                for chunk_idx, chunk_content in enumerate(text_chunks):
                    print(f"    Page {page_num}: Génération de l'embedding pour le chunk de texte {chunk_idx}...")
                    embedding_vector = generate_text_embedding(chunk_content)

                    if embedding_vector is not None:
                        chunk_data = {
                            "pdf_file": pdf_file,
                            "page_number": page_num,
                            "chunk_id": f"text_{chunk_idx}",
                            "content_type": "text",
                            "text_content": chunk_content,
                            "embedding": embedding_vector,
                            "pdf_title": pdf_title,
                            "pdf_subject": pdf_subject,
                            "pdf_keywords": pdf_keywords
                        }
                        all_embeddings_data.append(chunk_data)
                        print(f"    Page {page_num}: Chunk de texte {chunk_idx} traité avec succès.")
                    else:
                        print(f"    Page {page_num}: Échec de la génération de l'embedding pour le chunk de texte {chunk_idx}. Chunk ignoré.")

            print(f"  Page {page_num} terminée. Éléments traités : {len(tables)} tableaux, {len(images)} images, {len(text_chunks)} chunks de texte.")

    return all_embeddings_data


if __name__ == "__main__":
    print("Démarrage du traitement PDF multimodal avec génération de descriptions et embeddings textuels multilingues...")

    if not PDF_DIRECTORY.is_dir():
        print(f"❌ ERREUR: Répertoire PDF non trouvé ou n'est pas un répertoire : {PDF_DIRECTORY}")
        exit(1)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    IMAGE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    TABLE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Répertoire de sortie : {OUTPUT_DIR}")
    print(f"Répertoire de sauvegarde des images : {IMAGE_SAVE_DIR}")
    print(f"Répertoire de sauvegarde des tableaux (HTML) : {TABLE_SAVE_DIR}")

    initialize_clients(PROJECT_ID, LOCATION, GENAI_API_KEY)

    final_embeddings = process_pdfs_in_directory(PDF_DIRECTORY)

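    # Each entry of final_embeddings is a flat dict as built above; an illustrative record
    # (values are made up) looks roughly like:
    #   {"pdf_file": "rapport.pdf", "page_number": 3, "chunk_id": "table_0",
    #    "content_type": "table", "text_content": "…", "embedding": [0.01, ...],
    #    "table_html_url": "extracted_tables/…", "image_url": "extracted_graphs/…",
    #    "pdf_title": "…", "pdf_subject": "…", "pdf_keywords": "…"}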
    if final_embeddings:
        print(f"\nTotal d'embeddings générés : {len(final_embeddings)}.")
        try:
            with EMBEDDINGS_FILE_PATH.open('w', encoding='utf-8') as f:
                json.dump(final_embeddings, f, indent=2, ensure_ascii=False)
            print(f"Embeddings sauvegardés avec succès dans : {EMBEDDINGS_FILE_PATH}")
        except Exception as e:
            print(f"\nErreur lors de la sauvegarde du fichier JSON d'embeddings : {e}")
            traceback.print_exc()
    else:
        print("\nAucun embedding n'a été généré.")