# Author: Anonymous1223334444
# Initial commit of multimodal multilingual PDF embedding pipeline (commit c2e3cf5)
import fitz # PyMuPDF
import camelot # For table extraction
import pandas as pd
from bs4 import BeautifulSoup
import re
from pathlib import Path
import traceback
# Path configuration (assuming these are passed or relative to run_pipeline.py)
# For module, these should ideally be arguments or imported from a config
# BASE_DIR = Path("/content/")
# PDF_DIRECTORY = BASE_DIR / "docs"
# IMAGE_SAVE_SUBDIR = "extracted_graphs"
# TABLE_SAVE_SUBDIR = "extracted_tables"
# STATIC_DIR = BASE_DIR / "static"
# IMAGE_SAVE_DIR = STATIC_DIR / IMAGE_SAVE_SUBDIR
# TABLE_SAVE_DIR = STATIC_DIR / TABLE_SAVE_SUBDIR
# These should be passed as arguments or configured at a higher level
IMAGE_MIN_WIDTH = 100 # Ignore very small images (likely logos/icons)
IMAGE_MIN_HEIGHT = 100  # Minimum height in pixels; images below either threshold are skipped
def clean_text(text):
    """Collapse runs of spaces/tabs while keeping paragraph breaks intact.

    Args:
        text: Raw extracted text (may be None or empty).

    Returns:
        The cleaned string, stripped of leading/trailing whitespace;
        "" when the input is falsy.
    """
    if not text:
        return ""
    # Tabs become spaces so the space-collapsing pass below handles them too.
    normalized = re.sub(r' +', ' ', text.replace('\t', ' '))
    # Cap consecutive blank lines at one (two newlines) to keep paragraphs.
    normalized = re.sub(r'\n{3,}', '\n\n', normalized)
    return normalized.strip()
def extract_page_data_pymupdf(pdf_path, image_save_dir, table_save_dir, image_save_subdir, table_save_subdir):
    """Extract text, tables and save images from each page using PyMuPDF and Camelot.

    Args:
        pdf_path: Path to the PDF file.
        image_save_dir: Directory (Path) where extracted images are written.
        table_save_dir: Directory (Path) where extracted table HTML is written.
        image_save_subdir: Subdirectory name used to build /static image URLs.
        table_save_subdir: Subdirectory name used to build /static table URLs.

    Returns:
        list[dict]: One entry per page with text, images, tables and PDF
        metadata. Empty list if the PDF cannot be processed.
    """
    page_data_list = []
    try:
        # BUG FIX: the original called doc.close() only on the success path,
        # leaking the open document whenever any page raised. The context
        # manager guarantees closure on every exit path.
        with fitz.open(pdf_path) as doc:
            metadata = doc.metadata or {}
            pdf_data = {
                'pdf_title': metadata.get('title', pdf_path.name),
                'pdf_subject': metadata.get('subject', 'Statistiques'),
                'pdf_keywords': metadata.get('keywords', '')
            }
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                page_index = page_num + 1  # 1-based index (Camelot pages are 1-based)
                print(f" Extraction des données de la page {page_index}...")
                # Extract tables first so their regions can be excluded from
                # the plain-text and image passes below.
                table_data = extract_tables_and_images_from_page(
                    pdf_path, page, page_index, table_save_dir, image_save_dir,
                    image_save_subdir, table_save_subdir)
                # Track table regions to avoid double-processing their text.
                table_regions = []
                for item in table_data:
                    if 'rect' in item and item['rect'] and len(item['rect']) == 4:
                        table_regions.append(fitz.Rect(item['rect']))
                    else:
                        print(f" Warning: Invalid rect for table on page {page_index}")
                # Extract text, skipping any block that overlaps a table region.
                page_text = ""
                if table_regions:
                    for block in page.get_text("blocks"):
                        block_rect = fitz.Rect(block[:4])
                        in_table = any(block_rect.intersects(r) for r in table_regions)
                        if not in_table:
                            page_text += block[4] + "\n"  # block[4] holds the text content
                else:
                    # No tables on this page: take all text at once.
                    page_text = page.get_text("text")
                page_text = clean_text(page_text)
                # Extract and save images, excluding those inside table regions.
                image_data = extract_images_from_page(
                    pdf_path, page, page_index, image_save_dir, image_save_subdir,
                    excluded_rects=table_regions)
                page_data_list.append({
                    'pdf_file': pdf_path.name,
                    'page_number': page_index,
                    'text': page_text,
                    'images': image_data,  # non-table images only
                    'tables': [item for item in table_data if item['content_type'] == 'table'],
                    'pdf_title': pdf_data.get('pdf_title'),
                    'pdf_subject': pdf_data.get('pdf_subject'),
                    'pdf_keywords': pdf_data.get('pdf_keywords')
                })
    except Exception as e:
        print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}")
        traceback.print_exc()  # Print traceback for debugging
    return page_data_list
def extract_tables_and_images_from_page(pdf_path, page, page_num, table_save_dir, image_save_dir, image_save_subdir, table_save_subdir):
    """Extract tables using Camelot, save them as styled HTML, and capture PNG snapshots.

    Args:
        pdf_path: Path to the PDF file (Camelot re-reads it from disk).
        page: The fitz.Page object for this page (used for pixmap capture).
        page_num: 1-based page number.
        table_save_dir / image_save_dir: output directories (Path).
        image_save_subdir / table_save_subdir: subdirectories for /static URLs.

    Returns:
        list[dict]: One dict per accepted table with content_type 'table';
        empty list when extraction fails or no tables are found.
    """
    table_and_image_data = []
    try:
        # 'lattice' handles ruled tables; fall back to 'stream'
        # (whitespace-based) when the page has no ruled tables.
        tables = camelot.read_pdf(
            str(pdf_path),
            pages=str(page_num),
            flavor='lattice',
        )
        if len(tables) == 0:
            tables = camelot.read_pdf(
                str(pdf_path),
                pages=str(page_num),
                flavor='stream'
            )
        for i, table in enumerate(tables):
            # Low-accuracy parses are usually garbage; 70% is an empirical cutoff.
            if table.accuracy < 70:
                print(f" Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}")
                continue
            # BUG FIX: Camelot's parsing_report contains no 'page_bbox' key,
            # so the original always fell back to [0, 0, 0, 0] and captured a
            # degenerate rect. Camelot exposes the table bbox as `table._bbox`
            # in PDF coordinates (origin bottom-left); PyMuPDF uses a top-left
            # origin, so the y axis must be flipped against the page height.
            raw_bbox = getattr(table, '_bbox', None)
            if raw_bbox and len(raw_bbox) == 4:
                x0, y0, x1, y1 = raw_bbox
                page_height = page.rect.height
                table_rect = fitz.Rect(x0, page_height - y1, x1, page_height - y0)
            else:
                print(f" Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.")
                table_rect = None
            # Build filesystem-safe output names from the PDF stem.
            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html"
            table_html_save_path = table_save_dir / table_html_filename
            relative_html_url_path = f"/static/{table_save_subdir}/{table_html_filename}"
            table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png"
            table_image_save_path = image_save_dir / table_image_filename
            relative_image_url_path = f"/static/{image_save_subdir}/{table_image_filename}"
            df = table.df
            # Render the table as HTML with a caption, then inject Bootstrap-like
            # styling so the saved fragment is presentable standalone.
            html = f"<caption>Table extrait de {pdf_path.name}, page {page_num}</caption>\n" + df.to_html(index=False)
            soup = BeautifulSoup(html, 'html.parser')
            table_tag = soup.find('table')
            if table_tag:
                table_tag['class'] = 'table table-bordered table-striped'
                table_tag['style'] = 'width:100%; border-collapse:collapse;'
                style_tag = soup.new_tag('style')
                style_tag.string = """
    .table { border-collapse: collapse; width: 100%; margin-bottom: 1rem;}
    .table caption { caption-side: top; padding: 0.5rem; text-align: left; font-weight: bold; }
    .table th, .table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
    .table th { background-color: #f2f2f2; font-weight: bold; }
    .table-striped tbody tr:nth-of-type(odd) { background-color: rgba(0,0,0,.05); }
    .table-responsive { overflow-x: auto; margin-bottom: 1rem; }
    """
                soup.insert(0, style_tag)
                # Wrap the table so wide tables scroll horizontally.
                div = soup.new_tag('div')
                div['class'] = 'table-responsive'
                table_tag.wrap(div)
                with open(table_html_save_path, 'w', encoding='utf-8') as f:
                    f.write(str(soup))
            else:
                print(f" Warning: Could not find table tag in HTML for table on page {page_num}. Skipping HTML save.")
                continue
            # Capture a PNG snapshot of the table area when a valid rect exists.
            table_image_bytes = None
            if table_rect:
                try:
                    pix = page.get_pixmap(clip=table_rect)
                    table_image_bytes = pix.tobytes(format='png')
                    with open(table_image_save_path, "wb") as img_file:
                        img_file.write(table_image_bytes)
                except Exception as img_capture_e:
                    print(f" Erreur lors de la capture d'image du tableau {i} page {page_num} : {img_capture_e}")
                    traceback.print_exc()
                    table_image_bytes = None
            table_and_image_data.append({
                'content_type': 'table',
                'table_html_url': relative_html_url_path,
                'table_text_representation': df.to_string(index=False),
                'rect': [table_rect.x0, table_rect.y0, table_rect.x1, table_rect.y1] if table_rect else None,
                'accuracy': table.accuracy,
                'image_bytes': table_image_bytes,
                'image_url': relative_image_url_path if table_image_bytes else None
            })
        return table_and_image_data
    except Exception as e:
        print(f" Erreur lors de l'extraction des tableaux de la page {page_num} : {str(e)}")
        traceback.print_exc()
        return []
def extract_images_from_page(pdf_path, page, page_num, image_save_dir, image_save_subdir, excluded_rects=None):
    """Extract and save images from a page, excluding specified regions (like tables).

    Args:
        pdf_path: Path to the PDF (used only for naming output files).
        page: The fitz.Page object to scan for embedded images.
        page_num: 1-based page number (used in filenames and logs).
        image_save_dir: Directory (Path) where image files are written.
        image_save_subdir: Subdirectory name used to build /static URLs.
        excluded_rects: Optional list of fitz.Rect regions to skip (e.g. tables).

    Returns:
        list[dict]: One dict per saved image with content_type 'image',
        its /static URL, its rect on the page, and the raw bytes.
    """
    # BUG FIX: the original used a mutable default argument (excluded_rects=[]),
    # which is shared across calls. A None sentinel is the safe idiom.
    if excluded_rects is None:
        excluded_rects = []
    image_data = []
    image_list = page.get_images(full=True)
    for img_index, img_info in enumerate(image_list):
        xref = img_info[0]
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            width = base_image["width"]
            height = base_image["height"]
            # Skip tiny images — most are logos, icons or decorations.
            if width < IMAGE_MIN_WIDTH or height < IMAGE_MIN_HEIGHT:
                continue
            # Locate the image on the page; use the first placement rect.
            img_rect = None
            img_rects = page.get_image_rects(xref)
            if img_rects:
                img_rect = img_rects[0]
            if img_rect is None:
                print(f" Warning: Could not find rectangle for image {img_index} on page {page_num}. Skipping.")
                continue
            # Skip images overlapping excluded regions (already captured as tables).
            is_excluded = False
            for excluded_rect in excluded_rects:
                if img_rect.intersects(excluded_rect):
                    is_excluded = True
                    break
            if is_excluded:
                print(f" Image {img_index} on page {page_num} is within an excluded region (e.g., table). Skipping.")
                continue
            # Build a filesystem-safe output name from the PDF stem.
            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            image_filename = f"{safe_pdf_name}_p{page_num}_img{img_index}.{image_ext}"
            image_save_path = image_save_dir / image_filename
            relative_url_path = f"/static/{image_save_subdir}/{image_filename}"
            with open(image_save_path, "wb") as img_file:
                img_file.write(image_bytes)
            image_data.append({
                'content_type': 'image',
                'image_url': relative_url_path,
                'rect': [img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1],
                'image_bytes': image_bytes
            })
        except Exception as img_save_e:
            print(f" Erreur lors du traitement de l'image {img_index} de la page {page_num} : {img_save_e}")
            traceback.print_exc()
    return image_data