|
import fitz |
|
import camelot |
|
import pandas as pd |
|
from bs4 import BeautifulSoup |
|
import re |
|
from pathlib import Path |
|
import traceback |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Minimum dimensions (in pixels) below which an embedded image is skipped by
# extract_images_from_page — filters out icons, bullets and other decorations.
IMAGE_MIN_WIDTH = 100

IMAGE_MIN_HEIGHT = 100
|
|
|
def clean_text(text):
    """Normalize whitespace in *text* while keeping paragraph breaks.

    Tabs become spaces, runs of spaces collapse to one, and three or more
    consecutive newlines are capped at two (one blank line). Falsy input
    (None, empty string) yields "".
    """
    if not text:
        return ""

    # Tabs -> spaces, then collapse space runs; newlines are left intact here
    # so paragraph structure survives.
    normalized = re.sub(r' +', ' ', text.replace('\t', ' '))

    # Cap blank-line runs at a single blank line.
    normalized = re.sub(r'\n{3,}', '\n\n', normalized)

    return normalized.strip()
|
|
|
def extract_page_data_pymupdf(pdf_path, image_save_dir, table_save_dir, image_save_subdir, table_save_subdir):
    """Extract text, tables and save images from each page using PyMuPDF and Camelot.

    Args:
        pdf_path: Path to the source PDF.
        image_save_dir: Directory (Path) where image/table snapshot files are written.
        table_save_dir: Directory (Path) where table HTML files are written.
        image_save_subdir: Sub-directory name used to build ``/static`` image URLs.
        table_save_subdir: Sub-directory name used to build ``/static`` table URLs.

    Returns:
        A list of per-page dicts (text, images, tables, PDF metadata). On error,
        returns whatever was collected before the failure (possibly empty).
    """
    page_data_list = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata or {}
        pdf_data = {
            'pdf_title': metadata.get('title', pdf_path.name),
            'pdf_subject': metadata.get('subject', 'Statistiques'),
            'pdf_keywords': metadata.get('keywords', '')
        }

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_index = page_num + 1  # Camelot and the output dicts are 1-based

            print(f" Extraction des données de la page {page_index}...")

            table_data = extract_tables_and_images_from_page(pdf_path, page, page_index, table_save_dir, image_save_dir, image_save_subdir, table_save_subdir)

            # Collect table bounding boxes so table content is not duplicated
            # in the plain-text extraction below.
            table_regions = []
            for item in table_data:
                if 'rect' in item and item['rect'] and len(item['rect']) == 4:
                    table_regions.append(fitz.Rect(item['rect']))
                else:
                    print(f" Warning: Invalid rect for table on page {page_index}")

            if table_regions:
                # Keep only text blocks that do not overlap any table region.
                page_text = ""
                for block in page.get_text("blocks"):
                    block_rect = fitz.Rect(block[:4])
                    if not any(block_rect.intersects(t_rect) for t_rect in table_regions):
                        page_text += block[4] + "\n"
            else:
                page_text = page.get_text("text")

            page_text = clean_text(page_text)

            # Tables are already captured; exclude their regions from image pickup.
            image_data = extract_images_from_page(pdf_path, page, page_index, image_save_dir, image_save_subdir, excluded_rects=table_regions)

            page_data_list.append({
                'pdf_file': pdf_path.name,
                'page_number': page_index,
                'text': page_text,
                'images': image_data,
                'tables': [item for item in table_data if item['content_type'] == 'table'],
                'pdf_title': pdf_data.get('pdf_title'),
                'pdf_subject': pdf_data.get('pdf_subject'),
                'pdf_keywords': pdf_data.get('pdf_keywords')
            })
    except Exception as e:
        print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}")
        traceback.print_exc()
    finally:
        # BUG FIX: the original called doc.close() inside the try body, so the
        # document handle leaked whenever an exception was raised mid-extraction.
        if doc is not None:
            doc.close()
    return page_data_list
|
|
|
|
|
def extract_tables_and_images_from_page(pdf_path, page, page_num, table_save_dir, image_save_dir, image_save_subdir, table_save_subdir):
    """Extract tables using Camelot and capture images of table areas.

    Args:
        pdf_path: Path to the source PDF (Camelot re-opens the file itself).
        page: The fitz.Page being processed (used for the PNG snapshot).
        page_num: 1-based page number.
        table_save_dir: Directory (Path) receiving the styled HTML files.
        image_save_dir: Directory (Path) receiving the PNG snapshots.
        image_save_subdir: Sub-directory name for the ``/static`` image URL.
        table_save_subdir: Sub-directory name for the ``/static`` HTML URL.

    Returns:
        A list of dicts (content_type 'table') with HTML/image URLs, the table
        rect in fitz coordinates, accuracy, and raw PNG bytes; [] on failure.
    """
    table_and_image_data = []
    try:
        # Prefer 'lattice' (ruled tables); fall back to 'stream' for
        # borderless, whitespace-aligned tables.
        tables = camelot.read_pdf(
            str(pdf_path),
            pages=str(page_num),
            flavor='lattice',
        )
        if len(tables) == 0:
            tables = camelot.read_pdf(
                str(pdf_path),
                pages=str(page_num),
                flavor='stream'
            )

        page_height = page.rect.height

        for i, table in enumerate(tables):
            if table.accuracy < 70:
                print(f" Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}")
                continue

            # BUG FIX: parsing_report has no 'page_bbox' key (its keys are
            # 'accuracy', 'whitespace', 'order' and 'page'), so the original
            # code always fell back to [0, 0, 0, 0] and never captured a real
            # region. The actual bbox is table._bbox, expressed in PDF
            # coordinates (origin bottom-left); flip the y-axis for fitz
            # (origin top-left).
            table_bbox = getattr(table, '_bbox', None)
            if not table_bbox or len(table_bbox) != 4:
                print(f" Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.")
                table_rect = None
            else:
                bx0, by0, bx1, by1 = table_bbox
                table_rect = fitz.Rect(
                    min(bx0, bx1), page_height - max(by0, by1),
                    max(bx0, bx1), page_height - min(by0, by1)
                )

            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html"
            table_html_save_path = table_save_dir / table_html_filename
            relative_html_url_path = f"/static/{table_save_subdir}/{table_html_filename}"

            table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png"
            table_image_save_path = image_save_dir / table_image_filename
            relative_image_url_path = f"/static/{image_save_subdir}/{table_image_filename}"

            df = table.df
            soup = BeautifulSoup(df.to_html(index=False), 'html.parser')
            table_tag = soup.find('table')
            if table_tag:
                # BUG FIX: <caption> must be the first child of <table> to be
                # valid HTML; the original prepended it outside the table.
                caption_tag = soup.new_tag('caption')
                caption_tag.string = f"Table extrait de {pdf_path.name}, page {page_num}"
                table_tag.insert(0, caption_tag)

                table_tag['class'] = 'table table-bordered table-striped'
                table_tag['style'] = 'width:100%; border-collapse:collapse;'

                style_tag = soup.new_tag('style')
                style_tag.string = """
            .table { border-collapse: collapse; width: 100%; margin-bottom: 1rem;}
            .table caption { caption-side: top; padding: 0.5rem; text-align: left; font-weight: bold; }
            .table th, .table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
            .table th { background-color: #f2f2f2; font-weight: bold; }
            .table-striped tbody tr:nth-of-type(odd) { background-color: rgba(0,0,0,.05); }
            .table-responsive { overflow-x: auto; margin-bottom: 1rem; }
            """
                soup.insert(0, style_tag)

                # Horizontal-scroll wrapper for narrow viewports.
                div = soup.new_tag('div')
                div['class'] = 'table-responsive'
                table_tag.wrap(div)

                with open(table_html_save_path, 'w', encoding='utf-8') as f:
                    f.write(str(soup))
            else:
                print(f" Warning: Could not find table tag in HTML for table on page {page_num}. Skipping HTML save.")
                continue

            # Capture a PNG snapshot of the table area (best effort).
            table_image_bytes = None
            if table_rect is not None and not table_rect.is_empty:
                try:
                    pix = page.get_pixmap(clip=table_rect)
                    # BUG FIX: Pixmap.tobytes takes 'output', not 'format';
                    # the original keyword raised TypeError and every snapshot
                    # was silently dropped by the except below. Pass it
                    # positionally, which works across PyMuPDF versions.
                    table_image_bytes = pix.tobytes("png")

                    with open(table_image_save_path, "wb") as img_file:
                        img_file.write(table_image_bytes)

                except Exception as img_capture_e:
                    print(f" Erreur lors de la capture d'image du tableau {i} page {page_num} : {img_capture_e}")
                    traceback.print_exc()
                    table_image_bytes = None

            table_and_image_data.append({
                'content_type': 'table',
                'table_html_url': relative_html_url_path,
                'table_text_representation': df.to_string(index=False),
                'rect': [table_rect.x0, table_rect.y0, table_rect.x1, table_rect.y1] if table_rect is not None else None,
                'accuracy': table.accuracy,
                'image_bytes': table_image_bytes,
                'image_url': relative_image_url_path if table_image_bytes else None
            })

        return table_and_image_data

    except Exception as e:
        print(f" Erreur lors de l'extraction des tableaux de la page {page_num} : {str(e)}")
        traceback.print_exc()
        return []
|
|
|
def extract_images_from_page(pdf_path, page, page_num, image_save_dir, image_save_subdir, excluded_rects=None):
    """Extract and save images from a page, excluding specified regions (like tables).

    Args:
        pdf_path: Path to the source PDF (used only for output filenames).
        page: The fitz.Page to scan.
        page_num: 1-based page number (used in filenames and log messages).
        image_save_dir: Directory (Path) receiving the image files.
        image_save_subdir: Sub-directory name for the ``/static`` URL.
        excluded_rects: Optional iterable of fitz.Rect regions to skip
            (e.g. table areas already captured separately).

    Returns:
        A list of dicts (content_type 'image') with URL, rect and raw bytes.
    """
    # BUG FIX: the original used a mutable default argument (excluded_rects=[]),
    # a classic Python pitfall; use a None sentinel instead.
    if excluded_rects is None:
        excluded_rects = []

    image_data = []
    image_list = page.get_images(full=True)

    for img_index, img_info in enumerate(image_list):
        xref = img_info[0]
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            width = base_image["width"]
            height = base_image["height"]

            # Drop tiny images (icons, decorations).
            if width < IMAGE_MIN_WIDTH or height < IMAGE_MIN_HEIGHT:
                continue

            # Use the first placement rect; an image with no placement on this
            # page cannot be positioned, so skip it.
            img_rects = page.get_image_rects(xref)
            img_rect = img_rects[0] if img_rects else None
            if img_rect is None:
                print(f" Warning: Could not find rectangle for image {img_index} on page {page_num}. Skipping.")
                continue

            if any(img_rect.intersects(excluded_rect) for excluded_rect in excluded_rects):
                print(f" Image {img_index} on page {page_num} is within an excluded region (e.g., table). Skipping.")
                continue

            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            image_filename = f"{safe_pdf_name}_p{page_num}_img{img_index}.{image_ext}"
            image_save_path = image_save_dir / image_filename
            relative_url_path = f"/static/{image_save_subdir}/{image_filename}"

            with open(image_save_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_data.append({
                'content_type': 'image',
                'image_url': relative_url_path,
                'rect': [img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1],
                'image_bytes': image_bytes
            })

        except Exception as img_save_e:
            print(f" Erreur lors du traitement de l'image {img_index} de la page {page_num} : {img_save_e}")
            traceback.print_exc()

    return image_data
|
|