"""Extract per-page text, tables and images from PDFs using PyMuPDF and Camelot.

Tables are detected with Camelot (lattice first, stream as fallback), saved as
styled HTML plus a PNG snapshot of the table region; remaining page text and
non-table images are extracted with PyMuPDF.
"""

import re
import traceback
from pathlib import Path

import camelot  # For table extraction
import fitz  # PyMuPDF
import pandas as pd
from bs4 import BeautifulSoup

# Path configuration (assuming these are passed or relative to run_pipeline.py).
# For a module, these should ideally be arguments or imported from a config:
#   BASE_DIR = Path("/content/")
#   PDF_DIRECTORY = BASE_DIR / "docs"
#   IMAGE_SAVE_SUBDIR = "extracted_graphs"
#   TABLE_SAVE_SUBDIR = "extracted_tables"
#   STATIC_DIR = BASE_DIR / "static"
#   IMAGE_SAVE_DIR = STATIC_DIR / IMAGE_SAVE_SUBDIR
#   TABLE_SAVE_DIR = STATIC_DIR / TABLE_SAVE_SUBDIR
# These should be passed as arguments or configured at a higher level.

# Ignore very small images (likely logos/icons).
IMAGE_MIN_WIDTH = 100
IMAGE_MIN_HEIGHT = 100


def clean_text(text):
    """Normalize whitespace and clean text while preserving paragraph breaks.

    Args:
        text: Raw extracted page text (may be None or empty).

    Returns:
        Cleaned text with tabs converted to spaces, runs of spaces collapsed,
        and 3+ consecutive newlines reduced to a single blank line.
    """
    if not text:
        return ""
    # Replace tabs with spaces, but preserve paragraph breaks.
    text = text.replace('\t', ' ')
    # Normalize multiple spaces to single spaces (newlines untouched).
    text = re.sub(r' +', ' ', text)
    # Preserve paragraph breaks but normalize them to at most one blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


def extract_page_data_pymupdf(pdf_path, image_save_dir, table_save_dir,
                              image_save_subdir, table_save_subdir):
    """Extract text, tables and save images from each page using PyMuPDF and Camelot.

    Args:
        pdf_path: Path to the source PDF.
        image_save_dir: Directory where image PNG/JPG files are written.
        table_save_dir: Directory where table HTML files are written.
        image_save_subdir: Subdirectory name used to build /static image URLs.
        table_save_subdir: Subdirectory name used to build /static table URLs.

    Returns:
        A list of per-page dicts with keys: pdf_file, page_number, text,
        images, tables, pdf_title, pdf_subject, pdf_keywords. Returns an
        empty list if the PDF cannot be processed.
    """
    page_data_list = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata or {}
        # Fall back to filename / generic values when metadata is missing.
        pdf_data = {
            'pdf_title': metadata.get('title', pdf_path.name),
            'pdf_subject': metadata.get('subject', 'Statistiques'),
            'pdf_keywords': metadata.get('keywords', '')
        }

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_index = page_num + 1  # 1-based index (Camelot pages are 1-based)
            print(f" Extraction des données de la page {page_index}...")

            # Extract tables first so their regions can be excluded from the
            # plain-text and image passes below.
            table_data = extract_tables_and_images_from_page(
                pdf_path, page, page_index, table_save_dir, image_save_dir,
                image_save_subdir, table_save_subdir)

            # Track table regions to avoid double-processing text.
            table_regions = []
            for item in table_data:
                if 'rect' in item and item['rect'] and len(item['rect']) == 4:
                    table_regions.append(fitz.Rect(item['rect']))
                else:
                    print(f" Warning: Invalid rect for table on page {page_index}")

            # Extract text excluding table regions.
            page_text = ""
            if table_regions:
                # Walk text blocks and drop any block intersecting a table.
                blocks = page.get_text("blocks")
                for block in blocks:
                    block_rect = fitz.Rect(block[:4])
                    is_in_table = any(
                        block_rect.intersects(table_rect)
                        for table_rect in table_regions
                    )
                    if not is_in_table:
                        page_text += block[4] + "\n"  # block[4] is the text content
            else:
                # If no tables, get all text in one call.
                page_text = page.get_text("text")

            page_text = clean_text(page_text)

            # Extract and save images (excluding those identified as tables).
            image_data = extract_images_from_page(
                pdf_path, page, page_index, image_save_dir, image_save_subdir,
                excluded_rects=table_regions)

            page_data_list.append({
                'pdf_file': pdf_path.name,
                'page_number': page_index,
                'text': page_text,
                'images': image_data,  # Includes non-table images
                # Only table entries here (table_data may carry other types).
                'tables': [item for item in table_data
                           if item['content_type'] == 'table'],
                'pdf_title': pdf_data.get('pdf_title'),
                'pdf_subject': pdf_data.get('pdf_subject'),
                'pdf_keywords': pdf_data.get('pdf_keywords')
            })
    except Exception as e:
        print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}")
        traceback.print_exc()  # Print traceback for debugging
    finally:
        # Close in finally so the document handle is released even when a
        # page-level failure raises mid-loop (previously leaked on error).
        if doc is not None:
            doc.close()

    return page_data_list


def extract_tables_and_images_from_page(pdf_path, page, page_num, table_save_dir,
                                        image_save_dir, image_save_subdir,
                                        table_save_subdir):
    """Extract tables using Camelot and capture images of table areas.

    Args:
        pdf_path: Path to the source PDF (Camelot re-reads it from disk).
        page: The PyMuPDF page object (used for the PNG snapshot of the table).
        page_num: 1-based page number.
        table_save_dir: Directory where the table HTML is written.
        image_save_dir: Directory where the table PNG is written.
        image_save_subdir: Subdirectory name used to build the /static image URL.
        table_save_subdir: Subdirectory name used to build the /static HTML URL.

    Returns:
        A list of dicts (content_type == 'table') with HTML URL, text
        representation, bounding rect, accuracy and optional image bytes/URL.
        Returns [] on failure.
    """
    table_and_image_data = []
    try:
        # Lattice flavor first (ruled tables); fall back to stream detection.
        tables = camelot.read_pdf(
            str(pdf_path),
            pages=str(page_num),
            flavor='lattice',
        )
        if len(tables) == 0:
            tables = camelot.read_pdf(
                str(pdf_path),
                pages=str(page_num),
                flavor='stream'
            )

        for i, table in enumerate(tables):
            # Skip unreliable detections (accuracy is Camelot's 0-100 score).
            if table.accuracy < 70:
                print(f" Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}")
                continue

            # NOTE(review): Camelot's parsing_report documents accuracy/
            # whitespace/order/page but not 'page_bbox' — this likely always
            # falls back to [0, 0, 0, 0]; table._bbox may be intended. Also,
            # Camelot bboxes use bottom-left PDF coordinates while fitz.Rect
            # expects top-left — confirm before relying on the snapshot clip.
            table_bbox = table.parsing_report.get('page_bbox', [0, 0, 0, 0])
            if not table_bbox or len(table_bbox) != 4:
                print(f" Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.")
                table_rect = None
            else:
                table_rect = fitz.Rect(table_bbox)

            # Build filesystem-safe filenames and their public /static URLs.
            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html"
            table_html_save_path = table_save_dir / table_html_filename
            relative_html_url_path = f"/static/{table_save_subdir}/{table_html_filename}"

            table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png"
            table_image_save_path = image_save_dir / table_image_filename
            relative_image_url_path = f"/static/{image_save_subdir}/{table_image_filename}"

            df = table.df
            # Caption line followed by the pandas-rendered table markup.
            html = f"Table extrait de {pdf_path.name}, page {page_num}\n" + df.to_html(index=False)

            soup = BeautifulSoup(html, 'html.parser')
            table_tag = soup.find('table')
            if table_tag:
                # Apply Bootstrap-like classes plus an embedded stylesheet so
                # the saved HTML renders standalone.
                table_tag['class'] = 'table table-bordered table-striped'
                table_tag['style'] = 'width:100%; border-collapse:collapse;'
                style_tag = soup.new_tag('style')
                style_tag.string = """
 .table { border-collapse: collapse; width: 100%; margin-bottom: 1rem;}
 .table caption { caption-side: top; padding: 0.5rem; text-align: left; font-weight: bold; }
 .table th, .table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
 .table th { background-color: #f2f2f2; font-weight: bold; }
 .table-striped tbody tr:nth-of-type(odd) { background-color: rgba(0,0,0,.05); }
 .table-responsive { overflow-x: auto; margin-bottom: 1rem; }
 """
                soup.insert(0, style_tag)
                # Wrap the table for horizontal scrolling on narrow screens.
                div = soup.new_tag('div')
                div['class'] = 'table-responsive'
                table_tag.wrap(div)
                with open(table_html_save_path, 'w', encoding='utf-8') as f:
                    f.write(str(soup))
            else:
                print(f" Warning: Could not find table tag in HTML for table on page {page_num}. Skipping HTML save.")
                continue

            # Snapshot the table region as a PNG (best effort).
            table_image_bytes = None
            if table_rect:
                try:
                    pix = page.get_pixmap(clip=table_rect)
                    table_image_bytes = pix.tobytes(format='png')
                    with open(table_image_save_path, "wb") as img_file:
                        img_file.write(table_image_bytes)
                except Exception as img_capture_e:
                    print(f" Erreur lors de la capture d'image du tableau {i} page {page_num} : {img_capture_e}")
                    traceback.print_exc()
                    table_image_bytes = None

            table_and_image_data.append({
                'content_type': 'table',
                'table_html_url': relative_html_url_path,
                'table_text_representation': df.to_string(index=False),
                'rect': ([table_rect.x0, table_rect.y0, table_rect.x1, table_rect.y1]
                         if table_rect else None),
                'accuracy': table.accuracy,
                'image_bytes': table_image_bytes,
                'image_url': relative_image_url_path if table_image_bytes else None
            })

        return table_and_image_data
    except Exception as e:
        print(f" Erreur lors de l'extraction des tableaux de la page {page_num} : {str(e)}")
        traceback.print_exc()
        return []


def extract_images_from_page(pdf_path, page, page_num, image_save_dir,
                             image_save_subdir, excluded_rects=None):
    """Extract and save images from a page, excluding specified regions (like tables).

    Args:
        pdf_path: Path to the source PDF (used for output filenames).
        page: The PyMuPDF page object.
        page_num: 1-based page number.
        image_save_dir: Directory where image files are written.
        image_save_subdir: Subdirectory name used to build /static image URLs.
        excluded_rects: Optional list of fitz.Rect regions to skip (e.g. table
            areas). Defaults to no exclusions.

    Returns:
        A list of dicts (content_type == 'image') with URL, rect and raw bytes.
    """
    # Avoid the mutable-default pitfall: the old `excluded_rects=[]` default
    # was shared across all calls.
    if excluded_rects is None:
        excluded_rects = []

    image_data = []
    image_list = page.get_images(full=True)
    for img_index, img_info in enumerate(image_list):
        xref = img_info[0]
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            width = base_image["width"]
            height = base_image["height"]

            # Drop tiny images (likely logos/icons) per module thresholds.
            if width < IMAGE_MIN_WIDTH or height < IMAGE_MIN_HEIGHT:
                continue

            # Locate the image on the page; use the first placement rect.
            img_rect = None
            img_rects = page.get_image_rects(xref)
            if img_rects:
                img_rect = img_rects[0]
            if img_rect is None:
                print(f" Warning: Could not find rectangle for image {img_index} on page {page_num}. Skipping.")
                continue

            # Skip images that fall inside excluded (e.g. table) regions.
            is_excluded = any(img_rect.intersects(excluded_rect)
                              for excluded_rect in excluded_rects)
            if is_excluded:
                print(f" Image {img_index} on page {page_num} is within an excluded region (e.g., table). Skipping.")
                continue

            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
            image_filename = f"{safe_pdf_name}_p{page_num}_img{img_index}.{image_ext}"
            image_save_path = image_save_dir / image_filename
            relative_url_path = f"/static/{image_save_subdir}/{image_filename}"

            with open(image_save_path, "wb") as img_file:
                img_file.write(image_bytes)

            image_data.append({
                'content_type': 'image',
                'image_url': relative_url_path,
                'rect': [img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1],
                'image_bytes': image_bytes
            })
        except Exception as img_save_e:
            print(f" Erreur lors du traitement de l'image {img_index} de la page {page_num} : {img_save_e}")
            traceback.print_exc()

    return image_data