import fitz # PyMuPDF import camelot # For table extraction import pandas as pd from bs4 import BeautifulSoup import re from pathlib import Path import traceback # Path configuration (assuming these are passed or relative to run_pipeline.py) # For module, these should ideally be arguments or imported from a config # BASE_DIR = Path("/content/") # PDF_DIRECTORY = BASE_DIR / "docs" # IMAGE_SAVE_SUBDIR = "extracted_graphs" # TABLE_SAVE_SUBDIR = "extracted_tables" # STATIC_DIR = BASE_DIR / "static" # IMAGE_SAVE_DIR = STATIC_DIR / IMAGE_SAVE_SUBDIR # TABLE_SAVE_DIR = STATIC_DIR / TABLE_SAVE_SUBDIR # These should be passed as arguments or configured at a higher level IMAGE_MIN_WIDTH = 100 # Ignore very small images (likely logos/icons) IMAGE_MIN_HEIGHT = 100 def clean_text(text): """Normalize whitespace and clean text while preserving paragraph breaks""" if not text: return "" # Replace tabs with spaces, but preserve paragraph breaks text = text.replace('\t', ' ') # Normalize multiple spaces to single spaces text = re.sub(r' +', ' ', text) # Preserve paragraph breaks but normalize them text = re.sub(r'\n{3,}', '\n\n', text) return text.strip() def extract_page_data_pymupdf(pdf_path, image_save_dir, table_save_dir, image_save_subdir, table_save_subdir): """Extract text, tables and save images from each page using PyMuPDF and Camelot.""" page_data_list = [] try: doc = fitz.open(pdf_path) metadata = doc.metadata or {} pdf_data = { 'pdf_title': metadata.get('title', pdf_path.name), 'pdf_subject': metadata.get('subject', 'Statistiques'), 'pdf_keywords': metadata.get('keywords', '') } for page_num in range(len(doc)): page = doc.load_page(page_num) page_index = page_num + 1 # 1-based index print(f" Extraction des données de la page {page_index}...") # Extract tables first table_data = extract_tables_and_images_from_page(pdf_path, page, page_index, table_save_dir, image_save_dir, image_save_subdir, table_save_subdir) # Track table regions to avoid double-processing text table_regions = [] for item in table_data: if 'rect' in item and item['rect'] and len(item['rect']) == 4: table_regions.append(fitz.Rect(item['rect'])) else: print(f" Warning: Invalid rect for table on page {page_index}") # Extract text excluding table regions page_text = "" if table_regions: # Get text blocks blocks = page.get_text("blocks") for block in blocks: block_rect = fitz.Rect(block[:4]) is_in_table = False for table_rect in table_regions: if block_rect.intersects(table_rect): is_in_table = True break if not is_in_table: page_text += block[4] + "\n" # Add text content else: # If no tables, get all text page_text = page.get_text("text") page_text = clean_text(page_text) # Extract and save images (excluding those identified as tables) image_data = extract_images_from_page(pdf_path, page, page_index, image_save_dir, image_save_subdir, excluded_rects=table_regions) page_data_list.append({ 'pdf_file': pdf_path.name, 'page_number': page_index, 'text': page_text, 'images': image_data, # Includes non-table images 'tables': [item for item in table_data if item['content_type'] == 'table'], # Only table data here 'pdf_title': pdf_data.get('pdf_title'), 'pdf_subject': pdf_data.get('pdf_subject'), 'pdf_keywords': pdf_data.get('pdf_keywords') }) doc.close() except Exception as e: print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}") traceback.print_exc() # Print traceback for debugging return page_data_list def extract_tables_and_images_from_page(pdf_path, page, page_num, table_save_dir, image_save_dir, image_save_subdir, table_save_subdir): """Extract tables using Camelot and capture images of table areas.""" table_and_image_data = [] try: tables = camelot.read_pdf( str(pdf_path), pages=str(page_num), flavor='lattice', ) if len(tables) == 0: tables = camelot.read_pdf( str(pdf_path), pages=str(page_num), flavor='stream' ) for i, table in enumerate(tables): if table.accuracy < 70: print(f" Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}") continue table_bbox = table.parsing_report.get('page_bbox', [0, 0, 0, 0]) if not table_bbox or len(table_bbox) != 4: print(f" Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.") table_rect = None else: table_rect = fitz.Rect(table_bbox) safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem) table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html" table_html_save_path = table_save_dir / table_html_filename relative_html_url_path = f"/static/{table_save_subdir}/{table_html_filename}" table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png" table_image_save_path = image_save_dir / table_image_filename relative_image_url_path = f"/static/{image_save_subdir}/{table_image_filename}" df = table.df html = f"