"""Extract tables from a PDF with img2table and optionally dump debug images."""

from pathlib import Path

import cv2
from img2table.document import PDF
from img2table.ocr import SuryaOCR

from .settings import ENABLE_DEBUG_MODE

# Single OCR engine instance created at import time and reused by every call.
ocr = SuryaOCR(
    langs=["en"],
)

# Root directory for the annotated page images written in debug mode.
IMG2TABLE_DEBUG_PATH = Path("/tmp/img2table")
# parents=True so a missing parent directory does not break the import.
IMG2TABLE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)


def convert_img2table(path: str, file_name: str):
    """Extract the tables of a PDF and return them as concatenated HTML.

    Args:
        path: Location of the PDF, passed straight to ``PDF``.
        file_name: Name of the sub-directory (below ``IMG2TABLE_DEBUG_PATH``)
            that receives the debug images when ``ENABLE_DEBUG_MODE`` is set.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` joins, for every
        extracted table, its title (empty string when falsy), a blank line and
        its HTML, with tables separated by a blank line. ``debug_image_paths``
        lists the ``Path`` of every debug image written; it is empty when
        debug mode is off.
    """
    doc = PDF(path)
    pages = doc.extract_tables(
        ocr=ocr,
        implicit_rows=False,
        implicit_columns=False,
        borderless_tables=True,
        min_confidence=50,
    )

    debug_image_paths = []
    if ENABLE_DEBUG_MODE:
        debug_path = IMG2TABLE_DEBUG_PATH / file_name
        # parents=True: file_name may contain separators (e.g. "dir/doc.pdf"),
        # in which case the intermediate directories do not exist yet.
        debug_path.mkdir(parents=True, exist_ok=True)

        images = doc.images
        # Fall back to one entry per rendered image when doc.pages is falsy
        # (presumably None when no page subset was requested -- TODO confirm).
        page_numbers = doc.pages or range(len(images))
        for idx, page_number in enumerate(page_numbers):
            # images is indexed by position, the extraction result by page
            # number, hence the two different keys.
            page_image = images[idx]

            # Outline every detected cell directly on the page image.
            for table in pages[page_number]:
                for row in table.content.values():
                    for cell in row:
                        cv2.rectangle(
                            page_image,
                            (cell.bbox.x1, cell.bbox.y1),
                            (cell.bbox.x2, cell.bbox.y2),
                            (0, 0, 255),
                            2,
                        )

            image_path = debug_path / f"page_{idx}.png"
            debug_image_paths.append(image_path)
            cv2.imwrite(str(image_path), page_image)

    content = "\n\n".join(
        (table.title if table.title else "") + "\n\n" + table.html
        for tables in pages.values()
        for table in tables
    )

    return content, debug_image_paths