|
from pathlib import Path |
|
|
|
from gmft.auto import AutoFormatConfig, AutoTableFormatter, CroppedTable, TableDetector |
|
from gmft.pdf_bindings import PyPDFium2Document |
|
|
|
from .settings import ENABLE_DEBUG_MODE |
|
|
|
# Formatter options: both flags are switched on before the formatter is built.
config = AutoFormatConfig()
config.enable_multi_header = True
config.semantic_spanning_cells = True
formatter = AutoTableFormatter(config)

# Table detector, instantiated once at import time and shared by ingest_pdf.
detector = TableDetector()
|
|
|
|
|
# Root directory under which convert_gmft stores its per-file debug images.
GMFT_DEBUG_PATH = Path("/tmp/gmft")
# Created at import time; exist_ok=True makes repeated imports harmless.
GMFT_DEBUG_PATH.mkdir(exist_ok=True)
|
|
|
|
|
def ingest_pdf(pdf_path) -> list[CroppedTable]:
    """Detect the tables on every page of a PDF.

    Args:
        pdf_path: Location of the PDF, handed to ``PyPDFium2Document``.

    Returns:
        Every table the module-level ``detector`` extracts, in the order
        the document yields its pages.

    Note:
        The opened document is not closed here.
        NOTE(review): presumably the returned tables still reference their
        pages, so closing early could break later formatting -- confirm.
    """
    pdf_document = PyPDFium2Document(pdf_path)
    return [
        cropped
        for page in pdf_document
        for cropped in detector.extract(page)
    ]
|
|
|
|
|
def convert_gmft(path: str, file_name: str):
    """Extract every table from a PDF and render the tables as HTML.

    Args:
        path: Location of the PDF, passed to ``ingest_pdf``.
        file_name: Name of the sub-directory of ``GMFT_DEBUG_PATH`` that
            receives this document's debug images.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` is the HTML of
        every table that produced a dataframe, joined by blank lines.
        ``debug_image_paths`` lists the table images that were saved; it is
        empty unless ``ENABLE_DEBUG_MODE`` is truthy.
    """
    tables = ingest_pdf(path)
    formatted_tables = []
    debug_image_paths = []

    debug_path = GMFT_DEBUG_PATH / file_name
    if ENABLE_DEBUG_MODE:
        # Only touch the filesystem when images will actually be written.
        # parents=True keeps this working if the debug root was removed
        # after import or file_name contains nested components.
        debug_path.mkdir(parents=True, exist_ok=True)

    for idx, table in enumerate(tables):
        ft = formatter.extract(table, dpi=72 * 2)
        df = ft.df()
        if df is not None:
            # Blank out missing cells so they do not render as "NaN".
            formatted_tables.append(df.fillna("").to_html(index=False))

        if ENABLE_DEBUG_MODE:
            # An image is saved for every table, including those for which
            # no dataframe was produced.
            image_path = debug_path / f"table_{idx}.png"
            ft.image().save(image_path)
            debug_image_paths.append(image_path)

    content = "\n\n".join(formatted_tables)
    return content, debug_image_paths
|
|