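# Repository web-view header (not part of the module):
#   author:  taprosoft
#   commit:  acbe414 - "feat: add img2table gmft"
#   links:   raw | history | blame
#   size:    1.46 kB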
from pathlib import Path
from gmft.auto import AutoFormatConfig, AutoTableFormatter, CroppedTable, TableDetector
from gmft.pdf_bindings import PyPDFium2Document
from .settings import ENABLE_DEBUG_MODE
# Shared gmft objects: created once when the module is imported and reused
# by every ingest_pdf() / convert_gmft() call.
detector = TableDetector()

config = AutoFormatConfig()
config.semantic_spanning_cells = True  # [Experimental] better spanning cells
config.enable_multi_header = True  # multi-headers
formatter = AutoTableFormatter(config)

# Root directory under which convert_gmft() creates one sub-directory per
# processed file for its debug images.
GMFT_DEBUG_PATH = Path("/tmp/gmft")
# parents=True so that importing the module does not raise FileNotFoundError
# on a system where "/tmp" itself does not exist yet.
GMFT_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
def ingest_pdf(pdf_path) -> list[CroppedTable]:
    """Detect the tables on every page of a PDF.

    Args:
        pdf_path: Location of the PDF; passed straight to
            ``PyPDFium2Document``.

    Returns:
        Every table reported by the module-level ``detector``, collected in
        the order the document's pages are iterated.
    """
    # The document is intentionally not closed here.
    # NOTE(review): the returned tables presumably still read from their
    # page when they are formatted later, so closing would invalidate them;
    # confirm against the gmft documentation before adding a close().
    document = PyPDFium2Document(pdf_path)
    return [
        found
        for page in document
        for found in detector.extract(page)
    ]
def convert_gmft(path: str, file_name: str):
    """Extract every table of a PDF and render the tables as HTML.

    Args:
        path: Location of the PDF file to process.
        file_name: Name of the sub-directory of ``GMFT_DEBUG_PATH`` that
            receives the debug images for this file.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` is the HTML of
        all extracted tables joined by blank lines (an empty string when no
        table was found). ``debug_image_paths`` lists the PNG files written
        during this call; it stays empty unless ``ENABLE_DEBUG_MODE`` is set.
    """
    formatted_tables = []
    debug_image_paths = []

    # The document is opened here (same steps as ingest_pdf()) so that this
    # function owns it and can close it once all tables have been formatted;
    # going through ingest_pdf() would leave the PDF handle open.
    doc = PyPDFium2Document(path)
    try:
        tables = [table for page in doc for table in detector.extract(page)]

        debug_path = GMFT_DEBUG_PATH / file_name
        # parents=True: the debug root may have been removed since import
        # (e.g. by a /tmp cleaner) and file_name may contain sub-directories.
        debug_path.mkdir(parents=True, exist_ok=True)

        for idx, table in enumerate(tables):
            ft = formatter.extract(
                table,
                dpi=72 * 2,  # 144 dpi
            )
            df = ft.df()
            if df is None:
                continue

            formatted_tables.append(df.fillna("").to_html(index=False))

            # NOTE(review): indentation was lost in the reviewed copy; the
            # debug dump is assumed to apply only to tables that produced a
            # dataframe. Confirm against the original file.
            if ENABLE_DEBUG_MODE:
                image_path = debug_path / f"table_{idx}.png"
                ft.image().save(image_path)
                debug_image_paths.append(image_path)
    finally:
        # All page access (formatting, image rendering) is finished by now.
        doc.close()

    content = "\n\n".join(formatted_tables)
    return content, debug_image_paths