taprosoft
feat: add img2table gmft
acbe414
raw
history blame
1.63 kB
from pathlib import Path
import cv2
from img2table.document import PDF
from img2table.ocr import SuryaOCR
from .settings import ENABLE_DEBUG_MODE
# Single OCR engine instance, created once at import time and reused by every
# convert_img2table() call. Restricted to English ("en").
ocr = SuryaOCR(
    langs=["en"],
)

# Root directory for debug renderings; convert_img2table() creates one
# sub-directory per processed file underneath it.
IMG2TABLE_DEBUG_PATH = Path("/tmp/img2table")
# parents=True so that a missing parent directory cannot make the import of
# this module fail with FileNotFoundError.
IMG2TABLE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
def convert_img2table(path: str, file_name: str):
    """Extract the tables of a PDF and return them as HTML text.

    Args:
        path: Location of the PDF; passed unchanged to ``PDF``.
        file_name: Name of the sub-directory of ``IMG2TABLE_DEBUG_PATH`` that
            receives the debug images. Only used when ``ENABLE_DEBUG_MODE``
            is truthy.

    Returns:
        A ``(content, debug_image_paths)`` tuple.

        ``content`` contains, for every extracted table, its title (an empty
        string when the title is falsy), a blank line and the table HTML;
        consecutive tables are separated by a blank line.

        ``debug_image_paths`` lists the debug PNG files that were actually
        written (one per page, with every table cell outlined). It is empty
        when debug mode is disabled.
    """
    doc = PDF(path)
    pages = doc.extract_tables(
        ocr=ocr,
        implicit_rows=False,
        implicit_columns=False,
        borderless_tables=True,
        min_confidence=50,
    )
    debug_image_paths = []

    if ENABLE_DEBUG_MODE:
        debug_path = IMG2TABLE_DEBUG_PATH / file_name
        # parents=True: the debug root is only created at import time and may
        # have been removed since (e.g. temp-dir cleanup), and file_name may
        # itself contain nested path components.
        debug_path.mkdir(parents=True, exist_ok=True)

        images = doc.images
        # `pages` is looked up by doc.pages entries when that attribute is
        # set, otherwise by the positional index of each rendered image.
        for idx, page_number in enumerate(doc.pages or range(len(images))):
            page_image = images[idx]
            # Outline every cell of every table found on this page. The
            # rectangles are drawn directly onto the page image.
            for table in pages[page_number]:
                for row in table.content.values():
                    for cell in row:
                        cv2.rectangle(
                            page_image,
                            (cell.bbox.x1, cell.bbox.y1),
                            (cell.bbox.x2, cell.bbox.y2),
                            (0, 0, 255),
                            2,
                        )
            image_path = debug_path / f"page_{idx}.png"
            # cv2.imwrite signals failure through its boolean return value;
            # only report files that really exist on disk.
            if cv2.imwrite(str(image_path), page_image):
                debug_image_paths.append(image_path)

    content = "\n\n".join(
        (table.title or "") + "\n\n" + table.html
        for tables in pages.values()
        for table in tables
    )
    return content, debug_image_paths