File size: 1,630 Bytes
acbe414 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
from pathlib import Path
import cv2
from img2table.document import PDF
from img2table.ocr import SuryaOCR
from .settings import ENABLE_DEBUG_MODE
# Shared OCR engine, built once at import time and passed to every
# ``extract_tables`` call in this module; configured for English only.
ocr = SuryaOCR(langs=["en"])
# Root directory for debug images; each processed file gets its own
# sub-directory beneath it.
IMG2TABLE_DEBUG_PATH = Path("/tmp/img2table")
# parents=True keeps the import from failing when the parent directory
# does not exist yet.
IMG2TABLE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
def convert_img2table(path: str, file_name: str) -> tuple[str, list[Path]]:
    """Extract the tables of a PDF and return them as concatenated HTML.

    Tables are detected with img2table using the module-level ``ocr``
    engine. When ``ENABLE_DEBUG_MODE`` is truthy, every page image is also
    annotated with a rectangle around each detected table cell and saved
    as ``page_<idx>.png`` under ``IMG2TABLE_DEBUG_PATH / file_name``.

    Args:
        path: Location of the PDF file to process.
        file_name: Name of the sub-directory that receives the debug images.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` holds, for
        every extracted table, its title (empty when missing) followed by
        a blank line and the table HTML; tables are separated by blank
        lines. ``debug_image_paths`` lists the debug images that were
        written successfully and is empty when debug mode is disabled.
    """
    doc = PDF(path)
    tables_by_page = doc.extract_tables(
        ocr=ocr,
        implicit_rows=False,
        implicit_columns=False,
        borderless_tables=True,
        min_confidence=50,
    )

    debug_image_paths = []
    if ENABLE_DEBUG_MODE:
        debug_path = IMG2TABLE_DEBUG_PATH / file_name
        # parents=True: the root may have been removed since import (temp
        # cleanup) and file_name may contain nested path components.
        debug_path.mkdir(parents=True, exist_ok=True)

        images = doc.images
        # Images are indexed positionally, while the extraction result is
        # keyed by page number; without an explicit page selection the
        # two coincide.
        for idx, page_number in enumerate(doc.pages or range(len(images))):
            page_image = images[idx]
            for table in tables_by_page[page_number]:
                for row in table.content.values():
                    for cell in row:
                        # Outline the cell directly on the page image.
                        cv2.rectangle(
                            page_image,
                            (cell.bbox.x1, cell.bbox.y1),
                            (cell.bbox.x2, cell.bbox.y2),
                            (0, 0, 255),
                            2,
                        )

            image_path = debug_path / f"page_{idx}.png"
            # imwrite reports failure through its return value; only hand
            # back paths of files that actually exist.
            if cv2.imwrite(str(image_path), page_image):
                debug_image_paths.append(image_path)

    content = "\n\n".join(
        (table.title or "") + "\n\n" + table.html
        for tables in tables_by_page.values()
        for table in tables
    )
    return content, debug_image_paths
|