|
import logging |
|
from pathlib import Path |
|
|
|
import sycamore |
|
from sycamore import ExecMode |
|
from sycamore.data import Document |
|
from sycamore.data.document import DocumentPropertyTypes |
|
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image |
|
from sycamore.transforms.partition import ArynPartitioner |
|
from sycamore.utils.markdown import elements_to_markdown |
|
|
|
from .settings import ENABLE_DEBUG_MODE |
|
|
|
# Directory that collects debug renderings; it is created as soon as this
# module is imported (an already-existing directory is accepted).
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
SYCAMORE_DEBUG_PATH.mkdir(parents=False, exist_ok=True)

# Set the root logger's level to INFO; this affects the whole process.
logging.getLogger().setLevel(logging.INFO)
|
|
|
|
|
# Module-level partitioner used by convert_sycamore. The identifier keeps its
# existing spelling ("paritioner") because convert_sycamore refers to it by
# exactly this name.
paritioner = ArynPartitioner(
    device="cpu",
    use_partitioning_service=False,
    use_ocr=True,
    extract_images=True,
    extract_table_structure=True,
)

# Sycamore context initialised with the LOCAL execution mode.
context = sycamore.init(exec_mode=ExecMode.LOCAL)
|
|
|
|
|
def image_page_filename_fn(doc: Document) -> str:
    """Return the PNG file name to use for the page that ``doc`` represents.

    The page number is looked up in ``doc.properties`` under
    ``DocumentPropertyTypes.PAGE_NUMBER`` and embedded in the name as
    ``page_<number>.png``.
    """
    properties = doc.properties
    page_num = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{page_num}.png"
|
|
|
|
|
def _debug_image_sort_key(png_path):
    """Order ``page_<n>.png`` files by page number; other names sort last."""
    page_token = png_path.stem.rpartition("_")[2]
    if page_token.isdecimal():
        return (0, int(page_token), png_path.name)
    return (1, 0, png_path.name)


def convert_sycamore(path: str, file_name: str):
    """Partition a PDF with Sycamore and render its first document as Markdown.

    Args:
        path: Location of the PDF; passed as ``paths`` to
            ``context.read.binary``.
        file_name: Name of the sub-directory of ``SYCAMORE_DEBUG_PATH`` that
            receives the debug page images.

    Returns:
        A ``(markdown, image_paths)`` tuple. ``markdown`` is produced by
        ``elements_to_markdown`` from the first document's elements.
        ``image_paths`` is an empty list unless ``ENABLE_DEBUG_MODE`` is
        truthy; in that case it lists the ``*.png`` files found in the debug
        directory, ordered by page number.

    Raises:
        IndexError: If reading ``path`` produced no documents.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )

    documents = docset.take_all()
    if not documents:
        # Same exception type as indexing an empty list, but with a message
        # that says which input was at fault.
        raise IndexError(f"no documents were read from {path!r}")
    md = elements_to_markdown(documents[0].elements)

    if not ENABLE_DEBUG_MODE:
        # Nothing is written outside debug mode, so the debug directory is
        # not touched and cannot make a plain conversion fail.
        return md, []

    debug_path = SYCAMORE_DEBUG_PATH / file_name
    # parents=True keeps this working when the base directory has been
    # removed since import or when file_name has nested components.
    debug_path.mkdir(parents=True, exist_ok=True)

    # NOTE(review): take_all() above has already run the pipeline once; this
    # second terminal call presumably partitions the PDF again. Confirm, and
    # consider caching the partitioned docset if that cost matters.
    docset.flat_map(split_and_convert_to_image).map_batch(
        DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
    ).write.files(str(debug_path), filename_fn=image_page_filename_fn)

    # The glob returns every PNG in the directory, including any left behind
    # by an earlier run with the same file_name. Sorting makes the otherwise
    # arbitrary glob order deterministic.
    image_paths = [
        str(png)
        for png in sorted(debug_path.glob("*.png"), key=_debug_image_sort_key)
    ]
    return md, image_paths
|
|