File size: 1,597 Bytes
394280f 6738aa9 394280f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import logging
from pathlib import Path
import sycamore
from sycamore import ExecMode
from sycamore.data import Document
from sycamore.data.document import DocumentPropertyTypes
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.markdown import elements_to_markdown
from .settings import ENABLE_DEBUG_MODE
# Set the root logger's level to INFO.
logging.getLogger().setLevel(logging.INFO)

# Root directory under which per-document debug output is written.
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
# parents=True keeps import from failing with FileNotFoundError when the
# parent directory does not exist yet; exist_ok=True makes re-imports and
# repeated runs harmless.
SYCAMORE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
# Module-level partitioner shared by every conversion call.
# NOTE: the identifier is misspelled ("paritioner"); it is kept as-is because
# convert_sycamore refers to this module-level name.
paritioner = ArynPartitioner(
    device="cpu",
    use_partitioning_service=False,
    use_ocr=True,
    extract_table_structure=True,
    extract_images=True,
)
# Sycamore context created once at import time, using the LOCAL execution mode.
context = sycamore.init(exec_mode=ExecMode.LOCAL)
def image_page_filename_fn(doc: Document) -> str:
    """Build the PNG file name for *doc* from its page-number property.

    The value stored under ``DocumentPropertyTypes.PAGE_NUMBER`` in
    ``doc.properties`` is embedded in the name, giving ``page_<number>.png``.
    """
    properties = doc.properties
    page_num = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{page_num}.png"
def _debug_image_sort_key(image_path: Path) -> tuple:
    """Order debug images by numeric page number, then by file name."""
    page_token = image_path.stem.rpartition("_")[2]
    if page_token.isdecimal():
        return (0, int(page_token), image_path.name)
    # Unexpected file name: place it after the numbered pages, ordered by name.
    return (1, 0, image_path.name)


def convert_sycamore(path: str, file_name: str):
    """Partition a PDF with Sycamore and render its elements as Markdown.

    Args:
        path: Location of the PDF; passed to ``context.read.binary`` as ``paths``.
        file_name: Name of the sub-directory under ``SYCAMORE_DEBUG_PATH``
            that receives the debug images for this document.

    Returns:
        A ``(markdown, image_paths)`` tuple. ``image_paths`` holds the PNG
        files found in the debug directory as strings, sorted by page number;
        it is an empty list unless ``ENABLE_DEBUG_MODE`` is truthy.

    Raises:
        IndexError: If the partitioned docset yields no documents.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )

    # Only the first document of the docset is converted to Markdown.
    doc = docset.take_all()[0]
    md = elements_to_markdown(doc.elements)

    image_paths = []
    if ENABLE_DEBUG_MODE:
        # The debug directory is only needed (and therefore only created) in
        # debug mode. parents=True tolerates a file_name with nested path
        # components and a missing SYCAMORE_DEBUG_PATH.
        debug_path = SYCAMORE_DEBUG_PATH / file_name
        debug_path.mkdir(parents=True, exist_ok=True)

        # NOTE(review): take_all() above and write.files() below each appear
        # to execute the pipeline, so debug mode likely partitions the PDF
        # twice -- confirm, and consider reusing the materialized documents.
        docset.flat_map(split_and_convert_to_image).map_batch(
            DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
        ).write.files(str(debug_path), filename_fn=image_page_filename_fn)

        # glob() yields entries in arbitrary order, so sort numerically by
        # page to return a stable page sequence (page_2 before page_10).
        # NOTE: PNGs left in this directory by earlier runs that used the
        # same file_name are picked up as well.
        image_paths = [
            str(image_file)
            for image_file in sorted(
                debug_path.glob("*.png"), key=_debug_image_sort_key
            )
        ]
    return md, image_paths
|