taprosoft
fix: move sycamore to cpu
6738aa9
raw
history blame
1.6 kB
import logging
from pathlib import Path
import sycamore
from sycamore import ExecMode
from sycamore.data import Document
from sycamore.data.document import DocumentPropertyTypes
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.markdown import elements_to_markdown
from .settings import ENABLE_DEBUG_MODE
# Raise the root logger to INFO as soon as this module is imported.
logging.getLogger().setLevel(logging.INFO)

# Directory that receives the per-document debug images written by
# convert_sycamore.  parents=True keeps the import from failing with
# FileNotFoundError when the parent directory does not exist yet.
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
SYCAMORE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)

# Shared partitioner, built once at import time and pinned to the CPU.
# NOTE(review): use_partitioning_service=False presumably selects the local
# model instead of the hosted partitioning service -- confirm against the
# ArynPartitioner documentation.
# The (misspelled) name is kept as-is because convert_sycamore refers to it.
paritioner = ArynPartitioner(
    use_partitioning_service=False,
    extract_table_structure=True,
    use_ocr=True,
    extract_images=True,
    device="cpu",
)

# Shared sycamore context, initialised in LOCAL execution mode.
context = sycamore.init(
    exec_mode=ExecMode.LOCAL,
)
def image_page_filename_fn(doc: Document) -> str:
    """Return the PNG file name for a page image.

    The name is derived from the document's ``PAGE_NUMBER`` property and has
    the form ``page_<page number>.png``.

    Raises:
        KeyError: If the document carries no ``PAGE_NUMBER`` property.
    """
    properties = doc.properties
    page_num = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{page_num}.png"
def _page_sort_key(image_path: Path):
    """Order ``page_<n>.png`` files by page number, other names after them."""
    suffix = image_path.stem.rpartition("_")[2]
    if suffix.isdecimal():
        return (0, int(suffix), image_path.name)
    return (1, 0, image_path.name)


def convert_sycamore(path: str, file_name: str) -> "tuple[str, list[str]]":
    """Partition a PDF with sycamore and render its elements as markdown.

    Args:
        path: Location of the PDF, passed to ``context.read.binary``.
        file_name: Name of the sub-directory of ``SYCAMORE_DEBUG_PATH`` that
            holds this document's debug images.

    Returns:
        A ``(markdown, image_paths)`` tuple.  ``markdown`` is built from the
        elements of the first document in the partitioned docset.
        ``image_paths`` is empty unless ``ENABLE_DEBUG_MODE`` is set; in that
        case it lists every ``*.png`` in the debug directory, ordered by page
        number.  PNG files left there by an earlier run with the same
        ``file_name`` are included as well.

    Raises:
        IndexError: If the partitioned docset contains no documents.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )

    # The directory is created even when debug mode is off (as before).
    # parents=True tolerates a missing root directory and nested file names.
    debug_path = SYCAMORE_DEBUG_PATH / file_name
    debug_path.mkdir(parents=True, exist_ok=True)

    doc = docset.take_all()[0]
    md = elements_to_markdown(doc.elements)

    image_paths = []
    if ENABLE_DEBUG_MODE:
        # Split the document into page images, draw the detected boxes
        # (including table cells) on them and write one PNG per page.
        docset.flat_map(split_and_convert_to_image).map_batch(
            DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
        ).write.files(str(debug_path), filename_fn=image_page_filename_fn)

        # glob() yields files in arbitrary order; sort so that callers get
        # the pages in numeric order (page_2 before page_10).
        image_paths = [
            str(image_file)
            for image_file in sorted(debug_path.glob("*.png"), key=_page_sort_key)
        ]
    return md, image_paths