File size: 1,597 Bytes
394280f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6738aa9
394280f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import logging
from pathlib import Path

import sycamore
from sycamore import ExecMode
from sycamore.data import Document
from sycamore.data.document import DocumentPropertyTypes
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.markdown import elements_to_markdown

from .settings import ENABLE_DEBUG_MODE

# Set the root logger's level to INFO; loggers that do not set their own
# level inherit it.
logging.getLogger().setLevel(logging.INFO)

# Root directory for debug artifacts. convert_sycamore creates one
# subdirectory per converted file underneath it.
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
# parents=True so module import does not fail with FileNotFoundError when the
# parent directory is missing.
SYCAMORE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)


# Shared partitioner used by convert_sycamore for every PDF.
# NOTE: the name is misspelled ("paritioner") but is kept as-is because
# convert_sycamore refers to it by this exact name.
# NOTE(review): use_partitioning_service=False presumably runs partitioning
# locally instead of calling the hosted Aryn service - confirm against the
# ArynPartitioner documentation.
paritioner = ArynPartitioner(
    use_partitioning_service=False,
    extract_table_structure=True,
    use_ocr=True,
    extract_images=True,
    device="cpu",
)

# Single Sycamore context, created once at import time with local execution.
context = sycamore.init(
    exec_mode=ExecMode.LOCAL,
)


def image_page_filename_fn(doc: Document) -> str:
    """Return the PNG file name for a single-page document.

    The name is built from the document's ``PAGE_NUMBER`` property, e.g. a
    page number of ``3`` yields ``"page_3.png"``.

    Raises:
        KeyError: If the document has no ``PAGE_NUMBER`` property.
    """
    properties = doc.properties
    page_num = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{page_num}.png"


def convert_sycamore(path: str, file_name: str) -> tuple[str, list[str]]:
    """Partition the PDF at *path* and render its elements as Markdown.

    Args:
        path: Location of the PDF, passed to ``context.read.binary``.
        file_name: Name of the per-file subdirectory created under
            ``SYCAMORE_DEBUG_PATH`` to hold debug images.

    Returns:
        A ``(markdown, image_paths)`` tuple. ``markdown`` is built from the
        elements of the first document in the partitioned docset.
        ``image_paths`` lists the PNG files found in the debug directory,
        ordered by page number; it is empty unless ``ENABLE_DEBUG_MODE`` is
        truthy.

    Raises:
        IndexError: If reading *path* produced no documents.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )
    debug_path = SYCAMORE_DEBUG_PATH / file_name
    # parents=True: file_name may contain a path separator, and the debug
    # root may have been removed since import time.
    # NOTE(review): the directory is created even when debug mode is off;
    # kept for compatibility with the previous behavior.
    debug_path.mkdir(parents=True, exist_ok=True)

    docs = docset.take_all()
    if not docs:
        # Same exception type the bare ``[0]`` lookup raised, but with a
        # message that says which input was empty.
        raise IndexError(f"no documents were read from {path!r}")
    md = elements_to_markdown(docs[0].elements)

    image_paths: list[str] = []
    if ENABLE_DEBUG_MODE:
        # NOTE(review): this looks like it runs the partition pipeline a
        # second time (once for take_all above, once here) - confirm whether
        # the docset should be materialized first.
        docset.flat_map(split_and_convert_to_image).map_batch(
            DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
        ).write.files(str(debug_path), filename_fn=image_page_filename_fn)
        # glob() yields files in arbitrary order. Sort shorter stems first so
        # that "page_2" comes before "page_10" (numeric page order).
        # NOTE(review): PNGs left behind by an earlier run with the same
        # file_name are picked up here as well.
        page_images = sorted(
            debug_path.glob("*.png"),
            key=lambda png: (len(png.stem), png.stem),
        )
        image_paths = [str(png) for png in page_images]
    return md, image_paths