taprosoft
committed on
Commit
·
394280f
1
Parent(s):
cb2ed5c
feat: add sycamore
Browse files- Dockerfile +1 -1
- app.py +4 -0
- backends/__init__.py +2 -0
- backends/syca.py +51 -0
- requirements.txt +1 -0
Dockerfile
CHANGED
@@ -31,7 +31,7 @@ ENV HOME=/home/user \
|
|
31 |
PYTHONUNBUFFERED=1 \
|
32 |
GRADIO_SERVER_NAME=0.0.0.0
|
33 |
|
34 |
-
RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
|
35 |
|
36 |
# Set the working directory to the user's home directory
|
37 |
WORKDIR $HOME/app
|
|
|
31 |
PYTHONUNBUFFERED=1 \
|
32 |
GRADIO_SERVER_NAME=0.0.0.0
|
33 |
|
34 |
+
RUN pip3 install --use-deprecated=legacy-resolver --no-cache-dir --upgrade -r /code/requirements.txt
|
35 |
|
36 |
# Set the working directory to the user's home directory
|
37 |
WORKDIR $HOME/app
|
app.py
CHANGED
@@ -22,6 +22,7 @@ from backends import ( # convert_zerox,
|
|
22 |
convert_img2table,
|
23 |
convert_marker,
|
24 |
convert_mineru,
|
|
|
25 |
convert_unstructured,
|
26 |
)
|
27 |
from backends.settings import ENABLE_DEBUG_MODE
|
@@ -65,6 +66,8 @@ def convert_document(path, method, start_page=0, enabled=True):
|
|
65 |
text, debug_image_paths = convert_mineru(path, file_name)
|
66 |
elif method == "Gemini (API)":
|
67 |
text, debug_image_paths = convert_gemini(path, file_name)
|
|
|
|
|
68 |
# elif method == "Zerox":
|
69 |
# text, debug_image_paths = convert_zerox(path, file_name)
|
70 |
elif method == "Img2Table":
|
@@ -155,6 +158,7 @@ SUPPORTED_METHODS = [
|
|
155 |
"Gemini (API)",
|
156 |
"Img2Table",
|
157 |
"GMFT",
|
|
|
158 |
# "Zerox"
|
159 |
]
|
160 |
|
|
|
22 |
convert_img2table,
|
23 |
convert_marker,
|
24 |
convert_mineru,
|
25 |
+
convert_sycamore,
|
26 |
convert_unstructured,
|
27 |
)
|
28 |
from backends.settings import ENABLE_DEBUG_MODE
|
|
|
66 |
text, debug_image_paths = convert_mineru(path, file_name)
|
67 |
elif method == "Gemini (API)":
|
68 |
text, debug_image_paths = convert_gemini(path, file_name)
|
69 |
+
elif method == "Sycamore":
|
70 |
+
text, debug_image_paths = convert_sycamore(path, file_name)
|
71 |
# elif method == "Zerox":
|
72 |
# text, debug_image_paths = convert_zerox(path, file_name)
|
73 |
elif method == "Img2Table":
|
|
|
158 |
"Gemini (API)",
|
159 |
"Img2Table",
|
160 |
"GMFT",
|
161 |
+
"Sycamore",
|
162 |
# "Zerox"
|
163 |
]
|
164 |
|
backends/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from .gmft import convert_gmft
|
|
4 |
from .img2table import convert_img2table
|
5 |
from .marker import convert_marker
|
6 |
from .mineru import convert_mineru
|
|
|
7 |
from .unstructured import convert_unstructured
|
8 |
|
9 |
# from .zerox import convert_zerox
|
@@ -17,4 +18,5 @@ __all__ = [
|
|
17 |
# "convert_zerox",
|
18 |
"convert_img2table",
|
19 |
"convert_gmft",
|
|
|
20 |
]
|
|
|
4 |
from .img2table import convert_img2table
|
5 |
from .marker import convert_marker
|
6 |
from .mineru import convert_mineru
|
7 |
+
from .syca import convert_sycamore
|
8 |
from .unstructured import convert_unstructured
|
9 |
|
10 |
# from .zerox import convert_zerox
|
|
|
18 |
# "convert_zerox",
|
19 |
"convert_img2table",
|
20 |
"convert_gmft",
|
21 |
+
"convert_sycamore",
|
22 |
]
|
backends/syca.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import sycamore
|
5 |
+
from sycamore import ExecMode
|
6 |
+
from sycamore.data import Document
|
7 |
+
from sycamore.data.document import DocumentPropertyTypes
|
8 |
+
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image
|
9 |
+
from sycamore.transforms.partition import ArynPartitioner
|
10 |
+
from sycamore.utils.markdown import elements_to_markdown
|
11 |
+
|
12 |
+
from .settings import ENABLE_DEBUG_MODE
|
13 |
+
|
14 |
+
# Set the root logger's level to INFO.
logging.root.setLevel(logging.INFO)

# Base directory for debug page renderings; created when the module is imported.
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
SYCAMORE_DEBUG_PATH.mkdir(exist_ok=True)


# Shared partitioner: partitioning service disabled; table-structure
# extraction, OCR and image extraction enabled.
# (The name is spelled `paritioner`; it is kept because convert_sycamore
# refers to it under this spelling.)
paritioner = ArynPartitioner(
    extract_images=True,
    extract_table_structure=True,
    use_ocr=True,
    use_partitioning_service=False,
)

# Sycamore context initialised with the LOCAL execution mode.
context = sycamore.init(exec_mode=ExecMode.LOCAL)
|
28 |
+
|
29 |
+
|
30 |
+
def image_page_filename_fn(doc: Document) -> str:
    """Return the PNG file name used for a page document.

    The page number is looked up in ``doc.properties`` under
    ``DocumentPropertyTypes.PAGE_NUMBER`` and embedded in the name as
    ``page_<number>.png``.
    """
    properties = doc.properties
    number = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{number}.png"
|
33 |
+
|
34 |
+
|
35 |
+
def _page_sort_key(image_path: Path) -> tuple:
    """Order ``page_<n>.png`` files by page number; other names sort last."""
    page_token = image_path.stem.rpartition("_")[2]
    if page_token.isdigit():
        return (0, int(page_token), image_path.name)
    return (1, 0, image_path.name)


def convert_sycamore(path: str, file_name: str):
    """Convert a PDF to markdown using the module-level Sycamore partitioner.

    Args:
        path: Location of the PDF, handed to ``context.read.binary``.
        file_name: Name of the sub-directory of ``SYCAMORE_DEBUG_PATH`` that
            receives the debug page images.

    Returns:
        A ``(markdown, image_paths)`` tuple. ``markdown`` is built from the
        elements of the first document in the partitioned docset.
        ``image_paths`` lists the debug PNGs (as strings, ordered by page
        number) when ``ENABLE_DEBUG_MODE`` is truthy, otherwise it is empty.

    Raises:
        IndexError: If partitioning yields no document.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )

    documents = docset.take_all()
    if not documents:
        # Same exception type the bare ``[0]`` lookup raised, but with context.
        raise IndexError(f"Sycamore returned no document for {path!r}")
    md = elements_to_markdown(documents[0].elements)

    image_paths = []
    if ENABLE_DEBUG_MODE:
        # Only touch the filesystem when debug output is actually requested.
        debug_path = SYCAMORE_DEBUG_PATH / file_name
        debug_path.mkdir(parents=True, exist_ok=True)

        # Drop renderings left by an earlier run with the same file name so
        # pages that no longer exist are not reported as debug images.
        for stale_image in debug_path.glob("*.png"):
            stale_image.unlink()

        # NOTE(review): this pipeline starts from the same docset as the
        # take_all() call above, so partitioning presumably runs a second
        # time here - confirm, and consider caching the partitioned docset.
        docset.flat_map(split_and_convert_to_image).map_batch(
            DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
        ).write.files(str(debug_path), filename_fn=image_page_filename_fn)

        # glob() order is arbitrary; return the pages in numeric order.
        image_paths = [
            str(image_file)
            for image_file in sorted(debug_path.glob("*.png"), key=_page_sort_key)
        ]
    return md, image_paths
|
requirements.txt
CHANGED
@@ -19,3 +19,4 @@ openai
|
|
19 |
opencv-contrib-python
|
20 |
gmft
|
21 |
img2table
|
|
|
|
19 |
opencv-contrib-python
|
20 |
gmft
|
21 |
img2table
|
22 |
+
sycamore-ai[local-inference]
|