taprosoft committed on
Commit
394280f
·
1 Parent(s): cb2ed5c

feat: add sycamore

Browse files
Files changed (5) hide show
  1. Dockerfile +1 -1
  2. app.py +4 -0
  3. backends/__init__.py +2 -0
  4. backends/syca.py +51 -0
  5. requirements.txt +1 -0
Dockerfile CHANGED
@@ -31,7 +31,7 @@ ENV HOME=/home/user \
31
  PYTHONUNBUFFERED=1 \
32
  GRADIO_SERVER_NAME=0.0.0.0
33
 
34
- RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
35
 
36
  # Set the working directory to the user's home directory
37
  WORKDIR $HOME/app
 
31
  PYTHONUNBUFFERED=1 \
32
  GRADIO_SERVER_NAME=0.0.0.0
33
 
34
+ RUN pip3 install --use-deprecated=legacy-resolver --no-cache-dir --upgrade -r /code/requirements.txt
35
 
36
  # Set the working directory to the user's home directory
37
  WORKDIR $HOME/app
app.py CHANGED
@@ -22,6 +22,7 @@ from backends import ( # convert_zerox,
22
  convert_img2table,
23
  convert_marker,
24
  convert_mineru,
 
25
  convert_unstructured,
26
  )
27
  from backends.settings import ENABLE_DEBUG_MODE
@@ -65,6 +66,8 @@ def convert_document(path, method, start_page=0, enabled=True):
65
  text, debug_image_paths = convert_mineru(path, file_name)
66
  elif method == "Gemini (API)":
67
  text, debug_image_paths = convert_gemini(path, file_name)
 
 
68
  # elif method == "Zerox":
69
  # text, debug_image_paths = convert_zerox(path, file_name)
70
  elif method == "Img2Table":
@@ -155,6 +158,7 @@ SUPPORTED_METHODS = [
155
  "Gemini (API)",
156
  "Img2Table",
157
  "GMFT",
 
158
  # "Zerox"
159
  ]
160
 
 
22
  convert_img2table,
23
  convert_marker,
24
  convert_mineru,
25
+ convert_sycamore,
26
  convert_unstructured,
27
  )
28
  from backends.settings import ENABLE_DEBUG_MODE
 
66
  text, debug_image_paths = convert_mineru(path, file_name)
67
  elif method == "Gemini (API)":
68
  text, debug_image_paths = convert_gemini(path, file_name)
69
+ elif method == "Sycamore":
70
+ text, debug_image_paths = convert_sycamore(path, file_name)
71
  # elif method == "Zerox":
72
  # text, debug_image_paths = convert_zerox(path, file_name)
73
  elif method == "Img2Table":
 
158
  "Gemini (API)",
159
  "Img2Table",
160
  "GMFT",
161
+ "Sycamore",
162
  # "Zerox"
163
  ]
164
 
backends/__init__.py CHANGED
@@ -4,6 +4,7 @@ from .gmft import convert_gmft
4
  from .img2table import convert_img2table
5
  from .marker import convert_marker
6
  from .mineru import convert_mineru
 
7
  from .unstructured import convert_unstructured
8
 
9
  # from .zerox import convert_zerox
@@ -17,4 +18,5 @@ __all__ = [
17
  # "convert_zerox",
18
  "convert_img2table",
19
  "convert_gmft",
 
20
  ]
 
4
  from .img2table import convert_img2table
5
  from .marker import convert_marker
6
  from .mineru import convert_mineru
7
+ from .syca import convert_sycamore
8
  from .unstructured import convert_unstructured
9
 
10
  # from .zerox import convert_zerox
 
18
  # "convert_zerox",
19
  "convert_img2table",
20
  "convert_gmft",
21
+ "convert_sycamore",
22
  ]
backends/syca.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import sycamore
5
+ from sycamore import ExecMode
6
+ from sycamore.data import Document
7
+ from sycamore.data.document import DocumentPropertyTypes
8
+ from sycamore.functions.document import DrawBoxes, split_and_convert_to_image
9
+ from sycamore.transforms.partition import ArynPartitioner
10
+ from sycamore.utils.markdown import elements_to_markdown
11
+
12
+ from .settings import ENABLE_DEBUG_MODE
13
+
14
+ logging.getLogger().setLevel(logging.INFO)
15
+ SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
16
+ SYCAMORE_DEBUG_PATH.mkdir(exist_ok=True)
17
+
18
+
19
+ paritioner = ArynPartitioner(
20
+ use_partitioning_service=False,
21
+ extract_table_structure=True,
22
+ use_ocr=True,
23
+ extract_images=True,
24
+ )
25
+ context = sycamore.init(
26
+ exec_mode=ExecMode.LOCAL,
27
+ )
28
+
29
+
30
+ def image_page_filename_fn(doc: Document) -> str:
31
+ page_num = doc.properties[DocumentPropertyTypes.PAGE_NUMBER]
32
+ return f"page_{page_num}.png"
33
+
34
+
35
+ def convert_sycamore(path: str, file_name: str):
36
+ docset = context.read.binary(paths=path, binary_format="pdf").partition(
37
+ partitioner=paritioner,
38
+ )
39
+ debug_path = SYCAMORE_DEBUG_PATH / file_name
40
+ debug_path.mkdir(exist_ok=True)
41
+ image_paths = []
42
+
43
+ doc = docset.take_all()[0]
44
+ md = elements_to_markdown(doc.elements)
45
+
46
+ if ENABLE_DEBUG_MODE:
47
+ docset.flat_map(split_and_convert_to_image).map_batch(
48
+ DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
49
+ ).write.files(str(debug_path), filename_fn=image_page_filename_fn)
50
+ image_paths = [str(path) for path in debug_path.glob("*.png")]
51
+ return md, image_paths
requirements.txt CHANGED
@@ -19,3 +19,4 @@ openai
19
  opencv-contrib-python
20
  gmft
21
  img2table
 
 
19
  opencv-contrib-python
20
  gmft
21
  img2table
22
+ sycamore-ai[local-inference]