taprosoft
committed on
Commit
·
d381432
1
Parent(s):
bb8f034
feat: add pypdf as a method
Browse files
- app.py +3 -0
- backends/__init__.py +10 -0
- backends/pypdf.py +10 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -24,6 +24,7 @@ from backends import ( # convert_zerox,
|
|
24 |
convert_img2table,
|
25 |
convert_marker,
|
26 |
convert_mineru,
|
|
|
27 |
convert_sycamore,
|
28 |
convert_unstructured,
|
29 |
)
|
@@ -79,6 +80,8 @@ def convert_document(path, method, start_page=0, enabled=True):
|
|
79 |
text, debug_image_paths = convert_img2table(path, file_name)
|
80 |
elif method == "GMFT (table-only)":
|
81 |
text, debug_image_paths = convert_gmft(path, file_name)
|
|
|
|
|
82 |
else:
|
83 |
raise ValueError(f"Unsupported method: {method}")
|
84 |
|
|
|
24 |
convert_img2table,
|
25 |
convert_marker,
|
26 |
convert_mineru,
|
27 |
+
convert_pypdf,
|
28 |
convert_sycamore,
|
29 |
convert_unstructured,
|
30 |
)
|
|
|
80 |
text, debug_image_paths = convert_img2table(path, file_name)
|
81 |
elif method == "GMFT (table-only)":
|
82 |
text, debug_image_paths = convert_gmft(path, file_name)
|
83 |
+
elif method == "PyPDF":
|
84 |
+
text, debug_image_paths = convert_pypdf(path, file_name)
|
85 |
else:
|
86 |
raise ValueError(f"Unsupported method: {method}")
|
87 |
|
backends/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from .gmft import convert_gmft
|
|
5 |
from .img2table import convert_img2table
|
6 |
from .marker import convert_marker
|
7 |
from .mineru import convert_mineru
|
|
|
8 |
from .syca import convert_sycamore
|
9 |
from .unstructured import convert_unstructured
|
10 |
|
@@ -20,6 +21,8 @@ __all__ = [
|
|
20 |
"convert_img2table",
|
21 |
"convert_gmft",
|
22 |
"convert_sycamore",
|
|
|
|
|
23 |
]
|
24 |
|
25 |
SUPPORTED_METHODS = [
|
@@ -32,6 +35,7 @@ SUPPORTED_METHODS = [
|
|
32 |
"Gemini (API)",
|
33 |
"Img2Table (table-only)",
|
34 |
"GMFT (table-only)",
|
|
|
35 |
# "Zerox"
|
36 |
]
|
37 |
SUPPORTED_METHODS_METADATA = {
|
@@ -89,4 +93,10 @@ SUPPORTED_METHODS_METADATA = {
|
|
89 |
"url": "https://github.com/aryn-ai/sycamore",
|
90 |
"documentation": "https://sycamore.readthedocs.io/en/stable/",
|
91 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
}
|
|
|
5 |
from .img2table import convert_img2table
|
6 |
from .marker import convert_marker
|
7 |
from .mineru import convert_mineru
|
8 |
+
from .pypdf import convert_pypdf
|
9 |
from .syca import convert_sycamore
|
10 |
from .unstructured import convert_unstructured
|
11 |
|
|
|
21 |
"convert_img2table",
|
22 |
"convert_gmft",
|
23 |
"convert_sycamore",
|
24 |
+
"convert_pypdf",
|
25 |
+
# "convert_zerox",
|
26 |
]
|
27 |
|
28 |
SUPPORTED_METHODS = [
|
|
|
35 |
"Gemini (API)",
|
36 |
"Img2Table (table-only)",
|
37 |
"GMFT (table-only)",
|
38 |
+
"PyPDF",
|
39 |
# "Zerox"
|
40 |
]
|
41 |
SUPPORTED_METHODS_METADATA = {
|
|
|
93 |
"url": "https://github.com/aryn-ai/sycamore",
|
94 |
"documentation": "https://sycamore.readthedocs.io/en/stable/",
|
95 |
},
|
96 |
+
"PyPDF": {
|
97 |
+
"name": "PyPDF",
|
98 |
+
"description": "PyPDF is a pure-Python PDF toolkit that can help you read, write, and manipulate PDF documents.",
|
99 |
+
"url": "https://github.com/py-pdf/pypdf",
|
100 |
+
"documentation": "https://pypdf.readthedocs.io/en/stable",
|
101 |
+
},
|
102 |
}
|
backends/pypdf.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pypdf import PdfReader
|
2 |
+
|
3 |
+
|
4 |
+
def convert_pypdf(path: str, file_name: str):
|
5 |
+
pdf = PdfReader(path)
|
6 |
+
pages = pdf.pages
|
7 |
+
|
8 |
+
text = "\n\n".join([page.extract_text(0) for page in pages])
|
9 |
+
|
10 |
+
return text, []
|
requirements.txt
CHANGED
@@ -18,6 +18,7 @@ openai
|
|
18 |
sycamore-ai[local-inference]
|
19 |
img2table
|
20 |
gmft
|
21 |
-
opencv-contrib-python
|
22 |
unimernet==0.2.3
|
23 |
transformers<5.0.0,>=4.45.2
|
|
|
|
|
|
18 |
sycamore-ai[local-inference]
|
19 |
img2table
|
20 |
gmft
|
|
|
21 |
unimernet==0.2.3
|
22 |
transformers<5.0.0,>=4.45.2
|
23 |
+
pypdf
|
24 |
+
opencv-contrib-python
|