taprosoft committed on
Commit
d381432
·
1 Parent(s): bb8f034

feat: add pypdf as a method

Browse files
Files changed (4) hide show
  1. app.py +3 -0
  2. backends/__init__.py +10 -0
  3. backends/pypdf.py +10 -0
  4. requirements.txt +2 -1
app.py CHANGED
@@ -24,6 +24,7 @@ from backends import ( # convert_zerox,
24
  convert_img2table,
25
  convert_marker,
26
  convert_mineru,
 
27
  convert_sycamore,
28
  convert_unstructured,
29
  )
@@ -79,6 +80,8 @@ def convert_document(path, method, start_page=0, enabled=True):
79
  text, debug_image_paths = convert_img2table(path, file_name)
80
  elif method == "GMFT (table-only)":
81
  text, debug_image_paths = convert_gmft(path, file_name)
 
 
82
  else:
83
  raise ValueError(f"Unsupported method: {method}")
84
 
 
24
  convert_img2table,
25
  convert_marker,
26
  convert_mineru,
27
+ convert_pypdf,
28
  convert_sycamore,
29
  convert_unstructured,
30
  )
 
80
  text, debug_image_paths = convert_img2table(path, file_name)
81
  elif method == "GMFT (table-only)":
82
  text, debug_image_paths = convert_gmft(path, file_name)
83
+ elif method == "PyPDF":
84
+ text, debug_image_paths = convert_pypdf(path, file_name)
85
  else:
86
  raise ValueError(f"Unsupported method: {method}")
87
 
backends/__init__.py CHANGED
@@ -5,6 +5,7 @@ from .gmft import convert_gmft
5
  from .img2table import convert_img2table
6
  from .marker import convert_marker
7
  from .mineru import convert_mineru
 
8
  from .syca import convert_sycamore
9
  from .unstructured import convert_unstructured
10
 
@@ -20,6 +21,8 @@ __all__ = [
20
  "convert_img2table",
21
  "convert_gmft",
22
  "convert_sycamore",
 
 
23
  ]
24
 
25
  SUPPORTED_METHODS = [
@@ -32,6 +35,7 @@ SUPPORTED_METHODS = [
32
  "Gemini (API)",
33
  "Img2Table (table-only)",
34
  "GMFT (table-only)",
 
35
  # "Zerox"
36
  ]
37
  SUPPORTED_METHODS_METADATA = {
@@ -89,4 +93,10 @@ SUPPORTED_METHODS_METADATA = {
89
  "url": "https://github.com/aryn-ai/sycamore",
90
  "documentation": "https://sycamore.readthedocs.io/en/stable/",
91
  },
 
 
 
 
 
 
92
  }
 
5
  from .img2table import convert_img2table
6
  from .marker import convert_marker
7
  from .mineru import convert_mineru
8
+ from .pypdf import convert_pypdf
9
  from .syca import convert_sycamore
10
  from .unstructured import convert_unstructured
11
 
 
21
  "convert_img2table",
22
  "convert_gmft",
23
  "convert_sycamore",
24
+ "convert_pypdf",
25
+ # "convert_zerox",
26
  ]
27
 
28
  SUPPORTED_METHODS = [
 
35
  "Gemini (API)",
36
  "Img2Table (table-only)",
37
  "GMFT (table-only)",
38
+ "PyPDF",
39
  # "Zerox"
40
  ]
41
  SUPPORTED_METHODS_METADATA = {
 
93
  "url": "https://github.com/aryn-ai/sycamore",
94
  "documentation": "https://sycamore.readthedocs.io/en/stable/",
95
  },
96
+ "PyPDF": {
97
+ "name": "PyPDF",
98
+ "description": "PyPDF is a pure-Python PDF toolkit that can help you read, write, and manipulate PDF documents.",
99
+ "url": "https://github.com/py-pdf/pypdf",
100
+ "documentation": "https://pypdf.readthedocs.io/en/stable",
101
+ },
102
  }
backends/pypdf.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+
3
+
4
+ def convert_pypdf(path: str, file_name: str):
5
+ pdf = PdfReader(path)
6
+ pages = pdf.pages
7
+
8
+ text = "\n\n".join([page.extract_text(0) for page in pages])
9
+
10
+ return text, []
requirements.txt CHANGED
@@ -18,6 +18,7 @@ openai
18
  sycamore-ai[local-inference]
19
  img2table
20
  gmft
21
- opencv-contrib-python
22
  unimernet==0.2.3
23
  transformers<5.0.0,>=4.45.2
 
 
 
18
  sycamore-ai[local-inference]
19
  img2table
20
  gmft
 
21
  unimernet==0.2.3
22
  transformers<5.0.0,>=4.45.2
23
+ pypdf
24
+ opencv-contrib-python