taprosoft committed on
Commit
b5e52b5
·
1 Parent(s): b7d4a95

feat: add methods metadata

Browse files
Files changed (2) hide show
  1. app.py +22 -13
  2. backends/__init__.py +76 -0
app.py CHANGED
@@ -16,6 +16,8 @@ import pymupdf4llm
16
  from gradio_pdf import PDF
17
 
18
  from backends import ( # convert_zerox,
 
 
19
  convert_docling,
20
  convert_gemini,
21
  convert_gmft,
@@ -149,18 +151,6 @@ latex_delimiters = [
149
  # startup test (also for loading models the first time)
150
  start_startup = time.time()
151
  WARMUP_PDF_PATH = "examples/table.pdf"
152
- SUPPORTED_METHODS = [
153
- "PyMuPDF",
154
- "Docling",
155
- "Marker",
156
- "MinerU",
157
- "Unstructured",
158
- "Gemini (API)",
159
- "Img2Table (table-only)",
160
- "GMFT (table-only)",
161
- "Sycamore",
162
- # "Zerox"
163
- ]
164
 
165
  if DO_WARMUP:
166
  print("Warm-up sequence")
@@ -277,8 +267,27 @@ with gr.Blocks(
277
  interactive=False,
278
  )
279
  with gr.Tab("About"):
 
 
 
 
 
 
 
 
 
 
 
280
  gr.Markdown(
281
- "About method",
 
 
 
 
 
 
 
 
282
  container=False,
283
  show_label=False,
284
  )
 
16
  from gradio_pdf import PDF
17
 
18
  from backends import ( # convert_zerox,
19
+ SUPPORTED_METHODS,
20
+ SUPPORTED_METHODS_METADATA,
21
  convert_docling,
22
  convert_gemini,
23
  convert_gmft,
 
151
  # startup test (also for loading models the first time)
152
  start_startup = time.time()
153
  WARMUP_PDF_PATH = "examples/table.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  if DO_WARMUP:
156
  print("Warm-up sequence")
 
267
  interactive=False,
268
  )
269
  with gr.Tab("About"):
270
+ method_metadata = SUPPORTED_METHODS_METADATA[
271
+ method
272
+ ] # type: ignore
273
+ method_name = method_metadata["name"] # type: ignore
274
+ method_description = method_metadata[
275
+ "description"
276
+ ] # type: ignore
277
+ method_url = method_metadata["url"] # type: ignore
278
+ method_documentation = method_metadata[
279
+ "documentation"
280
+ ] # type: ignore
281
  gr.Markdown(
282
+ value=(
283
+ f"# {method_name}\n\n{method_description}\n\n"
284
+ + (
285
+ f"[[Github repo]]({method_url}) "
286
+ if method_url
287
+ else ""
288
+ )
289
+ + f"[[Documentation]]({method_documentation})"
290
+ ),
291
  container=False,
292
  show_label=False,
293
  )
backends/__init__.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from .docling import convert_docling
2
  from .gemini import convert_gemini
3
  from .gmft import convert_gmft
@@ -20,3 +21,78 @@ __all__ = [
20
  "convert_gmft",
21
  "convert_sycamore",
22
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
  from .docling import convert_docling
3
  from .gemini import convert_gemini
4
  from .gmft import convert_gmft
 
21
  "convert_gmft",
22
  "convert_sycamore",
23
  ]
24
+
25
+ SUPPORTED_METHODS = [
26
+ "PyMuPDF",
27
+ "Docling",
28
+ "Marker",
29
+ "MinerU",
30
+ "Unstructured",
31
+ "Gemini (API)",
32
+ "Img2Table (table-only)",
33
+ "GMFT (table-only)",
34
+ "Sycamore",
35
+ # "Zerox"
36
+ ]
37
+ SUPPORTED_METHODS_METADATA = {
38
+ "Unstructured": {
39
+ "name": "Unstructured",
40
+ "description": "Open-Source Pre-Processing Tools for Unstructured Data.",
41
+ "url": "https://github.com/Unstructured-IO/unstructured",
42
+ "documentation": "https://docs.unstructured.io/welcome",
43
+ },
44
+ "Marker": {
45
+ "name": "Marker",
46
+ "description": "Marker converts documents to markdown, JSON, and HTML quickly and accurately.",
47
+ "url": "https://github.com/VikParuchuri/marker",
48
+ "documentation": "https://github.com/VikParuchuri/marker",
49
+ },
50
+ "MinerU": {
51
+ "name": "MinerU",
52
+ "description": "A high-quality tool for convert PDF to Markdown and JSON.",
53
+ "url": "https://github.com/opendatalab/MinerU",
54
+ "documentation": "https://github.com/opendatalab/MinerU",
55
+ },
56
+ "Docling": {
57
+ "name": "Docling",
58
+ "description": "Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.",
59
+ "url": "https://github.com/DS4SD/docling",
60
+ "documentation": "https://ds4sd.github.io/docling/",
61
+ },
62
+ "PyMuPDF": {
63
+ "name": "PyMuPDF",
64
+ "description": "PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.",
65
+ "url": "https://github.com/pymupdf/PyMuPDF",
66
+ "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
67
+ },
68
+ "PyMuPDF": {
69
+ "name": "PyMuPDF",
70
+ "description": "PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.",
71
+ "url": "https://github.com/pymupdf/PyMuPDF",
72
+ "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
73
+ },
74
+ "Gemini (API)": {
75
+ "name": "Gemini",
76
+ "description": "Using Gemini multimodal API to parse PDF to markdown.",
77
+ "url": None,
78
+ "documentation": "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
79
+ },
80
+ "Img2Table (table-only)": {
81
+ "name": "Img2Table",
82
+ "description": "img2table is a table identification and extraction Python Library for PDF and images, based on OpenCV image processing.",
83
+ "url": "https://github.com/xavctn/img2table",
84
+ "documentation": "https://github.com/xavctn/img2table",
85
+ },
86
+ "GMFT (table-only)": {
87
+ "name": "GMFT",
88
+ "description": "Lightweight, performant, deep table extraction.",
89
+ "url": "https://github.com/conjuncts/gmft",
90
+ "documentation": "https://github.com/conjuncts/gmft",
91
+ },
92
+ "Sycamore": {
93
+ "name": "Sycamore",
94
+ "description": "Sycamore is an open source, AI-powered document processing engine for ETL, RAG, LLM-based applications, and analytics on unstructured data.",
95
+ "url": "https://github.com/aryn-ai/sycamore",
96
+ "documentation": "https://sycamore.readthedocs.io/en/stable/",
97
+ },
98
+ }