Spaces:

chunking-ai
/

smoldocling-preview

Paused

App Files Files Community

taprosoft committed on Mar 3

Commit

b5e52b5

1 Parent(s): b7d4a95

feat: add methods metadata

Browse files

Files changed (2) hide show

app.py +22 -13
backends/__init__.py +76 -0

app.py CHANGED Viewed

@@ -16,6 +16,8 @@ import pymupdf4llm
 from gradio_pdf import PDF
 from backends import (  # convert_zerox,
     convert_docling,
     convert_gemini,
     convert_gmft,
@@ -149,18 +151,6 @@ latex_delimiters = [
 # startup test (also for loading models the first time)
 start_startup = time.time()
 WARMUP_PDF_PATH = "examples/table.pdf"
-SUPPORTED_METHODS = [
-    "PyMuPDF",
-    "Docling",
-    "Marker",
-    "MinerU",
-    "Unstructured",
-    "Gemini (API)",
-    "Img2Table (table-only)",
-    "GMFT (table-only)",
-    "Sycamore",
-    # "Zerox"
-]
 if DO_WARMUP:
     print("Warm-up sequence")
@@ -277,8 +267,27 @@ with gr.Blocks(
                                     interactive=False,
                                 )
                             with gr.Tab("About"):
                                 gr.Markdown(
-                                    "About method",
                                     container=False,
                                     show_label=False,
                                 )

 from gradio_pdf import PDF
 from backends import (  # convert_zerox,
+    SUPPORTED_METHODS,
+    SUPPORTED_METHODS_METADATA,
     convert_docling,
     convert_gemini,
     convert_gmft,
 # startup test (also for loading models the first time)
 start_startup = time.time()
 WARMUP_PDF_PATH = "examples/table.pdf"
 if DO_WARMUP:
     print("Warm-up sequence")
                                     interactive=False,
                                 )
                             with gr.Tab("About"):
+                                method_metadata = SUPPORTED_METHODS_METADATA[
+                                    method
+                                ]  # type: ignore
+                                method_name = method_metadata["name"]  # type: ignore
+                                method_description = method_metadata[
+                                    "description"
+                                ]  # type: ignore
+                                method_url = method_metadata["url"]  # type: ignore
+                                method_documentation = method_metadata[
+                                    "documentation"
+                                ]  # type: ignore
                                 gr.Markdown(
+                                    value=(
+                                        f"# {method_name}\n\n{method_description}\n\n"
+                                        + (
+                                            f"[[Github repo]]({method_url})    "
+                                            if method_url
+                                            else ""
+                                        )
+                                        + f"[[Documentation]]({method_documentation})"
+                                    ),
                                     container=False,
                                     show_label=False,
                                 )

backends/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from .docling import convert_docling
 from .gemini import convert_gemini
 from .gmft import convert_gmft
@@ -20,3 +21,78 @@ __all__ = [
     "convert_gmft",
     "convert_sycamore",
 ]

+# flake8: noqa
 from .docling import convert_docling
 from .gemini import convert_gemini
 from .gmft import convert_gmft
     "convert_gmft",
     "convert_sycamore",
 ]
+# Identifiers of the conversion methods this package exposes; app.py imports
+# this list. Every active entry is also a key of SUPPORTED_METHODS_METADATA.
+SUPPORTED_METHODS = [
+    "PyMuPDF",
+    "Docling",
+    "Marker",
+    "MinerU",
+    "Unstructured",
+    "Gemini (API)",
+    "Img2Table (table-only)",
+    "GMFT (table-only)",
+    "Sycamore",
+    # "Zerox"
+]
+# Per-method metadata, keyed by the same strings used in SUPPORTED_METHODS.
+# Each value provides:
+#   "name":          short method name
+#   "description":   one-sentence summary of the tool
+#   "url":           source repository URL, or None when there is none
+#   "documentation": documentation URL
+# Keys must be unique: a repeated key in a dict literal silently replaces the
+# earlier entry.
+SUPPORTED_METHODS_METADATA = {
+    "Unstructured": {
+        "name": "Unstructured",
+        "description": "Open-Source Pre-Processing Tools for Unstructured Data.",
+        "url": "https://github.com/Unstructured-IO/unstructured",
+        "documentation": "https://docs.unstructured.io/welcome",
+    },
+    "Marker": {
+        "name": "Marker",
+        "description": "Marker converts documents to markdown, JSON, and HTML quickly and accurately.",
+        "url": "https://github.com/VikParuchuri/marker",
+        "documentation": "https://github.com/VikParuchuri/marker",
+    },
+    "MinerU": {
+        "name": "MinerU",
+        "description": "A high-quality tool for convert PDF to Markdown and JSON.",
+        "url": "https://github.com/opendatalab/MinerU",
+        "documentation": "https://github.com/opendatalab/MinerU",
+    },
+    "Docling": {
+        "name": "Docling",
+        "description": "Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.",
+        "url": "https://github.com/DS4SD/docling",
+        "documentation": "https://ds4sd.github.io/docling/",
+    },
+    "PyMuPDF": {
+        "name": "PyMuPDF",
+        "description": "PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.",
+        "url": "https://github.com/pymupdf/PyMuPDF",
+        "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
+    },
+    "Gemini (API)": {
+        "name": "Gemini",
+        "description": "Using Gemini multimodal API to parse PDF to markdown.",
+        # No source repository for this method.
+        "url": None,
+        "documentation": "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
+    },
+    "Img2Table (table-only)": {
+        "name": "Img2Table",
+        "description": "img2table is a table identification and extraction Python Library for PDF and images, based on OpenCV image processing.",
+        "url": "https://github.com/xavctn/img2table",
+        "documentation": "https://github.com/xavctn/img2table",
+    },
+    "GMFT (table-only)": {
+        "name": "GMFT",
+        "description": "Lightweight, performant, deep table extraction.",
+        "url": "https://github.com/conjuncts/gmft",
+        "documentation": "https://github.com/conjuncts/gmft",
+    },
+    "Sycamore": {
+        "name": "Sycamore",
+        "description": "Sycamore is an open source, AI-powered document processing engine for ETL, RAG, LLM-based applications, and analytics on unstructured data.",
+        "url": "https://github.com/aryn-ai/sycamore",
+        "documentation": "https://sycamore.readthedocs.io/en/stable/",
+    },
+}