File size: 3,888 Bytes
b5e52b5
acbe414
d381432
f6539d9
77fbded
cb2ed5c
77fbded
f6539d9
77fbded
acbe414
d381432
77fbded
b5e52b5
 
f6539d9
b5e52b5
f6539d9
 
b5e52b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6539d9
 
 
 
 
 
b5e52b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d381432
 
 
 
 
 
b5e52b5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# flake8: noqa
from .gmft import convert_gmft
from .pypdf import convert_pypdf
from .smoldocling import convert_smoldocling
from .unstructured import convert_unstructured

# Public API of this package: the converter entry points re-exported from the
# backend submodules, plus the method-registry constants defined in this module.
__all__ = [
    "convert_smoldocling",
    "convert_unstructured",
    "convert_gmft",
    "convert_pypdf",
    # The registry constants are public as well; when __all__ is defined, a
    # star-import only binds the names listed here, so they must be included.
    "SUPPORTED_METHODS",
    "SUPPORTED_METHODS_METADATA",
]

# Ordered list of method labels; presumably the methods currently enabled
# for use (verify against the callers that consume this list).
# Every label here is also a key of SUPPORTED_METHODS_METADATA.
# NOTE(review): "PyMuPDF" is listed, but this module imports no PyMuPDF
# converter — confirm where that backend is wired up.
SUPPORTED_METHODS = [
    "SmolDocling",
    "PyMuPDF",
    "PyPDF",
    "Unstructured",
    "GMFT (table-only)",
]
# Descriptive metadata for each known method, keyed by its label.
#
# Every entry carries the same four fields:
#   "name"          - short tool name (the label without any qualifier suffix)
#   "description"   - one-paragraph blurb about the tool
#   "url"           - project / model page, or None when there is none
#   "documentation" - link to the tool's documentation
#
# Long descriptions are split across adjacent string literals, which Python
# joins at compile time; the resulting strings contain no added characters.
SUPPORTED_METHODS_METADATA = {
    "Unstructured": {
        "name": "Unstructured",
        "description": "Open-Source Pre-Processing Tools for Unstructured Data.",
        "url": "https://github.com/Unstructured-IO/unstructured",
        "documentation": "https://docs.unstructured.io/welcome",
    },
    "Marker": {
        "name": "Marker",
        "description": (
            "Marker converts documents to markdown, JSON, and HTML "
            "quickly and accurately."
        ),
        "url": "https://github.com/VikParuchuri/marker",
        "documentation": "https://github.com/VikParuchuri/marker",
    },
    "MinerU": {
        "name": "MinerU",
        "description": "A high-quality tool for convert PDF to Markdown and JSON.",
        "url": "https://github.com/opendatalab/MinerU",
        "documentation": "https://github.com/opendatalab/MinerU",
    },
    "Docling": {
        "name": "Docling",
        "description": (
            "Docling simplifies document processing, parsing diverse formats "
            "— including advanced PDF understanding — and providing seamless "
            "integrations with the gen AI ecosystem."
        ),
        "url": "https://github.com/DS4SD/docling",
        "documentation": "https://ds4sd.github.io/docling/",
    },
    "SmolDocling": {
        "name": "SmolDocling",
        "description": (
            "SmolDocling is a multimodal Image-Text-to-Text model designed for "
            "efficient document conversion. It retains Docling's most popular "
            "features while ensuring full compatibility with Docling through "
            "seamless support for DoclingDocuments."
        ),
        "url": "https://huggingface.co/ds4sd/SmolDocling-256M-preview",
        "documentation": "https://huggingface.co/ds4sd/SmolDocling-256M-preview",
    },
    "PyMuPDF": {
        "name": "PyMuPDF",
        "description": (
            "PyMuPDF is a high performance Python library for data extraction, "
            "analysis, conversion & manipulation of PDF (and other) documents."
        ),
        "url": "https://github.com/pymupdf/PyMuPDF",
        "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
    },
    "Gemini (API)": {
        "name": "Gemini",
        "description": "Using Gemini multimodal API to parse PDF to markdown.",
        # No project page is recorded for this entry.
        "url": None,
        "documentation": "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
    },
    "Img2Table (table-only)": {
        "name": "Img2Table",
        "description": (
            "img2table is a table identification and extraction Python Library "
            "for PDF and images, based on OpenCV image processing."
        ),
        "url": "https://github.com/xavctn/img2table",
        "documentation": "https://github.com/xavctn/img2table",
    },
    "GMFT (table-only)": {
        "name": "GMFT",
        "description": "Lightweight, performant, deep table extraction.",
        "url": "https://github.com/conjuncts/gmft",
        "documentation": "https://github.com/conjuncts/gmft",
    },
    "Sycamore": {
        "name": "Sycamore",
        "description": (
            "Sycamore is an open source, AI-powered document processing engine "
            "for ETL, RAG, LLM-based applications, and analytics on "
            "unstructured data."
        ),
        "url": "https://github.com/aryn-ai/sycamore",
        "documentation": "https://sycamore.readthedocs.io/en/stable/",
    },
    "PyPDF": {
        "name": "PyPDF",
        "description": (
            "PyPDF is a pure-Python PDF toolkit that can help you read, write, "
            "and manipulate PDF documents."
        ),
        "url": "https://github.com/py-pdf/pypdf",
        "documentation": "https://pypdf.readthedocs.io/en/stable",
    },
}