File size: 3,509 Bytes
b5e52b5 77fbded acbe414 77fbded 394280f 77fbded cb2ed5c 77fbded acbe414 cb2ed5c acbe414 394280f 77fbded b5e52b5 00d4478 b5e52b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# flake8: noqa
from .docling import convert_docling
from .gemini import convert_gemini
from .gmft import convert_gmft
from .img2table import convert_img2table
from .marker import convert_marker
from .mineru import convert_mineru
from .syca import convert_sycamore
from .unstructured import convert_unstructured
# from .zerox import convert_zerox
# Public names exported by a star-import of this package: one converter
# entry point for each backend module imported above.
__all__ = [
    "convert_docling",
    "convert_marker",
    "convert_mineru",
    "convert_unstructured",
    "convert_gemini",
    # "convert_zerox",  # left out while the zerox import stays commented out
    "convert_img2table",
    "convert_gmft",
    "convert_sycamore",
]
# Display names of the supported conversion methods, in listing order.
# Every name here is also a key of SUPPORTED_METHODS_METADATA.
SUPPORTED_METHODS = [
    "PyMuPDF",
    "Docling",
    "Marker",
    "MinerU",
    "Unstructured",
    "Sycamore",
    # Labelled as API-backed.
    "Gemini (API)",
    # Labelled as table-only.
    "Img2Table (table-only)",
    "GMFT (table-only)",
    # "Zerox"  -- currently disabled
]
# Descriptive metadata for each supported method, keyed by the method's
# display name. Every entry carries the same four fields, in this order:
#   name          - short tool name
#   description   - one-sentence summary of the tool
#   url           - project repository link (None when no link is recorded)
#   documentation - link to the tool's documentation
SUPPORTED_METHODS_METADATA = {
    "Unstructured": {
        "name": "Unstructured",
        "description": "Open-Source Pre-Processing Tools for Unstructured Data.",
        "url": "https://github.com/Unstructured-IO/unstructured",
        "documentation": "https://docs.unstructured.io/welcome",
    },
    "Marker": {
        "name": "Marker",
        "description": (
            "Marker converts documents to markdown, JSON, and HTML quickly "
            "and accurately."
        ),
        "url": "https://github.com/VikParuchuri/marker",
        "documentation": "https://github.com/VikParuchuri/marker",
    },
    "MinerU": {
        "name": "MinerU",
        "description": "A high-quality tool for convert PDF to Markdown and JSON.",
        "url": "https://github.com/opendatalab/MinerU",
        "documentation": "https://github.com/opendatalab/MinerU",
    },
    "Docling": {
        "name": "Docling",
        "description": (
            "Docling simplifies document processing, parsing diverse formats "
            "— including advanced PDF understanding — and providing seamless "
            "integrations with the gen AI ecosystem."
        ),
        "url": "https://github.com/DS4SD/docling",
        "documentation": "https://ds4sd.github.io/docling/",
    },
    "PyMuPDF": {
        "name": "PyMuPDF",
        "description": (
            "PyMuPDF is a high performance Python library for data extraction, "
            "analysis, conversion & manipulation of PDF (and other) documents."
        ),
        "url": "https://github.com/pymupdf/PyMuPDF",
        "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
    },
    "Gemini (API)": {
        "name": "Gemini",
        "description": "Using Gemini multimodal API to parse PDF to markdown.",
        # No repository link is recorded for this entry.
        "url": None,
        "documentation": "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
    },
    "Img2Table (table-only)": {
        "name": "Img2Table",
        "description": (
            "img2table is a table identification and extraction Python Library "
            "for PDF and images, based on OpenCV image processing."
        ),
        "url": "https://github.com/xavctn/img2table",
        "documentation": "https://github.com/xavctn/img2table",
    },
    "GMFT (table-only)": {
        "name": "GMFT",
        "description": "Lightweight, performant, deep table extraction.",
        "url": "https://github.com/conjuncts/gmft",
        "documentation": "https://github.com/conjuncts/gmft",
    },
    "Sycamore": {
        "name": "Sycamore",
        "description": (
            "Sycamore is an open source, AI-powered document processing engine "
            "for ETL, RAG, LLM-based applications, and analytics on unstructured "
            "data."
        ),
        "url": "https://github.com/aryn-ai/sycamore",
        "documentation": "https://sycamore.readthedocs.io/en/stable/",
    },
}
|