# flake8: noqa
"""Converter entry points and the registry of conversion methods.

Re-exports the ``convert_*`` functions from the sibling modules and defines
two module-level constants:

* ``SUPPORTED_METHODS`` -- display names of the methods listed as supported.
* ``SUPPORTED_METHODS_METADATA`` -- per-method descriptive metadata (``name``,
  ``description``, ``url``, ``documentation``) keyed by display name.
"""
from .gmft import convert_gmft
from .pypdf import convert_pypdf
from .smoldocling import convert_smoldocling
from .unstructured import convert_unstructured

# Only the converter callables are exported by ``from <package> import *``;
# the constants below must be imported by name.
__all__ = [
    "convert_smoldocling",
    "convert_unstructured",
    "convert_gmft",
    "convert_pypdf",
]

# Every entry here is also a key of SUPPORTED_METHODS_METADATA.
# NOTE(review): "PyMuPDF" is listed, but no PyMuPDF converter is imported in
# this module -- confirm where that method is dispatched.
SUPPORTED_METHODS = [
    "SmolDocling",
    "PyMuPDF",
    "PyPDF",
    "Unstructured",
    "GMFT (table-only)",
]

# Descriptive metadata keyed by display name. This mapping contains more
# entries than SUPPORTED_METHODS (e.g. "Marker", "MinerU", "Docling").
# ``url`` may be None when no repository link is recorded.
SUPPORTED_METHODS_METADATA = {
    "Unstructured": {
        "name": "Unstructured",
        "description": "Open-Source Pre-Processing Tools for Unstructured Data.",
        "url": "https://github.com/Unstructured-IO/unstructured",
        "documentation": "https://docs.unstructured.io/welcome",
    },
    "Marker": {
        "name": "Marker",
        "description": "Marker converts documents to markdown, JSON, and HTML quickly and accurately.",
        "url": "https://github.com/VikParuchuri/marker",
        "documentation": "https://github.com/VikParuchuri/marker",
    },
    "MinerU": {
        "name": "MinerU",
        "description": "A high-quality tool for convert PDF to Markdown and JSON.",
        "url": "https://github.com/opendatalab/MinerU",
        "documentation": "https://github.com/opendatalab/MinerU",
    },
    "Docling": {
        "name": "Docling",
        "description": "Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.",
        "url": "https://github.com/DS4SD/docling",
        "documentation": "https://ds4sd.github.io/docling/",
    },
    "SmolDocling": {
        "name": "SmolDocling",
        # The two sentences are joined by a single space; the stray line
        # break that used to sit inside this literal has been removed.
        "description": (
            "SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. "
            "It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments."
        ),
        "url": "https://huggingface.co/ds4sd/SmolDocling-256M-preview",
        "documentation": "https://huggingface.co/ds4sd/SmolDocling-256M-preview",
    },
    "PyMuPDF": {
        "name": "PyMuPDF",
        "description": "PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.",
        "url": "https://github.com/pymupdf/PyMuPDF",
        "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
    },
    "Gemini (API)": {
        "name": "Gemini",
        "description": "Using Gemini multimodal API to parse PDF to markdown.",
        "url": None,
        "documentation": "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
    },
    "Img2Table (table-only)": {
        "name": "Img2Table",
        "description": "img2table is a table identification and extraction Python Library for PDF and images, based on OpenCV image processing.",
        "url": "https://github.com/xavctn/img2table",
        "documentation": "https://github.com/xavctn/img2table",
    },
    "GMFT (table-only)": {
        "name": "GMFT",
        "description": "Lightweight, performant, deep table extraction.",
        "url": "https://github.com/conjuncts/gmft",
        "documentation": "https://github.com/conjuncts/gmft",
    },
    "Sycamore": {
        "name": "Sycamore",
        "description": "Sycamore is an open source, AI-powered document processing engine for ETL, RAG, LLM-based applications, and analytics on unstructured data.",
        "url": "https://github.com/aryn-ai/sycamore",
        "documentation": "https://sycamore.readthedocs.io/en/stable/",
    },
    "PyPDF": {
        "name": "PyPDF",
        "description": "PyPDF is a pure-Python PDF toolkit that can help you read, write, and manipulate PDF documents.",
        "url": "https://github.com/py-pdf/pypdf",
        "documentation": "https://pypdf.readthedocs.io/en/stable",
    },
}