Spaces:

chunking-ai
/

smoldocling-preview

Paused

App Files Files

xet

Community

smoldocling-preview / backends /__init__.py

taprosoft

fix: concurrency limit

00d4478 8 months ago

raw

history blame

3.51 kB

	# flake8: noqa
	from .docling import convert_docling
	from .gemini import convert_gemini
	from .gmft import convert_gmft
	from .img2table import convert_img2table
	from .marker import convert_marker
	from .mineru import convert_mineru
	from .syca import convert_sycamore
	from .unstructured import convert_unstructured

	# from .zerox import convert_zerox

	# Names exported by a star-import of this package: one converter function
	# per backend module imported above. The Zerox entry stays commented out,
	# in step with its commented-out import.
	__all__ = [
	    "convert_docling",
	    "convert_marker",
	    "convert_mineru",
	    "convert_unstructured",
	    "convert_gemini",
	    # "convert_zerox",
	    "convert_img2table",
	    "convert_gmft",
	    "convert_sycamore",
	]

	# Names of the supported conversion methods. Every active entry has a
	# matching key in SUPPORTED_METHODS_METADATA; "Zerox" is commented out.
	SUPPORTED_METHODS = [
	    "PyMuPDF",
	    "Docling",
	    "Marker",
	    "MinerU",
	    "Unstructured",
	    "Sycamore",
	    "Gemini (API)",
	    "Img2Table (table-only)",
	    "GMFT (table-only)",
	    # "Zerox"
	]
	# Descriptive metadata for each method, keyed by the same strings that
	# appear in SUPPORTED_METHODS. Every entry carries four fields:
	#   "name"          - short name of the tool
	#   "description"   - one-sentence summary of the tool
	#   "url"           - project link (None for the Gemini API entry)
	#   "documentation" - link to the tool's documentation
	# Long descriptions are split across adjacent string literals; Python joins
	# them at compile time, so the resulting values are single strings.
	SUPPORTED_METHODS_METADATA = {
	    "Unstructured": {
	        "name": "Unstructured",
	        "description": "Open-Source Pre-Processing Tools for Unstructured Data.",
	        "url": "https://github.com/Unstructured-IO/unstructured",
	        "documentation": "https://docs.unstructured.io/welcome",
	    },
	    "Marker": {
	        "name": "Marker",
	        "description": (
	            "Marker converts documents to markdown, JSON, and HTML "
	            "quickly and accurately."
	        ),
	        "url": "https://github.com/VikParuchuri/marker",
	        "documentation": "https://github.com/VikParuchuri/marker",
	    },
	    "MinerU": {
	        "name": "MinerU",
	        "description": "A high-quality tool for convert PDF to Markdown and JSON.",
	        "url": "https://github.com/opendatalab/MinerU",
	        "documentation": "https://github.com/opendatalab/MinerU",
	    },
	    "Docling": {
	        "name": "Docling",
	        "description": (
	            "Docling simplifies document processing, parsing diverse formats "
	            "— including advanced PDF understanding — and providing seamless "
	            "integrations with the gen AI ecosystem."
	        ),
	        "url": "https://github.com/DS4SD/docling",
	        "documentation": "https://ds4sd.github.io/docling/",
	    },
	    "PyMuPDF": {
	        "name": "PyMuPDF",
	        "description": (
	            "PyMuPDF is a high performance Python library for data extraction, "
	            "analysis, conversion & manipulation of PDF (and other) documents."
	        ),
	        "url": "https://github.com/pymupdf/PyMuPDF",
	        "documentation": "https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
	    },
	    "Gemini (API)": {
	        "name": "Gemini",
	        "description": "Using Gemini multimodal API to parse PDF to markdown.",
	        "url": None,
	        "documentation": "https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
	    },
	    "Img2Table (table-only)": {
	        "name": "Img2Table",
	        "description": (
	            "img2table is a table identification and extraction Python Library "
	            "for PDF and images, based on OpenCV image processing."
	        ),
	        "url": "https://github.com/xavctn/img2table",
	        "documentation": "https://github.com/xavctn/img2table",
	    },
	    "GMFT (table-only)": {
	        "name": "GMFT",
	        "description": "Lightweight, performant, deep table extraction.",
	        "url": "https://github.com/conjuncts/gmft",
	        "documentation": "https://github.com/conjuncts/gmft",
	    },
	    "Sycamore": {
	        "name": "Sycamore",
	        "description": (
	            "Sycamore is an open source, AI-powered document processing engine "
	            "for ETL, RAG, LLM-based applications, and analytics on "
	            "unstructured data."
	        ),
	        "url": "https://github.com/aryn-ai/sycamore",
	        "documentation": "https://sycamore.readthedocs.io/en/stable/",
	    },
	}