# File-viewer header (not code): last commit "feat: initial commit" (77fbded) by taprosoft; raw file size 658 bytes.
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
# Marker setup: a single converter instance is built at import time and
# reused by convert_marker(). The "debug_pdf_images" option is switched on;
# presumably this is what makes Marker write the per-page debug images that
# convert_marker() collects -- confirm against the Marker documentation.
marker_converter = PdfConverter(
    config={"debug_pdf_images": True},
    artifact_dict=create_model_dict(),
)
def _debug_page_sort_key(image_path: Path) -> tuple:
    """Return a sort key ordering debug images by trailing number, then name."""
    # Compare the part after the last underscore numerically when it is a
    # number, so that e.g. "..._10" sorts after "..._2" instead of before it.
    suffix = image_path.stem.rpartition("_")[2]
    if suffix.isdecimal():
        return (0, int(suffix), image_path.name)
    # Names without a numeric suffix go last, ordered alphabetically.
    return (1, 0, image_path.name)


def convert_marker(path: str, file_name: str):
    """Convert a document with Marker and collect its debug page images.

    Args:
        path: Path of the file handed to the module-level ``marker_converter``.
        file_name: Not used by this function (presumably kept so the
            signature matches other converters -- confirm with callers).

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the text extracted
        from the rendered output. ``debug_image_paths`` is the list of entries
        in the directory named by the ``"debug_data_path"`` metadata key whose
        stem contains ``"pdf_page"``, sorted by the trailing number in the
        file name when there is one. The list is empty when the metadata has
        no debug path or the directory does not exist.
    """
    rendered = marker_converter(path)
    # text_from_rendered returns three values; only the text is needed here.
    text, _, _ = text_from_rendered(rendered)

    debug_data_path = rendered.metadata.get("debug_data_path")
    if not debug_data_path:
        # No debug directory reported: still return the converted text
        # instead of failing on Path(None).
        return text, []

    debug_image_dir = Path(debug_data_path)
    if not debug_image_dir.is_dir():
        return text, []

    # iterdir() yields entries in arbitrary order, so sort explicitly to get
    # a stable, page-ordered result.
    debug_image_paths = sorted(
        (
            image_path
            for image_path in debug_image_dir.iterdir()
            if "pdf_page" in image_path.stem
        ),
        key=_debug_page_sort_key,
    )
    return text, debug_image_paths