|
from pathlib import Path |
|
|
|
from marker.converters.pdf import PdfConverter |
|
from marker.models import create_model_dict |
|
from marker.output import text_from_rendered |
|
|
|
|
|
# Shared converter instance, built once at import time and reused by every
# call to convert_marker().
# NOTE(review): "debug_pdf_images" presumably makes marker write per-page debug
# images to disk; convert_marker() relies on the "debug_data_path" metadata
# entry to locate them - confirm against the marker documentation.
marker_converter = PdfConverter(
    config={"debug_pdf_images": True},
    artifact_dict=create_model_dict(),
)
|
|
|
|
|
def convert_marker(path: str, file_name: str):
    """Convert a document with marker and collect its debug page images.

    Args:
        path: Location of the input document; passed unchanged to
            ``marker_converter``.
        file_name: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the first element
        returned by ``text_from_rendered``. ``debug_image_paths`` is a list
        of ``Path`` objects for the entries of the debug directory whose stem
        contains ``"pdf_page"``, ordered by the trailing number in the file
        name (presumably the page index). The list is empty when the rendered
        metadata has no ``"debug_data_path"`` or that directory is missing.
    """
    rendered = marker_converter(path)
    text, _, _ = text_from_rendered(rendered)

    # Without a debug directory there is nothing to list; still return the
    # text instead of failing with an opaque TypeError from Path(None).
    debug_data_path = rendered.metadata.get("debug_data_path")
    if debug_data_path is None:
        return text, []

    debug_image_dir = Path(debug_data_path)
    if not debug_image_dir.is_dir():
        return text, []

    def page_order(image_path):
        # Sort numerically on the suffix after the last underscore so that
        # "..._10" comes after "..._2"; fall back to the name for anything
        # that does not end in a number.
        suffix = image_path.stem.rpartition("_")[2]
        number = int(suffix) if suffix.isdecimal() else -1
        return number, image_path.name

    # iterdir() yields entries in arbitrary order, so sort explicitly to give
    # callers a stable, page-ordered list.
    debug_image_paths = sorted(
        (
            image_path
            for image_path in debug_image_dir.iterdir()
            if "pdf_page" in image_path.stem
        ),
        key=page_order,
    )

    return text, debug_image_paths
|
|