taprosoft
fix: add requirements.txt, fix empty debug dir
0933b39
raw
history blame
1.82 kB
import base64
import html
import io
import re
from pathlib import Path

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.settings import settings

from .settings import ENABLE_DEBUG_MODE
# Marker init: one module-level converter, constructed at import time and
# reused by every conversion call. ENABLE_DEBUG_MODE is forwarded as the
# converter's "debug_pdf_images" config option.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={"debug_pdf_images": ENABLE_DEBUG_MODE},
)
def img_to_html(img, img_alt):
    """Render an image as a self-contained HTML ``<img>`` tag.

    The image is serialised in ``settings.OUTPUT_IMAGE_FORMAT`` and embedded
    as a base64 ``data:`` URI, so the tag does not reference an external file.

    Args:
        img: Image object exposing ``save(fp, format=...)`` (presumably a
            PIL image -- confirm against the converter's output).
        img_alt: Alternative text for the image. It is HTML-escaped before
            being placed in the ``alt`` attribute.

    Returns:
        The ``<img ...>`` tag as a string.
    """
    image_format = settings.OUTPUT_IMAGE_FORMAT
    buffer = io.BytesIO()
    img.save(buffer, format=image_format)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    # The alt text is taken from the converted markdown, so it may contain
    # quotes or angle brackets; escape it so it cannot terminate the
    # attribute or inject markup into the page.
    safe_alt = html.escape(str(img_alt), quote=True)
    return (
        f'<img src="data:image/{image_format.lower()}'
        f';base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
    )
def markdown_insert_images(markdown, images):
    """Replace markdown image tags with inline HTML ``<img>`` tags.

    Every ``![alt](path ...)`` tag whose ``path`` is a key of ``images`` is
    replaced by the output of ``img_to_html``; tags with an unknown path are
    left exactly as written.

    Args:
        markdown: Markdown text to process.
        images: Mapping from image path (as written in the markdown) to the
            image object handed to ``img_to_html``.

    Returns:
        The markdown text with all known images inlined.
    """

    def _inline(match):
        image_path = match.group("image_path")
        if image_path not in images:
            # Unknown image: keep the original markdown tag untouched.
            return match.group(0)
        return img_to_html(images[image_path], match.group("image_title"))

    # A single substitution pass: the text grows by a whole base64-encoded
    # image per replacement, so it should not be rescanned and copied once
    # per tag. A callable replacement is also inserted literally, with no
    # backslash/group-reference processing of the generated HTML.
    return re.sub(
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))',
        _inline,
        markdown,
    )
def convert_marker(path: str, file_name: str):
    """Convert a document with the module-level Marker converter.

    Args:
        path: Path of the input file passed to ``marker_converter``.
        file_name: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        A ``(text, debug_image_paths)`` tuple: the rendered text with known
        images inlined as HTML, and a list of ``Path`` objects for the
        entries in the debug directory whose stem contains ``"pdf_page"``.
        The list is empty when no usable debug directory is reported.
    """
    rendered = marker_converter(path)
    text, _, images = text_from_rendered(rendered)
    text = markdown_insert_images(text, images)

    # The metadata may not carry a debug path at all; Path(None) would raise
    # TypeError and an empty string would resolve to the current directory.
    debug_data_path = rendered.metadata.get("debug_data_path")
    if not debug_data_path:
        return text, []

    debug_image_dir = Path(debug_data_path)
    # is_dir() rather than exists(): iterdir() fails on a non-directory.
    if not debug_image_dir.is_dir():
        return text, []

    debug_image_paths = [
        entry for entry in debug_image_dir.iterdir() if "pdf_page" in entry.stem
    ]
    return text, debug_image_paths