File size: 1,816 Bytes
36add35
 
 
77fbded
 
 
 
 
36add35
77fbded
3bce890
 
77fbded
 
 
 
3bce890
77fbded
 
 
 
36add35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77fbded
 
 
36add35
77fbded
0933b39
 
 
 
 
 
77fbded
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import base64
import html
import io
import re
from pathlib import Path

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.settings import settings

from .settings import ENABLE_DEBUG_MODE

# Marker init: the converter is constructed once, at import time, and the
# resulting module-level instance is reused by every convert_marker() call.
# NOTE(review): create_model_dict() presumably loads the model artifacts,
# which would make importing this module expensive - confirm against marker.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={
        # Driven by the project-level ENABLE_DEBUG_MODE flag; convert_marker()
        # later looks for "pdf_page" files in the debug data directory.
        "debug_pdf_images": ENABLE_DEBUG_MODE,
    },
)


def img_to_html(img, img_alt):
    """Render an image as a self-contained HTML ``<img>`` tag.

    The image is serialised in ``settings.OUTPUT_IMAGE_FORMAT`` and embedded
    as a base64 ``data:`` URI, so the returned markup needs no external file.

    Args:
        img: Image object exposing ``save(fp, format=...)``
            (presumably a PIL image - confirm against the caller).
        img_alt: Alternative text for the image. It is HTML-escaped before
            being placed in the ``alt`` attribute.

    Returns:
        The ``<img ...>`` tag as a string.
    """
    image_format = settings.OUTPUT_IMAGE_FORMAT

    buffer = io.BytesIO()
    img.save(buffer, format=image_format)
    encoded = base64.b64encode(buffer.getvalue()).decode()

    # The alt text originates from converted document content, so it may
    # contain quotes or markup; escape it so it cannot terminate the attribute
    # or inject HTML.
    safe_alt = html.escape(str(img_alt), quote=True)

    return (
        f'<img src="data:image/{image_format.lower()}'
        f';base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
    )


def markdown_insert_images(markdown, images):
    """Swap markdown image tags for inline HTML ``<img>`` elements.

    Every ``![alt](path ...)`` tag found in ``markdown`` whose path is a key
    of ``images`` is replaced (all occurrences of that exact tag text) by the
    markup returned from ``img_to_html``. Tags whose path is not in ``images``
    are left as they are.

    Args:
        markdown: Markdown text to process.
        images: Mapping from image path (as written in the markdown) to the
            image object handed to ``img_to_html``.

    Returns:
        The markdown text with the known image tags replaced.
    """
    tag_pattern = (
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'
    )

    result = markdown
    # Matches are located in the original text; replacements are applied to
    # the progressively rewritten copy.
    for found in re.finditer(tag_pattern, markdown):
        path_in_tag = found.group("image_path")
        if path_in_tag not in images:
            continue
        whole_tag = found.group(1)
        alt_text = found.group("image_title")
        result = result.replace(
            whole_tag, img_to_html(images[path_in_tag], alt_text)
        )
    return result


def convert_marker(path: str, file_name: str):
    """Convert a document with marker and collect its debug page images.

    The file at ``path`` is run through the module-level ``marker_converter``,
    the rendered output is turned into markdown text, and image tags in that
    text are inlined as HTML via ``markdown_insert_images``.

    Args:
        path: Path of the document to convert.
        file_name: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the converted
        markdown with images inlined. ``debug_image_paths`` is a list of
        ``Path`` objects for entries in the debug data directory whose stem
        contains ``"pdf_page"``; it is empty when the metadata reports no
        debug directory or that directory does not exist.
    """
    rendered = marker_converter(path)
    text, _, images = text_from_rendered(rendered)
    text = markdown_insert_images(text, images)

    # ``.get`` returns None when the key is absent, and Path(None) raises
    # TypeError, so bail out before building the Path.
    debug_data_path = rendered.metadata.get("debug_data_path")
    if not debug_data_path:
        return text, []

    debug_image_dir = Path(debug_data_path)
    # is_dir() rather than exists(): iterdir() raises on a regular file.
    if not debug_image_dir.is_dir():
        return text, []

    debug_image_paths = [
        entry for entry in debug_image_dir.iterdir() if "pdf_page" in entry.stem
    ]
    return text, debug_image_paths