File size: 2,487 Bytes
36add35
 
 
77fbded
 
 
 
 
 
3bce890
 
77fbded
 
 
 
 
 
 
 
 
36add35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77fbded
 
 
 
 
 
7e20950
77fbded
 
 
 
 
 
 
 
3bce890
7e20950
a2b4d23
77fbded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36add35
 
77fbded
0933b39
 
 
 
 
 
 
 
77fbded
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import base64
import mimetypes
import os
import re
from pathlib import Path

import pymupdf
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.tools.common import do_parse, prepare_env

from .settings import ENABLE_DEBUG_MODE

# Root working directory for this module: per-document output directories and
# the rendered debug page images are created beneath it.
MINERU_DEBUG_PATH = Path("/tmp/mineru")
# parents=True keeps the import from failing when /tmp itself does not exist.
MINERU_DEBUG_PATH.mkdir(parents=True, exist_ok=True)


def read_fn(path):
    """Read the file at *path* through a MinerU ``FileBasedDataReader``.

    The reader is created with ``MINERU_DEBUG_PATH`` as its parent directory
    (presumably used by the reader to resolve relative paths -- confirm
    against the ``FileBasedDataReader`` implementation).

    Args:
        path: Path of the file to read.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for *path* (expected to
        be the raw file contents).
    """
    # NOTE(review): FileBasedDataReader's parent_dir is a string parameter;
    # hand it a plain str rather than a Path so any string operation the
    # reader applies to it (such as len()) cannot raise TypeError.
    disk_rw = FileBasedDataReader(str(MINERU_DEBUG_PATH))
    return disk_rw.read(path)


def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as base64 text.

    Args:
        image_path: Path of the file to encode; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, as a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline local images referenced in Markdown as base64 data URIs.

    Every ``![alt](target)`` whose target is a local path is rewritten to
    ``![target](data:<mime>;base64,<payload>)``. The alt text of the rewritten
    link is the original target path, not the original alt text.

    Args:
        markdown_text: Markdown source to rewrite.
        image_dir_path: Directory that relative image targets are joined onto.

    Returns:
        The Markdown text with local image links replaced. Links whose target
        starts with ``http://``, ``https://`` or ``data:`` are returned
        unchanged.

    Raises:
        OSError: If a local image target cannot be opened (for example
            ``FileNotFoundError`` for a missing file).
    """
    pattern = r"\!\[(?:[^\]]*)\]\(([^)]+)\)"
    # Targets that do not name a file on disk; joining them onto
    # image_dir_path would produce a bogus path and fail on open().
    non_local_prefixes = ("http://", "https://", "data:")

    def replace(match):
        relative_path = match.group(1)
        if relative_path.startswith(non_local_prefixes):
            return match.group(0)

        full_path = os.path.join(image_dir_path, relative_path)
        # Derive the MIME type from the file extension so that non-JPEG images
        # are labelled correctly; fall back to JPEG when the extension is
        # unknown or does not map to an image type.
        mime_type = mimetypes.guess_type(full_path)[0]
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)


def do_process_mineru(input_path, output_dir):
    """Parse one document with MinerU using the ``"auto"`` parse method.

    Args:
        input_path: Path of the document to parse. Its stem is passed to
            MinerU as the file name.
        output_dir: Directory passed to MinerU as the output root.

    Returns:
        A ``(local_md_dir, file_name)`` tuple: the second value returned by
        ``prepare_env`` and the stem of *input_path*.
    """
    method = "auto"
    stem = Path(input_path).stem
    out_root = Path(output_dir)

    # Read the input before prepare_env runs, matching the original ordering.
    document_bytes = read_fn(input_path)
    _unused, md_dir = prepare_env(out_root, stem, method)

    parse_options = {
        "debug_able": False,
        "f_dump_orig_pdf": False,
        # The layout overlay is only produced when debug mode is enabled.
        "f_draw_layout_bbox": ENABLE_DEBUG_MODE,
        "f_draw_char_bbox": False,
        "formula_enable": True,
        "table_enable": True,
    }
    do_parse(out_root, stem, document_bytes, [], method, **parse_options)

    return md_dir, stem


def convert_mineru(path: str, file_name: str):
    """Convert a document to Markdown with MinerU and inline its images.

    Args:
        path: Location of the input document.
        file_name: Name of the per-document working directory created under
            ``MINERU_DEBUG_PATH``.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the generated
        Markdown with local image links replaced by base64 data URIs.
        ``debug_image_paths`` lists the PNG files rendered from the layout
        debug PDF, one per page, and is empty when no such PDF exists.
    """
    output_path = MINERU_DEBUG_PATH / file_name
    output_path.mkdir(exist_ok=True)

    # do_process_mineru names its outputs after the stem of `path`, which is
    # not guaranteed to equal `file_name`; use the name it reports back so the
    # lookups below always match what was actually written.
    local_md_dir, parsed_name = do_process_mineru(path, output_path)
    local_md_dir = Path(local_md_dir)

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(local_md_dir / f"{parsed_name}.md", "r", encoding="utf-8") as file:
        text = file.read()

    text = replace_image_with_base64(text, local_md_dir)

    debug_image_paths = []
    debug_pdf = local_md_dir / f"{parsed_name}_layout.pdf"

    if debug_pdf.exists():
        # The context manager closes the document even if rendering fails.
        with pymupdf.open(str(debug_pdf)) as doc:
            for page in doc:
                pix = page.get_pixmap()  # render page to an image
                page_debug_path = str(output_path / f"page-{page.number}.png")
                pix.save(page_debug_path)  # store image as a PNG
                debug_image_paths.append(page_debug_path)

    return text, debug_image_paths