taprosoft
feat: enable formula for MinerU
a2b4d23
raw
history blame
2.49 kB
import base64
import os
import re
from pathlib import Path
import pymupdf
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.tools.common import do_parse, prepare_env
from .settings import ENABLE_DEBUG_MODE
# Root directory under which per-document MinerU output folders are created.
MINERU_DEBUG_PATH = Path("/tmp/mineru")
# parents=True so the import does not fail when the parent directory is absent.
MINERU_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
def read_fn(path):
    """Read ``path`` through a ``FileBasedDataReader`` and return the result.

    The reader is constructed with ``MINERU_DEBUG_PATH`` as its argument.

    Args:
        path: Location of the file to read, forwarded to the reader's
            ``read`` method.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for ``path``.
    """
    return FileBasedDataReader(MINERU_DEBUG_PATH).read(path)
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    Args:
        image_path: Path of the file to encode; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, as a ``str``.
    """
    with open(image_path, mode="rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline locally stored Markdown images as base64 ``data:`` URIs.

    Every ``![alt](target)`` reference whose target resolves to an existing
    file under ``image_dir_path`` is rewritten to
    ``![target](data:<mime>;base64,<payload>)``. Note that the original alt
    text is replaced by the target path.

    References that cannot be embedded (remote URLs, targets that are
    already data URIs, or paths that do not exist on disk) are left exactly
    as they were instead of raising ``FileNotFoundError``.

    Args:
        markdown_text: Markdown source to process.
        image_dir_path: Directory that relative image targets are resolved
            against (``str`` or path-like).

    Returns:
        The Markdown text with every embeddable image reference inlined.
    """
    import mimetypes  # function-scope import keeps this block self-contained

    pattern = r"\!\[(?:[^\]]*)\]\(([^)]+)\)"

    def replace(match):
        relative_path = match.group(1)
        # Remote or already-inlined images have nothing to read from disk.
        if relative_path.startswith(("http://", "https://", "data:")):
            return match.group(0)

        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            return match.group(0)

        # Label the payload with its real type; fall back to the historical
        # "image/jpeg" when the extension is unknown or not an image type.
        mime_type = mimetypes.guess_type(full_path)[0] or ""
        if not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
def do_process_mineru(input_path, output_dir):
    """Run the MinerU parse step for ``input_path``, writing under ``output_dir``.

    The input is loaded with ``read_fn`` and handed to ``do_parse`` using the
    "auto" parse method, with formula and table handling enabled. Layout
    bounding boxes are drawn only when ``ENABLE_DEBUG_MODE`` is set.

    Args:
        input_path: Path of the document to parse.
        output_dir: Directory passed to ``prepare_env`` and ``do_parse``.

    Returns:
        A ``(local_md_dir, file_name)`` tuple: the second value returned by
        ``prepare_env``, and the stem of ``input_path``.
    """
    parse_method = "auto"
    stem = Path(input_path).stem
    out_root = Path(output_dir)

    pdf_bytes = read_fn(input_path)
    local_md_dir = prepare_env(out_root, stem, parse_method)[1]

    parse_flags = {
        "debug_able": False,
        "f_dump_orig_pdf": False,
        "f_draw_layout_bbox": ENABLE_DEBUG_MODE,
        "f_draw_char_bbox": False,
        "formula_enable": True,
        "table_enable": True,
    }
    do_parse(out_root, stem, pdf_bytes, [], parse_method, **parse_flags)

    return local_md_dir, stem
def convert_mineru(path: str, file_name: str):
    """Convert a document with MinerU; return its Markdown and debug images.

    Args:
        path: Location of the input document, forwarded to
            ``do_process_mineru``.
        file_name: Name of the per-document output folder created under
            ``MINERU_DEBUG_PATH``.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the generated
        Markdown with local images inlined as base64 data URIs.
        ``debug_image_paths`` lists one PNG path per page of the
        ``*_layout.pdf`` file, and is empty when that file was not produced.
    """
    debug_image_paths = []
    output_path = MINERU_DEBUG_PATH / file_name
    output_path.mkdir(parents=True, exist_ok=True)

    # Outputs are named after the stem that do_process_mineru derived from
    # ``path``, so use the name it returns rather than assuming it equals
    # the caller-supplied ``file_name``.
    local_md_dir, parsed_name = do_process_mineru(path, output_path)
    local_md_dir = Path(local_md_dir)

    # Explicit encoding: do not depend on the platform's locale default.
    with open(local_md_dir / f"{parsed_name}.md", "r", encoding="utf-8") as file:
        text = file.read()
    text = replace_image_with_base64(text, local_md_dir)

    layout_pdf = local_md_dir / f"{parsed_name}_layout.pdf"
    if layout_pdf.exists():
        # Context manager closes the document even if rendering fails.
        with pymupdf.open(str(layout_pdf)) as doc:
            for page in doc:
                pix = page.get_pixmap()  # render the page to an image
                page_debug_path = str(output_path / f"page-{page.number}.png")
                pix.save(page_debug_path)  # store the image as a PNG
                debug_image_paths.append(page_debug_path)

    return text, debug_image_paths