"""Thin wrapper around MinerU (magic_pdf) that converts a PDF to markdown.

Besides the markdown text, the layout-debug PDF found next to the markdown
output is rendered to one PNG per page so the detected layout can be
inspected visually.
"""

from pathlib import Path

import pymupdf
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.tools.common import do_parse, prepare_env

# Root directory under which all per-document output and debug images live.
MINERU_DEBUG_PATH = Path("/tmp/mineru")
MINERU_DEBUG_PATH.mkdir(parents=True, exist_ok=True)


def read_fn(path):
    """Read and return the content of *path* via a ``FileBasedDataReader``.

    The reader is constructed with ``MINERU_DEBUG_PATH`` as its argument.
    NOTE(review): presumably relative paths are resolved against that
    directory while absolute paths are read as-is -- confirm against the
    magic_pdf reader implementation.
    """
    disk_rw = FileBasedDataReader(MINERU_DEBUG_PATH)
    return disk_rw.read(path)


def do_process_mineru(input_path, output_dir):
    """Run the MinerU parse pipeline on a single PDF.

    Args:
        input_path: Path of the PDF to parse; its stem is used as the
            document name passed to MinerU.
        output_dir: Directory handed to ``prepare_env`` / ``do_parse`` as the
            output root.

    Returns:
        A ``(local_md_dir, file_name)`` tuple: the markdown directory
        reported by ``prepare_env`` and the stem of *input_path*.
    """
    file_name = Path(input_path).stem
    output_dir = Path(output_dir)
    pdf_data = read_fn(input_path)
    parse_method = "auto"

    # Only the markdown directory is needed by callers; the image directory
    # returned alongside it is not used here.
    _local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)

    do_parse(
        output_dir,
        file_name,
        pdf_data,
        [],
        parse_method,
        debug_able=False,
        f_dump_orig_pdf=False,
        formula_enable=False,
        table_enable=True,
    )
    return local_md_dir, file_name


def convert_mineru(path: str, file_name: str):
    """Convert the PDF at *path* to markdown and render layout debug images.

    Args:
        path: Path of the input PDF.
        file_name: Name used for the per-document output directory under
            ``MINERU_DEBUG_PATH`` and as the base name of the
            ``<file_name>.md`` / ``<file_name>_layout.pdf`` files read back
            from the markdown directory.
            NOTE(review): ``do_process_mineru`` names its output after the
            stem of *path*, so this presumably must equal that stem --
            confirm with callers.

    Returns:
        A ``(text, debug_image_paths)`` tuple: the markdown content and the
        list of PNG paths (as strings), one per page of the layout PDF.
    """
    output_path = MINERU_DEBUG_PATH / file_name
    # parents=True: the debug root may have been removed since import time
    # (e.g. by a /tmp cleaner in a long-running process).
    output_path.mkdir(parents=True, exist_ok=True)

    local_md_dir, _ = do_process_mineru(path, output_path)
    local_md_dir = Path(local_md_dir)

    # Decode explicitly as UTF-8 rather than relying on the locale default,
    # which breaks on non-ASCII content on non-UTF-8 systems.
    with open(local_md_dir / f"{file_name}.md", "r", encoding="utf-8") as file:
        text = file.read()

    debug_image_paths = []
    debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
    # The context manager closes the document (and its file handle) even if
    # rendering or saving a page fails.
    with pymupdf.open(debug_pdf) as doc:
        for page in doc:
            pix = page.get_pixmap()  # render the page to an image
            page_debug_path = str(output_path / f"page-{page.number}.png")
            pix.save(page_debug_path)  # store the image as a PNG
            debug_image_paths.append(page_debug_path)

    return text, debug_image_paths