|  | import os | 
					
						
						|  | import sys | 
					
						
						|  |  | 
					
						
						|  | if "APP_PATH" in os.environ: | 
					
						
						|  | app_path = os.path.abspath(os.environ["APP_PATH"]) | 
					
						
						|  | if os.getcwd() != app_path: | 
					
						
						|  |  | 
					
						
						|  | os.chdir(app_path) | 
					
						
						|  | if app_path not in sys.path: | 
					
						
						|  | sys.path.append(app_path) | 
					
						
						|  |  | 
					
						
						|  | import gradio as gr | 
					
						
						|  |  | 
					
						
						|  | from marker.settings import settings | 
					
						
						|  |  | 
					
						
						|  | import base64 | 
					
						
						|  | import io | 
					
						
						|  | import re | 
					
						
						|  | from typing import Any, Dict | 
					
						
						|  |  | 
					
						
						|  | import pypdfium2 | 
					
						
						|  | from PIL import Image | 
					
						
						|  |  | 
					
						
						|  | from marker.converters.pdf import PdfConverter | 
					
						
						|  | from marker.models import create_model_dict | 
					
						
						|  | from marker.config.parser import ConfigParser | 
					
						
						|  | from marker.output import text_from_rendered | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def load_models(): | 
					
						
						|  | return create_model_dict() | 
					
						
						|  |  | 
					
						
						|  | def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict): | 
					
						
						|  | config_dict = config_parser.generate_config_dict() | 
					
						
						|  | config_dict["pdftext_workers"] = 1 | 
					
						
						|  | converter = PdfConverter( | 
					
						
						|  | config=config_dict, | 
					
						
						|  | artifact_dict=model_dict, | 
					
						
						|  | processor_list=config_parser.get_processors(), | 
					
						
						|  | renderer=config_parser.get_renderer() | 
					
						
						|  | ) | 
					
						
						|  | return converter(fname) | 
					
						
						|  |  | 
					
						
						|  | def open_pdf(pdf_file): | 
					
						
						|  | return pypdfium2.PdfDocument(pdf_file) | 
					
						
						|  |  | 
					
						
						|  | def count_pdf(pdf_file): | 
					
						
						|  | doc = open_pdf(pdf_file) | 
					
						
						|  | return len(doc) | 
					
						
						|  |  | 
					
						
						|  | def get_page_image(pdf_file, page_num, dpi=96): | 
					
						
						|  | doc = open_pdf(pdf_file) | 
					
						
						|  | renderer = doc.render( | 
					
						
						|  | pypdfium2.PdfBitmap.to_pil, | 
					
						
						|  | page_indices=[page_num - 1], | 
					
						
						|  | scale=dpi / 72, | 
					
						
						|  | ) | 
					
						
						|  | png = list(renderer)[0] | 
					
						
						|  | png_image = png.convert("RGB") | 
					
						
						|  | return png_image | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def img_to_html(img, img_alt): | 
					
						
						|  | img_bytes = io.BytesIO() | 
					
						
						|  | img.save(img_bytes, format="PNG") | 
					
						
						|  | img_bytes = img_bytes.getvalue() | 
					
						
						|  | encoded = base64.b64encode(img_bytes).decode() | 
					
						
						|  | img_html = f'<img src="data:image/png;base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">' | 
					
						
						|  | return img_html | 
					
						
						|  |  | 
					
						
						|  | def markdown_insert_images(markdown, images): | 
					
						
						|  | image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown) | 
					
						
						|  |  | 
					
						
						|  | for image in image_tags: | 
					
						
						|  | image_markdown = image[0] | 
					
						
						|  | image_alt = image[1] | 
					
						
						|  | image_path = image[2] | 
					
						
						|  | if image_path in images: | 
					
						
						|  | markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt)) | 
					
						
						|  | return markdown | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if 'model_dict' not in globals(): | 
					
						
						|  | model_dict = load_models() | 
					
						
						|  |  | 
					
						
						|  | with gr.Blocks(title="Marker") as demo: | 
					
						
						|  | gr.Markdown(""" | 
					
						
						|  | # Marker Demo | 
					
						
						|  |  | 
					
						
						|  | This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc. | 
					
						
						|  |  | 
					
						
						|  | Find the project [here](https://github.com/VikParuchuri/marker). | 
					
						
						|  | """) | 
					
						
						|  |  | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | in_file = gr.File(label="PDF file:", file_types=[".pdf"]) | 
					
						
						|  | in_num = gr.Slider(label="PDF file page number", minimum=1, maximum=1, value=1, step=1, visible=False) | 
					
						
						|  | in_img = gr.Image(label="PDF file (preview)", type="pil", sources=None, visible=False) | 
					
						
						|  |  | 
					
						
						|  | page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"") | 
					
						
						|  | output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown") | 
					
						
						|  |  | 
					
						
						|  | force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages") | 
					
						
						|  | debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information") | 
					
						
						|  | use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing") | 
					
						
						|  | strip_existing_ocr_ckb = gr.Checkbox(label="Strip existing OCR", value=False, info="Strip existing OCR text from the PDF and re-OCR.") | 
					
						
						|  | run_marker_btn = gr.Button("Run Marker", interactive=False) | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | result_md = gr.Markdown(label="Result markdown", visible=False) | 
					
						
						|  | result_json = gr.JSON(label="Result json", visible=False) | 
					
						
						|  | result_html = gr.Markdown(label="Result html", visible=False) | 
					
						
						|  | debug_img_pdf = gr.Image(label="PDF debug image", visible=False) | 
					
						
						|  | debug_img_layout = gr.Image(label="Layout debug image", visible=False) | 
					
						
						|  |  | 
					
						
						|  | def show_image(file, num=1): | 
					
						
						|  | if file is None: | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(visible=False, maximum=1, value=num), | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | "0-0"] | 
					
						
						|  | count = count_pdf(file) | 
					
						
						|  | img = get_page_image(file, num) | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(visible=True, maximum=count), | 
					
						
						|  | gr.update(visible=True, value=img), | 
					
						
						|  | f"0-{num-1}"] | 
					
						
						|  |  | 
					
						
						|  | in_file.clear( | 
					
						
						|  | fn=show_image, | 
					
						
						|  | inputs=[in_file], | 
					
						
						|  | outputs=[in_num, in_img, page_range_txt] | 
					
						
						|  | ) | 
					
						
						|  | in_file.upload( | 
					
						
						|  | fn=show_image, | 
					
						
						|  | inputs=[in_file], | 
					
						
						|  | outputs=[in_num, in_img, page_range_txt] | 
					
						
						|  | ) | 
					
						
						|  | in_num.change( | 
					
						
						|  | fn=show_image, | 
					
						
						|  | inputs=[in_file, in_num], | 
					
						
						|  | outputs=[in_num, in_img, page_range_txt] | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | def check_page_range(page_range, file): | 
					
						
						|  | count = count_pdf(file) if file is not None else 1 | 
					
						
						|  | if not re.match(r"^(\d+(-\d+)?)?$", page_range): | 
					
						
						|  | gr.Warning(f"Invalid format. Please use 0-{count-1}", duration=0) | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(info=f"format 0-{count-1}"), | 
					
						
						|  | gr.update(interactive=False)] | 
					
						
						|  | else: | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(info=f"format 0-{count-1}"), | 
					
						
						|  | gr.update(interactive=True)] | 
					
						
						|  | page_range_txt.change( | 
					
						
						|  | fn=check_page_range, | 
					
						
						|  | inputs=[page_range_txt, in_file], | 
					
						
						|  | outputs=[page_range_txt, run_marker_btn] | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def run_marker_img(filename, page_range, force_ocr, output_format, debug, use_llm, strip_existing_ocr): | 
					
						
						|  | cli_options = { | 
					
						
						|  | "output_format": output_format, | 
					
						
						|  | "page_range": page_range, | 
					
						
						|  | "force_ocr": force_ocr, | 
					
						
						|  | "debug": debug, | 
					
						
						|  | "output_dir": settings.DEBUG_DATA_FOLDER if debug else None, | 
					
						
						|  | "use_llm": use_llm, | 
					
						
						|  | "strip_existing_ocr": strip_existing_ocr | 
					
						
						|  | } | 
					
						
						|  | config_parser = ConfigParser(cli_options) | 
					
						
						|  | rendered = convert_pdf( | 
					
						
						|  | filename, | 
					
						
						|  | config_parser | 
					
						
						|  | ) | 
					
						
						|  | gr_debug_pdf = gr.update(visible=False) | 
					
						
						|  | gr_debug_lay = gr.update(visible=False) | 
					
						
						|  | if debug: | 
					
						
						|  | debug_data_path = rendered.metadata.get("debug_data_path") | 
					
						
						|  | if debug_data_path: | 
					
						
						|  | page_range = config_parser.generate_config_dict()["page_range"] | 
					
						
						|  | first_page = page_range[0] if page_range else 0 | 
					
						
						|  |  | 
					
						
						|  | pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png") | 
					
						
						|  | img = Image.open(pdf_image_path) | 
					
						
						|  | gr_debug_pdf = gr.update(visible=True, value=img) | 
					
						
						|  | layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png") | 
					
						
						|  | img = Image.open(layout_image_path) | 
					
						
						|  | gr_debug_lay = gr.update(visible=True, value=img) | 
					
						
						|  |  | 
					
						
						|  | text, ext, images = text_from_rendered(rendered) | 
					
						
						|  | if output_format == "markdown": | 
					
						
						|  | text = markdown_insert_images(text, images) | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(visible=True, value=text), | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | gr_debug_pdf, | 
					
						
						|  | gr_debug_lay | 
					
						
						|  | ] | 
					
						
						|  | elif output_format == "json": | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | gr.update(visible=True, value=text), | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | gr_debug_pdf, | 
					
						
						|  | gr_debug_lay | 
					
						
						|  | ] | 
					
						
						|  | elif output_format == "html": | 
					
						
						|  | return [ | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | gr.update(visible=False), | 
					
						
						|  | gr.update(visible=True, value=text), | 
					
						
						|  | gr_debug_pdf, | 
					
						
						|  | gr_debug_lay | 
					
						
						|  | ] | 
					
						
						|  |  | 
					
						
						|  | run_marker_btn.click( | 
					
						
						|  | fn=run_marker_img, | 
					
						
						|  | inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, debug_ckb, use_llm_ckb, strip_existing_ocr_ckb], | 
					
						
						|  | outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout] | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
					
						
						|  |  |