import gradio as gr import torch import base64 import fitz # PyMuPDF import tempfile from io import BytesIO from PIL import Image from transformers import AutoProcessor, Qwen2VLForConditionalGeneration from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.prompts.anchor import get_anchor_text from latex2mathml.converter import convert as latex_to_mathml import markdown2 import html import json import re # Load model and processor model = Qwen2VLForConditionalGeneration.from_pretrained( "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16 ).eval() processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) def convert_latex(text): def replacer(match): try: return f"

{latex_to_mathml(match.group(1))}

" except: return html.escape(match.group(0)) text = re.sub(r'\\\((.*?)\\\)', replacer, text) text = re.sub(r'\\\[(.*?)\\\]', replacer, text) return text def stitch_paragraphs(pages): joined = "\n".join(pages) return re.sub(r"(? 0: try: raw = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip() parsed = json.loads(raw) decoded = parsed.get("natural_text", raw) except: decoded = raw except Exception as e: decoded = f"[Error on page {page_num}: {str(e)}]" # Save first image as cover if page_num == 1: cover_img_html = f' cover

' # Add TOC-based headers if any header_html = "" if page_num in toc_by_page: for level, header in toc_by_page[page_num]: tag = f"h{min(level, 6)}" header_html += f"<{tag}>{html.escape(header)}\n" pages_output.append(f"{header_html}\n{decoded}") # Join paragraphs across pages stitched = stitch_paragraphs(pages_output) mathml = convert_latex(stitched) rendered = markdown2.markdown(mathml) html_doc = f""" {html.escape(title)}

{html.escape(title)}

{html.escape(author)}

{cover_img_html} {rendered} """ with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp: tmp.write(html_doc) return tmp.name iface = gr.Interface( fn=process_pdf_to_html, inputs=[ gr.File(label="Upload PDF", file_types=[".pdf"]), gr.Textbox(label="HTML Title"), gr.Textbox(label="Author(s)") ], outputs=gr.File(label="Download HTML"), title="PDF to HTML Converter (Refined with olmOCR)", description="Uploads a PDF, extracts text via vision+prompt, stitches paragraphs, adds headers, and converts math and markdown to styled HTML.", allow_flagging="never" ) if __name__ == "__main__": iface.launch( server_name="0.0.0.0", server_port=7860, share=True, debug=True, allowed_paths=["/tmp"] )