import gradio as gr
import torch
import base64
import fitz # PyMuPDF
import tempfile
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from latex2mathml.converter import convert as latex_to_mathml
import markdown2
import html
import json
import re
# Load the OCR model and its processor once, at import time.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
# Inference only: put the module in evaluation mode (eval() returns the
# same module, so no rebinding is needed).
model.eval()
# The processor is loaded from a different identifier than the weights.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Use the GPU when CUDA is available; otherwise stay on the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
model.to(device)
def convert_latex(text: str, *, converter=None) -> str:
    r"""Replace LaTeX math delimited by ``\( \)`` or ``\[ \]`` with MathML.

    Inline spans (``\(...\)``) are processed first, then display spans
    (``\[...\]``). Display spans may contain newlines; inline spans may not.
    Each formula body (without its delimiters) is handed to the converter
    and the match is replaced by whatever the converter returns.

    Args:
        text: Text that may contain LaTeX math spans.
        converter: Optional callable taking the LaTeX source of one formula
            and returning its replacement markup. Defaults to the
            module-level ``latex_to_mathml``.

    Returns:
        ``text`` with every recognised math span replaced. If the converter
        raises for a span, that span is kept verbatim (delimiters included)
        but HTML-escaped, so one bad formula cannot break the document.
    """
    # Resolve the default lazily so a caller-supplied converter never
    # touches the module-level name.
    to_markup = latex_to_mathml if converter is None else converter

    def replacer(match):
        # NOTE(review): this previously returned an empty string, which
        # deleted every formula. Confirm whether the converter output should
        # also be wrapped in extra markup (e.g. a styled element).
        try:
            return to_markup(match.group(1))
        except Exception:
            # Best-effort fallback: show the raw LaTeX safely instead of
            # failing the whole conversion.
            return html.escape(match.group(0))

    text = re.sub(r'\\\((.*?)\\\)', replacer, text)
    # Display math commonly spans several lines, so let "." match newlines.
    text = re.sub(r'\\\[(.*?)\\\]', replacer, text, flags=re.DOTALL)
    return text
def stitch_paragraphs(pages):
joined = "\n".join(pages)
return re.sub(r"(? 0:
try:
raw = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
parsed = json.loads(raw)
decoded = parsed.get("natural_text", raw)
except:
decoded = raw
except Exception as e:
decoded = f"[Error on page {page_num}: {str(e)}]"
# Save first image as cover
if page_num == 1:
cover_img_html = f'