Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

olmocr-demo / app.py

leonarb

Update app.py

af75cff verified 4 months ago

raw

history blame

4.24 kB

	import base64
	import html
	import json
	import os
	import tempfile
	from io import BytesIO

	import gradio as gr
	import torch
	from ebooklib import epub
	from PIL import Image
	from PyPDF2 import PdfReader
	from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

	from olmocr.data.renderpdf import render_pdf_to_base64png
	from olmocr.prompts import build_finetuning_prompt
	from olmocr.prompts.anchor import get_anchor_text

	# Redirect the Hugging Face and Torch caches to a directory under /tmp,
	# which is expected to be writable in the hosting environment.
	# NOTE(review): these variables are assigned after torch/transformers were
	# imported above -- confirm the libraries still honour them at this point.
	cache_dir = "/tmp/huggingface_cache"
	os.environ.update({"HF_HOME": cache_dir, "TORCH_HOME": cache_dir})
	os.makedirs(cache_dir, exist_ok=True)

	# Load the olmOCR model and its processor once, at import time.
	#
	# The model runs in bfloat16 on CUDA when a GPU is available and in
	# float32 on CPU otherwise.
	#
	# `cache_dir` is passed explicitly: HF_HOME is only assigned after
	# `transformers` has been imported, so the library may already have
	# resolved its default cache location and would otherwise ignore the
	# writable directory configured above.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	    "allenai/olmOCR-7B-0225-preview",
	    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
	    cache_dir=cache_dir,
	).eval().to(device)
	processor = AutoProcessor.from_pretrained(
	    "Qwen/Qwen2-VL-7B-Instruct",
	    cache_dir=cache_dir,
	)

	def _extract_natural_text(raw_output):
	    """Return the page text carried by a raw model generation.

	    Per the olmOCR model card the checkpoint answers with a JSON object
	    whose ``natural_text`` field holds the transcription.  When the
	    generation parses as such an object that field is returned (``""`` if
	    it is null).  Anything else -- plain text, or JSON cut short by the
	    token limit -- is returned unchanged so no output is ever lost.
	    """
	    try:
	        parsed = json.loads(raw_output)
	    except (TypeError, ValueError):
	        return raw_output
	    if isinstance(parsed, dict) and "natural_text" in parsed:
	        natural_text = parsed["natural_text"]
	        return "" if natural_text is None else str(natural_text)
	    return raw_output


	def ocr_page(pdf_path, page_num):
	    """Run olmOCR on a single page of a PDF and return its text.

	    Args:
	        pdf_path: Filesystem path of the PDF to read.
	        page_num: Zero-based page index; it is converted to the one-based
	            page number expected by the olmocr helpers.

	    Returns:
	        The transcribed text of the page as a string, or ``""`` when the
	        model produced nothing.
	    """
	    # The olmocr helpers take one-based page numbers.
	    page_number = page_num + 1
	    image_b64 = render_pdf_to_base64png(pdf_path, page_number, target_longest_image_dim=1024)
	    anchor_text = get_anchor_text(pdf_path, page_number, pdf_engine="pdfreport", target_length=4000)
	    prompt = build_finetuning_prompt(anchor_text)

	    messages = [{
	        "role": "user",
	        "content": [
	            {"type": "text", "text": prompt},
	            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
	        ],
	    }]

	    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))
	    inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
	    inputs = {k: v.to(device) for k, v in inputs.items()}

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            temperature=0.8,
	            max_new_tokens=1024,
	            do_sample=True,
	        )

	    # generate() returns prompt + continuation; keep only the new tokens.
	    prompt_len = inputs["input_ids"].shape[1]
	    new_tokens = outputs[:, prompt_len:]
	    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
	    if not decoded:
	        return ""
	    return _extract_natural_text(decoded[0])

	def _read_upload_bytes(pdf_file):
	    """Return the raw bytes of an uploaded PDF.

	    Accepts a readable file-like object, a filesystem path, or an object
	    exposing its path as ``.name``.

	    Raises:
	        ValueError: If no file was supplied.
	    """
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")
	    if hasattr(pdf_file, "read"):
	        return pdf_file.read()
	    source_path = getattr(pdf_file, "name", pdf_file)
	    with open(source_path, "rb") as source:
	        return source.read()


	def _text_to_xhtml(text):
	    """Render plain OCR text as escaped XHTML paragraphs.

	    Blank lines separate paragraphs and single newlines become ``<br/>``.
	    Markup characters are escaped so that OCR output containing ``<`` or
	    ``&`` cannot corrupt the chapter document.
	    """
	    paragraphs = [part.strip() for part in (text or "").split("\n\n") if part.strip()]
	    if not paragraphs:
	        return "<p></p>"
	    return "".join(
	        "<p>{}</p>".format(html.escape(part, quote=False).replace("\n", "<br/>"))
	        for part in paragraphs
	    )


	def convert_pdf_to_epub(pdf_file, title, author, language):
	    """Convert an uploaded PDF into an EPUB with one chapter per page.

	    The first page is rendered as the cover image and every page is OCR'd
	    with ``ocr_page``.

	    Args:
	        pdf_file: The uploaded PDF, as a readable file-like object or a
	            filesystem path.
	        title: EPUB title; ``"Untitled"`` is used when empty.
	        author: EPUB author; omitted from the metadata when empty.
	        language: Language code for the book and its chapters; ``"en"`` is
	            used when empty.

	    Returns:
	        A ``(epub_path, epub_bytes)`` tuple: the path of the written EPUB
	        file and its contents.

	    Raises:
	        ValueError: If ``pdf_file`` is ``None``.
	    """
	    pdf_bytes = _read_upload_bytes(pdf_file)
	    language = language or "en"

	    # The olmocr helpers need a real file on disk, so spool the upload.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
	        tmp_pdf.write(pdf_bytes)
	        tmp_pdf_path = tmp_pdf.name

	    try:
	        num_pages = len(PdfReader(tmp_pdf_path).pages)

	        book = epub.EpubBook()
	        book.set_title(title or "Untitled")
	        if author:
	            book.add_author(author)
	        book.set_language(language)

	        # Set cover from page 1. The renderer yields PNG data, so the cover
	        # file is named accordingly.
	        cover_image_b64 = render_pdf_to_base64png(tmp_pdf_path, 1, target_longest_image_dim=1024)
	        book.set_cover("cover.png", base64.b64decode(cover_image_b64))

	        # Add OCR'd pages as chapters
	        for page_index in range(num_pages):
	            page_label = page_index + 1
	            text = ocr_page(tmp_pdf_path, page_index)
	            chapter = epub.EpubHtml(
	                title=f"Page {page_label}",
	                file_name=f"page_{page_label}.xhtml",
	                lang=language,
	            )
	            chapter.content = f"<h1>Page {page_label}</h1>{_text_to_xhtml(text)}"
	            book.add_item(chapter)
	            book.spine.append(chapter)

	        # Finalize EPUB
	        book.add_item(epub.EpubNcx())
	        book.add_item(epub.EpubNav())

	        # Write into a fresh directory so concurrent conversions cannot
	        # overwrite each other while the file keeps the name "output.epub".
	        output_dir = tempfile.mkdtemp(prefix="olmocr_epub_")
	        epub_path = os.path.join(output_dir, "output.epub")
	        epub.write_epub(epub_path, book, {})
	    finally:
	        # Best-effort removal of the spooled PDF; a cleanup failure must not
	        # hide the real outcome of the conversion.
	        try:
	            os.remove(tmp_pdf_path)
	        except OSError:
	            pass

	    with open(epub_path, "rb") as f:
	        return epub_path, f.read()

	def interface_fn(pdf, title, author, language):
	    """Gradio callback: convert the uploaded PDF and return the EPUB path.

	    Delegates to ``convert_pdf_to_epub`` and discards the second element of
	    its result, returning only the path of the written file.
	    """
	    epub_location, _epub_payload = convert_pdf_to_epub(pdf, title, author, language)
	    return epub_location

	# Gradio front end: one PDF upload plus three metadata text boxes in,
	# one downloadable EPUB file out. Flagging is turned off.
	demo = gr.Interface(
	    title="PDF to EPUB Converter (olmOCR)",
	    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
	    fn=interface_fn,
	    inputs=[
	        gr.File(file_types=[".pdf"], label="Upload PDF"),
	        gr.Textbox(placeholder="e.g. Understanding AI", label="EPUB Title"),
	        gr.Textbox(placeholder="e.g. Allen AI", label="Author"),
	        gr.Textbox(value="en", placeholder="e.g. en", label="Language"),
	    ],
	    outputs=gr.File(label="Download EPUB"),
	    allow_flagging="never",
	)

	# Launch the app only when this file is executed as a script, not when it
	# is imported.
	if __name__ == "__main__":
	    demo.launch(share=True)