import os

# Cache locations must be set BEFORE importing transformers so the writable
# /tmp paths are picked up (the default cache dir may be read-only here).
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers"
os.environ["HF_HOME"] = "/tmp/.cache/huggingface"

import base64
from io import BytesIO

import gradio as gr
import torch
from ebooklib import epub
from PIL import Image
from PyPDF2 import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Load processor and model once at startup; inference-only, so eval mode.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def extract_text_from_page(pdf_path, page_num):
    """OCR a single PDF page with olmOCR.

    Args:
        pdf_path: Path to the PDF file on disk.
        page_num: 1-based page number to render and transcribe.

    Returns:
        A ``(text, cover)`` tuple where ``text`` is the decoded model output
        and ``cover`` is the page's base64 PNG when ``page_num == 1``,
        otherwise ``None`` (only the first page is used as the EPUB cover).
    """
    # Render the page to a base64-encoded PNG, longest side capped at 1024 px.
    image_base64 = render_pdf_to_base64png(
        pdf_path, page_num, target_longest_image_dim=1024
    )
    image = Image.open(BytesIO(base64.b64decode(image_base64)))

    # Build the olmOCR fine-tuning prompt from the page's anchor text.
    anchor_text = get_anchor_text(
        pdf_path, page_num, pdf_engine="pdfreport", target_length=4000
    )
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=True,
        )

    # Strip the prompt tokens and decode only the newly generated ones.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

    return decoded, image_base64 if page_num == 1 else None


def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
    """Extract text from every page of an uploaded PDF and build an EPUB.

    Args:
        file: Gradio file object for the uploaded PDF (``file.name`` is the
            temp path on disk).
        title: EPUB title, also used for the output file name.
        author: EPUB author metadata.
        language: EPUB language code.

    Returns:
        Path to the generated ``.epub`` file under ``/tmp``.
    """
    file_path = file.name
    reader = PdfReader(file_path)
    num_pages = len(reader.pages)

    all_text = []
    cover_image_data = None
    for page in range(1, num_pages + 1):
        text, cover_image = extract_text_from_page(file_path, page)
        # NOTE(review): the original HTML markup was garbled in the source;
        # reconstructed as a heading + paragraph per page, since the chapter
        # is an EpubHtml document and expects (X)HTML content.
        all_text.append(f"<h2>Page {page}</h2>\n<p>{text}</p>\n")
        if cover_image and not cover_image_data:
            cover_image_data = cover_image  # base64 PNG of page 1

    # Build EPUB
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Add cover image. The renderer produces PNG bytes, so the cover file
    # name must carry a .png extension — the original "cover.jpg" made
    # ebooklib record a JPEG media-type for PNG data.
    if cover_image_data:
        cover_bytes = base64.b64decode(cover_image_data)
        book.set_cover("cover.png", cover_bytes)

    # Create a single chapter holding the text of all pages.
    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
    chapter.content = f"<h1>{title}</h1>\n{''.join(all_text)}"
    book.add_item(chapter)

    book.toc = (epub.Link("chap1.xhtml", title, "chap1"),)
    # Fix: ebooklib's classes are EpubNav and EpubNcx — the original
    # `EpubNavi()` / `EpubNCX()` do not exist and raise AttributeError.
    book.add_item(epub.EpubNav())
    book.add_item(epub.EpubNcx())
    book.spine = ["nav", chapter]

    epub_path = f"/tmp/{title.replace(' ', '_')}.epub"
    epub.write_epub(epub_path, book)
    return epub_path


# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(value="Extracted PDF", label="EPUB Title"),
        gr.Textbox(value="olmOCR", label="Author"),
        gr.Textbox(value="en", label="Language"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
    description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()