import gradio as gr
import spaces
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np

# Load model and processor
model_name = "oddadmix/Qaari-0.1-Urdu-OCR-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_name)
max_tokens = 2000


@spaces.GPU
def perform_ocr(image):
    """Process an uploaded image and extract its text using the OCR model."""
    # Guard against an empty upload (Gradio passes None when the image is cleared).
    if image is None or not np.any(image):
        return "Error: no image provided."

    image = Image.fromarray(image)
    # Save to a uniquely named temporary file so qwen_vl_utils can read it back.
    src = str(uuid.uuid4()) + ".png"
    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
    image.save(src)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{src}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    try:
        # Process inputs
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        # Generate text
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
        # Strip the prompt tokens so only the newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # Cleanup: always remove the temporary file, even if generation fails.
        os.remove(src)

    return output_text


# Create Gradio interface
with gr.Blocks(title="Qaari-0.1-Urdu-OCR-2B Urdu OCR") as demo:
    gr.Markdown("# Qaari-0.1-Urdu-OCR-2B Urdu OCR")
    gr.Markdown("Upload an image to extract Urdu text in real time. This model is specialized for Urdu document OCR.")

    with gr.Row():
        with gr.Column(scale=1):
            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")

            # Example gallery
            gr.Examples(
                examples=[
                    ["1.jpg"],
                    ["2.jpg"],
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4,
            )

            # Submit button
            submit_btn = gr.Button("Extract Text")

        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True, rtl=True)

            # Model details
            with gr.Accordion("Model Information", open=False):
                gr.Markdown("""
                **Model:** Qaari-0.1-Urdu-OCR-2B
                **Description:** Urdu OCR model based on the Qwen2-VL architecture
                **Size:** 2B parameters
                **Max output:** up to 2000 new tokens per request (this app's setting)
                """)

    # Set up processing flow: run OCR on button click or whenever the image changes
    submit_btn.click(fn=perform_ocr, inputs=image_input, outputs=output)
    image_input.change(fn=perform_ocr, inputs=image_input, outputs=output)

demo.launch()