import gradio as gr
import spaces
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np
# Load model and processor
model_name = "oddadmix/Qaari-0.1-Urdu-OCR-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_name)

max_tokens = 2000
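# Note: this caps generated tokens (max_new_tokens below), not the model's context window.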
@spaces.GPU  # Needed on ZeroGPU Spaces (this is why `spaces` is imported); a no-op on dedicated GPU hardware
def perform_ocr(image):
    """Process an image and extract Urdu text using the OCR model."""
    # gr.Image returns None when cleared; guard against missing or empty input
    if image is None or not np.any(image):
        return "Error: no image provided."
    image = Image.fromarray(image)
    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
    # Save to a uniquely named temporary file so the image can be referenced by URI
    src = str(uuid.uuid4()) + ".png"
    image.save(src)
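    # Qwen2-VL chat format: the image is passed as a file:// URI alongside the text instruction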
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{src}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    # Process inputs
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
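    # process_vision_info loads the image(s)/video(s) referenced in the messages for the processor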
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    # Generate text
    generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
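    # generate() returns prompt + completion; strip the prompt tokens so only new text is decoded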
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Cleanup
    os.remove(src)
    return output_text
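
# Quick local sanity check (hypothetical file name; bypasses the Gradio UI):
#   img = np.array(Image.open("sample_page.png").convert("RGB"))
#   print(perform_ocr(img))
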
# Create Gradio interface
with gr.Blocks(title="Qaari-0.1-Urdu-OCR-2B") as demo:
    gr.Markdown("# Qaari-0.1-Urdu-OCR-2B: Urdu OCR")
    gr.Markdown("Upload an image to extract Urdu text; OCR runs automatically on upload. This model is specialized for Urdu document OCR.")
    with gr.Row():
        with gr.Column(scale=1):
            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")

            # Example gallery
            gr.Examples(
                examples=[
                    ["1.jpg"],
                    ["2.jpg"]
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4
            )
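            # The example files 1.jpg and 2.jpg are assumed to sit in the Space's root directory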
            # Submit button
            submit_btn = gr.Button("Extract Text")
        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True, rtl=True)

    # Model details
    with gr.Accordion("Model Information", open=False):
        gr.Markdown("""
        **Model:** Qaari-0.1-Urdu-OCR-2B

        **Description:** Urdu OCR model based on the Qwen2-VL architecture

        **Size:** 2B parameters

        **Output limit:** up to 2,000 generated tokens per request (`max_new_tokens`)
        """)
    # Set up processing flow
    submit_btn.click(fn=perform_ocr, inputs=image_input, outputs=output)
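    # Run OCR automatically on upload; .change also fires on clear (image=None), which perform_ocr guards against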
    image_input.change(fn=perform_ocr, inputs=image_input, outputs=output)

demo.launch()