"""Gradio demo: run PaliGemma2 on an uploaded image and show the generated text."""

import functools
import os
from typing import Optional

import gradio as gr
import spaces  # Provides the @spaces.GPU decorator used below
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint identifier on the Hugging Face Hub.
MODEL_ID = "google/paligemma2-3b-pt-224"

# Environment variable holding the Hugging Face access token.
TOKEN_ENV_VAR = "HUGGINGFACEHUB_API_TOKEN"

# Upper bound on generated tokens. Passed explicitly because the prompt already
# contains the image tokens, so relying on a total-length default is fragile.
MAX_NEW_TOKENS = 128


@functools.lru_cache(maxsize=1)
def load_model():
    """Load the PaliGemma2 processor and model using a Hugging Face token.

    The result is memoised, so repeated calls within the same process reuse
    the already-loaded objects instead of re-instantiating the checkpoint.
    A failed call (missing token) is not cached and is retried next time.

    Returns:
        A ``(processor, model)`` tuple.

    Raises:
        ValueError: If the token environment variable is unset or empty.
    """
    token = os.getenv(TOKEN_ENV_VAR)
    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )

    # `token=` is the current spelling; `use_auth_token=` is deprecated.
    processor = AutoProcessor.from_pretrained(MODEL_ID, token=token)
    model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, token=token)
    return processor, model


@spaces.GPU  # Decorate the function that uses the GPU
def process_image(image: Optional[Image.Image]) -> str:
    """Generate text for ``image`` with PaliGemma2.

    Args:
        image: PIL image supplied by the Gradio input component, or ``None``
            when the user submitted the form without uploading anything.

    Returns:
        The decoded text produced by the model (prompt tokens excluded).

    Raises:
        gr.Error: If no image was provided.
        ValueError: If the Hugging Face token is not configured.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    processor, model = load_model()

    # Run on the GPU when one is visible; otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # NOTE(review): no text prompt is passed to the processor. PaliGemma
    # checkpoints are normally steered with a task prefix - confirm whether
    # one should be supplied for text extraction.
    inputs = processor(images=image, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # `generate` returns prompt + continuation; keep only the continuation.
    new_token_ids = generated_ids[:, prompt_length:]
    return processor.batch_decode(new_token_ids, skip_special_tokens=True)[0]


if __name__ == "__main__":
    iface = gr.Interface(
        fn=process_image,
        inputs=gr.Image(type="pil", label="Upload an image containing text"),
        outputs=gr.Textbox(label="Extracted Text"),
        title="Text Reading from Images using PaliGemma2",
        description="Upload an image containing text and the model will extract the text.",
    )
    iface.launch()