import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch
import os
import spaces  # Import the spaces module


# Hub identifier of the checkpoint served by this app.
_MODEL_ID = "google/paligemma2-3b-pt-224"

# Per-process cache so the multi-gigabyte checkpoint is instantiated at most
# once per interpreter instead of once per request.
_LOADED = {}


def load_model():
    """Load the PaliGemma2 processor and model, authenticating with a Hub token.

    The token is read from the ``HUGGINGFACEHUB_API_TOKEN`` environment
    variable on every call. After the first successful load, the same
    ``(processor, model)`` pair is returned from an in-process cache.

    Returns:
        A ``(processor, model)`` tuple.

    Raises:
        ValueError: If the environment variable is unset or empty.
    """
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    # Checked before the cache so a missing token is reported consistently.
    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )

    cached = _LOADED.get(_MODEL_ID)
    if cached is not None:
        return cached

    # ``token=`` is the current keyword; ``use_auth_token=`` is deprecated in
    # transformers / huggingface_hub.
    processor = AutoProcessor.from_pretrained(_MODEL_ID, token=token)
    model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID, token=token)

    _LOADED[_MODEL_ID] = (processor, model)
    return processor, model


# Task prompt asking PaliGemma to read text; the leading "<image>" marks where
# the image tokens go. NOTE(review): confirm "ocr" is the desired task prefix
# for this checkpoint.
_OCR_PROMPT = "<image>ocr"

# Upper bound on generated tokens; without it ``generate`` falls back to a very
# short default length and truncates longer text.
_MAX_NEW_TOKENS = 256


@spaces.GPU  # Decorate the function that uses the GPU
def process_image(image):
    """Extract text from an image using PaliGemma2.

    Args:
        image: The uploaded picture as a PIL image, or ``None`` when nothing
            was uploaded.

    Returns:
        The text decoded from the newly generated tokens, with surrounding
        whitespace stripped.

    Raises:
        gr.Error: If no image was provided.
        ValueError: Propagated from ``load_model`` when the Hub token is
            missing.
    """
    if image is None:
        # An empty upload field passes None; fail with a readable message
        # instead of an exception from inside the processor.
        raise gr.Error("Please upload an image first.")

    processor, model = load_model()

    # Run on the GPU when one is visible; otherwise stay on CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Normalise palette / alpha / greyscale uploads to three channels.
    rgb_image = image.convert("RGB")

    inputs = processor(
        text=_OCR_PROMPT, images=rgb_image, return_tensors="pt"
    ).to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs, max_new_tokens=_MAX_NEW_TOKENS, do_sample=False
        )

    # ``generate`` returns prompt + continuation; keep only the continuation so
    # the prompt is not echoed back in the result.
    new_tokens = generated_ids[0][prompt_len:]
    text = processor.decode(new_tokens, skip_special_tokens=True)

    return text.strip()


if __name__ == "__main__":
    # Build the two UI components first, then wire them to the handler.
    image_input = gr.Image(type="pil", label="Upload an image containing text")
    text_output = gr.Textbox(label="Extracted Text")

    demo = gr.Interface(
        fn=process_image,
        inputs=image_input,
        outputs=text_output,
        title="Text Reading from Images using PaliGemma2",
        description="Upload an image containing text and the model will extract the text.",
    )

    # Start serving the interface.
    demo.launch()