import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import gradio as gr
from PIL import Image

# Checkpoint shared by the model weights and the matching processor.
model_name = "google/pix2struct-docvqa-large"

# Both objects are created once, at import time, and reused by every request.
# The tuple is evaluated left to right, so the model is loaded first.
model, processor = (
    Pix2StructForConditionalGeneration.from_pretrained(model_name),
    Pix2StructProcessor.from_pretrained(model_name),
)

def process_image(image_path, question="What does this image say?"):
    """Run the Pix2Struct model on an image and return the decoded answer.

    Args:
        image_path: Location of the image to read; it is handed to
            ``Image.open``. ``None`` or an empty value is treated as
            "no image supplied".
        question: Text prompt passed to the processor together with the
            image. The default is the prompt this function has always used.

    Returns:
        The decoded model output as a string. On any failure (including a
        missing image) a string starting with ``"Error processing image: "``
        is returned instead of raising, so the caller can display it as-is.
    """
    # Fail early with a readable message rather than letting Image.open
    # produce an opaque error for a missing path (e.g. an empty submission).
    if not image_path:
        return "Error processing image: no image provided"

    try:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the converted RGB image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Prepare the model inputs from the image and the question.
        inputs = processor(images=image, text=question, return_tensors="pt")

        # Generate the prediction and decode the first returned sequence.
        output = model.generate(**inputs)
        return processor.decode(output[0], skip_special_tokens=True)

    except Exception as e:
        # Deliberately broad: this function reports every failure as text
        # instead of propagating it to the caller.
        return f"Error processing image: {e}"

def predict(image):
    """Gradio callback: pass the received image on to ``process_image``.

    The value returned by ``process_image`` is handed back unchanged.
    """
    answer = process_image(image)
    return answer

# Gradio app: one image input (delivered to the callback as a file path)
# and one text output.
iface = gr.Interface(
    predict,
    gr.Image(type="filepath"),
    "text",
    title="Image Text Solution",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()