"""Gradio demo that asks a Pix2Struct DocVQA model a question about an image."""

import logging

import gradio as gr
import torch  # noqa: F401  (not referenced directly; kept from the original file)
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

logger = logging.getLogger(__name__)

# Load model and processor once at import time so every request reuses them.
model_name = "google/pix2struct-docvqa-large"
model = Pix2StructForConditionalGeneration.from_pretrained(model_name)
processor = Pix2StructProcessor.from_pretrained(model_name)

# Question sent to the model when the caller does not supply one.
DEFAULT_QUESTION = "What does this image say?"


def process_image(image_path, question: str = DEFAULT_QUESTION) -> str:
    """Run the model on the image at *image_path* and return the decoded text.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
            ``None`` is treated as "no image supplied".
        question: Text prompt passed to the processor alongside the image.
            Defaults to the prompt the original script hard-coded.

    Returns:
        The decoded model output, or a string starting with
        ``"Error processing image: "`` if anything went wrong. Errors are
        returned rather than raised so the UI can display them.
    """
    if image_path is None:
        # Without this guard the failure surfaces as an opaque error from
        # inside PIL; report the real cause instead.
        return "Error processing image: no image provided"

    try:
        # The context manager releases the underlying file deterministically;
        # convert() returns an independent in-memory copy that outlives it.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Prepare the input
        inputs = processor(images=image, text=question, return_tensors="pt")

        # Generate prediction
        output = model.generate(**inputs)

        # Decode the first (only) generated sequence
        return processor.decode(output[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: keep returning the message to the user, but record the
        # traceback so the failure is not silently lost.
        logger.exception("Failed to process image %r", image_path)
        return f"Error processing image: {e}"


def predict(image) -> str:
    """Handle image input for Gradio by delegating to ``process_image``."""
    # NOTE(review): with type="filepath" Gradio presumably passes a path
    # string, or None when no image was submitted -- confirm against the
    # installed Gradio version.
    return process_image(image)


# Gradio app
iface = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="filepath"),
    outputs="text",
    title="Image Text Solution",
)

if __name__ == "__main__":
    iface.launch()