# import torch
# from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
# import gradio as gr
# from PIL import Image

# # Load model and processor
# model_name = "google/pix2struct-docvqa-large"
# model = Pix2StructForConditionalGeneration.from_pretrained(model_name)
# processor = Pix2StructProcessor.from_pretrained(model_name)

# def process_image(image_path):
#     try:
#         # Load the image
#         image = Image.open(image_path).convert("RGB")
#         # Prepare the input
#         inputs = processor(images=image, text="What does this image say?", return_tensors="pt")
#         # Generate prediction
#         output = model.generate(**inputs)
#         # Decode the output
#         solution = processor.decode(output[0], skip_special_tokens=True)
#         return solution
#     except Exception as e:
#         return f"Error processing image: {str(e)}"

# def predict(image):
#     """Handles image input for Gradio."""
#     return process_image(image)

# # Gradio app
# iface = gr.Interface(
#     fn=predict,
#     inputs=gr.Image(type="filepath"),
#     outputs="text",
#     title="Image Text Solution"
# )

# if __name__ == "__main__":
#     iface.launch()