"""Gradio demo: visual question answering with a BLIP VQA model.

Loads a BLIP processor/model pair once at import time and serves a simple
image + question -> answer web interface.
"""
import gradio as gr  # BUG FIX: was never imported despite gr.Interface below
import torch  # BUG FIX: was never imported despite torch.float16 below
from transformers import BlipProcessor, BlipForQuestionAnswering

# BUG FIX: the original hard-coded "cuda:0" + float16 for the *inputs* while the
# model stayed on CPU/float32 — a guaranteed device/dtype mismatch in generate(),
# and a crash on CPU-only hosts. Pick the device once and keep model and inputs
# in agreement (fp16 only makes sense on GPU).
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE.startswith("cuda") else torch.float32

# Initialize the model and processor once at module load.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("ManishThota/InstructBlip-VQA")
model = model.to(DEVICE, DTYPE)
model.eval()  # inference-only app; disable dropout etc.


def predict_answer(image, question):
    """Answer *question* about *image* and return the generated text.

    Args:
        image: a PIL.Image (Gradio supplies this when type="pil").
        question: the natural-language question as a string.

    Returns:
        The model's decoded answer string.
    """
    # Convert PIL image to RGB if not already (model expects 3 channels).
    image = image.convert("RGB")

    # Prepare inputs on the same device/dtype as the model.
    encoding = processor(image, question, return_tensors="pt").to(DEVICE, DTYPE)

    with torch.no_grad():  # inference only — skip autograd bookkeeping
        out = model.generate(**encoding)
    return processor.decode(out[0], skip_special_tokens=True)


def gradio_predict(image, question):
    """Thin Gradio-facing wrapper around predict_answer."""
    return predict_answer(image, question)


# Define the Gradio interface.
# BUG FIX: gr.inputs.* / gr.outputs.* were removed in Gradio 3.x; the modern
# components are gr.Image / gr.Textbox. type="pil" matches predict_answer's
# expectation of a PIL image.
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Answer"),
    title="Visual Question Answering",
    description="This model answers questions based on the content of an image. Powered by BLIP.",
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()