import gradio as gr
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the fine-tuned BLIP captioning model and its processor from the Hugging Face Hub
processor = BlipProcessor.from_pretrained("zeddotes/blip-computer-thoughts")
model = BlipForConditionalGeneration.from_pretrained("zeddotes/blip-computer-thoughts")

def caption_image(image):
    # image is a PIL Image from Gradio
    # Convert to model inputs (pixel values as a PyTorch tensor)
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        # Generate caption token ids from the model
        generated_ids = model.generate(**inputs, max_length=50)
    # Decode the generated token ids into a plain-text caption
    caption = processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption

demo = gr.Interface(
    fn=caption_image,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="My Fine-Tuned BLIP Model",
)

if __name__ == "__main__":
    demo.launch()
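
# A minimal sketch for smoke-testing caption_image without launching the Gradio UI.
# "example.jpg" is a hypothetical path, not part of the original app; point it at
# any local image and uncomment these lines (run them before demo.launch(), which blocks):
#
# img = Image.open("example.jpg").convert("RGB")
# print(caption_image(img))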