"""Minimal Gradio app: caption an uploaded image (or an image URL) using a
selectable Hugging Face image-to-text model. Pipelines are cached per model
so switching back to a previously used model does not reload it."""

import torch  # kept: transformers pipelines use the torch backend
import gradio as gr
from transformers import pipeline

# Short display name -> Hugging Face model ID.
CAPTION_MODELS = {
    'blip-base': 'Salesforce/blip-image-captioning-base',
    'blip-large': 'Salesforce/blip-image-captioning-large',
    'vit-gpt2-coco-en': 'ydshieh/vit-gpt2-coco-en',
    'blip2-2.7b-fp16': 'Mediocreatmybest/blip2-opt-2.7b-fp16-sharded',
}

# Cache of already-instantiated pipelines, keyed by the short model name,
# so each model is loaded from disk/hub at most once per process.
loaded_models = {}


def caption_image(model_choice, image_input, url_input):
    """Generate a caption for an image.

    Args:
        model_choice: key into ``CAPTION_MODELS`` selecting the pipeline.
        image_input: PIL image from the upload widget, or ``None``.
        url_input: image URL string; used only when no image was uploaded.

    Returns:
        The generated caption, stripped of surrounding whitespace.

    Raises:
        gr.Error: if neither an image nor a non-empty URL was provided.
    """
    # Prefer the uploaded image; fall back to the URL. Reject the case
    # where both are missing instead of crashing inside the pipeline.
    if image_input is not None:
        input_data = image_input
    elif url_input:
        input_data = url_input
    else:
        raise gr.Error("Please provide an image or an image URL.")

    # Lazily create and cache the pipeline for the chosen model.
    captioner = loaded_models.get(model_choice)
    if captioner is None:
        captioner = pipeline(task="image-to-text",
                             model=CAPTION_MODELS[model_choice],
                             max_new_tokens=30,
                             device_map="cpu", use_fast=True)
        loaded_models[model_choice] = captioner

    caption = captioner(input_data)[0]['generated_text']
    return str(caption).strip()


def launch(model_choice, image_input, url_input):
    """Gradio callback: delegate to :func:`caption_image`."""
    return caption_image(model_choice, image_input, url_input)


model_dropdown = gr.Dropdown(choices=list(CAPTION_MODELS.keys()),
                             label='Select Caption Model')
image_input = gr.Image(type="pil", label="Input Image")
url_input = gr.Text(label="Input URL")

iface = gr.Interface(launch,
                     inputs=[model_dropdown, image_input, url_input],
                     outputs="text")

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()