import os import gradio as gr from transformers import pipeline import spaces # This module is available when deploying on HF Spaces with ZeroGPU # --- Trending models for image text-to-text tasks --- TRENDING_MODELS = [ "Salesforce/blip2-opt-2.7b", "Salesforce/blip2-flan-t5-xl", "Salesforce/blip-image-captioning-base", "Salesforce/blip-image-captioning-large", "nlpconnect/vit-gpt2-image-captioning", "OFA-Sys/OFA-base", "OFA-Sys/OFA-large", "dandelin/vilt-b32-finetuned-vqa", "dandelin/vilt-b32-mlm", "uclanlp/visualbert-vqa-coco-pre" ] # --- Helper: if the user selects "Custom", then they can enter any model identifier --- def resolve_model(chosen, custom): if chosen == "Custom": return custom.strip() else: return chosen # --- Main inference function --- # If you are using ZeroGPU on Hugging Face Spaces, make sure to set the environment variable USE_GPU=1. # The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space. @spaces.GPU() def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom): # Determine which model identifiers to use. model1_name = resolve_model(model1_choice, model1_custom) model2_name = resolve_model(model2_choice, model2_custom) # Set device to GPU (0) if USE_GPU is enabled; otherwise use CPU (-1) device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1 # Create pipelines for image-to-text. # Note: Many instruction-following image models (e.g. BLIP2) accept a text prompt along with an image. # We use the "image-to-text" task here so that the prompt is taken into account. pipe1 = pipeline("image-to-text", model=model1_name, device=device) pipe2 = pipeline("image-to-text", model=model2_name, device=device) # Run inference on the image with the provided prompt. # Depending on the model, the call signature may vary; here we assume a simple call with (image, prompt). output1 = pipe1(image, prompt) output2 = pipe2(image, prompt) # Extract the generated text. # (Many pipelines return a list of dicts with key 'generated_text'; if not, we simply convert the output to a string.) def extract_text(output): if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]: return output[0]["generated_text"] else: return str(output) result1 = extract_text(output1) result2 = extract_text(output2) # Format results as chat conversations. # Each chatbot conversation is a list of (speaker, message) tuples. chat1 = [("User", prompt), ("Bot", result1)] chat2 = [("User", prompt), ("Bot", result2)] return chat1, chat2 # --- Build the Gradio interface --- # Pre-populated sample prompt. sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response." with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo: gr.Markdown( """ # Image Text-to-Text Comparison Tool Compare two trending image text-to-text (instruction-following) models side-by-side. Select a model from the dropdown (or choose Custom to enter your own model identifier) and see how it describes the image. """ ) with gr.Row(): with gr.Column(scale=1): gr.Markdown("## Input") image_input = gr.Image(label="Upload an Image", type="pil") prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3) with gr.Column(scale=1): gr.Markdown("## Model Selection") with gr.Row(): with gr.Column(): gr.Markdown("### Model 1") model1_choice = gr.Dropdown( choices=TRENDING_MODELS + ["Custom"], value=TRENDING_MODELS[0], label="Select Model 1" ) model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name") with gr.Column(): gr.Markdown("### Model 2") model2_choice = gr.Dropdown( choices=TRENDING_MODELS + ["Custom"], value=TRENDING_MODELS[1], label="Select Model 2" ) model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name") compare_button = gr.Button("Compare Models") gr.Markdown("## Chatbot Outputs (Side-by-Side)") with gr.Row(): chatbot1 = gr.Chatbot(label="Model 1 Chatbot") chatbot2 = gr.Chatbot(label="Model 2 Chatbot") compare_button.click( fn=compare_image_to_text_models, inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom], outputs=[chatbot1, chatbot2] ) demo.launch()