Spaces: Running on Zero

import os
import multiprocessing

import gradio as gr
import torch
from transformers import pipeline
import spaces  # Available when the app is deployed on HF Spaces with ZeroGPU

# Use the "spawn" start method so worker processes do not inherit a forked CUDA
# context (CUDA cannot be re-initialized after fork).
multiprocessing.set_start_method("spawn", force=True)
# --- Trending models for image text-to-text tasks ---
TRENDING_MODELS = [
    "Salesforce/blip2-opt-2.7b",  # Uses Blip2Config
    "Salesforce/blip2-flan-t5-xl",  # Uses Blip2Config
    "Salesforce/instructblip-vicuna-7b",  # Uses InstructBlipConfig
    "llava-hf/llava-1.5-7b-hf",  # Uses LlavaConfig
"liuhaotian/llava-v1.5-13b", # Uses LlavaConfig
"llava-hf/llava-v1.6-mistral-7b-hf", # Uses LlavaNextConfig
"Qwen/Qwen2-VL-7B-Instruct", # Uses Qwen2VLConfig
"google/pix2struct-ai2d-base", # Uses Pix2StructConfig
"nlpconnect/vit-gpt2-image-captioning", # Uses VisionEncoderDecoderConfig
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf", # Uses LlavaOnevisionConfig
"mosaicml/mpt-7b-chat", # Uses MllamaConfig
"ibm-granite/granite-vision-3.1-2b-preview",
"allenai/Molmo-7B-D-0924"
]
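# Note (assumption): not every checkpoint above is guaranteed to load through the
# `image-text-to-text` pipeline — some rely on custom/remote code or gated weights —
# so treat this list as a starting point rather than a validated set.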
# --- Helper: if the user selects "Custom", then they can enter any model identifier ---
def resolve_model(chosen, custom):
    if chosen == "Custom":
        return custom.strip()
    else:
        return chosen
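# Example: resolve_model("Custom", "  username/model_name  ") -> "username/model_name",
# while resolve_model("Qwen/Qwen2-VL-7B-Instruct", "") returns the dropdown choice unchanged.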
# --- Main inference function ---
# The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space.
@spaces.GPU()
def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
    # Determine which model identifiers to use.
    model1_name = resolve_model(model1_choice, model1_custom)
    model2_name = resolve_model(model2_choice, model2_custom)
    # Use GPU (0) when USE_GPU is set or CUDA is available (it is inside a
    # @spaces.GPU call on a ZeroGPU Space); otherwise fall back to CPU (-1).
    device = 0 if (os.environ.get("USE_GPU", "0") == "1" or torch.cuda.is_available()) else -1
    # Create image-text-to-text pipelines. This task accepts chat-style messages
    # whose content interleaves image and text entries.
    pipe1 = pipeline(task="image-text-to-text", model=model1_name, device=device)
    pipe2 = pipeline(task="image-text-to-text", model=model2_name, device=device)
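    # Note (possible optimization, not required): the pipelines are rebuilt on every
    # click, which reloads the model weights each time. Caching them, e.g. in a
    # module-level dict keyed by model name, would avoid repeated loading at the
    # cost of keeping the models in memory.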
    # Build a single-turn chat message containing the uploaded PIL image and the
    # text prompt; the assistant turn is generated by the model.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    # Run inference on the image with the provided prompt.
    output1 = pipe1(text=messages, max_new_tokens=1024)
    output2 = pipe2(text=messages, max_new_tokens=1024)
    # Extract the generated text. For chat-style inputs the pipeline may return the
    # whole conversation under "generated_text", in which case the last (assistant)
    # turn is taken.
    def extract_text(output):
        if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
            generated = output[0]["generated_text"]
            if isinstance(generated, list) and generated and isinstance(generated[-1], dict):
                return str(generated[-1].get("content", generated))
            return str(generated)
        return str(output)
    result1 = extract_text(output1)
    result2 = extract_text(output2)
    # Format results as chat conversations.
    # gr.Chatbot (tuple format) expects a list of (user_message, bot_message) pairs.
    chat1 = [(prompt, result1)]
    chat2 = [(prompt, result2)]
    return chat1, chat2
# --- Build the Gradio interface ---
sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."
with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
    gr.Markdown(
        """
        # Image Text-to-Text Comparison Tool
        Compare two trending image text-to-text (instruction-following) models side by side.
        Select a model from each dropdown (or choose "Custom" to enter your own model identifier) and see how each one describes the image.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            image_input = gr.Image(label="Upload an Image", type="pil")
            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
        with gr.Column(scale=1):
            gr.Markdown("## Model Selection")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model 1")
                    model1_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[0],
                        label="Select Model 1"
                    )
                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
                with gr.Column():
                    gr.Markdown("### Model 2")
                    model2_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[1],
                        label="Select Model 2"
                    )
                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")
    compare_button = gr.Button("Compare Models")
    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
    with gr.Row():
        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")
    compare_button.click(
        fn=compare_image_to_text_models,
        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
        outputs=[chatbot1, chatbot2]
    )
demo.launch()
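# Note (assumption): outside a ZeroGPU Space the `spaces` GPU decorator is expected
# to be a no-op, so the app should still run locally on CPU or a local GPU.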