import os
import gradio as gr
from transformers import pipeline
import spaces # This module is available when deploying on HF Spaces with ZeroGPU
import multiprocessing
# Use the "spawn" start method: forked worker processes generally cannot reuse an
# already-initialized CUDA context, which matters once the app touches the GPU.
multiprocessing.set_start_method("spawn", force=True)

# --- Trending models for image text-to-text tasks ---
TRENDING_MODELS = [
    "Salesforce/blip2-opt-2.7b",  # Uses Blip2Config
    "Salesforce/blip2-flan-t5-xl",  # Uses Blip2Config
    "Salesforce/instructblip-vicuna-7b",  # Uses InstructBlipConfig
    "llava-hf/llava-1.5-7b-hf",  # Uses LlavaConfig
    "liuhaotian/llava-v1.5-13b",  # Original LLaVA release; may not load via the transformers pipeline
    "llava-hf/llava-v1.6-mistral-7b-hf",  # Uses LlavaNextConfig
    "Qwen/Qwen2-VL-7B-Instruct",  # Uses Qwen2VLConfig
    "google/pix2struct-ai2d-base",  # Uses Pix2StructConfig
    "nlpconnect/vit-gpt2-image-captioning",  # Uses VisionEncoderDecoderConfig
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # Uses LlavaOnevisionConfig
    "mosaicml/mpt-7b-chat",  # Uses MPTConfig (text-only; may fail on image input)
    "ibm-granite/granite-vision-3.1-2b-preview",
    "allenai/Molmo-7B-D-0924",
]

# --- Helper: if the user selects "Custom", they can enter any model identifier ---
def resolve_model(chosen, custom):
    if chosen == "Custom":
        return custom.strip()
    else:
        return chosen
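
# Optional sketch (an addition, not wired into the UI below): validate a custom model id
# against the Hugging Face Hub before trying to load it, so typos fail fast with a readable
# message instead of a long pipeline traceback. Assumes huggingface_hub is available
# (it is installed as a dependency of transformers).
def model_exists_on_hub(repo_id):
    from huggingface_hub import model_info
    try:
        model_info(repo_id)
        return True
    except Exception:
        return False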

# --- Main inference function ---
# If you are running this on a ZeroGPU Space, set the environment variable USE_GPU=1 so the
# pipelines below are placed on the GPU. The @spaces.GPU() decorator requests a GPU for the
# duration of the call on ZeroGPU hardware.
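# For example (an assumption about your setup): add USE_GPU=1 under the Space's
# "Variables and secrets" settings, or export USE_GPU=1 before launching locally.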
@spaces.GPU()
def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
    # Determine which model identifiers to use.
    model1_name = resolve_model(model1_choice, model1_custom)
    model2_name = resolve_model(model2_choice, model2_custom)
    # Set device to GPU (0) if USE_GPU is enabled; otherwise use CPU (-1).
    device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1
    # Create pipelines for image-to-text.
    # Note: many instruction-following image models (e.g. BLIP-2) accept a text prompt
    # along with an image; the "image-to-text" task forwards that prompt to the model.
    pipe1 = pipeline("image-to-text", model=model1_name, device=device)
    pipe2 = pipeline("image-to-text", model=model2_name, device=device)
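    # Note (assumption about specific checkpoints): some entries in TRENDING_MODELS
    # (e.g. allenai/Molmo-7B-D-0924) ship custom modeling code and would additionally
    # need trust_remote_code=True to load; it is omitted here to keep the defaults safe.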
    # Run inference on the image with the provided prompt.
    # The image-to-text pipeline takes the image positionally and the prompt as a
    # keyword argument; exact prompt handling still varies between models.
    output1 = pipe1(image, prompt=prompt)
    output2 = pipe2(image, prompt=prompt)
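    # Note (model-specific assumption): chat-tuned checkpoints such as the llava-hf models
    # usually expect their own prompt template (e.g. "USER: <image>\n{prompt} ASSISTANT:");
    # a bare prompt still runs but may give weaker or oddly formatted output.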
    # Extract the generated text.
    # (Most pipelines return a list of dicts with the key 'generated_text'; if not,
    # fall back to converting the raw output to a string.)
    def extract_text(output):
        if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
            return output[0]["generated_text"]
        else:
            return str(output)
    result1 = extract_text(output1)
    result2 = extract_text(output2)
    # Format the results for gr.Chatbot, which expects a list of
    # (user_message, bot_message) pairs.
    chat1 = [(prompt, result1)]
    chat2 = [(prompt, result2)]
    return chat1, chat2
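
# Example direct call, bypassing the UI (hypothetical image path, for illustration only):
#   from PIL import Image
#   img = Image.open("example.jpg")
#   chat1, chat2 = compare_image_to_text_models(
#       img, "Describe the image.", TRENDING_MODELS[0], "", TRENDING_MODELS[3], "")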
# --- Build the Gradio interface ---
# Pre-populated sample prompt.
sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."

with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
    gr.Markdown(
        """
        # Image Text-to-Text Comparison Tool
        Compare two trending image text-to-text (instruction-following) models side by side.
        Select a model from each dropdown (or choose "Custom" to enter your own model identifier)
        and compare how the two models describe the same image.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            image_input = gr.Image(label="Upload an Image", type="pil")
            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
        with gr.Column(scale=1):
            gr.Markdown("## Model Selection")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model 1")
                    model1_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[0],
                        label="Select Model 1"
                    )
                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
                with gr.Column():
                    gr.Markdown("### Model 2")
                    model2_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[1],
                        label="Select Model 2"
                    )
                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")
    compare_button = gr.Button("Compare Models")
    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
    with gr.Row():
        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")
    compare_button.click(
        fn=compare_image_to_text_models,
        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
        outputs=[chatbot1, chatbot2]
    )

demo.launch()
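
# A requirements.txt for this Space would likely need (an assumption; pin versions as
# appropriate): gradio, transformers, torch, accelerate, and pillow; the `spaces`
# package is preinstalled on ZeroGPU Spaces.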