import torch
import gradio as gr
from transformers import pipeline
import spaces  # This module is available when deploying on HF Spaces with ZeroGPU
import multiprocessing
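# CUDA state does not survive fork(), so any subprocesses on ZeroGPU must use
# the "spawn" start method; set it before any CUDA work happens.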
multiprocessing.set_start_method("spawn", force=True)

# --- Trending models for the image-text-to-text task ---
TRENDING_MODELS = [
    "Salesforce/blip2-opt-2.7b",                  # Blip2Config
    "Salesforce/blip2-flan-t5-xl",                # Blip2Config
    "Salesforce/instructblip-vicuna-7b",          # InstructBlipConfig
    "llava-hf/llava-1.5-7b-hf",                   # LlavaConfig
    "llava-hf/llava-1.5-13b-hf",                  # LlavaConfig (transformers-format checkpoint)
    "llava-hf/llava-v1.6-mistral-7b-hf",          # LlavaNextConfig
    "Qwen/Qwen2-VL-7B-Instruct",                  # Qwen2VLConfig
    "google/pix2struct-ai2d-base",                # Pix2StructConfig
    "nlpconnect/vit-gpt2-image-captioning",       # VisionEncoderDecoderConfig
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # LlavaOnevisionConfig
    "meta-llama/Llama-3.2-11B-Vision-Instruct",   # MllamaConfig
    "ibm-granite/granite-vision-3.1-2b-preview",
    "allenai/Molmo-7B-D-0924",
]
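
# Note: the Pix2Struct and VisionEncoderDecoder entries above are
# captioning-style models; they may not accept the chat-message input used
# below and could require the plain "image-to-text" pipeline instead.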

# --- Helper: if the user selects "Custom", then they can enter any model identifier ---
def resolve_model(chosen, custom):
    if chosen == "Custom":
        return custom.strip()
    else:
        return chosen
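
# For example: resolve_model("Custom", " org/model ") returns "org/model",
# while resolve_model(TRENDING_MODELS[0], "") returns "Salesforce/blip2-opt-2.7b".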

# --- Main inference function ---
# The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space.
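# spaces.GPU also accepts a duration hint for long-running calls,
# e.g. @spaces.GPU(duration=120); the ZeroGPU default is 60 seconds.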
@spaces.GPU()
def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
    # Determine which model identifiers to use.
    model1_name = resolve_model(model1_choice, model1_custom)
    model2_name = resolve_model(model2_choice, model2_custom)

    # Inside a @spaces.GPU call the GPU is already attached, so query torch
    # directly: device 0 (CUDA) when available, -1 (CPU) otherwise.
    device = 0 if torch.cuda.is_available() else -1

    # Create image-text-to-text pipelines. Half precision on GPU helps two
    # models fit in memory; pipelines are rebuilt on every request, so the
    # first call for a given model pair is slow.
    dtype = torch.float16 if device == 0 else torch.float32
    pipe1 = pipeline(task="image-text-to-text", model=model1_name, device=device, torch_dtype=dtype)
    pipe2 = pipeline(task="image-text-to-text", model=model2_name, device=device, torch_dtype=dtype)
    # Build a chat-style message list. The uploaded PIL image goes under the
    # "image" key ("url" is for remote images), and the conversation must end
    # on the user turn so the model generates the assistant reply.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]

    # Run inference on the image with the provided prompt.
    output1 = pipe1(text=messages, max_new_tokens=1024)
    output2 = pipe2(text=messages, max_new_tokens=1024)

    # Extract the generated text. With chat-style input the pipeline's
    # "generated_text" is the full message list, so pull the content of the
    # final (assistant) message; otherwise return the value directly.
    def extract_text(output):
        if isinstance(output, list) and output and isinstance(output[0], dict) and "generated_text" in output[0]:
            generated = output[0]["generated_text"]
            if isinstance(generated, list) and generated and isinstance(generated[-1], dict):
                return generated[-1].get("content", str(generated))
            return generated
        return str(output)
    
    result1 = extract_text(output1)
    result2 = extract_text(output2)

    # Format results for gr.Chatbot, which expects a list of
    # (user_message, bot_response) pairs, not (speaker, message) tuples.
    chat1 = [(prompt, result1)]
    chat2 = [(prompt, result2)]
    return chat1, chat2
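
# A minimal smoke test, assuming a local image file "example.jpg" exists
# (hypothetical path) and that the chosen models fit in available memory:
#
#   from PIL import Image
#   chat1, chat2 = compare_image_to_text_models(
#       Image.open("example.jpg"), "What is in this picture?",
#       TRENDING_MODELS[0], "",
#       "Custom", "llava-hf/llava-interleave-qwen-0.5b-hf",
#   )
#   print(chat1[0][1])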

# --- Build the Gradio interface ---
sample_prompt = "Describe the image in explicit detail. Return the response as a nested JSON object."

with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
    gr.Markdown(
        """
        # Image Text-to-Text Comparison Tool  
        Compare two trending image-text-to-text (instruction-following) models side by side.  
        Select a model from each dropdown (or choose "Custom" and enter your own model identifier) to see how each model describes the image.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            image_input = gr.Image(label="Upload an Image", type="pil")
            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
        with gr.Column(scale=1):
            gr.Markdown("## Model Selection")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model 1")
                    model1_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[0],
                        label="Select Model 1"
                    )
                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
                with gr.Column():
                    gr.Markdown("### Model 2")
                    model2_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[1],
                        label="Select Model 2"
                    )
                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")
    
    compare_button = gr.Button("Compare Models")
    
    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
    with gr.Row():
        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")
    
    compare_button.click(
        fn=compare_image_to_text_models,
        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
        outputs=[chatbot1, chatbot2]
    )

demo.launch()