Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,83 +1,118 @@
 import gradio as gr
 from transformers import pipeline
-import …
-import spaces

-# …
-def …
-    """…
-    …
-)
-
-# …
-# Here, we check an environment variable "USE_GPU" (set it to "1" in your Space's settings if needed).
 device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1

-# Create pipelines
-…

-# Run inference
-…

-# …
-…

-# Build the Gradio interface
-…
 with gr.Row():
-    with gr.Column():
-        gr.Markdown("…
-        …
-        )
-    …
 with gr.Row():
-    …
-    fn=…
-    inputs=[image_input, model1_choice, model1_custom, model2_choice, model2_custom],
-    outputs=[…
 )

-demo.launch()
+import os
 import gradio as gr
 from transformers import pipeline
+import spaces  # This module is available when deploying on HF Spaces with ZeroGPU

+# --- Trending models for image text-to-text tasks ---
+TRENDING_MODELS = [
+    "Salesforce/blip2-opt-2.7b",
+    "Salesforce/blip2-flan-t5-xl",
+    "Salesforce/blip-image-captioning-base",
+    "Salesforce/blip-image-captioning-large",
+    "nlpconnect/vit-gpt2-image-captioning",
+    "OFA-Sys/OFA-base",
+    "OFA-Sys/OFA-large",
+    "dandelin/vilt-b32-finetuned-vqa",
+    "dandelin/vilt-b32-mlm",
+    "uclanlp/visualbert-vqa-coco-pre"
+]

+# --- Helper: if the user selects "Custom", then they can enter any model identifier ---
+def resolve_model(chosen, custom):
+    if chosen == "Custom":
+        return custom.strip()
+    else:
+        return chosen
+
+# --- Main inference function ---
+# If you are using ZeroGPU on Hugging Face Spaces, make sure to set the environment variable USE_GPU=1.
+# The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space.
+@spaces.GPU()
+def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
+    # Determine which model identifiers to use.
+    model1_name = resolve_model(model1_choice, model1_custom)
+    model2_name = resolve_model(model2_choice, model2_custom)
+
+    # Set device to GPU (0) if USE_GPU is enabled; otherwise use CPU (-1)
     device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1

+    # Create pipelines for image-to-text.
+    # Note: Many instruction-following image models (e.g. BLIP2) accept a text prompt along with an image.
+    # We use the "image-to-text" task here so that the prompt is taken into account.
+    pipe1 = pipeline("image-to-text", model=model1_name, device=device)
+    pipe2 = pipeline("image-to-text", model=model2_name, device=device)

+    # Run inference on the image with the provided prompt.
+    # Depending on the model, prompt support may vary; the prompt is passed as a keyword argument.
+    output1 = pipe1(image, prompt=prompt)
+    output2 = pipe2(image, prompt=prompt)

+    # Extract the generated text.
+    # (Many pipelines return a list of dicts with key 'generated_text'; if not, we simply convert the output to a string.)
+    def extract_text(output):
+        if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
+            return output[0]["generated_text"]
+        else:
+            return str(output)

+    result1 = extract_text(output1)
+    result2 = extract_text(output2)
+
+    # Format results as chat conversations.
+    # gr.Chatbot expects a list of (user_message, bot_message) pairs.
+    chat1 = [(prompt, result1)]
+    chat2 = [(prompt, result2)]
+    return chat1, chat2

+# --- Build the Gradio interface ---
+# Pre-populated sample prompt.
+sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."
+
+with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
+    gr.Markdown(
+        """
+        # Image Text-to-Text Comparison Tool
+        Compare two trending image text-to-text (instruction-following) models side-by-side.
+        Select a model from the dropdown (or choose Custom to enter your own model identifier) and see how it describes the image.
+        """
+    )
+
     with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("## Input")
+            image_input = gr.Image(label="Upload an Image", type="pil")
+            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
+        with gr.Column(scale=1):
+            gr.Markdown("## Model Selection")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Model 1")
+                    model1_choice = gr.Dropdown(
+                        choices=TRENDING_MODELS + ["Custom"],
+                        value=TRENDING_MODELS[0],
+                        label="Select Model 1"
+                    )
+                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
+                with gr.Column():
+                    gr.Markdown("### Model 2")
+                    model2_choice = gr.Dropdown(
+                        choices=TRENDING_MODELS + ["Custom"],
+                        value=TRENDING_MODELS[1],
+                        label="Select Model 2"
+                    )
+                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")
+
+    compare_button = gr.Button("Compare Models")
+
+    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
     with gr.Row():
+        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
+        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")
+
+    compare_button.click(
+        fn=compare_image_to_text_models,
+        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
+        outputs=[chatbot1, chatbot2]
+    )

+demo.launch()
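For a quick local check of the inference path added above (outside the Gradio UI and without ZeroGPU), a minimal sketch along these lines should work on CPU; the "test.jpg" path and the captioning model picked from TRENDING_MODELS are placeholders, not part of the commit:

import os
from PIL import Image
from transformers import pipeline

# Force CPU locally; in the Space, USE_GPU=1 plus @spaces.GPU() selects the GPU path.
os.environ["USE_GPU"] = "0"
device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1

# Any local test image works here; "test.jpg" is a placeholder.
image = Image.open("test.jpg")

# A smaller captioning model from TRENDING_MODELS keeps the test light;
# prompt support varies by model, so the prompt is omitted here.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=device)
output = pipe(image)

# Mirrors extract_text() in app.py: pipelines usually return [{"generated_text": ...}].
if isinstance(output, list) and output and isinstance(output[0], dict) and "generated_text" in output[0]:
    print(output[0]["generated_text"])
else:
    print(output)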