Spaces:

taesiri
/

CLIPScore

Running on Zero

App Files Files Community

taesiri committed on Aug 5

Commit

d6fa528

verified ·

1 Parent(s): 7fc3bdc

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -94

app.py CHANGED Viewed

@@ -1,119 +1,133 @@
 import torch
 import torch.nn.functional as F
 import gradio as gr
-from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
-import spaces
-# Dictionary of available models with their image sizes
 MODELS = {
-    "CLIP ViT-B/32": ("openai/clip-vit-base-patch32", 224, "clip"),
-    "CLIP ViT-B/16": ("openai/clip-vit-base-patch16", 224, "clip"),
-    "CLIP ViT-L/14": ("openai/clip-vit-large-patch14", 224, "clip"),
-    "CLIP ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336, "clip"),
-    "SigLIP Large/16-256": ("google/siglip-large-patch16-256", 256, "siglip"),
-    "SigLIP Base/16-384": ("google/siglip-base-patch16-384", 384, "siglip"),
-    "SigLIP Large/16-384": ("google/siglip-large-patch16-384", 384, "siglip"),
 }
-# Initialize models and processors
-models = {}
-processors = {}
-for model_name, (model_path, _, model_type) in MODELS.items():
-    if model_type == "clip":
-        models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
-        processors[model_name] = CLIPProcessor.from_pretrained(model_path)
-    elif model_type == "siglip":
-        models[model_name] = AutoModel.from_pretrained(model_path).to("cuda")
-        processors[model_name] = AutoProcessor.from_pretrained(model_path)
 @spaces.GPU
-def calculate_score(image, text, model_name):
-    labels = text.split(";")
-    labels = [l.strip() for l in labels]
-    labels = list(filter(None, labels))
-    if len(labels) == 0:
-        return dict()
-    model = models[model_name]
-    processor = processors[model_name]
-    model_type = MODELS[model_name][2]
-    # Preprocess the image and text
-    inputs = processor(text=labels, images=[image], return_tensors="pt", padding="max_length")
-    inputs = {k: v.to("cuda") for k, v in inputs.items()}
-    # Calculate embeddings
     with torch.no_grad():
-        outputs = model(**inputs)
-        if model_type == "clip":
-            image_embeds = outputs.image_embeds
-            text_embeds = outputs.text_embeds
-        elif model_type == "siglip":
-            image_embeds = outputs.image_embeds
-            text_embeds = outputs.text_embeds
-    # Normalize embeddings
-    image_embeds = F.normalize(image_embeds, p=2, dim=1)
-    text_embeds = F.normalize(text_embeds, p=2, dim=1)
-    # Calculate similarity
-    if model_type == "clip":
-        # For CLIP, use cosine similarity
-        similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
-        similarities = torch.clamp(similarities, min=0, max=1)
-    elif model_type == "siglip":
-        # For SigLIP, use sigmoid on dot product
-        logits = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
-        similarities = torch.sigmoid(logits)
-    # Convert to numpy array
-    similarities = similarities.cpu().numpy()
-    results_dict = {label: float(score) for label, score in zip(labels, similarities)}
-    return results_dict
-with gr.Blocks() as demo:
-    gr.Markdown("# Multi-Model CLIP and SigLIP Score")
-    gr.Markdown(
-        "Calculate the score (cosine similarity) between the given image and text descriptions using different CLIP and SigLIP model variants"
-    )
     with gr.Row():
-        image_input = gr.Image(type="pil")
-        output_label = gr.Label()
     with gr.Row():
-        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
-        model_dropdown = gr.Dropdown(
-            choices=list(MODELS.keys()), label="Model", value="CLIP ViT-B/16"
         )
-    def process_inputs(image, text, model_name):
-        if image is None or text.strip() == "":
-            return None
-        return calculate_score(image, text, model_name)
-    inputs = [image_input, text_input, model_dropdown]
-    outputs = output_label
-    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
-    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
-    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)
     gr.Examples(
         examples=[
-            [
-                "cat.jpg",
-                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
-                "CLIP ViT-B/16",
-            ]
         ],
-        fn=process_inputs,
-        inputs=inputs,
-        outputs=outputs,
     )
-demo.launch()

+import os
 import torch
 import torch.nn.functional as F
 import gradio as gr
+import spaces                               # ← keep this!
+from transformers import (
+    CLIPProcessor,
+    CLIPModel,
+    SiglipProcessor,                        # transformers ≥ 4.40
+    SiglipModel,
+)
+# ---------------------------------------------------------------------
+# 1.  CONFIG
+# ---------------------------------------------------------------------
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {
+    "CLIP ViT-B/32":      ("openai/clip-vit-base-patch32",        224, "clip"),
+    "CLIP ViT-B/16":      ("openai/clip-vit-base-patch16",        224, "clip"),
+    "CLIP ViT-L/14":      ("openai/clip-vit-large-patch14",       224, "clip"),
+    "CLIP ViT-L/14@336":  ("openai/clip-vit-large-patch14-336",   336, "clip"),
+    "SigLIP Large-256":   ("google/siglip-large-patch16-256",     256, "siglip"),
+    "SigLIP Base-384":    ("google/siglip-base-patch16-384",      384, "siglip"),
+    "SigLIP Large-384":   ("google/siglip-large-patch16-384",     384, "siglip"),
 }
+# ---------------------------------------------------------------------
+# 2.  LAZY MODEL LOADING
+# ---------------------------------------------------------------------
+_models, _processors = {}, {}
+def _load_model(name: str):
+    """Instantiate the model and processor registered under ``name``.
+
+    Looks up ``MODELS[name]`` for the checkpoint path and model kind, loads
+    the matching CLIP or SigLIP classes, moves the model to ``DEVICE`` and
+    switches it to eval mode.
+
+    Args:
+        name: A key of ``MODELS``.
+
+    Returns:
+        A ``(model, processor)`` tuple.
+
+    Raises:
+        KeyError: If ``name`` is not a key of ``MODELS``.
+        ValueError: If the registered kind is neither "clip" nor "siglip".
+    """
+    path, _, kind = MODELS[name]
+    kwargs = dict(
+        low_cpu_mem_usage=False,     # avoid meta-device bug
+        # Half precision is only worthwhile on GPU; on the CPU fallback it is
+        # slow and may hit unimplemented kernels, so use float32 there.
+        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+    )
+    if kind == "clip":
+        model     = CLIPModel.from_pretrained(path, **kwargs).to(DEVICE)
+        processor = CLIPProcessor.from_pretrained(path)
+    elif kind == "siglip":
+        model     = SiglipModel.from_pretrained(path, **kwargs).to(DEVICE)
+        processor = SiglipProcessor.from_pretrained(path)
+    else:
+        # Fail loudly instead of silently loading an unknown kind as SigLIP.
+        raise ValueError(f"Unknown model kind {kind!r} for model {name!r}")
+    model.eval()
+    return model, processor
+def get_model(name: str):
+    """Return the cached ``(model, processor)`` pair for ``name``.
+
+    The pair is loaded through ``_load_model`` on first request and stored in
+    the module-level ``_models`` / ``_processors`` dicts for later calls.
+    """
+    if name in _models:
+        return _models[name], _processors[name]
+    loaded_model, loaded_processor = _load_model(name)
+    _models[name] = loaded_model
+    _processors[name] = loaded_processor
+    return loaded_model, loaded_processor
+# ---------------------------------------------------------------------
+# 3.  SCORING FUNCTION (runs on GPU in Spaces)
+# ---------------------------------------------------------------------
 @spaces.GPU
+def calculate_score(image, text: str, model_name: str):
+    """Score ``image`` against every ``;``-separated prompt in ``text``.
+
+    Args:
+        image: The image handed to the processor (the UI supplies a PIL image).
+        text: Prompts separated by semicolons; blank entries are dropped.
+        model_name: A key of ``MODELS`` selecting the checkpoint to use.
+
+    Returns:
+        A dict mapping each prompt to a float in [0, 1]. For CLIP this is the
+        cosine similarity clamped to [0, 1]; for SigLIP it is the sigmoid of
+        the model's scaled and biased logit. An empty dict is returned when
+        ``text`` contains no prompts.
+    """
+    labels = [t.strip() for t in text.split(";") if t.strip()]
+    if not labels:
+        return {}
+    model, processor = get_model(model_name)
+    kind = MODELS[model_name][2]
+    inputs = processor(
+        text=labels,
+        images=image,
+        # SigLIP was trained on text padded to max length; dynamic padding
+        # changes its text embeddings. CLIP is fine with dynamic padding.
+        padding="max_length" if kind == "siglip" else True,
+        return_tensors="pt",
+    ).to(DEVICE)
     with torch.no_grad():
+        if kind == "clip":
+            out       = model(**inputs)
+            img_emb   = out.image_embeds
+            txt_emb   = out.text_embeds
+        else:
+            img_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
+            txt_emb = model.get_text_features(
+                input_ids=inputs["input_ids"],
+                # The SigLIP tokenizer may not emit an attention mask at all;
+                # .get() passes None in that case instead of raising KeyError.
+                attention_mask=inputs.get("attention_mask"),
+            )
+        img_emb = F.normalize(img_emb, p=2, dim=-1)
+        txt_emb = F.normalize(txt_emb, p=2, dim=-1)
+        scores = (txt_emb @ img_emb.T).squeeze(1)          # cosine
+        if kind == "siglip":
+            # SigLIP's probability is sigmoid(cos * exp(logit_scale) + logit_bias);
+            # a sigmoid of the bare cosine only spans roughly 0.27-0.73.
+            scores = torch.sigmoid(scores * model.logit_scale.exp() + model.logit_bias)
+        # Clamp in float32 in one vectorised step, then move to plain floats.
+        scores = scores.float().clamp(0, 1).cpu().tolist()
+    return {lbl: score for lbl, score in zip(labels, scores)}
+# ---------------------------------------------------------------------
+# 4.  GRADIO UI
+# ---------------------------------------------------------------------
+# Build the Gradio interface: image + prompts + model selector -> label scores.
+with gr.Blocks(title="CLIP / SigLIP Image-Text Similarity") as demo:
+    gr.Markdown("## Compare an image with multiple text prompts")
     with gr.Row():
+        image_in  = gr.Image(type="pil", label="Image")
+        score_out = gr.Label(label="Similarity (0‒1)")
     with gr.Row():
+        text_in = gr.Textbox(
+            label="Text prompts (use ‘;’ to separate)",
+            placeholder="a cat; a flying cat; a dog",
+        )
+        model_in = gr.Dropdown(
+            choices=list(MODELS.keys()),
+            value="CLIP ViT-B/16",
+            label="Model",
         )
+    def infer(img, txt, mdl):
+        """Run scoring only when both an image and non-blank text are present."""
+        # Explicit None checks: do not rely on the truthiness of the image
+        # object, and do not call .strip() on a missing text value.
+        if img is None or txt is None or not txt.strip():
+            return {}
+        return calculate_score(img, txt, mdl)
+    # Re-score whenever any of the three inputs changes.
+    for comp in (image_in, text_in, model_in):
+        comp.change(infer, [image_in, text_in, model_in], score_out)
     gr.Examples(
         examples=[
+            ["cat.jpg",
+             "a cat stuck in a door; a cat jumping; a dog",
+             "CLIP ViT-B/16"],
         ],
+        inputs=[image_in, text_in, model_in],
+        outputs=score_out,
     )
+demo.launch()