import torch
import torch.nn.functional as F
import gradio as gr
import spaces  # ← keep this! (required for ZeroGPU Spaces)
from transformers import (
    CLIPProcessor,
    CLIPModel,
    SiglipProcessor,  # transformers ≥ 4.40
    SiglipModel,
)

# ---------------------------------------------------------------------
# 1. CONFIG
# ---------------------------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # fp16 only on GPU

# display name -> (HF checkpoint, input resolution, model family)
MODELS = {
    "CLIP ViT-B/32":     ("openai/clip-vit-base-patch32",      224, "clip"),
    "CLIP ViT-B/16":     ("openai/clip-vit-base-patch16",      224, "clip"),
    "CLIP ViT-L/14":     ("openai/clip-vit-large-patch14",     224, "clip"),
    "CLIP ViT-L/14@336": ("openai/clip-vit-large-patch14-336", 336, "clip"),
    "SigLIP Large-256":  ("google/siglip-large-patch16-256",   256, "siglip"),
    "SigLIP Base-384":   ("google/siglip-base-patch16-384",    384, "siglip"),
    "SigLIP Large-384":  ("google/siglip-large-patch16-384",   384, "siglip"),
}

# ---------------------------------------------------------------------
# 2. LAZY MODEL LOADING
# ---------------------------------------------------------------------
_models, _processors = {}, {}


def _load_model(name: str):
    path, _, kind = MODELS[name]
    kwargs = dict(
        low_cpu_mem_usage=False,  # avoid meta-device bug
        torch_dtype=DTYPE,        # fp16 on GPU: faster & smaller
    )
    if kind == "clip":
        model = CLIPModel.from_pretrained(path, **kwargs).to(DEVICE)
        processor = CLIPProcessor.from_pretrained(path)
    else:
        model = SiglipModel.from_pretrained(path, **kwargs).to(DEVICE)
        processor = SiglipProcessor.from_pretrained(path)
    model.eval()
    return model, processor


def get_model(name: str):
    # Load each checkpoint at most once, then serve it from the cache.
    if name not in _models:
        _models[name], _processors[name] = _load_model(name)
    return _models[name], _processors[name]


# ---------------------------------------------------------------------
# 3. SCORING FUNCTION (runs on GPU in Spaces)
# ---------------------------------------------------------------------
@spaces.GPU
def calculate_score(image, text: str, model_name: str):
    labels = [t.strip() for t in text.split(";") if t.strip()]
    if not labels:
        return {}

    model, processor = get_model(model_name)
    kind = MODELS[model_name][2]

    # SigLIP was trained with fixed max-length padding, not dynamic padding.
    inputs = processor(
        text=labels,
        images=image,
        padding="max_length" if kind == "siglip" else True,
        return_tensors="pt",
    ).to(DEVICE)
    # Pixel values arrive as fp32; cast them to the model's dtype (fp16 on GPU).
    inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)

    with torch.no_grad():
        if kind == "clip":
            out = model(**inputs)
            img_emb = out.image_embeds
            txt_emb = out.text_embeds
        else:
            # SigLIP's tokenizer emits no attention mask, so pass input_ids only.
            img_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
            txt_emb = model.get_text_features(input_ids=inputs["input_ids"])

        img_emb = F.normalize(img_emb, p=2, dim=-1)
        txt_emb = F.normalize(txt_emb, p=2, dim=-1)

        scores = (txt_emb @ img_emb.T).squeeze(1)  # cosine similarity per label
        if kind == "siglip":
            # The SigLIP paper scores pairs as sigmoid(scale * cosine + bias).
            scores = torch.sigmoid(scores * model.logit_scale.exp() + model.logit_bias)

    return {lbl: float(score.clamp(0, 1)) for lbl, score in zip(labels, scores.cpu())}
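# Quick local sanity check (a sketch; assumes a local "cat.jpg" exists and the
# checkpoints can be downloaded — outside a Space, @spaces.GPU is a no-op):
#
#     from PIL import Image
#     print(calculate_score(Image.open("cat.jpg"), "a cat; a dog", "CLIP ViT-B/16"))
#
# Note that CLIP returns clamped cosine similarities while SigLIP returns
# sigmoid probabilities, so scores from the two families are not directly
# comparable.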
# ---------------------------------------------------------------------
# 4. GRADIO UI
# ---------------------------------------------------------------------
with gr.Blocks(title="CLIP / SigLIP Image-Text Similarity") as demo:
    gr.Markdown("## Compare an image with multiple text prompts")

    with gr.Row():
        image_in = gr.Image(type="pil", label="Image")
        score_out = gr.Label(label="Similarity (0‒1)")

    with gr.Row():
        text_in = gr.Textbox(
            label="Text prompts (use ‘;’ to separate)",
            placeholder="a cat; a flying cat; a dog",
        )
        model_in = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="CLIP ViT-B/16",
            label="Model",
        )

    def infer(img, txt, mdl):
        # Skip inference until both an image and at least one prompt exist.
        if img is None or not txt.strip():
            return {}
        return calculate_score(img, txt, mdl)

    # Re-score whenever the image, the prompts, or the model changes.
    for comp in (image_in, text_in, model_in):
        comp.change(infer, [image_in, text_in, model_in], score_out)

    gr.Examples(
        examples=[
            ["cat.jpg", "a cat stuck in a door; a cat jumping; a dog", "CLIP ViT-B/16"],
        ],
        inputs=[image_in, text_in, model_in],
        outputs=score_out,
    )

demo.launch()
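# Suggested requirements.txt for this Space (a sketch; the pins are
# assumptions, except transformers ≥ 4.40, which the Siglip imports require):
#
#     torch
#     transformers>=4.40
#     gradio
#     spaces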