import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def similarity(image, text, threshold, order):
    # One text description per line; drop blank lines.
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if len(lines) == 0:
        return []

    # Score every description against the single input image.
    inputs = processor(text=lines, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    similarities = outputs.logits_per_image.view(-1)

    # Convert to a plain list of floats for display.
    similarities = [s.item() for s in similarities]

    # Optionally order descriptions from most to least similar.
    pairs = list(zip(similarities, lines))
    if order:
        pairs = sorted(pairs, reverse=True)

    # HighlightedText expects (text, category) tuples; the category selects the color.
    detections = [
        (f"{line}: {score:0.2f}", "yes" if score > threshold else "no")
        for score, line in pairs
    ]
    return detections


demo = gr.Interface(
    title="CLIP Explorer",
    description=(
        "Input an image and lines of text, then press Submit to see the "
        "image-text similarity scores."
    ),
    fn=similarity,
    inputs=[
        gr.Image(label="Image"),
        gr.TextArea(label="Text descriptions"),
        gr.Slider(0, 40, 26, label="Similarity threshold"),
        gr.Checkbox(value=True, label="Order by similarity score?"),
    ],
    outputs=gr.HighlightedText(
        label="Image-text similarity scores",
        color_map={
            "yes": "green",
            "no": "red",
        },
    ),
)

if __name__ == "__main__":
    demo.launch()