# Spaces: Sleeping  (status banner captured from the hosting page; not program text)
import numpy as np | |
import clip | |
import torch | |
import gradio as gr | |
from PIL import Image | |
import base64 | |
from io import BytesIO | |
# Load the CLIP model.
# Pick the device first so the weights are loaded straight onto it.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# Inference only: make sure the model is in evaluation mode.
model.eval()
def find_similarity(image_base64, text_input):
    """Return the CLIP cosine similarity between an image and a text prompt.

    Args:
        image_base64: Base64-encoded image bytes as a string. Surrounding
            whitespace and a leading ``data:<mime>;base64,`` data-URI
            prefix are tolerated and stripped before decoding.
        text_input: Text to compare against the image.

    Returns:
        The cosine similarity of the two embeddings as a Python ``float``.

    Errors raised by ``base64.b64decode``, ``Image.open`` or
    ``clip.tokenize`` (e.g. for malformed input) propagate unchanged.
    """
    payload = image_base64.strip()
    if payload.startswith("data:"):
        # Data URIs look like "data:image/png;base64,<payload>"; keep only the
        # part after the first comma. ":" is not in the base64 alphabet, so a
        # bare base64 string can never take this branch.
        _, _, payload = payload.partition(",")

    # Decode the base64 string into an in-memory image.
    image = Image.open(BytesIO(base64.b64decode(payload)))

    # Preprocess the image and add a leading batch dimension of size one.
    image_batch = preprocess(image).unsqueeze(0).to(device)

    # Tokenize the single text prompt.
    text_tokens = clip.tokenize([text_input]).to(device)

    with torch.no_grad():
        # Encode both modalities; cast to float32 before normalizing.
        image_features = model.encode_image(image_batch).float()
        text_features = model.encode_text(text_tokens).float()

        # Unit-normalize so the dot product below is the cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = text_features @ image_features.T

    # One image and one prompt were encoded, so read the single entry and
    # hand back a plain Python float rather than a tensor/NumPy scalar.
    return similarity[0, 0].item()
# Input widgets: one box for the base64 image string, one for the prompt.
_image_box = gr.inputs.Textbox(lines=3, label="Enter Base64 Image")
_text_box = gr.inputs.Textbox(lines=3, label="Enter Text")

# Wire the similarity function into a live-updating Gradio interface that
# reports a single number.
iface = gr.Interface(
    fn=find_similarity,
    inputs=[_image_box, _text_box],
    outputs="number",
    live=True,
    interpretation="default",
    title="CLIP Model Image-Text Cosine Similarity",
    description="Enter a base64-encoded image and text to find their cosine similarity.",
)

# Start serving the app.
iface.launch()