# Clip-Model / app.py
# ktllc's picture
# Update app.py
# d4c665a
# raw
# history blame
# 1.82 kB
import clip
import numpy as np
import torch
import gradio as gr
from PIL import Image
import os
# Pick the compute device: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 weights together with the matching image
# preprocessing transform, then place the model on the chosen device and
# switch it to evaluation mode.
model, preprocess = clip.load("ViT-B/32")
model.to(device)
model.eval()

# Report which device was selected.
print(device)

# Business listing name. NOTE(review): not referenced by any other code
# visible in this file — confirm whether it is still needed.
Business_Listing = "Air Guide"
def find_similarity(image1, image2, text_input):
    """Score two images against a text prompt with CLIP and pick the closer one.

    Args:
        image1: First image, in a form accepted by the module-level
            ``preprocess`` transform (the Gradio inputs are declared with
            ``type="pil"``).
        image2: Second image, same requirements as ``image1``.
        text_input: Prompt string both images are compared against.

    Returns:
        A ``(similarities, best_match_index)`` tuple. ``similarities`` is a
        two-element list holding the cosine similarity between the prompt
        and each image, in argument order. ``best_match_index`` is ``0``
        when the first image scores strictly higher and ``1`` otherwise
        (ties therefore resolve to the second image).

    Raises:
        ValueError: If either image is ``None``.
    """
    # Fail early with a readable message instead of an obscure error from
    # inside the preprocessing transform when an image slot is still empty.
    if image1 is None or image2 is None:
        raise ValueError("Both images are required to compute similarity.")

    # Pure inference: keep every encoder call under no_grad so no autograd
    # graph is built. A tensor that requires grad cannot be converted with
    # `.numpy()`, which is what the similarity computation below relies on.
    with torch.no_grad():
        # Encode each image as a single-item batch.
        image_features = [
            model.encode_image(preprocess(image).unsqueeze(0).to(device)).float()
            for image in (image1, image2)
        ]

        # Tokenize and encode the prompt, then scale it to unit length.
        text_tokens = clip.tokenize([text_input]).to(device)
        text_features = model.encode_text(text_tokens).float()
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Cosine similarity = dot product of the unit-length vectors.
        similarities = []
        for image_feature in image_features:
            image_feature = image_feature / image_feature.norm(dim=-1, keepdim=True)
            similarity = (text_features @ image_feature.T).cpu().numpy()
            similarities.append(similarity[0, 0])

    # The first image wins only on a strictly higher score.
    best_match_index = 0 if similarities[0] > similarities[1] else 1
    return similarities, best_match_index
# Build the Gradio UI around find_similarity: two PIL image inputs and a
# free-text prompt. The "text" output displays the list of similarity scores
# and the "number" output displays the index of the better-matching image,
# mirroring the function's (similarities, best_match_index) return value.
# live=True re-runs the function whenever an input changes.
#
# The `interpretation` keyword is deliberately not passed: Gradio 4 removed
# it from gr.Interface, so supplying it raises a TypeError on current
# releases. NOTE(review): confirm against the Gradio version this app pins.
iface = gr.Interface(
    fn=find_similarity,
    inputs=[
        gr.Image(type="pil", label="Image 1"),
        gr.Image(type="pil", label="Image 2"),
        "text",
    ],
    outputs=["text", "number"],
    live=True,
    title="CLIP Model Image-Text Cosine Similarity",
    description="Upload two images and enter text to find their cosine similarity.",
)

# Start the web server for the interface.
iface.launch()