ktllc committed on
Commit
7413961
·
1 Parent(s): 9caa677

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -16
app.py CHANGED
@@ -2,6 +2,8 @@ import clip
2
  import numpy as np
3
  import torch
4
  import gradio as gr
 
 
5
 
6
  # Load the CLIP model
7
  model, preprocess = clip.load("ViT-B/32")
@@ -12,34 +14,45 @@ print(device)
12
  # Define the Business Listing variable
13
  Business_Listing = "Air Guide"
14
 
15
def find_similarity(image, text_input):
    """Return the CLIP cosine similarity between one image and one prompt.

    Args:
        image: Image accepted by the module-level ``preprocess`` transform
            (the Gradio input in this file supplies a PIL image).
        text_input: Prompt to compare the image against.

    Returns:
        The single similarity score, taken from element ``[0, 0]`` of the
        text-by-image score matrix (a NumPy scalar).
    """
    # Turn the upload into a one-item batch on the target device.
    pixel_batch = preprocess(image).unsqueeze(0).to(device)

    # Tokenize the prompt as a one-item batch as well.
    prompt_tokens = clip.tokenize([text_input]).to(device)

    # Inference only, so skip gradient tracking while encoding.
    with torch.no_grad():
        image_embedding = model.encode_image(pixel_batch).float()
        text_embedding = model.encode_text(prompt_tokens).float()

    # Scale both embeddings to unit length; their dot product is then the
    # cosine similarity.
    image_unit = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
    text_unit = text_embedding / text_embedding.norm(dim=-1, keepdim=True)

    score_matrix = torch.matmul(text_unit, image_unit.T).cpu().numpy()
    return score_matrix[0, 0]
 
 
 
 
 
 
 
 
 
 
 
33
 
# Build the Gradio UI: one PIL image plus one text box in, a single
# similarity number out. live=True re-runs the callback as inputs change.
iface = gr.Interface(
    find_similarity,
    [gr.Image(type="pil"), "text"],
    "number",
    title="CLIP Model Image-Text Cosine Similarity",
    description="Upload an image and enter text to find their cosine similarity.",
    live=True,
    interpretation="default",
)

# Start serving the interface.
iface.launch()
 
2
  import numpy as np
3
  import torch
4
  import gradio as gr
5
+ from PIL import Image
6
+ import os
7
 
8
  # Load the CLIP model
9
  model, preprocess = clip.load("ViT-B/32")
 
14
  # Define the Business Listing variable
15
  Business_Listing = "Air Guide"
16
 
17
def find_similarity(images, text_input):
    """Score each image against a text prompt with CLIP and pick the best.

    Args:
        images: Iterable of images accepted by the module-level
            ``preprocess`` transform (the Gradio inputs in this file supply
            PIL images).
        text_input: Prompt that every image is compared against.

    Returns:
        A ``(similarities, best_match_index)`` tuple. ``similarities`` is a
        list with one cosine similarity (Python ``float``) per image, in
        input order; ``best_match_index`` is the ``int`` position of the
        highest score (the first one on ties).

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("find_similarity requires at least one image")

    # Preprocess every image and stack the results so CLIP encodes a single
    # batch (assumes preprocess yields equally sized tensors, as CLIP's
    # bundled transform does).
    image_batch = torch.stack([preprocess(image) for image in images]).to(device)

    # Prepare input text
    text_tokens = clip.tokenize([text_input]).to(device)

    # Inference only: encode BOTH modalities with autograd disabled so no
    # graph is built and the scores can be converted to plain numbers.
    with torch.no_grad():
        image_features = model.encode_image(image_batch).float()
        text_features = model.encode_text(text_tokens).float()

    # Normalize to unit length so the dot product is the cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # One row per image against the single prompt column -> one score each.
    similarities = (image_features @ text_features.T).squeeze(-1).cpu().tolist()

    # Find the index of the image with the highest similarity
    best_match_index = int(np.argmax(similarities))

    return similarities, best_match_index
46
 
def _compare_two_images(image_1, image_2, text_input):
    """Adapt the three Gradio inputs to ``find_similarity(images, text_input)``.

    Gradio calls the callback with one positional argument per input
    component, so the two image slots are gathered into the list that
    ``find_similarity`` expects.

    Returns:
        A ``(similarities_text, best_match_index)`` pair for the "text" and
        "number" outputs, or ``("", None)`` while any input is still missing.
    """
    # With live=True the callback can run before every input is filled in.
    # NOTE(review): assumes Gradio passes None for an empty image - confirm.
    if image_1 is None or image_2 is None or not text_input:
        return "", None

    similarities, best_match_index = find_similarity([image_1, image_2], text_input)

    # Convert to built-in numbers so both outputs render as plain values.
    return str([float(score) for score in similarities]), int(best_match_index)


# Define a Gradio interface
iface = gr.Interface(
    fn=_compare_two_images,
    inputs=[
        gr.Image(type="pil", label="Image 1"),
        gr.Image(type="pil", label="Image 2"),
        "text",
    ],
    outputs=["text", "number"],
    live=True,
    interpretation="default",
    title="CLIP Model Image-Text Cosine Similarity",
    description="Upload two images and enter text to find their cosine similarity.",
)

iface.launch()