wilwork committed
Commit 1cef0a7 · verified · 1 Parent(s): 0ee1ef8

Update app.py

Files changed (1): app.py +22 -67
app.py CHANGED
@@ -2,78 +2,33 @@ import gradio as gr
 from transformers import AutoModel
 from PIL import Image
 import torch
+import torch.nn.functional as F
+import requests
+from io import BytesIO
 
-# Load JinaAI CLIP model
+# Load model with remote code support
 model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
 
-def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
-    """Computes similarity for Text-Text, Image-Image, or Text-Image comparisons."""
-
-    # Ensure images are valid (Gradio sometimes sends `False` instead of `None`)
-    input1_image = None if isinstance(input1_image, bool) else input1_image
-    input2_image = None if isinstance(input2_image, bool) else input2_image
-
-    # Validate inputs
-    if input1_type == "Text" and not input1_text.strip():
-        return "Error: Input 1 (Text) is empty!"
-    if input1_type == "Image" and input1_image is None:
-        return "Error: Please upload a valid image for Input 1!"
-    if input2_type == "Text" and not input2_text.strip():
-        return "Error: Input 2 (Text) is empty!"
-    if input2_type == "Image" and input2_image is None:
-        return "Error: Please upload a valid image for Input 2!"
-
-    try:
-        with torch.no_grad():
-            if input1_type == "Text" and input2_type == "Text":
-                emb1 = model.encode_text([input1_text])
-                emb2 = model.encode_text([input2_text])
-            elif input1_type == "Image" and input2_type == "Image":
-                emb1 = model.encode_image([Image.fromarray(input1_image.astype("uint8"))])
-                emb2 = model.encode_image([Image.fromarray(input2_image.astype("uint8"))])
-            else:
-                if input1_type == "Image":
-                    emb1 = model.encode_image([Image.fromarray(input1_image.astype("uint8"))])
-                    emb2 = model.encode_text([input2_text])
-                else:
-                    emb1 = model.encode_text([input1_text])
-                    emb2 = model.encode_image([Image.fromarray(input2_image.astype("uint8"))])
-
-        similarity_score = (emb1 @ emb2.T).item()
-        return similarity_score
-
-    except Exception as e:
-        return f"Error: {str(e)}"
+def compute_similarity(image, text):
+    image = Image.fromarray(image)  # Convert NumPy array to PIL Image
+
+    with torch.no_grad():
+        # Encode text and image using JinaAI CLIP model
+        text_embeds = model.encode_text([text])      # Expecting list input
+        image_embeds = model.encode_image([image])   # Expecting list input
+
+        # Compute cosine similarity
+        similarity_score = (text_embeds @ image_embeds.T).item()
+
+    return similarity_score
 
 # Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
-    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")
-
-    with gr.Row():
-        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
-        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")
-
-    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
-    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
-
-    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
-    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)
-
-    output = gr.Textbox(label="Similarity Score / Error", interactive=False)
-
-    def update_visibility(input1_type, input2_type):
-        return (
-            input1_type == "Text",   # Show text input 1
-            input1_type == "Image",  # Show image input 1
-            input2_type == "Text",   # Show text input 2
-            input2_type == "Image"   # Show image input 2
-        )
-
-    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
-    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
-
-    compute_button = gr.Button("Compute Similarity")
-    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)
-
-demo.launch()
+demo = gr.Interface(
+    fn=compute_similarity,
+    inputs=[gr.Image(type="numpy"), gr.Textbox(label="Enter text")],
+    outputs=gr.Number(label="Similarity Score"),
+    title="JinaAI CLIP Image-Text Similarity",
+    description="Upload an image and enter a text prompt to get the similarity score."
+)
+
+demo.launch()
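
Note on the new similarity computation: the added comment describes the score as cosine similarity, but text_embeds @ image_embeds.T is a raw dot product, which equals cosine similarity only if both encoders return unit-length vectors. The commit also imports torch.nn.functional as F, requests, and BytesIO without using them. Below is a minimal sketch of making the normalization explicit with the already-imported F; the helper name cosine_score is illustrative, and whether jina-clip-v1 normalizes its embeddings by default is an assumption to verify, not something this commit establishes.

import torch
import torch.nn.functional as F

def cosine_score(text_embeds, image_embeds):
    # Hypothetical helper, not part of the commit. Assumes the encoders
    # return (batch, dim) arrays or tensors; as_tensor accepts either.
    t = F.normalize(torch.as_tensor(text_embeds), p=2, dim=-1)
    i = F.normalize(torch.as_tensor(image_embeds), p=2, dim=-1)
    # After L2 normalization the dot product is exactly cosine similarity,
    # so the score is bounded to [-1, 1].
    return (t @ i.T).item()

Used in place of the raw product inside compute_similarity, this would keep the displayed score in [-1, 1] regardless of the model's output scaling. The unused requests/BytesIO imports suggest URL-based image loading was planned but is not wired up in this revision.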