import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

# Load the JinaAI CLIP model (custom code from the Hub, hence trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
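
# Optional: move the model to GPU if one is available (assumption: the Hub's
# remote code keeps inputs on the model's device; uncomment to enable).
# model = model.to("cuda" if torch.cuda.is_available() else "cpu")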


def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """
    Computes similarity between:
    - Text-Text
    - Image-Image
    - Text-Image & Image-Text
    """
    # Validate inputs (text may arrive as None while its textbox is hidden)
    if input1_type == "Text" and not (input1_text or "").strip():
        return "Error: Input 1 is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload an image for Input 1!"
    if input2_type == "Text" and not (input2_text or "").strip():
        return "Error: Input 2 is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload an image for Input 2!"

    try:
        with torch.no_grad():
            if input1_type == "Text" and input2_type == "Text":
                # Text-Text similarity
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_text([input2_text])
            elif input1_type == "Image" and input2_type == "Image":
                # Image-Image similarity
                emb1 = model.encode_image([Image.fromarray(input1_image)])
                emb2 = model.encode_image([Image.fromarray(input2_image)])
            else:
                # Image-Text similarity (either order)
                if input1_type == "Image":
                    emb1 = model.encode_image([Image.fromarray(input1_image)])
                    emb2 = model.encode_text([input2_text])
                else:
                    emb1 = model.encode_text([input1_text])
                    emb2 = model.encode_image([Image.fromarray(input2_image)])

        # Cosine similarity: normalize explicitly so the dot product is a true
        # cosine score even if the encoder returns unnormalized vectors
        emb1 = np.asarray(emb1)
        emb2 = np.asarray(emb2)
        similarity_score = ((emb1 @ emb2.T) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))).item()
        return f"{similarity_score:.4f}"
    except Exception as e:
        return f"Error: {e}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between **Text-Text, Image-Image, or Image-Text**.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)
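
    # With type="numpy", gr.Image hands the callback an HxWxC uint8 array
    # (or None when nothing is uploaded), which Image.fromarray consumes directly.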

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        # Return gr.update objects: plain booleans would be written into the
        # components as their values instead of toggling their visibility.
        return (
            gr.update(visible=input1_type == "Text"),   # Input 1 text
            gr.update(visible=input1_type == "Image"),  # Input 1 image
            gr.update(visible=input2_type == "Text"),   # Input 2 text
            gr.update(visible=input2_type == "Image"),  # Input 2 image
        )

    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)

demo.launch()
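
# To run (assumed dependencies; the model's remote code also needs einops and timm):
#   pip install gradio transformers torch pillow numpy einops timm
#   python app.py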