import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch

# Load the JinaAI CLIP model; trust_remote_code=True is required because
# jina-clip-v1 ships its own modeling code on the Hugging Face Hub
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
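
# Quick sanity check outside the UI (a minimal sketch; the encode_* helpers
# accept a list of inputs and return one embedding per item):
#
#   emb = model.encode_text(["a photo of a cat"])
#   print(emb.shape)  # e.g. (1, 768) for jina-clip-v1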

def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """Computes similarity for Text-Text, Image-Image, or Text-Image comparisons."""
    # Ensure images are valid (Gradio sometimes sends `False` instead of `None`)
    input1_image = None if isinstance(input1_image, bool) else input1_image
    input2_image = None if isinstance(input2_image, bool) else input2_image

    # Validate inputs
    if input1_type == "Text" and not input1_text.strip():
        return "Error: Input 1 (Text) is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload a valid image for Input 1!"
    if input2_type == "Text" and not input2_text.strip():
        return "Error: Input 2 (Text) is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload a valid image for Input 2!"

    try:
        with torch.no_grad():
            if input1_type == "Text" and input2_type == "Text":
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_text([input2_text])
            elif input1_type == "Image" and input2_type == "Image":
                emb1 = model.encode_image([Image.fromarray(input1_image.astype("uint8"))])
                emb2 = model.encode_image([Image.fromarray(input2_image.astype("uint8"))])
            elif input1_type == "Image":  # Image vs. Text
                emb1 = model.encode_image([Image.fromarray(input1_image.astype("uint8"))])
                emb2 = model.encode_text([input2_text])
            else:  # Text vs. Image
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_image([Image.fromarray(input2_image.astype("uint8"))])

        # One embedding per side, so this is a single dot product; with the
        # model's default embedding normalization it is the cosine similarity
        similarity_score = (emb1 @ emb2.T).item()
        return f"{similarity_score:.4f}"
    except Exception as e:
        return f"Error: {str(e)}"
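
# The handler can also be exercised directly (hypothetical inputs, no UI needed):
#
#   print(compute_similarity("Text", "a cat", None, "Text", "a kitten", None))
#   # prints the formatted similarity score as a string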
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        # Return gr.update(...) objects: bare booleans would be written into
        # the components as values rather than toggling their visibility
        return (
            gr.update(visible=input1_type == "Text"),   # show text input 1
            gr.update(visible=input1_type == "Image"),  # show image input 1
            gr.update(visible=input2_type == "Text"),   # show text input 2
            gr.update(visible=input2_type == "Image"),  # show image input 2
        )

    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)
demo.launch()
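
# Run with `python app.py` (assuming this file is saved as app.py);
# Gradio serves the app on http://127.0.0.1:7860 by default.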