import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch
# Load JinaAI CLIP model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
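# jina-clip-v1's remote code adds encode_text() / encode_image() helpers;
# each takes a list of inputs and returns one embedding per item.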
def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """Computes similarity for Text-Text, Image-Image, or Text-Image comparisons."""
    # Validate inputs before touching the model
    if input1_type == "Text" and not input1_text.strip():
        return "Error: Input 1 (Text) is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload a valid image for Input 1!"
    if input2_type == "Text" and not input2_text.strip():
        return "Error: Input 2 (Text) is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload a valid image for Input 2!"

    def encode(input_type, text, image):
        # Route each input to the matching encoder; gr.Image(type="numpy")
        # delivers an ndarray, which PIL converts back to an image
        if input_type == "Text":
            return model.encode_text([text])
        return model.encode_image([Image.fromarray(image)])

    try:
        with torch.no_grad():
            emb1 = encode(input1_type, input1_text, input1_image)
            emb2 = encode(input2_type, input2_text, input2_image)
        similarity_score = (emb1 @ emb2.T).item()
        return f"{similarity_score:.4f}"
    except Exception as e:
        return f"Error: {e}"
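# Note: the score is a raw dot product of the two embeddings, the same
# operation the jina-clip-v1 model card uses in its similarity examples;
# higher values indicate a closer semantic match.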
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)
    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        # Return gr.update(visible=...) objects so the event toggles each
        # component's visibility instead of overwriting its value
        return (
            gr.update(visible=input1_type == "Text"),   # text input 1
            gr.update(visible=input1_type == "Image"),  # image input 1
            gr.update(visible=input2_type == "Text"),   # text input 2
            gr.update(visible=input2_type == "Image"),  # image input 2
        )

    input1_type.change(update_visibility, inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity,
                         inputs=[input1_type, input1_text, input1_image,
                                 input2_type, input2_text, input2_image],
                         outputs=output)

demo.launch()
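# To expose a temporary public URL for the demo, launch with share=True instead:
# demo.launch(share=True)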