import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch
import numpy as np
# Load JinaAI CLIP model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
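# Inference-only use: eval mode disables dropout and similar train-time behavior.
# (Assumption: the remote-code model is a standard torch.nn.Module, which
# AutoModel.from_pretrained returns, so .eval() is available.)
model.eval()
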
def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """
    Computes similarity between:
    - Text-Text
    - Image-Image
    - Text-Image & Image-Text
    """
    # Determine input types
    if input1_type == "Text":
        input1 = input1_text.strip()
        input1_is_text = bool(input1)
        input1_is_image = False
    else:
        input1 = input1_image
        input1_is_text = False
        input1_is_image = input1 is not None

    if input2_type == "Text":
        input2 = input2_text.strip()
        input2_is_text = bool(input2)
        input2_is_image = False
    else:
        input2 = input2_image
        input2_is_text = False
        input2_is_image = input2 is not None

    # Ensure valid input
    if not (input1_is_text or input1_is_image) or not (input2_is_text or input2_is_image):
        return "Error: Please provide valid inputs (text or image) for both fields!"
    try:
        with torch.no_grad():
            if input1_is_text and input2_is_text:
                # Text-Text similarity
                emb1 = model.encode_text([input1])
                emb2 = model.encode_text([input2])
            elif input1_is_image and input2_is_image:
                # Image-Image similarity
                image1 = Image.fromarray(input1)
                image2 = Image.fromarray(input2)
                emb1 = model.encode_image([image1])
                emb2 = model.encode_image([image2])
            else:
                # Image-Text similarity (either order)
                if input1_is_image:
                    image = Image.fromarray(input1)
                    text = input2
                    emb1 = model.encode_image([image])
                    emb2 = model.encode_text([text])
                else:
                    image = Image.fromarray(input2)
                    text = input1
                    emb1 = model.encode_text([text])
                    emb2 = model.encode_image([image])

        # Normalize defensively so the dot product is a true cosine similarity
        # (a no-op if the model already returns unit-norm embeddings)
        emb1 = emb1 / np.linalg.norm(emb1, axis=-1, keepdims=True)
        emb2 = emb2 / np.linalg.norm(emb2, axis=-1, keepdims=True)
        similarity_score = (emb1 @ emb2.T).item()
        # Format for the output textbox, which also carries error strings
        return f"{similarity_score:.4f}"
    except Exception as e:
        return f"Error: {str(e)}"

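# Example (hypothetical strings, for illustration only): calling
#   compute_similarity("Text", "a photo of a dog", None, "Text", "a photo of a puppy", None)
# encodes both texts and returns their cosine similarity, a value in [-1, 1],
# formatted to four decimal places.
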
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between two inputs: **Text-Text, Image-Image, or Image-Text**.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)
    def update_visibility(input1_type, input2_type):
        # gr.update(visible=...) is required here: returning raw booleans would
        # set the components' *values* rather than toggle their visibility
        return (
            gr.update(visible=input1_type == "Text"),   # Input 1 text visibility
            gr.update(visible=input1_type == "Image"),  # Input 1 image visibility
            gr.update(visible=input2_type == "Text"),   # Input 2 text visibility
            gr.update(visible=input2_type == "Image"),  # Input 2 image visibility
        )

    # Both radios share the same callback and output set, so switching either
    # type re-evaluates visibility for all four input widgets
    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)

demo.launch()
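# Note: demo.launch(share=True) would additionally expose a temporary public URL,
# a standard Gradio option; it is unnecessary when hosted, e.g., on a Space.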