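# Gradio demo: compare two inputs (text or image) using jina-clip-v1 embeddings.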
import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch
import numpy as np
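
# jina-clip-v1 ships its own encode_text / encode_image implementations,
# so trust_remote_code=True is required when loading it via AutoModel.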
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)


def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """
    Computes similarity between:
    - Text-Text
    - Image-Image
    - Text-Image & Image-Text
    """
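    # Resolve each input to a single value and record its modality so the
    # matching encoder can be chosen below.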
    if input1_type == "Text":
        input1 = input1_text.strip()
        input1_is_text = bool(input1)
        input1_is_image = False
    else:
        input1 = input1_image
        input1_is_text = False
        input1_is_image = input1 is not None

    if input2_type == "Text":
        input2 = input2_text.strip()
        input2_is_text = bool(input2)
        input2_is_image = False
    else:
        input2 = input2_image
        input2_is_text = False
        input2_is_image = input2 is not None
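
    # Bail out early if either side is empty or missing.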
    if not (input1_is_text or input1_is_image) or not (input2_is_text or input2_is_image):
        return "Error: Please provide valid inputs (text or image) for both fields!"

    try:
        with torch.no_grad():
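            # Encode each side with the encoder that matches its modality.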
            if input1_is_text and input2_is_text:
                emb1 = model.encode_text([input1])
                emb2 = model.encode_text([input2])
            elif input1_is_image and input2_is_image:
                image1 = Image.fromarray(input1)
                image2 = Image.fromarray(input2)
                emb1 = model.encode_image([image1])
                emb2 = model.encode_image([image2])
            else:
                if input1_is_image:
                    image = Image.fromarray(input1)
                    text = input2
                    emb1 = model.encode_image([image])
                    emb2 = model.encode_text([text])
                else:
                    image = Image.fromarray(input2)
                    text = input1
                    emb1 = model.encode_text([text])
                    emb2 = model.encode_image([image])

        # Report the cosine similarity; normalize explicitly in case the
        # embeddings returned by the encoders are not already unit-length.
        emb1 = emb1 / np.linalg.norm(emb1)
        emb2 = emb2 / np.linalg.norm(emb2)
        similarity_score = (emb1 @ emb2.T).item()

        return similarity_score

    except Exception as e:
        return f"Error: {str(e)}"


with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between two inputs: **Text-Text, Image-Image, or Image-Text**.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)

    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)
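
    # Show only the widgets that match each selected input type.
    # Visibility is toggled with gr.update(...); returning bare booleans would
    # overwrite the component values instead of hiding/showing them.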
    def update_visibility(input1_type, input2_type):
        return (
            gr.update(visible=input1_type == "Text"),
            gr.update(visible=input1_type == "Image"),
            gr.update(visible=input2_type == "Text"),
            gr.update(visible=input2_type == "Image"),
        )
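
    # Re-evaluate visibility whenever either type selector changes.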
    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)

demo.launch()