import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel

# jina-clip-v1 ships custom modeling code, so trust_remote_code is required.
# The remote code exposes the encode_text() and encode_image() helpers used below.
model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)

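# Helper added for clarity (a small addition, not part of the original script):
# the original compares embeddings with a raw dot product, which equals cosine
# similarity only if the embeddings are already L2-normalized. Normalizing
# explicitly makes the score a true cosine similarity in [-1, 1] either way.
def cosine_similarity(emb1, emb2):
    """Cosine similarity between two (1, D) embeddings (numpy or torch)."""
    emb1 = torch.as_tensor(emb1, dtype=torch.float32)
    emb2 = torch.as_tensor(emb2, dtype=torch.float32)
    emb1 = emb1 / emb1.norm(dim=-1, keepdim=True)
    emb2 = emb2 / emb2.norm(dim=-1, keepdim=True)
    return (emb1 @ emb2.T).item()
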
def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """Compute cosine similarity for Text-Text, Image-Image, or Text-Image comparisons."""
    # Validate whichever input each side is configured to use.
    if input1_type == "Text" and not (input1_text and input1_text.strip()):
        return "Error: Input 1 (Text) is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload a valid image for Input 1!"

    if input2_type == "Text" and not (input2_text and input2_text.strip()):
        return "Error: Input 2 (Text) is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload a valid image for Input 2!"

    try:
        with torch.no_grad():
            # Encode each input with the matching jina-clip encoder.
            if input1_type == "Text":
                emb1 = model.encode_text([input1_text])
            else:
                emb1 = model.encode_image([Image.fromarray(input1_image)])

            if input2_type == "Text":
                emb2 = model.encode_text([input2_text])
            else:
                emb2 = model.encode_image([Image.fromarray(input2_image)])

        score = cosine_similarity(emb1, emb2)
        return f"{score:.4f}"

    except Exception as e:
        return f"Error: {str(e)}"

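# Example call outside the UI (illustrative only; the sample strings are made
# up and not part of the original script):
#   compute_similarity("Text", "a photo of a cat", None,
#                      "Text", "a small kitten", None)
# returns the Text-Text cosine similarity formatted to four decimal places.
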
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)

    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    # Toggle visibility so only the component matching each selected type is
    # shown. gr.update(visible=...) is needed here; returning bare booleans
    # would overwrite the components' values instead of hiding them.
    def update_visibility(input1_type, input2_type):
        return (
            gr.update(visible=input1_type == "Text"),
            gr.update(visible=input1_type == "Image"),
            gr.update(visible=input2_type == "Text"),
            gr.update(visible=input2_type == "Image"),
        )

    input1_type.change(update_visibility, inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(
        compute_similarity,
        inputs=[input1_type, input1_text, input1_image,
                input2_type, input2_text, input2_image],
        outputs=output,
    )

demo.launch()
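
# To run locally (assumed setup, not specified by the original script; the
# filename app.py is just an example):
#   pip install gradio torch transformers pillow einops timm
#   python app.py
# einops and timm are typically needed by jina-clip-v1's remote modeling code;
# demo.launch(share=True) can be used instead if a public link is wanted.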