import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

# Load the JinaAI CLIP model (ships custom modeling code, hence trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
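# Note: from_pretrained returns the model already in eval mode, and the
# torch.no_grad() block below disables gradient tracking, so no extra setup is
# needed for CPU inference. (Assumption: with a CUDA build of torch you could
# move the model to a GPU, e.g. model.to('cuda'), to speed up encoding.)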

def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """
    Computes similarity between:
    - Text-Text
    - Image-Image
    - Text-Image & Image-Text
    """
    
    # Validate inputs
    if input1_type == "Text" and not input1_text.strip():
        return "Error: Input 1 is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload an image for Input 1!"

    if input2_type == "Text" and not input2_text.strip():
        return "Error: Input 2 is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload an image for Input 2!"

    try:
        with torch.no_grad():
            if input1_type == "Text" and input2_type == "Text":
                # Text-Text Similarity
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_text([input2_text])
            elif input1_type == "Image" and input2_type == "Image":
                # Image-Image Similarity
                emb1 = model.encode_image([Image.fromarray(input1_image)])
                emb2 = model.encode_image([Image.fromarray(input2_image)])
            else:
                # Image-Text Similarity (either order)
                if input1_type == "Image":
                    emb1 = model.encode_image([Image.fromarray(input1_image)])
                    emb2 = model.encode_text([input2_text])
                else:
                    emb1 = model.encode_text([input1_text])
                    emb2 = model.encode_image([Image.fromarray(input2_image)])

            # Cosine similarity: L2-normalize each embedding, then take the dot product
            emb1 = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
            emb2 = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)
            similarity_score = (emb1 @ emb2.T).item()

        return f"{similarity_score:.4f}"

    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between **Text-Text, Image-Image, or Image-Text**.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)

    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)
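    # type="numpy" hands compute_similarity each image as an RGB uint8 array,
    # which Image.fromarray converts back to a PIL image for the encoder.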

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        # Return gr.update(...) objects so only visibility changes;
        # returning bare booleans would overwrite the component values instead.
        return (
            gr.update(visible=input1_type == "Text"),   # Input 1 text
            gr.update(visible=input1_type == "Image"),  # Input 1 image
            gr.update(visible=input2_type == "Text"),   # Input 2 text
            gr.update(visible=input2_type == "Image"),  # Input 2 image
        )

    input1_type.change(
        update_visibility,
        inputs=[input1_type, input2_type],
        outputs=[input1_text, input1_image, input2_text, input2_image],
    )
    input2_type.change(
        update_visibility,
        inputs=[input1_type, input2_type],
        outputs=[input1_text, input1_image, input2_text, input2_image],
    )

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(
        compute_similarity,
        inputs=[input1_type, input1_text, input1_image,
                input2_type, input2_text, input2_image],
        outputs=output,
    )

demo.launch()
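
# To serve the demo beyond localhost, Gradio can create a temporary public link:
# demo.launch(share=True)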