import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch

# Load JinaAI CLIP model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
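
# Inference-only usage: eval() keeps any dropout/normalization layers
# deterministic. (The encode_* helpers already avoid gradient tracking,
# so this is defensive rather than strictly required.)
model.eval()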

def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """Computes similarity for Text-Text, Image-Image, or Text-Image comparisons."""

    # Defensive guard: if a stale event ever writes a boolean into an image
    # component, treat it as "no image" instead of crashing downstream.
    input1_image = None if isinstance(input1_image, bool) else input1_image
    input2_image = None if isinstance(input2_image, bool) else input2_image

    # Validate inputs
    if input1_type == "Text" and not (input1_text or "").strip():
        return "Error: Input 1 (Text) is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload a valid image for Input 1!"
    if input2_type == "Text" and not (input2_text or "").strip():
        return "Error: Input 2 (Text) is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload a valid image for Input 2!"

    try:
        with torch.no_grad():
            if input1_type == "Text" and input2_type == "Text":
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_text([input2_text])
            elif input1_type == "Image" and input2_type == "Image":
                emb1 = model.encode_image([Image.fromarray(input1_image.astype("uint8"))])
                emb2 = model.encode_image([Image.fromarray(input2_image.astype("uint8"))])
            else:
                if input1_type == "Image":
                    emb1 = model.encode_image([Image.fromarray(input1_image.astype("uint8"))])
                    emb2 = model.encode_text([input2_text])
                else:
                    emb1 = model.encode_text([input1_text])
                    emb2 = model.encode_image([Image.fromarray(input2_image.astype("uint8"))])

            # jina-clip-v1's encode helpers return L2-normalized embeddings by
            # default, so the dot product below is the cosine similarity.
            similarity_score = (emb1 @ emb2.T).item()
        return f"{similarity_score:.4f}"

    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)

    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        # Return gr.update(...) so only visibility changes; returning raw
        # booleans would overwrite each component's value instead (which is
        # how the image inputs could end up holding `False`).
        return (
            gr.update(visible=input1_type == "Text"),   # Text input 1
            gr.update(visible=input1_type == "Image"),  # Image input 1
            gr.update(visible=input2_type == "Text"),   # Text input 2
            gr.update(visible=input2_type == "Image"),  # Image input 2
        )

    input1_type.change(
        update_visibility,
        inputs=[input1_type, input2_type],
        outputs=[input1_text, input1_image, input2_text, input2_image],
    )
    input2_type.change(
        update_visibility,
        inputs=[input1_type, input2_type],
        outputs=[input1_text, input1_image, input2_text, input2_image],
    )

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(
        compute_similarity,
        inputs=[input1_type, input1_text, input1_image,
                input2_type, input2_text, input2_image],
        outputs=output,
    )

demo.launch()
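
# Quick sanity check (a sketch, not part of the app): run these lines in place
# of demo.launch() while debugging. The example strings are arbitrary; since
# the embeddings are unit-normalized, scores land in [-1, 1] and related
# sentences should score well above unrelated ones.
#
#   emb_a = model.encode_text(["a photo of a cat"])
#   emb_b = model.encode_text(["a picture of a kitten"])
#   print((emb_a @ emb_b.T).item())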