import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch

# Load JinaAI CLIP model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
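model.eval()  # inference only; complements torch.no_grad() in compute_similarity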

def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """Computes similarity for Text-Text, Image-Image, or Text-Image comparisons."""

    # Validate inputs before encoding (guard against None from hidden widgets)
    if input1_type == "Text" and not (input1_text or "").strip():
        return "Error: Input 1 (Text) is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload a valid image for Input 1!"

    if input2_type == "Text" and not (input2_text or "").strip():
        return "Error: Input 2 (Text) is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload a valid image for Input 2!"

    try:
        with torch.no_grad():
            if input1_type == "Text" and input2_type == "Text":
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_text([input2_text])
            elif input1_type == "Image" and input2_type == "Image":
                emb1 = model.encode_image([Image.fromarray(input1_image)])
                emb2 = model.encode_image([Image.fromarray(input2_image)])
            elif input1_type == "Image":
                # Cross-modal: image vs. text
                emb1 = model.encode_image([Image.fromarray(input1_image)])
                emb2 = model.encode_text([input2_text])
            else:
                # Cross-modal: text vs. image
                emb1 = model.encode_text([input1_text])
                emb2 = model.encode_image([Image.fromarray(input2_image)])

            # encode_text()/encode_image() return L2-normalized embeddings by
            # default, so the dot product is a cosine similarity in [-1, 1].
            similarity_score = (emb1 @ emb2.T).item()
        return f"{similarity_score:.4f}"

    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)

    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)
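    # type="numpy" delivers uploads as NumPy arrays, which compute_similarity
    # converts back to PIL images via Image.fromarray().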

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        """Toggle text/image widgets to match the selected input types."""
        # Returning bare booleans would overwrite the component values;
        # gr.update(visible=...) changes visibility instead.
        return (
            gr.update(visible=input1_type == "Text"),   # show text input 1
            gr.update(visible=input1_type == "Image"),  # show image input 1
            gr.update(visible=input2_type == "Text"),   # show text input 2
            gr.update(visible=input2_type == "Image"),  # show image input 2
        )

    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)

demo.launch()
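
# Assumed setup (script name and package list are illustrative): jina-clip-v1's
# remote code additionally requires einops and timm.
#   pip install gradio transformers torch pillow einops timm
#   python app.py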