import gradio as gr
import numpy as np
from PIL import Image
from transformers import AutoModel

# Load JinaAI CLIP model
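# trust_remote_code=True is required because jina-clip-v1 ships its own modeling code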
model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)

# Function to compute similarity
def compute_similarity(input1, input2, input1_type, input2_type):
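    """Encode each input with the matching encoder and return their cosine similarity."""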
    # Check if inputs are empty
    if input1_type == "Text" and not input1.strip():
        return "Error: Input 1 is empty!"
    if input2_type == "Text" and not input2.strip():
        return "Error: Input 2 is empty!"
    if input1_type == "Image" and input1 is None:
        return "Error: Image 1 is missing!"
    if input2_type == "Image" and input2 is None:
        return "Error: Image 2 is missing!"

    inputs = []

    # Process first input
    if input1_type == "Text":
        text1_embedding = model.encode_text([input1])
        inputs.append(text1_embedding)
    elif input1_type == "Image":
        image1_embedding = model.encode_image([Image.fromarray(input1)])
        inputs.append(image1_embedding)

    # Process second input
    if input2_type == "Text":
        text2_embedding = model.encode_text([input2])
        inputs.append(text2_embedding)
    elif input2_type == "Image":
        image2_embedding = model.encode_image([Image.fromarray(input2)])
        inputs.append(image2_embedding)

    # Cosine similarity: normalize explicitly so the dot product is a true
    # cosine score, in case the model's embeddings are not unit-length
    emb1 = inputs[0] / np.linalg.norm(inputs[0], axis=-1, keepdims=True)
    emb2 = inputs[1] / np.linalg.norm(inputs[1], axis=-1, keepdims=True)
    similarity_score = (emb1 @ emb2.T).item()

    return round(similarity_score, 4)

# Function to update UI based on selected input types
def update_visibility(input1_type, input2_type):
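    """Show the widget matching each selected input type and clear all input values."""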
    return (
        gr.update(visible=input1_type == "Text", value=""),
        gr.update(visible=input1_type == "Image", value=None),
        gr.update(visible=input2_type == "Text", value=""),
        gr.update(visible=input2_type == "Image", value=None),
    )

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## JinaAI CLIP Multimodal Similarity")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Image")

    with gr.Row():
        input1 = gr.Textbox(label="Text Input 1", visible=True)
        image1 = gr.Image(type="numpy", label="Image Input 1", visible=False)
    
    with gr.Row():
        input2 = gr.Textbox(label="Text Input 2", visible=False)
        image2 = gr.Image(type="numpy", label="Image Input 2", visible=True)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

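    # Re-render the paired input widgets whenever either type selector changes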
    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1, image1, input2, image2])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1, image1, input2, image2])

    btn = gr.Button("Compute Similarity")
    btn.click(compute_similarity, inputs=[input1, input2, input1_type, input2_type], outputs=output)

demo.launch()