import gradio as gr
from transformers import AutoModel
from PIL import Image
import numpy as np

# Load the JinaAI CLIP model; trust_remote_code=True pulls in the custom
# encode_text / encode_image helpers that ship with the checkpoint
model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)
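# The first run downloads the model weights; inference happens on CPU by
# default. A minimal sketch for moving to a GPU (assumes the checkpoint's
# custom encode helpers respect the model's device; not verified here):
#   import torch
#   if torch.cuda.is_available():
#       model = model.to("cuda")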

# Encode a single input (raw text or an image) into an embedding vector
def process_input(input_data, input_type):
    if input_type == "Text":
        return model.encode_text([input_data]) if input_data.strip() else None
    elif input_type == "Image":
        if isinstance(input_data, str):  # If it's a file path
            image = Image.open(input_data).convert("RGB")
        elif isinstance(input_data, np.ndarray):  # If it's a NumPy array (Gradio default)
            image = Image.fromarray(input_data)
        else:
            return None  # Invalid input type
        return model.encode_image([image])
    return None

# Compute the similarity between the two selected inputs
def compute_similarity(input1_text, input1_image, input2_text, input2_image,
                       input1_type, input2_type):
    # Pick the widget value that matches each selected type; the click handler
    # passes every widget, visible or not
    input1 = input1_text if input1_type == "Text" else input1_image
    input2 = input2_text if input2_type == "Text" else input2_image

    # Validate inputs
    if input1_type == "Text" and not input1.strip():
        return "Error: Input 1 is empty!"
    if input2_type == "Text" and not input2.strip():
        return "Error: Input 2 is empty!"
    if input1_type == "Image" and input1 is None:
        return "Error: Image 1 is missing!"
    if input2_type == "Image" and input2 is None:
        return "Error: Image 2 is missing!"

    # Process inputs
    embedding1 = process_input(input1, input1_type)
    embedding2 = process_input(input2, input2_type)

    if embedding1 is None or embedding2 is None:
        return "Error: Failed to process input!"

    # Compute cosine similarity; the embeddings are not guaranteed to be
    # unit-length, so normalize before taking the dot product
    e1 = np.asarray(embedding1).ravel()
    e2 = np.asarray(embedding2).ravel()
    similarity_score = float(np.dot(e1, e2) / (np.linalg.norm(e1) * np.linalg.norm(e2)))
    return f"Similarity Score: {similarity_score:.4f}"

# Function to toggle input fields dynamically
def update_visibility(input1_type, input2_type):
    return (
        gr.update(visible=(input1_type == "Text")),  
        gr.update(visible=(input1_type == "Image")), 
        gr.update(visible=(input2_type == "Text")),  
        gr.update(visible=(input2_type == "Image"))
    )

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## JinaAI CLIP Multimodal Similarity")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Image")

    with gr.Row():
        input1_text = gr.Textbox(label="Text Input 1", visible=True)
        input1_image = gr.Image(type="numpy", interactive=True, label="Image Input 1", visible=False)

    with gr.Row():
        input2_text = gr.Textbox(label="Text Input 2", visible=False)
        input2_image = gr.Image(type="numpy", interactive=True, label="Image Input 2", visible=True)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    # Toggle visibility of inputs dynamically
    input1_type.change(update_visibility, inputs=[input1_type, input2_type], 
                       outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], 
                       outputs=[input1_text, input1_image, input2_text, input2_image])

    btn = gr.Button("Compute Similarity")
    btn.click(compute_similarity,
              inputs=[input1_text, input1_image, input2_text, input2_image,
                      input1_type, input2_type],
              outputs=output)

demo.launch()
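
# demo.launch() serves the app locally; demo.launch(share=True) additionally
# creates a temporary public Gradio link, handy for quick demos.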