File size: 3,992 Bytes
e074ee9
0e73f03
e074ee9
 
cf604df
b57f8d5
7151f63
cf604df
e074ee9
cf16f32
cf604df
 
cf16f32
 
 
cf604df
cf16f32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4bfa5f
cf604df
 
cf16f32
7cc3d8e
cf604df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4bfa5f
cf604df
 
7cc3d8e
cf604df
7cc3d8e
cf604df
 
7cc3d8e
7151f63
cf16f32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e074ee9
b57f8d5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch
import numpy as np

# Load JinaAI CLIP model once at import time (shared by all requests).
# trust_remote_code=True is required: jina-clip-v1 ships custom modeling code
# that provides encode_text / encode_image.
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)

def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """
    Compute the cosine similarity between two inputs using the JinaAI CLIP model.

    Supports all pairings:
    - Text-Text
    - Image-Image
    - Text-Image and Image-Text

    Parameters
    ----------
    input1_type, input2_type : str
        Either "Text" or "Image"; selects which of the paired widgets is used.
    input1_text, input2_text : str or None
        Textbox contents (Gradio may pass None when the textbox is hidden).
    input2_image, input1_image : numpy.ndarray or None
        Image contents as H x W x C arrays (``gr.Image(type="numpy")``).

    Returns
    -------
    float or str
        Cosine similarity in [-1, 1], or an error message string.
    """

    # Determine input types. Guard with `or ""`: a hidden textbox can yield
    # None, and None.strip() would raise AttributeError.
    if input1_type == "Text":
        input1 = (input1_text or "").strip()
        input1_is_text = bool(input1)
        input1_is_image = False
    else:
        input1 = input1_image
        input1_is_text = False
        input1_is_image = input1 is not None

    if input2_type == "Text":
        input2 = (input2_text or "").strip()
        input2_is_text = bool(input2)
        input2_is_image = False
    else:
        input2 = input2_image
        input2_is_text = False
        input2_is_image = input2 is not None

    # Ensure valid input on both sides before touching the model.
    if not (input1_is_text or input1_is_image) or not (input2_is_text or input2_is_image):
        return "Error: Please provide valid inputs (text or image) for both fields!"

    try:
        with torch.no_grad():
            if input1_is_text and input2_is_text:
                # Text-Text Similarity
                emb1 = model.encode_text([input1])
                emb2 = model.encode_text([input2])
            elif input1_is_image and input2_is_image:
                # Image-Image Similarity
                image1 = Image.fromarray(input1)
                image2 = Image.fromarray(input2)
                emb1 = model.encode_image([image1])
                emb2 = model.encode_image([image2])
            else:
                # Image-Text Similarity (either ordering)
                if input1_is_image:
                    image = Image.fromarray(input1)
                    text = input2
                    emb1 = model.encode_image([image])
                    emb2 = model.encode_text([text])
                else:
                    image = Image.fromarray(input2)
                    text = input1
                    emb1 = model.encode_text([text])
                    emb2 = model.encode_image([image])

            # Compute true cosine similarity. The raw dot product is only a
            # cosine when both embeddings are unit-norm, which the model does
            # not guarantee — normalize explicitly.
            v1 = np.asarray(emb1, dtype=np.float64).reshape(-1)
            v2 = np.asarray(emb2, dtype=np.float64).reshape(-1)
            norm1 = np.linalg.norm(v1)
            norm2 = np.linalg.norm(v2)
            if norm1 == 0.0 or norm2 == 0.0:
                return "Error: Received a zero-magnitude embedding; cannot compute similarity."
            similarity_score = float(np.dot(v1, v2) / (norm1 * norm2))

        return similarity_score

    except Exception as e:
        # Surface any model/conversion failure to the UI rather than crashing
        # the Gradio worker.
        return f"Error: {str(e)}"

# Gradio UI: two inputs, each switchable between a textbox and an image upload.
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between two inputs: **Text-Text, Image-Image, or Image-Text**.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)

    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        """Toggle which widget (textbox vs. image) is shown for each input.

        Must return gr.update(visible=...) objects: returning bare booleans
        would be interpreted as new component *values* (the textboxes would
        literally display "True"/"False") instead of a visibility change.
        """
        return (
            gr.update(visible=input1_type == "Text"),   # Input 1 textbox
            gr.update(visible=input1_type == "Image"),  # Input 1 image
            gr.update(visible=input2_type == "Text"),   # Input 2 textbox
            gr.update(visible=input2_type == "Image"),  # Input 2 image
        )

    # Re-evaluate visibility of all four widgets whenever either radio changes.
    input1_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type], outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity, inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image], outputs=output)

demo.launch()