import gradio as gr
from transformers import AutoModel
from PIL import Image
import torch
import numpy as np

# Load JinaAI CLIP model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
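# NOTE: the model loads on CPU by default; if a CUDA GPU is available it can
# be moved with model.to("cuda") (standard PyTorch device placement).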

def compute_similarity(input1, input2):
    """
    Computes similarity between:
    - Image and Text
    - Image and Image
    - Text and Text
    """
    
    # Detect input types
    input1_is_text = isinstance(input1, str) and input1.strip() != ""
    input2_is_text = isinstance(input2, str) and input2.strip() != ""
    input1_is_image = isinstance(input1, np.ndarray)
    input2_is_image = isinstance(input2, np.ndarray)

    # Ensure valid input
    if not (input1_is_text or input1_is_image) or not (input2_is_text or input2_is_image):
        return "Error: Both inputs must be valid (image or text)!"

    try:
        with torch.no_grad():
            if input1_is_text and input2_is_text:
                # Text-Text Similarity
                emb1 = model.encode_text([input1])
                emb2 = model.encode_text([input2])
            elif input1_is_image and input2_is_image:
                # Image-Image Similarity
                image1 = Image.fromarray(input1)
                image2 = Image.fromarray(input2)
                emb1 = model.encode_image([image1])
                emb2 = model.encode_image([image2])
            else:
                # Image-Text Similarity
                if input1_is_image:
                    image = Image.fromarray(input1)
                    text = input2
                    emb1 = model.encode_image([image])
                    emb2 = model.encode_text([text])
                else:
                    image = Image.fromarray(input2)
                    text = input1
                    emb1 = model.encode_text([text])
                    emb2 = model.encode_image([image])

            # Normalize defensively so the dot product is a true cosine
            # similarity even if the encoder's outputs are not unit-length
            emb1 = np.asarray(emb1, dtype=np.float32)
            emb2 = np.asarray(emb2, dtype=np.float32)
            emb1 = emb1 / np.linalg.norm(emb1, axis=-1, keepdims=True)
            emb2 = emb2 / np.linalg.norm(emb2, axis=-1, keepdims=True)
            similarity_score = (emb1 @ emb2.T).item()

        return f"{similarity_score:.4f}"

    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI: gr.Blocks (rather than gr.Interface) is used so the text/image
# widgets can be shown or hidden when the input type changes, and so all six
# components can be routed into a single handler. (gr.Interface would pass
# every component to compute_similarity, which only takes two arguments, and
# components cannot be referenced by label strings in event wiring.)
with gr.Blocks(title="JinaAI CLIP Multimodal Similarity") as demo:
    gr.Markdown("## JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between two inputs (Text, Image, or both).")

    with gr.Row():
        with gr.Column():
            input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
            text1 = gr.Textbox(label="Text Input 1", visible=True)
            image1 = gr.Image(type="numpy", label="Image Input 1", visible=False)
        with gr.Column():
            input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")
            text2 = gr.Textbox(label="Text Input 2", visible=True)
            image2 = gr.Image(type="numpy", label="Image Input 2", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)
    compute_btn = gr.Button("Compute Similarity")

    # Show the textbox or the image widget depending on the radio choice
    def update_visibility(input_type):
        return (
            gr.update(visible=input_type == "Text"),
            gr.update(visible=input_type == "Image"),
        )

    input1_type.change(update_visibility, inputs=input1_type, outputs=[text1, image1])
    input2_type.change(update_visibility, inputs=input2_type, outputs=[text2, image2])

    # Route whichever widget is active on each side into compute_similarity
    def run(type1, txt1, img1, type2, txt2, img2):
        in1 = txt1 if type1 == "Text" else img1
        in2 = txt2 if type2 == "Text" else img2
        return compute_similarity(in1, in2)

    compute_btn.click(
        run,
        inputs=[input1_type, text1, image1, input2_type, text2, image2],
        outputs=output,
    )

demo.launch()