import gradio as gr
from transformers import AutoModel
from PIL import Image
import numpy as np
import torch

# Load the JinaAI CLIP model (trust_remote_code enables the repo's custom modeling code)
model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)
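# Note: encode_text / encode_image come from that custom code (per the jina-clip-v1 model
# card); each takes a list and returns one embedding per item, e.g.
#   emb = model.encode_text(["a photo of a cat"])  # -> shape (1, embedding_dim)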

# Function to encode one input (text directly, or an image converted to a PIL image first)
def process_input(input_data, input_type):
    if input_type == "Text":
        return model.encode_text([input_data]) if input_data.strip() else None
    elif input_type == "Image":
        if isinstance(input_data, str):  # If it's a file path
            image = Image.open(input_data).convert("RGB")
        elif isinstance(input_data, np.ndarray):  # If it's a NumPy array (Gradio default)
            image = Image.fromarray(input_data)
        else:
            return None  # Unsupported input type
        return model.encode_image([image])
    return None

# Function to compute similarity between the two selected inputs.
# All four input components are passed in (Gradio forwards every component listed in
# `inputs`), and the values matching the selected types are used.
def compute_similarity(text1, image1, text2, image2, input1_type, input2_type):
    input1 = text1 if input1_type == "Text" else image1
    input2 = text2 if input2_type == "Text" else image2

    # Validate inputs
    if input1_type == "Text" and not input1.strip():
        return "Error: Input 1 is empty!"
    if input2_type == "Text" and not input2.strip():
        return "Error: Input 2 is empty!"
    if input1_type == "Image" and input1 is None:
        return "Error: Image 1 is missing!"
    if input2_type == "Image" and input2 is None:
        return "Error: Image 2 is missing!"

    # Process inputs
    embedding1 = process_input(input1, input1_type)
    embedding2 = process_input(input2, input2_type)
    if embedding1 is None or embedding2 is None:
        return "Error: Failed to process input!"

    # Compute cosine similarity (L2-normalize so the dot product is a true cosine score)
    embedding1 = np.asarray(embedding1, dtype=np.float32)
    embedding2 = np.asarray(embedding2, dtype=np.float32)
    embedding1 = embedding1 / np.linalg.norm(embedding1, axis=-1, keepdims=True)
    embedding2 = embedding2 / np.linalg.norm(embedding2, axis=-1, keepdims=True)
    similarity_score = (embedding1 @ embedding2.T).item()
    return f"Similarity Score: {similarity_score:.4f}"

# Function to toggle input fields dynamically
def update_visibility(input1_type, input2_type):
    return (
        gr.update(visible=(input1_type == "Text")),
        gr.update(visible=(input1_type == "Image")),
        gr.update(visible=(input2_type == "Text")),
        gr.update(visible=(input2_type == "Image")),
    )
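# Each gr.update(visible=...) above targets one of the four components listed in `outputs`
# of the .change() handlers below, in order: text 1, image 1, text 2, image 2.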

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## JinaAI CLIP Multimodal Similarity")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Image")

    with gr.Row():
        input1_text = gr.Textbox(label="Text Input 1", visible=True)
        input1_image = gr.Image(type="numpy", interactive=True, label="Image Input 1", visible=False)

    with gr.Row():
        input2_text = gr.Textbox(label="Text Input 2", visible=False)
        input2_image = gr.Image(type="numpy", interactive=True, label="Image Input 2", visible=True)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    # Toggle visibility of inputs dynamically
    input1_type.change(update_visibility, inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility, inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])
    btn = gr.Button("Compute Similarity")
    # Pass all four input components plus the two type selectors so image inputs reach compute_similarity
    btn.click(compute_similarity,
              inputs=[input1_text, input1_image, input2_text, input2_image, input1_type, input2_type],
              outputs=output)

demo.launch()
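# Note: launch() serves the app locally by default; Gradio's launch() also accepts
# share=True for a temporary public link, e.g. demo.launch(share=True).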