"""Gradio demo: multimodal similarity (text/image) with the JinaAI CLIP model."""

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

# Load JinaAI CLIP model. The repo ships custom modeling code, hence
# trust_remote_code=True (only safe because the source is a known publisher).
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)


def _l2_normalize(emb):
    """L2-normalize embedding rows so dot products become cosine similarities."""
    emb = np.asarray(emb, dtype=np.float32)
    return emb / np.linalg.norm(emb, axis=-1, keepdims=True)


def compute_similarity(input1_type, input1_text, input1_image, input2_type, input2_text, input2_image):
    """
    Computes cosine similarity between:
    - Text-Text
    - Image-Image
    - Text-Image & Image-Text

    Returns:
        float cosine similarity in [-1, 1], or an error string when the
        inputs are missing/invalid or the model raises.
    """
    # Resolve input 1. Guard with (x or "") — an untouched/cleared Textbox
    # can deliver None, and None.strip() would raise AttributeError.
    if input1_type == "Text":
        input1 = (input1_text or "").strip()
        input1_is_text = bool(input1)
        input1_is_image = False
    else:
        input1 = input1_image
        input1_is_text = False
        input1_is_image = input1 is not None

    # Resolve input 2 the same way.
    if input2_type == "Text":
        input2 = (input2_text or "").strip()
        input2_is_text = bool(input2)
        input2_is_image = False
    else:
        input2 = input2_image
        input2_is_text = False
        input2_is_image = input2 is not None

    # Ensure both sides carry a usable payload before touching the model.
    if not (input1_is_text or input1_is_image) or not (input2_is_text or input2_is_image):
        return "Error: Please provide valid inputs (text or image) for both fields!"

    try:
        with torch.no_grad():
            if input1_is_text and input2_is_text:
                # Text-Text similarity
                emb1 = model.encode_text([input1])
                emb2 = model.encode_text([input2])
            elif input1_is_image and input2_is_image:
                # Image-Image similarity (Gradio hands us numpy arrays)
                emb1 = model.encode_image([Image.fromarray(input1)])
                emb2 = model.encode_image([Image.fromarray(input2)])
            elif input1_is_image:
                # Image-Text similarity
                emb1 = model.encode_image([Image.fromarray(input1)])
                emb2 = model.encode_text([input2])
            else:
                # Text-Image similarity
                emb1 = model.encode_text([input1])
                emb2 = model.encode_image([Image.fromarray(input2)])

        # BUG FIX: the embeddings are not guaranteed to be unit-length, so a
        # raw dot product is NOT a cosine similarity. Normalize both sides
        # first; the dot product of unit vectors is the cosine of their angle.
        emb1 = _l2_normalize(emb1)
        emb2 = _l2_normalize(emb2)
        return (emb1 @ emb2.T).item()

    except Exception as e:
        # Surface model/runtime failures in the UI instead of crashing the app.
        return f"Error: {str(e)}"


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare similarity between two inputs: **Text-Text, Image-Image, or Image-Text**.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        """Show the text or image widget for each input based on its radio choice."""
        # BUG FIX: the outputs of this callback are components, so returning
        # bare booleans would overwrite the components' *values* with
        # True/False. gr.update(visible=...) is required to toggle visibility.
        return (
            gr.update(visible=input1_type == "Text"),   # Input 1 text
            gr.update(visible=input1_type == "Image"),  # Input 1 image
            gr.update(visible=input2_type == "Text"),   # Input 2 text
            gr.update(visible=input2_type == "Image"),  # Input 2 image
        )

    input1_type.change(
        update_visibility,
        inputs=[input1_type, input2_type],
        outputs=[input1_text, input1_image, input2_text, input2_image],
    )
    input2_type.change(
        update_visibility,
        inputs=[input1_type, input2_type],
        outputs=[input1_text, input1_image, input2_text, input2_image],
    )

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(
        compute_similarity,
        inputs=[input1_type, input1_text, input1_image, input2_type, input2_text, input2_image],
        outputs=output,
    )

demo.launch()