import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

# Load the JinaAI CLIP model once at startup. trust_remote_code is required
# because jina-clip-v1 ships custom modeling code on the Hugging Face Hub.
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)


def compute_similarity(input1_type, input1_text, input1_image,
                       input2_type, input2_text, input2_image):
    """Compute cosine similarity for Text-Text, Image-Image, or Text-Image pairs.

    Args:
        input1_type: "Text" or "Image" — modality selector for input 1.
        input1_text: text for input 1 (may be None while the textbox is hidden).
        input1_image: numpy image array for input 1 (may be None/False).
        input2_type: "Text" or "Image" — modality selector for input 2.
        input2_text: text for input 2 (may be None while the textbox is hidden).
        input2_image: numpy image array for input 2 (may be None/False).

    Returns:
        A float cosine similarity in [-1, 1], or an "Error: ..." string when
        validation fails or encoding raises.
    """
    # Gradio sometimes sends `False` instead of `None` for cleared images.
    input1_image = None if isinstance(input1_image, bool) else input1_image
    input2_image = None if isinstance(input2_image, bool) else input2_image

    # Validate inputs. `or ""` guards against None from a hidden textbox,
    # which would otherwise crash on .strip().
    if input1_type == "Text" and not (input1_text or "").strip():
        return "Error: Input 1 (Text) is empty!"
    if input1_type == "Image" and input1_image is None:
        return "Error: Please upload a valid image for Input 1!"
    if input2_type == "Text" and not (input2_text or "").strip():
        return "Error: Input 2 (Text) is empty!"
    if input2_type == "Image" and input2_image is None:
        return "Error: Please upload a valid image for Input 2!"

    def _encode(kind, text, image):
        # Single modality-aware encoder replaces the four duplicated branches.
        if kind == "Image":
            return model.encode_image([Image.fromarray(image.astype("uint8"))])
        return model.encode_text([text])

    try:
        with torch.no_grad():
            emb1 = np.asarray(_encode(input1_type, input1_text, input1_image))
            emb2 = np.asarray(_encode(input2_type, input2_text, input2_image))

        # jina-clip-v1 embeddings are not L2-normalized (the model card divides
        # by the norms when computing cosine similarity), so a raw dot product
        # is unbounded. Normalize to get a true cosine score in [-1, 1].
        emb1 = emb1 / np.linalg.norm(emb1, axis=-1, keepdims=True)
        emb2 = emb2 / np.linalg.norm(emb2, axis=-1, keepdims=True)
        return (emb1 @ emb2.T).item()

    except Exception as e:
        return f"Error: {str(e)}"


# ---- Gradio UI ----
with gr.Blocks() as demo:
    gr.Markdown("# JinaAI CLIP Multimodal Similarity")
    gr.Markdown("Compare **Text-Text, Image-Image, or Text-Image** similarity.")

    with gr.Row():
        input1_type = gr.Radio(["Text", "Image"], label="Input 1 Type", value="Text")
        input2_type = gr.Radio(["Text", "Image"], label="Input 2 Type", value="Text")

    input1_text = gr.Textbox(label="Input 1 (Text)", visible=True)
    input1_image = gr.Image(type="numpy", label="Input 1 (Image)", visible=False)
    input2_text = gr.Textbox(label="Input 2 (Text)", visible=True)
    input2_image = gr.Image(type="numpy", label="Input 2 (Image)", visible=False)

    output = gr.Textbox(label="Similarity Score / Error", interactive=False)

    def update_visibility(input1_type, input2_type):
        """Toggle which input widgets are shown for the selected modalities.

        FIX: the original returned bare booleans, which Gradio treats as new
        component *values* (filling the textboxes with True/False) rather than
        visibility — gr.update(visible=...) is required to toggle visibility.
        """
        return (
            gr.update(visible=input1_type == "Text"),   # show text input 1
            gr.update(visible=input1_type == "Image"),  # show image input 1
            gr.update(visible=input2_type == "Text"),   # show text input 2
            gr.update(visible=input2_type == "Image"),  # show image input 2
        )

    input1_type.change(update_visibility,
                       inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])
    input2_type.change(update_visibility,
                       inputs=[input1_type, input2_type],
                       outputs=[input1_text, input1_image, input2_text, input2_image])

    compute_button = gr.Button("Compute Similarity")
    compute_button.click(compute_similarity,
                         inputs=[input1_type, input1_text, input1_image,
                                 input2_type, input2_text, input2_image],
                         outputs=output)

demo.launch()