import subprocess
import sys

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor, AutoTokenizer

# jina-clip-v1's remote code depends on timm; install it if it is missing.
try:
    import timm  # noqa: F401
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "timm"], check=True)

# Load Jina CLIP. The checkpoint uses a custom architecture, so it must be
# loaded through the Auto classes with trust_remote_code=True; the stock
# CLIPModel class cannot load it.
model_name = "jinaai/jina-clip-v1"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model.eval()


def compute_similarity(input1, input2, text1, text2, type1, type2):
    """Embed two inputs (each an image or a text) and return their cosine similarity."""
    # Preprocess input 1
    if type1 == "Image":
        if not input1:
            return "Error: No image provided for Input 1"
        image1 = Image.open(input1).convert("RGB")
        input1_tensor = processor(images=image1, return_tensors="pt")["pixel_values"]
    elif type1 == "Text":
        if not text1.strip():
            return "Error: No text provided for Input 1"
        input1_tensor = tokenizer(text1, return_tensors="pt")["input_ids"]
    else:
        return "Error: Invalid input type for Input 1"

    # Preprocess input 2
    if type2 == "Image":
        if not input2:
            return "Error: No image provided for Input 2"
        image2 = Image.open(input2).convert("RGB")
        input2_tensor = processor(images=image2, return_tensors="pt")["pixel_values"]
    elif type2 == "Text":
        if not text2.strip():
            return "Error: No text provided for Input 2"
        input2_tensor = tokenizer(text2, return_tensors="pt")["input_ids"]
    else:
        return "Error: Invalid input type for Input 2"

    # Compute embeddings
    with torch.no_grad():
        if type1 == "Image":
            embedding1 = model.get_image_features(pixel_values=input1_tensor)
        else:
            embedding1 = model.get_text_features(input_ids=input1_tensor)
        if type2 == "Image":
            embedding2 = model.get_image_features(pixel_values=input2_tensor)
        else:
            embedding2 = model.get_text_features(input_ids=input2_tensor)

    # Normalize, then take the cosine similarity of the two unit vectors
    embedding1 = embedding1 / embedding1.norm(dim=-1, keepdim=True)
    embedding2 = embedding2 / embedding2.norm(dim=-1, keepdim=True)
    similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2).item()
    return f"Similarity Score: {similarity:.4f}"


with gr.Blocks() as demo:
    gr.Markdown("# CLIP-based Similarity Comparison")
    with gr.Row():
        type1 = gr.Radio(["Image", "Text"], label="Input 1 Type", value="Image")
        type2 = gr.Radio(["Image", "Text"], label="Input 2 Type", value="Text")
    with gr.Row():
        input1 = gr.Image(type="filepath", label="Upload Image 1")
        input2 = gr.Image(type="filepath", label="Upload Image 2")
        text1 = gr.Textbox(label="Enter Text 1")
        text2 = gr.Textbox(label="Enter Text 2")
    compare_btn = gr.Button("Compare")
    output = gr.Textbox(label="Similarity Score")
    compare_btn.click(
        compute_similarity,
        inputs=[input1, input2, text1, text2, type1, type2],
        outputs=output,
    )

demo.launch()
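
# --- Optional: direct encoding sketch (not wired into the app) ---------------
# Per the jina-clip-v1 model card, the remote code also exposes encode_text()
# and encode_image() convenience methods that accept raw strings / image paths
# and return embedding arrays. A minimal sketch, assuming that API and a
# hypothetical local file "cat.jpg"; uncomment to try after stopping the app:
#
# text_emb = model.encode_text(["a photo of a cat"])
# image_emb = model.encode_image(["cat.jpg"])
# print(text_emb @ image_emb.T)  # dot product; the model card uses this as similarity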