import gradio as gr
from PIL import Image
from transformers import AutoModel, AutoTokenizer, AutoProcessor
import torch

# jina-clip-v1's remote code depends on timm; install it at runtime if missing.
# (On Hugging Face Spaces, listing timm in requirements.txt is the cleaner fix.)
try:
    import timm  # noqa: F401
except ImportError:
    import subprocess
    import sys
    subprocess.run([sys.executable, "-m", "pip", "install", "timm"], check=True)

# Load Jina CLIP. The checkpoint registers a custom "jina_clip" architecture,
# so it must be loaded through the Auto classes with trust_remote_code=True;
# the stock transformers CLIPModel class rejects its config.
model_name = "jinaai/jina-clip-v1"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
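
# Note: per the jina-clip-v1 model card, the remote code also exposes
# convenience helpers that would replace the manual tokenizer/processor
# calls below, e.g. (a sketch, not used by this app):
#     text_emb = model.encode_text(["a photo of a cat"])
#     img_emb = model.encode_image(["cat.jpg"])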

def compute_similarity(image1, image2, text1, text2, type1, type2):
    # Encode input 1 from whichever widget matches its selected type
    if type1 == "Image":
        if image1 is None:
            return "Error: Please upload an image for Input 1"
        img = Image.open(image1).convert("RGB")
        input1_tensor = processor(images=img, return_tensors="pt")["pixel_values"]
    elif isinstance(text1, str) and text1.strip():
        input1_tensor = tokenizer(text1, return_tensors="pt")["input_ids"]
    else:
        return "Error: Invalid text input for Input 1"

    # Encode input 2 the same way
    if type2 == "Image":
        if image2 is None:
            return "Error: Please upload an image for Input 2"
        img = Image.open(image2).convert("RGB")
        input2_tensor = processor(images=img, return_tensors="pt")["pixel_values"]
    elif isinstance(text2, str) and text2.strip():
        input2_tensor = tokenizer(text2, return_tensors="pt")["input_ids"]
    else:
        return "Error: Invalid text input for Input 2"

    # Compute embeddings (assumes the remote-code model class implements the
    # standard CLIP get_image_features/get_text_features interface)
    with torch.no_grad():
        if type1 == "Image":
            embedding1 = model.get_image_features(pixel_values=input1_tensor)
        else:
            embedding1 = model.get_text_features(input_ids=input1_tensor)

        if type2 == "Image":
            embedding2 = model.get_image_features(pixel_values=input2_tensor)
        else:
            embedding2 = model.get_text_features(input_ids=input2_tensor)

    # Cosine similarity between the two (1, dim) embeddings
    similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2).item()
    return f"Similarity Score: {similarity:.4f}"

with gr.Blocks() as demo:
    gr.Markdown("# CLIP-based Similarity Comparison")
    
    with gr.Row():
        type1 = gr.Radio(["Image", "Text"], label="Input 1 Type", value="Image")
        type2 = gr.Radio(["Image", "Text"], label="Input 2 Type", value="Text")
    
    with gr.Row():
        # All four widgets stay visible; compute_similarity reads the image
        # or text field that matches each input's selected type.
        input1 = gr.Image(type="filepath", label="Upload Image 1")
        input2 = gr.Image(type="filepath", label="Upload Image 2")
        text1 = gr.Textbox(label="Enter Text 1")
        text2 = gr.Textbox(label="Enter Text 2")
    
    compare_btn = gr.Button("Compare")
    output = gr.Textbox(label="Similarity Score")
    
    compare_btn.click(
        compute_similarity,
        inputs=[input1, input2, text1, text2, type1, type2],
        outputs=output,
    )

demo.launch()