Spaces:

ktllc
/

Clip-Model

Runtime error

App Files Files Community

ktllc committed on Oct 16, 2023

Commit

a012f3a

1 Parent(s): ac1aaed

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -53

app.py CHANGED Viewed

@@ -5,75 +5,40 @@ import gradio as gr
 # Load the CLIP model
 model, preprocess = clip.load("ViT-B/32")
-device = "cuda" if torch.cuda.is_available() else "cpu"  # Check for GPU availability
 model.to(device).eval()
 # Define the Business Listing variable
 Business_Listing = "Air Guide"
-def find_similar_images(text_input):
-    # Directory where you want to load images
-    image_dir = "/content/sample_data/Tourism"
-    # Create an empty description dictionary
-    description = f"{Business_Listing} Logo"
-    # Set up the layout for displaying images
-    num_rows = 4
-    num_cols = 8
-    original_images = []
-    images = []
-    texts = []
-    # Load and preprocess images
-    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.ico', '.svg', '.eps', '.pdf']
-    for filename in [filename for filename in os.listdir(image_dir) if any(filename.endswith(ext) for ext in image_extensions)]:
-        # Get the image name (without extension)
-        image_name, _ = os.path.splitext(filename)
-        # Load the image
-        image = Image.open(os.path.join(image_dir, filename)).convert("RGB")
-        original_images.append(image)
-        images.append(preprocess(image))
-        texts.append(description)
-    # Prepare input text and images
-    image_input = torch.tensor(np.stack(images)).to(device)
-    text_tokens = clip.tokenize([f"This is {text_input}"])
-    text_tokens = text_tokens.to(device)
-    # Encode text and image features
     with torch.no_grad():
-        image_features = model.encode_image(image_input).float()
         text_features = model.encode_text(text_tokens).float()
     # Normalize features and calculate similarity
     image_features /= image_features.norm(dim=-1, keepdim=True)
     text_features /= text_features.norm(dim=-1, keepdim=True)
-    similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T
-    # Find the maximum similarity value
-    max_similarity_value = similarity[0, :].max()
-    # Find all indices with the maximum similarity value
-    max_similarity_indices = np.where(similarity[0, :] == max_similarity_value)
-    # Get the filenames with the highest similarity
-    valid_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.ico', '.svg', '.eps', '.pdf')
-    image_files = [filename for filename in os.listdir(image_dir) if filename.endswith(valid_extensions)]
-    filenames_with_highest_similarity = [image_files[i] for i in max_similarity_indices[0]]
-    return filenames_with_highest_similarity, max_similarity_value
 # Define a Gradio interface
 iface = gr.Interface(
-    fn=find_similar_images,
-    inputs="text",
-    outputs=["text", "number"],
     live=True,
     interpretation="default",
-    title="CLIP Model Image Search",
 )
 iface.launch()

 # Load the CLIP model
 model, preprocess = clip.load("ViT-B/32")
+device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device).eval()
 # Define the Business Listing variable
 Business_Listing = "Air Guide"
+def find_similarity(image, text_input):
+    # Preprocess the uploaded image
+    image = preprocess(image).unsqueeze(0).to(device)
+    # Prepare input text
+    text_tokens = clip.tokenize([text_input]).to(device)
+    # Encode image and text features
     with torch.no_grad():
+        image_features = model.encode_image(image).float()
         text_features = model.encode_text(text_tokens).float()
     # Normalize features and calculate similarity
     image_features /= image_features.norm(dim=-1, keepdim=True)
     text_features /= text_features.norm(dim=-1, keepdim=True)
+    similarity = (text_features @ image_features.T).cpu().numpy()
+    return similarity[0, 0]
 # Define a Gradio interface
 iface = gr.Interface(
+    fn=find_similarity,
+    inputs=[gr.Image(type="pil"), "text"],
+    outputs="number",
     live=True,
     interpretation="default",
+    title="CLIP Model Image-Text Cosine Similarity",
+    description="Upload an image and enter text to find their cosine similarity.",
 )
 iface.launch()