Spaces:

curt-park
/

segment-anything-with-clip

Runtime error

App Files Files Community

curt-park committed on Apr 6, 2023

Commit

f17c02a

1 Parent(s): 4ddb621

Add clip query

Browse files

Files changed (2) hide show

ViT-B-32.pt +3 -0
app.py +63 -14

ViT-B-32.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af
+size 353976522

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import os
 from functools import lru_cache
 from random import randint
-from typing import Dict, List
 import cv2
 import gradio as gr
 import numpy as np
@@ -13,17 +14,27 @@ from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
 CHECKPOINT_PATH = "sam_vit_h_4b8939.pth"
 MODEL_TYPE = "default"
 MAX_WIDTH = MAX_HEIGHT = 800
 THRESHOLD = 0.05
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 @lru_cache
-def load_mask_generator(model_size: str = "large") -> SamAutomaticMaskGenerator:
     sam = sam_model_registry[MODEL_TYPE](checkpoint=CHECKPOINT_PATH).to(device)
     mask_generator = SamAutomaticMaskGenerator(sam)
     return mask_generator
 def adjust_image_size(image: np.ndarray) -> np.ndarray:
     height, width = image.shape[:2]
     if height > width:
@@ -36,23 +47,56 @@ def adjust_image_size(image: np.ndarray) -> np.ndarray:
     return image
 def filter_masks(
-    masks: List[Dict[str, np.ndarray]],
     predicted_iou_threshold: float,
     stability_score_threshold: float,
     query: str,
     clip_threshold: float,
-) -> List[np.ndarray]:
-    filtered_masks: List[Dict[str, np.ndarray]] = []
     for mask in masks:
         if (
             mask["predicted_iou"] < predicted_iou_threshold
             or mask["stability_score"] < stability_score_threshold
         ):
             continue
         filtered_masks.append(mask)
-    return [mask["segmentation"] for mask in filtered_masks]
 def draw_masks(
@@ -62,7 +106,7 @@ def draw_masks(
         color = [randint(127, 255) for _ in range(3)]
         # draw mask overlay
-        colored_mask = np.expand_dims(mask, 0).repeat(3, axis=0)
         colored_mask = np.moveaxis(colored_mask, 0, -1)
         masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=color)
         image_overlay = masked.filled()
@@ -70,7 +114,7 @@ def draw_masks(
         # draw contour
         contours, _ = cv2.findContours(
-            np.uint8(mask), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
         cv2.drawContours(image, contours, -1, (255, 0, 0), 2)
     return image
@@ -88,7 +132,12 @@ def segment(
     image = adjust_image_size(cv2.imread(image_path))
     masks = mask_generator.generate(image)
     masks = filter_masks(
-        masks, predicted_iou_threshold, stability_score_threshold, query, clip_threshold
     )
     image = draw_masks(image, masks)
     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -112,16 +161,16 @@ demo = gr.Interface(
         [
             0.9,
             0.8,
-            0.05,
             os.path.join(os.path.dirname(__file__), "examples/dog.jpg"),
-            "",
         ],
         [
             0.9,
             0.8,
-            0.05,
             os.path.join(os.path.dirname(__file__), "examples/city.jpg"),
-            "",
         ],
         [
             0.9,
@@ -135,7 +184,7 @@ demo = gr.Interface(
             0.8,
             0.05,
             os.path.join(os.path.dirname(__file__), "examples/horse.jpg"),
-            "",
         ],
     ],
 )

 import os
 from functools import lru_cache
 from random import randint
+from typing import Any, Callable, Dict, List, Tuple
+import clip
 import cv2
 import gradio as gr
 import numpy as np
 CHECKPOINT_PATH = "sam_vit_h_4b8939.pth"
 MODEL_TYPE = "default"
 MAX_WIDTH = MAX_HEIGHT = 800
+CLIP_WIDTH = CLIP_HEIGHT = 300
 THRESHOLD = 0.05
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 @lru_cache
+def load_mask_generator() -> SamAutomaticMaskGenerator:
     sam = sam_model_registry[MODEL_TYPE](checkpoint=CHECKPOINT_PATH).to(device)
     mask_generator = SamAutomaticMaskGenerator(sam)
     return mask_generator
@lru_cache
def load_clip(
    name: str = "ViT-B-32.pt",
) -> Tuple[torch.nn.Module, Callable[[PIL.Image.Image], torch.Tensor]]:
    """Load a CLIP checkpoint stored next to the working directory.

    Args:
        name: File name of the checkpoint, resolved relative to ``"."``.

    Returns:
        The ``(model, preprocess)`` pair returned by ``clip.load``, with the
        model moved to the module-level ``device``. ``lru_cache`` memoizes the
        pair per ``name``.
    """
    clip_model, preprocess = clip.load(os.path.join(".", name), device=device)
    clip_model = clip_model.to(device)
    return clip_model, preprocess
 def adjust_image_size(image: np.ndarray) -> np.ndarray:
     height, width = image.shape[:2]
     if height > width:
     return image
@torch.no_grad()
def get_scores(crops: List[PIL.Image.Image], query: str) -> torch.Tensor:
    """Score every crop against a text query with CLIP.

    Args:
        crops: Images to compare with the query; each one is run through the
            preprocess callable returned by ``load_clip``.
        query: Text that is tokenized and encoded by the CLIP text encoder.

    Returns:
        A 1-D tensor with one entry per crop: the softmax, taken across the
        crops, of 100 times the cosine similarity between the crop embedding
        and the query embedding.
    """
    clip_model, to_tensor = load_clip()

    batch = torch.stack([to_tensor(crop) for crop in crops]).to(device)
    text_tokens = clip.tokenize(query).to(device)

    image_emb = clip_model.encode_image(batch)
    text_emb = clip_model.encode_text(text_tokens)

    # Scale both embeddings to unit length so the dot product below is a
    # cosine similarity.
    image_emb /= image_emb.norm(dim=-1, keepdim=True)
    text_emb /= text_emb.norm(dim=-1, keepdim=True)

    logits = 100.0 * image_emb @ text_emb.T
    # Only one query is encoded, so column 0 holds every crop's logit.
    return logits[:, 0].softmax(dim=0)
 def filter_masks(
+    image: np.ndarray,
+    masks: List[Dict[str, Any]],
     predicted_iou_threshold: float,
     stability_score_threshold: float,
     query: str,
     clip_threshold: float,
+) -> List[Dict[str, Any]]:
+    cropped_masks: List[PIL.Image.Image] = []
+    filtered_masks: List[Dict[str, Any]] = []
     for mask in masks:
         if (
             mask["predicted_iou"] < predicted_iou_threshold
             or mask["stability_score"] < stability_score_threshold
         ):
             continue
         filtered_masks.append(mask)
+        x, y, w, h = mask["bbox"]
+        crop = image[y : y + h, x : x + w]
+        crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
+        crop = PIL.Image.fromarray(np.uint8(crop * 255)).convert("RGB")
+        crop.resize((CLIP_WIDTH, CLIP_HEIGHT))
+        cropped_masks.append(crop)
+    if query and filtered_masks:
+        scores = get_scores(cropped_masks, query)
+        filtered_masks = [
+            filtered_masks[i]
+            for i, score in enumerate(scores)
+            if score > clip_threshold
+        ]
+    return filtered_masks
 def draw_masks(
         color = [randint(127, 255) for _ in range(3)]
         # draw mask overlay
+        colored_mask = np.expand_dims(mask["segmentation"], 0).repeat(3, axis=0)
         colored_mask = np.moveaxis(colored_mask, 0, -1)
         masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=color)
         image_overlay = masked.filled()
         # draw contour
         contours, _ = cv2.findContours(
+            np.uint8(mask["segmentation"]), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
         cv2.drawContours(image, contours, -1, (255, 0, 0), 2)
     return image
     image = adjust_image_size(cv2.imread(image_path))
     masks = mask_generator.generate(image)
     masks = filter_masks(
+        image,
+        masks,
+        predicted_iou_threshold,
+        stability_score_threshold,
+        query,
+        clip_threshold,
     )
     image = draw_masks(image, masks)
     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         [
             0.9,
             0.8,
+            0.15,
             os.path.join(os.path.dirname(__file__), "examples/dog.jpg"),
+            "A dog only",
         ],
         [
             0.9,
             0.8,
+            0.1,
             os.path.join(os.path.dirname(__file__), "examples/city.jpg"),
+            "A bridge on the water",
         ],
         [
             0.9,
             0.8,
             0.05,
             os.path.join(os.path.dirname(__file__), "examples/horse.jpg"),
+            "horse",
         ],
     ],
 )