Update mask_adapter/sam_maskadapter.py
mask_adapter/sam_maskadapter.py CHANGED
@@ -343,9 +343,6 @@ class SAMPointVisualizationDemo(object):
         alpha = 0.5
         cv2.addWeighted(mask_colored, alpha, overlay, 1 - alpha, 0, overlay)
 
-        # Draw boundary (contours) on the overlay
-        contours, _ = cv2.findContours(pred_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        cv2.drawContours(overlay, contours, -1, (255, 255, 255), 2)  # White boundary
 
         # Add label based on the class with the highest score
         max_scores, max_score_idx = class_preds.max(dim=1)  # Find the max score across the class predictions
@@ -358,4 +355,78 @@ class SAMPointVisualizationDemo(object):
         # Put text near the point
         cv2.putText(overlay, label, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
 
         return None, Image.fromarray(overlay)
+
+    def run_on_image_with_boxes(self, ori_image, bbox, text_features):
+        height, width, _ = ori_image.shape
+
+        image = ori_image
+        # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        # ori_image = cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB)
+
+        with torch.no_grad():
+            self.predictor.set_image(image)
+            masks, _, _ = self.predictor.predict(box=bbox[None, :], multimask_output=False)
+
+        pred_masks = BitMasks(masks)
+
+        image = torch.as_tensor(image.astype("float16").transpose(2, 0, 1))
+
+        pixel_mean = torch.tensor(PIXEL_MEAN).view(-1, 1, 1)
+        pixel_std = torch.tensor(PIXEL_STD).view(-1, 1, 1)
+
+        image = (image - pixel_mean) / pixel_std
+        image = image.unsqueeze(0)
+
+        # txts = [f'a photo of {cls_name}' for cls_name in self.class_names]
+        # text = open_clip.tokenize(txts)
+
+        with torch.no_grad():
+            # text_features = self.clip_model.encode_text(text.cuda())
+            # text_features /= text_features.norm(dim=-1, keepdim=True)
+            # np.save("/home/yongkangli/Mask-Adapter/text_embedding/lvis_coco_text_embedding.npy", text_features.cpu().numpy())
+            # text_features = self.text_embedding.to(self.mask_adapter.device)
+            features = self.extract_features_convnext(image.to(text_features).float())
+            clip_feature = features['clip_vis_dense']
+
+            clip_vis_dense = self.visual_prediction_forward_convnext_2d(clip_feature)
+
+            semantic_activation_maps = self.mask_adapter(clip_vis_dense, pred_masks.tensor.unsqueeze(0).to(text_features).float())
+            maps_for_pooling = F.interpolate(semantic_activation_maps, size=clip_feature.shape[-2:], mode='bilinear', align_corners=False)
+
+            B, C = clip_feature.size(0), clip_feature.size(1)
+            N = maps_for_pooling.size(1)
+            num_instances = N // 16
+            maps_for_pooling = F.softmax(F.logsigmoid(maps_for_pooling).view(B, N, -1), dim=-1)
+            pooled_clip_feature = torch.bmm(maps_for_pooling, clip_feature.view(B, C, -1).permute(0, 2, 1))
+            pooled_clip_feature = self.visual_prediction_forward_convnext(pooled_clip_feature)
+            pooled_clip_feature = (pooled_clip_feature.reshape(B, num_instances, 16, -1).mean(dim=-2).contiguous())
+
+            class_preds = (100.0 * pooled_clip_feature @ text_features.T).softmax(dim=-1)
+            class_preds = class_preds.squeeze(0)
+
+        # Resize mask to match original image size
+        pred_mask = cv2.resize(masks.squeeze(0), (width, height), interpolation=cv2.INTER_NEAREST)
+
+        # Create an overlay for the mask with a transparent background (alpha blending)
+        overlay = ori_image.copy()
+        mask_colored = np.zeros_like(ori_image)
+        mask_colored[pred_mask == 1] = [234, 103, 112]  # Mask color (BGR)
+
+        alpha = 0.5
+        cv2.addWeighted(mask_colored, alpha, overlay, 1 - alpha, 0, overlay)
+
+        # Add label based on the class with the highest score
+        max_scores, max_score_idx = class_preds.max(dim=1)  # Find the max score across the class predictions
+        label = f"{self.class_names[max_score_idx.item()]}: {max_scores.item():.2f}"
+
+        # Dynamically place the label near the box
+        text_x = min(width - 200, bbox[0] + 20)  # Add some offset from the box corner
+        text_y = min(height - 30, bbox[1] + 20)  # Ensure the text does not go out of bounds
+
+        # Put text near the label position
+        cv2.putText(overlay, label, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
+
+        return None, Image.fromarray(overlay)
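
For context, a minimal sketch of how the new run_on_image_with_boxes method might be driven, assuming an already constructed SAMPointVisualizationDemo instance, an OpenCV BGR image, an xyxy box, and precomputed CLIP text embeddings. The constructor arguments, file names, and variable names below are illustrative assumptions, not part of this commit:

import cv2
import numpy as np
import torch

# Assumed setup: the constructor argument and the embedding file path are
# placeholders, not defined by this diff.
demo = SAMPointVisualizationDemo(cfg)          # hypothetical config/weights handle
image = cv2.imread("example.jpg")              # H x W x 3 BGR uint8 image
bbox = np.array([50, 80, 400, 360])            # xyxy box in pixel coordinates
text_features = torch.from_numpy(
    np.load("text_embedding/lvis_coco_text_embedding.npy")
).cuda()                                       # precomputed CLIP text embeddings

# Segment the box with SAM, classify the mask via the mask adapter and
# pooled CLIP features, and receive a PIL image with the blended overlay.
_, overlay = demo.run_on_image_with_boxes(image, bbox, text_features)
overlay.save("box_overlay.png")

Unlike the point-based path, this variant prompts SAM with box=bbox[None, :] and multimask_output=False, so a single mask per box is produced, classified, and drawn with the label of the highest-scoring class.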