wondervictor committed
Commit 20b299a · verified · 1 Parent(s): 41d006c

Update mask_adapter/sam_maskadapter.py

Files changed (1)
  1. mask_adapter/sam_maskadapter.py +6 -6
mask_adapter/sam_maskadapter.py CHANGED

@@ -131,7 +131,7 @@ class SAMVisualizationDemo(object):
         pred_masks = np.row_stack(pred_masks)
         pred_masks = BitMasks(pred_masks)
 
-        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+        image = torch.as_tensor(image.astype("float16").transpose(2, 0, 1))
 
         pixel_mean = torch.tensor(PIXEL_MEAN).view(-1, 1, 1)
         pixel_std = torch.tensor(PIXEL_STD).view(-1, 1, 1)
@@ -150,13 +150,13 @@ class SAMVisualizationDemo(object):
         # text_features = self.clip_model.encode_text(text)
         # text_features /= text_features.norm(dim=-1, keepdim=True)
 
-        features = self.extract_features_convnext(image.float())
+        features = self.extract_features_convnext(image)
 
         clip_feature = features['clip_vis_dense']
 
         clip_vis_dense = self.visual_prediction_forward_convnext_2d(clip_feature)
 
-        semantic_activation_maps = self.mask_adapter(clip_vis_dense, pred_masks.tensor.unsqueeze(0).float())
+        semantic_activation_maps = self.mask_adapter(clip_vis_dense, pred_masks.tensor.unsqueeze(0).half())
 
         maps_for_pooling = F.interpolate(semantic_activation_maps, size=clip_feature.shape[-2:],
                                          mode='bilinear', align_corners=False)
@@ -295,7 +295,7 @@ class SAMPointVisualizationDemo(object):
 
         pred_masks = BitMasks(masks)
 
-        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+        image = torch.as_tensor(image.astype("float16").transpose(2, 0, 1))
 
         pixel_mean = torch.tensor(PIXEL_MEAN).view(-1, 1, 1)
         pixel_std = torch.tensor(PIXEL_STD).view(-1, 1, 1)
@@ -311,12 +311,12 @@ class SAMPointVisualizationDemo(object):
         # text_features /= text_features.norm(dim=-1, keepdim=True)
         #np.save("/home/yongkangli/Mask-Adapter/text_embedding/lvis_coco_text_embedding.npy", text_features.cpu().numpy())
         text_features = self.text_embedding
-        features = self.extract_features_convnext(image.float())
+        features = self.extract_features_convnext(image)
         clip_feature = features['clip_vis_dense']
 
         clip_vis_dense = self.visual_prediction_forward_convnext_2d(clip_feature)
 
-        semantic_activation_maps = self.mask_adapter(clip_vis_dense, pred_masks.tensor.unsqueeze(0).float())
+        semantic_activation_maps = self.mask_adapter(clip_vis_dense, pred_masks.tensor.unsqueeze(0).half())
         maps_for_pooling = F.interpolate(semantic_activation_maps, size=clip_feature.shape[-2:], mode='bilinear', align_corners=False)
 
         B, C = clip_feature.size(0), clip_feature.size(1)
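
Taken together, the six changed lines move both demo classes (SAMVisualizationDemo and SAMPointVisualizationDemo) to half precision: the input image is cast to float16 instead of float32, the explicit .float() upcast before extract_features_convnext is dropped, and the predicted masks are cast with .half() before entering the mask adapter. The sketch below restates that flow as a single function for illustration only. The method names (extract_features_convnext, visual_prediction_forward_convnext_2d, mask_adapter) and the PIXEL_MEAN / PIXEL_STD names come from the diff, while the fp16_forward wrapper, the normalization step, the placeholder mean/std values, and the added batch dimension are assumptions, not part of the commit.

# Illustrative sketch only: restates the fp16 flow from this diff as one function.
# The model object is assumed to expose the three methods named in the diff;
# the normalization step and the PIXEL_MEAN / PIXEL_STD values are assumptions.
import torch
import torch.nn.functional as F

PIXEL_MEAN = [122.7709, 116.7460, 104.0937]  # placeholder values, not from the commit
PIXEL_STD = [68.5005, 66.6322, 70.3232]      # placeholder values, not from the commit

def fp16_forward(model, image_hwc, mask_tensor):
    """image_hwc: HxWx3 numpy image; mask_tensor: NxHxW binary mask tensor."""
    # Cast to float16 and move channels first, as on the changed lines 134 / 298.
    image = torch.as_tensor(image_hwc.astype("float16").transpose(2, 0, 1))

    pixel_mean = torch.tensor(PIXEL_MEAN, dtype=image.dtype).view(-1, 1, 1)
    pixel_std = torch.tensor(PIXEL_STD, dtype=image.dtype).view(-1, 1, 1)
    image = (image - pixel_mean) / pixel_std  # normalization kept in fp16 (assumed)

    # The explicit .float() upcast is gone: the fp16 image feeds the backbone directly.
    # The batch dimension is added here for the sketch; the real code may handle it elsewhere.
    features = model.extract_features_convnext(image.unsqueeze(0))
    clip_feature = features['clip_vis_dense']
    clip_vis_dense = model.visual_prediction_forward_convnext_2d(clip_feature)

    # Masks are cast with .half() so they match the fp16 activations.
    semantic_activation_maps = model.mask_adapter(
        clip_vis_dense, mask_tensor.unsqueeze(0).half())

    # Resize the activation maps to the dense CLIP feature resolution, as in the diff.
    return F.interpolate(semantic_activation_maps,
                         size=clip_feature.shape[-2:],
                         mode='bilinear', align_corners=False)

Keeping the image, the dense CLIP features, and the masks all in float16 roughly halves activation memory and avoids fp16-to-fp32 round trips when the CLIP ConvNeXt backbone itself is loaded in half precision; that is the apparent motivation for the change, although the commit message does not state it.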