Spaces:

roll-ai
/

EPiC

Paused

App Files Files Community

Muhammad Taqi Raza committed on Jul 5

Commit

0cc03a7

1 Parent(s): 0d2f841

print shapes

Browse files

Files changed (1) hide show

inference/cli_demo_camera_i2v_pcd.py +15 -6

inference/cli_demo_camera_i2v_pcd.py CHANGED Viewed

@@ -75,15 +75,20 @@ def maxpool_mask_tensor(mask_tensor):
     """
     T, H, W = mask_tensor.shape
     assert T % 12 == 0, "T must be divisible by 12 (e.g., 48)"
-    assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
     # Reshape to (B=T, C=1, H, W) for 2D spatial pooling
     x = mask_tensor.unsqueeze(1).float()  # (T, 1, H, W)
-    x_pooled = F.max_pool2d(x, kernel_size=(H // 30, W // 45))  # → (T, 1, 30, 45)
     # Temporal pooling: reshape to (12, T//12, 30, 45) and max along dim=1
     t_groups = T // 12
-    x_pooled = x_pooled.view(12, t_groups, 30, 45)
     pooled_mask = torch.amax(x_pooled, dim=1)  # → (12, 30, 45)
     # Add a zero frame at the beginning: shape (1, 30, 45)
@@ -105,15 +110,19 @@ def avgpool_mask_tensor(mask_tensor):
     """
     T, H, W = mask_tensor.shape
     assert T % 12 == 0, "T must be divisible by 12 (e.g., 48)"
-    assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
     # Spatial average pooling
     x = mask_tensor.unsqueeze(1).float()  # (T, 1, H, W)
-    x_pooled = F.avg_pool2d(x, kernel_size=(H // 30, W // 45))  # → (T, 1, 30, 45)
     # Temporal pooling
     t_groups = T // 12
-    x_pooled = x_pooled.view(12, t_groups, 30, 45)
     pooled_avg = torch.mean(x_pooled, dim=1)  # → (12, 30, 45)
     # Threshold: keep only when > 0.5

     """
     T, H, W = mask_tensor.shape
     assert T % 12 == 0, "T must be divisible by 12 (e.g., 48)"
+    # assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
+    assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
+    downsampling_factor_h = H // 8
+    downsampling_factor_w = W // 8
     # Reshape to (B=T, C=1, H, W) for 2D spatial pooling
     x = mask_tensor.unsqueeze(1).float()  # (T, 1, H, W)
+    x_pooled = F.max_pool2d(x, kernel_size=(H // downsampling_factor_h, W // downsampling_factor_w))  # → (T, 1, 30, 45)
     # Temporal pooling: reshape to (12, T//12, H//8, W//8) and max along dim=1
     t_groups = T // 12
+    x_pooled = x_pooled.view(12, t_groups, downsampling_factor_h, downsampling_factor_w)
     pooled_mask = torch.amax(x_pooled, dim=1)  # → (12, 30, 45)
     # Add a zero frame at the beginning: shape (1, 30, 45)
     """
     T, H, W = mask_tensor.shape
     assert T % 12 == 0, "T must be divisible by 12 (e.g., 48)"
+    # assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
+    assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
+    downsampling_factor_h = H // 8
+    downsampling_factor_w = W // 8
     # Spatial average pooling
     x = mask_tensor.unsqueeze(1).float()  # (T, 1, H, W)
+    x_pooled = F.avg_pool2d(x, kernel_size=(H // downsampling_factor_h, W // downsampling_factor_w))  # → (T, 1, 30, 45)
     # Temporal pooling
     t_groups = T // 12
+    x_pooled = x_pooled.view(12, t_groups, downsampling_factor_h, downsampling_factor_w)
     pooled_avg = torch.mean(x_pooled, dim=1)  # → (12, 30, 45)
     # Threshold: keep only when > 0.5