shilinxu
/

MoonViT

shilinxu committed on Jul 28, 2025

Commit

1b4543a

verified ·

1 Parent(s): f3d2bc2

Update modeling_moonvit.py

Files changed (1) hide show

modeling_moonvit.py CHANGED Viewed

@@ -180,7 +180,7 @@ class Learnable2DInterpPosEmb(nn.Module):
     def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
         pos_embs = []
-        for shape in grid_hws[:, 1:].tolist():
             if shape == self.weight.shape[:-1]:
                 pos_embs.append(self.weight.flatten(end_dim=1))
             else:
@@ -596,6 +596,8 @@ class MoonVitPretrainedModel(PreTrainedModel):
         Returns:
             torch.Tensor: The output tokens.
         """
         hidden_states = self.patch_embed(pixel_values, image_grid_hws)
         hidden_states = self.encoder(hidden_states, image_grid_hws)
         hidden_states = patch_merger(

     def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
         pos_embs = []
+        for shape in grid_hws.tolist():
             if shape == self.weight.shape[:-1]:
                 pos_embs.append(self.weight.flatten(end_dim=1))
             else:
         Returns:
             torch.Tensor: The output tokens.
         """
+        if image_grid_hws.shape[-1] == 3:
+            image_grid_hws = image_grid_hws[:, 1:]
         hidden_states = self.patch_embed(pixel_values, image_grid_hws)
         hidden_states = self.encoder(hidden_states, image_grid_hws)
         hidden_states = patch_merger(