Upload folder using huggingface_hub
- action_tokenizer.py +5 -20
- config.json +1 -2
- configuration_spatialvla.py +13 -64
- modeling_gemma2.py +2 -4
- modeling_spatialvla.py +121 -366
- processing_spatialvla.py +33 -218
- test_huggingface.py +2 -7
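The upload bundles the custom configuration, modeling, and processing code next to config.json, so the checkpoint is meant to be loaded through transformers' remote-code path. Below is a minimal loading sketch; the repo id is a placeholder, and whether `AutoModel` or a more specific Auto class applies depends on the `auto_map` entries in config.json, which are not shown in this diff.

    # Hedged sketch: load a checkpoint that bundles custom *_spatialvla.py files.
    # "your-org/spatialvla-checkpoint" is a placeholder, not a real repo id.
    from transformers import AutoConfig, AutoModel

    config = AutoConfig.from_pretrained("your-org/spatialvla-checkpoint", trust_remote_code=True)
    model = AutoModel.from_pretrained("your-org/spatialvla-checkpoint", trust_remote_code=True)
    print(type(model).__name__)  # should resolve to the SpatialVLA class registered for remote code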
action_tokenizer.py
CHANGED
@@ -1,27 +1,16 @@
-# MIT License
-# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
-# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
-# coding=utf-8
-
 """
 action_tokenizer.py

 Extension class; wraps base LLM/VLM tokenizer with logic to discretize and tokenize continuous robot actions.
 """
-from typing import List, Union, Dict,
+from typing import List, Union, Dict, Optional
 import numpy as np
 from transformers import PreTrainedTokenizerBase
-from pathlib import Path
-import json
 from scipy.stats import norm
 import torch

 ACTION_TOKEN = '<ACTION{:05d}>'

-"""Spatial Tokenizer"""
 class ActionTokenizer:
     def __init__(
         self,
@@ -67,7 +56,6 @@ class ActionTokenizer:
     def vocab_size(self) -> int:
         return self._vocab_size

-"""Spatial Tokenizer"""
 class TranslationTokenizer:
     def __init__(
         self,
@@ -258,7 +246,7 @@ class GripperTokenzier:
     def vocab_size(self) -> int:
         return self.num_bins

-class SphericalCoordinateActionTokenizer:
+class SpatialActionTokenizer:
     range_bins = {
         "translation": {
             "theta_bins": (0.0, np.pi),
@@ -282,7 +270,7 @@ class SphericalCoordinateActionTokenizer:
         min_action: float = -1.0,
         max_action: float = 1.0,
     ):
-        """set bin_policy if exist, otherwise, caculate bin_policy from gs_params
+        """set bin_policy if exist, otherwise, caculate bin_policy from gs_params or use uniform bin grids.
         gs_params: Optional[Dict],
         bin_policy: Optional[Dict],
         """
@@ -293,7 +281,6 @@ class SphericalCoordinateActionTokenizer:

         # set bin policy
         self.bin_policy = bin_policy if bin_policy else self.get_bin_policy(gs_params, self.min_sigma)
-
         self.translation_tokenizer = TranslationTokenizer(
             self.tokenizer,
             self.num_bins["translation"],
@@ -406,13 +393,11 @@ class SphericalCoordinateActionTokenizer:
             embeddings: tensor (S,E)
         """
         from scipy.interpolate import griddata
-        # __import__("ipdb").set_trace()
-
         new_policy = self.get_bin_policy(gs_params, min_sigma=min_sigma)
         trans_grids0, rot_grids0 = self.get_norm_meshgrid(self.bin_policy)
         trans_grids1, rot_grids1 = self.get_norm_meshgrid(new_policy)

-        print("
+        print("overwrite bin policy and tokenizer bins ...")
         self.bin_policy = new_policy
         self.min_sigma = min_sigma
         self.translation_tokenizer.set_bins(new_policy["translation"])
@@ -442,5 +427,5 @@ class SphericalCoordinateActionTokenizer:
         device, dtype = embeddings.weight.data.device, embeddings.weight.data.dtype
         embeddings.weight.data[:N] = torch.Tensor(adpt_trans_emb.reshape(-1, E), device=device).to(dtype)
         embeddings.weight.data[N:N+M] = torch.Tensor(adpt_rot_emb.reshape(-1, E), device=device).to(dtype)
-        print("
+        print("DONE! adapt spatial embedding to new gaussian distributation finished.")
         print(embeddings.weight.data)
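For orientation, the tokenizer wraps a base tokenizer and maps continuous actions onto dedicated `<ACTION{:05d}>` tokens. The sketch below shows only the generic uniform-binning idea behind such a mapping; it is not the `SpatialActionTokenizer` from this commit, whose bin policy is derived from Gaussian statistics in spherical coordinates.

    # Illustrative uniform-bin action discretization. Assumption: this is NOT the
    # repo's spherical/Gaussian bin policy; only the token-mapping mechanism is shown.
    import numpy as np

    ACTION_TOKEN = '<ACTION{:05d}>'

    def encode_actions(actions, num_bins=256, min_action=-1.0, max_action=1.0):
        actions = np.clip(actions, min_action, max_action)
        edges = np.linspace(min_action, max_action, num_bins + 1)
        ids = np.digitize(actions, edges[1:-1])  # bin index in [0, num_bins)
        return [ACTION_TOKEN.format(i) for i in ids]

    def decode_ids(ids, num_bins=256, min_action=-1.0, max_action=1.0):
        edges = np.linspace(min_action, max_action, num_bins + 1)
        centers = (edges[:-1] + edges[1:]) / 2
        return centers[np.asarray(ids)]

    print(encode_actions(np.array([-1.0, 0.0, 0.73])))
    # ['<ACTION00000>', '<ACTION00128>', '<ACTION00221>']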
config.json
CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "../pretrained/2025-01-05_09-12-37_oxe_spatial_vla_paligemma3b_zoe_gsN8194_gpu64-204k",
   "_vocab_size": 265347,
   "action_token_begin_idx": 257153,
   "architectures": [
@@ -317,4 +316,4 @@
     "use_bias_in_fusion_residual": null,
     "use_pretrained_backbone": false
   }
-}
+}
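The only substantive change here is dropping the machine-local `_name_or_path`; the remaining keys are untouched. As a quick sanity check (a sketch that assumes a local copy of config.json), the token bookkeeping recorded above can be read back directly:

    # Sketch: read back the token bookkeeping recorded in config.json.
    import json

    with open("config.json") as f:
        cfg = json.load(f)

    begin = cfg["action_token_begin_idx"]  # 257153 per the diff above
    vocab = cfg["_vocab_size"]             # 265347 per the diff above
    # 265347 - 257153 = 8194 ids sit above the action-token start index, which appears
    # to correspond to the extra spatial/action embeddings handled in modeling_spatialvla.py.
    print(vocab - begin)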
configuration_spatialvla.py
CHANGED
@@ -1,12 +1,16 @@
-# MIT License
-# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
-# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
-# Based on code licensed under the Apache License, Version 2.0 by Google Inc. and HuggingFace Inc. team (Copyright 2024).
 # coding=utf-8
-
+# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """PaliGemmamodel configuration"""

 import warnings
@@ -15,59 +19,9 @@ from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 from transformers import CONFIG_MAPPING, AutoConfig

-
 logger = logging.get_logger(__name__)

-
 class SpatialVLAConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`PaliGemmaForConditionalGeneration`]. It is used to instantiate an
-    PaliGemmamodel according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the PaliGemma-2B.
-
-    e.g. [paligemma-hf/paligemma-2b](https://huggingface.co/paligemma-hf/paligemma-2b)
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        vision_config (`PaliGemmaVisionConfig`, *optional*):
-            Custom vision config or dict
-        text_config (`Union[AutoConfig, dict]`, *optional*):
-            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
-        ignore_index (`int`, *optional*, defaults to -100):
-            The ignore index for the loss function.
-        image_token_index (`int`, *optional*, defaults to 256000):
-            The image token index to encode the image prompt.
-        vocab_size (`int`, *optional*, defaults to 257152):
-            Vocabulary size of the PaliGemmamodel. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`~PaliGemmaForConditionalGeneration`]
-        projection_dim (`int`, *optional*, defaults to 2048):
-            Dimension of the multimodal projection space.
-        hidden_size (`int`, *optional*, defaults to 2048):
-            Dimension of the hidden layer of the Language model.
-
-    Example:
-
-    ```python
-    >>> from transformers import PaliGemmaForConditionalGeneration, PaliGemmaConfig, SiglipVisionConfig, GemmaConfig
-
-    >>> # Initializing a Siglip-like vision config
-    >>> vision_config = SiglipVisionConfig()
-
-    >>> # Initializing a PaliGemma config
-    >>> text_config = GemmaConfig()
-
-    >>> # Initializing a PaliGemma paligemma-3b-224 style configuration
-    >>> configuration = PaliGemmaConfig(vision_config, text_config)
-
-    >>> # Initializing a model from the paligemma-3b-224 style configuration
-    >>> model = PaliGemmaForConditionalGeneration(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
     model_type = "spatialvla"
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig}

@@ -87,7 +41,6 @@ class SpatialVLAConfig(PretrainedConfig):
         ego3d_patch_reso=4,
         n_freqs=8,
         use_vision_zoe=True,
-        # wrap_lora=False,
         **kwargs,
     ):
         self._ignore_index = ignore_index
@@ -138,19 +91,15 @@ class SpatialVLAConfig(PretrainedConfig):
             vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth"
             self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config)
         else:
-            print(f"🔥 init from default configurations ... {self.vision_zoe_config}")
-            # BUG: initializing zoe in default cause key error
-            # self.vision_zoe_config = CONFIG_MAPPING["zoedepth"]()
             pass

-        #
+        # additional attributes
         self.action_token_begin_idx = action_token_begin_idx
         self.spatial_token_num = spatial_token_num
         self.use_spatial_token = use_spatial_token
         self.ego3d_patch_reso = ego3d_patch_reso
         self.n_freqs = n_freqs
         self.use_vision_zoe = use_vision_zoe
-        # self.wrap_lora = wrap_lora

         super().__init__(**kwargs)

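The constructor keeps the ego3d/spatial keyword arguments while the commented-out `wrap_lora` flag and the verbose default-init print are dropped. A construction sketch follows, under the assumption that the unchanged parts of `__init__` accept dict sub-configs as in PaliGemma-style configs; the sub-config values and token counts below are illustrative only.

    # Sketch: instantiate the config with dict sub-configs (values are illustrative).
    from configuration_spatialvla import SpatialVLAConfig

    config = SpatialVLAConfig(
        vision_config={"model_type": "siglip_vision_model"},
        text_config={"model_type": "gemma2"},
        vision_zoe_config={"model_type": "zoedepth"},
        use_vision_zoe=True,
        use_spatial_token=True,
        spatial_token_num=8194,
        action_token_begin_idx=257153,
        ego3d_patch_reso=4,
        n_freqs=8,
    )
    print(config.model_type, config.n_freqs)  # spatialvla 8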
modeling_gemma2.py
CHANGED
@@ -1,4 +1,5 @@
-# custom gemma2 to support flash_attention_2
+# custom gemma2 to support flash_attention_2,
+# source from https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/gemma2/modeling_gemma2.py
 # coding=utf-8
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
@@ -205,10 +206,7 @@ def flash_attention_forward(
 ) -> Tuple[torch.Tensor, None]:
     # NOTE: None mask cause un defined https://github.com/huggingface/transformers/blob/c8c8dffbe45ebef0a8dba4a51024e5e5e498596b/src/transformers/models/gemma2/modeling_gemma2.py#L211
     seq_len = query.shape[2]
-    # print(f"🔥 query {query.shape}, key {key.shape}, value: {value.shape}")
     if mask is not None:
-        # print(f"🔥 mask {mask.shape}")
-        # seq_len = mask.shape[1]
         query = query[:, :, :seq_len]
         value = value[:, :, :seq_len]

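Apart from the header comment, the functional part of this hunk is unchanged: when a mask is passed, `flash_attention_forward` still trims `query` and `value` to `seq_len = query.shape[2]` before dispatching. A toy shape check of that trim (standalone, not the Gemma2 forward itself):

    # Toy check of the query/value trimming kept in flash_attention_forward.
    import torch

    query = torch.randn(1, 8, 16, 64)   # (batch, heads, q_len, head_dim)
    value = torch.randn(1, 8, 20, 64)   # e.g. padded to a longer static-cache length
    mask = torch.ones(1, 16)

    seq_len = query.shape[2]
    if mask is not None:
        query = query[:, :, :seq_len]
        value = value[:, :, :seq_len]
    print(query.shape, value.shape)     # both end at seq_len=16 along dim 2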
modeling_spatialvla.py
CHANGED
@@ -1,153 +1,118 @@
|
|
1 |
-
# MIT License
|
2 |
-
# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
|
3 |
-
# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
|
4 |
-
# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
|
5 |
-
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6 |
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
|
7 |
-
# Based on code licensed under the Apache License, Version 2.0 by Google Inc. and HuggingFace Inc. team (Copyright 2024).
|
8 |
# coding=utf-8
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
"""PyTorch PaliGemmamodel."""
|
11 |
|
12 |
from dataclasses import dataclass
|
13 |
from typing import List, Optional, Tuple, Union
|
14 |
|
|
|
15 |
import torch
|
16 |
import torch.utils.checkpoint
|
17 |
from torch import nn
|
18 |
from torch.linalg import inv
|
19 |
-
import torchvision.transforms.functional as
|
20 |
-
|
21 |
-
import os
|
22 |
from transformers.cache_utils import Cache, HybridCache, StaticCache
|
23 |
from transformers.generation import GenerationMixin
|
24 |
from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
|
25 |
from transformers.utils import (
|
26 |
ModelOutput,
|
27 |
-
add_start_docstrings,
|
28 |
-
add_start_docstrings_to_model_forward,
|
29 |
-
is_flash_attn_2_available,
|
30 |
logging,
|
31 |
-
replace_return_docstrings,
|
32 |
)
|
33 |
from .configuration_spatialvla import SpatialVLAConfig
|
34 |
-
from .modeling_ego3d import Ego3DPositionEmbeddingMLP, process_zoe
|
35 |
from .modeling_gemma2 import Gemma2ForCausalLM
|
|
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
from transformers import AutoModel, AutoModelForCausalLM, ZoeDepthForDepthEstimation
|
41 |
-
|
42 |
|
43 |
logger = logging.get_logger(__name__)
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
#
|
48 |
-
SIGLIP_MEAN, SIGLIP_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
|
49 |
-
|
50 |
-
# Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
|
51 |
-
# But Paligemma has no causal mask on prefix
|
52 |
-
def _prepare_4d_causal_attention_mask_with_cache_position(
|
53 |
-
attention_mask: torch.Tensor,
|
54 |
-
sequence_length: int,
|
55 |
-
target_length: int,
|
56 |
-
dtype: torch.dtype,
|
57 |
-
device: torch.device,
|
58 |
-
min_dtype: float,
|
59 |
-
cache_position: torch.Tensor,
|
60 |
-
batch_size: int,
|
61 |
-
is_training: bool = False,
|
62 |
-
token_type_ids: torch.Tensor = None,
|
63 |
-
**kwargs,
|
64 |
-
):
|
65 |
"""
|
66 |
-
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
67 |
-
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
68 |
-
|
69 |
-
Args:
|
70 |
-
attention_mask (`torch.Tensor`):
|
71 |
-
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
|
72 |
-
sequence_length (`int`):
|
73 |
-
The sequence length being processed.
|
74 |
-
target_length (`int`):
|
75 |
-
The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
|
76 |
-
dtype (`torch.dtype`):
|
77 |
-
The dtype to use for the 4D attention mask.
|
78 |
-
device (`torch.device`):
|
79 |
-
The device to plcae the 4D attention mask on.
|
80 |
-
min_dtype (`float`):
|
81 |
-
The minimum value representable with the dtype `dtype`.
|
82 |
-
cache_position (`torch.Tensor`):
|
83 |
-
Indices depicting the position of the input sequence tokens in the sequence.
|
84 |
-
batch_size (`torch.Tensor`):
|
85 |
-
Batch size.
|
86 |
-
is_training (`bool`):
|
87 |
-
Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels`
|
88 |
-
"""
|
89 |
-
if attention_mask is not None and attention_mask.dim() == 4:
|
90 |
-
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
91 |
-
causal_mask = attention_mask
|
92 |
-
else:
|
93 |
-
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
|
94 |
-
# Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
|
95 |
-
if sequence_length != 1:
|
96 |
-
if is_training:
|
97 |
-
causal_mask = torch.triu(causal_mask, diagonal=1)
|
98 |
-
else:
|
99 |
-
causal_mask[:, :sequence_length] = 0.0
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
@dataclass
|
120 |
class SpatialVLACausalLMOutputWithPast(ModelOutput):
|
121 |
-
"""
|
122 |
-
Base class for PaliGemmacausal language model (or autoregressive) outputs.
|
123 |
-
|
124 |
-
Args:
|
125 |
-
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
126 |
-
Language modeling loss (for next-token prediction).
|
127 |
-
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
|
128 |
-
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
129 |
-
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
130 |
-
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
131 |
-
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
132 |
-
|
133 |
-
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
134 |
-
`past_key_values` input) to speed up sequential decoding.
|
135 |
-
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
136 |
-
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
137 |
-
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
138 |
-
|
139 |
-
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
140 |
-
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
141 |
-
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
142 |
-
sequence_length)`.
|
143 |
-
|
144 |
-
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
145 |
-
heads.
|
146 |
-
image_hidden_states (`torch.FloatTensor`, *optional*):
|
147 |
-
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
148 |
-
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
149 |
-
"""
|
150 |
-
|
151 |
loss: Optional[torch.FloatTensor] = None
|
152 |
logits: torch.FloatTensor = None
|
153 |
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
|
@@ -155,7 +120,6 @@ class SpatialVLACausalLMOutputWithPast(ModelOutput):
|
|
155 |
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
156 |
image_hidden_states: Optional[torch.FloatTensor] = None
|
157 |
|
158 |
-
|
159 |
class SpatialVLAMultiModalProjector(nn.Module):
|
160 |
def __init__(self, config: SpatialVLAConfig):
|
161 |
super().__init__()
|
@@ -163,31 +127,8 @@ class SpatialVLAMultiModalProjector(nn.Module):
|
|
163 |
|
164 |
def forward(self, image_features):
|
165 |
hidden_states = self.linear(image_features)
|
166 |
-
|
167 |
return hidden_states
|
168 |
|
169 |
-
|
170 |
-
PALIGEMMA_START_DOCSTRING = r"""
|
171 |
-
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
172 |
-
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
173 |
-
etc.)
|
174 |
-
|
175 |
-
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
176 |
-
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
177 |
-
and behavior.
|
178 |
-
|
179 |
-
Parameters:
|
180 |
-
config ([`PaliGemmaConfig`] or [`PaliGemmaVisionConfig`]):
|
181 |
-
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
182 |
-
load the weights associated with the model, only the configuration. Check out the
|
183 |
-
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
184 |
-
"""
|
185 |
-
|
186 |
-
|
187 |
-
@add_start_docstrings(
|
188 |
-
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
|
189 |
-
PALIGEMMA_START_DOCSTRING,
|
190 |
-
)
|
191 |
class SpatialVLAPreTrainedModel(PreTrainedModel):
|
192 |
config_class = SpatialVLAConfig
|
193 |
base_model_prefix = "model"
|
@@ -202,8 +143,6 @@ class SpatialVLAPreTrainedModel(PreTrainedModel):
|
|
202 |
_supports_sdpa = True
|
203 |
|
204 |
def _init_weights(self, module):
|
205 |
-
# important: this ported version of PaliGemmaisn't meant for training from scratch - only
|
206 |
-
# inference and fine-tuning
|
207 |
std = (
|
208 |
self.config.initializer_range
|
209 |
if hasattr(self.config, "initializer_range")
|
@@ -222,99 +161,20 @@ class SpatialVLAPreTrainedModel(PreTrainedModel):
|
|
222 |
if module.padding_idx is not None:
|
223 |
module.weight.data[module.padding_idx].zero_()
|
224 |
|
225 |
-
|
226 |
-
PALIGEMMA_INPUTS_DOCSTRING = r"""
|
227 |
-
Args:
|
228 |
-
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
229 |
-
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
230 |
-
it.
|
231 |
-
|
232 |
-
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
233 |
-
[`PreTrainedTokenizer.__call__`] for details.
|
234 |
-
|
235 |
-
[What are input IDs?](../glossary#input-ids)
|
236 |
-
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
|
237 |
-
The tensors corresponding to the input images. Pixel values can be obtained using
|
238 |
-
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([]`PaliGemmaProcessor`] uses
|
239 |
-
[`SiglipImageProcessor`] for processing images).
|
240 |
-
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
241 |
-
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
242 |
-
|
243 |
-
- 1 for tokens that are **not masked**,
|
244 |
-
- 0 for tokens that are **masked**.
|
245 |
-
|
246 |
-
[What are attention masks?](../glossary#attention-mask)
|
247 |
-
|
248 |
-
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
249 |
-
[`PreTrainedTokenizer.__call__`] for details.
|
250 |
-
|
251 |
-
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
|
252 |
-
`past_key_values`).
|
253 |
-
|
254 |
-
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
255 |
-
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
|
256 |
-
information on the default strategy.
|
257 |
-
|
258 |
-
- 1 indicates the head is **not masked**,
|
259 |
-
- 0 indicates the head is **masked**.
|
260 |
-
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
261 |
-
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
262 |
-
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
|
263 |
-
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
264 |
-
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
265 |
-
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
266 |
-
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
267 |
-
|
268 |
-
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
269 |
-
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
270 |
-
|
271 |
-
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
272 |
-
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
273 |
-
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
274 |
-
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
275 |
-
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
276 |
-
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
277 |
-
model's internal embedding lookup matrix.
|
278 |
-
use_cache (`bool`, *optional*):
|
279 |
-
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
280 |
-
`past_key_values`).
|
281 |
-
output_attentions (`bool`, *optional*):
|
282 |
-
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
283 |
-
tensors for more detail.
|
284 |
-
output_hidden_states (`bool`, *optional*):
|
285 |
-
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
286 |
-
more detail.
|
287 |
-
return_dict (`bool`, *optional*):
|
288 |
-
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
289 |
-
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
290 |
-
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
291 |
-
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
292 |
-
the complete sequence length.
|
293 |
-
"""
|
294 |
-
|
295 |
-
|
296 |
-
@add_start_docstrings(
|
297 |
-
"""The PALIGEMMA model which consists of a vision backbone and a language model.""",
|
298 |
-
PALIGEMMA_START_DOCSTRING,
|
299 |
-
)
|
300 |
class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin):
|
301 |
def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None):
|
302 |
super().__init__(config)
|
303 |
-
|
304 |
self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config)
|
305 |
-
# projector
|
306 |
self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config)
|
307 |
-
# language model
|
308 |
self.vocab_size = config.text_config.vocab_size
|
309 |
if language_model is None:
|
310 |
-
language_model = Gemma2ForCausalLM(config=config.text_config)
|
311 |
-
# set tile key
|
312 |
if language_model._tied_weights_keys is not None:
|
313 |
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
|
314 |
self.language_model = language_model
|
315 |
|
316 |
if config.use_vision_zoe:
|
317 |
-
# zoe model
|
318 |
self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config)
|
319 |
self.position_embedding_3d = Ego3DPositionEmbeddingMLP(
|
320 |
config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs
|
@@ -326,15 +186,12 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
326 |
uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1) # (3 hw)
|
327 |
self.register_buffer("uv_h", uv_h, persistent=False)
|
328 |
|
329 |
-
#
|
330 |
if config.use_spatial_token:
|
331 |
self.spatial_embed_tokens = nn.Embedding(self.config.spatial_token_num, config.text_config.hidden_size)
|
332 |
else:
|
333 |
self.spatial_embed_tokens = None
|
334 |
-
|
335 |
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
336 |
-
# self.post_init() # BUG: cause from_pretrained failed!
|
337 |
-
# self.position_embedding_3d._reset_parameters()
|
338 |
|
339 |
|
340 |
def backproject_patch(self, K: torch.Tensor, depth: torch.Tensor, patch_size=14, reso=2) -> torch.Tensor:
|
@@ -343,44 +200,48 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
343 |
Args:
|
344 |
K: camera intrinsic matrix (b 3 3)
|
345 |
depth: depth map (b 1 h w)
|
346 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
"""
|
348 |
-
# __import__("ipdb").set_trace()
|
349 |
b, c, h, w = depth.shape
|
350 |
hp, wp = h // patch_size, w // patch_size
|
351 |
sub_hp = sub_wp = reso
|
352 |
-
patch_depth =
|
353 |
-
|
354 |
-
# import torchvision; torchvision.utils.save_image(zoe_pixel_values[0], "zoe_image.png")
|
355 |
p_cam = (inv(K.float()) @ self.uv_h.float()) * patch_depth # (b 3 3) @ (3 hw) -> (b 3 hw) * (b 1 hw) -> (b 3 hw)
|
356 |
patch_p_cam = p_cam.reshape(b, 3, hp, sub_hp, wp, sub_wp).permute(0, 2, 4, 3, 5, 1).reshape(b, hp * wp, -1)
|
357 |
return patch_p_cam
|
358 |
|
359 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings with Llava->PaliGemma
|
360 |
def get_input_embeddings(self):
|
361 |
return self.language_model.get_input_embeddings()
|
362 |
|
363 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings with Llava->PaliGemma
|
364 |
def set_input_embeddings(self, value):
|
365 |
self.language_model.set_input_embeddings(value)
|
366 |
|
367 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings with Llava->PaliGemma
|
368 |
def get_output_embeddings(self):
|
369 |
return self.language_model.get_output_embeddings()
|
370 |
|
371 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings with Llava->PaliGemma
|
372 |
def set_output_embeddings(self, new_embeddings):
|
373 |
self.language_model.set_output_embeddings(new_embeddings)
|
374 |
|
375 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder with Llava->PaliGemma
|
376 |
def set_decoder(self, decoder):
|
377 |
self.language_model.set_decoder(decoder)
|
378 |
|
379 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder with Llava->PaliGemma
|
380 |
def get_decoder(self):
|
381 |
return self.language_model.get_decoder()
|
382 |
|
383 |
-
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights with Llava->PaliGemma
|
384 |
def tie_weights(self):
|
385 |
return self.language_model.tie_weights()
|
386 |
|
@@ -390,11 +251,7 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
390 |
pad_to_multiple_of: Optional[int] = None,
|
391 |
mean_resizing: bool = True,
|
392 |
) -> nn.Embedding:
|
393 |
-
# TODO: is_deepspeed_zero3_enabled gather
|
394 |
-
print(f"resize token embeddings from {self.language_model.get_output_embeddings().weight.shape} to (*,{new_num_tokens})")
|
395 |
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
|
396 |
-
|
397 |
-
# update base model and current model config
|
398 |
vocab_size = model_embeds.weight.shape[0]
|
399 |
self.config.text_config.vocab_size = self.vocab_size = self.config._vocab_size = vocab_size
|
400 |
self.tie_weights()
|
@@ -431,18 +288,12 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
431 |
)
|
432 |
|
433 |
if attention_mask is not None and attention_mask.dim() == 4:
|
434 |
-
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
435 |
return attention_mask
|
436 |
|
437 |
-
causal_mask = torch.full(
|
438 |
-
(sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device
|
439 |
-
)
|
440 |
-
# Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
|
441 |
if sequence_length != 1:
|
442 |
-
if is_training:
|
443 |
-
|
444 |
-
else:
|
445 |
-
causal_mask[:, :sequence_length] = 0.0
|
446 |
|
447 |
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
448 |
causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
|
@@ -451,29 +302,13 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
451 |
mask_length = attention_mask.shape[-1]
|
452 |
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
|
453 |
padding_mask = padding_mask == 0
|
454 |
-
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
455 |
-
padding_mask, min_dtype
|
456 |
-
)
|
457 |
-
# we are training thus we need to create a full mask on the image + prefix but causal on suffix
|
458 |
if is_training:
|
459 |
-
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
460 |
-
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
|
461 |
-
)
|
462 |
return causal_mask
|
463 |
|
464 |
def get_image_features(self, pixel_values: torch.FloatTensor, intrinsic: torch.FloatTensor):
|
465 |
-
|
466 |
-
Obtains image last hidden states from the vision tower and apply multimodal projection.
|
467 |
-
|
468 |
-
Args:
|
469 |
-
pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
|
470 |
-
The tensors corresponding to the input images.
|
471 |
-
Returns:
|
472 |
-
image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
|
473 |
-
"""
|
474 |
-
# mintrinsic = intrinsic.reshape(-1, 3, 3)
|
475 |
-
# siglip vision tower
|
476 |
-
siglip_pixel_values = F.normalize(pixel_values, mean=SIGLIP_MEAN, std=SIGLIP_STD)
|
477 |
image_outputs = self.vision_tower(siglip_pixel_values)
|
478 |
|
479 |
# ego3d position encoding
|
@@ -482,13 +317,12 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
482 |
with torch.no_grad():
|
483 |
pvh, pvw = pixel_values.shape[-2:]
|
484 |
depth = self.vision_zoe_model(pixel_values=zoe_pixel_values).predicted_depth
|
485 |
-
depth =
|
486 |
depth.unsqueeze(1),
|
487 |
size=(pvh+2*ph, pvw+2*pw),
|
488 |
mode="bicubic",
|
489 |
align_corners=True,
|
490 |
)[..., ph:-ph, pw:-pw]
|
491 |
-
# depth = torch.clamp(depth, 0., 4.0) # NOTE: we find that depth w/o clamp performs better
|
492 |
xyz = self.backproject_patch(
|
493 |
intrinsic, depth, patch_size=self.config.vision_config.patch_size, reso=self.config.ego3d_patch_reso
|
494 |
) # (b, n, 3*4)
|
@@ -500,8 +334,6 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
500 |
image_features = image_features / (self.config.text_config.hidden_size**0.5)
|
501 |
return image_features
|
502 |
|
503 |
-
@add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
|
504 |
-
@replace_return_docstrings(output_type=SpatialVLACausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
505 |
def forward(
|
506 |
self,
|
507 |
input_ids: torch.LongTensor = None,
|
@@ -521,93 +353,29 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
521 |
return_dict: Optional[bool] = None,
|
522 |
num_logits_to_keep: int = 0,
|
523 |
) -> Union[Tuple, SpatialVLACausalLMOutputWithPast]:
|
524 |
-
r"""
|
525 |
-
Args:
|
526 |
-
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
527 |
-
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
528 |
-
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
529 |
-
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
|
530 |
-
|
531 |
-
num_logits_to_keep (`int`, *optional*):
|
532 |
-
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
|
533 |
-
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
|
534 |
-
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
|
535 |
-
|
536 |
-
Returns:
|
537 |
-
|
538 |
-
Example:
|
539 |
-
|
540 |
-
```python
|
541 |
-
>>> from PIL import Image
|
542 |
-
>>> import requests
|
543 |
-
>>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
|
544 |
-
|
545 |
-
>>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf")
|
546 |
-
>>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf")
|
547 |
-
|
548 |
-
>>> prompt = "answer en Where is the cow standing?"
|
549 |
-
>>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
|
550 |
-
>>> image = Image.open(requests.get(url, stream=True).raw)
|
551 |
-
|
552 |
-
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
|
553 |
-
|
554 |
-
>>> # Generate
|
555 |
-
>>> generate_ids = model.generate(**inputs, max_length=30)
|
556 |
-
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
557 |
-
"answer en Where is the cow standing?\nbeach"
|
558 |
-
```"""
|
559 |
-
# print(f"**************************************\n \
|
560 |
-
# input_ids {input_ids} \n \
|
561 |
-
# labels {labels} \n \
|
562 |
-
# token_type_ids {token_type_ids} \n \
|
563 |
-
# attention_mask {attention_mask} \n \
|
564 |
-
# actions {actions} \n \
|
565 |
-
# **************************************"
|
566 |
-
# )
|
567 |
-
# print(f"model.language_model.config._attn_implementation {self.language_model.config._attn_implementation} model.config.vision_config._attn_implementation_internal {self.config.vision_config._attn_implementation_internal} \n \
|
568 |
-
# model.vision_tower.config._attn_implementation {self.vision_tower.config._attn_implementation} model.config.vision_config._attn_implementation_internal {self.config.vision_config._attn_implementation_internal}")
|
569 |
-
# __import__("ipdb").set_trace()
|
570 |
-
if (input_ids is None) ^ (inputs_embeds is not None):
|
571 |
-
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
572 |
-
|
573 |
-
if pixel_values is not None and inputs_embeds is not None:
|
574 |
-
raise ValueError(
|
575 |
-
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
|
576 |
-
)
|
577 |
|
578 |
-
output_attentions = output_attentions
|
579 |
-
output_hidden_states =
|
580 |
-
|
581 |
-
)
|
582 |
-
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
583 |
|
584 |
is_training = token_type_ids is not None and labels is not None
|
585 |
|
586 |
-
if inputs_embeds is None:
|
587 |
-
|
588 |
-
|
589 |
-
# NOTE: replace the fixed embeddings with trainable spatial embeddings
|
590 |
-
# BUG: LoRA causes inputs_embeds requires_grad = True
|
591 |
-
# peft: https://github.com/huggingface/peft/blob/ec92cdcc41fe1b141bfe1e0da69b38a7e601cc80/src/peft/peft_model.py#L687
|
592 |
-
# hf: https://github.com/huggingface/transformers/blob/05260a1fc1c8571a2b421ce72b680d5f1bc3e5a4/src/transformers/modeling_utils.py#L2545
|
593 |
-
# lora w/ prompt: https://discuss.huggingface.co/t/combine-between-lora-and-prompt-tunning/65151
|
594 |
if self.config.use_spatial_token:
|
595 |
spatial_selected = (input_ids >= self.config.action_token_begin_idx) & (input_ids < self.config.action_token_begin_idx + self.config.spatial_token_num)
|
596 |
inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)
|
597 |
|
598 |
if cache_position is None:
|
599 |
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
600 |
-
cache_position = torch.arange(
|
601 |
-
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
|
602 |
-
)
|
603 |
|
604 |
if position_ids is None:
|
605 |
position_ids = cache_position.unsqueeze(0) + 1 # Paligemma positions are 1-indexed
|
606 |
|
607 |
-
#
|
608 |
if pixel_values is not None:
|
609 |
image_features = self.get_image_features(pixel_values, intrinsic)
|
610 |
-
|
611 |
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
|
612 |
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
|
613 |
if inputs_embeds[special_image_mask].numel() != image_features.numel():
|
@@ -647,20 +415,16 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
647 |
logits = outputs.logits
|
648 |
loss = None
|
649 |
if labels is not None:
|
650 |
-
# Upcast to float if we need to compute the loss to avoid potential precision issues
|
651 |
logits = logits.float()
|
652 |
shift_logits = logits[..., :-1, :]
|
653 |
shift_labels = labels[..., 1:]
|
654 |
if attention_mask is not None:
|
655 |
-
# we use the input attention mask to shift the logits and labels, because it is 2D.
|
656 |
-
# we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
|
657 |
shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
|
658 |
shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
|
659 |
shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
|
660 |
else:
|
661 |
shift_logits = shift_logits.contiguous()
|
662 |
shift_labels = shift_labels.contiguous()
|
663 |
-
# Flatten the tokens
|
664 |
loss_fct = nn.CrossEntropyLoss()
|
665 |
|
666 |
flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
|
@@ -679,6 +443,7 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
679 |
image_hidden_states=image_features if pixel_values is not None else None,
|
680 |
)
|
681 |
|
|
|
682 |
def prepare_inputs_for_generation(
|
683 |
self,
|
684 |
input_ids,
|
@@ -695,7 +460,6 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
695 |
labels=None,
|
696 |
**kwargs,
|
697 |
):
|
698 |
-
# Overwritten -- custom `position_ids` and `pixel_values` handling
|
699 |
model_inputs = self.language_model.prepare_inputs_for_generation(
|
700 |
input_ids,
|
701 |
past_key_values=past_key_values,
|
@@ -708,19 +472,13 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
708 |
token_type_ids=token_type_ids,
|
709 |
**kwargs,
|
710 |
)
|
711 |
-
|
712 |
-
# position_ids in Paligemma are 1-indexed
|
713 |
if model_inputs.get("position_ids") is not None:
|
714 |
model_inputs["position_ids"] += 1
|
715 |
-
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
|
716 |
-
# Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
|
717 |
if cache_position[0] == 0:
|
718 |
model_inputs["pixel_values"] = pixel_values
|
719 |
is_training = token_type_ids is not None and labels is not None
|
720 |
if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
|
721 |
-
causal_mask = self._update_causal_mask(
|
722 |
-
attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
|
723 |
-
)
|
724 |
model_inputs["attention_mask"] = causal_mask
|
725 |
model_inputs["intrinsic"] = intrinsic
|
726 |
return model_inputs
|
@@ -765,9 +523,6 @@ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMi
|
|
765 |
weights_only=weights_only,
|
766 |
**kwargs,
|
767 |
)
|
768 |
-
# NOTE: tie the weights of the embed_tokens with lm head (donot work if un_tie_weight)
|
769 |
-
# model.language_model.tie_weights()
|
770 |
-
# NOTE: tie the data of spatial_embed_tokens with embed_tokens (BUG: forweight sync issue in training)
|
771 |
if model.config.use_spatial_token:
|
772 |
model.language_model.model.embed_tokens.weight.data[-model.config.spatial_token_num:] = model.spatial_embed_tokens.weight.data
|
773 |
return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# coding=utf-8
|
2 |
+
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
"""PyTorch PaliGemmamodel."""
|
16 |
|
17 |
from dataclasses import dataclass
|
18 |
from typing import List, Optional, Tuple, Union
|
19 |
|
20 |
+
import os
|
21 |
import torch
|
22 |
import torch.utils.checkpoint
|
23 |
from torch import nn
|
24 |
from torch.linalg import inv
|
25 |
+
import torchvision.transforms.functional as TF
|
26 |
+
import torch.nn.functional as F
|
|
|
27 |
from transformers.cache_utils import Cache, HybridCache, StaticCache
|
28 |
from transformers.generation import GenerationMixin
|
29 |
from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
|
30 |
from transformers.utils import (
|
31 |
ModelOutput,
|
|
|
|
|
|
|
32 |
logging,
|
|
|
33 |
)
|
34 |
from .configuration_spatialvla import SpatialVLAConfig
|
|
|
35 |
from .modeling_gemma2 import Gemma2ForCausalLM
|
36 |
+
from transformers import AutoModel, ZoeDepthForDepthEstimation
|
37 |
|
38 |
+
SIGLIP_MEAN, SIGLIP_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
|
39 |
+
ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
|
|
|
|
|
|
|
40 |
|
41 |
logger = logging.get_logger(__name__)
|
42 |
|
43 |
+
class Ego3DPositionEmbeddingMLP(nn.Module):
|
44 |
+
"""Absolute pos embedding, learned.
|
45 |
+
https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
+
def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
|
49 |
+
super(Ego3DPositionEmbeddingMLP, self).__init__()
|
50 |
+
self.n_freqs = n_freqs
|
51 |
+
self.freq_out_channels = in_channels * (2 * n_freqs + 1)
|
52 |
+
if logscale:
|
53 |
+
freq_bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs)
|
54 |
+
else:
|
55 |
+
freq_bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)
|
56 |
+
|
57 |
+
center = torch.tensor([0., 0., 2.]).repeat(in_channels // 3)
|
58 |
+
self.register_buffer("freq_bands", freq_bands, persistent=False)
|
59 |
+
self.register_buffer("center", center, persistent=False)
|
60 |
+
|
61 |
+
self.position_embedding_head = nn.Sequential(
|
62 |
+
nn.Linear(self.freq_out_channels, num_pos_feats),
|
63 |
+
nn.LayerNorm(num_pos_feats),
|
64 |
+
nn.ReLU(),
|
65 |
+
nn.Linear(num_pos_feats, num_pos_feats),
|
66 |
+
)
|
67 |
+
self._reset_parameters()
|
68 |
|
69 |
+
def _reset_parameters(self):
|
70 |
+
"""init with small weights to maintain stable training."""
|
71 |
+
for p in self.parameters():
|
72 |
+
if p.dim() > 1:
|
73 |
+
nn.init.xavier_uniform_(p, gain=0.01)
|
74 |
+
|
75 |
+
@torch.no_grad()
|
76 |
+
def frequency_encoding(self, xyz):
|
77 |
+
"""
|
78 |
+
Embeds x to (x, sin(2^k x), cos(2^k x), ...)
|
79 |
+
Different from the paper, "x" is also in the output
|
80 |
+
See https://github.com/bmild/nerf/issues/12
|
81 |
+
x \in [-2, 2]
|
82 |
+
y \in [-2, 2]
|
83 |
+
z \in [0., 4]
|
84 |
+
Inputs:
|
85 |
+
x: (b n m)
|
86 |
+
Outputs:
|
87 |
+
out: (b n o)
|
88 |
+
"""
|
89 |
+
xyz_n = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
|
90 |
+
xyz_feq = xyz_n.unsqueeze(-1) * self.freq_bands # (b n m 1)
|
91 |
+
sin_xyz, cos_xyz = torch.sin(xyz_feq), torch.cos(xyz_feq) # (b n m nf)
|
92 |
+
encoding = torch.cat([xyz_n.unsqueeze(-1), sin_xyz, cos_xyz], -1).reshape(*xyz.shape[:2], -1)
|
93 |
+
return encoding
|
94 |
+
|
95 |
+
def forward(self, xyz):
|
96 |
+
"""Forward pass, xyz is (B, N, 3or6), output (B, N, F)."""
|
97 |
+
freq_encoding = self.frequency_encoding(xyz)
|
98 |
+
position_embedding = self.position_embedding_head(freq_encoding)
|
99 |
+
return position_embedding
|
100 |
+
|
101 |
+
def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
|
102 |
+
"""https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py"""
|
103 |
+
# h, w = images.shape[-2:]
|
104 |
+
# pad
|
105 |
+
ph, pw = 31, 31 # int((h / 2)**0.5 * 3), int((w / 2)**0.5 * 3) # 32, 31
|
106 |
+
images = F.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)
|
107 |
+
# resize
|
108 |
+
size = (384, 384) # get_resize_output_image_size
|
109 |
+
images = F.interpolate(images, size=size, mode="bicubic", align_corners=True)
|
110 |
+
# zoe: padding -> resize -> nomalize. we follow `nomalize -> padding -> resize` from siglip
|
111 |
+
images = TF.normalize(images, mean=ZOE_MEAN, std=ZOE_STD)
|
112 |
+
return images, ph, pw
|
113 |
|
114 |
@dataclass
|
115 |
class SpatialVLACausalLMOutputWithPast(ModelOutput):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
loss: Optional[torch.FloatTensor] = None
|
117 |
logits: torch.FloatTensor = None
|
118 |
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
|
|
|
120 |
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
121 |
image_hidden_states: Optional[torch.FloatTensor] = None
|
122 |
|
|
|
123 |
class SpatialVLAMultiModalProjector(nn.Module):
|
124 |
def __init__(self, config: SpatialVLAConfig):
|
125 |
super().__init__()
|
|
|
127 |
|
128 |
def forward(self, image_features):
|
129 |
hidden_states = self.linear(image_features)
|
|
|
130 |
return hidden_states
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
class SpatialVLAPreTrainedModel(PreTrainedModel):
|
133 |
config_class = SpatialVLAConfig
|
134 |
base_model_prefix = "model"
|
|
|
143 |
_supports_sdpa = True
|
144 |
|
145 |
def _init_weights(self, module):
|
|
|
|
|
146 |
std = (
|
147 |
self.config.initializer_range
|
148 |
if hasattr(self.config, "initializer_range")
|
|
|
161 |
if module.padding_idx is not None:
|
162 |
module.weight.data[module.padding_idx].zero_()

class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin):
    def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None):
        super().__init__(config)
+
        self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config)
        self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config)
        self.vocab_size = config.text_config.vocab_size
        if language_model is None:
+            language_model = Gemma2ForCausalLM(config=config.text_config)
        if language_model._tied_weights_keys is not None:
            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
        self.language_model = language_model

        if config.use_vision_zoe:
            self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config)
            self.position_embedding_3d = Ego3DPositionEmbeddingMLP(
                config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs

            uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1)  # (3 hw)
            self.register_buffer("uv_h", uv_h, persistent=False)

+        # shared spatial embeddings for <ACTION> <IMG>
        if config.use_spatial_token:
            self.spatial_embed_tokens = nn.Embedding(self.config.spatial_token_num, config.text_config.hidden_size)
        else:
            self.spatial_embed_tokens = None
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1

    def backproject_patch(self, K: torch.Tensor, depth: torch.Tensor, patch_size=14, reso=2) -> torch.Tensor:
        """
        Args:
            K: camera intrinsic matrix (b 3 3)
            depth: depth map (b 1 h w)
+            patch_size: patch size for siglip
+            reso: reso^2 -> sample points in each patch
+            patch sz = 14 ......
+            ┌────────┬────────┐
+            │  ─  ─  │  ─  ─  │
+            │ points │        ├─ ─ ─
+            │  ─  ─  │  ─  ─  │
+            ├────────┼────────┤
+            │  ─  ─  │  ─  ─  │
+            │        │        │
+            │  ─  ─  │  ─  ─  │
+            └────────┴────────┘
+            reso=2───►points=4
+                │
+                │
        """
        b, c, h, w = depth.shape
        hp, wp = h // patch_size, w // patch_size
        sub_hp = sub_wp = reso
+        patch_depth = F.interpolate(depth, size=(hp * reso, wp * reso), mode="area").reshape(b, c, -1)
        p_cam = (inv(K.float()) @ self.uv_h.float()) * patch_depth  # (b 3 3) @ (3 hw) -> (b 3 hw) * (b 1 hw) -> (b 3 hw)
        patch_p_cam = p_cam.reshape(b, 3, hp, sub_hp, wp, sub_wp).permute(0, 2, 4, 3, 5, 1).reshape(b, hp * wp, -1)
        return patch_p_cam
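A standalone sketch of the same pinhole back-projection, with made-up intrinsics and a constant dummy depth (shapes follow the docstring above):

# Minimal pinhole back-projection sketch: lift per-patch depths to 3-D camera-frame
# points with p_cam = K^-1 @ [u, v, 1]^T * depth. Shapes mirror backproject_patch above.
import torch

b, h, w, patch_size, reso = 1, 224, 224, 14, 2
hp, wp = h // patch_size, w // patch_size                      # 16 x 16 patches
K = torch.tensor([[[250.0, 0.0, 112.0],
                   [0.0, 250.0, 112.0],
                   [0.0, 0.0, 1.0]]])                          # assumed intrinsics (b 3 3)
# homogeneous pixel coordinates at the sub-patch sample positions: (3, hp*reso*wp*reso)
ys, xs = torch.meshgrid(
    torch.linspace(0, h - 1, hp * reso), torch.linspace(0, w - 1, wp * reso), indexing="ij"
)
uv_h = torch.stack([xs, ys, torch.ones_like(xs)], dim=0).reshape(3, -1)
depth = torch.full((b, 1, hp * reso * wp * reso), 0.5)         # dummy 0.5 m everywhere
p_cam = (torch.linalg.inv(K) @ uv_h) * depth                   # (b, 3, n) camera-frame points
print(p_cam.shape)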

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def tie_weights(self):
        return self.language_model.tie_weights()

        pad_to_multiple_of: Optional[int] = None,
        mean_resizing: bool = True,
    ) -> nn.Embedding:
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        vocab_size = model_embeds.weight.shape[0]
        self.config.text_config.vocab_size = self.vocab_size = self.config._vocab_size = vocab_size
        self.tie_weights()

        )

        if attention_mask is not None and attention_mask.dim() == 4:
            return attention_mask

+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device)
        if sequence_length != 1:
+            if is_training: causal_mask = torch.triu(causal_mask, diagonal=1)
+            else: causal_mask[:, :sequence_length] = 0.0

        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)

            mask_length = attention_mask.shape[-1]
            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
            if is_training:
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0)
        return causal_mask
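During training this produces a PaliGemma-style prefix-LM mask: prefix tokens (token_type_ids == 0, including image tokens) attend bidirectionally while suffix tokens stay causal. A toy illustration with assumed shapes:

# Toy prefix-LM mask: positions with token_type_ids == 0 (prefix) are fully visible,
# the rest stay causal. Mirrors the triu + masked_fill logic above on a 1x6 example.
import torch

seq_len, min_dtype = 6, torch.finfo(torch.float32).min
token_type_ids = torch.tensor([[0, 0, 0, 0, 1, 1]])                # 4 prefix + 2 suffix tokens
mask = torch.full((seq_len, seq_len), min_dtype)
mask = torch.triu(mask, diagonal=1)                                # causal part
mask = mask[None, None]                                            # (b, 1, q, k)
mask = mask.masked_fill(token_type_ids[:, None, None, :] == 0, 0)  # open up the prefix columns
print((mask == 0).int()[0, 0])                                     # 1 = attendable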

    def get_image_features(self, pixel_values: torch.FloatTensor, intrinsic: torch.FloatTensor):
+        siglip_pixel_values = TF.normalize(pixel_values, mean=SIGLIP_MEAN, std=SIGLIP_STD)
        image_outputs = self.vision_tower(siglip_pixel_values)

        # ego3d position encoding
        with torch.no_grad():
            pvh, pvw = pixel_values.shape[-2:]
            depth = self.vision_zoe_model(pixel_values=zoe_pixel_values).predicted_depth
+            depth = F.interpolate(
                depth.unsqueeze(1),
                size=(pvh + 2 * ph, pvw + 2 * pw),
                mode="bicubic",
                align_corners=True,
            )[..., ph:-ph, pw:-pw]
        xyz = self.backproject_patch(
            intrinsic, depth, patch_size=self.config.vision_config.patch_size, reso=self.config.ego3d_patch_reso
        )  # (b, n, 3*4)

        image_features = image_features / (self.config.text_config.hidden_size**0.5)
        return image_features
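The collapsed lines between the back-projection and the final scaling turn the 3-D patch coordinates into position embeddings and fold them into the SigLIP features. A rough sketch of that pattern, assumed from position_embedding_3d and multi_modal_projector defined above rather than copied from the repo:

# Hypothetical merge step (an assumption, not the verbatim code): encode per-patch
# 3-D points and add them to the vision features before projecting to the LM width.
# xyz: (b, n, reso**2 * 3) from backproject_patch; vision_feats: (b, n, vision_hidden)
def merge_ego3d(vision_feats, xyz, position_embedding_3d, multi_modal_projector):
    pos_embed_3d = position_embedding_3d(xyz)          # (b, n, vision_hidden)
    feats = vision_feats + pos_embed_3d                # inject metric 3-D structure
    return multi_modal_projector(feats)                # (b, n, text_hidden)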

    def forward(
        self,
        input_ids: torch.LongTensor = None,

        return_dict: Optional[bool] = None,
        num_logits_to_keep: int = 0,
    ) -> Union[Tuple, SpatialVLACausalLMOutputWithPast]:

+        output_attentions = output_attentions or self.config.output_attentions
+        output_hidden_states = output_hidden_states or self.config.output_hidden_states
+        return_dict = return_dict or self.config.use_return_dict

        is_training = token_type_ids is not None and labels is not None

+        if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids).clone()  # clone to avoid grad issues with checkpointing
+
        if self.config.use_spatial_token:
            spatial_selected = (input_ids >= self.config.action_token_begin_idx) & (input_ids < self.config.action_token_begin_idx + self.config.spatial_token_num)
            inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)
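A toy version of this action-token override: ids inside the reserved <ACTION> range are re-embedded from the dedicated spatial table instead of the LM embedding matrix (tiny made-up sizes):

# Toy spatial-token override: rows of a dedicated embedding table replace the LM
# embeddings for ids inside the action-token range.
import torch
import torch.nn as nn

vocab, hidden, action_begin, num_action = 100, 8, 90, 10        # assumed tiny sizes
embed_tokens = nn.Embedding(vocab, hidden)                       # stand-in LM embeddings
spatial_embed_tokens = nn.Embedding(num_action, hidden)          # shared spatial table

input_ids = torch.tensor([[5, 91, 92, 7]])
inputs_embeds = embed_tokens(input_ids).clone()
sel = (input_ids >= action_begin) & (input_ids < action_begin + num_action)
inputs_embeds[sel] = spatial_embed_tokens(input_ids[sel] - action_begin)
print(sel)  # tensor([[False,  True,  True, False]])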

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0) + 1  # Paligemma positions are 1-indexed

+        # merge
        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values, intrinsic)
            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if inputs_embeds[special_image_mask].numel() != image_features.numel():

        logits = outputs.logits
        loss = None
        if labels is not None:
            logits = logits.float()
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            if attention_mask is not None:
                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
            else:
                shift_logits = shift_logits.contiguous()
                shift_labels = shift_labels.contiguous()
            loss_fct = nn.CrossEntropyLoss()

            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)

            image_hidden_states=image_features if pixel_values is not None else None,
        )

+    # AR inference
    def prepare_inputs_for_generation(
        self,
        input_ids,

        labels=None,
        **kwargs,
    ):
        model_inputs = self.language_model.prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,

            token_type_ids=token_type_ids,
            **kwargs,
        )
        if model_inputs.get("position_ids") is not None:
            model_inputs["position_ids"] += 1
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values
        is_training = token_type_ids is not None and labels is not None
        if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
+            causal_mask = self._update_causal_mask(attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training)
            model_inputs["attention_mask"] = causal_mask
        model_inputs["intrinsic"] = intrinsic
        return model_inputs

            weights_only=weights_only,
            **kwargs,
        )
        if model.config.use_spatial_token:
            model.language_model.model.embed_tokens.weight.data[-model.config.spatial_token_num:] = model.spatial_embed_tokens.weight.data
        return model
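This post-load hook keeps the last spatial_token_num rows of the LM embedding matrix in sync with spatial_embed_tokens; a tiny illustration of the row copy (made-up sizes):

# Toy row-copy: the dedicated spatial/action embedding table overwrites the last
# rows of the language model's embedding matrix after weights are loaded.
import torch
import torch.nn as nn

vocab, hidden, spatial_token_num = 100, 8, 10
embed_tokens = nn.Embedding(vocab, hidden)
spatial_embed_tokens = nn.Embedding(spatial_token_num, hidden)

embed_tokens.weight.data[-spatial_token_num:] = spatial_embed_tokens.weight.data
assert torch.equal(embed_tokens.weight[-1], spatial_embed_tokens.weight[-1])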
processing_spatialvla.py
CHANGED
@@ -1,142 +1,38 @@

-# MIT License
-# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
-# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
-# Based on code licensed under the Apache License, Version 2.0 by Google Inc. and HuggingFace Inc. team (Copyright 2024).
# coding=utf-8
-
-
-
-
-
import logging
from typing import List, Optional, Union, Dict
-import torch
import numpy as np
-
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, is_valid_image
-from transformers.processing_utils import (
-    ImagesKwargs,
-    ProcessingKwargs,
-    ProcessorMixin,
-    TextKwargs,
-    Unpack,
-    _validate_images_text_input_order,
-)
-from transformers.tokenization_utils_base import (
-    AddedToken,
-    PreTokenizedInput,
-    TextInput,
-)
from transformers.utils import logging
-from .
-
logger = logging.get_logger(__name__)

-IMAGE_TOKEN = "<image>"
-EXTRA_TOKENS = [f"<loc{i:0>4}>" for i in range(1024)] + [f"<seg{i:0>3}>" for i in range(128)]
-
-
-class PaliGemmaTextKwargs(TextKwargs):
-    suffix: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]]
-
-
-class PaliGemmaImagesKwargs(ImagesKwargs):
-    do_convert_rgb: Optional[bool]
-
-
-class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False):
-    text_kwargs: PaliGemmaTextKwargs
-    images_kwargs: PaliGemmaImagesKwargs
-    _defaults = {
-        "text_kwargs": {
-            "padding": False,
-        },
-        "images_kwargs": {
-            "data_format": "channels_first",
-        },
-    }
-
-
-# Copied from transformers.models.idefics2.processing_idefics2.is_url
-def is_url(val) -> bool:
-    return isinstance(val, str) and val.startswith("http")
-
-
-# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
-def is_image_or_image_url(elem):
-    return is_url(elem) or is_valid_image(elem)
-
-
-def _is_str_or_image(elem):
-    return isinstance(elem, (str)) or is_image_or_image_url(elem)
-
-
-def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_images):
-    """
-    Builds a string from the input prompt and image tokens.
-    For example, for the call:
-    build_string_from_input(
-        prompt="Prefix str"
-        bos_token="<s>",
-        image_seq_len=3,
-        image_token="<im>",
-    )
-    The output will be:
-    "<im><im><im><s>Initial str"
-    Args:
-        prompt (`List[Union[str, ImageInput]]`): The input prompt.
-        bos_token (`str`): The beginning of sentence token.
-        image_seq_len (`int`): The length of the image sequence.
-        image_token (`str`): The image token.
-        num_images (`int`): Number of images in the prompt.
-    """
-    return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n"
-
-
-# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched video from {images}")
-
-
class SpatialVLAProcessor(ProcessorMixin):
-    r"""
-    Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor.
-
-    [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
-    [`~PaliGemmaProcessor.__call__`] and [`~PaliGemmaProcessor.decode`] for more information.
-
-    Args:
-        image_processor ([`SiglipImageProcessor`], *optional*):
-            The image processor is a required input.
-        tokenizer ([`LlamaTokenizerFast`], *optional*):
-            The tokenizer is a required input.
-        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
-            in a chat into a tokenizable string.
-    """
-
    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "SiglipImageProcessor"

@@ -192,17 +88,13 @@ class SpatialVLAProcessor(ProcessorMixin):

        self.dataset_intrinsics = {}
        height, width = image_processor.size["height"], image_processor.size["width"]

        for k, v in intrinsic_config.items():
            K = torch.tensor(v["intrinsic"]).float()
-
-            K[0, 0] *= width / w
-            K[1, 1] *= height / h
-            K[0, 2] *= width / w
-            K[1, 2] *= height / h
            self.dataset_intrinsics[k] = K
-            print(f"scale intrinsic of {k} from {v['intrinsic']} to {K} ...")

-        self.action_tokenizer =
            tokenizer=tokenizer, num_bins=action_config["num_bins"],
            bin_policy=bin_policy, use_spherical=action_config["use_spherical"],
            min_sigma=min_sigma,
@@ -212,70 +104,10 @@ class SpatialVLAProcessor(ProcessorMixin):

        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        audio=None,
-        videos=None,
        unnorm_key: Optional[str] = None,
        suffix_actions: Optional[np.array] = None,  # (t e)
        **kwargs: Unpack[PaliGemmaProcessorKwargs],
    ) -> BatchFeature:
-        """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
-        of the above two methods for more information.
-
-        The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
-        the prompt in `text`, and will be placed after the prompt. This is because attention is handled differently for
-        the prefix and the suffix. For instance,
-        ```python
-        image = PIL_cow_image
-        prompt = "answer en Where is the cow standing?"
-        suffix = "on the beach"
-        inputs = processor(text=prompt, images=image, suffix=suffix)
-        ```
-        Here `inputs` will contain the `input_ids` and `token_type_ids` that follow
-        ```python
-        inputs["input_ids"][:, 256:]
-        # tensor([[ 2, 6006, 603, 573, 13910, 9980, 235336, 108, 477, 573, 8318]])
-        inputs["token_type_ids"][:, 256:]
-        tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]])
-        ```
-        Meaning the last three tokens are of "label" ("suffix") type while the other ones are of "prefix" type.
-
-
-        Args:
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
-                number of channels, H and W are image height and width.
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors of a particular framework. Acceptable values are:
-
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.
-            suffix (`str`, `List[str]`, `List[List[str]]`):
-                The suffixes or batch of suffixes to be encoded. Only necessary for finetuning. See https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md
-                for more information. If your prompt is "<image> What is on the image", the suffix corresponds to the expected prediction "a cow sitting on a bench".
-
-        Returns:
-            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
-              is provided, the `input_ids` will also contain the suffix input ids.
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
-              `None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
-            - **labels** -- Labels compatible with training if `suffix` is not None
-        """
-        # check if images and text inputs are reversed for BC
        images, text = _validate_images_text_input_order(images, text)

        output_kwargs = self._merge_kwargs(
@@ -294,9 +126,7 @@ class SpatialVLAProcessor(ProcessorMixin):

        if images is None:
            raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
        if text is None:
-            logger.warning_once(
-                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
-            )
            text = ""

        if _is_str_or_image(text):
@@ -306,31 +136,19 @@ class SpatialVLAProcessor(ProcessorMixin):

        if text is not None and images is not None:
            if not any(IMAGE_TOKEN in sample for sample in text):
-                # logger.warning(
-                #     "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special "
-                #     "image tokens in the text, as many tokens as there are images per each text. It is recommended to "
-                #     "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images "
-                #     "each text has and add special tokens."
-                # )
                if isinstance(text, List) and isinstance(images, List):
                    if len(images) != len(text):
                        raise ValueError(
                            f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images."
                        )
-
-                # make a nested list of lists to be able to iterate over the images and text below
                if is_valid_image(images):
                    images = [[images]]
                elif isinstance(images, list) and is_valid_image(images[0]):
                    images = [[image] for image in images]
                elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
                    raise ValueError("images must be an image, list of images or list of list of images")
-
-        if suffix is not None and _is_str_or_image(suffix):
-            suffix = [suffix]
-        if suffix is not None:
-            suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
-
        input_strings = [
            build_string_from_input(
                prompt=prompt,
@@ -355,7 +173,6 @@ class SpatialVLAProcessor(ProcessorMixin):

        input_strings = [f"{sample}\n" for sample in expanded_samples]
        pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]

-        # max_length has to account for the image tokens
        if output_kwargs["text_kwargs"].get("max_length", None) is not None:
            output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length

@@ -391,7 +208,6 @@ class SpatialVLAProcessor(ProcessorMixin):

        return self.tokenizer.decode(*args, **kwargs)

    @property
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
@@ -407,7 +223,7 @@ class SpatialVLAProcessor(ProcessorMixin):

        assert self.tokenizer.eos_token != predicted_action_token_ids[-1], "[error] actions contain EOS token, please check your truncation settings!"

        if predicted_action_token_ids.shape[0] < action_token_num * self.action_chunk_size:  # pad with zeros
-
            predicted_action_token_ids = np.concatenate(
                [
                    predicted_action_token_ids,
@@ -417,9 +233,8 @@ class SpatialVLAProcessor(ProcessorMixin):

        predicted_action_token_ids = predicted_action_token_ids.reshape(-1, action_token_num)
        normalized_action_chunks = self.action_tokenizer.decode_token_ids_to_actions(predicted_action_token_ids)

-        # Unnormalize actions
        if unnorm_key is None:
-
            unnorm_key = next(iter(self.statistics.keys()))
        action_norm_stats = self.statistics[unnorm_key]["action"]

# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import logging
from typing import List, Optional, Union, Dict
import numpy as np
+import torch
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, is_valid_image
+from transformers.processing_utils import Unpack, _validate_images_text_input_order, ProcessorMixin
+from transformers.tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
from transformers.utils import logging
+from transformers.models.paligemma.processing_paligemma import (
+    make_batched_images,
+    build_string_from_input,
+    _is_str_or_image,
+    PaliGemmaProcessorKwargs,
+    IMAGE_TOKEN,
+    EXTRA_TOKENS
+)
+from .action_tokenizer import SpatialActionTokenizer
logger = logging.get_logger(__name__)

class SpatialVLAProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "SiglipImageProcessor"

        self.dataset_intrinsics = {}
        height, width = image_processor.size["height"], image_processor.size["width"]

+        # scale intrinsic matrix
        for k, v in intrinsic_config.items():
            K = torch.tensor(v["intrinsic"]).float()
+            K[:2] *= torch.tensor([width / v["width"], height / v["height"]])[:, None]
            self.dataset_intrinsics[k] = K
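The one-liner above rescales fx/cx by the width ratio and fy/cy by the height ratio when the raw camera frame is resized to the processor input size. A quick numeric check with made-up intrinsics:

# Numeric sanity check for the intrinsic rescaling above: resizing a 640x480 frame to
# 224x224 multiplies row 0 (fx, 0, cx) by 224/640 and row 1 (0, fy, cy) by 224/480.
import torch

v = {"intrinsic": [[600.0, 0.0, 320.0], [0.0, 600.0, 240.0], [0.0, 0.0, 1.0]],
     "width": 640, "height": 480}
width = height = 224
K = torch.tensor(v["intrinsic"]).float()
K[:2] *= torch.tensor([width / v["width"], height / v["height"]])[:, None]
print(K[0, 0], K[1, 1], K[0, 2], K[1, 2])  # fx=210.0, fy=280.0, cx=112.0, cy=112.0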

+        self.action_tokenizer = SpatialActionTokenizer(
            tokenizer=tokenizer, num_bins=action_config["num_bins"],
            bin_policy=bin_policy, use_spherical=action_config["use_spherical"],
            min_sigma=min_sigma,

        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        unnorm_key: Optional[str] = None,
        suffix_actions: Optional[np.array] = None,  # (t e)
        **kwargs: Unpack[PaliGemmaProcessorKwargs],
    ) -> BatchFeature:
        images, text = _validate_images_text_input_order(images, text)

        output_kwargs = self._merge_kwargs(

        if images is None:
            raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
        if text is None:
+            logger.warning_once("You are using PaliGemma without a text prefix. It will perform as a picture-captioning model.")
            text = ""

        if _is_str_or_image(text):

        if text is not None and images is not None:
            if not any(IMAGE_TOKEN in sample for sample in text):
                if isinstance(text, List) and isinstance(images, List):
                    if len(images) != len(text):
                        raise ValueError(
                            f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images."
                        )
                if is_valid_image(images):
                    images = [[images]]
                elif isinstance(images, list) and is_valid_image(images[0]):
                    images = [[image] for image in images]
                elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
                    raise ValueError("images must be an image, list of images or list of list of images")
+        if suffix is not None and _is_str_or_image(suffix): suffix = [suffix]
+        if suffix is not None: suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
        input_strings = [
            build_string_from_input(
                prompt=prompt,

        input_strings = [f"{sample}\n" for sample in expanded_samples]
        pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]

        if output_kwargs["text_kwargs"].get("max_length", None) is not None:
            output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length

        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names

        assert self.tokenizer.eos_token != predicted_action_token_ids[-1], "[error] actions contain EOS token, please check your truncation settings!"

        if predicted_action_token_ids.shape[0] < action_token_num * self.action_chunk_size:  # pad with zeros
+            logger.warning("Padding zero action!")
            predicted_action_token_ids = np.concatenate(
                [
                    predicted_action_token_ids,

        predicted_action_token_ids = predicted_action_token_ids.reshape(-1, action_token_num)
        normalized_action_chunks = self.action_tokenizer.decode_token_ids_to_actions(predicted_action_token_ids)

        if unnorm_key is None:
+            logger.warning(f"unnorm_key {unnorm_key} is not provided, falling back to the first key in statistics")
            unnorm_key = next(iter(self.statistics.keys()))
        action_norm_stats = self.statistics[unnorm_key]["action"]

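decode_actions then maps the normalized values back to metric actions using these dataset statistics; a sketch of the usual unnormalization step, assuming OXE-style q01/q99 bounds (the exact field names in statistics may differ):

# Hypothetical unnormalization sketch: map actions from [-1, 1] back to the dataset's
# value range using per-dimension 1st/99th-percentile bounds; field names are assumed.
import numpy as np

normalized_actions = np.array([[0.0, 0.5, -1.0]])          # one 3-dof chunk in [-1, 1]
action_norm_stats = {"q01": np.array([-0.1, -0.2, -0.3]),
                     "q99": np.array([0.1, 0.2, 0.3])}
low, high = action_norm_stats["q01"], action_norm_stats["q99"]
actions = 0.5 * (normalized_actions + 1.0) * (high - low) + low
print(actions)  # [[ 0.   0.1 -0.3]]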
test_huggingface.py
CHANGED
@@ -1,17 +1,12 @@

import os
import argparse
from pathlib import Path
-import shutil
-import os
-import argparse
-from pathlib import Path
-import shutil
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

parser = argparse.ArgumentParser("Huggingface AutoModel Testing")
-parser.add_argument("--model_name_or_path", default="", help="pretrained model name or path.")
parser.add_argument("--num_images", type=int, default=1, help="num_images for testing.")

args = parser.parse_args()

@@ -32,4 +27,4 @@ if __name__ == "__main__":

    print(generation_outputs, processor.batch_decode(generation_outputs))

    actions = processor.decode_actions(generation_outputs, unnorm_key="bridge_orig/1.0.0")
-    print(actions)

import os
import argparse
from pathlib import Path
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

parser = argparse.ArgumentParser("Huggingface AutoModel Testing")
+parser.add_argument("--model_name_or_path", default=".", help="pretrained model name or path.")
parser.add_argument("--num_images", type=int, default=1, help="num_images for testing.")

args = parser.parse_args()

    print(generation_outputs, processor.batch_decode(generation_outputs))

    actions = processor.decode_actions(generation_outputs, unnorm_key="bridge_orig/1.0.0")
+    print(actions)
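The collapsed middle of this script loads the checkpoint and runs generation; a rough sketch of that flow with stock transformers APIs and the imports/args shown above (the repo's actual prompt, image source, and generation settings may differ):

# Hypothetical reconstruction of the elided steps: load with trust_remote_code and
# run a short greedy generation. "example.png" and the prompt are placeholders.
model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(args.model_name_or_path, trust_remote_code=True)
image = Image.open("example.png")                               # any RGB frame
prompt = "What action should the robot take to pick up the cup?"
inputs = processor(images=image, text=prompt, return_tensors="pt")
generation_outputs = model.generate(**inputs, max_new_tokens=32, do_sample=False)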
|