delinqu committed · verified · Commit 8cbc8de · 1 Parent(s): 985151c

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
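Since tokenizer.json is now tracked with Git LFS, pulling the repository through huggingface_hub resolves the LFS pointer automatically. A minimal sketch, assuming a hypothetical repo id (the actual id is not shown on this page):

```python
# Hedged sketch: download this model repo, LFS-tracked files such as tokenizer.json included.
# "IPEC-COMMUNITY/spatialvla-placeholder" is a placeholder repo id for illustration only.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="IPEC-COMMUNITY/spatialvla-placeholder")
print(local_dir)  # folder containing config.json, action_tokenizer.py, tokenizer.json, ...
```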
action_tokenizer.py ADDED
@@ -0,0 +1,445 @@
# MIT License
# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
# coding=utf-8

"""
action_tokenizer.py
Extension class; wraps a base LLM/VLM tokenizer with logic to discretize and tokenize continuous robot actions.
"""
from typing import List, Union, Dict, Tuple, Optional
import numpy as np
from transformers import PreTrainedTokenizerBase
from pathlib import Path
import json
from scipy.stats import norm
import torch

ACTION_TOKEN = '<ACTION{:05d}>'

"""Uniform Action Tokenizer"""
class ActionTokenizer:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: int = 256,
        min_action: int = -1,
        max_action: int = 1,
    ):
        self._vocab_size = num_bins
        self.tokenizer = tokenizer
        self.min_action, self.max_action = min_action, max_action
        self.bin_centers = np.linspace(min_action, max_action, num_bins)

        # add special action tokens to the language tokenizer
        token_list = [ACTION_TOKEN.format(i) for i in range(self._vocab_size)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Added {num_new_tokens} ACTION TOKENS, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.action_token_begin_idx = self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])

    def __call__(self, action: np.ndarray) -> np.ndarray:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 7), continuous actions.
        return: np.ndarray, (n, 7), tokens.
        """
        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
        ids = np.digitize(action, self.bin_centers, right=True)  # [0, 255]
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n, 7), token ids.
        return: np.ndarray, (n, 7), continuous actions.
        """
        ids = action_token_id - self.action_token_begin_idx
        ids = np.clip(ids, a_min=0, a_max=self._vocab_size - 1)
        return self.bin_centers[ids]

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

"""Spatial Tokenizer"""
class TranslationTokenizer:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: Dict,
        bin_policy: Optional[Dict] = None,
        use_spherical: bool = True,
    ):
        self.tokenizer = tokenizer
        self.num_theta_bins = num_bins["theta_bins"]
        self.num_phi_bins = num_bins["phi_bins"]
        self.num_r_bins = num_bins["r_bins"]
        self.use_spherical = use_spherical

        # for indexing
        self.NP = self.num_phi_bins * self.num_r_bins

        # add special action tokens to the language tokenizer
        self._vocab_size = self.num_theta_bins * self.num_phi_bins * self.num_r_bins
        token_list = [ACTION_TOKEN.format(i) for i in range(self._vocab_size)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Added {num_new_tokens} TRANSLATION TOKENS, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
        self.set_bins(bin_policy)

    def set_bins(self, bin_policy):
        self.theta_bins = np.array(bin_policy["theta_bins"])
        self.phi_bins = np.array(bin_policy["phi_bins"])
        self.r_bins = np.array(bin_policy["r_bins"])

    def cartesian_to_spherical(self, x, y, z):
        theta = np.arctan2(np.sqrt(x**2 + y**2), z)  # polar angle
        phi = np.arctan2(y, x)  # azimuthal angle
        r = np.sqrt(x**2 + y**2 + z**2)
        return theta, phi, r

    def spherical_to_cartesian(self, theta, phi, r):
        x = r * np.sin(theta) * np.cos(phi)
        y = r * np.sin(theta) * np.sin(phi)
        z = r * np.cos(theta)
        return x, y, z

    def __call__(self, action: np.ndarray) -> np.ndarray:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 3), continuous translations in Cartesian (or spherical) coordinates.
        return: np.ndarray, (n,), tokens.
        """
        if self.use_spherical:
            theta, phi, r = self.cartesian_to_spherical(action[:, 0], action[:, 1], action[:, 2])
        else:
            theta, phi, r = action[:, 0], action[:, 1], action[:, 2]

        disc_theta = np.digitize(theta, self.theta_bins[1:-1])
        disc_phi = np.digitize(phi, self.phi_bins[1:-1])
        disc_r = np.digitize(r, self.r_bins[1:-1])
        ids = disc_theta * self.NP + disc_phi * self.num_r_bins + disc_r
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n,), token ids.
        return: np.ndarray, (n, 3), continuous actions.
        """
        action_token_id = np.clip(action_token_id, self.token_start_idx, self.token_end_idx)
        ids = action_token_id - self.token_start_idx
        disc_theta, disc_phi, disc_r = ids // self.NP, (ids % self.NP) // self.num_r_bins, ids % self.num_r_bins

        theta = 0.5 * (self.theta_bins[disc_theta] + self.theta_bins[disc_theta + 1])
        phi = 0.5 * (self.phi_bins[disc_phi] + self.phi_bins[disc_phi + 1])
        r = 0.5 * (self.r_bins[disc_r] + self.r_bins[disc_r + 1])

        # clip actions to [-1, 1]: the spherical action space is the circumscribed sphere of the Cartesian action space.
        x, y, z = self.spherical_to_cartesian(theta, phi, r) if self.use_spherical else (theta, phi, r)
        x, y, z = np.clip([x, y, z], -1, 1)
        return np.stack((x, y, z), axis=1)

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

class RotationTokenizer:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: Dict,
        bin_policy: Optional[Dict] = None,
        array_begin_idx=None,
    ):
        self.tokenizer = tokenizer
        self.num_roll_bins = num_bins["roll_bins"]  # M
        self.num_pitch_bins = num_bins["pitch_bins"]  # N
        self.num_yaw_bins = num_bins["yaw_bins"]  # P
        self.array_begin_idx = array_begin_idx

        # for indexing
        self.NP = self.num_pitch_bins * self.num_yaw_bins

        # add special action tokens to the language tokenizer
        self._vocab_size = self.num_roll_bins * self.num_pitch_bins * self.num_yaw_bins
        token_list = [ACTION_TOKEN.format(i + self.array_begin_idx) for i in range(self._vocab_size)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Added {num_new_tokens} ROTATION TOKENS to tokenizer, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
        self.set_bins(bin_policy)

    def set_bins(self, bin_policy):
        self.roll_bins = np.array(bin_policy["roll_bins"])
        self.pitch_bins = np.array(bin_policy["pitch_bins"])
        self.yaw_bins = np.array(bin_policy["yaw_bins"])

    def __call__(self, action: np.ndarray) -> np.ndarray:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 3), continuous rotations (roll, pitch, yaw).
        return: np.ndarray, (n,), tokens.
        """
        roll, pitch, yaw = action[:, 0], action[:, 1], action[:, 2]
        disc_roll = np.clip(np.digitize(roll, self.roll_bins) - 1, 0, self.num_roll_bins - 1)
        disc_pitch = np.clip(np.digitize(pitch, self.pitch_bins) - 1, 0, self.num_pitch_bins - 1)
        disc_yaw = np.clip(np.digitize(yaw, self.yaw_bins) - 1, 0, self.num_yaw_bins - 1)

        ids = disc_roll * self.NP + disc_pitch * self.num_yaw_bins + disc_yaw
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: Union[np.int64, np.ndarray]) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n,), token ids.
        return: np.ndarray, (n, 3), continuous actions.
        """
        action_token_id = np.clip(action_token_id, a_min=self.token_start_idx, a_max=self.token_end_idx)
        ids = action_token_id - self.token_start_idx
        disc_roll, disc_pitch, disc_yaw = ids // self.NP, (ids % self.NP) // self.num_yaw_bins, ids % self.num_yaw_bins

        roll = 0.5 * (self.roll_bins[disc_roll] + self.roll_bins[disc_roll + 1])
        pitch = 0.5 * (self.pitch_bins[disc_pitch] + self.pitch_bins[disc_pitch + 1])
        yaw = 0.5 * (self.yaw_bins[disc_yaw] + self.yaw_bins[disc_yaw + 1])
        return np.stack((roll, pitch, yaw), axis=1)

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

class GripperTokenzier:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: int = 2,
        array_begin_idx=None,
    ) -> None:
        self.tokenizer = tokenizer
        self.num_bins = num_bins
        self.array_begin_idx = array_begin_idx
        token_list = [ACTION_TOKEN.format(i + self.array_begin_idx) for i in range(self.num_bins)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Added {num_new_tokens} GRIPPER TOKENS to tokenizer, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])

    def __call__(self, action: np.ndarray) -> np.ndarray:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n,), continuous gripper actions in [0, 1].
        return: np.ndarray, (n,), tokens.
        """
        ids = np.where(action >= 0.5, 1, 0)
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n,), token ids.
        return: np.ndarray, (n, 1), continuous actions.
        """
        action_token_id = np.clip(action_token_id, self.token_start_idx, self.token_end_idx)
        ids = action_token_id - self.token_start_idx
        actions = np.where(ids == 0, 0., 1.)
        return actions[:, None]

    @property
    def vocab_size(self) -> int:
        return self.num_bins

class SphericalCoordinateActionTokenizer:
    range_bins = {
        "translation": {
            "theta_bins": (0.0, np.pi),
            "phi_bins": (-np.pi, np.pi),
            "r_bins": (0.0, np.sqrt(3)),
        },
        "rotation": {
            "roll_bins": (-1.0, 1.0),
            "pitch_bins": (-1.0, 1.0),
            "yaw_bins": (-1.0, 1.0),
        },
    }

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: Dict,
        gs_params: Dict = None,
        bin_policy: Dict = None,
        use_spherical: bool = True,
        min_sigma: float = 0.0,
        min_action: float = -1.0,
        max_action: float = 1.0,
    ):
        """Use bin_policy if provided; otherwise calculate it from gs_params (uniform bins when gs_params is None).
        gs_params: Optional[Dict],
        bin_policy: Optional[Dict],
        """
        self.tokenizer = tokenizer
        self.min_action, self.max_action = min_action, max_action
        self.num_bins = num_bins
        self.min_sigma = min_sigma

        # set bin policy
        self.bin_policy = bin_policy if bin_policy else self.get_bin_policy(gs_params, self.min_sigma)

        self.translation_tokenizer = TranslationTokenizer(
            self.tokenizer,
            self.num_bins["translation"],
            self.bin_policy["translation"],
            use_spherical=use_spherical,
        )

        self.rotation_tokenizer = RotationTokenizer(
            self.tokenizer,
            self.num_bins["rotation"],
            self.bin_policy["rotation"],
            array_begin_idx=self.translation_tokenizer.vocab_size,
        )

        self.gripper_tokenizer = GripperTokenzier(
            self.tokenizer,
            self.num_bins["gripper"],
            array_begin_idx=self.translation_tokenizer.vocab_size + self.rotation_tokenizer.vocab_size,
        )
        self._vocab_size = self.translation_tokenizer.vocab_size + self.rotation_tokenizer.vocab_size + self.gripper_tokenizer.vocab_size

    def __call__(self, action: np.ndarray) -> np.ndarray:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 7), continuous actions in Cartesian coordinates.
        return: np.ndarray, (n, 3), tokens.
        """
        if len(action.shape) == 1:
            assert action.shape[0] == 7, f"action dim mismatch, got action shape: {action.shape}"
            action = action.reshape(1, 7)
        assert action.shape[1] == 7, f"action dim mismatch, got action shape: {action.shape}"

        action = np.clip(action, a_min=self.min_action, a_max=self.max_action)
        trans_tokens = self.translation_tokenizer(action[:, :3])  # (n,)
        rot_tokens = self.rotation_tokenizer(action[:, 3:6])  # (n,)
        grip_tokens = self.gripper_tokenizer(action[:, 6])  # (n,)
        return np.stack((trans_tokens, rot_tokens, grip_tokens), axis=1)  # (n, 3)

    def decode_token_ids_to_actions(self, action_token_ids: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_ids: np.ndarray, (n, 3), token ids.
        """
        if len(action_token_ids.shape) == 1:
            assert action_token_ids.shape[0] == 3, f"action token id number mismatch, need 3, got {action_token_ids.shape[0]}"
            action_token_ids = action_token_ids.reshape(1, 3)
        assert action_token_ids.shape[1] == 3, f"token id number mismatch, need 3, got {action_token_ids.shape[1]}"

        trans_action = self.translation_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 0])  # (n, 3)
        rot_action = self.rotation_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 1])  # (n, 3)
        grip_action = self.gripper_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 2])  # (n, 1)
        return np.concatenate((trans_action, rot_action, grip_action), axis=1)  # (n, 7)

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    @property
    def action_token_begin_idx(self) -> int:
        return self.translation_tokenizer.token_start_idx

    def get_bin_policy(self, gs_params=None, min_sigma=0.0):
        bin_policy = {
            "translation": {"theta_bins": None, "phi_bins": None, "r_bins": None},
            "rotation": {"roll_bins": None, "pitch_bins": None, "yaw_bins": None},
        }
        if gs_params is None:
            for bin_type in self.range_bins.keys():
                for bin_key in self.range_bins[bin_type].keys():
                    bin_policy[bin_type][bin_key] = np.linspace(*self.range_bins[bin_type][bin_key], self.num_bins[bin_type][bin_key] + 1)
            print(f"use uniform bin grids ... \n{bin_policy}")
        else:
            for bin_type in self.range_bins.keys():
                for bin_key in self.range_bins[bin_type].keys():
                    mu = gs_params[bin_key.split("_")[0].lower()]["mu"]
                    sigma = max(gs_params[bin_key.split("_")[0].lower()]["sigma"], min_sigma)
                    bin_bound_prob = np.linspace(
                        norm.cdf(self.range_bins[bin_type][bin_key][0], loc=mu, scale=sigma),
                        norm.cdf(self.range_bins[bin_type][bin_key][1], loc=mu, scale=sigma),
                        self.num_bins[bin_type][bin_key] + 1,
                    )
                    bin_boundary = norm.ppf(bin_bound_prob, loc=mu, scale=sigma)
                    bin_policy[bin_type][bin_key] = np.clip(
                        bin_boundary,
                        self.range_bins[bin_type][bin_key][0],
                        self.range_bins[bin_type][bin_key][1],
                    ).tolist()  # for serialization
            print(f"calculate bin grids from Gaussians \n{bin_policy}")
        return bin_policy

    def get_norm_meshgrid(self, bin_policy):
        grids = []
        policy = {k1: {k2: np.array(v2) for k2, v2 in v1.items()} for k1, v1 in bin_policy.items()}
        # NOTE: use the unified key order of range_bins (theta/phi/r, roll/pitch/yaw)
        for bin_type in self.range_bins.keys():
            bounds = []
            for bin_key in self.range_bins[bin_type].keys():
                minb, maxb = self.range_bins[bin_type][bin_key][0], self.range_bins[bin_type][bin_key][1]
                bin_boundary = policy[bin_type][bin_key]
                bin_center = (bin_boundary[:-1] + bin_boundary[1:]) / 2
                bin_center = np.concatenate([np.array([minb]), bin_center, np.array([maxb])])  # padding
                bin_center = (bin_center - minb) / (maxb - minb)  # normalize to [0, 1]
                bounds.append(bin_center)
            # generate grids
            grid_x, grid_y, grid_z = np.meshgrid(*bounds)
            grids += [np.stack([grid_x, grid_y, grid_z], -1).reshape(-1, 3)]
        return grids[0], grids[1]  # (N, 3)

    def spatial_embedding_adaption(self, gs_params, embeddings: torch.nn.Embedding, min_sigma=0.0, adpt_feature=False):
        """
        gs_params: Dict, Gaussian statistics of the new action distribution.
        embeddings: torch.nn.Embedding with weight of shape (S, E).
        """
        from scipy.interpolate import griddata

        new_policy = self.get_bin_policy(gs_params, min_sigma=min_sigma)
        trans_grids0, rot_grids0 = self.get_norm_meshgrid(self.bin_policy)
        trans_grids1, rot_grids1 = self.get_norm_meshgrid(new_policy)

        print("🔥 overwrite bin policy and tokenizer bins ...")
        self.bin_policy = new_policy
        self.min_sigma = min_sigma
        self.translation_tokenizer.set_bins(new_policy["translation"])
        self.rotation_tokenizer.set_bins(new_policy["rotation"])

        if adpt_feature:
            emb_data = embeddings.weight.data  # (S, E)
            _, E = emb_data.shape

            # translation
            m, n, k = (self.num_bins["translation"][k] for k in ["theta_bins", "phi_bins", "r_bins"])
            N = m * n * k
            trans_emb_data = emb_data[:N,].reshape(m, n, k, -1).permute(3, 0, 1, 2)  # (E, m, n, k)
            pad_emb = torch.nn.functional.pad(trans_emb_data, (1, 1, 1, 1, 1, 1), "replicate").permute(1, 2, 3, 0).reshape(-1, E)
            adpt_trans_emb = griddata(trans_grids0, pad_emb.float(), trans_grids1, method='linear')
            adpt_trans_emb = adpt_trans_emb.reshape(m + 2, n + 2, k + 2, E)[1:-1, 1:-1, 1:-1,]

            # rotation
            m1, n1, k1 = (self.num_bins["rotation"][k] for k in ["roll_bins", "pitch_bins", "yaw_bins"])
            M = m1 * n1 * k1
            rot_emb_data = emb_data[N : N + M,].reshape(m1, n1, k1, -1).permute(3, 0, 1, 2)  # (E, m1, n1, k1)
            pad_emb = torch.nn.functional.pad(rot_emb_data, (1, 1, 1, 1, 1, 1), "replicate").permute(1, 2, 3, 0).reshape(-1, E)
            adpt_rot_emb = griddata(rot_grids0, pad_emb.float(), rot_grids1, method='linear')
            adpt_rot_emb = adpt_rot_emb.reshape(m1 + 2, n1 + 2, k1 + 2, E)[1:-1, 1:-1, 1:-1,]

            # set data
            device, dtype = embeddings.weight.data.device, embeddings.weight.data.dtype
            embeddings.weight.data[:N] = torch.Tensor(adpt_trans_emb.reshape(-1, E), device=device).to(dtype)
            embeddings.weight.data[N:N + M] = torch.Tensor(adpt_rot_emb.reshape(-1, E), device=device).to(dtype)
            print("🚀 DONE! adapting spatial embeddings to the new Gaussian distribution finished.")
            print(embeddings.weight.data)
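The classes above form a three-token-per-action codec: translation is packed into one `<ACTION…>` token via (theta, phi, r) bins, rotation into a second via (roll, pitch, yaw) bins, and the gripper into a third. A minimal round-trip sketch, assuming a local checkout of this repository and illustrative bin counts (the counts used for the released checkpoint are not stated in this file):

```python
# Hedged usage sketch (not part of the uploaded files).
# Assumptions: "." is a local checkout of this repo; the bin counts below are illustrative only.
import numpy as np
from transformers import AutoTokenizer
from action_tokenizer import SphericalCoordinateActionTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")
num_bins = {
    "translation": {"theta_bins": 8, "phi_bins": 8, "r_bins": 8},
    "rotation": {"roll_bins": 8, "pitch_bins": 8, "yaw_bins": 8},
    "gripper": 2,
}
# gs_params=None -> uniform bin boundaries over range_bins; note the constructor
# adds the <ACTION...> tokens to `tokenizer` in place.
action_tokenizer = SphericalCoordinateActionTokenizer(tokenizer, num_bins)

action = np.array([[0.10, -0.20, 0.30, 0.05, -0.05, 0.00, 1.00]])  # (x, y, z, roll, pitch, yaw, gripper)
tokens = action_tokenizer(action)  # (1, 3) array of '<ACTIONxxxxx>' strings
token_ids = np.array([tokenizer.convert_tokens_to_ids(t) for t in tokens[0]])
decoded = action_tokenizer.decode_token_ids_to_actions(token_ids)  # (1, 7) bin-center reconstruction
print(tokens, decoded)
```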
config.json ADDED
@@ -0,0 +1,318 @@
{
  "_vocab_size": 265347,
  "action_token_begin_idx": 257153,
  "architectures": [
    "SpatialVLAForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_spatialvla.SpatialVLAConfig",
    "AutoModel": "modeling_spatialvla.SpatialVLAForConditionalGeneration"
  },
  "bos_token_id": 2,
  "ego3d_patch_reso": 2,
  "eos_token_id": 1,
  "hidden_size": 2048,
  "image_token_index": 257152,
  "model_type": "spatialvla",
  "n_freqs": 8,
  "num_hidden_layers": 26,
  "pad_token_id": 0,
  "projection_dim": 2304,
  "spatial_token_num": 8194,
  "text_config": {
    "architectures": [
      "Gemma2ForCausalLM"
    ],
    "eos_token_id": [
      1,
      107
    ],
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 2304,
    "intermediate_size": 9216,
    "model_type": "gemma2",
    "num_hidden_layers": 26,
    "num_image_tokens": 256,
    "num_key_value_heads": 4,
    "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "vocab_size": 265347
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.47.0",
  "use_spatial_token": true,
  "use_vision_zoe": true,
  "vision_config": {
    "hidden_size": 1152,
    "intermediate_size": 4304,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "num_image_tokens": 256,
    "num_positions": 256,
    "patch_size": 14,
    "projection_dim": 2304,
    "torch_dtype": "bfloat16",
    "vision_use_head": false
  },
  "vision_zoe_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "Intel/zoedepth-nyu-kitti",
    "add_cross_attention": false,
    "add_projection": false,
    "architectures": [
      "ZoeDepthForDepthEstimation"
    ],
    "attractor_alpha": 1000,
    "attractor_gamma": 2,
    "attractor_kind": "mean",
    "backbone": null,
    "backbone_config": {
      "_attn_implementation_autoset": false,
      "_name_or_path": "",
      "add_cross_attention": false,
      "add_fpn": false,
      "architectures": null,
      "attention_probs_dropout_prob": 0.0,
      "auxiliary_channels": 256,
      "auxiliary_concat_input": false,
      "auxiliary_loss_weight": 0.4,
      "auxiliary_num_convs": 1,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "drop_path_rate": 0.1,
      "early_stopping": false,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "hidden_act": "gelu",
      "hidden_dropout_prob": 0.0,
      "hidden_size": 1024,
      "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
      },
      "image_size": 384,
      "initializer_range": 0.02,
      "intermediate_size": 4096,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {
        "LABEL_0": 0,
        "LABEL_1": 1
      },
      "layer_norm_eps": 1e-12,
      "layer_scale_init_value": 0.1,
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "beit",
      "no_repeat_ngram_size": 0,
      "num_attention_heads": 16,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_channels": 3,
      "num_hidden_layers": 24,
      "num_return_sequences": 1,
      "out_features": [
        "stage6",
        "stage12",
        "stage18",
        "stage24"
      ],
      "out_indices": [
        6,
        12,
        18,
        24
      ],
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "patch_size": 16,
      "pool_scales": [
        1,
        2,
        3,
        6
      ],
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "reshape_hidden_states": false,
      "return_dict": true,
      "return_dict_in_generate": false,
      "semantic_loss_ignore_index": 255,
      "sep_token_id": null,
      "stage_names": [
        "stem",
        "stage1",
        "stage2",
        "stage3",
        "stage4",
        "stage5",
        "stage6",
        "stage7",
        "stage8",
        "stage9",
        "stage10",
        "stage11",
        "stage12",
        "stage13",
        "stage14",
        "stage15",
        "stage16",
        "stage17",
        "stage18",
        "stage19",
        "stage20",
        "stage21",
        "stage22",
        "stage23",
        "stage24"
      ],
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "typical_p": 1.0,
      "use_absolute_position_embeddings": false,
      "use_auxiliary_head": true,
      "use_bfloat16": false,
      "use_mask_token": false,
      "use_mean_pooling": true,
      "use_relative_position_bias": true,
      "use_shared_relative_position_bias": false,
      "vocab_size": 8192
    },
    "backbone_hidden_size": 1024,
    "bad_words_ids": null,
    "batch_norm_eps": 1e-05,
    "begin_suppress_tokens": null,
    "bin_centers_type": "softplus",
    "bin_configurations": [
      {
        "max_depth": 10.0,
        "min_depth": 0.001,
        "n_bins": 64,
        "name": "nyu"
      },
      {
        "max_depth": 80.0,
        "min_depth": 0.001,
        "n_bins": 64,
        "name": "kitti"
      }
    ],
    "bin_embedding_dim": 128,
    "bos_token_id": null,
    "bottleneck_features": 256,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "fusion_hidden_size": 256,
    "head_in_index": -1,
    "hidden_act": "gelu",
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "max_temp": 50.0,
    "min_length": 0,
    "min_temp": 0.0212,
    "model_type": "zoedepth",
    "neck_hidden_sizes": [
      256,
      512,
      1024,
      1024
    ],
    "no_repeat_ngram_size": 0,
    "num_attractors": [
      16,
      8,
      4,
      1
    ],
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_patch_transformer_layers": 4,
    "num_relative_features": 32,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_transformer_hidden_size": 128,
    "patch_transformer_intermediate_size": 1024,
    "patch_transformer_num_attention_heads": 4,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "readout_type": "project",
    "reassemble_factors": [
      4,
      2,
      1,
      0.5
    ],
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": "bfloat16",
    "torchscript": false,
    "typical_p": 1.0,
    "use_batch_norm_in_fusion_residual": false,
    "use_bfloat16": false,
    "use_bias_in_fusion_residual": null,
    "use_pretrained_backbone": false
  }
}
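The vocabulary bookkeeping in this config is internally consistent: the 8194 spatial/action tokens occupy the tail of the embedding table starting at index 257153, giving the expanded vocabulary of 265347. A small sanity-check sketch (field names taken from the config above; assumes a local copy of the file):

```python
import json

# Assumption: config.json from this repo is present in the working directory.
with open("config.json") as f:
    cfg = json.load(f)

# 257153 (action token start) + 8194 (spatial/action tokens) = 265347 (total vocab)
assert cfg["action_token_begin_idx"] + cfg["spatial_token_num"] == cfg["_vocab_size"]
assert cfg["_vocab_size"] == cfg["text_config"]["vocab_size"]
```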
configuration_spatialvla.py ADDED
@@ -0,0 +1,171 @@
# MIT License
# Copyright (c) 2025 IPEC at Shanghai AI Laboratory
# Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
# coding=utf-8

"""SpatialVLA (PaliGemma-style) model configuration"""

import warnings

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class SpatialVLAConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PaliGemmaForConditionalGeneration`]. It is used to instantiate a
    PaliGemma-style model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [paligemma-hf/paligemma-2b](https://huggingface.co/paligemma-hf/paligemma-2b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`PaliGemmaVisionConfig`, *optional*):
            Custom vision config or dict
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 256000):
            The image token index to encode the image prompt.
        vocab_size (`int`, *optional*, defaults to 257152):
            Vocabulary size of the PaliGemma model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~PaliGemmaForConditionalGeneration`]
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimension of the multimodal projection space.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden layer of the Language model.

    Example:

    ```python
    >>> from transformers import PaliGemmaForConditionalGeneration, PaliGemmaConfig, SiglipVisionConfig, GemmaConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a PaliGemma config
    >>> text_config = GemmaConfig()

    >>> # Initializing a PaliGemma paligemma-3b-224 style configuration
    >>> configuration = PaliGemmaConfig(vision_config, text_config)

    >>> # Initializing a model from the paligemma-3b-224 style configuration
    >>> model = PaliGemmaForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "spatialvla"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=256000,
        vocab_size=257152,
        projection_dim=2048,
        hidden_size=2048,
        vision_zoe_config=None,
        action_token_begin_idx=None,
        spatial_token_num=259,
        use_spatial_token=False,
        ego3d_patch_reso=4,
        n_freqs=8,
        use_vision_zoe=True,
        # wrap_lora=False,
        **kwargs,
    ):
        self._ignore_index = ignore_index
        self.image_token_index = image_token_index
        self._vocab_size = vocab_size
        self.projection_dim = projection_dim
        self.hidden_size = hidden_size
        self.vision_config = vision_config
        self.is_encoder_decoder = False

        if isinstance(self.vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
            )
            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
                intermediate_size=4096,
                hidden_size=1152,
                patch_size=14,
                image_size=224,
                num_hidden_layers=27,
                num_attention_heads=16,
                vocab_size=257152,
                vision_use_head=False,
            )

        self.text_config = text_config
        if isinstance(self.text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma2"
            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            self.text_config = CONFIG_MAPPING["gemma2"](
                hidden_size=2048,
                num_hidden_layers=18,
                intermediate_size=16384,
                num_attention_heads=8,
                num_key_value_heads=1,
                is_encoder_decoder=False,
                vocab_size=vocab_size,
            )
        self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
        self.vision_config.projection_dim = projection_dim

        # vision zoe config
        self.vision_zoe_config = vision_zoe_config
        if isinstance(self.vision_zoe_config, dict):
            vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth"
            self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config)
        else:
            print(f"🔥 init from default configurations ... {self.vision_zoe_config}")
            # BUG: initializing zoe with the default config causes a key error
            # self.vision_zoe_config = CONFIG_MAPPING["zoedepth"]()
            pass

        # NOTE: additional attributes
        self.action_token_begin_idx = action_token_begin_idx
        self.spatial_token_num = spatial_token_num
        self.use_spatial_token = use_spatial_token
        self.ego3d_patch_reso = ego3d_patch_reso
        self.n_freqs = n_freqs
        self.use_vision_zoe = use_vision_zoe
        # self.wrap_lora = wrap_lora

        super().__init__(**kwargs)

    @property
    def ignore_index(self):
        warnings.warn(
            "The `ignore_index` attribute is deprecated and will be removed in v4.47.",
            FutureWarning,
        )
        return self._ignore_index

    @ignore_index.setter
    def ignore_index(self, value):
        self._ignore_index = value

    def to_dict(self):
        output = super().to_dict()
        output.pop("_ignore_index", None)
        return output
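Because config.json declares an auto_map pointing to these remote-code classes, the configuration can be instantiated straight from the repository. A minimal sketch, assuming "." is a local checkout of this repo:

```python
# Hedged sketch: load SpatialVLAConfig via the auto_map declared in config.json.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(".", trust_remote_code=True)
print(config.model_type, config.use_spatial_token, config.spatial_token_num)
```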
dataset_statistics.json ADDED
@@ -0,0 +1,3502 @@
1
+ {
2
+ "fractal20220817_data/0.1.0": {
3
+ "action": {
4
+ "mean": [
5
+ 0.006987507455050945,
6
+ 0.006265853065997362,
7
+ -0.012625162489712238,
8
+ 0.04333285242319107,
9
+ -0.005756276659667492,
10
+ 0.0009130403632298112,
11
+ 0.5354204773902893
12
+ ],
13
+ "std": [
14
+ 0.06921109557151794,
15
+ 0.05970889702439308,
16
+ 0.0735311210155487,
17
+ 0.1561058759689331,
18
+ 0.1316441297531128,
19
+ 0.14593777060508728,
20
+ 0.49711623787879944
21
+ ],
22
+ "max": [
23
+ 2.9984593391418457,
24
+ 22.09052848815918,
25
+ 2.7507524490356445,
26
+ 1.570636510848999,
27
+ 1.5321086645126343,
28
+ 1.5691522359848022,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -2.0204520225524902,
33
+ -5.497899532318115,
34
+ -2.031663417816162,
35
+ -1.569917917251587,
36
+ -1.569892168045044,
37
+ -1.570419430732727,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.22453527510166169,
42
+ -0.14820013284683228,
43
+ -0.231589707583189,
44
+ -0.3517994859814644,
45
+ -0.4193011274933815,
46
+ -0.43643461108207704,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.17824687153100965,
51
+ 0.14938379630446405,
52
+ 0.21842354819178575,
53
+ 0.5892666035890578,
54
+ 0.35272657424211445,
55
+ 0.44796681255102094,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "proprio": {
69
+ "mean": [
70
+ 0.0,
71
+ 0.0,
72
+ 0.0,
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0
77
+ ],
78
+ "std": [
79
+ 0.0,
80
+ 0.0,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0
86
+ ],
87
+ "max": [
88
+ 0.0,
89
+ 0.0,
90
+ 0.0,
91
+ 0.0,
92
+ 0.0,
93
+ 0.0,
94
+ 0.0
95
+ ],
96
+ "min": [
97
+ 0.0,
98
+ 0.0,
99
+ 0.0,
100
+ 0.0,
101
+ 0.0,
102
+ 0.0,
103
+ 0.0
104
+ ],
105
+ "q01": [
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.0,
111
+ 0.0,
112
+ 0.0
113
+ ],
114
+ "q99": [
115
+ 0.0,
116
+ 0.0,
117
+ 0.0,
118
+ 0.0,
119
+ 0.0,
120
+ 0.0,
121
+ 0.0
122
+ ]
123
+ },
124
+ "num_transitions": 3786400,
125
+ "num_trajectories": 87212
126
+ },
127
+ "kuka/0.1.0": {
128
+ "action": {
129
+ "mean": [
130
+ -0.00046687963185831904,
131
+ 0.00040137648466043174,
132
+ -0.0012807906605303288,
133
+ 0.0,
134
+ 0.0,
135
+ -0.037225183099508286,
136
+ 0.4131543040275574
137
+ ],
138
+ "std": [
139
+ 0.020832739770412445,
140
+ 0.029158642515540123,
141
+ 0.0642285868525505,
142
+ 0.0,
143
+ 0.0,
144
+ 0.14224639534950256,
145
+ 0.4908643662929535
146
+ ],
147
+ "max": [
148
+ 0.1697135865688324,
149
+ 0.2777623236179352,
150
+ 0.43710532784461975,
151
+ 0.0,
152
+ 0.0,
153
+ 1.9684287309646606,
154
+ 1.0
155
+ ],
156
+ "min": [
157
+ -0.159867063164711,
158
+ -0.2892282009124756,
159
+ -0.2795473635196686,
160
+ 0.0,
161
+ 0.0,
162
+ -1.9875637292861938,
163
+ 0.0
164
+ ],
165
+ "q01": [
166
+ -0.06619441494345665,
167
+ -0.08713878810405731,
168
+ -0.15083016991615295,
169
+ 0.0,
170
+ 0.0,
171
+ -0.5415697038173676,
172
+ 0.0
173
+ ],
174
+ "q99": [
175
+ 0.06601839080452929,
176
+ 0.08732476785779003,
177
+ 0.18168179214000715,
178
+ 0.0,
179
+ 0.0,
180
+ 0.2923380345106127,
181
+ 1.0
182
+ ],
183
+ "mask": [
184
+ true,
185
+ true,
186
+ true,
187
+ true,
188
+ true,
189
+ true,
190
+ false
191
+ ]
192
+ },
193
+ "proprio": {
194
+ "mean": [
195
+ 0.0,
196
+ 0.0,
197
+ 0.0,
198
+ 0.0,
199
+ 0.0,
200
+ 0.0,
201
+ 0.0
202
+ ],
203
+ "std": [
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0,
208
+ 0.0,
209
+ 0.0,
210
+ 0.0
211
+ ],
212
+ "max": [
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0
220
+ ],
221
+ "min": [
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0
229
+ ],
230
+ "q01": [
231
+ 0.0,
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0
238
+ ],
239
+ "q99": [
240
+ 0.0,
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 0.0,
245
+ 0.0,
246
+ 0.0
247
+ ]
248
+ },
249
+ "num_transitions": 2455879,
250
+ "num_trajectories": 209880
251
+ },
252
+ "bridge_orig/1.0.0": {
253
+ "action": {
254
+ "mean": [
255
+ 0.00023341714404523373,
256
+ 0.00013004327774979174,
257
+ -0.00012762591359205544,
258
+ -0.0001556579809403047,
259
+ -0.00040393328526988626,
260
+ 0.00023558337124995887,
261
+ 0.5764582753181458
262
+ ],
263
+ "std": [
264
+ 0.009765734896063805,
265
+ 0.013689505867660046,
266
+ 0.012667152099311352,
267
+ 0.028534479439258575,
268
+ 0.03063790127635002,
269
+ 0.07691770792007446,
270
+ 0.4973658621311188
271
+ ],
272
+ "max": [
273
+ 0.41691166162490845,
274
+ 0.25864794850349426,
275
+ 0.21218234300613403,
276
+ 3.122201919555664,
277
+ 1.8618112802505493,
278
+ 6.280478477478027,
279
+ 1.0
280
+ ],
281
+ "min": [
282
+ -0.4007510244846344,
283
+ -0.13874775171279907,
284
+ -0.22553899884223938,
285
+ -3.2010786533355713,
286
+ -1.8618112802505493,
287
+ -6.279075622558594,
288
+ 0.0
289
+ ],
290
+ "q01": [
291
+ -0.02872725307941437,
292
+ -0.04170349963009357,
293
+ -0.026093858778476715,
294
+ -0.08092105075716972,
295
+ -0.09288699507713317,
296
+ -0.20718276381492615,
297
+ 0.0
298
+ ],
299
+ "q99": [
300
+ 0.028309678435325586,
301
+ 0.040855254605412394,
302
+ 0.040161586627364146,
303
+ 0.08192047759890528,
304
+ 0.07792850524187081,
305
+ 0.20382574498653397,
306
+ 1.0
307
+ ],
308
+ "mask": [
309
+ true,
310
+ true,
311
+ true,
312
+ true,
313
+ true,
314
+ true,
315
+ false
316
+ ]
317
+ },
318
+ "proprio": {
319
+ "mean": [
320
+ 0.0,
321
+ 0.0,
322
+ 0.0,
323
+ 0.0,
324
+ 0.0,
325
+ 0.0,
326
+ 0.0
327
+ ],
328
+ "std": [
329
+ 0.0,
330
+ 0.0,
331
+ 0.0,
332
+ 0.0,
333
+ 0.0,
334
+ 0.0,
335
+ 0.0
336
+ ],
337
+ "max": [
338
+ 0.0,
339
+ 0.0,
340
+ 0.0,
341
+ 0.0,
342
+ 0.0,
343
+ 0.0,
344
+ 0.0
345
+ ],
346
+ "min": [
347
+ 0.0,
348
+ 0.0,
349
+ 0.0,
350
+ 0.0,
351
+ 0.0,
352
+ 0.0,
353
+ 0.0
354
+ ],
355
+ "q01": [
356
+ 0.0,
357
+ 0.0,
358
+ 0.0,
359
+ 0.0,
360
+ 0.0,
361
+ 0.0,
362
+ 0.0
363
+ ],
364
+ "q99": [
365
+ 0.0,
366
+ 0.0,
367
+ 0.0,
368
+ 0.0,
369
+ 0.0,
370
+ 0.0,
371
+ 0.0
372
+ ]
373
+ },
374
+ "num_transitions": 2135463,
375
+ "num_trajectories": 60064
376
+ },
377
+ "taco_play/0.1.0": {
378
+ "action": {
379
+ "mean": [
380
+ -0.0038459226489067078,
381
+ 0.009671436622738838,
382
+ 0.01278059184551239,
383
+ -0.0054037850350141525,
384
+ -0.009606562554836273,
385
+ -0.0024807206355035305,
386
+ 0.4263913035392761
387
+ ],
388
+ "std": [
389
+ 0.23254045844078064,
390
+ 0.3629826307296753,
391
+ 0.2869291603565216,
392
+ 0.261770635843277,
393
+ 0.24388927221298218,
394
+ 0.5216501355171204,
395
+ 0.49469029903411865
396
+ ],
397
+ "max": [
398
+ 1.4915844202041626,
399
+ 2.1842432022094727,
400
+ 2.6836395263671875,
401
+ 5.035226821899414,
402
+ 2.665864944458008,
403
+ 4.250768661499023,
404
+ 1.0
405
+ ],
406
+ "min": [
407
+ -4.242457866668701,
408
+ -3.192805051803589,
409
+ -1.3371467590332031,
410
+ -4.202683448791504,
411
+ -2.6722638607025146,
412
+ -3.3467135429382324,
413
+ 0.0
414
+ ],
415
+ "q01": [
416
+ -0.7106140398979186,
417
+ -1.056944659948349,
418
+ -0.5878450274467468,
419
+ -0.7682853937149048,
420
+ -0.7180147767066956,
421
+ -1.5527938604354858,
422
+ 0.0
423
+ ],
424
+ "q99": [
425
+ 0.6482916426658629,
426
+ 1.0051310062408447,
427
+ 0.9480248689651489,
428
+ 0.6926478147506714,
429
+ 0.6351067513227462,
430
+ 1.628010264635086,
431
+ 1.0
432
+ ],
433
+ "mask": [
434
+ true,
435
+ true,
436
+ true,
437
+ true,
438
+ true,
439
+ true,
440
+ false
441
+ ]
442
+ },
443
+ "proprio": {
444
+ "mean": [
445
+ 0.0,
446
+ 0.0,
447
+ 0.0,
448
+ 0.0,
449
+ 0.0,
450
+ 0.0,
451
+ 0.0
452
+ ],
453
+ "std": [
454
+ 0.0,
455
+ 0.0,
456
+ 0.0,
457
+ 0.0,
458
+ 0.0,
459
+ 0.0,
460
+ 0.0
461
+ ],
462
+ "max": [
463
+ 0.0,
464
+ 0.0,
465
+ 0.0,
466
+ 0.0,
467
+ 0.0,
468
+ 0.0,
469
+ 0.0
470
+ ],
471
+ "min": [
472
+ 0.0,
473
+ 0.0,
474
+ 0.0,
475
+ 0.0,
476
+ 0.0,
477
+ 0.0,
478
+ 0.0
479
+ ],
480
+ "q01": [
481
+ 0.0,
482
+ 0.0,
483
+ 0.0,
484
+ 0.0,
485
+ 0.0,
486
+ 0.0,
487
+ 0.0
488
+ ],
489
+ "q99": [
490
+ 0.0,
491
+ 0.0,
492
+ 0.0,
493
+ 0.0,
494
+ 0.0,
495
+ 0.0,
496
+ 0.0
497
+ ]
498
+ },
499
+ "num_transitions": 237798,
500
+ "num_trajectories": 3603
501
+ },
502
+ "jaco_play/0.1.0": {
503
+ "action": {
504
+ "mean": [
505
+ 0.0009658387862145901,
506
+ -0.005800850689411163,
507
+ -0.003950685728341341,
508
+ 0.0,
509
+ 0.0,
510
+ 0.0,
511
+ 0.34934908151626587
512
+ ],
513
+ "std": [
514
+ 0.12234985828399658,
515
+ 0.09678783267736435,
516
+ 0.1115543395280838,
517
+ 0.0,
518
+ 0.0,
519
+ 0.0,
520
+ 0.47682321071624756
521
+ ],
522
+ "max": [
523
+ 0.20000000298023224,
524
+ 0.20000000298023224,
525
+ 0.20000000298023224,
526
+ 0.0,
527
+ 0.0,
528
+ 0.0,
529
+ 1.0
530
+ ],
531
+ "min": [
532
+ -0.20000000298023224,
533
+ -0.20000000298023224,
534
+ -0.20000000298023224,
535
+ 0.0,
536
+ 0.0,
537
+ 0.0,
538
+ 0.0
539
+ ],
540
+ "q01": [
541
+ -0.20000000298023224,
542
+ -0.20000000298023224,
543
+ -0.20000000298023224,
544
+ 0.0,
545
+ 0.0,
546
+ 0.0,
547
+ 0.0
548
+ ],
549
+ "q99": [
550
+ 0.20000000298023224,
551
+ 0.20000000298023224,
552
+ 0.20000000298023224,
553
+ 0.0,
554
+ 0.0,
555
+ 0.0,
556
+ 1.0
557
+ ],
558
+ "mask": [
559
+ true,
560
+ true,
561
+ true,
562
+ true,
563
+ true,
564
+ true,
565
+ false
566
+ ]
567
+ },
568
+ "proprio": {
569
+ "mean": [
570
+ 0.0,
571
+ 0.0,
572
+ 0.0,
573
+ 0.0,
574
+ 0.0,
575
+ 0.0,
576
+ 0.0
577
+ ],
578
+ "std": [
579
+ 0.0,
580
+ 0.0,
581
+ 0.0,
582
+ 0.0,
583
+ 0.0,
584
+ 0.0,
585
+ 0.0
586
+ ],
587
+ "max": [
588
+ 0.0,
589
+ 0.0,
590
+ 0.0,
591
+ 0.0,
592
+ 0.0,
593
+ 0.0,
594
+ 0.0
595
+ ],
596
+ "min": [
597
+ 0.0,
598
+ 0.0,
599
+ 0.0,
600
+ 0.0,
601
+ 0.0,
602
+ 0.0,
603
+ 0.0
604
+ ],
605
+ "q01": [
606
+ 0.0,
607
+ 0.0,
608
+ 0.0,
609
+ 0.0,
610
+ 0.0,
611
+ 0.0,
612
+ 0.0
613
+ ],
614
+ "q99": [
615
+ 0.0,
616
+ 0.0,
617
+ 0.0,
618
+ 0.0,
619
+ 0.0,
620
+ 0.0,
621
+ 0.0
622
+ ]
623
+ },
624
+ "num_transitions": 77965,
625
+ "num_trajectories": 1085
626
+ },
627
+ "berkeley_cable_routing/0.1.0": {
628
+ "action": {
629
+ "mean": [
630
+ -0.07139858603477478,
631
+ 0.023608991876244545,
632
+ 0.10241956263780594,
633
+ 0.0,
634
+ 0.0,
635
+ 0.04967105761170387,
636
+ 0.0
637
+ ],
638
+ "std": [
639
+ 0.18155010044574738,
640
+ 0.18109896779060364,
641
+ 0.21220752596855164,
642
+ 0.0,
643
+ 0.0,
644
+ 0.3475516438484192,
645
+ 0.0
646
+ ],
647
+ "max": [
648
+ 0.9633283019065857,
649
+ 1.0,
650
+ 1.0,
651
+ 0.0,
652
+ 0.0,
653
+ 1.0,
654
+ 0.0
655
+ ],
656
+ "min": [
657
+ -0.9809081554412842,
658
+ -0.9554349184036255,
659
+ -0.9994775056838989,
660
+ 0.0,
661
+ 0.0,
662
+ -1.0,
663
+ 0.0
664
+ ],
665
+ "q01": [
666
+ -0.5534318816661835,
667
+ -0.4797285574674606,
668
+ -0.5314934802055359,
669
+ 0.0,
670
+ 0.0,
671
+ -0.8855219376087189,
672
+ 0.0
673
+ ],
674
+ "q99": [
675
+ 0.42652835428714786,
676
+ 0.5000944086909298,
677
+ 0.639823433756829,
678
+ 0.0,
679
+ 0.0,
680
+ 0.984243879914284,
681
+ 0.0
682
+ ],
683
+ "mask": [
684
+ true,
685
+ true,
686
+ true,
687
+ true,
688
+ true,
689
+ true,
690
+ false
691
+ ]
692
+ },
693
+ "proprio": {
694
+ "mean": [
695
+ 0.0,
696
+ 0.0,
697
+ 0.0,
698
+ 0.0,
699
+ 0.0,
700
+ 0.0,
701
+ 0.0
702
+ ],
703
+ "std": [
704
+ 0.0,
705
+ 0.0,
706
+ 0.0,
707
+ 0.0,
708
+ 0.0,
709
+ 0.0,
710
+ 0.0
711
+ ],
712
+ "max": [
713
+ 0.0,
714
+ 0.0,
715
+ 0.0,
716
+ 0.0,
717
+ 0.0,
718
+ 0.0,
719
+ 0.0
720
+ ],
721
+ "min": [
722
+ 0.0,
723
+ 0.0,
724
+ 0.0,
725
+ 0.0,
726
+ 0.0,
727
+ 0.0,
728
+ 0.0
729
+ ],
730
+ "q01": [
731
+ 0.0,
732
+ 0.0,
733
+ 0.0,
734
+ 0.0,
735
+ 0.0,
736
+ 0.0,
737
+ 0.0
738
+ ],
739
+ "q99": [
740
+ 0.0,
741
+ 0.0,
742
+ 0.0,
743
+ 0.0,
744
+ 0.0,
745
+ 0.0,
746
+ 0.0
747
+ ]
748
+ },
749
+ "num_transitions": 42328,
750
+ "num_trajectories": 1647
751
+ },
752
+ "roboturk/0.1.0": {
753
+ "action": {
754
+ "mean": [
755
+ 0.001444889116100967,
756
+ -0.0015945355407893658,
757
+ -0.0011753803119063377,
758
+ 0.002301239175722003,
759
+ -0.0009382442804053426,
760
+ -0.00011485860886750743,
761
+ 0.5746025443077087
762
+ ],
763
+ "std": [
764
+ 0.0493537075817585,
765
+ 0.06354564428329468,
766
+ 0.06116492301225662,
767
+ 0.0955340564250946,
768
+ 0.08420011401176453,
769
+ 0.06517910957336426,
770
+ 0.4945177137851715
771
+ ],
772
+ "max": [
773
+ 0.39124172925949097,
774
+ 0.4601028263568878,
775
+ 0.4870833456516266,
776
+ 1.816888689994812,
777
+ 1.8240282535552979,
778
+ 1.4824820756912231,
779
+ 1.0
780
+ ],
781
+ "min": [
782
+ -0.6546999216079712,
783
+ -0.6365841031074524,
784
+ -0.4217723608016968,
785
+ -1.6695482730865479,
786
+ -1.8023357391357422,
787
+ -1.4630827903747559,
788
+ 0.0
789
+ ],
790
+ "q01": [
791
+ -0.1342635464668274,
792
+ -0.19996687173843383,
793
+ -0.1482972100377083,
794
+ -0.20720748245716095,
795
+ -0.09676413893699647,
796
+ -0.18075634717941286,
797
+ 0.0
798
+ ],
799
+ "q99": [
800
+ 0.14956976801157001,
801
+ 0.1805950567126275,
802
+ 0.18841815620660796,
803
+ 0.21615413755178453,
804
+ 0.09457383215427405,
805
+ 0.18543301910162005,
806
+ 1.0
807
+ ],
808
+ "mask": [
809
+ true,
810
+ true,
811
+ true,
812
+ true,
813
+ true,
814
+ true,
815
+ false
816
+ ]
817
+ },
818
+ "proprio": {
819
+ "mean": [
820
+ 0.0,
821
+ 0.0,
822
+ 0.0,
823
+ 0.0,
824
+ 0.0,
825
+ 0.0,
826
+ 0.0
827
+ ],
828
+ "std": [
829
+ 0.0,
830
+ 0.0,
831
+ 0.0,
832
+ 0.0,
833
+ 0.0,
834
+ 0.0,
835
+ 0.0
836
+ ],
837
+ "max": [
838
+ 0.0,
839
+ 0.0,
840
+ 0.0,
841
+ 0.0,
842
+ 0.0,
843
+ 0.0,
844
+ 0.0
845
+ ],
846
+ "min": [
847
+ 0.0,
848
+ 0.0,
849
+ 0.0,
850
+ 0.0,
851
+ 0.0,
852
+ 0.0,
853
+ 0.0
854
+ ],
855
+ "q01": [
856
+ 0.0,
857
+ 0.0,
858
+ 0.0,
859
+ 0.0,
860
+ 0.0,
861
+ 0.0,
862
+ 0.0
863
+ ],
864
+ "q99": [
865
+ 0.0,
866
+ 0.0,
867
+ 0.0,
868
+ 0.0,
869
+ 0.0,
870
+ 0.0,
871
+ 0.0
872
+ ]
873
+ },
874
+ "num_transitions": 187507,
875
+ "num_trajectories": 1995
876
+ },
877
+ "viola/0.1.0": {
878
+ "action": {
879
+ "mean": [
880
+ 0.04761853069067001,
881
+ -0.029204534366726875,
882
+ 0.055867329239845276,
883
+ -0.0026185200549662113,
884
+ 0.006867341697216034,
885
+ -0.016821356490254402,
886
+ 0.7323777675628662
887
+ ],
888
+ "std": [
889
+ 0.39157867431640625,
890
+ 0.40765219926834106,
891
+ 0.40077903866767883,
892
+ 0.10023998469114304,
893
+ 0.08443189412355423,
894
+ 0.10375089943408966,
895
+ 0.442600816488266
896
+ ],
897
+ "max": [
898
+ 1.0,
899
+ 1.0,
900
+ 1.0,
901
+ 0.375,
902
+ 0.36321428418159485,
903
+ 0.375,
904
+ 1.0
905
+ ],
906
+ "min": [
907
+ -1.0,
908
+ -1.0,
909
+ -1.0,
910
+ -0.375,
911
+ -0.375,
912
+ -0.375,
913
+ 0.0
914
+ ],
915
+ "q01": [
916
+ -0.9628571271896362,
917
+ -1.0,
918
+ -1.0,
919
+ -0.26249998807907104,
920
+ -0.21321429312229156,
921
+ -0.3385714292526245,
922
+ 0.0
923
+ ],
924
+ "q99": [
925
+ 0.9114285707473755,
926
+ 0.868571400642395,
927
+ 1.0,
928
+ 0.2817857265472412,
929
+ 0.2239285707473755,
930
+ 0.3557142913341522,
931
+ 1.0
932
+ ],
933
+ "mask": [
934
+ true,
935
+ true,
936
+ true,
937
+ true,
938
+ true,
939
+ true,
940
+ false
941
+ ]
942
+ },
943
+ "proprio": {
944
+ "mean": [
945
+ 0.0,
946
+ 0.0,
947
+ 0.0,
948
+ 0.0,
949
+ 0.0,
950
+ 0.0,
951
+ 0.0
952
+ ],
953
+ "std": [
954
+ 0.0,
955
+ 0.0,
956
+ 0.0,
957
+ 0.0,
958
+ 0.0,
959
+ 0.0,
960
+ 0.0
961
+ ],
962
+ "max": [
963
+ 0.0,
964
+ 0.0,
965
+ 0.0,
966
+ 0.0,
967
+ 0.0,
968
+ 0.0,
969
+ 0.0
970
+ ],
971
+ "min": [
972
+ 0.0,
973
+ 0.0,
974
+ 0.0,
975
+ 0.0,
976
+ 0.0,
977
+ 0.0,
978
+ 0.0
979
+ ],
980
+ "q01": [
981
+ 0.0,
982
+ 0.0,
983
+ 0.0,
984
+ 0.0,
985
+ 0.0,
986
+ 0.0,
987
+ 0.0
988
+ ],
989
+ "q99": [
990
+ 0.0,
991
+ 0.0,
992
+ 0.0,
993
+ 0.0,
994
+ 0.0,
995
+ 0.0,
996
+ 0.0
997
+ ]
998
+ },
999
+ "num_transitions": 76324,
1000
+ "num_trajectories": 150
1001
+ },
1002
+ "berkeley_autolab_ur5/0.1.0": {
1003
+ "action": {
1004
+ "mean": [
1005
+ 0.0005683613708242774,
1006
+ 0.0012176961172372103,
1007
+ -0.0005296385497786105,
1008
+ 0.00021029777417425066,
1009
+ 6.069485243642703e-05,
1010
+ 0.0012049867073073983,
1011
+ 0.6298308372497559
1012
+ ],
1013
+ "std": [
1014
+ 0.011533073149621487,
1015
+ 0.007990497164428234,
1016
+ 0.009577799588441849,
1017
+ 0.009432999417185783,
1018
+ 0.016427574679255486,
1019
+ 0.011054049246013165,
1020
+ 0.482679545879364
1021
+ ],
1022
+ "max": [
1023
+ 0.019999999552965164,
1024
+ 0.019999999552965164,
1025
+ 0.019999999552965164,
1026
+ 0.06666667014360428,
1027
+ 0.06666667014360428,
1028
+ 0.06666667014360428,
1029
+ 1.0
1030
+ ],
1031
+ "min": [
1032
+ -0.019999999552965164,
1033
+ -0.019999999552965164,
1034
+ -0.019999999552965164,
1035
+ -0.06666667014360428,
1036
+ -0.06666667014360428,
1037
+ -0.06666667014360428,
1038
+ 0.0
1039
+ ],
1040
+ "q01": [
1041
+ -0.019999999552965164,
1042
+ -0.019999999552965164,
1043
+ -0.019999999552965164,
1044
+ -0.02628571353852749,
1045
+ -0.06666667014360428,
1046
+ -0.03847619146108627,
1047
+ 0.0
1048
+ ],
1049
+ "q99": [
1050
+ 0.019999999552965164,
1051
+ 0.019999999552965164,
1052
+ 0.019999999552965164,
1053
+ 0.031809523701667786,
1054
+ 0.06666667014360428,
1055
+ 0.036571428179740906,
1056
+ 1.0
1057
+ ],
1058
+ "mask": [
1059
+ true,
1060
+ true,
1061
+ true,
1062
+ true,
1063
+ true,
1064
+ true,
1065
+ false
1066
+ ]
1067
+ },
1068
+ "proprio": {
1069
+ "mean": [
1070
+ 0.0,
1071
+ 0.0,
1072
+ 0.0,
1073
+ 0.0,
1074
+ 0.0,
1075
+ 0.0,
1076
+ 0.0
1077
+ ],
1078
+ "std": [
1079
+ 0.0,
1080
+ 0.0,
1081
+ 0.0,
1082
+ 0.0,
1083
+ 0.0,
1084
+ 0.0,
1085
+ 0.0
1086
+ ],
1087
+ "max": [
1088
+ 0.0,
1089
+ 0.0,
1090
+ 0.0,
1091
+ 0.0,
1092
+ 0.0,
1093
+ 0.0,
1094
+ 0.0
1095
+ ],
1096
+ "min": [
1097
+ 0.0,
1098
+ 0.0,
1099
+ 0.0,
1100
+ 0.0,
1101
+ 0.0,
1102
+ 0.0,
1103
+ 0.0
1104
+ ],
1105
+ "q01": [
1106
+ 0.0,
1107
+ 0.0,
1108
+ 0.0,
1109
+ 0.0,
1110
+ 0.0,
1111
+ 0.0,
1112
+ 0.0
1113
+ ],
1114
+ "q99": [
1115
+ 0.0,
1116
+ 0.0,
1117
+ 0.0,
1118
+ 0.0,
1119
+ 0.0,
1120
+ 0.0,
1121
+ 0.0
1122
+ ]
1123
+ },
1124
+ "num_transitions": 97939,
1125
+ "num_trajectories": 1000
1126
+ },
1127
+ "toto/0.1.0": {
1128
+ "action": {
1129
+ "mean": [
1130
+ 0.3854214549064636,
1131
+ 0.007769507821649313,
1132
+ 0.3632742166519165,
1133
+ -0.665202796459198,
1134
+ 0.1890396624803543,
1135
+ 0.0329875648021698,
1136
+ 0.0
1137
+ ],
1138
+ "std": [
1139
+ 0.12211630493402481,
1140
+ 0.19378569722175598,
1141
+ 0.10178232192993164,
1142
+ 0.5725256204605103,
1143
+ 0.298846036195755,
1144
+ 0.32599160075187683,
1145
+ 0.0
1146
+ ],
1147
+ "max": [
1148
+ 0.6839867234230042,
1149
+ 0.4454185664653778,
1150
+ 0.7984078526496887,
1151
+ 2.120781660079956,
1152
+ 1.371164321899414,
1153
+ 1.4118704795837402,
1154
+ 0.0
1155
+ ],
1156
+ "min": [
1157
+ 0.09922284632921219,
1158
+ -0.5180193781852722,
1159
+ 0.13791072368621826,
1160
+ -2.635117530822754,
1161
+ -1.0734480619430542,
1162
+ -1.9282547235488892,
1163
+ 0.0
1164
+ ],
1165
+ "q01": [
1166
+ 0.1756722891330719,
1167
+ -0.3077590811252594,
1168
+ 0.235383919775486,
1169
+ -2.0908505964279174,
1170
+ -0.6191593289375306,
1171
+ -0.7488683319091797,
1172
+ 0.0
1173
+ ],
1174
+ "q99": [
1175
+ 0.6136963081359863,
1176
+ 0.33704194784164443,
1177
+ 0.6681221985816956,
1178
+ 0.7422861719131538,
1179
+ 0.7955395007133507,
1180
+ 0.740464625358582,
1181
+ 0.0
1182
+ ],
1183
+ "mask": [
1184
+ true,
1185
+ true,
1186
+ true,
1187
+ true,
1188
+ true,
1189
+ true,
1190
+ false
1191
+ ]
1192
+ },
1193
+ "proprio": {
1194
+ "mean": [
1195
+ 0.0,
1196
+ 0.0,
1197
+ 0.0,
1198
+ 0.0,
1199
+ 0.0,
1200
+ 0.0,
1201
+ 0.0
1202
+ ],
1203
+ "std": [
1204
+ 0.0,
1205
+ 0.0,
1206
+ 0.0,
1207
+ 0.0,
1208
+ 0.0,
1209
+ 0.0,
1210
+ 0.0
1211
+ ],
1212
+ "max": [
1213
+ 0.0,
1214
+ 0.0,
1215
+ 0.0,
1216
+ 0.0,
1217
+ 0.0,
1218
+ 0.0,
1219
+ 0.0
1220
+ ],
1221
+ "min": [
1222
+ 0.0,
1223
+ 0.0,
1224
+ 0.0,
1225
+ 0.0,
1226
+ 0.0,
1227
+ 0.0,
1228
+ 0.0
1229
+ ],
1230
+ "q01": [
1231
+ 0.0,
1232
+ 0.0,
1233
+ 0.0,
1234
+ 0.0,
1235
+ 0.0,
1236
+ 0.0,
1237
+ 0.0
1238
+ ],
1239
+ "q99": [
1240
+ 0.0,
1241
+ 0.0,
1242
+ 0.0,
1243
+ 0.0,
1244
+ 0.0,
1245
+ 0.0,
1246
+ 0.0
1247
+ ]
1248
+ },
1249
+ "num_transitions": 325699,
1250
+ "num_trajectories": 1003
1251
+ },
1252
+ "language_table/0.1.0": {
1253
+ "action": {
1254
+ "mean": [
1255
+ 0.00014891766477376223,
1256
+ -0.0005636657006107271,
1257
+ 0.0,
1258
+ 0.0,
1259
+ 0.0,
1260
+ 0.0,
1261
+ 1.0
1262
+ ],
1263
+ "std": [
1264
+ 0.030162859708070755,
1265
+ 0.04230763390660286,
1266
+ 0.0,
1267
+ 0.0,
1268
+ 0.0,
1269
+ 0.0,
1270
+ 0.0
1271
+ ],
1272
+ "max": [
1273
+ 0.23357294499874115,
1274
+ 0.24496802687644958,
1275
+ 0.0,
1276
+ 0.0,
1277
+ 0.0,
1278
+ 0.0,
1279
+ 1.0
1280
+ ],
1281
+ "min": [
1282
+ -0.21989956498146057,
1283
+ -0.23736150562763214,
1284
+ 0.0,
1285
+ 0.0,
1286
+ 0.0,
1287
+ 0.0,
1288
+ 1.0
1289
+ ],
1290
+ "q01": [
1291
+ -0.08179590478539467,
1292
+ -0.11795833334326744,
1293
+ 0.0,
1294
+ 0.0,
1295
+ 0.0,
1296
+ 0.0,
1297
+ 1.0
1298
+ ],
1299
+ "q99": [
1300
+ 0.08822273463010788,
1301
+ 0.1191693339496851,
1302
+ 0.0,
1303
+ 0.0,
1304
+ 0.0,
1305
+ 0.0,
1306
+ 1.0
1307
+ ],
1308
+ "mask": [
1309
+ true,
1310
+ true,
1311
+ true,
1312
+ true,
1313
+ true,
1314
+ true,
1315
+ false
1316
+ ]
1317
+ },
1318
+ "proprio": {
1319
+ "mean": [
1320
+ 0.0,
1321
+ 0.0,
1322
+ 0.0,
1323
+ 0.0,
1324
+ 0.0,
1325
+ 0.0,
1326
+ 0.0
1327
+ ],
1328
+ "std": [
1329
+ 0.0,
1330
+ 0.0,
1331
+ 0.0,
1332
+ 0.0,
1333
+ 0.0,
1334
+ 0.0,
1335
+ 0.0
1336
+ ],
1337
+ "max": [
1338
+ 0.0,
1339
+ 0.0,
1340
+ 0.0,
1341
+ 0.0,
1342
+ 0.0,
1343
+ 0.0,
1344
+ 0.0
1345
+ ],
1346
+ "min": [
1347
+ 0.0,
1348
+ 0.0,
1349
+ 0.0,
1350
+ 0.0,
1351
+ 0.0,
1352
+ 0.0,
1353
+ 0.0
1354
+ ],
1355
+ "q01": [
1356
+ 0.0,
1357
+ 0.0,
1358
+ 0.0,
1359
+ 0.0,
1360
+ 0.0,
1361
+ 0.0,
1362
+ 0.0
1363
+ ],
1364
+ "q99": [
1365
+ 0.0,
1366
+ 0.0,
1367
+ 0.0,
1368
+ 0.0,
1369
+ 0.0,
1370
+ 0.0,
1371
+ 0.0
1372
+ ]
1373
+ },
1374
+ "num_transitions": 7045476,
1375
+ "num_trajectories": 442226
1376
+ },
1377
+ "stanford_hydra_dataset_converted_externally_to_rlds/0.1.0": {
1378
+ "action": {
1379
+ "mean": [
1380
+ 0.0007790043600834906,
1381
+ 0.00013707877951674163,
1382
+ -0.000254859565757215,
1383
+ 0.0012903243768960238,
1384
+ -0.004751724191009998,
1385
+ 0.002692892448976636,
1386
+ 0.48855218291282654
1387
+ ],
1388
+ "std": [
1389
+ 0.008022183552384377,
1390
+ 0.009131456725299358,
1391
+ 0.00957438349723816,
1392
+ 0.04122224077582359,
1393
+ 0.03843001648783684,
1394
+ 0.046067025512456894,
1395
+ 0.49978113174438477
1396
+ ],
1397
+ "max": [
1398
+ 0.02499854564666748,
1399
+ 0.02499903365969658,
1400
+ 0.024999922141432762,
1401
+ 0.24974457919597626,
1402
+ 0.24997030198574066,
1403
+ 0.24999946355819702,
1404
+ 1.0
1405
+ ],
1406
+ "min": [
1407
+ -0.024999044835567474,
1408
+ -0.024999700486660004,
1409
+ -0.02499929815530777,
1410
+ -0.24993225932121277,
1411
+ -0.2499666064977646,
1412
+ -0.2499932497739792,
1413
+ 0.0
1414
+ ],
1415
+ "q01": [
1416
+ -0.019992006458342076,
1417
+ -0.02415412735193968,
1418
+ -0.022941758055239916,
1419
+ -0.11085530579090118,
1420
+ -0.12024572037160397,
1421
+ -0.13314770206809043,
1422
+ 0.0
1423
+ ],
1424
+ "q99": [
1425
+ 0.022886231057345868,
1426
+ 0.022358838934451335,
1427
+ 0.02410089675337076,
1428
+ 0.12370114490389822,
1429
+ 0.11323311634361738,
1430
+ 0.18474749639630164,
1431
+ 1.0
1432
+ ],
1433
+ "mask": [
1434
+ true,
1435
+ true,
1436
+ true,
1437
+ true,
1438
+ true,
1439
+ true,
1440
+ false
1441
+ ]
1442
+ },
1443
+ "proprio": {
1444
+ "mean": [
1445
+ 0.0,
1446
+ 0.0,
1447
+ 0.0,
1448
+ 0.0,
1449
+ 0.0,
1450
+ 0.0,
1451
+ 0.0
1452
+ ],
1453
+ "std": [
1454
+ 0.0,
1455
+ 0.0,
1456
+ 0.0,
1457
+ 0.0,
1458
+ 0.0,
1459
+ 0.0,
1460
+ 0.0
1461
+ ],
1462
+ "max": [
1463
+ 0.0,
1464
+ 0.0,
1465
+ 0.0,
1466
+ 0.0,
1467
+ 0.0,
1468
+ 0.0,
1469
+ 0.0
1470
+ ],
1471
+ "min": [
1472
+ 0.0,
1473
+ 0.0,
1474
+ 0.0,
1475
+ 0.0,
1476
+ 0.0,
1477
+ 0.0,
1478
+ 0.0
1479
+ ],
1480
+ "q01": [
1481
+ 0.0,
1482
+ 0.0,
1483
+ 0.0,
1484
+ 0.0,
1485
+ 0.0,
1486
+ 0.0,
1487
+ 0.0
1488
+ ],
1489
+ "q99": [
1490
+ 0.0,
1491
+ 0.0,
1492
+ 0.0,
1493
+ 0.0,
1494
+ 0.0,
1495
+ 0.0,
1496
+ 0.0
1497
+ ]
1498
+ },
1499
+ "num_transitions": 358234,
1500
+ "num_trajectories": 570
1501
+ },
1502
+ "austin_buds_dataset_converted_externally_to_rlds/0.1.0": {
1503
+ "action": {
1504
+ "mean": [
1505
+ -0.07678329944610596,
1506
+ 0.0036849123425781727,
1507
+ 0.05644941329956055,
1508
+ 0.0,
1509
+ 0.0,
1510
+ 0.0,
1511
+ 0.3510494828224182
1512
+ ],
1513
+ "std": [
1514
+ 0.6367746591567993,
1515
+ 0.3788914680480957,
1516
+ 0.47796377539634705,
1517
+ 0.0,
1518
+ 0.0,
1519
+ 0.0,
1520
+ 0.4772108495235443
1521
+ ],
1522
+ "max": [
1523
+ 1.0,
1524
+ 1.0,
1525
+ 1.0,
1526
+ 0.0,
1527
+ 0.0,
1528
+ 0.0,
1529
+ 1.0
1530
+ ],
1531
+ "min": [
1532
+ -1.0,
1533
+ -1.0,
1534
+ -1.0,
1535
+ 0.0,
1536
+ 0.0,
1537
+ 0.0,
1538
+ 0.0
1539
+ ],
1540
+ "q01": [
1541
+ -1.0,
1542
+ -0.9599999785423279,
1543
+ -0.8714285492897034,
1544
+ 0.0,
1545
+ 0.0,
1546
+ 0.0,
1547
+ 0.0
1548
+ ],
1549
+ "q99": [
1550
+ 1.0,
1551
+ 0.8600000143051147,
1552
+ 1.0,
1553
+ 0.0,
1554
+ 0.0,
1555
+ 0.0,
1556
+ 1.0
1557
+ ],
1558
+ "mask": [
1559
+ true,
1560
+ true,
1561
+ true,
1562
+ true,
1563
+ true,
1564
+ true,
1565
+ false
1566
+ ]
1567
+ },
1568
+ "proprio": {
1569
+ "mean": [
1570
+ 0.0,
1571
+ 0.0,
1572
+ 0.0,
1573
+ 0.0,
1574
+ 0.0,
1575
+ 0.0,
1576
+ 0.0
1577
+ ],
1578
+ "std": [
1579
+ 0.0,
1580
+ 0.0,
1581
+ 0.0,
1582
+ 0.0,
1583
+ 0.0,
1584
+ 0.0,
1585
+ 0.0
1586
+ ],
1587
+ "max": [
1588
+ 0.0,
1589
+ 0.0,
1590
+ 0.0,
1591
+ 0.0,
1592
+ 0.0,
1593
+ 0.0,
1594
+ 0.0
1595
+ ],
1596
+ "min": [
1597
+ 0.0,
1598
+ 0.0,
1599
+ 0.0,
1600
+ 0.0,
1601
+ 0.0,
1602
+ 0.0,
1603
+ 0.0
1604
+ ],
1605
+ "q01": [
1606
+ 0.0,
1607
+ 0.0,
1608
+ 0.0,
1609
+ 0.0,
1610
+ 0.0,
1611
+ 0.0,
1612
+ 0.0
1613
+ ],
1614
+ "q99": [
1615
+ 0.0,
1616
+ 0.0,
1617
+ 0.0,
1618
+ 0.0,
1619
+ 0.0,
1620
+ 0.0,
1621
+ 0.0
1622
+ ]
1623
+ },
1624
+ "num_transitions": 34112,
1625
+ "num_trajectories": 50
1626
+ },
1627
+ "nyu_franka_play_dataset_converted_externally_to_rlds/0.1.0": {
1628
+ "action": {
1629
+ "mean": [
1630
+ 0.0010219910182058811,
1631
+ -0.00012002632865915075,
1632
+ 0.00032894135802052915,
1633
+ 0.0015034276293590665,
1634
+ -0.002198528265580535,
1635
+ -0.0016632305923849344,
1636
+ 0.7230083346366882
1637
+ ],
1638
+ "std": [
1639
+ 0.013274150900542736,
1640
+ 0.013215919025242329,
1641
+ 0.01282210648059845,
1642
+ 0.27324533462524414,
1643
+ 0.05702253058552742,
1644
+ 0.03917279839515686,
1645
+ 0.44753193855285645
1646
+ ],
1647
+ "max": [
1648
+ 0.06424188613891602,
1649
+ 0.07027634978294373,
1650
+ 0.06129661202430725,
1651
+ 6.281067848205566,
1652
+ 0.1967729926109314,
1653
+ 0.26377415657043457,
1654
+ 1.0
1655
+ ],
1656
+ "min": [
1657
+ -0.05952230095863342,
1658
+ -0.07232445478439331,
1659
+ -0.06730806827545166,
1660
+ -6.278434753417969,
1661
+ -0.21479034423828125,
1662
+ -0.3627619743347168,
1663
+ 0.0
1664
+ ],
1665
+ "q01": [
1666
+ -0.03199600875377655,
1667
+ -0.032861671447753905,
1668
+ -0.03368805110454559,
1669
+ -0.12080862045288086,
1670
+ -0.12175218224525451,
1671
+ -0.11370223641395569,
1672
+ 0.0
1673
+ ],
1674
+ "q99": [
1675
+ 0.03101520001888276,
1676
+ 0.0373908892273903,
1677
+ 0.03646374464035038,
1678
+ 0.11764093399047852,
1679
+ 0.1258920183777809,
1680
+ 0.09366151213645942,
1681
+ 1.0
1682
+ ],
1683
+ "mask": [
1684
+ true,
1685
+ true,
1686
+ true,
1687
+ true,
1688
+ true,
1689
+ true,
1690
+ false
1691
+ ]
1692
+ },
1693
+ "proprio": {
1694
+ "mean": [
1695
+ 0.0,
1696
+ 0.0,
1697
+ 0.0,
1698
+ 0.0,
1699
+ 0.0,
1700
+ 0.0,
1701
+ 0.0
1702
+ ],
1703
+ "std": [
1704
+ 0.0,
1705
+ 0.0,
1706
+ 0.0,
1707
+ 0.0,
1708
+ 0.0,
1709
+ 0.0,
1710
+ 0.0
1711
+ ],
1712
+ "max": [
1713
+ 0.0,
1714
+ 0.0,
1715
+ 0.0,
1716
+ 0.0,
1717
+ 0.0,
1718
+ 0.0,
1719
+ 0.0
1720
+ ],
1721
+ "min": [
1722
+ 0.0,
1723
+ 0.0,
1724
+ 0.0,
1725
+ 0.0,
1726
+ 0.0,
1727
+ 0.0,
1728
+ 0.0
1729
+ ],
1730
+ "q01": [
1731
+ 0.0,
1732
+ 0.0,
1733
+ 0.0,
1734
+ 0.0,
1735
+ 0.0,
1736
+ 0.0,
1737
+ 0.0
1738
+ ],
1739
+ "q99": [
1740
+ 0.0,
1741
+ 0.0,
1742
+ 0.0,
1743
+ 0.0,
1744
+ 0.0,
1745
+ 0.0,
1746
+ 0.0
1747
+ ]
1748
+ },
1749
+ "num_transitions": 44875,
1750
+ "num_trajectories": 456
1751
+ },
1752
+ "furniture_bench_dataset_converted_externally_to_rlds/0.1.0": {
1753
+ "action": {
1754
+ "mean": [
1755
+ 0.0001461071806261316,
1756
+ 0.0010830992832779884,
1757
+ 0.0006224963581189513,
1758
+ -0.0033032014034688473,
1759
+ -0.002688060747459531,
1760
+ 0.018242614343762398,
1761
+ 0.48854944109916687
1762
+ ],
1763
+ "std": [
1764
+ 0.016107233241200447,
1765
+ 0.014891570433974266,
1766
+ 0.014014236629009247,
1767
+ 0.05827433615922928,
1768
+ 0.11417083442211151,
1769
+ 0.33479660749435425,
1770
+ 0.4999157190322876
1771
+ ],
1772
+ "max": [
1773
+ 0.10000000149011612,
1774
+ 0.10000000149011612,
1775
+ 0.10000000149011612,
1776
+ 0.8651833534240723,
1777
+ 1.0909736156463623,
1778
+ 2.863185405731201,
1779
+ 1.0
1780
+ ],
1781
+ "min": [
1782
+ -0.10495579987764359,
1783
+ -0.10939455777406693,
1784
+ -0.10000000149011612,
1785
+ -0.971906840801239,
1786
+ -1.0475432872772217,
1787
+ -3.06000018119812,
1788
+ 0.0
1789
+ ],
1790
+ "q01": [
1791
+ -0.053988199681043625,
1792
+ -0.05049169331789017,
1793
+ -0.032499241530895236,
1794
+ -0.1953887003660202,
1795
+ -0.41674559473991396,
1796
+ -0.8886768388748169,
1797
+ 0.0
1798
+ ],
1799
+ "q99": [
1800
+ 0.05414841488003723,
1801
+ 0.04965164884924884,
1802
+ 0.060055799782276154,
1803
+ 0.18231668293476103,
1804
+ 0.39867786407470646,
1805
+ 0.8772023963928218,
1806
+ 1.0
1807
+ ],
1808
+ "mask": [
1809
+ true,
1810
+ true,
1811
+ true,
1812
+ true,
1813
+ true,
1814
+ true,
1815
+ false
1816
+ ]
1817
+ },
1818
+ "proprio": {
1819
+ "mean": [
1820
+ 0.0,
1821
+ 0.0,
1822
+ 0.0,
1823
+ 0.0,
1824
+ 0.0,
1825
+ 0.0,
1826
+ 0.0
1827
+ ],
1828
+ "std": [
1829
+ 0.0,
1830
+ 0.0,
1831
+ 0.0,
1832
+ 0.0,
1833
+ 0.0,
1834
+ 0.0,
1835
+ 0.0
1836
+ ],
1837
+ "max": [
1838
+ 0.0,
1839
+ 0.0,
1840
+ 0.0,
1841
+ 0.0,
1842
+ 0.0,
1843
+ 0.0,
1844
+ 0.0
1845
+ ],
1846
+ "min": [
1847
+ 0.0,
1848
+ 0.0,
1849
+ 0.0,
1850
+ 0.0,
1851
+ 0.0,
1852
+ 0.0,
1853
+ 0.0
1854
+ ],
1855
+ "q01": [
1856
+ 0.0,
1857
+ 0.0,
1858
+ 0.0,
1859
+ 0.0,
1860
+ 0.0,
1861
+ 0.0,
1862
+ 0.0
1863
+ ],
1864
+ "q99": [
1865
+ 0.0,
1866
+ 0.0,
1867
+ 0.0,
1868
+ 0.0,
1869
+ 0.0,
1870
+ 0.0,
1871
+ 0.0
1872
+ ]
1873
+ },
1874
+ "num_transitions": 3948057,
1875
+ "num_trajectories": 5100
1876
+ },
1877
+ "ucsd_kitchen_dataset_converted_externally_to_rlds/0.1.0": {
1878
+ "action": {
1879
+ "mean": [
1880
+ 410.375732421875,
1881
+ 116.9518814086914,
1882
+ 192.35031127929688,
1883
+ -121.22441864013672,
1884
+ -33.84892654418945,
1885
+ 50.016136169433594,
1886
+ 0.741813600063324
1887
+ ],
1888
+ "std": [
1889
+ 122.81488037109375,
1890
+ 108.80094909667969,
1891
+ 130.30345153808594,
1892
+ 116.2820053100586,
1893
+ 27.62191390991211,
1894
+ 41.02091979980469,
1895
+ 0.4376337230205536
1896
+ ],
1897
+ "max": [
1898
+ 678.0,
1899
+ 400.0,
1900
+ 507.0,
1901
+ 180.00001525878906,
1902
+ 6.000013828277588,
1903
+ 116.99998474121094,
1904
+ 1.0
1905
+ ],
1906
+ "min": [
1907
+ 172.0,
1908
+ -166.0,
1909
+ -99.99999237060547,
1910
+ -180.00001525878906,
1911
+ -89.0,
1912
+ -96.00010681152344,
1913
+ 0.0
1914
+ ],
1915
+ "q01": [
1916
+ 200.00001052856445,
1917
+ -102.31004211425781,
1918
+ -94.99993370056153,
1919
+ -180.00001525878906,
1920
+ -88.00001525878906,
1921
+ -38.999977111816406,
1922
+ 0.0
1923
+ ],
1924
+ "q99": [
1925
+ 637.0,
1926
+ 368.30999999999995,
1927
+ 493.0,
1928
+ 180.00001525878906,
1929
+ 0.999983012676239,
1930
+ 105.00001525878906,
1931
+ 1.0
1932
+ ],
1933
+ "mask": [
1934
+ true,
1935
+ true,
1936
+ true,
1937
+ true,
1938
+ true,
1939
+ true,
1940
+ false
1941
+ ]
1942
+ },
1943
+ "proprio": {
1944
+ "mean": [
1945
+ 0.0,
1946
+ 0.0,
1947
+ 0.0,
1948
+ 0.0,
1949
+ 0.0,
1950
+ 0.0,
1951
+ 0.0
1952
+ ],
1953
+ "std": [
1954
+ 0.0,
1955
+ 0.0,
1956
+ 0.0,
1957
+ 0.0,
1958
+ 0.0,
1959
+ 0.0,
1960
+ 0.0
1961
+ ],
1962
+ "max": [
1963
+ 0.0,
1964
+ 0.0,
1965
+ 0.0,
1966
+ 0.0,
1967
+ 0.0,
1968
+ 0.0,
1969
+ 0.0
1970
+ ],
1971
+ "min": [
1972
+ 0.0,
1973
+ 0.0,
1974
+ 0.0,
1975
+ 0.0,
1976
+ 0.0,
1977
+ 0.0,
1978
+ 0.0
1979
+ ],
1980
+ "q01": [
1981
+ 0.0,
1982
+ 0.0,
1983
+ 0.0,
1984
+ 0.0,
1985
+ 0.0,
1986
+ 0.0,
1987
+ 0.0
1988
+ ],
1989
+ "q99": [
1990
+ 0.0,
1991
+ 0.0,
1992
+ 0.0,
1993
+ 0.0,
1994
+ 0.0,
1995
+ 0.0,
1996
+ 0.0
1997
+ ]
1998
+ },
1999
+ "num_transitions": 3970,
2000
+ "num_trajectories": 150
2001
+ },
2002
+ "austin_sailor_dataset_converted_externally_to_rlds/0.1.0": {
2003
+ "action": {
2004
+ "mean": [
2005
+ 0.011825386434793472,
2006
+ 0.0064610871486365795,
2007
+ 0.060236409306526184,
2008
+ 0.0,
2009
+ 0.0,
2010
+ 0.0016465834341943264,
2011
+ 0.5260950326919556
2012
+ ],
2013
+ "std": [
2014
+ 0.46348854899406433,
2015
+ 0.41240164637565613,
2016
+ 0.41186293959617615,
2017
+ 0.0,
2018
+ 0.0,
2019
+ 0.0578608438372612,
2020
+ 0.49893733859062195
2021
+ ],
2022
+ "max": [
2023
+ 1.0,
2024
+ 1.0,
2025
+ 1.0,
2026
+ 0.0,
2027
+ 0.0,
2028
+ 0.375,
2029
+ 1.0
2030
+ ],
2031
+ "min": [
2032
+ -1.0,
2033
+ -1.0,
2034
+ -1.0,
2035
+ 0.0,
2036
+ 0.0,
2037
+ -0.375,
2038
+ 0.0
2039
+ ],
2040
+ "q01": [
2041
+ -1.0,
2042
+ -0.9828571677207947,
2043
+ -0.6000000238418579,
2044
+ 0.0,
2045
+ 0.0,
2046
+ -0.17249999940395355,
2047
+ 0.0
2048
+ ],
2049
+ "q99": [
2050
+ 1.0,
2051
+ 0.9457142949104309,
2052
+ 1.0,
2053
+ 0.0,
2054
+ 0.0,
2055
+ 0.17892856895923615,
2056
+ 1.0
2057
+ ],
2058
+ "mask": [
2059
+ true,
2060
+ true,
2061
+ true,
2062
+ true,
2063
+ true,
2064
+ true,
2065
+ false
2066
+ ]
2067
+ },
2068
+ "proprio": {
2069
+ "mean": [
2070
+ 0.0,
2071
+ 0.0,
2072
+ 0.0,
2073
+ 0.0,
2074
+ 0.0,
2075
+ 0.0,
2076
+ 0.0
2077
+ ],
2078
+ "std": [
2079
+ 0.0,
2080
+ 0.0,
2081
+ 0.0,
2082
+ 0.0,
2083
+ 0.0,
2084
+ 0.0,
2085
+ 0.0
2086
+ ],
2087
+ "max": [
2088
+ 0.0,
2089
+ 0.0,
2090
+ 0.0,
2091
+ 0.0,
2092
+ 0.0,
2093
+ 0.0,
2094
+ 0.0
2095
+ ],
2096
+ "min": [
2097
+ 0.0,
2098
+ 0.0,
2099
+ 0.0,
2100
+ 0.0,
2101
+ 0.0,
2102
+ 0.0,
2103
+ 0.0
2104
+ ],
2105
+ "q01": [
2106
+ 0.0,
2107
+ 0.0,
2108
+ 0.0,
2109
+ 0.0,
2110
+ 0.0,
2111
+ 0.0,
2112
+ 0.0
2113
+ ],
2114
+ "q99": [
2115
+ 0.0,
2116
+ 0.0,
2117
+ 0.0,
2118
+ 0.0,
2119
+ 0.0,
2120
+ 0.0,
2121
+ 0.0
2122
+ ]
2123
+ },
2124
+ "num_transitions": 353094,
2125
+ "num_trajectories": 240
2126
+ },
2127
+ "austin_sirius_dataset_converted_externally_to_rlds/0.1.0": {
2128
+ "action": {
2129
+ "mean": [
2130
+ 0.077476866543293,
2131
+ 0.031955525279045105,
2132
+ 0.04244735836982727,
2133
+ 0.0,
2134
+ 0.0,
2135
+ -0.01603454165160656,
2136
+ 0.43260180950164795
2137
+ ],
2138
+ "std": [
2139
+ 0.3906330168247223,
2140
+ 0.2998153865337372,
2141
+ 0.2782270312309265,
2142
+ 0.0,
2143
+ 0.0,
2144
+ 0.08120641857385635,
2145
+ 0.49528202414512634
2146
+ ],
2147
+ "max": [
2148
+ 1.0002285242080688,
2149
+ 0.960608720779419,
2150
+ 1.105179786682129,
2151
+ 0.0,
2152
+ 0.0,
2153
+ 0.341785728931427,
2154
+ 1.0
2155
+ ],
2156
+ "min": [
2157
+ -1.0183025598526,
2158
+ -0.9800000190734863,
2159
+ -0.9774575233459473,
2160
+ 0.0,
2161
+ 0.0,
2162
+ -0.34607142210006714,
2163
+ 0.0
2164
+ ],
2165
+ "q01": [
2166
+ -0.780905865430832,
2167
+ -0.5667179036140442,
2168
+ -0.5254343223571777,
2169
+ 0.0,
2170
+ 0.0,
2171
+ -0.28495091378688814,
2172
+ 0.0
2173
+ ],
2174
+ "q99": [
2175
+ 0.9569637751579284,
2176
+ 0.6971374487876891,
2177
+ 0.8124888157844541,
2178
+ 0.0,
2179
+ 0.0,
2180
+ 0.1971428543329239,
2181
+ 1.0
2182
+ ],
2183
+ "mask": [
2184
+ true,
2185
+ true,
2186
+ true,
2187
+ true,
2188
+ true,
2189
+ true,
2190
+ false
2191
+ ]
2192
+ },
2193
+ "proprio": {
2194
+ "mean": [
2195
+ 0.0,
2196
+ 0.0,
2197
+ 0.0,
2198
+ 0.0,
2199
+ 0.0,
2200
+ 0.0,
2201
+ 0.0
2202
+ ],
2203
+ "std": [
2204
+ 0.0,
2205
+ 0.0,
2206
+ 0.0,
2207
+ 0.0,
2208
+ 0.0,
2209
+ 0.0,
2210
+ 0.0
2211
+ ],
2212
+ "max": [
2213
+ 0.0,
2214
+ 0.0,
2215
+ 0.0,
2216
+ 0.0,
2217
+ 0.0,
2218
+ 0.0,
2219
+ 0.0
2220
+ ],
2221
+ "min": [
2222
+ 0.0,
2223
+ 0.0,
2224
+ 0.0,
2225
+ 0.0,
2226
+ 0.0,
2227
+ 0.0,
2228
+ 0.0
2229
+ ],
2230
+ "q01": [
2231
+ 0.0,
2232
+ 0.0,
2233
+ 0.0,
2234
+ 0.0,
2235
+ 0.0,
2236
+ 0.0,
2237
+ 0.0
2238
+ ],
2239
+ "q99": [
2240
+ 0.0,
2241
+ 0.0,
2242
+ 0.0,
2243
+ 0.0,
2244
+ 0.0,
2245
+ 0.0,
2246
+ 0.0
2247
+ ]
2248
+ },
2249
+ "num_transitions": 279939,
2250
+ "num_trajectories": 559
2251
+ },
2252
+ "dlr_edan_shared_control_converted_externally_to_rlds/0.1.0": {
2253
+ "action": {
2254
+ "mean": [
2255
+ 0.0066478196531534195,
2256
+ -0.0007657355745323002,
2257
+ 0.006522845011204481,
2258
+ 0.0011679773451760411,
2259
+ -0.006395624950528145,
2260
+ -0.011903021484613419,
2261
+ 0.6985887289047241
2262
+ ],
2263
+ "std": [
2264
+ 0.021393585950136185,
2265
+ 0.018142299726605415,
2266
+ 0.03374377265572548,
2267
+ 0.01743541844189167,
2268
+ 0.03394372761249542,
2269
+ 0.04641878604888916,
2270
+ 0.45885783433914185
2271
+ ],
2272
+ "max": [
2273
+ 0.18991442024707794,
2274
+ 0.0739002525806427,
2275
+ 0.18064819276332855,
2276
+ 0.0866486132144928,
2277
+ 0.13464981317520142,
2278
+ 0.16910280287265778,
2279
+ 1.0
2280
+ ],
2281
+ "min": [
2282
+ -0.10054297000169754,
2283
+ -0.08427435159683228,
2284
+ -0.13533438742160797,
2285
+ -0.17556548118591309,
2286
+ -0.18485672771930695,
2287
+ -0.2680685818195343,
2288
+ 0.0
2289
+ ],
2290
+ "q01": [
2291
+ -0.02987122368067503,
2292
+ -0.06013262912631035,
2293
+ -0.08286409199237824,
2294
+ -0.05924444157630205,
2295
+ -0.15986866518855095,
2296
+ -0.15636983573436739,
2297
+ 0.0
2298
+ ],
2299
+ "q99": [
2300
+ 0.08832092039287087,
2301
+ 0.042126184627413736,
2302
+ 0.11311905644834042,
2303
+ 0.0643695573508739,
2304
+ 0.03941855944693088,
2305
+ 0.156646853685379,
2306
+ 1.0
2307
+ ],
2308
+ "mask": [
2309
+ true,
2310
+ true,
2311
+ true,
2312
+ true,
2313
+ true,
2314
+ true,
2315
+ false
2316
+ ]
2317
+ },
2318
+ "proprio": {
2319
+ "mean": [
2320
+ 0.0,
2321
+ 0.0,
2322
+ 0.0,
2323
+ 0.0,
2324
+ 0.0,
2325
+ 0.0,
2326
+ 0.0
2327
+ ],
2328
+ "std": [
2329
+ 0.0,
2330
+ 0.0,
2331
+ 0.0,
2332
+ 0.0,
2333
+ 0.0,
2334
+ 0.0,
2335
+ 0.0
2336
+ ],
2337
+ "max": [
2338
+ 0.0,
2339
+ 0.0,
2340
+ 0.0,
2341
+ 0.0,
2342
+ 0.0,
2343
+ 0.0,
2344
+ 0.0
2345
+ ],
2346
+ "min": [
2347
+ 0.0,
2348
+ 0.0,
2349
+ 0.0,
2350
+ 0.0,
2351
+ 0.0,
2352
+ 0.0,
2353
+ 0.0
2354
+ ],
2355
+ "q01": [
2356
+ 0.0,
2357
+ 0.0,
2358
+ 0.0,
2359
+ 0.0,
2360
+ 0.0,
2361
+ 0.0,
2362
+ 0.0
2363
+ ],
2364
+ "q99": [
2365
+ 0.0,
2366
+ 0.0,
2367
+ 0.0,
2368
+ 0.0,
2369
+ 0.0,
2370
+ 0.0,
2371
+ 0.0
2372
+ ]
2373
+ },
2374
+ "num_transitions": 8928,
2375
+ "num_trajectories": 104
2376
+ },
2377
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds/0.1.0": {
2378
+ "action": {
2379
+ "mean": [
2380
+ 0.5274373292922974,
2381
+ 0.028582017868757248,
2382
+ 0.18712472915649414,
2383
+ 1.2339569330215454,
2384
+ 0.03226622939109802,
2385
+ -1.4199472665786743,
2386
+ 0.5550631880760193
2387
+ ],
2388
+ "std": [
2389
+ 0.08108346909284592,
2390
+ 0.1116756722331047,
2391
+ 0.07747555524110794,
2392
+ 2.8737244606018066,
2393
+ 0.02774704433977604,
2394
+ 2.7678685188293457,
2395
+ 0.4969509243965149
2396
+ ],
2397
+ "max": [
2398
+ 0.6634981632232666,
2399
+ 0.23428471386432648,
2400
+ 0.4308285415172577,
2401
+ 3.1415927410125732,
2402
+ 0.13647015392780304,
2403
+ 3.141592502593994,
2404
+ 1.0
2405
+ ],
2406
+ "min": [
2407
+ 0.3071657121181488,
2408
+ -0.29754969477653503,
2409
+ 0.06578229367733002,
2410
+ -3.1415927410125732,
2411
+ -0.04584203287959099,
2412
+ -3.141592502593994,
2413
+ 0.0
2414
+ ],
2415
+ "q01": [
2416
+ 0.3148897051811218,
2417
+ -0.20317550599575043,
2418
+ 0.06785467118024827,
2419
+ -3.140952730178833,
2420
+ -0.029743434861302376,
2421
+ -3.141091251373291,
2422
+ 0.0
2423
+ ],
2424
+ "q99": [
2425
+ 0.6472805738449097,
2426
+ 0.20846802592277527,
2427
+ 0.36855655312538155,
2428
+ 3.1409926891326903,
2429
+ 0.11424950212240226,
2430
+ 3.1410969257354737,
2431
+ 1.0
2432
+ ],
2433
+ "mask": [
2434
+ true,
2435
+ true,
2436
+ true,
2437
+ true,
2438
+ true,
2439
+ true,
2440
+ false
2441
+ ]
2442
+ },
2443
+ "proprio": {
2444
+ "mean": [
2445
+ 0.0,
2446
+ 0.0,
2447
+ 0.0,
2448
+ 0.0,
2449
+ 0.0,
2450
+ 0.0,
2451
+ 0.0
2452
+ ],
2453
+ "std": [
2454
+ 0.0,
2455
+ 0.0,
2456
+ 0.0,
2457
+ 0.0,
2458
+ 0.0,
2459
+ 0.0,
2460
+ 0.0
2461
+ ],
2462
+ "max": [
2463
+ 0.0,
2464
+ 0.0,
2465
+ 0.0,
2466
+ 0.0,
2467
+ 0.0,
2468
+ 0.0,
2469
+ 0.0
2470
+ ],
2471
+ "min": [
2472
+ 0.0,
2473
+ 0.0,
2474
+ 0.0,
2475
+ 0.0,
2476
+ 0.0,
2477
+ 0.0,
2478
+ 0.0
2479
+ ],
2480
+ "q01": [
2481
+ 0.0,
2482
+ 0.0,
2483
+ 0.0,
2484
+ 0.0,
2485
+ 0.0,
2486
+ 0.0,
2487
+ 0.0
2488
+ ],
2489
+ "q99": [
2490
+ 0.0,
2491
+ 0.0,
2492
+ 0.0,
2493
+ 0.0,
2494
+ 0.0,
2495
+ 0.0,
2496
+ 0.0
2497
+ ]
2498
+ },
2499
+ "num_transitions": 146241,
2500
+ "num_trajectories": 631
2501
+ },
2502
+ "utaustin_mutex/0.1.0": {
2503
+ "action": {
2504
+ "mean": [
2505
+ 0.06176406517624855,
2506
+ -0.005005490034818649,
2507
+ 0.10216782987117767,
2508
+ -0.03314131125807762,
2509
+ 0.013895022682845592,
2510
+ -0.011317633092403412,
2511
+ 0.5038976669311523
2512
+ ],
2513
+ "std": [
2514
+ 0.187501460313797,
2515
+ 0.4468473196029663,
2516
+ 0.3792876601219177,
2517
+ 0.14097853004932404,
2518
+ 0.06453699618577957,
2519
+ 0.11765265464782715,
2520
+ 0.501045286655426
2521
+ ],
2522
+ "max": [
2523
+ 1.0,
2524
+ 1.0,
2525
+ 1.0,
2526
+ 0.375,
2527
+ 0.375,
2528
+ 0.375,
2529
+ 1.0
2530
+ ],
2531
+ "min": [
2532
+ -1.0,
2533
+ -1.0,
2534
+ -1.0,
2535
+ -0.375,
2536
+ -0.375,
2537
+ -0.375,
2538
+ 0.0
2539
+ ],
2540
+ "q01": [
2541
+ -0.4285714328289032,
2542
+ -0.9800000190734863,
2543
+ -0.5571428537368774,
2544
+ -0.375,
2545
+ -0.15642857551574707,
2546
+ -0.335357129573822,
2547
+ 0.0
2548
+ ],
2549
+ "q99": [
2550
+ 0.5914285778999329,
2551
+ 0.9714285731315613,
2552
+ 1.0,
2553
+ 0.3278571367263794,
2554
+ 0.207857146859169,
2555
+ 0.25607141852378845,
2556
+ 1.0
2557
+ ],
2558
+ "mask": [
2559
+ true,
2560
+ true,
2561
+ true,
2562
+ true,
2563
+ true,
2564
+ true,
2565
+ false
2566
+ ]
2567
+ },
2568
+ "proprio": {
2569
+ "mean": [
2570
+ 0.0,
2571
+ 0.0,
2572
+ 0.0,
2573
+ 0.0,
2574
+ 0.0,
2575
+ 0.0,
2576
+ 0.0
2577
+ ],
2578
+ "std": [
2579
+ 0.0,
2580
+ 0.0,
2581
+ 0.0,
2582
+ 0.0,
2583
+ 0.0,
2584
+ 0.0,
2585
+ 0.0
2586
+ ],
2587
+ "max": [
2588
+ 0.0,
2589
+ 0.0,
2590
+ 0.0,
2591
+ 0.0,
2592
+ 0.0,
2593
+ 0.0,
2594
+ 0.0
2595
+ ],
2596
+ "min": [
2597
+ 0.0,
2598
+ 0.0,
2599
+ 0.0,
2600
+ 0.0,
2601
+ 0.0,
2602
+ 0.0,
2603
+ 0.0
2604
+ ],
2605
+ "q01": [
2606
+ 0.0,
2607
+ 0.0,
2608
+ 0.0,
2609
+ 0.0,
2610
+ 0.0,
2611
+ 0.0,
2612
+ 0.0
2613
+ ],
2614
+ "q99": [
2615
+ 0.0,
2616
+ 0.0,
2617
+ 0.0,
2618
+ 0.0,
2619
+ 0.0,
2620
+ 0.0,
2621
+ 0.0
2622
+ ]
2623
+ },
2624
+ "num_transitions": 361883,
2625
+ "num_trajectories": 1500
2626
+ },
2627
+ "berkeley_fanuc_manipulation/0.1.0": {
2628
+ "action": {
2629
+ "mean": [
2630
+ 0.0007744057802483439,
2631
+ -0.00031240080716088414,
2632
+ -0.0015001941937953234,
2633
+ -0.0007515158504247665,
2634
+ -0.00015832878125365824,
2635
+ 0.00014327642566058785,
2636
+ 0.699295699596405
2637
+ ],
2638
+ "std": [
2639
+ 0.0034070133697241545,
2640
+ 0.00499219074845314,
2641
+ 0.005344326142221689,
2642
+ 0.007599010597914457,
2643
+ 0.004081932827830315,
2644
+ 0.008568963967263699,
2645
+ 0.45868709683418274
2646
+ ],
2647
+ "max": [
2648
+ 0.009999999776482582,
2649
+ 0.009999999776482582,
2650
+ 0.009999999776482582,
2651
+ 0.03490658476948738,
2652
+ 0.03490658476948738,
2653
+ 0.03490658476948738,
2654
+ 1.0
2655
+ ],
2656
+ "min": [
2657
+ -0.009999999776482582,
2658
+ -0.009999999776482582,
2659
+ -0.009999999776482582,
2660
+ -0.03490658476948738,
2661
+ -0.03490658476948738,
2662
+ -0.03490658476948738,
2663
+ 0.0
2664
+ ],
2665
+ "q01": [
2666
+ -0.009999999776482582,
2667
+ -0.009999999776482582,
2668
+ -0.009999999776482582,
2669
+ -0.03490658476948738,
2670
+ 0.0,
2671
+ -0.03490658476948738,
2672
+ 0.0
2673
+ ],
2674
+ "q99": [
2675
+ 0.009999999776482582,
2676
+ 0.009999999776482582,
2677
+ 0.009999999776482582,
2678
+ 0.03490658476948738,
2679
+ 0.0,
2680
+ 0.03490658476948738,
2681
+ 1.0
2682
+ ],
2683
+ "mask": [
2684
+ true,
2685
+ true,
2686
+ true,
2687
+ true,
2688
+ true,
2689
+ true,
2690
+ false
2691
+ ]
2692
+ },
2693
+ "proprio": {
2694
+ "mean": [
2695
+ 0.0,
2696
+ 0.0,
2697
+ 0.0,
2698
+ 0.0,
2699
+ 0.0,
2700
+ 0.0,
2701
+ 0.0
2702
+ ],
2703
+ "std": [
2704
+ 0.0,
2705
+ 0.0,
2706
+ 0.0,
2707
+ 0.0,
2708
+ 0.0,
2709
+ 0.0,
2710
+ 0.0
2711
+ ],
2712
+ "max": [
2713
+ 0.0,
2714
+ 0.0,
2715
+ 0.0,
2716
+ 0.0,
2717
+ 0.0,
2718
+ 0.0,
2719
+ 0.0
2720
+ ],
2721
+ "min": [
2722
+ 0.0,
2723
+ 0.0,
2724
+ 0.0,
2725
+ 0.0,
2726
+ 0.0,
2727
+ 0.0,
2728
+ 0.0
2729
+ ],
2730
+ "q01": [
2731
+ 0.0,
2732
+ 0.0,
2733
+ 0.0,
2734
+ 0.0,
2735
+ 0.0,
2736
+ 0.0,
2737
+ 0.0
2738
+ ],
2739
+ "q99": [
2740
+ 0.0,
2741
+ 0.0,
2742
+ 0.0,
2743
+ 0.0,
2744
+ 0.0,
2745
+ 0.0,
2746
+ 0.0
2747
+ ]
2748
+ },
2749
+ "num_transitions": 62613,
2750
+ "num_trajectories": 415
2751
+ },
2752
+ "cmu_stretch/0.1.0": {
2753
+ "action": {
2754
+ "mean": [
2755
+ 0.0003630445571616292,
2756
+ 0.0,
2757
+ 0.0016466928645968437,
2758
+ 0.0,
2759
+ 0.0,
2760
+ 0.0,
2761
+ 0.3987048268318176
2762
+ ],
2763
+ "std": [
2764
+ 0.004081855062395334,
2765
+ 0.0,
2766
+ 0.003774340031668544,
2767
+ 0.0,
2768
+ 0.0,
2769
+ 0.0,
2770
+ 0.489638090133667
2771
+ ],
2772
+ "max": [
2773
+ 0.02338407188653946,
2774
+ 0.0,
2775
+ 0.023404927924275398,
2776
+ 0.0,
2777
+ 0.0,
2778
+ 0.0,
2779
+ 1.0
2780
+ ],
2781
+ "min": [
2782
+ -0.019353797659277916,
2783
+ 0.0,
2784
+ -0.02019215188920498,
2785
+ 0.0,
2786
+ 0.0,
2787
+ 0.0,
2788
+ 0.0
2789
+ ],
2790
+ "q01": [
2791
+ -0.011175686959177256,
2792
+ 0.0,
2793
+ -0.0032206363626755773,
2794
+ 0.0,
2795
+ 0.0,
2796
+ 0.0,
2797
+ 0.0
2798
+ ],
2799
+ "q99": [
2800
+ 0.014501785952597848,
2801
+ 0.0,
2802
+ 0.015056106168776728,
2803
+ 0.0,
2804
+ 0.0,
2805
+ 0.0,
2806
+ 1.0
2807
+ ],
2808
+ "mask": [
2809
+ true,
2810
+ true,
2811
+ true,
2812
+ true,
2813
+ true,
2814
+ true,
2815
+ false
2816
+ ]
2817
+ },
2818
+ "proprio": {
2819
+ "mean": [
2820
+ 0.0,
2821
+ 0.0,
2822
+ 0.0,
2823
+ 0.0,
2824
+ 0.0,
2825
+ 0.0,
2826
+ 0.0
2827
+ ],
2828
+ "std": [
2829
+ 0.0,
2830
+ 0.0,
2831
+ 0.0,
2832
+ 0.0,
2833
+ 0.0,
2834
+ 0.0,
2835
+ 0.0
2836
+ ],
2837
+ "max": [
2838
+ 0.0,
2839
+ 0.0,
2840
+ 0.0,
2841
+ 0.0,
2842
+ 0.0,
2843
+ 0.0,
2844
+ 0.0
2845
+ ],
2846
+ "min": [
2847
+ 0.0,
2848
+ 0.0,
2849
+ 0.0,
2850
+ 0.0,
2851
+ 0.0,
2852
+ 0.0,
2853
+ 0.0
2854
+ ],
2855
+ "q01": [
2856
+ 0.0,
2857
+ 0.0,
2858
+ 0.0,
2859
+ 0.0,
2860
+ 0.0,
2861
+ 0.0,
2862
+ 0.0
2863
+ ],
2864
+ "q99": [
2865
+ 0.0,
2866
+ 0.0,
2867
+ 0.0,
2868
+ 0.0,
2869
+ 0.0,
2870
+ 0.0,
2871
+ 0.0
2872
+ ]
2873
+ },
2874
+ "num_transitions": 25016,
2875
+ "num_trajectories": 135
2876
+ },
2877
+ "bc_z/0.1.0": {
2878
+ "action": {
2879
+ "mean": [
2880
+ -0.009958645328879356,
2881
+ 0.0008958434336818755,
2882
+ 0.00499522453173995,
2883
+ 0.000297540333122015,
2884
+ -0.008734511211514473,
2885
+ -0.03068969026207924,
2886
+ 0.8344562649726868
2887
+ ],
2888
+ "std": [
2889
+ 0.030533093959093094,
2890
+ 0.0231416504830122,
2891
+ 0.020642085000872612,
2892
+ 0.04156165570020676,
2893
+ 0.04643021523952484,
2894
+ 0.07697845250368118,
2895
+ 0.36111101508140564
2896
+ ],
2897
+ "max": [
2898
+ 0.2165454924106598,
2899
+ 0.1251407265663147,
2900
+ 0.10772687941789627,
2901
+ 0.33544227480888367,
2902
+ 0.28117990493774414,
2903
+ 0.40614867210388184,
2904
+ 1.0
2905
+ ],
2906
+ "min": [
2907
+ -0.1677047461271286,
2908
+ -0.14630407094955444,
2909
+ -0.10066790133714676,
2910
+ -0.29421567916870117,
2911
+ -0.32101404666900635,
2912
+ -0.4635624885559082,
2913
+ 0.0
2914
+ ],
2915
+ "q01": [
2916
+ -0.09220654994249344,
2917
+ -0.06456145539879798,
2918
+ -0.049121275544166565,
2919
+ -0.11594625547528267,
2920
+ -0.14152548640966414,
2921
+ -0.2251061636209488,
2922
+ 0.0
2923
+ ],
2924
+ "q99": [
2925
+ 0.07628866866230968,
2926
+ 0.058019736707210584,
2927
+ 0.052540797740221024,
2928
+ 0.11740604028105736,
2929
+ 0.11703975558280955,
2930
+ 0.16729306846857078,
2931
+ 1.0
2932
+ ],
2933
+ "mask": [
2934
+ true,
2935
+ true,
2936
+ true,
2937
+ true,
2938
+ true,
2939
+ true,
2940
+ false
2941
+ ]
2942
+ },
2943
+ "proprio": {
2944
+ "mean": [
2945
+ 0.0,
2946
+ 0.0,
2947
+ 0.0,
2948
+ 0.0,
2949
+ 0.0,
2950
+ 0.0,
2951
+ 0.0
2952
+ ],
2953
+ "std": [
2954
+ 0.0,
2955
+ 0.0,
2956
+ 0.0,
2957
+ 0.0,
2958
+ 0.0,
2959
+ 0.0,
2960
+ 0.0
2961
+ ],
2962
+ "max": [
2963
+ 0.0,
2964
+ 0.0,
2965
+ 0.0,
2966
+ 0.0,
2967
+ 0.0,
2968
+ 0.0,
2969
+ 0.0
2970
+ ],
2971
+ "min": [
2972
+ 0.0,
2973
+ 0.0,
2974
+ 0.0,
2975
+ 0.0,
2976
+ 0.0,
2977
+ 0.0,
2978
+ 0.0
2979
+ ],
2980
+ "q01": [
2981
+ 0.0,
2982
+ 0.0,
2983
+ 0.0,
2984
+ 0.0,
2985
+ 0.0,
2986
+ 0.0,
2987
+ 0.0
2988
+ ],
2989
+ "q99": [
2990
+ 0.0,
2991
+ 0.0,
2992
+ 0.0,
2993
+ 0.0,
2994
+ 0.0,
2995
+ 0.0,
2996
+ 0.0
2997
+ ]
2998
+ },
2999
+ "num_transitions": 6015535,
3000
+ "num_trajectories": 43264
3001
+ },
3002
+ "fmb_dataset/1.0.0": {
3003
+ "action": {
3004
+ "mean": [
3005
+ 0.05902976542711258,
3006
+ -0.06476633995771408,
3007
+ -0.09787469357252121,
3008
+ 0.004325387068092823,
3009
+ 0.00028963759541511536,
3010
+ -0.04457257315516472,
3011
+ 0.7336440086364746
3012
+ ],
3013
+ "std": [
3014
+ 0.28809186816215515,
3015
+ 0.2820416986942291,
3016
+ 0.4626740515232086,
3017
+ 0.3266514539718628,
3018
+ 0.10842999070882797,
3019
+ 0.34400978684425354,
3020
+ 0.4435289800167084
3021
+ ],
3022
+ "max": [
3023
+ 1.399999976158142,
3024
+ 1.0,
3025
+ 1.399999976158142,
3026
+ 1.0,
3027
+ 1.0,
3028
+ 1.0,
3029
+ 1.0
3030
+ ],
3031
+ "min": [
3032
+ -1.399999976158142,
3033
+ -1.399999976158142,
3034
+ -1.0,
3035
+ -1.0,
3036
+ -1.0,
3037
+ -1.0,
3038
+ 0.0
3039
+ ],
3040
+ "q01": [
3041
+ -0.8257142901420593,
3042
+ -1.399999976158142,
3043
+ -1.0,
3044
+ -1.0,
3045
+ -0.3028571307659149,
3046
+ -1.0,
3047
+ 0.0
3048
+ ],
3049
+ "q99": [
3050
+ 1.0,
3051
+ 0.5257142782211304,
3052
+ 1.0,
3053
+ 1.0,
3054
+ 0.3400000035762787,
3055
+ 1.0,
3056
+ 1.0
3057
+ ],
3058
+ "mask": [
3059
+ true,
3060
+ true,
3061
+ true,
3062
+ true,
3063
+ true,
3064
+ true,
3065
+ false
3066
+ ]
3067
+ },
3068
+ "proprio": {
3069
+ "mean": [
3070
+ 0.0,
3071
+ 0.0,
3072
+ 0.0,
3073
+ 0.0,
3074
+ 0.0,
3075
+ 0.0,
3076
+ 0.0
3077
+ ],
3078
+ "std": [
3079
+ 0.0,
3080
+ 0.0,
3081
+ 0.0,
3082
+ 0.0,
3083
+ 0.0,
3084
+ 0.0,
3085
+ 0.0
3086
+ ],
3087
+ "max": [
3088
+ 0.0,
3089
+ 0.0,
3090
+ 0.0,
3091
+ 0.0,
3092
+ 0.0,
3093
+ 0.0,
3094
+ 0.0
3095
+ ],
3096
+ "min": [
3097
+ 0.0,
3098
+ 0.0,
3099
+ 0.0,
3100
+ 0.0,
3101
+ 0.0,
3102
+ 0.0,
3103
+ 0.0
3104
+ ],
3105
+ "q01": [
3106
+ 0.0,
3107
+ 0.0,
3108
+ 0.0,
3109
+ 0.0,
3110
+ 0.0,
3111
+ 0.0,
3112
+ 0.0
3113
+ ],
3114
+ "q99": [
3115
+ 0.0,
3116
+ 0.0,
3117
+ 0.0,
3118
+ 0.0,
3119
+ 0.0,
3120
+ 0.0,
3121
+ 0.0
3122
+ ]
3123
+ },
3124
+ "num_transitions": 1137459,
3125
+ "num_trajectories": 8612
3126
+ },
3127
+ "dobbe/0.0.1": {
3128
+ "action": {
3129
+ "mean": [
3130
+ -0.00011206958151888102,
3131
+ 0.0011229681549593806,
3132
+ -0.00010193959315074608,
3133
+ -7.37128357286565e-05,
3134
+ -0.0006753374473191798,
3135
+ -5.664441778208129e-05,
3136
+ 0.6318688988685608
3137
+ ],
3138
+ "std": [
3139
+ 0.042660679668188095,
3140
+ 0.04428431764245033,
3141
+ 0.12224890291690826,
3142
+ 0.005388470832258463,
3143
+ 0.011246936395764351,
3144
+ 0.006288259290158749,
3145
+ 0.3973240256309509
3146
+ ],
3147
+ "max": [
3148
+ 38.590423583984375,
3149
+ 17.932697296142578,
3150
+ 4.843764305114746,
3151
+ 1.4372116327285767,
3152
+ 0.4340403974056244,
3153
+ 1.2057193517684937,
3154
+ 0.9998947381973267
3155
+ ],
3156
+ "min": [
3157
+ -5.700923442840576,
3158
+ -21.605947494506836,
3159
+ -123.72489929199219,
3160
+ -1.7229845523834229,
3161
+ -0.4998578727245331,
3162
+ -0.8867913484573364,
3163
+ 1.4196479014572105e-06
3164
+ ],
3165
+ "q01": [
3166
+ -0.01119564864784479,
3167
+ -0.014266146533191203,
3168
+ -0.0071747214533388615,
3169
+ -0.009444301575422287,
3170
+ -0.03990109823644161,
3171
+ -0.017422311007976532,
3172
+ 4.003279136668425e-05
3173
+ ],
3174
+ "q99": [
3175
+ 0.01015154086053368,
3176
+ 0.017181577533483497,
3177
+ 0.007216989761218411,
3178
+ 0.010380979906767595,
3179
+ 0.03556173853576176,
3180
+ 0.018032474815845446,
3181
+ 0.9982578039169312
3182
+ ],
3183
+ "mask": [
3184
+ true,
3185
+ true,
3186
+ true,
3187
+ true,
3188
+ true,
3189
+ true,
3190
+ false
3191
+ ]
3192
+ },
3193
+ "proprio": {
3194
+ "mean": [
3195
+ 0.0,
3196
+ 0.0,
3197
+ 0.0,
3198
+ 0.0,
3199
+ 0.0,
3200
+ 0.0,
3201
+ 0.0
3202
+ ],
3203
+ "std": [
3204
+ 0.0,
3205
+ 0.0,
3206
+ 0.0,
3207
+ 0.0,
3208
+ 0.0,
3209
+ 0.0,
3210
+ 0.0
3211
+ ],
3212
+ "max": [
3213
+ 0.0,
3214
+ 0.0,
3215
+ 0.0,
3216
+ 0.0,
3217
+ 0.0,
3218
+ 0.0,
3219
+ 0.0
3220
+ ],
3221
+ "min": [
3222
+ 0.0,
3223
+ 0.0,
3224
+ 0.0,
3225
+ 0.0,
3226
+ 0.0,
3227
+ 0.0,
3228
+ 0.0
3229
+ ],
3230
+ "q01": [
3231
+ 0.0,
3232
+ 0.0,
3233
+ 0.0,
3234
+ 0.0,
3235
+ 0.0,
3236
+ 0.0,
3237
+ 0.0
3238
+ ],
3239
+ "q99": [
3240
+ 0.0,
3241
+ 0.0,
3242
+ 0.0,
3243
+ 0.0,
3244
+ 0.0,
3245
+ 0.0,
3246
+ 0.0
3247
+ ]
3248
+ },
3249
+ "num_transitions": 1139911,
3250
+ "num_trajectories": 5208
3251
+ },
3252
+ "droid/1.0.0": {
3253
+ "action": {
3254
+ "mean": [
3255
+ 0.027425529435276985,
3256
+ -0.0026820411439985037,
3257
+ 0.01595238223671913,
3258
+ 0.0035501928068697453,
3259
+ -0.030532635748386383,
3260
+ -0.006685464642941952,
3261
+ 0.5860344171524048
3262
+ ],
3263
+ "std": [
3264
+ 0.25387412309646606,
3265
+ 0.18426834046840668,
3266
+ 0.22532416880130768,
3267
+ 0.21757009625434875,
3268
+ 0.22572560608386993,
3269
+ 0.2867794930934906,
3270
+ 0.4287726879119873
3271
+ ],
3272
+ "max": [
3273
+ 0.9999998211860657,
3274
+ 0.999991774559021,
3275
+ 0.9999973177909851,
3276
+ 0.9999874830245972,
3277
+ 0.9999954104423523,
3278
+ 0.9999998807907104,
3279
+ 1.0
3280
+ ],
3281
+ "min": [
3282
+ -0.9999999403953552,
3283
+ -0.9999951124191284,
3284
+ -0.9999960660934448,
3285
+ -0.9999980330467224,
3286
+ -0.9999982118606567,
3287
+ -0.9999998807907104,
3288
+ 0.0
3289
+ ],
3290
+ "q01": [
3291
+ -0.7776297926902771,
3292
+ -0.5803514122962952,
3293
+ -0.5795090794563293,
3294
+ -0.6464047729969025,
3295
+ -0.7041108310222626,
3296
+ -0.8895104378461838,
3297
+ 0.0
3298
+ ],
3299
+ "q99": [
3300
+ 0.7597932070493698,
3301
+ 0.5726242214441299,
3302
+ 0.7351000607013702,
3303
+ 0.6705610305070877,
3304
+ 0.6464948207139969,
3305
+ 0.8897542208433151,
3306
+ 1.0
3307
+ ],
3308
+ "mask": [
3309
+ true,
3310
+ true,
3311
+ true,
3312
+ true,
3313
+ true,
3314
+ true,
3315
+ false
3316
+ ]
3317
+ },
3318
+ "proprio": {
3319
+ "mean": [
3320
+ 0.0,
3321
+ 0.0,
3322
+ 0.0,
3323
+ 0.0,
3324
+ 0.0,
3325
+ 0.0,
3326
+ 0.0
3327
+ ],
3328
+ "std": [
3329
+ 0.0,
3330
+ 0.0,
3331
+ 0.0,
3332
+ 0.0,
3333
+ 0.0,
3334
+ 0.0,
3335
+ 0.0
3336
+ ],
3337
+ "max": [
3338
+ 0.0,
3339
+ 0.0,
3340
+ 0.0,
3341
+ 0.0,
3342
+ 0.0,
3343
+ 0.0,
3344
+ 0.0
3345
+ ],
3346
+ "min": [
3347
+ 0.0,
3348
+ 0.0,
3349
+ 0.0,
3350
+ 0.0,
3351
+ 0.0,
3352
+ 0.0,
3353
+ 0.0
3354
+ ],
3355
+ "q01": [
3356
+ 0.0,
3357
+ 0.0,
3358
+ 0.0,
3359
+ 0.0,
3360
+ 0.0,
3361
+ 0.0,
3362
+ 0.0
3363
+ ],
3364
+ "q99": [
3365
+ 0.0,
3366
+ 0.0,
3367
+ 0.0,
3368
+ 0.0,
3369
+ 0.0,
3370
+ 0.0,
3371
+ 0.0
3372
+ ]
3373
+ },
3374
+ "num_transitions": 27044326,
3375
+ "num_trajectories": 92233
3376
+ },
3377
+ "rh20t_rlds/1.0.0": {
3378
+ "action": {
3379
+ "mean": [
3380
+ -5.332157638779582e+28,
3381
+ -1.5128827327837974e+29,
3382
+ -1.832736619079747e+28,
3383
+ 0.5735913515090942,
3384
+ -0.00847744569182396,
3385
+ -0.5566052198410034,
3386
+ 0.3186892569065094
3387
+ ],
3388
+ "std": [
3389
+ Infinity,
3390
+ Infinity,
3391
+ Infinity,
3392
+ 2.2581026554107666,
3393
+ 0.1548534482717514,
3394
+ 2.2581026554107666,
3395
+ 0.39917993545532227
3396
+ ],
3397
+ "max": [
3398
+ 7.582831568163597e+35,
3399
+ 7.557172735451728e+35,
3400
+ 2.2717764477020827e+27,
3401
+ 3.1415927410125732,
3402
+ 1.5116956233978271,
3403
+ 3.1415927410125732,
3404
+ 1.0
3405
+ ],
3406
+ "min": [
3407
+ -3.5543094244408723e+36,
3408
+ -8.723098019507117e+36,
3409
+ -9.648338287048974e+35,
3410
+ -3.1415927410125732,
3411
+ -1.5062522888183594,
3412
+ -3.1415927410125732,
3413
+ 0.0
3414
+ ],
3415
+ "q01": [
3416
+ 0.36028257966041566,
3417
+ -0.272584410905838,
3418
+ 0.005985925104469062,
3419
+ -3.1411514282226562,
3420
+ -0.5925320792198181,
3421
+ -3.1415159702301025,
3422
+ 0.0
3423
+ ],
3424
+ "q99": [
3425
+ 0.7534684538841248,
3426
+ 0.31738221645355225,
3427
+ 0.33061375379562374,
3428
+ 3.141425132751465,
3429
+ 0.47507260441780086,
3430
+ 3.141479730606079,
3431
+ 1.0
3432
+ ],
3433
+ "mask": [
3434
+ true,
3435
+ true,
3436
+ true,
3437
+ true,
3438
+ true,
3439
+ true,
3440
+ false
3441
+ ]
3442
+ },
3443
+ "proprio": {
3444
+ "mean": [
3445
+ 0.0,
3446
+ 0.0,
3447
+ 0.0,
3448
+ 0.0,
3449
+ 0.0,
3450
+ 0.0,
3451
+ 0.0
3452
+ ],
3453
+ "std": [
3454
+ 0.0,
3455
+ 0.0,
3456
+ 0.0,
3457
+ 0.0,
3458
+ 0.0,
3459
+ 0.0,
3460
+ 0.0
3461
+ ],
3462
+ "max": [
3463
+ 0.0,
3464
+ 0.0,
3465
+ 0.0,
3466
+ 0.0,
3467
+ 0.0,
3468
+ 0.0,
3469
+ 0.0
3470
+ ],
3471
+ "min": [
3472
+ 0.0,
3473
+ 0.0,
3474
+ 0.0,
3475
+ 0.0,
3476
+ 0.0,
3477
+ 0.0,
3478
+ 0.0
3479
+ ],
3480
+ "q01": [
3481
+ 0.0,
3482
+ 0.0,
3483
+ 0.0,
3484
+ 0.0,
3485
+ 0.0,
3486
+ 0.0,
3487
+ 0.0
3488
+ ],
3489
+ "q99": [
3490
+ 0.0,
3491
+ 0.0,
3492
+ 0.0,
3493
+ 0.0,
3494
+ 0.0,
3495
+ 0.0,
3496
+ 0.0
3497
+ ]
3498
+ },
3499
+ "num_transitions": 52644433,
3500
+ "num_trajectories": 104392
3501
+ }
3502
+ }
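The per-dataset `q01`/`q99` quantiles and the `mask` flag above are presumably what drives quantile-based action normalization before discretization (the last, binary gripper dimension is masked out and left untouched). The helper below is a hypothetical sketch of that mapping, not code from this repository:

```python
# Hypothetical sketch: map a raw action into [-1, 1] using the per-dataset
# "q01"/"q99"/"mask" statistics above; dims with mask == false (e.g. the
# binary gripper channel) are passed through unchanged.
import numpy as np

def normalize_action(action: np.ndarray, stats: dict) -> np.ndarray:
    q01 = np.asarray(stats["q01"])
    q99 = np.asarray(stats["q99"])
    mask = np.asarray(stats["mask"], dtype=bool)
    clipped = np.clip(action, q01, q99)
    scaled = 2.0 * (clipped - q01) / np.maximum(q99 - q01, 1e-8) - 1.0
    return np.where(mask, scaled, action)
```

Un-normalization at inference time would invert the same affine map using the statistics of the target dataset.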
example.png ADDED
gaussian_statistic.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "x": {
3
+ "mu": 0.029053575254787405,
4
+ "sigma": 0.35739711346177905
5
+ },
6
+ "y": {
7
+ "mu": 0.02786103774647074,
8
+ "sigma": 0.3559571318187452
9
+ },
10
+ "z": {
11
+ "mu": -0.09711947743976429,
12
+ "sigma": 0.3812679355450573
13
+ },
14
+ "theta": {
15
+ "mu": 1.876412496914298,
16
+ "sigma": 0.7796698531704824
17
+ },
18
+ "phi": {
19
+ "mu": 0.11370731154738657,
20
+ "sigma": 1.6770051583708574
21
+ },
22
+ "r": {
23
+ "mu": 0.5388089344232078,
24
+ "sigma": 0.3471899869415345
25
+ },
26
+ "roll": {
27
+ "mu": -0.027831684347476855,
28
+ "sigma": 0.3128015393591425
29
+ },
30
+ "pitch": {
31
+ "mu": 0.024452017209463325,
32
+ "sigma": 0.30003871781999225
33
+ },
34
+ "yaw": {
35
+ "mu": 0.00545067902753645,
36
+ "sigma": 0.33581468270262405
37
+ }
38
+ }
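The `mu`/`sigma` entries per dimension (x, y, z, theta, phi, r, roll, pitch, yaw) look like Gaussian fits of the action distribution; one plausible use is equiprobable binning via the Gaussian CDF. This is a sketch under that assumption only, and `value_to_gaussian_bin` is a hypothetical name:

```python
# Sketch: assign a continuous value to one of `num_bins` bins that are
# equiprobable under N(mu, sigma); mu/sigma come from gaussian_statistic.json.
import json
import numpy as np
from scipy.stats import norm

def value_to_gaussian_bin(value: float, mu: float, sigma: float, num_bins: int = 256) -> int:
    cdf = norm.cdf(value, loc=mu, scale=sigma)  # in (0, 1)
    return int(np.clip(np.floor(cdf * num_bins), 0, num_bins - 1))

stats = json.load(open("gaussian_statistic.json"))
print(value_to_gaussian_bin(0.1, stats["x"]["mu"], stats["x"]["sigma"]))
```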
generation_config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "cache_implementation": "hybrid",
5
+ "eos_token_id": 1,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.47.0"
8
+ }
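These generation defaults (bos=2, eos=1, pad=0, hybrid cache) are read automatically by `transformers`; a minimal check, with `"<repo_id>"` as a placeholder for this model's Hub id:

```python
# Minimal check that the defaults above are what transformers loads.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("<repo_id>")
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # 2 1 0
print(gen_cfg.cache_implementation)  # "hybrid"
```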
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:713ee1bd116c8f77e2f56903d15e6c3f3dff6ea6b12fc71bfcc3bd4a53a2cc2b
3
+ size 4969426016
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8c87abc1b91598ad237c31ee2286c5c783bfb9f4142696d9b1c36e62f634a34
3
+ size 3086476734
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
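The two `*.safetensors` shards plus this index file form the standard sharded-checkpoint layout, where the index maps each weight name to its shard. Loading is a one-liner; the snippet below assumes `trust_remote_code=True` is needed for the custom `modeling_*.py` files and that the config registers an `AutoModel` mapping, with `"<repo_id>"` as a placeholder:

```python
# Sketch of loading the sharded checkpoint; shard resolution is handled by the
# index file. trust_remote_code=True is an assumption for the custom code here.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("<repo_id>", torch_dtype=torch.bfloat16, trust_remote_code=True)
```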
 
modeling_ego3d.py ADDED
@@ -0,0 +1,124 @@
1
+ # MIT License
2
+ # Copyright (c) 2025 IPEC at Shanghai AI Laboratory
3
+ # Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
4
+ # distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
5
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
7
+ # coding=utf-8
8
+
9
+ import torch.utils.checkpoint
10
+ from torch import nn
11
+ from transformers.utils import logging
12
+ import torchvision.transforms.functional as F
13
+ import numpy as np
14
+ import math
15
+
16
+ logger = logging.get_logger(__name__)
17
+
18
+
19
+ class Ego3DPositionEmbeddingMLP(nn.Module):
20
+ """Absolute pos embedding, learned.
21
+ https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
22
+ """
23
+
24
+ def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
25
+ super(Ego3DPositionEmbeddingMLP, self).__init__()
26
+ self.n_freqs = n_freqs
27
+ self.freq_out_channels = in_channels * (2 * n_freqs + 1)
28
+ if logscale:
29
+ freq_bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs)
30
+ else:
31
+ freq_bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)
32
+
33
+ center = torch.tensor([0., 0., 2.]).repeat(in_channels // 3)
34
+ self.register_buffer("freq_bands", freq_bands, persistent=False)
35
+ self.register_buffer("center", center, persistent=False)
36
+
37
+ self.position_embedding_head = nn.Sequential(
38
+ nn.Linear(self.freq_out_channels, num_pos_feats),
39
+ nn.LayerNorm(num_pos_feats),
40
+ nn.ReLU(),
41
+ nn.Linear(num_pos_feats, num_pos_feats),
42
+ )
43
+ self._reset_parameters()
44
+
45
+ def _reset_parameters(self):
46
+ """init with small weights to maintain stable training."""
47
+ for p in self.parameters():
48
+ if p.dim() > 1:
49
+ nn.init.xavier_uniform_(p, gain=0.01)
50
+
51
+ @torch.no_grad()
52
+ def frequency_encoding(self, xyz):
53
+ """
54
+ Embeds x to (x, sin(2^k x), cos(2^k x), ...)
55
+ Different from the paper, "x" is also in the output
56
+ See https://github.com/bmild/nerf/issues/12
57
+ x \in [-2, 2]
58
+ y \in [-2, 2]
59
+ z \in [0., 4]
60
+ Inputs:
61
+ x: (b n m)
62
+ Outputs:
63
+ out: (b n o)
64
+ """
65
+ xyz_n = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
66
+ xyz_feq = xyz_n.unsqueeze(-1) * self.freq_bands # (b n m 1)
67
+ sin_xyz, cos_xyz = torch.sin(xyz_feq), torch.cos(xyz_feq) # (b n m nf)
68
+ encoding = torch.cat([xyz_n.unsqueeze(-1), sin_xyz, cos_xyz], -1).reshape(*xyz.shape[:2], -1)
69
+ return encoding
70
+
71
+ def forward(self, xyz):
72
+ """Forward pass, xyz is (B, N, 3or6), output (B, N, F)."""
73
+ # TODO: encoding with 3D position
74
+ freq_encoding = self.frequency_encoding(xyz)
75
+ position_embedding = self.position_embedding_head(freq_encoding)
76
+ return position_embedding
77
+
78
+
79
+ def get_resize_output_image_size(
80
+ input_height: int,
81
+ input_width: int,
82
+ output_size: tuple = (384, 512),
83
+ keep_aspect_ratio: bool = True,
84
+ multiple: int = 32,
85
+ ):
86
+ def constrain_to_multiple_of(val, multiple, min_val=0):
87
+ x = (np.round(val / multiple) * multiple).astype(int)
88
+ if x < min_val:
89
+ x = math.ceil(val / multiple) * multiple
90
+ return x
91
+
92
+ output_height, output_width = output_size
93
+ scale_height = output_height / input_height
94
+ scale_width = output_width / input_width
95
+
96
+ if keep_aspect_ratio:
97
+ # scale as little as possible
98
+ if abs(1 - scale_width) < abs(1 - scale_height):
99
+ scale_height = scale_width
100
+ else:
101
+ scale_width = scale_height
102
+
103
+ new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
104
+ new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
105
+
106
+ return (int(new_height), int(new_width))
107
+
108
+
109
+ def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
110
+ """https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py"""
111
+ # h, w = images.shape[-2:]
112
+ # pad images
113
+ ph, pw = 31, 31 # int((h / 2)**0.5 * 3), int((w / 2)**0.5 * 3) # 32, 31
114
+ images = torch.nn.functional.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)
115
+
116
+ # resize images
117
+ size = (384, 384) # get_resize_output_image_size(h, w, output_size=output_size, keep_aspect_ratio=True, multiple=32) # 384, 384
118
+ images = torch.nn.functional.interpolate(images, size=size, mode="bicubic", align_corners=True)
119
+
120
+ # NOTE: zoe: padding -> resize -> normalize.
121
+ # BUT: the siglip processor receives normalized images, so we simply follow `normalize -> padding -> resize` in reflect pad mode
122
+ ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
123
+ images = F.normalize(images, mean=ZOE_MEAN, std=ZOE_STD)
124
+ return images, ph, pw
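A quick usage sketch for `Ego3DPositionEmbeddingMLP` as defined above: it frequency-encodes a batch of 3D points NeRF-style and projects them to `num_pos_feats` channels.

```python
# Usage sketch: (B, N, 3) points -> (B, N, num_pos_feats) position embeddings.
import torch

pos_mlp = Ego3DPositionEmbeddingMLP(in_channels=3, num_pos_feats=768, n_freqs=8)
# x, y roughly in [-2, 2] and z in [0, 4], matching the docstring above
xyz = torch.rand(2, 196, 3) * 4.0 - torch.tensor([2.0, 2.0, 0.0])
emb = pos_mlp(xyz)
print(emb.shape)  # torch.Size([2, 196, 768])
```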
modeling_gemma2.py ADDED
@@ -0,0 +1,1286 @@
1
+ # custom gemma2 to support flash_attention_2
2
+ # coding=utf-8
3
+ # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ from typing import List, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from transformers.activations import ACT2FN
24
+ from transformers.cache_utils import Cache, HybridCache
25
+ from transformers.generation import GenerationMixin
26
+ from transformers.modeling_outputs import (
27
+ BaseModelOutputWithPast,
28
+ CausalLMOutputWithPast,
29
+ SequenceClassifierOutputWithPast,
30
+ TokenClassifierOutput,
31
+ )
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.utils import (
34
+ add_code_sample_docstrings,
35
+ add_start_docstrings,
36
+ add_start_docstrings_to_model_forward,
37
+ is_flash_attn_2_available,
38
+ is_flash_attn_greater_or_equal,
39
+ is_torch_greater_or_equal,
40
+ logging,
41
+ replace_return_docstrings,
42
+ is_flash_attn_greater_or_equal_2_10,
43
+ )
44
+ from transformers import Gemma2Config
45
+
46
+
47
+ if is_flash_attn_2_available():
48
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
49
+
50
+ if is_torch_greater_or_equal("2.5"):
51
+ from torch.nn.attention.flex_attention import flex_attention
52
+
53
+ logger = logging.get_logger(__name__)
54
+
55
+
56
+ _CHECKPOINT_FOR_DOC = "google/gemma2-7b"
57
+ _CONFIG_FOR_DOC = "Gemma2Config"
58
+
59
+
60
+ class Gemma2RMSNorm(nn.Module):
61
+ def __init__(self, dim: int, eps: float = 1e-6):
62
+ super().__init__()
63
+ self.eps = eps
64
+ self.weight = nn.Parameter(torch.zeros(dim))
65
+
66
+ def _norm(self, x):
67
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
68
+
69
+ def forward(self, x):
70
+ output = self._norm(x.float())
71
+ # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
72
+ # See https://github.com/huggingface/transformers/pull/29402
73
+ output = output * (1.0 + self.weight.float())
74
+ return output.type_as(x)
75
+
76
+ def extra_repr(self):
77
+ return f"{tuple(self.weight.shape)}, eps={self.eps}"
78
+
79
+
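The `(1.0 + self.weight)` scaling above is the detail that distinguishes Gemma2's RMSNorm from the Llama-style variant; with the zero-initialized weight the layer starts out as plain RMS normalization. A minimal standalone sketch with toy shapes (not this module's API):

```python
import torch

def gemma2_rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize in float32, then scale by (1 + weight), matching the forward above.
    out = x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + eps)
    return (out * (1.0 + weight.float())).type_as(x)

x = torch.randn(2, 4, 8, dtype=torch.bfloat16)
w = torch.zeros(8)          # zero-initialized, as in __init__ above
y = gemma2_rms_norm(x, w)
print(y.shape, y.dtype)     # torch.Size([2, 4, 8]) torch.bfloat16
```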
80
+ class Gemma2MLP(nn.Module):
81
+ def __init__(self, config):
82
+ super().__init__()
83
+ self.config = config
84
+ self.hidden_size = config.hidden_size
85
+ self.intermediate_size = config.intermediate_size
86
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
87
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
88
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
89
+ self.act_fn = ACT2FN[config.hidden_activation]
90
+
91
+ def forward(self, x):
92
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
93
+
94
+
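The MLP above is the gated (GeGLU-style) feed-forward used throughout Gemma2: `down(act(gate(x)) * up(x))`. A self-contained sketch with made-up toy sizes, assuming the default `gelu_pytorch_tanh` activation:

```python
import torch
import torch.nn as nn

hidden, inter = 8, 16                      # toy sizes, for illustration only
gate = nn.Linear(hidden, inter, bias=False)
up = nn.Linear(hidden, inter, bias=False)
down = nn.Linear(inter, hidden, bias=False)
act = nn.GELU(approximate="tanh")          # same math as ACT2FN["gelu_pytorch_tanh"]

x = torch.randn(2, 3, hidden)
y = down(act(gate(x)) * up(x))             # same dataflow as Gemma2MLP.forward
print(y.shape)                             # torch.Size([2, 3, 8])
```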
95
+ class Gemma2RotaryEmbedding(nn.Module):
96
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
97
+ super().__init__()
98
+
99
+ self.dim = dim
100
+ self.max_position_embeddings = max_position_embeddings
101
+ self.base = base
102
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
103
+ self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)
104
+
105
+ @torch.no_grad()
106
+ def forward(self, x, position_ids, seq_len=None):
107
+ # x: [bs, num_attention_heads, seq_len, head_size]
108
+ self.inv_freq.to(x.device)
109
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
110
+ position_ids_expanded = position_ids[:, None, :].float()
111
+ # Force float32 since bfloat16 loses precision on long contexts
112
+ # See https://github.com/huggingface/transformers/pull/29285
113
+ device_type = x.device.type
114
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
115
+ with torch.autocast(device_type=device_type, enabled=False):
116
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
117
+ emb = torch.cat((freqs, freqs), dim=-1)
118
+ cos = emb.cos()
119
+ sin = emb.sin()
120
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
121
+
122
+
123
+ def rotate_half(x):
124
+ """Rotates half the hidden dims of the input."""
125
+ x1 = x[..., : x.shape[-1] // 2]
126
+ x2 = x[..., x.shape[-1] // 2 :]
127
+ return torch.cat((-x2, x1), dim=-1)
128
+
129
+
130
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
131
+ """Applies Rotary Position Embedding to the query and key tensors.
132
+
133
+ Args:
134
+ q (`torch.Tensor`): The query tensor.
135
+ k (`torch.Tensor`): The key tensor.
136
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
137
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
138
+ position_ids (`torch.Tensor`, *optional*):
139
+ Deprecated and unused.
140
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
141
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
142
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
143
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
144
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
145
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
146
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
147
+ Returns:
148
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
149
+ """
150
+ cos = cos.unsqueeze(unsqueeze_dim)
151
+ sin = sin.unsqueeze(unsqueeze_dim)
152
+ q_embed = (q * cos) + (rotate_half(q) * sin)
153
+ k_embed = (k * cos) + (rotate_half(k) * sin)
154
+ return q_embed, k_embed
155
+
156
+
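`apply_rotary_pos_emb` rotates each (first-half, second-half) pair of channels by a position-dependent angle, so it changes the direction but never the norm of a query/key vector. A small numerical check with toy shapes (values are illustrative only):

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(1, 2, 5, 4)                        # [batch, heads, seq, head_dim]
pos = torch.arange(5)[None].float()                # [batch, seq]
inv_freq = 1.0 / (10000 ** (torch.arange(0, 4, 2).float() / 4))
freqs = pos[..., None] * inv_freq                  # [1, 5, 2]
emb = torch.cat((freqs, freqs), dim=-1)            # [1, 5, 4]
cos, sin = emb.cos()[:, None], emb.sin()[:, None]  # unsqueeze the head dim

q_rot = q * cos + rotate_half(q) * sin             # same formula as above
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True
```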
157
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
158
+ """
159
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
160
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
161
+ """
162
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
163
+ if n_rep == 1:
164
+ return hidden_states
165
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
166
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
167
+
168
+
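`repeat_kv` is the grouped-query-attention helper: a small number of key/value heads is tiled so that every query head has a matching slice. A quick shape check (toy sizes, with the helper copied from above into a standalone sketch):

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

k = torch.randn(2, 4, 10, 64)                   # 4 key/value heads
k_full = repeat_kv(k, n_rep=4)                  # shared by 16 query heads
print(k_full.shape)                             # torch.Size([2, 16, 10, 64])
print(torch.equal(k_full[:, 0], k_full[:, 3]))  # True: query heads 0-3 reuse kv head 0
```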
169
+ def eager_attention_forward(
170
+ config: Gemma2Config,
171
+ query: torch.Tensor,
172
+ key: torch.Tensor,
173
+ value: torch.Tensor,
174
+ mask: Optional[torch.Tensor],
175
+ **_kwargs,
176
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
177
+ key_states = repeat_kv(key, config.num_key_value_groups)
178
+ value_states = repeat_kv(value, config.num_key_value_groups)
179
+
180
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
181
+
182
+ if config.attn_logit_softcapping is not None:
183
+ attn_weights = attn_weights / config.attn_logit_softcapping
184
+ attn_weights = torch.tanh(attn_weights)
185
+ attn_weights = attn_weights * config.attn_logit_softcapping
186
+ if mask is not None: # no matter the length, we just slice it
187
+ causal_mask = mask[:, :, :, : key_states.shape[-2]]
188
+ attn_weights = attn_weights + causal_mask
189
+
190
+ # upcast attention to fp32
191
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
192
+ attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
193
+ attn_output = torch.matmul(attn_weights, value_states)
194
+ attn_output = attn_output.transpose(1, 2).contiguous()
195
+ return attn_output, attn_weights
196
+
197
+
198
+ def flash_attention_forward(
199
+ config: Gemma2Config,
200
+ query: torch.Tensor,
201
+ key: torch.Tensor,
202
+ value: torch.Tensor,
203
+ mask: Optional[torch.Tensor],
204
+ target_dtype: torch.dtype = torch.float16,
205
+ **_kwargs,
206
+ ) -> Tuple[torch.Tensor, None]:
207
+ # NOTE: a None mask causes undefined behavior, see https://github.com/huggingface/transformers/blob/c8c8dffbe45ebef0a8dba4a51024e5e5e498596b/src/transformers/models/gemma2/modeling_gemma2.py#L211
208
+ seq_len = query.shape[2]
209
+ # print(f"🔥 query {query.shape}, key {key.shape}, value: {value.shape}")
210
+ if mask is not None:
211
+ # print(f"🔥 mask {mask.shape}")
212
+ # seq_len = mask.shape[1]
213
+ query = query[:, :, :seq_len]
214
+ value = value[:, :, :seq_len]
215
+
216
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout
217
+ # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
218
+ query_states = query.transpose(1, 2)
219
+ key_states = key.transpose(1, 2)
220
+ value_states = value.transpose(1, 2)
221
+
222
+ dropout_rate = config.attention_dropout if config.training else 0.0
223
+
224
+ input_dtype = query_states.dtype
225
+ if input_dtype == torch.float32:
226
+ query_states = query_states.to(target_dtype)
227
+ key_states = key_states.to(target_dtype)
228
+ value_states = value_states.to(target_dtype)
229
+
230
+ attn_output = _flash_attention_forward(
231
+ query_states,
232
+ key_states,
233
+ value_states,
234
+ mask,
235
+ seq_len,
236
+ dropout=dropout_rate,
237
+ softmax_scale=config.scaling,
238
+ is_causal=config.is_causal,
239
+ sliding_window=config.sliding_window,
240
+ use_top_left_mask=config._flash_attn_uses_top_left_mask,
241
+ softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
242
+ )
243
+
244
+ return attn_output, None
245
+
246
+
247
+ def flex_attention_forward(
248
+ config: Gemma2Config,
249
+ query: torch.Tensor,
250
+ key: torch.Tensor,
251
+ value: torch.Tensor,
252
+ mask: Optional[torch.Tensor],
253
+ output_attentions: bool = False,
254
+ **_kwargs,
255
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
256
+ def tanh_softcap(score, b, h, q_idx, kv_idx):
257
+ soft_cap = config.attn_logit_softcapping
258
+ score = soft_cap * torch.tanh(score / soft_cap)
259
+ if mask is not None:
260
+ return score + mask[b][0][q_idx][kv_idx]
261
+ return score
262
+
263
+ attn_output = flex_attention(
264
+ query,
265
+ key,
266
+ value,
267
+ score_mod=tanh_softcap,
268
+ enable_gqa=True,
269
+ scale=config.scaling,
270
+ return_lse=output_attentions,
271
+ )
272
+ if not output_attentions:
273
+ attn_weights = None
274
+ else:
275
+ attn_output, attn_weights = attn_output
276
+
277
+ attn_output = attn_output.transpose(1, 2).contiguous()
278
+ return attn_output, attn_weights
279
+
280
+
281
+ def sdpa_attention_forward(
282
+ config: Gemma2Config,
283
+ query: torch.Tensor,
284
+ key: torch.Tensor,
285
+ value: torch.Tensor,
286
+ mask: Optional[torch.Tensor],
287
+ **_kwargs,
288
+ ) -> Tuple[torch.Tensor, None]:
289
+ key = repeat_kv(key, config.num_key_value_groups)
290
+ value = repeat_kv(value, config.num_key_value_groups)
291
+
292
+ causal_mask = mask
293
+ if mask is not None:
294
+ causal_mask = causal_mask[:, :, :, : key.shape[-2]]
295
+
296
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
297
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
298
+ if query.device.type == "cuda" and causal_mask is not None:
299
+ query = query.contiguous()
300
+ key = key.contiguous()
301
+ value = value.contiguous()
302
+
303
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
304
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
305
+ is_causal = True if causal_mask is None and query.shape[1] > 1 else False
306
+
307
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
308
+ query,
309
+ key,
310
+ value,
311
+ attn_mask=causal_mask,
312
+ dropout_p=config.attention_dropout if config.training else 0.0,
313
+ is_causal=is_causal,
314
+ scale=config.scaling,
315
+ )
316
+ attn_output = attn_output.transpose(1, 2).contiguous()
317
+ return attn_output, None
318
+
319
+
320
+ GEMMA2_ATTENTION_FUNCTION = {
321
+ "flash_attention_2": flash_attention_forward,
322
+ "flex_attention": flex_attention_forward,
323
+ "eager": eager_attention_forward,
324
+ "sdpa": sdpa_attention_forward,
325
+ }
326
+
327
+
328
+ class Gemma2Attention(nn.Module):
329
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
330
+
331
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
332
+ super().__init__()
333
+ self.config = config
334
+ self.layer_idx = layer_idx
335
+
336
+ self.attention_dropout = config.attention_dropout
337
+ self.hidden_size = config.hidden_size
338
+ self.num_heads = config.num_attention_heads
339
+ self.head_dim = config.head_dim
340
+ self.num_key_value_heads = config.num_key_value_heads
341
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
342
+ self.max_position_embeddings = config.max_position_embeddings
343
+ self.rope_theta = config.rope_theta
344
+ self.is_causal = True
345
+ self.scaling = config.query_pre_attn_scalar**-0.5
346
+ self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
347
+ self.attn_logit_softcapping = config.attn_logit_softcapping
348
+ if self.hidden_size % self.num_heads != 0:
349
+ raise ValueError(
350
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
351
+ f" and `num_heads`: {self.num_heads})."
352
+ )
353
+
354
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
355
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
356
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
357
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
358
+ self.rotary_emb = Gemma2RotaryEmbedding(
359
+ self.head_dim,
360
+ max_position_embeddings=self.max_position_embeddings,
361
+ base=self.rope_theta,
362
+ )
363
+
364
+ # NOTE: Gemma2 does not include _flash_attn_uses_top_left_mask for flash attention
365
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
366
+
367
+ def forward(
368
+ self,
369
+ hidden_states: torch.Tensor,
370
+ attention_mask: Optional[torch.Tensor] = None,
371
+ position_ids: Optional[torch.LongTensor] = None,
372
+ past_key_value: Optional[Cache] = None,
373
+ output_attentions: bool = False,
374
+ use_cache: bool = False,
375
+ cache_position: Optional[torch.LongTensor] = None,
376
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
377
+ bsz, q_len, _ = hidden_states.size()
378
+
379
+ query_states = self.q_proj(hidden_states)
380
+ key_states = self.k_proj(hidden_states)
381
+ value_states = self.v_proj(hidden_states)
382
+
383
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
384
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
385
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
386
+
387
+ cos, sin = self.rotary_emb(value_states, position_ids)
388
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
389
+
390
+ if past_key_value is not None:
391
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
392
+ cache_kwargs = {
393
+ "sin": sin,
394
+ "cos": cos,
395
+ "sliding_window": self.sliding_window,
396
+ "cache_position": cache_position,
397
+ }
398
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
399
+
400
+ if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
401
+ logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
402
+ attention_type = "flex_attention"
403
+ else:
404
+ attention_type = self.config._attn_implementation
405
+
406
+ attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
407
+ self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
408
+ )
409
+
410
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
411
+ attn_output = self.o_proj(attn_output)
412
+
413
+ if not output_attentions:
414
+ attn_weights = None
415
+
416
+ return attn_output, attn_weights, past_key_value
417
+
418
+
419
+ class Gemma2FlashAttention2(Gemma2Attention):
420
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
421
+ super().__init__(config, layer_idx)
422
+ self.config._attn_implementation = "flash_attention_2"
423
+ logger.warning_once(
424
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
425
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
426
+ )
427
+
428
+
429
+ class Gemma2SdpaAttention(Gemma2Attention):
430
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
431
+ super().__init__(config, layer_idx)
432
+ self.config._attn_implementation = "sdpa"
433
+ logger.warning_once(
434
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
435
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
436
+ )
437
+
438
+
439
+ class Gemma2DecoderLayer(nn.Module):
440
+ def __init__(self, config: Gemma2Config, layer_idx: int):
441
+ super().__init__()
442
+ self.hidden_size = config.hidden_size
443
+ self.config = config
444
+ self.is_sliding = not bool(layer_idx % 2)
445
+ self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
446
+ self.mlp = Gemma2MLP(config)
447
+ self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
448
+ self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
449
+
450
+ self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
451
+ self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
452
+ self.sliding_window = config.sliding_window
453
+
454
+ def forward(
455
+ self,
456
+ hidden_states: torch.Tensor,
457
+ attention_mask: Optional[torch.Tensor] = None,
458
+ position_ids: Optional[torch.LongTensor] = None,
459
+ past_key_value: Optional[Cache] = None,
460
+ output_attentions: Optional[bool] = False,
461
+ use_cache: Optional[bool] = False,
462
+ cache_position: Optional[torch.LongTensor] = None,
463
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
464
+ if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
465
+ # With flash_attention_2, the attention mask is a 2D tensor
466
+ if self.config._attn_implementation == "flash_attention_2":
467
+ if past_key_value is not None: # when decoding
468
+ attention_mask = attention_mask[:, -self.sliding_window :]
469
+ else:
470
+ min_dtype = torch.finfo(hidden_states.dtype).min
471
+ sliding_window_mask = torch.tril(
472
+ torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
473
+ )
474
+ attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
475
+ if attention_mask.shape[-1] <= 1: # when decoding
476
+ attention_mask = attention_mask[:, :, :, -self.sliding_window :]
477
+
478
+ residual = hidden_states
479
+
480
+ hidden_states = self.input_layernorm(hidden_states)
481
+
482
+ # Self Attention
483
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
484
+ hidden_states=hidden_states,
485
+ attention_mask=attention_mask,
486
+ position_ids=position_ids,
487
+ past_key_value=past_key_value,
488
+ output_attentions=output_attentions,
489
+ use_cache=use_cache,
490
+ cache_position=cache_position,
491
+ )
492
+ hidden_states = self.post_attention_layernorm(hidden_states)
493
+ hidden_states = residual + hidden_states
494
+
495
+ residual = hidden_states
496
+ hidden_states = self.pre_feedforward_layernorm(hidden_states)
497
+ hidden_states = self.mlp(hidden_states)
498
+ hidden_states = self.post_feedforward_layernorm(hidden_states)
499
+ hidden_states = residual + hidden_states
500
+
501
+ outputs = (hidden_states,)
502
+
503
+ if output_attentions:
504
+ outputs += (self_attn_weights,)
505
+
506
+ if use_cache:
507
+ outputs += (present_key_value,)
508
+
509
+ return outputs
510
+
511
+
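Note how `is_sliding = not bool(layer_idx % 2)` makes the decoder alternate between local sliding-window attention (even layers) and global attention (odd layers). A one-liner that reproduces the layout for a toy 6-layer stack:

```python
layout = ["sliding" if not bool(i % 2) else "global" for i in range(6)]
print(layout)  # ['sliding', 'global', 'sliding', 'global', 'sliding', 'global']
```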
512
+ GEMMA2_START_DOCSTRING = r"""
513
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
514
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
515
+ etc.)
516
+
517
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
518
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
519
+ and behavior.
520
+
521
+ Parameters:
522
+ config ([`Gemma2Config`]):
523
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
524
+ load the weights associated with the model, only the configuration. Check out the
525
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
526
+ """
527
+
528
+
529
+ @add_start_docstrings(
530
+ "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
531
+ GEMMA2_START_DOCSTRING,
532
+ )
533
+ class Gemma2PreTrainedModel(PreTrainedModel):
534
+ config_class = Gemma2Config
535
+ base_model_prefix = "model"
536
+ supports_gradient_checkpointing = True
537
+ _no_split_modules = ["Gemma2DecoderLayer"]
538
+ _skip_keys_device_placement = ["past_key_values"]
539
+ _supports_flash_attn_2 = True
540
+ _supports_sdpa = True
541
+ _supports_cache_class = True
542
+ _supports_quantized_cache = False
543
+ _supports_static_cache = True
544
+
545
+ def _init_weights(self, module):
546
+ std = self.config.initializer_range
547
+ if isinstance(module, nn.Linear):
548
+ module.weight.data.normal_(mean=0.0, std=std)
549
+ if module.bias is not None:
550
+ module.bias.data.zero_()
551
+ elif isinstance(module, nn.Embedding):
552
+ module.weight.data.normal_(mean=0.0, std=std)
553
+ if module.padding_idx is not None:
554
+ module.weight.data[module.padding_idx].zero_()
555
+
556
+ @classmethod
557
+ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
558
+ """
559
+ Overloads `PreTrainedModel._check_and_enable_sdpa` so as to DISABLE torch SDPA by default on Gemma2 models.
560
+ SDPA reduces the model performance on Gemma2 because of the logits softcapping.
561
+ """
562
+ config = super()._check_and_enable_sdpa(config, hard_check_only=hard_check_only)
563
+
564
+ # if using the default path -> swap sdpa by eager
565
+ if not hard_check_only and config._attn_implementation == "sdpa":
566
+ config._attn_implementation = "eager"
567
+
568
+ return config
569
+
570
+
571
+ GEMMA2_INPUTS_DOCSTRING = r"""
572
+ Args:
573
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
574
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
575
+ it.
576
+
577
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
578
+ [`PreTrainedTokenizer.__call__`] for details.
579
+
580
+ [What are input IDs?](../glossary#input-ids)
581
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
582
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
583
+
584
+ - 1 for tokens that are **not masked**,
585
+ - 0 for tokens that are **masked**.
586
+
587
+ [What are attention masks?](../glossary#attention-mask)
588
+
589
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
590
+ [`PreTrainedTokenizer.__call__`] for details.
591
+
592
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
593
+ `past_key_values`).
594
+
595
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
596
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
597
+ information on the default strategy.
598
+
599
+ - 1 indicates the head is **not masked**,
600
+ - 0 indicates the head is **masked**.
601
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
602
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
603
+ config.n_positions - 1]`.
604
+
605
+ [What are position IDs?](../glossary#position-ids)
606
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
607
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
608
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
609
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
610
+
611
+ Two formats are allowed:
612
+ - a [`~cache_utils.Cache`] instance, see our
613
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
614
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
615
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
616
+ cache format.
617
+
618
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
619
+ legacy cache format will be returned.
620
+
621
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
622
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
623
+ of shape `(batch_size, sequence_length)`.
624
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
625
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
626
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
627
+ model's internal embedding lookup matrix.
628
+ use_cache (`bool`, *optional*):
629
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
630
+ `past_key_values`).
631
+ output_attentions (`bool`, *optional*):
632
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
633
+ tensors for more detail.
634
+ output_hidden_states (`bool`, *optional*):
635
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
636
+ more detail.
637
+ return_dict (`bool`, *optional*):
638
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
639
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
640
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
641
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
642
+ the complete sequence length.
643
+ """
644
+
645
+
646
+ @add_start_docstrings(
647
+ "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
648
+ GEMMA2_START_DOCSTRING,
649
+ )
650
+ class Gemma2Model(Gemma2PreTrainedModel):
651
+ """
652
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Gemma2DecoderLayer`]
653
+
654
+ Args:
655
+ config: Gemma2Config
656
+ """
657
+
658
+ def __init__(self, config: Gemma2Config):
659
+ super().__init__(config)
660
+ self.padding_idx = config.pad_token_id
661
+ self.vocab_size = config.vocab_size
662
+
663
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
664
+ self.layers = nn.ModuleList(
665
+ [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
666
+ )
667
+ self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
668
+
669
+ self.gradient_checkpointing = False
670
+ if getattr(config, "pretraining_tp", 1) != 1:
671
+ logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
672
+
673
+ # Initialize weights and apply final processing
674
+ self.post_init()
675
+
676
+ def get_input_embeddings(self):
677
+ return self.embed_tokens
678
+
679
+ def set_input_embeddings(self, value):
680
+ self.embed_tokens = value
681
+
682
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
683
+ def forward(
684
+ self,
685
+ input_ids: torch.LongTensor = None,
686
+ attention_mask: Optional[torch.Tensor] = None,
687
+ position_ids: Optional[torch.LongTensor] = None,
688
+ past_key_values: Optional[HybridCache] = None,
689
+ inputs_embeds: Optional[torch.FloatTensor] = None,
690
+ use_cache: Optional[bool] = None,
691
+ output_attentions: Optional[bool] = None,
692
+ output_hidden_states: Optional[bool] = None,
693
+ return_dict: Optional[bool] = None,
694
+ cache_position: Optional[torch.LongTensor] = None,
695
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
696
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
697
+ output_hidden_states = (
698
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
699
+ )
700
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
701
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
702
+
703
+ if (input_ids is None) ^ (inputs_embeds is not None):
704
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
705
+
706
+ if self.gradient_checkpointing and self.training and use_cache:
707
+ logger.warning_once(
708
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
709
+ )
710
+ use_cache = False
711
+
712
+ if inputs_embeds is None:
713
+ inputs_embeds = self.embed_tokens(input_ids)
714
+
715
+ if use_cache and past_key_values is None and not self.training:
716
+ batch_size, seq_len, _ = inputs_embeds.shape
717
+ past_key_values = HybridCache(
718
+ self.config,
719
+ batch_size=batch_size,
720
+ max_cache_len=seq_len,
721
+ device=self.device,
722
+ dtype=inputs_embeds.dtype,
723
+ )
724
+
725
+ if cache_position is None:
726
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
727
+ cache_position = torch.arange(
728
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
729
+ )
730
+
731
+ if position_ids is None:
732
+ position_ids = cache_position.unsqueeze(0)
733
+
734
+ causal_mask = self._update_causal_mask(
735
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
736
+ )
737
+
738
+ # embed positions
739
+ hidden_states = inputs_embeds
740
+
741
+ # normalized
742
+ # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
743
+ # See https://github.com/huggingface/transformers/pull/29402
744
+ normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
745
+ hidden_states = hidden_states * normalizer
746
+
747
+ # decoder layers
748
+ all_hidden_states = () if output_hidden_states else None
749
+ all_self_attns = () if output_attentions else None
750
+
751
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
752
+ if output_hidden_states:
753
+ all_hidden_states += (hidden_states,)
754
+
755
+ if self.gradient_checkpointing and self.training:
756
+ layer_outputs = self._gradient_checkpointing_func(
757
+ decoder_layer.__call__,
758
+ hidden_states,
759
+ causal_mask,
760
+ position_ids,
761
+ past_key_values,
762
+ output_attentions,
763
+ use_cache,
764
+ cache_position,
765
+ )
766
+ else:
767
+ layer_outputs = decoder_layer(
768
+ hidden_states,
769
+ attention_mask=causal_mask,
770
+ position_ids=position_ids,
771
+ past_key_value=past_key_values,
772
+ output_attentions=output_attentions,
773
+ use_cache=use_cache,
774
+ cache_position=cache_position,
775
+ )
776
+
777
+ hidden_states = layer_outputs[0]
778
+
779
+ if output_attentions:
780
+ all_self_attns += (layer_outputs[1],)
781
+
782
+ hidden_states = self.norm(hidden_states)
783
+
784
+ if output_hidden_states:
785
+ all_hidden_states += (hidden_states,)
786
+
787
+ next_cache = past_key_values if use_cache else None
788
+
789
+ if not return_dict:
790
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
791
+ return BaseModelOutputWithPast(
792
+ last_hidden_state=hidden_states,
793
+ past_key_values=next_cache,
794
+ hidden_states=all_hidden_states,
795
+ attentions=all_self_attns,
796
+ )
797
+
798
+ @torch.no_grad()
799
+ def _update_causal_mask(
800
+ self,
801
+ attention_mask: torch.Tensor,
802
+ input_tensor: torch.Tensor,
803
+ cache_position: torch.Tensor,
804
+ past_key_values: HybridCache,
805
+ output_attentions: bool,
806
+ ):
807
+ # Flash Attention currently doesn't support static cache, but Gemma2 works only with static cache.
808
+ # So we will pass in the attention mask as is in any case, not only when there's padding. Then we'll use its shape
809
+ # to cut out keys/values trailing 0 used in static cache. This workaround should be compile compatible
810
+ # as it doesn't cause dynamic control issues.
811
+ if self.config._attn_implementation == "flash_attention_2":
812
+ return attention_mask
813
+
814
+ dtype, device = input_tensor.dtype, input_tensor.device
815
+ sequence_length = input_tensor.shape[1]
816
+ if isinstance(past_key_values, HybridCache):
817
+ target_length = past_key_values.get_max_cache_shape()
818
+ else:
819
+ target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
820
+
821
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
822
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
823
+ attention_mask,
824
+ sequence_length=sequence_length,
825
+ target_length=target_length,
826
+ dtype=dtype,
827
+ device=device,
828
+ cache_position=cache_position,
829
+ batch_size=input_tensor.shape[0],
830
+ )
831
+ return causal_mask
832
+
833
+ @staticmethod
834
+ def _prepare_4d_causal_attention_mask_with_cache_position(
835
+ attention_mask: torch.Tensor,
836
+ sequence_length: int,
837
+ target_length: int,
838
+ dtype: torch.dtype,
839
+ device: torch.device,
840
+ cache_position: torch.Tensor,
841
+ batch_size: int,
842
+ **kwargs,
843
+ ):
844
+ """
845
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
846
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
847
+
848
+ Args:
849
+ attention_mask (`torch.Tensor`):
850
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
851
+ `(batch_size, 1, query_length, key_value_length)`.
852
+ sequence_length (`int`):
853
+ The sequence length being processed.
854
+ target_length (`int`):
855
+ The target length: when generating with static cache, the mask should be as long as the static cache,
856
+ to account for the 0 padding, the part of the cache that is not filled yet.
857
+ dtype (`torch.dtype`):
858
+ The dtype to use for the 4D attention mask.
859
+ device (`torch.device`):
860
+ The device to place the 4D attention mask on.
861
+ cache_position (`torch.Tensor`):
862
+ Indices depicting the position of the input sequence tokens in the sequence.
863
+ batch_size (`int`):
864
+ Batch size.
865
+ """
866
+ if attention_mask is not None and attention_mask.dim() == 4:
867
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
868
+ causal_mask = attention_mask
869
+ else:
870
+ min_dtype = torch.finfo(dtype).min
871
+ causal_mask = torch.full(
872
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
873
+ )
874
+ if sequence_length != 1:
875
+ causal_mask = torch.triu(causal_mask, diagonal=1)
876
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
877
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
878
+ if attention_mask is not None:
879
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
880
+ mask_length = attention_mask.shape[-1]
881
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
882
+ padding_mask = padding_mask == 0
883
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
884
+ padding_mask, min_dtype
885
+ )
886
+
887
+ return causal_mask
888
+
889
+
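The mask builder above turns a 2D padding mask into an additive 4D mask sized to the (possibly longer) static cache. A toy reproduction of that logic, with 3 new tokens written into cache slots 2-4 and the last cache slot padded out (all sizes are illustrative):

```python
import torch

dtype = torch.float32
min_dtype = torch.finfo(dtype).min
sequence_length, target_length, batch_size = 3, 5, 1
cache_position = torch.arange(2, 5)               # new tokens occupy cache slots 2..4
attention_mask = torch.tensor([[1, 1, 1, 1, 0]])  # last cache slot is padding

causal = torch.full((sequence_length, target_length), min_dtype, dtype=dtype)
causal = torch.triu(causal, diagonal=1)
causal *= torch.arange(target_length) > cache_position.reshape(-1, 1)
causal = causal[None, None].expand(batch_size, 1, -1, -1).clone()
padding = (causal[..., :5] + attention_mask[:, None, None, :]) == 0
causal[..., :5] = causal[..., :5].masked_fill(padding, min_dtype)
print((causal == 0).int()[0, 0])   # 1 where attention is allowed, 0 where it is masked
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 0]])
```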
890
+ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
891
+ _tied_weights_keys = ["lm_head.weight"]
892
+ _tp_plan = {"lm_head": "colwise_rep"}
893
+
894
+ def __init__(self, config):
895
+ super().__init__(config)
896
+ self.model = Gemma2Model(config)
897
+ self.vocab_size = config.vocab_size
898
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
899
+
900
+ # Initialize weights and apply final processing
901
+ self.post_init()
902
+
903
+ def get_input_embeddings(self):
904
+ return self.model.embed_tokens
905
+
906
+ def set_input_embeddings(self, value):
907
+ self.model.embed_tokens = value
908
+
909
+ def get_output_embeddings(self):
910
+ return self.lm_head
911
+
912
+ def set_output_embeddings(self, new_embeddings):
913
+ self.lm_head = new_embeddings
914
+
915
+ def set_decoder(self, decoder):
916
+ self.model = decoder
917
+
918
+ def get_decoder(self):
919
+ return self.model
920
+
921
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
922
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
923
+ def forward(
924
+ self,
925
+ input_ids: torch.LongTensor = None,
926
+ attention_mask: Optional[torch.Tensor] = None,
927
+ position_ids: Optional[torch.LongTensor] = None,
928
+ past_key_values: Optional[HybridCache] = None,
929
+ inputs_embeds: Optional[torch.FloatTensor] = None,
930
+ labels: Optional[torch.LongTensor] = None,
931
+ use_cache: Optional[bool] = None,
932
+ output_attentions: Optional[bool] = None,
933
+ output_hidden_states: Optional[bool] = None,
934
+ return_dict: Optional[bool] = None,
935
+ cache_position: Optional[torch.LongTensor] = None,
936
+ num_logits_to_keep: int = 0,
937
+ **loss_kwargs,
938
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
939
+ r"""
940
+ Args:
941
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
942
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
943
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
944
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
945
+
946
+ num_logits_to_keep (`int`, *optional*):
947
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
948
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
949
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
950
+
951
+ Returns:
952
+
953
+ Example:
954
+
955
+ ```python
956
+ >>> from transformers import AutoTokenizer, GemmaForCausalLM
957
+
958
+ >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
959
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
960
+
961
+ >>> prompt = "What is your favorite condiment?"
962
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
963
+
964
+ >>> # Generate
965
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
966
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
967
+ "What is your favorite condiment?"
968
+ ```"""
969
+
970
+ if self.training and self.config._attn_implementation != "eager":
971
+ logger.warning_once(
972
+ "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
973
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
974
+ )
975
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
976
+ output_hidden_states = (
977
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
978
+ )
979
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
980
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
981
+ outputs = self.model(
982
+ input_ids=input_ids,
983
+ attention_mask=attention_mask,
984
+ position_ids=position_ids,
985
+ past_key_values=past_key_values,
986
+ inputs_embeds=inputs_embeds,
987
+ use_cache=use_cache,
988
+ output_attentions=output_attentions,
989
+ output_hidden_states=output_hidden_states,
990
+ return_dict=return_dict,
991
+ cache_position=cache_position,
992
+ )
993
+
994
+ hidden_states = outputs[0]
995
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
996
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
997
+ if self.config.final_logit_softcapping is not None:
998
+ logits = logits / self.config.final_logit_softcapping
999
+ logits = torch.tanh(logits)
1000
+ logits = logits * self.config.final_logit_softcapping
1001
+
1002
+ loss = None
1003
+ if labels is not None:
1004
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
1005
+
1006
+ if not return_dict:
1007
+ output = (logits,) + outputs[1:]
1008
+ return (loss,) + output if loss is not None else output
1009
+
1010
+ return CausalLMOutputWithPast(
1011
+ loss=loss,
1012
+ logits=logits,
1013
+ past_key_values=outputs.past_key_values,
1014
+ hidden_states=outputs.hidden_states,
1015
+ attentions=outputs.attentions,
1016
+ )
1017
+
1018
+ def prepare_inputs_for_generation(
1019
+ self,
1020
+ input_ids,
1021
+ past_key_values=None,
1022
+ attention_mask=None,
1023
+ inputs_embeds=None,
1024
+ cache_position=None,
1025
+ position_ids=None,
1026
+ use_cache=True,
1027
+ num_logits_to_keep=None,
1028
+ **kwargs,
1029
+ ):
1030
+ # Overwritten: has a special cache type, `HybridCache`
1031
+
1032
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1033
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1034
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1035
+ if past_key_values is not None:
1036
+ if inputs_embeds is not None: # Exception 1
1037
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1038
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1039
+ input_ids = input_ids[:, cache_position]
1040
+ if attention_mask is not None and position_ids is None:
1041
+ # create position_ids on the fly for batch generation
1042
+ position_ids = attention_mask.long().cumsum(-1) - 1
1043
+ position_ids.masked_fill_(attention_mask == 0, 1)
1044
+ if past_key_values:
1045
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1046
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
1047
+ # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
1048
+ # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
1049
+ # batch size = 1 case, `position_ids` is already contiguous but with varying stride
1050
+ # which retriggers a capture.
1051
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1052
+
1053
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1054
+ if inputs_embeds is not None and cache_position[0] == 0:
1055
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1056
+ else:
1057
+ # The clone here is for the same reason as for `position_ids`.
1058
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
1059
+
1060
+ if (
1061
+ isinstance(past_key_values, HybridCache)
1062
+ and attention_mask.ndim == 2
1063
+ and not self.config._attn_implementation == "flash_attention_2"
1064
+ ):
1065
+ if model_inputs["inputs_embeds"] is not None:
1066
+ batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
1067
+ device = model_inputs["inputs_embeds"].device
1068
+ else:
1069
+ batch_size, sequence_length = model_inputs["input_ids"].shape
1070
+ device = model_inputs["input_ids"].device
1071
+
1072
+ attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
1073
+ attention_mask,
1074
+ sequence_length=sequence_length,
1075
+ target_length=past_key_values.get_max_cache_shape(),
1076
+ dtype=self.lm_head.weight.dtype,
1077
+ device=device,
1078
+ cache_position=cache_position,
1079
+ batch_size=batch_size,
1080
+ )
1081
+
1082
+ if num_logits_to_keep is not None:
1083
+ model_inputs["num_logits_to_keep"] = num_logits_to_keep
1084
+
1085
+ model_inputs.update(
1086
+ {
1087
+ "position_ids": position_ids,
1088
+ "cache_position": cache_position,
1089
+ "past_key_values": past_key_values,
1090
+ "use_cache": use_cache,
1091
+ "attention_mask": attention_mask,
1092
+ }
1093
+ )
1094
+ return model_inputs
1095
+
1096
+
1097
+ @add_start_docstrings(
1098
+ """
1099
+ The Gemma2 Model transformer with a sequence classification head on top (linear layer).
1100
+
1101
+ [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1102
+ (e.g. GPT-2) do.
1103
+
1104
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1105
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1106
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1107
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1108
+ each row of the batch).
1109
+ """,
1110
+ GEMMA2_START_DOCSTRING,
1111
+ )
1112
+ class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
1113
+ def __init__(self, config):
1114
+ super().__init__(config)
1115
+ self.num_labels = config.num_labels
1116
+ self.model = Gemma2Model(config)
1117
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1118
+
1119
+ # Initialize weights and apply final processing
1120
+ self.post_init()
1121
+
1122
+ def get_input_embeddings(self):
1123
+ return self.model.embed_tokens
1124
+
1125
+ def set_input_embeddings(self, value):
1126
+ self.model.embed_tokens = value
1127
+
1128
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
1129
+ def forward(
1130
+ self,
1131
+ input_ids: Optional[torch.LongTensor] = None,
1132
+ attention_mask: Optional[torch.Tensor] = None,
1133
+ position_ids: Optional[torch.LongTensor] = None,
1134
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1135
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1136
+ labels: Optional[torch.LongTensor] = None,
1137
+ use_cache: Optional[bool] = None,
1138
+ output_attentions: Optional[bool] = None,
1139
+ output_hidden_states: Optional[bool] = None,
1140
+ return_dict: Optional[bool] = None,
1141
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1142
+ r"""
1143
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1144
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1145
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1146
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1147
+ """
1148
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1149
+
1150
+ transformer_outputs = self.model(
1151
+ input_ids,
1152
+ attention_mask=attention_mask,
1153
+ position_ids=position_ids,
1154
+ past_key_values=past_key_values,
1155
+ inputs_embeds=inputs_embeds,
1156
+ use_cache=use_cache,
1157
+ output_attentions=output_attentions,
1158
+ output_hidden_states=output_hidden_states,
1159
+ return_dict=return_dict,
1160
+ )
1161
+ hidden_states = transformer_outputs[0]
1162
+ logits = self.score(hidden_states)
1163
+
1164
+ if input_ids is not None:
1165
+ batch_size = input_ids.shape[0]
1166
+ else:
1167
+ batch_size = inputs_embeds.shape[0]
1168
+
1169
+ if self.config.pad_token_id is None and batch_size != 1:
1170
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1171
+ if self.config.pad_token_id is None:
1172
+ sequence_lengths = -1
1173
+ else:
1174
+ if input_ids is not None:
1175
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1176
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1177
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1178
+ sequence_lengths = sequence_lengths.to(logits.device)
1179
+ else:
1180
+ sequence_lengths = -1
1181
+
1182
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1183
+
1184
+ loss = None
1185
+ if labels is not None:
1186
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1187
+
1188
+ if not return_dict:
1189
+ output = (pooled_logits,) + transformer_outputs[1:]
1190
+ return ((loss,) + output) if loss is not None else output
1191
+
1192
+ return SequenceClassifierOutputWithPast(
1193
+ loss=loss,
1194
+ logits=pooled_logits,
1195
+ past_key_values=transformer_outputs.past_key_values,
1196
+ hidden_states=transformer_outputs.hidden_states,
1197
+ attentions=transformer_outputs.attentions,
1198
+ )
1199
+
1200
+
1201
+ @add_start_docstrings(
1202
+ """
1203
+ The Gemma2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1204
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1205
+ """,
1206
+ GEMMA2_START_DOCSTRING,
1207
+ )
1208
+ class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
1209
+ def __init__(self, config):
1210
+ super().__init__(config)
1211
+ self.num_labels = config.num_labels
1212
+ self.model = Gemma2Model(config)
1213
+ if getattr(config, "classifier_dropout", None) is not None:
1214
+ classifier_dropout = config.classifier_dropout
1215
+ elif getattr(config, "hidden_dropout", None) is not None:
1216
+ classifier_dropout = config.hidden_dropout
1217
+ else:
1218
+ classifier_dropout = 0.1
1219
+ self.dropout = nn.Dropout(classifier_dropout)
1220
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1221
+
1222
+ # Initialize weights and apply final processing
1223
+ self.post_init()
1224
+
1225
+ def get_input_embeddings(self):
1226
+ return self.model.embed_tokens
1227
+
1228
+ def set_input_embeddings(self, value):
1229
+ self.model.embed_tokens = value
1230
+
1231
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
1232
+ @add_code_sample_docstrings(
1233
+ checkpoint=_CHECKPOINT_FOR_DOC,
1234
+ output_type=TokenClassifierOutput,
1235
+ config_class=_CONFIG_FOR_DOC,
1236
+ )
1237
+ def forward(
1238
+ self,
1239
+ input_ids: Optional[torch.LongTensor] = None,
1240
+ attention_mask: Optional[torch.Tensor] = None,
1241
+ position_ids: Optional[torch.LongTensor] = None,
1242
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1243
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1244
+ labels: Optional[torch.LongTensor] = None,
1245
+ use_cache: Optional[bool] = None,
1246
+ output_attentions: Optional[bool] = None,
1247
+ output_hidden_states: Optional[bool] = None,
1248
+ return_dict: Optional[bool] = None,
1249
+ ) -> Union[Tuple, TokenClassifierOutput]:
1250
+ r"""
1251
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1252
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1253
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1254
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1255
+ """
1256
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1257
+
1258
+ outputs = self.model(
1259
+ input_ids,
1260
+ attention_mask=attention_mask,
1261
+ position_ids=position_ids,
1262
+ past_key_values=past_key_values,
1263
+ inputs_embeds=inputs_embeds,
1264
+ use_cache=use_cache,
1265
+ output_attentions=output_attentions,
1266
+ output_hidden_states=output_hidden_states,
1267
+ return_dict=return_dict,
1268
+ )
1269
+ sequence_output = outputs[0]
1270
+ sequence_output = self.dropout(sequence_output)
1271
+ logits = self.score(sequence_output)
1272
+
1273
+ loss = None
1274
+ if labels is not None:
1275
+ loss = self.loss_function(logits, labels, self.config)
1276
+
1277
+ if not return_dict:
1278
+ output = (logits,) + outputs[2:]
1279
+ return ((loss,) + output) if loss is not None else output
1280
+
1281
+ return TokenClassifierOutput(
1282
+ loss=loss,
1283
+ logits=logits,
1284
+ hidden_states=outputs.hidden_states,
1285
+ attentions=outputs.attentions,
1286
+ )
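This closes the vendored Gemma2 file. As a quick sanity check, the causal LM defined above can be exercised with a tiny randomly initialized config; this is only a sketch, and the `modeling_gemma2` import path and the toy hyperparameters are assumptions, not part of the released checkpoint:

```python
import torch
from transformers import Gemma2Config

from modeling_gemma2 import Gemma2ForCausalLM  # local import path assumed

# Hypothetical miniature config, just to verify the forward-pass wiring on CPU.
config = Gemma2Config(
    vocab_size=32, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=2, head_dim=16,
)
model = Gemma2ForCausalLM(config).eval()
input_ids = torch.randint(0, config.vocab_size, (1, 8))
with torch.no_grad():
    out = model(input_ids=input_ids, use_cache=False)
print(out.logits.shape)  # torch.Size([1, 8, 32])
```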
modeling_spatialvla.py ADDED
@@ -0,0 +1,773 @@
1
+ # MIT License
2
+ # Copyright (c) 2025 IPEC at Shanghai AI Laboratory
3
+ # Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
4
+ # distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
5
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
7
+ # Based on code licensed under the Apache License, Version 2.0 by Google Inc. and HuggingFace Inc. team (Copyright 2024).
8
+ # coding=utf-8
9
+
10
+ """PyTorch PaliGemmamodel."""
11
+
12
+ from dataclasses import dataclass
13
+ from typing import List, Optional, Tuple, Union
14
+
15
+ import torch
16
+ import torch.utils.checkpoint
17
+ from torch import nn
18
+ from torch.linalg import inv
19
+ import torchvision.transforms.functional as F
20
+
21
+ import os
22
+ from transformers.cache_utils import Cache, HybridCache, StaticCache
23
+ from transformers.generation import GenerationMixin
24
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
25
+ from transformers.utils import (
26
+ ModelOutput,
27
+ add_start_docstrings,
28
+ add_start_docstrings_to_model_forward,
29
+ is_flash_attn_2_available,
30
+ logging,
31
+ replace_return_docstrings,
32
+ )
33
+ from .configuration_spatialvla import SpatialVLAConfig
34
+ from .modeling_ego3d import Ego3DPositionEmbeddingMLP, process_zoe
35
+ from .modeling_gemma2 import Gemma2ForCausalLM
36
+
37
+ if is_flash_attn_2_available():
38
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
39
+
40
+ from transformers import AutoModel, AutoModelForCausalLM, ZoeDepthForDepthEstimation
41
+
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+ _CONFIG_FOR_DOC = "PaliGemmaConfig"
46
+
47
+ # constant
48
+ SIGLIP_MEAN, SIGLIP_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
49
+
50
+ # Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
51
+ # But PaliGemma has no causal mask on the prefix
52
+ def _prepare_4d_causal_attention_mask_with_cache_position(
53
+ attention_mask: torch.Tensor,
54
+ sequence_length: int,
55
+ target_length: int,
56
+ dtype: torch.dtype,
57
+ device: torch.device,
58
+ min_dtype: float,
59
+ cache_position: torch.Tensor,
60
+ batch_size: int,
61
+ is_training: bool = False,
62
+ token_type_ids: torch.Tensor = None,
63
+ **kwargs,
64
+ ):
65
+ """
66
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
67
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
68
+
69
+ Args:
70
+ attention_mask (`torch.Tensor`):
71
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
72
+ sequence_length (`int`):
73
+ The sequence length being processed.
74
+ target_length (`int`):
75
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
76
+ dtype (`torch.dtype`):
77
+ The dtype to use for the 4D attention mask.
78
+ device (`torch.device`):
79
+ The device to place the 4D attention mask on.
80
+ min_dtype (`float`):
81
+ The minimum value representable with the dtype `dtype`.
82
+ cache_position (`torch.Tensor`):
83
+ Indices depicting the position of the input sequence tokens in the sequence.
84
+ batch_size (`torch.Tensor`):
85
+ Batch size.
86
+ is_training (`bool`):
87
+ Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels`
88
+ """
89
+ if attention_mask is not None and attention_mask.dim() == 4:
90
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
91
+ causal_mask = attention_mask
92
+ else:
93
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
94
+ # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
95
+ if sequence_length != 1:
96
+ if is_training:
97
+ causal_mask = torch.triu(causal_mask, diagonal=1)
98
+ else:
99
+ causal_mask[:, :sequence_length] = 0.0
100
+
101
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
102
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
103
+ if attention_mask is not None:
104
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
105
+ mask_length = attention_mask.shape[-1]
106
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
107
+ padding_mask = padding_mask == 0
108
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
109
+ padding_mask, min_dtype
110
+ )
111
+ # we are training thus we need to create a full mask on the image + prefix but causal on suffix
112
+ if is_training:
113
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
114
+ token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
115
+ )
116
+ return causal_mask
117
+
118
+
119
+ @dataclass
120
+ class SpatialVLACausalLMOutputWithPast(ModelOutput):
121
+ """
122
+ Base class for PaliGemma causal language model (or autoregressive) outputs.
123
+
124
+ Args:
125
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
126
+ Language modeling loss (for next-token prediction).
127
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
128
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
129
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
130
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
131
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
132
+
133
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
134
+ `past_key_values` input) to speed up sequential decoding.
135
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
136
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
137
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
138
+
139
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
140
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
141
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
142
+ sequence_length)`.
143
+
144
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
145
+ heads.
146
+ image_hidden_states (`torch.FloatTensor`, *optional*):
147
+ A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
148
+ image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
149
+ """
150
+
151
+ loss: Optional[torch.FloatTensor] = None
152
+ logits: torch.FloatTensor = None
153
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
154
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
155
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
156
+ image_hidden_states: Optional[torch.FloatTensor] = None
157
+
158
+
159
+ class SpatialVLAMultiModalProjector(nn.Module):
160
+ def __init__(self, config: SpatialVLAConfig):
161
+ super().__init__()
162
+ self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True)
163
+
164
+ def forward(self, image_features):
165
+ hidden_states = self.linear(image_features)
166
+
167
+ return hidden_states
168
+
169
+
170
+ PALIGEMMA_START_DOCSTRING = r"""
171
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
172
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
173
+ etc.)
174
+
175
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
176
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
177
+ and behavior.
178
+
179
+ Parameters:
180
+ config ([`PaliGemmaConfig`] or [`PaliGemmaVisionConfig`]):
181
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
182
+ load the weights associated with the model, only the configuration. Check out the
183
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
184
+ """
185
+
186
+
187
+ @add_start_docstrings(
188
+ "The bare SpatialVLA Model outputting raw hidden-states without any specific head on top.",
189
+ PALIGEMMA_START_DOCSTRING,
190
+ )
191
+ class SpatialVLAPreTrainedModel(PreTrainedModel):
192
+ config_class = SpatialVLAConfig
193
+ base_model_prefix = "model"
194
+ supports_gradient_checkpointing = True
195
+ _no_split_modules = ["SpatialVLAMultiModalProjector", "ZoeDepthForDepthEstimation", "Ego3DPositionEmbeddingMLP"]
196
+ _skip_keys_device_placement = "past_key_values"
197
+ _supports_cache_class = True
198
+ _supports_quantized_cache = True
199
+ _supports_static_cache = True
200
+ _supports_cache_class = True
201
+ _supports_flash_attn_2 = True
202
+ _supports_sdpa = True
203
+
204
+ def _init_weights(self, module):
205
+ # important: this ported version of PaliGemma isn't meant for training from scratch - only
206
+ # inference and fine-tuning
207
+ std = (
208
+ self.config.initializer_range
209
+ if hasattr(self.config, "initializer_range")
210
+ else self.config.text_config.initializer_range
211
+ )
212
+
213
+ if hasattr(module, "class_embedding"):
214
+ module.class_embedding.data.normal_(mean=0.0, std=std)
215
+
216
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
217
+ module.weight.data.normal_(mean=0.0, std=std)
218
+ if module.bias is not None:
219
+ module.bias.data.zero_()
220
+ elif isinstance(module, nn.Embedding):
221
+ module.weight.data.normal_(mean=0.0, std=std)
222
+ if module.padding_idx is not None:
223
+ module.weight.data[module.padding_idx].zero_()
224
+
225
+
226
+ PALIGEMMA_INPUTS_DOCSTRING = r"""
227
+ Args:
228
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
229
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
230
+ it.
231
+
232
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
233
+ [`PreTrainedTokenizer.__call__`] for details.
234
+
235
+ [What are input IDs?](../glossary#input-ids)
236
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
237
+ The tensors corresponding to the input images. Pixel values can be obtained using
238
+ [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`PaliGemmaProcessor`] uses
239
+ [`SiglipImageProcessor`] for processing images).
240
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
241
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
242
+
243
+ - 1 for tokens that are **not masked**,
244
+ - 0 for tokens that are **masked**.
245
+
246
+ [What are attention masks?](../glossary#attention-mask)
247
+
248
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
249
+ [`PreTrainedTokenizer.__call__`] for details.
250
+
251
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
252
+ `past_key_values`).
253
+
254
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
255
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
256
+ information on the default strategy.
257
+
258
+ - 1 indicates the head is **not masked**,
259
+ - 0 indicates the head is **masked**.
260
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
261
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
262
+ config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
263
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
264
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
265
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
266
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
267
+
268
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
269
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
270
+
271
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
272
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
273
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
274
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
275
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
276
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
277
+ model's internal embedding lookup matrix.
278
+ use_cache (`bool`, *optional*):
279
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
280
+ `past_key_values`).
281
+ output_attentions (`bool`, *optional*):
282
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
283
+ tensors for more detail.
284
+ output_hidden_states (`bool`, *optional*):
285
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
286
+ more detail.
287
+ return_dict (`bool`, *optional*):
288
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
289
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
290
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
291
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
292
+ the complete sequence length.
293
+ """
294
+
295
+
296
+ @add_start_docstrings(
297
+ """The PALIGEMMA model which consists of a vision backbone and a language model.""",
298
+ PALIGEMMA_START_DOCSTRING,
299
+ )
300
+ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin):
301
+ def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None):
302
+ super().__init__(config)
303
+ # vision model
304
+ self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config)
305
+ # projector
306
+ self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config)
307
+ # language model
308
+ self.vocab_size = config.text_config.vocab_size
309
+ if language_model is None:
310
+ language_model = Gemma2ForCausalLM(config=config.text_config) if config.text_config.model_type == "gemma2" else AutoModelForCausalLM.from_config(config=config.text_config)
311
+ # set tied-weights keys
312
+ if language_model._tied_weights_keys is not None:
313
+ self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
314
+ self.language_model = language_model
315
+
316
+ if config.use_vision_zoe:
317
+ # zoe model
318
+ self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config)
319
+ self.position_embedding_3d = Ego3DPositionEmbeddingMLP(
320
+ config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs
321
+ )
322
+ # register buffer
323
+ patch_size, reso, image_size = config.vision_config.patch_size, config.ego3d_patch_reso, config.vision_config.image_size
324
+ y, x = torch.meshgrid(torch.arange(0, image_size, patch_size // reso), torch.arange(0, image_size, patch_size // reso), indexing="ij") # (h//sp w//sp)
325
+ y, x = y + patch_size / reso / 2, x + patch_size / reso / 2
326
+ uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1) # (3 hw)
327
+ self.register_buffer("uv_h", uv_h, persistent=False)
328
+
329
+ # NOTE: add shared additional spatial token embeddings for <ACTION> <IMG>
330
+ if config.use_spatial_token:
331
+ self.spatial_embed_tokens = nn.Embedding(self.config.spatial_token_num, config.text_config.hidden_size)
332
+ else:
333
+ self.spatial_embed_tokens = None
334
+
335
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
336
+ # self.post_init() # BUG: causes from_pretrained to fail!
337
+ # self.position_embedding_3d._reset_parameters()
338
+
339
+
340
+ def backproject_patch(self, K: torch.Tensor, depth: torch.Tensor, patch_size=14, reso=2) -> torch.Tensor:
341
+ """
342
+ Backproject depth map to 3D points in camera coordinate.
343
+ Args:
344
+ K: camera intrinsic matrix (b 3 3)
345
+ depth: depth map (b 1 h w)
346
+ pixel_offset: offset to the pixel coordinate
347
+ """
348
+ # __import__("ipdb").set_trace()
349
+ b, c, h, w = depth.shape
350
+ hp, wp = h // patch_size, w // patch_size
351
+ sub_hp = sub_wp = reso
352
+ patch_depth = torch.nn.functional.interpolate(depth, size=(hp * reso, wp * reso), mode="area").reshape(b, c, -1)
353
+
354
+ # import torchvision; torchvision.utils.save_image(zoe_pixel_values[0], "zoe_image.png")
355
+ p_cam = (inv(K.float()) @ self.uv_h.float()) * patch_depth # (b 3 3) @ (3 hw) -> (b 3 hw) * (b 1 hw) -> (b 3 hw)
356
+ patch_p_cam = p_cam.reshape(b, 3, hp, sub_hp, wp, sub_wp).permute(0, 2, 4, 3, 5, 1).reshape(b, hp * wp, -1)
357
+ return patch_p_cam
358
+
359
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings with Llava->PaliGemma
360
+ def get_input_embeddings(self):
361
+ return self.language_model.get_input_embeddings()
362
+
363
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings with Llava->PaliGemma
364
+ def set_input_embeddings(self, value):
365
+ self.language_model.set_input_embeddings(value)
366
+
367
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings with Llava->PaliGemma
368
+ def get_output_embeddings(self):
369
+ return self.language_model.get_output_embeddings()
370
+
371
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings with Llava->PaliGemma
372
+ def set_output_embeddings(self, new_embeddings):
373
+ self.language_model.set_output_embeddings(new_embeddings)
374
+
375
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder with Llava->PaliGemma
376
+ def set_decoder(self, decoder):
377
+ self.language_model.set_decoder(decoder)
378
+
379
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder with Llava->PaliGemma
380
+ def get_decoder(self):
381
+ return self.language_model.get_decoder()
382
+
383
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights with Llava->PaliGemma
384
+ def tie_weights(self):
385
+ return self.language_model.tie_weights()
386
+
387
+ def resize_token_embeddings(
388
+ self,
389
+ new_num_tokens: Optional[int] = None,
390
+ pad_to_multiple_of: Optional[int] = None,
391
+ mean_resizing: bool = True,
392
+ ) -> nn.Embedding:
393
+ # TODO: is_deepspeed_zero3_enabled gather
394
+ print(f"resize token embeddings from {self.language_model.get_output_embeddings().weight.shape} to (*,{new_num_tokens})")
395
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
396
+
397
+ # update base model and current model config
398
+ vocab_size = model_embeds.weight.shape[0]
399
+ self.config.text_config.vocab_size = self.vocab_size = self.config._vocab_size = vocab_size
400
+ self.tie_weights()
401
+ return model_embeds
402
+
403
+ def _update_causal_mask(
404
+ self,
405
+ attention_mask,
406
+ token_type_ids,
407
+ past_key_values,
408
+ cache_position,
409
+ input_ids=None,
410
+ inputs_embeds=None,
411
+ is_training: bool = False,
412
+ ):
413
+ if self.config.text_config._attn_implementation == "flash_attention_2":
414
+ if attention_mask is not None and 0.0 in attention_mask:
415
+ return attention_mask
416
+ return None
417
+
418
+ using_static_cache = isinstance(past_key_values, StaticCache)
419
+ min_dtype = torch.finfo(self.dtype).min
420
+ inputs_lead_dim = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
421
+ sequence_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
422
+ if using_static_cache:
423
+ target_length = past_key_values.get_max_cache_shape()
424
+ elif isinstance(past_key_values, HybridCache):
425
+ target_length = past_key_values.get_max_cache_shape()
426
+ else:
427
+ target_length = (
428
+ attention_mask.shape[-1]
429
+ if isinstance(attention_mask, torch.Tensor)
430
+ else cache_position[0] + sequence_length + 1
431
+ )
432
+
433
+ if attention_mask is not None and attention_mask.dim() == 4:
434
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
435
+ return attention_mask
436
+
437
+ causal_mask = torch.full(
438
+ (sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device
439
+ )
440
+ # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
441
+ if sequence_length != 1:
442
+ if is_training:
443
+ causal_mask = torch.triu(causal_mask, diagonal=1)
444
+ else:
445
+ causal_mask[:, :sequence_length] = 0.0
446
+
447
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
448
+ causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
449
+ if attention_mask is not None:
450
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
451
+ mask_length = attention_mask.shape[-1]
452
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
453
+ padding_mask = padding_mask == 0
454
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
455
+ padding_mask, min_dtype
456
+ )
457
+ # we are training thus we need to create a full mask on the image + prefix but causal on suffix
458
+ if is_training:
459
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
460
+ token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
461
+ )
462
+ return causal_mask
463
+
464
+ def get_image_features(self, pixel_values: torch.FloatTensor, intrinsic: torch.FloatTensor):
465
+ """
466
+ Obtains image last hidden states from the vision tower and apply multimodal projection.
467
+
468
+ Args:
469
+ pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
470
+ The tensors corresponding to the input images.
471
+ Returns:
472
+ image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
473
+ """
474
+ # mintrinsic = intrinsic.reshape(-1, 3, 3)
475
+ # siglip vision tower
476
+ siglip_pixel_values = F.normalize(pixel_values, mean=SIGLIP_MEAN, std=SIGLIP_STD)
477
+ image_outputs = self.vision_tower(siglip_pixel_values)
478
+
479
+ # ego3d position encoding
480
+ if self.config.use_vision_zoe:
481
+ zoe_pixel_values, ph, pw = process_zoe(pixel_values, pad_mode="reflect")
482
+ with torch.no_grad():
483
+ pvh, pvw = pixel_values.shape[-2:]
484
+ depth = self.vision_zoe_model(pixel_values=zoe_pixel_values).predicted_depth
485
+ depth = torch.nn.functional.interpolate(
486
+ depth.unsqueeze(1),
487
+ size=(pvh+2*ph, pvw+2*pw),
488
+ mode="bicubic",
489
+ align_corners=True,
490
+ )[..., ph:-ph, pw:-pw]
491
+ # depth = torch.clamp(depth, 0., 4.0) # NOTE: we find that depth w/o clamp performs better
492
+ xyz = self.backproject_patch(
493
+ intrinsic, depth, patch_size=self.config.vision_config.patch_size, reso=self.config.ego3d_patch_reso
494
+ ) # (b, n, 3*4)
495
+ pos_embed_3d = self.position_embedding_3d(xyz)
496
+ selected_image_feature = image_outputs.last_hidden_state + pos_embed_3d
497
+ else:
498
+ selected_image_feature = image_outputs.last_hidden_state
499
+ image_features = self.multi_modal_projector(selected_image_feature)
500
+ image_features = image_features / (self.config.text_config.hidden_size**0.5)
501
+ return image_features
502
+
503
+ @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
504
+ @replace_return_docstrings(output_type=SpatialVLACausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
505
+ def forward(
506
+ self,
507
+ input_ids: torch.LongTensor = None,
508
+ pixel_values: torch.FloatTensor = None,
509
+ actions: Optional[torch.FloatTensor] = None,
510
+ intrinsic: Optional[torch.Tensor] = None,
511
+ attention_mask: Optional[torch.Tensor] = None,
512
+ position_ids: Optional[torch.LongTensor] = None,
513
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
514
+ token_type_ids: Optional[torch.LongTensor] = None,
515
+ cache_position: Optional[torch.LongTensor] = None,
516
+ inputs_embeds: Optional[torch.FloatTensor] = None,
517
+ labels: Optional[torch.LongTensor] = None,
518
+ use_cache: Optional[bool] = None,
519
+ output_attentions: Optional[bool] = None,
520
+ output_hidden_states: Optional[bool] = None,
521
+ return_dict: Optional[bool] = None,
522
+ num_logits_to_keep: int = 0,
523
+ ) -> Union[Tuple, SpatialVLACausalLMOutputWithPast]:
524
+ r"""
525
+ Args:
526
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
527
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
528
+ config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
529
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
530
+
531
+ num_logits_to_keep (`int`, *optional*):
532
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
533
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
534
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
535
+
536
+ Returns:
537
+
538
+ Example:
539
+
540
+ ```python
541
+ >>> from PIL import Image
542
+ >>> import requests
543
+ >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
544
+
545
+ >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf")
546
+ >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf")
547
+
548
+ >>> prompt = "answer en Where is the cow standing?"
549
+ >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
550
+ >>> image = Image.open(requests.get(url, stream=True).raw)
551
+
552
+ >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
553
+
554
+ >>> # Generate
555
+ >>> generate_ids = model.generate(**inputs, max_length=30)
556
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
557
+ "answer en Where is the cow standing?\nbeach"
558
+ ```"""
559
+ # print(f"**************************************\n \
560
+ # input_ids {input_ids} \n \
561
+ # labels {labels} \n \
562
+ # token_type_ids {token_type_ids} \n \
563
+ # attention_mask {attention_mask} \n \
564
+ # actions {actions} \n \
565
+ # **************************************"
566
+ # )
567
+ # print(f"model.language_model.config._attn_implementation {self.language_model.config._attn_implementation} model.config.vision_config._attn_implementation_internal {self.config.vision_config._attn_implementation_internal} \n \
568
+ # model.vision_tower.config._attn_implementation {self.vision_tower.config._attn_implementation} model.config.vision_config._attn_implementation_internal {self.config.vision_config._attn_implementation_internal}")
569
+ # __import__("ipdb").set_trace()
570
+ if (input_ids is None) ^ (inputs_embeds is not None):
571
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
572
+
573
+ if pixel_values is not None and inputs_embeds is not None:
574
+ raise ValueError(
575
+ "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
576
+ )
577
+
578
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
579
+ output_hidden_states = (
580
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
581
+ )
582
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
583
+
584
+ is_training = token_type_ids is not None and labels is not None
585
+
586
+ if inputs_embeds is None:
587
+ inputs_embeds = self.get_input_embeddings()(input_ids).clone() ## avoid checkpoint grad True
588
+
589
+ # NOTE: replace the fixed embeddings with trainable spatial embeddings
590
+ # BUG: LoRA causes inputs_embeds requires_grad = True
591
+ # peft: https://github.com/huggingface/peft/blob/ec92cdcc41fe1b141bfe1e0da69b38a7e601cc80/src/peft/peft_model.py#L687
592
+ # hf: https://github.com/huggingface/transformers/blob/05260a1fc1c8571a2b421ce72b680d5f1bc3e5a4/src/transformers/modeling_utils.py#L2545
593
+ # lora w/ prompt: https://discuss.huggingface.co/t/combine-between-lora-and-prompt-tunning/65151
594
+ if self.config.use_spatial_token:
595
+ spatial_selected = (input_ids >= self.config.action_token_begin_idx) & (input_ids < self.config.action_token_begin_idx + self.config.spatial_token_num)
596
+ inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)
597
+
598
+ if cache_position is None:
599
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
600
+ cache_position = torch.arange(
601
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
602
+ )
603
+
604
+ if position_ids is None:
605
+ position_ids = cache_position.unsqueeze(0) + 1 # Paligemma positions are 1-indexed
606
+
607
+ # Merge text and images
608
+ if pixel_values is not None:
609
+ image_features = self.get_image_features(pixel_values, intrinsic)
610
+
611
+ special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
612
+ special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
613
+ if inputs_embeds[special_image_mask].numel() != image_features.numel():
614
+ image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
615
+ raise ValueError(
616
+ f"Number of images does not match number of special image tokens in the input text. "
617
+ f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
618
+ "tokens from image embeddings."
619
+ )
620
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
621
+ inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
622
+
623
+ # mask out pad-token-ids in labels for BC
624
+ if labels is not None and self.pad_token_id in labels:
625
+ logger.warning_once(
626
+ "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
627
+ "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
628
+ )
629
+ labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
630
+
631
+ causal_mask = self._update_causal_mask(
632
+ attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
633
+ )
634
+ outputs = self.language_model(
635
+ attention_mask=causal_mask,
636
+ position_ids=position_ids,
637
+ past_key_values=past_key_values,
638
+ inputs_embeds=inputs_embeds,
639
+ use_cache=use_cache,
640
+ output_attentions=output_attentions,
641
+ output_hidden_states=output_hidden_states,
642
+ return_dict=return_dict,
643
+ cache_position=cache_position,
644
+ num_logits_to_keep=num_logits_to_keep,
645
+ )
646
+
647
+ logits = outputs.logits
648
+ loss = None
649
+ if labels is not None:
650
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
651
+ logits = logits.float()
652
+ shift_logits = logits[..., :-1, :]
653
+ shift_labels = labels[..., 1:]
654
+ if attention_mask is not None:
655
+ # we use the input attention mask to shift the logits and labels, because it is 2D.
656
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
657
+ shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
658
+ shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
659
+ shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
660
+ else:
661
+ shift_logits = shift_logits.contiguous()
662
+ shift_labels = shift_labels.contiguous()
663
+ # Flatten the tokens
664
+ loss_fct = nn.CrossEntropyLoss()
665
+
666
+ flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
667
+ flat_labels = shift_labels.view(-1).to(shift_logits.device)
668
+ loss = loss_fct(flat_logits, flat_labels)
669
+ if not return_dict:
670
+ output = (logits,) + outputs[1:]
671
+ return (loss,) + output if loss is not None else output
672
+
673
+ return SpatialVLACausalLMOutputWithPast(
674
+ loss=loss,
675
+ logits=logits,
676
+ past_key_values=outputs.past_key_values,
677
+ hidden_states=outputs.hidden_states,
678
+ attentions=outputs.attentions,
679
+ image_hidden_states=image_features if pixel_values is not None else None,
680
+ )
681
+
682
+ def prepare_inputs_for_generation(
683
+ self,
684
+ input_ids,
685
+ past_key_values=None,
686
+ inputs_embeds=None,
687
+ cache_position=None,
688
+ position_ids=None,
689
+ pixel_values=None,
690
+ intrinsic=None,
691
+ attention_mask=None,
692
+ token_type_ids=None,
693
+ use_cache=True,
694
+ num_logits_to_keep=None,
695
+ labels=None,
696
+ **kwargs,
697
+ ):
698
+ # Overwritten -- custom `position_ids` and `pixel_values` handling
699
+ model_inputs = self.language_model.prepare_inputs_for_generation(
700
+ input_ids,
701
+ past_key_values=past_key_values,
702
+ inputs_embeds=inputs_embeds,
703
+ attention_mask=attention_mask,
704
+ position_ids=position_ids,
705
+ cache_position=cache_position,
706
+ use_cache=use_cache,
707
+ num_logits_to_keep=num_logits_to_keep,
708
+ token_type_ids=token_type_ids,
709
+ **kwargs,
710
+ )
711
+
712
+ # position_ids in Paligemma are 1-indexed
713
+ if model_inputs.get("position_ids") is not None:
714
+ model_inputs["position_ids"] += 1
715
+ # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
716
+ # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
717
+ if cache_position[0] == 0:
718
+ model_inputs["pixel_values"] = pixel_values
719
+ is_training = token_type_ids is not None and labels is not None
720
+ if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
721
+ causal_mask = self._update_causal_mask(
722
+ attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
723
+ )
724
+ model_inputs["attention_mask"] = causal_mask
725
+ model_inputs["intrinsic"] = intrinsic
726
+ return model_inputs
727
+
728
+ @torch.no_grad()
729
+ def predict_action(
730
+ self,
731
+ model_inputs,
732
+ ) -> torch.Tensor:
733
+ model_inputs = model_inputs.to(torch.bfloat16).to(self.device)
734
+ input_len = model_inputs["input_ids"].shape[-1]
735
+ generation_outputs = self.generate(**model_inputs, max_new_tokens=256, do_sample=False)
736
+ return generation_outputs[:,input_len:]
737
+
738
+ @classmethod
739
+ def from_pretrained(
740
+ cls,
741
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
742
+ *model_args,
743
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
744
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
745
+ ignore_mismatched_sizes: bool = False,
746
+ force_download: bool = False,
747
+ local_files_only: bool = False,
748
+ token: Optional[Union[str, bool]] = None,
749
+ revision: str = "main",
750
+ use_safetensors: Optional[bool] = None,
751
+ weights_only: bool = True,
752
+ **kwargs,
753
+ ):
754
+ model = super().from_pretrained(
755
+ pretrained_model_name_or_path,
756
+ *model_args,
757
+ config=config,
758
+ cache_dir=cache_dir,
759
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
760
+ force_download=force_download,
761
+ local_files_only=local_files_only,
762
+ token=token,
763
+ revision=revision,
764
+ use_safetensors=use_safetensors,
765
+ weights_only=weights_only,
766
+ **kwargs,
767
+ )
768
+ # NOTE: tie the weights of the embed_tokens with lm head (does not work if weights are untied)
769
+ # model.language_model.tie_weights()
770
+ # NOTE: tie the data of spatial_embed_tokens with embed_tokens (BUG: for weight sync issue in training)
771
+ if model.config.use_spatial_token:
772
+ model.language_model.model.embed_tokens.weight.data[-model.config.spatial_token_num:] = model.spatial_embed_tokens.weight.data
773
+ return model
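The modeling file above exposes two entry points that matter in practice: `predict_action`, which greedily generates the action tokens, and the custom `from_pretrained`, which re-syncs the spatial token embeddings on load. A minimal inference sketch follows; the repository id, image path, and `unnorm_key` are placeholders, and loading through `AutoModel`/`AutoProcessor` with `trust_remote_code=True` is an assumption about how this repo is consumed rather than something stated in the diff.

```python
# Hedged end-to-end sketch for the classes defined in this commit.
# Placeholders/assumptions: the repo id, the observation image, and the unnorm_key.
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

repo = "IPEC-COMMUNITY/spatialvla-4b-224-pt"  # placeholder repo id
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

image = Image.open("obs.png")  # one RGB observation (placeholder path)
prompt = "What action should the robot take to pick up the cup?"

# SpatialVLAProcessor.__call__ builds the <image>*256 + BOS prefix and selects the
# camera intrinsics for `unnorm_key` (falling back to the "default" entry).
inputs = processor(images=image, text=prompt, unnorm_key="bridge_orig/1.0.0", return_tensors="pt")

# predict_action() casts inputs to bfloat16, runs greedy generate(), and returns
# only the newly generated (action) token ids.
action_token_ids = model.predict_action(inputs)

# decode_actions() maps the spatial action tokens back to continuous values and
# un-normalizes them with the per-dataset q01/q99 statistics.
result = processor.decode_actions(action_token_ids, unnorm_key="bridge_orig/1.0.0")
print(result["actions"])
```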
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_spatialvla.SpatialVLAProcessor"
4
+ },
5
+ "do_convert_rgb": null,
6
+ "do_normalize": false,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_processor_type": "SiglipImageProcessor",
15
+ "image_seq_length": 256,
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "processor_class": "SpatialVLAProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "height": 224,
26
+ "width": 224
27
+ }
28
+ }
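For reference, the preprocessing config above resizes to 224×224 with bicubic resampling (`resample: 3`), rescales by 1/255, and deliberately skips normalization, since `get_image_features` in the modeling code applies the SigLIP mean/std itself. A small illustrative sketch of what these settings amount to, assuming the stock `SiglipImageProcessor` from `transformers`:

```python
# Minimal sketch: instantiate SiglipImageProcessor with the values from the JSON above
# and inspect the produced pixel_values. Purely illustrative.
from PIL import Image
from transformers import SiglipImageProcessor

image_processor = SiglipImageProcessor(
    do_resize=True,
    size={"height": 224, "width": 224},
    resample=3,               # bicubic
    do_rescale=True,
    rescale_factor=1 / 255,
    do_normalize=False,       # the model normalizes with SIGLIP_MEAN/STD internally
)
pixel_values = image_processor(Image.new("RGB", (640, 480)), return_tensors="pt")["pixel_values"]
print(pixel_values.shape)     # torch.Size([1, 3, 224, 224]), values in [0, 1]
```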
processing_spatialvla.py ADDED
@@ -0,0 +1,439 @@
1
+ # MIT License
2
+ # Copyright (c) 2025 IPEC at Shanghai AI Laboratory
3
+ # Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
4
+ # distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
5
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
7
+ # Based on code licensed under the Apache License, Version 2.0 by Google Inc. and HuggingFace Inc. team (Copyright 2024).
8
+ # coding=utf-8
9
+
10
+ """
11
+ Processor class for PaliGemma.
12
+ """
13
+
14
+ import logging
15
+ from typing import List, Optional, Union, Dict
16
+ import torch
17
+ import numpy as np
18
+
19
+ from transformers.feature_extraction_utils import BatchFeature
20
+ from transformers.image_utils import ImageInput, is_valid_image
21
+ from transformers.processing_utils import (
22
+ ImagesKwargs,
23
+ ProcessingKwargs,
24
+ ProcessorMixin,
25
+ TextKwargs,
26
+ Unpack,
27
+ _validate_images_text_input_order,
28
+ )
29
+ from transformers.tokenization_utils_base import (
30
+ AddedToken,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ )
34
+ from transformers.utils import logging
35
+ from .action_tokenizer import SphericalCoordinateActionTokenizer
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+ IMAGE_TOKEN = "<image>"
40
+ EXTRA_TOKENS = [f"<loc{i:0>4}>" for i in range(1024)] + [f"<seg{i:0>3}>" for i in range(128)]
41
+
42
+
43
+ class PaliGemmaTextKwargs(TextKwargs):
44
+ suffix: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]]
45
+
46
+
47
+ class PaliGemmaImagesKwargs(ImagesKwargs):
48
+ do_convert_rgb: Optional[bool]
49
+
50
+
51
+ class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False):
52
+ text_kwargs: PaliGemmaTextKwargs
53
+ images_kwargs: PaliGemmaImagesKwargs
54
+ _defaults = {
55
+ "text_kwargs": {
56
+ "padding": False,
57
+ },
58
+ "images_kwargs": {
59
+ "data_format": "channels_first",
60
+ },
61
+ }
62
+
63
+
64
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
65
+ def is_url(val) -> bool:
66
+ return isinstance(val, str) and val.startswith("http")
67
+
68
+
69
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
70
+ def is_image_or_image_url(elem):
71
+ return is_url(elem) or is_valid_image(elem)
72
+
73
+
74
+ def _is_str_or_image(elem):
75
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
76
+
77
+
78
+ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_images):
79
+ """
80
+ Builds a string from the input prompt and image tokens.
81
+ For example, for the call:
82
+ build_string_from_input(
83
+ prompt="Prefix str"
84
+ bos_token="<s>",
85
+ image_seq_len=3,
86
+ image_token="<im>",
87
+ )
88
+ The output will be:
89
+ "<im><im><im><s>Prefix str\n"
90
+ Args:
91
+ prompt (`List[Union[str, ImageInput]]`): The input prompt.
92
+ bos_token (`str`): The beginning of sentence token.
93
+ image_seq_len (`int`): The length of the image sequence.
94
+ image_token (`str`): The image token.
95
+ num_images (`int`): Number of images in the prompt.
96
+ """
97
+ return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n"
98
+
99
+
100
+ # Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images
101
+ def make_batched_images(images) -> List[List[ImageInput]]:
102
+ """
103
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
104
+
105
+ Args:
106
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
107
+ The input image.
108
+
109
+ Returns:
110
+ list: A list of images.
111
+ """
112
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
113
+ return [img for img_list in images for img in img_list]
114
+
115
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
116
+ return images
117
+
118
+ elif is_valid_image(images):
119
+ return [images]
120
+
121
+ raise ValueError(f"Could not make batched images from {images}")
122
+
123
+
124
+ class SpatialVLAProcessor(ProcessorMixin):
125
+ r"""
126
+ Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor.
127
+
128
+ [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
129
+ [`~PaliGemmaProcessor.__call__`] and [`~PaliGemmaProcessor.decode`] for more information.
130
+
131
+ Args:
132
+ image_processor ([`SiglipImageProcessor`], *optional*):
133
+ The image processor is a required input.
134
+ tokenizer ([`LlamaTokenizerFast`], *optional*):
135
+ The tokenizer is a required input.
136
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
137
+ in a chat into a tokenizable string.
138
+ """
139
+
140
+ attributes = ["image_processor", "tokenizer"]
141
+ valid_kwargs = ["chat_template"]
142
+ image_processor_class = "SiglipImageProcessor"
143
+ tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")
144
+
145
+ def __init__(
146
+ self,
147
+ image_processor=None,
148
+ tokenizer=None,
149
+ chat_template=None,
150
+ statistics: Optional[dict] = None,
151
+ bin_policy=None,
152
+ intrinsic_config=None,
153
+ action_config=None,
154
+ num_obs_steps=1,
155
+ obs_delta=1,
156
+ action_chunk_size=1,
157
+ min_sigma=0.0,
158
+ **kwargs,
159
+ ):
160
+ if image_processor is None:
161
+ raise ValueError("You need to specify an `image_processor`.")
162
+ if tokenizer is None:
163
+ raise ValueError("You need to specify a `tokenizer`.")
164
+ if not hasattr(image_processor, "image_seq_length"):
165
+ raise ValueError("Image processor is missing an `image_seq_length` attribute.")
166
+
167
+ self.image_seq_length = image_processor.image_seq_length
168
+
169
+ if not hasattr(tokenizer, "image_token"):
170
+ image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)
171
+ tokens_to_add = {"additional_special_tokens": [image_token]}
172
+ tokenizer.add_special_tokens(tokens_to_add)
173
+ self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
174
+ else:
175
+ self.image_token_id = tokenizer.image_token_id
176
+
177
+ tokenizer.add_tokens(EXTRA_TOKENS)
178
+ tokenizer.add_bos_token = False
179
+ tokenizer.add_eos_token = False
180
+
181
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
182
+
183
+ # action tokenizer
184
+ self.statistics = statistics if statistics else {}
185
+ self.bin_policy = bin_policy
186
+ self.min_sigma = min_sigma
187
+ self.intrinsic_config = intrinsic_config
188
+ self.action_config = action_config
189
+ self.num_obs_steps = num_obs_steps
190
+ self.obs_delta = obs_delta
191
+ self.action_chunk_size = action_chunk_size
192
+ self.dataset_intrinsics = {}
193
+ height, width = image_processor.size["height"], image_processor.size["width"]
194
+
195
+ for k, v in intrinsic_config.items():
196
+ K = torch.tensor(v["intrinsic"]).float()
197
+ h, w = v["height"], v["width"]
198
+ K[0, 0] *= width / w
199
+ K[1, 1] *= height / h
200
+ K[0, 2] *= width / w
201
+ K[1, 2] *= height / h
202
+ self.dataset_intrinsics[k] = K
203
+ print(f"scale intrinsic of {k} from {v['intrinsic']} to {K} ...")
204
+
205
+ self.action_tokenizer = SphericalCoordinateActionTokenizer(
206
+ tokenizer=tokenizer, num_bins=action_config["num_bins"],
207
+ bin_policy=bin_policy, use_spherical=action_config["use_spherical"],
208
+ min_sigma=min_sigma,
209
+ )
210
+
211
+ def __call__(
212
+ self,
213
+ images: ImageInput = None,
214
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
215
+ audio=None,
216
+ videos=None,
217
+ unnorm_key: Optional[str] = None,
218
+ suffix_actions: Optional[np.array] = None, # (t e)
219
+ **kwargs: Unpack[PaliGemmaProcessorKwargs],
220
+ ) -> BatchFeature:
221
+ """
222
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
223
+ and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
224
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
225
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
226
+ of the above two methods for more information.
227
+
228
+ The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
229
+ the prompt in `text`, and will be placed after the prompt. This is because attention is handled differently for
230
+ the prefix and the suffix. For instance,
231
+ ```python
232
+ image = PIL_cow_image
233
+ prompt = "answer en Where is the cow standing?"
234
+ suffix = "on the beach"
235
+ inputs = processor(text=prompt, images=image, suffix=suffix)
236
+ ```
237
+ Here `inputs` will contain the `input_ids` and `token_type_ids` that follow
238
+ ```python
239
+ inputs["input_ids"][:, 256:]
240
+ # tensor([[ 2, 6006, 603, 573, 13910, 9980, 235336, 108, 477, 573, 8318]])
241
+ inputs["token_type_ids"][:, 256:]
242
+ tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]])
243
+ ```
244
+ Meaning the last three tokens are of "label" ("suffix") type while the other ones are of "prefix" type.
245
+
246
+
247
+ Args:
248
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
249
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
250
+ tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
251
+ number of channels, H and W are image height and width.
252
+ text (`str`, `List[str]`, `List[List[str]]`):
253
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
254
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
255
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
256
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
257
+ If set, will return tensors of a particular framework. Acceptable values are:
258
+
259
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
260
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
261
+ - `'np'`: Return NumPy `np.ndarray` objects.
262
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
263
+ suffix (`str`, `List[str]`, `List[List[str]]`):
264
+ The suffixes or batch of suffixes to be encoded. Only necessary for finetuning. See https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md
265
+ for more information. If your prompt is "<image> What is on the image", the suffix corresponds to the expected prediction "a cow sitting on a bench".
266
+
267
+ Returns:
268
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
269
+
270
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
271
+ is provided, the `input_ids` will also contain the suffix input ids.
272
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
273
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
274
+ `None`).
275
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
276
+ - **labels** -- Labels compatible with training if `suffix` is not None
277
+ """
278
+ # check if images and text inputs are reversed for BC
279
+ images, text = _validate_images_text_input_order(images, text)
280
+
281
+ output_kwargs = self._merge_kwargs(
282
+ PaliGemmaProcessorKwargs,
283
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
284
+ **kwargs,
285
+ )
286
+ if suffix_actions is not None:
287
+ action_tokens = self.action_tokenizer(suffix_actions) # (n,3)
288
+ suffix="".join(action_tokens.flatten())
289
+ else:
290
+ suffix = output_kwargs["text_kwargs"].pop("suffix", None)
291
+
292
+ return_token_type_ids = True if suffix is not None else False
293
+
294
+ if images is None:
295
+ raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
296
+ if text is None:
297
+ logger.warning_once(
298
+ "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
299
+ )
300
+ text = ""
301
+
302
+ if _is_str_or_image(text):
303
+ text = [text]
304
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
305
+ pass
306
+
307
+ if text is not None and images is not None:
308
+ if not any(IMAGE_TOKEN in sample for sample in text):
309
+ # logger.warning(
310
+ # "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special "
311
+ # "image tokens in the text, as many tokens as there are images per each text. It is recommended to "
312
+ # "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images "
313
+ # "each text has and add special tokens."
314
+ # )
315
+ if isinstance(text, List) and isinstance(images, List):
316
+ if len(images) != len(text):
317
+ raise ValueError(
318
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images."
319
+ )
320
+
321
+ # make a nested list of lists to be able to iterate over the images and text below
322
+ if is_valid_image(images):
323
+ images = [[images]]
324
+ elif isinstance(images, list) and is_valid_image(images[0]):
325
+ images = [[image] for image in images]
326
+ elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
327
+ raise ValueError("images must be an image, list of images or list of list of images")
328
+
329
+ if suffix is not None and _is_str_or_image(suffix):
330
+ suffix = [suffix]
331
+ if suffix is not None:
332
+ suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
333
+
334
+ input_strings = [
335
+ build_string_from_input(
336
+ prompt=prompt,
337
+ bos_token=self.tokenizer.bos_token,
338
+ image_seq_len=self.image_seq_length,
339
+ image_token=IMAGE_TOKEN,
340
+ num_images=len(image_list) if isinstance(image_list, list) else 1,
341
+ )
342
+ for prompt, image_list in zip(text, images)
343
+ ]
344
+ images = make_batched_images(images)
345
+ else:
346
+ expanded_samples = []
347
+ for sample in text:
348
+ expanded_sample = sample.replace(IMAGE_TOKEN, IMAGE_TOKEN * self.image_seq_length)
349
+ bos_rfind_index = expanded_sample.rfind(IMAGE_TOKEN)
350
+ bos_index = bos_rfind_index + len(IMAGE_TOKEN) if bos_rfind_index != -1 else 0
351
+ expanded_sample = (
352
+ expanded_sample[:bos_index] + self.tokenizer.bos_token + expanded_sample[bos_index:]
353
+ )
354
+ expanded_samples.append(expanded_sample)
355
+ input_strings = [f"{sample}\n" for sample in expanded_samples]
356
+ pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
357
+
358
+ # max_length has to account for the image tokens
359
+ if output_kwargs["text_kwargs"].get("max_length", None) is not None:
360
+ output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
361
+
362
+ inputs = self.tokenizer(
363
+ input_strings,
364
+ text_pair=suffix,
365
+ return_token_type_ids=return_token_type_ids,
366
+ **output_kwargs["text_kwargs"],
367
+ )
368
+
369
+ intrinsic = self.dataset_intrinsics[unnorm_key] if unnorm_key in self.dataset_intrinsics else self.dataset_intrinsics["default"]
370
+ return_data = {**inputs, "pixel_values": pixel_values, "intrinsic": intrinsic}
371
+
372
+ if return_token_type_ids:
373
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
374
+ return_data.update({"labels": labels})
375
+ return BatchFeature(data=return_data)
376
+
377
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
378
+ def batch_decode(self, *args, **kwargs):
379
+ """
380
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
381
+ refer to the docstring of this method for more information.
382
+ """
383
+ return self.tokenizer.batch_decode(*args, **kwargs)
384
+
385
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
386
+ def decode(self, *args, **kwargs):
387
+ """
388
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
389
+ the docstring of this method for more information.
390
+ """
391
+ return self.tokenizer.decode(*args, **kwargs)
392
+
393
+ @property
394
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma
395
+ def model_input_names(self):
396
+ tokenizer_input_names = self.tokenizer.model_input_names
397
+ image_processor_input_names = self.image_processor.model_input_names
398
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
399
+
400
+ def decode_actions(
401
+ self,
402
+ generation_outputs: torch.Tensor,
403
+ unnorm_key: Optional[str] = None,
404
+ ) -> Dict[str, torch.Tensor]:
405
+ action_token_num = 3 # translation + rotation + gripper
406
+ predicted_action_token_ids = generation_outputs[0, : action_token_num * self.action_chunk_size].detach().cpu().long().numpy()
407
+ assert self.tokenizer.eos_token_id != predicted_action_token_ids[-1], "[error] actions contain EOS token, please check your truncation settings!"
408
+
409
+ if predicted_action_token_ids.shape[0] < action_token_num * self.action_chunk_size: # pad with zeros
410
+ print(f"[warning] Padding zero action!")
411
+ predicted_action_token_ids = np.concatenate(
412
+ [
413
+ predicted_action_token_ids,
414
+ np.zeros(action_token_num * self.action_chunk_size - predicted_action_token_ids.shape[0], dtype=np.longlong),
415
+ ]
416
+ )
417
+ predicted_action_token_ids = predicted_action_token_ids.reshape(-1, action_token_num)
418
+ normalized_action_chunks = self.action_tokenizer.decode_token_ids_to_actions(predicted_action_token_ids)
419
+
420
+ # Unnormalize actions
421
+ if unnorm_key is None:
422
+ print(f"🔥 unnorm_key {unnorm_key} is not in statistics, use the first available one")
423
+ unnorm_key = next(iter(self.statistics.keys()))
424
+ action_norm_stats = self.statistics[unnorm_key]["action"]
425
+
426
+ action_dim = len(action_norm_stats["q01"])
427
+ mask = np.array(action_norm_stats.get("mask", np.ones(action_dim)), dtype=bool)
428
+ action_high, action_low = np.array(action_norm_stats["q99"]), np.array(action_norm_stats["q01"])
429
+
430
+ actions = []
431
+ for normalized_actions in normalized_action_chunks:
432
+ action = np.where(
433
+ mask,
434
+ 0.5 * (normalized_actions + 1) * (action_high - action_low) + action_low,
435
+ normalized_actions,
436
+ )
437
+ actions.append(action)
438
+ actions = np.stack(actions)
439
+ return {"actions": actions, "action_ids": predicted_action_token_ids}
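Editor's note: the `decode_actions` method above recovers continuous robot actions in two steps. The spatial action tokenizer maps generated token ids to normalized values in [-1, 1], and the per-dataset `q01`/`q99` statistics stored in `processor_config.json` (added below) unnormalize them, leaving masked-out dimensions such as the binary gripper untouched. The sketch below replays only that unnormalization step with made-up values; it is illustrative and is not part of the uploaded files.

```python
import numpy as np

# Hypothetical normalized action for one chunk step in [-1, 1]
# (x, y, z, roll, pitch, yaw, gripper), i.e. the kind of row that
# decode_token_ids_to_actions returns per step.
normalized_action = np.array([0.1, -0.3, 0.5, 0.0, 0.2, -0.1, 1.0])

# Per-dataset statistics shaped like the "action" entries of
# processor_config.json; these particular numbers are invented.
action_low = np.array([-0.029, -0.042, -0.026, -0.081, -0.093, -0.207, 0.0])  # q01
action_high = np.array([0.028, 0.041, 0.040, 0.082, 0.078, 0.204, 1.0])       # q99
mask = np.array([True, True, True, True, True, True, False])

# Same affine rescaling as decode_actions: map [-1, 1] back to [q01, q99]
# where mask is True, and pass the value through unchanged (gripper) where it is False.
action = np.where(
    mask,
    0.5 * (normalized_action + 1) * (action_high - action_low) + action_low,
    normalized_action,
)
print(action)
```

For reference, the `action_config` in the `processor_config.json` that follows composes 32 x 8 x 16 = 4096 spherical translation bins, 16 x 16 x 16 = 4096 rotation bins, and 2 gripper bins, which matches its configured `"total": 8194` action tokens.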
processor_config.json ADDED
@@ -0,0 +1,3701 @@
1
+ {
2
+ "action_chunk_size": 4,
3
+ "action_config": {
4
+ "distribution": "gaussian",
5
+ "num_bins": {
6
+ "gripper": 2,
7
+ "rotation": {
8
+ "pitch_bins": 16,
9
+ "roll_bins": 16,
10
+ "yaw_bins": 16
11
+ },
12
+ "total": 8194,
13
+ "translation": {
14
+ "phi_bins": 32,
15
+ "r_bins": 8,
16
+ "theta_bins": 16
17
+ }
18
+ },
19
+ "use_spherical": true
20
+ },
21
+ "auto_map": {
22
+ "AutoProcessor": "processing_spatialvla.SpatialVLAProcessor"
23
+ },
24
+ "bin_policy": {
25
+ "rotation": {
26
+ "pitch_bins": [
27
+ -1.0,
28
+ -0.6785015894338633,
29
+ -0.516796358161167,
30
+ -0.3978678314258641,
31
+ -0.29907867426319246,
32
+ -0.21158608510441518,
33
+ -0.13081651669135252,
34
+ -0.05392877158612959,
35
+ 0.02113881590329744,
36
+ 0.0961313749999302,
37
+ 0.17278161860263358,
38
+ 0.25310821063971767,
39
+ 0.33985580585203445,
40
+ 0.4373796767941653,
41
+ 0.5539451994131283,
42
+ 0.7100308525313351,
43
+ 0.9999999999999999
44
+ ],
45
+ "roll_bins": [
46
+ -1.0,
47
+ -0.7121298287894609,
48
+ -0.5564581819056097,
49
+ -0.440071773405789,
50
+ -0.3426461358467384,
51
+ -0.25595819395001274,
52
+ -0.17566893098554964,
53
+ -0.09904102149491184,
54
+ -0.024059205927849478,
55
+ 0.05100802578115137,
56
+ 0.12790631705350436,
57
+ 0.20869987492610076,
58
+ 0.2962359118858219,
59
+ 0.3951018734752948,
60
+ 0.5141779624401348,
61
+ 0.6762450862353777,
62
+ 1.0
63
+ ],
64
+ "yaw_bins": [
65
+ -1.0,
66
+ -0.6910047644696934,
67
+ -0.5313988287371314,
68
+ -0.4133376866679583,
69
+ -0.3150057290436059,
70
+ -0.22777658299365705,
71
+ -0.14715771012527992,
72
+ -0.07034330907230311,
73
+ 0.004712965738136004,
74
+ 0.07975252682496348,
75
+ 0.15651401950954372,
76
+ 0.23703420508371892,
77
+ 0.32409736463921823,
78
+ 0.4221473708283458,
79
+ 0.5396818128475004,
80
+ 0.6980345545587262,
81
+ 1.0
82
+ ]
83
+ },
84
+ "translation": {
85
+ "phi_bins": [
86
+ -3.1415926535897927,
87
+ -2.5597806593194092,
88
+ -2.1899702111786126,
89
+ -1.9071489188814448,
90
+ -1.6724463283141142,
91
+ -1.4683467869586326,
92
+ -1.2853487663890668,
93
+ -1.1176672338183495,
94
+ -0.961484031585327,
95
+ -0.8141204989748655,
96
+ -0.6736024210639718,
97
+ -0.5384120746595923,
98
+ -0.40733740832383114,
99
+ -0.279375002438531,
100
+ -0.15366425283265983,
101
+ -0.029440234757304742,
102
+ 0.0940021938080639,
103
+ 0.2173378027339352,
104
+ 0.34123726674747146,
105
+ 0.46639302836823826,
106
+ 0.5935473848733163,
107
+ 0.7235258808185444,
108
+ 0.857280204661428,
109
+ 0.9959469801163238,
110
+ 1.1409329906705301,
111
+ 1.2940454053271015,
112
+ 1.4577019170652383,
113
+ 1.6352913749303837,
114
+ 1.8318407243899377,
115
+ 2.0553733807372363,
116
+ 2.320069275631962,
117
+ 2.6552436426949604,
118
+ 3.141592653589793
119
+ ],
120
+ "r_bins": [
121
+ 2.220446049250313e-16,
122
+ 0.19677118231539265,
123
+ 0.3506298590504556,
124
+ 0.4881976731379496,
125
+ 0.621970275186659,
126
+ 0.7620978861167458,
127
+ 0.9228346010157172,
128
+ 1.1393317208802278,
129
+ 1.7320508075688767
130
+ ],
131
+ "theta_bins": [
132
+ 0.0,
133
+ 0.7067187338585303,
134
+ 0.9814199309359143,
135
+ 1.1752042640550222,
136
+ 1.3331175751173345,
137
+ 1.4713205387280388,
138
+ 1.5977846301055496,
139
+ 1.7172771763957553,
140
+ 1.8331248472067783,
141
+ 1.9480194771467687,
142
+ 2.0644993054216925,
143
+ 2.1853608246107656,
144
+ 2.314189357400805,
145
+ 2.456314355008026,
146
+ 2.621028843347318,
147
+ 2.828352346005421,
148
+ 3.141592653589793
149
+ ]
150
+ }
151
+ },
152
+ "intrinsic_config": {
153
+ "bridge_orig/1.0.0": {
154
+ "height": 480,
155
+ "intrinsic": [
156
+ [
157
+ 623.588,
158
+ 0,
159
+ 319.501
160
+ ],
161
+ [
162
+ 0,
163
+ 623.588,
164
+ 239.545
165
+ ],
166
+ [
167
+ 0,
168
+ 0,
169
+ 1
170
+ ]
171
+ ],
172
+ "width": 640
173
+ },
174
+ "default": {
175
+ "height": 480,
176
+ "intrinsic": [
177
+ [
178
+ 623.588,
179
+ 0,
180
+ 319.501
181
+ ],
182
+ [
183
+ 0,
184
+ 623.588,
185
+ 239.545
186
+ ],
187
+ [
188
+ 0,
189
+ 0,
190
+ 1
191
+ ]
192
+ ],
193
+ "width": 640
194
+ }
195
+ },
196
+ "num_obs_steps": 1,
197
+ "obs_delta": 1,
198
+ "processor_class": "SpatialVLAProcessor",
199
+ "statistics": {
200
+ "fractal20220817_data/0.1.0": {
201
+ "action": {
202
+ "mean": [
203
+ 0.006987507455050945,
204
+ 0.006265853065997362,
205
+ -0.012625162489712238,
206
+ 0.04333285242319107,
207
+ -0.005756276659667492,
208
+ 0.0009130403632298112,
209
+ 0.5354204773902893
210
+ ],
211
+ "std": [
212
+ 0.06921109557151794,
213
+ 0.05970889702439308,
214
+ 0.0735311210155487,
215
+ 0.1561058759689331,
216
+ 0.1316441297531128,
217
+ 0.14593777060508728,
218
+ 0.49711623787879944
219
+ ],
220
+ "max": [
221
+ 2.9984593391418457,
222
+ 22.09052848815918,
223
+ 2.7507524490356445,
224
+ 1.570636510848999,
225
+ 1.5321086645126343,
226
+ 1.5691522359848022,
227
+ 1.0
228
+ ],
229
+ "min": [
230
+ -2.0204520225524902,
231
+ -5.497899532318115,
232
+ -2.031663417816162,
233
+ -1.569917917251587,
234
+ -1.569892168045044,
235
+ -1.570419430732727,
236
+ 0.0
237
+ ],
238
+ "q01": [
239
+ -0.22453527510166169,
240
+ -0.14820013284683228,
241
+ -0.231589707583189,
242
+ -0.3517994859814644,
243
+ -0.4193011274933815,
244
+ -0.43643461108207704,
245
+ 0.0
246
+ ],
247
+ "q99": [
248
+ 0.17824687153100965,
249
+ 0.14938379630446405,
250
+ 0.21842354819178575,
251
+ 0.5892666035890578,
252
+ 0.35272657424211445,
253
+ 0.44796681255102094,
254
+ 1.0
255
+ ],
256
+ "mask": [
257
+ true,
258
+ true,
259
+ true,
260
+ true,
261
+ true,
262
+ true,
263
+ false
264
+ ]
265
+ },
266
+ "proprio": {
267
+ "mean": [
268
+ 0.0,
269
+ 0.0,
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0
275
+ ],
276
+ "std": [
277
+ 0.0,
278
+ 0.0,
279
+ 0.0,
280
+ 0.0,
281
+ 0.0,
282
+ 0.0,
283
+ 0.0
284
+ ],
285
+ "max": [
286
+ 0.0,
287
+ 0.0,
288
+ 0.0,
289
+ 0.0,
290
+ 0.0,
291
+ 0.0,
292
+ 0.0
293
+ ],
294
+ "min": [
295
+ 0.0,
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0
302
+ ],
303
+ "q01": [
304
+ 0.0,
305
+ 0.0,
306
+ 0.0,
307
+ 0.0,
308
+ 0.0,
309
+ 0.0,
310
+ 0.0
311
+ ],
312
+ "q99": [
313
+ 0.0,
314
+ 0.0,
315
+ 0.0,
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0
320
+ ]
321
+ },
322
+ "num_transitions": 3786400,
323
+ "num_trajectories": 87212
324
+ },
325
+ "kuka/0.1.0": {
326
+ "action": {
327
+ "mean": [
328
+ -0.00046687963185831904,
329
+ 0.00040137648466043174,
330
+ -0.0012807906605303288,
331
+ 0.0,
332
+ 0.0,
333
+ -0.037225183099508286,
334
+ 0.4131543040275574
335
+ ],
336
+ "std": [
337
+ 0.020832739770412445,
338
+ 0.029158642515540123,
339
+ 0.0642285868525505,
340
+ 0.0,
341
+ 0.0,
342
+ 0.14224639534950256,
343
+ 0.4908643662929535
344
+ ],
345
+ "max": [
346
+ 0.1697135865688324,
347
+ 0.2777623236179352,
348
+ 0.43710532784461975,
349
+ 0.0,
350
+ 0.0,
351
+ 1.9684287309646606,
352
+ 1.0
353
+ ],
354
+ "min": [
355
+ -0.159867063164711,
356
+ -0.2892282009124756,
357
+ -0.2795473635196686,
358
+ 0.0,
359
+ 0.0,
360
+ -1.9875637292861938,
361
+ 0.0
362
+ ],
363
+ "q01": [
364
+ -0.06619441494345665,
365
+ -0.08713878810405731,
366
+ -0.15083016991615295,
367
+ 0.0,
368
+ 0.0,
369
+ -0.5415697038173676,
370
+ 0.0
371
+ ],
372
+ "q99": [
373
+ 0.06601839080452929,
374
+ 0.08732476785779003,
375
+ 0.18168179214000715,
376
+ 0.0,
377
+ 0.0,
378
+ 0.2923380345106127,
379
+ 1.0
380
+ ],
381
+ "mask": [
382
+ true,
383
+ true,
384
+ true,
385
+ true,
386
+ true,
387
+ true,
388
+ false
389
+ ]
390
+ },
391
+ "proprio": {
392
+ "mean": [
393
+ 0.0,
394
+ 0.0,
395
+ 0.0,
396
+ 0.0,
397
+ 0.0,
398
+ 0.0,
399
+ 0.0
400
+ ],
401
+ "std": [
402
+ 0.0,
403
+ 0.0,
404
+ 0.0,
405
+ 0.0,
406
+ 0.0,
407
+ 0.0,
408
+ 0.0
409
+ ],
410
+ "max": [
411
+ 0.0,
412
+ 0.0,
413
+ 0.0,
414
+ 0.0,
415
+ 0.0,
416
+ 0.0,
417
+ 0.0
418
+ ],
419
+ "min": [
420
+ 0.0,
421
+ 0.0,
422
+ 0.0,
423
+ 0.0,
424
+ 0.0,
425
+ 0.0,
426
+ 0.0
427
+ ],
428
+ "q01": [
429
+ 0.0,
430
+ 0.0,
431
+ 0.0,
432
+ 0.0,
433
+ 0.0,
434
+ 0.0,
435
+ 0.0
436
+ ],
437
+ "q99": [
438
+ 0.0,
439
+ 0.0,
440
+ 0.0,
441
+ 0.0,
442
+ 0.0,
443
+ 0.0,
444
+ 0.0
445
+ ]
446
+ },
447
+ "num_transitions": 2455879,
448
+ "num_trajectories": 209880
449
+ },
450
+ "bridge_orig/1.0.0": {
451
+ "action": {
452
+ "mean": [
453
+ 0.00023341714404523373,
454
+ 0.00013004327774979174,
455
+ -0.00012762591359205544,
456
+ -0.0001556579809403047,
457
+ -0.00040393328526988626,
458
+ 0.00023558337124995887,
459
+ 0.5764582753181458
460
+ ],
461
+ "std": [
462
+ 0.009765734896063805,
463
+ 0.013689505867660046,
464
+ 0.012667152099311352,
465
+ 0.028534479439258575,
466
+ 0.03063790127635002,
467
+ 0.07691770792007446,
468
+ 0.4973658621311188
469
+ ],
470
+ "max": [
471
+ 0.41691166162490845,
472
+ 0.25864794850349426,
473
+ 0.21218234300613403,
474
+ 3.122201919555664,
475
+ 1.8618112802505493,
476
+ 6.280478477478027,
477
+ 1.0
478
+ ],
479
+ "min": [
480
+ -0.4007510244846344,
481
+ -0.13874775171279907,
482
+ -0.22553899884223938,
483
+ -3.2010786533355713,
484
+ -1.8618112802505493,
485
+ -6.279075622558594,
486
+ 0.0
487
+ ],
488
+ "q01": [
489
+ -0.02872725307941437,
490
+ -0.04170349963009357,
491
+ -0.026093858778476715,
492
+ -0.08092105075716972,
493
+ -0.09288699507713317,
494
+ -0.20718276381492615,
495
+ 0.0
496
+ ],
497
+ "q99": [
498
+ 0.028309678435325586,
499
+ 0.040855254605412394,
500
+ 0.040161586627364146,
501
+ 0.08192047759890528,
502
+ 0.07792850524187081,
503
+ 0.20382574498653397,
504
+ 1.0
505
+ ],
506
+ "mask": [
507
+ true,
508
+ true,
509
+ true,
510
+ true,
511
+ true,
512
+ true,
513
+ false
514
+ ]
515
+ },
516
+ "proprio": {
517
+ "mean": [
518
+ 0.0,
519
+ 0.0,
520
+ 0.0,
521
+ 0.0,
522
+ 0.0,
523
+ 0.0,
524
+ 0.0
525
+ ],
526
+ "std": [
527
+ 0.0,
528
+ 0.0,
529
+ 0.0,
530
+ 0.0,
531
+ 0.0,
532
+ 0.0,
533
+ 0.0
534
+ ],
535
+ "max": [
536
+ 0.0,
537
+ 0.0,
538
+ 0.0,
539
+ 0.0,
540
+ 0.0,
541
+ 0.0,
542
+ 0.0
543
+ ],
544
+ "min": [
545
+ 0.0,
546
+ 0.0,
547
+ 0.0,
548
+ 0.0,
549
+ 0.0,
550
+ 0.0,
551
+ 0.0
552
+ ],
553
+ "q01": [
554
+ 0.0,
555
+ 0.0,
556
+ 0.0,
557
+ 0.0,
558
+ 0.0,
559
+ 0.0,
560
+ 0.0
561
+ ],
562
+ "q99": [
563
+ 0.0,
564
+ 0.0,
565
+ 0.0,
566
+ 0.0,
567
+ 0.0,
568
+ 0.0,
569
+ 0.0
570
+ ]
571
+ },
572
+ "num_transitions": 2135463,
573
+ "num_trajectories": 60064
574
+ },
575
+ "taco_play/0.1.0": {
576
+ "action": {
577
+ "mean": [
578
+ -0.0038459226489067078,
579
+ 0.009671436622738838,
580
+ 0.01278059184551239,
581
+ -0.0054037850350141525,
582
+ -0.009606562554836273,
583
+ -0.0024807206355035305,
584
+ 0.4263913035392761
585
+ ],
586
+ "std": [
587
+ 0.23254045844078064,
588
+ 0.3629826307296753,
589
+ 0.2869291603565216,
590
+ 0.261770635843277,
591
+ 0.24388927221298218,
592
+ 0.5216501355171204,
593
+ 0.49469029903411865
594
+ ],
595
+ "max": [
596
+ 1.4915844202041626,
597
+ 2.1842432022094727,
598
+ 2.6836395263671875,
599
+ 5.035226821899414,
600
+ 2.665864944458008,
601
+ 4.250768661499023,
602
+ 1.0
603
+ ],
604
+ "min": [
605
+ -4.242457866668701,
606
+ -3.192805051803589,
607
+ -1.3371467590332031,
608
+ -4.202683448791504,
609
+ -2.6722638607025146,
610
+ -3.3467135429382324,
611
+ 0.0
612
+ ],
613
+ "q01": [
614
+ -0.7106140398979186,
615
+ -1.056944659948349,
616
+ -0.5878450274467468,
617
+ -0.7682853937149048,
618
+ -0.7180147767066956,
619
+ -1.5527938604354858,
620
+ 0.0
621
+ ],
622
+ "q99": [
623
+ 0.6482916426658629,
624
+ 1.0051310062408447,
625
+ 0.9480248689651489,
626
+ 0.6926478147506714,
627
+ 0.6351067513227462,
628
+ 1.628010264635086,
629
+ 1.0
630
+ ],
631
+ "mask": [
632
+ true,
633
+ true,
634
+ true,
635
+ true,
636
+ true,
637
+ true,
638
+ false
639
+ ]
640
+ },
641
+ "proprio": {
642
+ "mean": [
643
+ 0.0,
644
+ 0.0,
645
+ 0.0,
646
+ 0.0,
647
+ 0.0,
648
+ 0.0,
649
+ 0.0
650
+ ],
651
+ "std": [
652
+ 0.0,
653
+ 0.0,
654
+ 0.0,
655
+ 0.0,
656
+ 0.0,
657
+ 0.0,
658
+ 0.0
659
+ ],
660
+ "max": [
661
+ 0.0,
662
+ 0.0,
663
+ 0.0,
664
+ 0.0,
665
+ 0.0,
666
+ 0.0,
667
+ 0.0
668
+ ],
669
+ "min": [
670
+ 0.0,
671
+ 0.0,
672
+ 0.0,
673
+ 0.0,
674
+ 0.0,
675
+ 0.0,
676
+ 0.0
677
+ ],
678
+ "q01": [
679
+ 0.0,
680
+ 0.0,
681
+ 0.0,
682
+ 0.0,
683
+ 0.0,
684
+ 0.0,
685
+ 0.0
686
+ ],
687
+ "q99": [
688
+ 0.0,
689
+ 0.0,
690
+ 0.0,
691
+ 0.0,
692
+ 0.0,
693
+ 0.0,
694
+ 0.0
695
+ ]
696
+ },
697
+ "num_transitions": 237798,
698
+ "num_trajectories": 3603
699
+ },
700
+ "jaco_play/0.1.0": {
701
+ "action": {
702
+ "mean": [
703
+ 0.0009658387862145901,
704
+ -0.005800850689411163,
705
+ -0.003950685728341341,
706
+ 0.0,
707
+ 0.0,
708
+ 0.0,
709
+ 0.34934908151626587
710
+ ],
711
+ "std": [
712
+ 0.12234985828399658,
713
+ 0.09678783267736435,
714
+ 0.1115543395280838,
715
+ 0.0,
716
+ 0.0,
717
+ 0.0,
718
+ 0.47682321071624756
719
+ ],
720
+ "max": [
721
+ 0.20000000298023224,
722
+ 0.20000000298023224,
723
+ 0.20000000298023224,
724
+ 0.0,
725
+ 0.0,
726
+ 0.0,
727
+ 1.0
728
+ ],
729
+ "min": [
730
+ -0.20000000298023224,
731
+ -0.20000000298023224,
732
+ -0.20000000298023224,
733
+ 0.0,
734
+ 0.0,
735
+ 0.0,
736
+ 0.0
737
+ ],
738
+ "q01": [
739
+ -0.20000000298023224,
740
+ -0.20000000298023224,
741
+ -0.20000000298023224,
742
+ 0.0,
743
+ 0.0,
744
+ 0.0,
745
+ 0.0
746
+ ],
747
+ "q99": [
748
+ 0.20000000298023224,
749
+ 0.20000000298023224,
750
+ 0.20000000298023224,
751
+ 0.0,
752
+ 0.0,
753
+ 0.0,
754
+ 1.0
755
+ ],
756
+ "mask": [
757
+ true,
758
+ true,
759
+ true,
760
+ true,
761
+ true,
762
+ true,
763
+ false
764
+ ]
765
+ },
766
+ "proprio": {
767
+ "mean": [
768
+ 0.0,
769
+ 0.0,
770
+ 0.0,
771
+ 0.0,
772
+ 0.0,
773
+ 0.0,
774
+ 0.0
775
+ ],
776
+ "std": [
777
+ 0.0,
778
+ 0.0,
779
+ 0.0,
780
+ 0.0,
781
+ 0.0,
782
+ 0.0,
783
+ 0.0
784
+ ],
785
+ "max": [
786
+ 0.0,
787
+ 0.0,
788
+ 0.0,
789
+ 0.0,
790
+ 0.0,
791
+ 0.0,
792
+ 0.0
793
+ ],
794
+ "min": [
795
+ 0.0,
796
+ 0.0,
797
+ 0.0,
798
+ 0.0,
799
+ 0.0,
800
+ 0.0,
801
+ 0.0
802
+ ],
803
+ "q01": [
804
+ 0.0,
805
+ 0.0,
806
+ 0.0,
807
+ 0.0,
808
+ 0.0,
809
+ 0.0,
810
+ 0.0
811
+ ],
812
+ "q99": [
813
+ 0.0,
814
+ 0.0,
815
+ 0.0,
816
+ 0.0,
817
+ 0.0,
818
+ 0.0,
819
+ 0.0
820
+ ]
821
+ },
822
+ "num_transitions": 77965,
823
+ "num_trajectories": 1085
824
+ },
825
+ "berkeley_cable_routing/0.1.0": {
826
+ "action": {
827
+ "mean": [
828
+ -0.07139858603477478,
829
+ 0.023608991876244545,
830
+ 0.10241956263780594,
831
+ 0.0,
832
+ 0.0,
833
+ 0.04967105761170387,
834
+ 0.0
835
+ ],
836
+ "std": [
837
+ 0.18155010044574738,
838
+ 0.18109896779060364,
839
+ 0.21220752596855164,
840
+ 0.0,
841
+ 0.0,
842
+ 0.3475516438484192,
843
+ 0.0
844
+ ],
845
+ "max": [
846
+ 0.9633283019065857,
847
+ 1.0,
848
+ 1.0,
849
+ 0.0,
850
+ 0.0,
851
+ 1.0,
852
+ 0.0
853
+ ],
854
+ "min": [
855
+ -0.9809081554412842,
856
+ -0.9554349184036255,
857
+ -0.9994775056838989,
858
+ 0.0,
859
+ 0.0,
860
+ -1.0,
861
+ 0.0
862
+ ],
863
+ "q01": [
864
+ -0.5534318816661835,
865
+ -0.4797285574674606,
866
+ -0.5314934802055359,
867
+ 0.0,
868
+ 0.0,
869
+ -0.8855219376087189,
870
+ 0.0
871
+ ],
872
+ "q99": [
873
+ 0.42652835428714786,
874
+ 0.5000944086909298,
875
+ 0.639823433756829,
876
+ 0.0,
877
+ 0.0,
878
+ 0.984243879914284,
879
+ 0.0
880
+ ],
881
+ "mask": [
882
+ true,
883
+ true,
884
+ true,
885
+ true,
886
+ true,
887
+ true,
888
+ false
889
+ ]
890
+ },
891
+ "proprio": {
892
+ "mean": [
893
+ 0.0,
894
+ 0.0,
895
+ 0.0,
896
+ 0.0,
897
+ 0.0,
898
+ 0.0,
899
+ 0.0
900
+ ],
901
+ "std": [
902
+ 0.0,
903
+ 0.0,
904
+ 0.0,
905
+ 0.0,
906
+ 0.0,
907
+ 0.0,
908
+ 0.0
909
+ ],
910
+ "max": [
911
+ 0.0,
912
+ 0.0,
913
+ 0.0,
914
+ 0.0,
915
+ 0.0,
916
+ 0.0,
917
+ 0.0
918
+ ],
919
+ "min": [
920
+ 0.0,
921
+ 0.0,
922
+ 0.0,
923
+ 0.0,
924
+ 0.0,
925
+ 0.0,
926
+ 0.0
927
+ ],
928
+ "q01": [
929
+ 0.0,
930
+ 0.0,
931
+ 0.0,
932
+ 0.0,
933
+ 0.0,
934
+ 0.0,
935
+ 0.0
936
+ ],
937
+ "q99": [
938
+ 0.0,
939
+ 0.0,
940
+ 0.0,
941
+ 0.0,
942
+ 0.0,
943
+ 0.0,
944
+ 0.0
945
+ ]
946
+ },
947
+ "num_transitions": 42328,
948
+ "num_trajectories": 1647
949
+ },
950
+ "roboturk/0.1.0": {
951
+ "action": {
952
+ "mean": [
953
+ 0.001444889116100967,
954
+ -0.0015945355407893658,
955
+ -0.0011753803119063377,
956
+ 0.002301239175722003,
957
+ -0.0009382442804053426,
958
+ -0.00011485860886750743,
959
+ 0.5746025443077087
960
+ ],
961
+ "std": [
962
+ 0.0493537075817585,
963
+ 0.06354564428329468,
964
+ 0.06116492301225662,
965
+ 0.0955340564250946,
966
+ 0.08420011401176453,
967
+ 0.06517910957336426,
968
+ 0.4945177137851715
969
+ ],
970
+ "max": [
971
+ 0.39124172925949097,
972
+ 0.4601028263568878,
973
+ 0.4870833456516266,
974
+ 1.816888689994812,
975
+ 1.8240282535552979,
976
+ 1.4824820756912231,
977
+ 1.0
978
+ ],
979
+ "min": [
980
+ -0.6546999216079712,
981
+ -0.6365841031074524,
982
+ -0.4217723608016968,
983
+ -1.6695482730865479,
984
+ -1.8023357391357422,
985
+ -1.4630827903747559,
986
+ 0.0
987
+ ],
988
+ "q01": [
989
+ -0.1342635464668274,
990
+ -0.19996687173843383,
991
+ -0.1482972100377083,
992
+ -0.20720748245716095,
993
+ -0.09676413893699647,
994
+ -0.18075634717941286,
995
+ 0.0
996
+ ],
997
+ "q99": [
998
+ 0.14956976801157001,
999
+ 0.1805950567126275,
1000
+ 0.18841815620660796,
1001
+ 0.21615413755178453,
1002
+ 0.09457383215427405,
1003
+ 0.18543301910162005,
1004
+ 1.0
1005
+ ],
1006
+ "mask": [
1007
+ true,
1008
+ true,
1009
+ true,
1010
+ true,
1011
+ true,
1012
+ true,
1013
+ false
1014
+ ]
1015
+ },
1016
+ "proprio": {
1017
+ "mean": [
1018
+ 0.0,
1019
+ 0.0,
1020
+ 0.0,
1021
+ 0.0,
1022
+ 0.0,
1023
+ 0.0,
1024
+ 0.0
1025
+ ],
1026
+ "std": [
1027
+ 0.0,
1028
+ 0.0,
1029
+ 0.0,
1030
+ 0.0,
1031
+ 0.0,
1032
+ 0.0,
1033
+ 0.0
1034
+ ],
1035
+ "max": [
1036
+ 0.0,
1037
+ 0.0,
1038
+ 0.0,
1039
+ 0.0,
1040
+ 0.0,
1041
+ 0.0,
1042
+ 0.0
1043
+ ],
1044
+ "min": [
1045
+ 0.0,
1046
+ 0.0,
1047
+ 0.0,
1048
+ 0.0,
1049
+ 0.0,
1050
+ 0.0,
1051
+ 0.0
1052
+ ],
1053
+ "q01": [
1054
+ 0.0,
1055
+ 0.0,
1056
+ 0.0,
1057
+ 0.0,
1058
+ 0.0,
1059
+ 0.0,
1060
+ 0.0
1061
+ ],
1062
+ "q99": [
1063
+ 0.0,
1064
+ 0.0,
1065
+ 0.0,
1066
+ 0.0,
1067
+ 0.0,
1068
+ 0.0,
1069
+ 0.0
1070
+ ]
1071
+ },
1072
+ "num_transitions": 187507,
1073
+ "num_trajectories": 1995
1074
+ },
1075
+ "viola/0.1.0": {
1076
+ "action": {
1077
+ "mean": [
1078
+ 0.04761853069067001,
1079
+ -0.029204534366726875,
1080
+ 0.055867329239845276,
1081
+ -0.0026185200549662113,
1082
+ 0.006867341697216034,
1083
+ -0.016821356490254402,
1084
+ 0.7323777675628662
1085
+ ],
1086
+ "std": [
1087
+ 0.39157867431640625,
1088
+ 0.40765219926834106,
1089
+ 0.40077903866767883,
1090
+ 0.10023998469114304,
1091
+ 0.08443189412355423,
1092
+ 0.10375089943408966,
1093
+ 0.442600816488266
1094
+ ],
1095
+ "max": [
1096
+ 1.0,
1097
+ 1.0,
1098
+ 1.0,
1099
+ 0.375,
1100
+ 0.36321428418159485,
1101
+ 0.375,
1102
+ 1.0
1103
+ ],
1104
+ "min": [
1105
+ -1.0,
1106
+ -1.0,
1107
+ -1.0,
1108
+ -0.375,
1109
+ -0.375,
1110
+ -0.375,
1111
+ 0.0
1112
+ ],
1113
+ "q01": [
1114
+ -0.9628571271896362,
1115
+ -1.0,
1116
+ -1.0,
1117
+ -0.26249998807907104,
1118
+ -0.21321429312229156,
1119
+ -0.3385714292526245,
1120
+ 0.0
1121
+ ],
1122
+ "q99": [
1123
+ 0.9114285707473755,
1124
+ 0.868571400642395,
1125
+ 1.0,
1126
+ 0.2817857265472412,
1127
+ 0.2239285707473755,
1128
+ 0.3557142913341522,
1129
+ 1.0
1130
+ ],
1131
+ "mask": [
1132
+ true,
1133
+ true,
1134
+ true,
1135
+ true,
1136
+ true,
1137
+ true,
1138
+ false
1139
+ ]
1140
+ },
1141
+ "proprio": {
1142
+ "mean": [
1143
+ 0.0,
1144
+ 0.0,
1145
+ 0.0,
1146
+ 0.0,
1147
+ 0.0,
1148
+ 0.0,
1149
+ 0.0
1150
+ ],
1151
+ "std": [
1152
+ 0.0,
1153
+ 0.0,
1154
+ 0.0,
1155
+ 0.0,
1156
+ 0.0,
1157
+ 0.0,
1158
+ 0.0
1159
+ ],
1160
+ "max": [
1161
+ 0.0,
1162
+ 0.0,
1163
+ 0.0,
1164
+ 0.0,
1165
+ 0.0,
1166
+ 0.0,
1167
+ 0.0
1168
+ ],
1169
+ "min": [
1170
+ 0.0,
1171
+ 0.0,
1172
+ 0.0,
1173
+ 0.0,
1174
+ 0.0,
1175
+ 0.0,
1176
+ 0.0
1177
+ ],
1178
+ "q01": [
1179
+ 0.0,
1180
+ 0.0,
1181
+ 0.0,
1182
+ 0.0,
1183
+ 0.0,
1184
+ 0.0,
1185
+ 0.0
1186
+ ],
1187
+ "q99": [
1188
+ 0.0,
1189
+ 0.0,
1190
+ 0.0,
1191
+ 0.0,
1192
+ 0.0,
1193
+ 0.0,
1194
+ 0.0
1195
+ ]
1196
+ },
1197
+ "num_transitions": 76324,
1198
+ "num_trajectories": 150
1199
+ },
1200
+ "berkeley_autolab_ur5/0.1.0": {
1201
+ "action": {
1202
+ "mean": [
1203
+ 0.0005683613708242774,
1204
+ 0.0012176961172372103,
1205
+ -0.0005296385497786105,
1206
+ 0.00021029777417425066,
1207
+ 6.069485243642703e-05,
1208
+ 0.0012049867073073983,
1209
+ 0.6298308372497559
1210
+ ],
1211
+ "std": [
1212
+ 0.011533073149621487,
1213
+ 0.007990497164428234,
1214
+ 0.009577799588441849,
1215
+ 0.009432999417185783,
1216
+ 0.016427574679255486,
1217
+ 0.011054049246013165,
1218
+ 0.482679545879364
1219
+ ],
1220
+ "max": [
1221
+ 0.019999999552965164,
1222
+ 0.019999999552965164,
1223
+ 0.019999999552965164,
1224
+ 0.06666667014360428,
1225
+ 0.06666667014360428,
1226
+ 0.06666667014360428,
1227
+ 1.0
1228
+ ],
1229
+ "min": [
1230
+ -0.019999999552965164,
1231
+ -0.019999999552965164,
1232
+ -0.019999999552965164,
1233
+ -0.06666667014360428,
1234
+ -0.06666667014360428,
1235
+ -0.06666667014360428,
1236
+ 0.0
1237
+ ],
1238
+ "q01": [
1239
+ -0.019999999552965164,
1240
+ -0.019999999552965164,
1241
+ -0.019999999552965164,
1242
+ -0.02628571353852749,
1243
+ -0.06666667014360428,
1244
+ -0.03847619146108627,
1245
+ 0.0
1246
+ ],
1247
+ "q99": [
1248
+ 0.019999999552965164,
1249
+ 0.019999999552965164,
1250
+ 0.019999999552965164,
1251
+ 0.031809523701667786,
1252
+ 0.06666667014360428,
1253
+ 0.036571428179740906,
1254
+ 1.0
1255
+ ],
1256
+ "mask": [
1257
+ true,
1258
+ true,
1259
+ true,
1260
+ true,
1261
+ true,
1262
+ true,
1263
+ false
1264
+ ]
1265
+ },
1266
+ "proprio": {
1267
+ "mean": [
1268
+ 0.0,
1269
+ 0.0,
1270
+ 0.0,
1271
+ 0.0,
1272
+ 0.0,
1273
+ 0.0,
1274
+ 0.0
1275
+ ],
1276
+ "std": [
1277
+ 0.0,
1278
+ 0.0,
1279
+ 0.0,
1280
+ 0.0,
1281
+ 0.0,
1282
+ 0.0,
1283
+ 0.0
1284
+ ],
1285
+ "max": [
1286
+ 0.0,
1287
+ 0.0,
1288
+ 0.0,
1289
+ 0.0,
1290
+ 0.0,
1291
+ 0.0,
1292
+ 0.0
1293
+ ],
1294
+ "min": [
1295
+ 0.0,
1296
+ 0.0,
1297
+ 0.0,
1298
+ 0.0,
1299
+ 0.0,
1300
+ 0.0,
1301
+ 0.0
1302
+ ],
1303
+ "q01": [
1304
+ 0.0,
1305
+ 0.0,
1306
+ 0.0,
1307
+ 0.0,
1308
+ 0.0,
1309
+ 0.0,
1310
+ 0.0
1311
+ ],
1312
+ "q99": [
1313
+ 0.0,
1314
+ 0.0,
1315
+ 0.0,
1316
+ 0.0,
1317
+ 0.0,
1318
+ 0.0,
1319
+ 0.0
1320
+ ]
1321
+ },
1322
+ "num_transitions": 97939,
1323
+ "num_trajectories": 1000
1324
+ },
1325
+ "toto/0.1.0": {
1326
+ "action": {
1327
+ "mean": [
1328
+ 0.3854214549064636,
1329
+ 0.007769507821649313,
1330
+ 0.3632742166519165,
1331
+ -0.665202796459198,
1332
+ 0.1890396624803543,
1333
+ 0.0329875648021698,
1334
+ 0.0
1335
+ ],
1336
+ "std": [
1337
+ 0.12211630493402481,
1338
+ 0.19378569722175598,
1339
+ 0.10178232192993164,
1340
+ 0.5725256204605103,
1341
+ 0.298846036195755,
1342
+ 0.32599160075187683,
1343
+ 0.0
1344
+ ],
1345
+ "max": [
1346
+ 0.6839867234230042,
1347
+ 0.4454185664653778,
1348
+ 0.7984078526496887,
1349
+ 2.120781660079956,
1350
+ 1.371164321899414,
1351
+ 1.4118704795837402,
1352
+ 0.0
1353
+ ],
1354
+ "min": [
1355
+ 0.09922284632921219,
1356
+ -0.5180193781852722,
1357
+ 0.13791072368621826,
1358
+ -2.635117530822754,
1359
+ -1.0734480619430542,
1360
+ -1.9282547235488892,
1361
+ 0.0
1362
+ ],
1363
+ "q01": [
1364
+ 0.1756722891330719,
1365
+ -0.3077590811252594,
1366
+ 0.235383919775486,
1367
+ -2.0908505964279174,
1368
+ -0.6191593289375306,
1369
+ -0.7488683319091797,
1370
+ 0.0
1371
+ ],
1372
+ "q99": [
1373
+ 0.6136963081359863,
1374
+ 0.33704194784164443,
1375
+ 0.6681221985816956,
1376
+ 0.7422861719131538,
1377
+ 0.7955395007133507,
1378
+ 0.740464625358582,
1379
+ 0.0
1380
+ ],
1381
+ "mask": [
1382
+ true,
1383
+ true,
1384
+ true,
1385
+ true,
1386
+ true,
1387
+ true,
1388
+ false
1389
+ ]
1390
+ },
1391
+ "proprio": {
1392
+ "mean": [
1393
+ 0.0,
1394
+ 0.0,
1395
+ 0.0,
1396
+ 0.0,
1397
+ 0.0,
1398
+ 0.0,
1399
+ 0.0
1400
+ ],
1401
+ "std": [
1402
+ 0.0,
1403
+ 0.0,
1404
+ 0.0,
1405
+ 0.0,
1406
+ 0.0,
1407
+ 0.0,
1408
+ 0.0
1409
+ ],
1410
+ "max": [
1411
+ 0.0,
1412
+ 0.0,
1413
+ 0.0,
1414
+ 0.0,
1415
+ 0.0,
1416
+ 0.0,
1417
+ 0.0
1418
+ ],
1419
+ "min": [
1420
+ 0.0,
1421
+ 0.0,
1422
+ 0.0,
1423
+ 0.0,
1424
+ 0.0,
1425
+ 0.0,
1426
+ 0.0
1427
+ ],
1428
+ "q01": [
1429
+ 0.0,
1430
+ 0.0,
1431
+ 0.0,
1432
+ 0.0,
1433
+ 0.0,
1434
+ 0.0,
1435
+ 0.0
1436
+ ],
1437
+ "q99": [
1438
+ 0.0,
1439
+ 0.0,
1440
+ 0.0,
1441
+ 0.0,
1442
+ 0.0,
1443
+ 0.0,
1444
+ 0.0
1445
+ ]
1446
+ },
1447
+ "num_transitions": 325699,
1448
+ "num_trajectories": 1003
1449
+ },
1450
+ "language_table/0.1.0": {
1451
+ "action": {
1452
+ "mean": [
1453
+ 0.00014891766477376223,
1454
+ -0.0005636657006107271,
1455
+ 0.0,
1456
+ 0.0,
1457
+ 0.0,
1458
+ 0.0,
1459
+ 1.0
1460
+ ],
1461
+ "std": [
1462
+ 0.030162859708070755,
1463
+ 0.04230763390660286,
1464
+ 0.0,
1465
+ 0.0,
1466
+ 0.0,
1467
+ 0.0,
1468
+ 0.0
1469
+ ],
1470
+ "max": [
1471
+ 0.23357294499874115,
1472
+ 0.24496802687644958,
1473
+ 0.0,
1474
+ 0.0,
1475
+ 0.0,
1476
+ 0.0,
1477
+ 1.0
1478
+ ],
1479
+ "min": [
1480
+ -0.21989956498146057,
1481
+ -0.23736150562763214,
1482
+ 0.0,
1483
+ 0.0,
1484
+ 0.0,
1485
+ 0.0,
1486
+ 1.0
1487
+ ],
1488
+ "q01": [
1489
+ -0.08179590478539467,
1490
+ -0.11795833334326744,
1491
+ 0.0,
1492
+ 0.0,
1493
+ 0.0,
1494
+ 0.0,
1495
+ 1.0
1496
+ ],
1497
+ "q99": [
1498
+ 0.08822273463010788,
1499
+ 0.1191693339496851,
1500
+ 0.0,
1501
+ 0.0,
1502
+ 0.0,
1503
+ 0.0,
1504
+ 1.0
1505
+ ],
1506
+ "mask": [
1507
+ true,
1508
+ true,
1509
+ true,
1510
+ true,
1511
+ true,
1512
+ true,
1513
+ false
1514
+ ]
1515
+ },
1516
+ "proprio": {
1517
+ "mean": [
1518
+ 0.0,
1519
+ 0.0,
1520
+ 0.0,
1521
+ 0.0,
1522
+ 0.0,
1523
+ 0.0,
1524
+ 0.0
1525
+ ],
1526
+ "std": [
1527
+ 0.0,
1528
+ 0.0,
1529
+ 0.0,
1530
+ 0.0,
1531
+ 0.0,
1532
+ 0.0,
1533
+ 0.0
1534
+ ],
1535
+ "max": [
1536
+ 0.0,
1537
+ 0.0,
1538
+ 0.0,
1539
+ 0.0,
1540
+ 0.0,
1541
+ 0.0,
1542
+ 0.0
1543
+ ],
1544
+ "min": [
1545
+ 0.0,
1546
+ 0.0,
1547
+ 0.0,
1548
+ 0.0,
1549
+ 0.0,
1550
+ 0.0,
1551
+ 0.0
1552
+ ],
1553
+ "q01": [
1554
+ 0.0,
1555
+ 0.0,
1556
+ 0.0,
1557
+ 0.0,
1558
+ 0.0,
1559
+ 0.0,
1560
+ 0.0
1561
+ ],
1562
+ "q99": [
1563
+ 0.0,
1564
+ 0.0,
1565
+ 0.0,
1566
+ 0.0,
1567
+ 0.0,
1568
+ 0.0,
1569
+ 0.0
1570
+ ]
1571
+ },
1572
+ "num_transitions": 7045476,
1573
+ "num_trajectories": 442226
1574
+ },
1575
+ "stanford_hydra_dataset_converted_externally_to_rlds/0.1.0": {
1576
+ "action": {
1577
+ "mean": [
1578
+ 0.0007790043600834906,
1579
+ 0.00013707877951674163,
1580
+ -0.000254859565757215,
1581
+ 0.0012903243768960238,
1582
+ -0.004751724191009998,
1583
+ 0.002692892448976636,
1584
+ 0.48855218291282654
1585
+ ],
1586
+ "std": [
1587
+ 0.008022183552384377,
1588
+ 0.009131456725299358,
1589
+ 0.00957438349723816,
1590
+ 0.04122224077582359,
1591
+ 0.03843001648783684,
1592
+ 0.046067025512456894,
1593
+ 0.49978113174438477
1594
+ ],
1595
+ "max": [
1596
+ 0.02499854564666748,
1597
+ 0.02499903365969658,
1598
+ 0.024999922141432762,
1599
+ 0.24974457919597626,
1600
+ 0.24997030198574066,
1601
+ 0.24999946355819702,
1602
+ 1.0
1603
+ ],
1604
+ "min": [
1605
+ -0.024999044835567474,
1606
+ -0.024999700486660004,
1607
+ -0.02499929815530777,
1608
+ -0.24993225932121277,
1609
+ -0.2499666064977646,
1610
+ -0.2499932497739792,
1611
+ 0.0
1612
+ ],
1613
+ "q01": [
1614
+ -0.019992006458342076,
1615
+ -0.02415412735193968,
1616
+ -0.022941758055239916,
1617
+ -0.11085530579090118,
1618
+ -0.12024572037160397,
1619
+ -0.13314770206809043,
1620
+ 0.0
1621
+ ],
1622
+ "q99": [
1623
+ 0.022886231057345868,
1624
+ 0.022358838934451335,
1625
+ 0.02410089675337076,
1626
+ 0.12370114490389822,
1627
+ 0.11323311634361738,
1628
+ 0.18474749639630164,
1629
+ 1.0
1630
+ ],
1631
+ "mask": [
1632
+ true,
1633
+ true,
1634
+ true,
1635
+ true,
1636
+ true,
1637
+ true,
1638
+ false
1639
+ ]
1640
+ },
1641
+ "proprio": {
1642
+ "mean": [
1643
+ 0.0,
1644
+ 0.0,
1645
+ 0.0,
1646
+ 0.0,
1647
+ 0.0,
1648
+ 0.0,
1649
+ 0.0
1650
+ ],
1651
+ "std": [
1652
+ 0.0,
1653
+ 0.0,
1654
+ 0.0,
1655
+ 0.0,
1656
+ 0.0,
1657
+ 0.0,
1658
+ 0.0
1659
+ ],
1660
+ "max": [
1661
+ 0.0,
1662
+ 0.0,
1663
+ 0.0,
1664
+ 0.0,
1665
+ 0.0,
1666
+ 0.0,
1667
+ 0.0
1668
+ ],
1669
+ "min": [
1670
+ 0.0,
1671
+ 0.0,
1672
+ 0.0,
1673
+ 0.0,
1674
+ 0.0,
1675
+ 0.0,
1676
+ 0.0
1677
+ ],
1678
+ "q01": [
1679
+ 0.0,
1680
+ 0.0,
1681
+ 0.0,
1682
+ 0.0,
1683
+ 0.0,
1684
+ 0.0,
1685
+ 0.0
1686
+ ],
1687
+ "q99": [
1688
+ 0.0,
1689
+ 0.0,
1690
+ 0.0,
1691
+ 0.0,
1692
+ 0.0,
1693
+ 0.0,
1694
+ 0.0
1695
+ ]
1696
+ },
1697
+ "num_transitions": 358234,
1698
+ "num_trajectories": 570
1699
+ },
1700
+ "austin_buds_dataset_converted_externally_to_rlds/0.1.0": {
1701
+ "action": {
1702
+ "mean": [
1703
+ -0.07678329944610596,
1704
+ 0.0036849123425781727,
1705
+ 0.05644941329956055,
1706
+ 0.0,
1707
+ 0.0,
1708
+ 0.0,
1709
+ 0.3510494828224182
1710
+ ],
1711
+ "std": [
1712
+ 0.6367746591567993,
1713
+ 0.3788914680480957,
1714
+ 0.47796377539634705,
1715
+ 0.0,
1716
+ 0.0,
1717
+ 0.0,
1718
+ 0.4772108495235443
1719
+ ],
1720
+ "max": [
1721
+ 1.0,
1722
+ 1.0,
1723
+ 1.0,
1724
+ 0.0,
1725
+ 0.0,
1726
+ 0.0,
1727
+ 1.0
1728
+ ],
1729
+ "min": [
1730
+ -1.0,
1731
+ -1.0,
1732
+ -1.0,
1733
+ 0.0,
1734
+ 0.0,
1735
+ 0.0,
1736
+ 0.0
1737
+ ],
1738
+ "q01": [
1739
+ -1.0,
1740
+ -0.9599999785423279,
1741
+ -0.8714285492897034,
1742
+ 0.0,
1743
+ 0.0,
1744
+ 0.0,
1745
+ 0.0
1746
+ ],
1747
+ "q99": [
1748
+ 1.0,
1749
+ 0.8600000143051147,
1750
+ 1.0,
1751
+ 0.0,
1752
+ 0.0,
1753
+ 0.0,
1754
+ 1.0
1755
+ ],
1756
+ "mask": [
1757
+ true,
1758
+ true,
1759
+ true,
1760
+ true,
1761
+ true,
1762
+ true,
1763
+ false
1764
+ ]
1765
+ },
1766
+ "proprio": {
1767
+ "mean": [
1768
+ 0.0,
1769
+ 0.0,
1770
+ 0.0,
1771
+ 0.0,
1772
+ 0.0,
1773
+ 0.0,
1774
+ 0.0
1775
+ ],
1776
+ "std": [
1777
+ 0.0,
1778
+ 0.0,
1779
+ 0.0,
1780
+ 0.0,
1781
+ 0.0,
1782
+ 0.0,
1783
+ 0.0
1784
+ ],
1785
+ "max": [
1786
+ 0.0,
1787
+ 0.0,
1788
+ 0.0,
1789
+ 0.0,
1790
+ 0.0,
1791
+ 0.0,
1792
+ 0.0
1793
+ ],
1794
+ "min": [
1795
+ 0.0,
1796
+ 0.0,
1797
+ 0.0,
1798
+ 0.0,
1799
+ 0.0,
1800
+ 0.0,
1801
+ 0.0
1802
+ ],
1803
+ "q01": [
1804
+ 0.0,
1805
+ 0.0,
1806
+ 0.0,
1807
+ 0.0,
1808
+ 0.0,
1809
+ 0.0,
1810
+ 0.0
1811
+ ],
1812
+ "q99": [
1813
+ 0.0,
1814
+ 0.0,
1815
+ 0.0,
1816
+ 0.0,
1817
+ 0.0,
1818
+ 0.0,
1819
+ 0.0
1820
+ ]
1821
+ },
1822
+ "num_transitions": 34112,
1823
+ "num_trajectories": 50
1824
+ },
1825
+ "nyu_franka_play_dataset_converted_externally_to_rlds/0.1.0": {
1826
+ "action": {
1827
+ "mean": [
1828
+ 0.0010219910182058811,
1829
+ -0.00012002632865915075,
1830
+ 0.00032894135802052915,
1831
+ 0.0015034276293590665,
1832
+ -0.002198528265580535,
1833
+ -0.0016632305923849344,
1834
+ 0.7230083346366882
1835
+ ],
1836
+ "std": [
1837
+ 0.013274150900542736,
1838
+ 0.013215919025242329,
1839
+ 0.01282210648059845,
1840
+ 0.27324533462524414,
1841
+ 0.05702253058552742,
1842
+ 0.03917279839515686,
1843
+ 0.44753193855285645
1844
+ ],
1845
+ "max": [
1846
+ 0.06424188613891602,
1847
+ 0.07027634978294373,
1848
+ 0.06129661202430725,
1849
+ 6.281067848205566,
1850
+ 0.1967729926109314,
1851
+ 0.26377415657043457,
1852
+ 1.0
1853
+ ],
1854
+ "min": [
1855
+ -0.05952230095863342,
1856
+ -0.07232445478439331,
1857
+ -0.06730806827545166,
1858
+ -6.278434753417969,
1859
+ -0.21479034423828125,
1860
+ -0.3627619743347168,
1861
+ 0.0
1862
+ ],
1863
+ "q01": [
1864
+ -0.03199600875377655,
1865
+ -0.032861671447753905,
1866
+ -0.03368805110454559,
1867
+ -0.12080862045288086,
1868
+ -0.12175218224525451,
1869
+ -0.11370223641395569,
1870
+ 0.0
1871
+ ],
1872
+ "q99": [
1873
+ 0.03101520001888276,
1874
+ 0.0373908892273903,
1875
+ 0.03646374464035038,
1876
+ 0.11764093399047852,
1877
+ 0.1258920183777809,
1878
+ 0.09366151213645942,
1879
+ 1.0
1880
+ ],
1881
+ "mask": [
1882
+ true,
1883
+ true,
1884
+ true,
1885
+ true,
1886
+ true,
1887
+ true,
1888
+ false
1889
+ ]
1890
+ },
1891
+ "proprio": {
1892
+ "mean": [
1893
+ 0.0,
1894
+ 0.0,
1895
+ 0.0,
1896
+ 0.0,
1897
+ 0.0,
1898
+ 0.0,
1899
+ 0.0
1900
+ ],
1901
+ "std": [
1902
+ 0.0,
1903
+ 0.0,
1904
+ 0.0,
1905
+ 0.0,
1906
+ 0.0,
1907
+ 0.0,
1908
+ 0.0
1909
+ ],
1910
+ "max": [
1911
+ 0.0,
1912
+ 0.0,
1913
+ 0.0,
1914
+ 0.0,
1915
+ 0.0,
1916
+ 0.0,
1917
+ 0.0
1918
+ ],
1919
+ "min": [
1920
+ 0.0,
1921
+ 0.0,
1922
+ 0.0,
1923
+ 0.0,
1924
+ 0.0,
1925
+ 0.0,
1926
+ 0.0
1927
+ ],
1928
+ "q01": [
1929
+ 0.0,
1930
+ 0.0,
1931
+ 0.0,
1932
+ 0.0,
1933
+ 0.0,
1934
+ 0.0,
1935
+ 0.0
1936
+ ],
1937
+ "q99": [
1938
+ 0.0,
1939
+ 0.0,
1940
+ 0.0,
1941
+ 0.0,
1942
+ 0.0,
1943
+ 0.0,
1944
+ 0.0
1945
+ ]
1946
+ },
1947
+ "num_transitions": 44875,
1948
+ "num_trajectories": 456
1949
+ },
1950
+ "furniture_bench_dataset_converted_externally_to_rlds/0.1.0": {
1951
+ "action": {
1952
+ "mean": [
1953
+ 0.0001461071806261316,
1954
+ 0.0010830992832779884,
1955
+ 0.0006224963581189513,
1956
+ -0.0033032014034688473,
1957
+ -0.002688060747459531,
1958
+ 0.018242614343762398,
1959
+ 0.48854944109916687
1960
+ ],
1961
+ "std": [
1962
+ 0.016107233241200447,
1963
+ 0.014891570433974266,
1964
+ 0.014014236629009247,
1965
+ 0.05827433615922928,
1966
+ 0.11417083442211151,
1967
+ 0.33479660749435425,
1968
+ 0.4999157190322876
1969
+ ],
1970
+ "max": [
1971
+ 0.10000000149011612,
1972
+ 0.10000000149011612,
1973
+ 0.10000000149011612,
1974
+ 0.8651833534240723,
1975
+ 1.0909736156463623,
1976
+ 2.863185405731201,
1977
+ 1.0
1978
+ ],
1979
+ "min": [
1980
+ -0.10495579987764359,
1981
+ -0.10939455777406693,
1982
+ -0.10000000149011612,
1983
+ -0.971906840801239,
1984
+ -1.0475432872772217,
1985
+ -3.06000018119812,
1986
+ 0.0
1987
+ ],
1988
+ "q01": [
1989
+ -0.053988199681043625,
1990
+ -0.05049169331789017,
1991
+ -0.032499241530895236,
1992
+ -0.1953887003660202,
1993
+ -0.41674559473991396,
1994
+ -0.8886768388748169,
1995
+ 0.0
1996
+ ],
1997
+ "q99": [
1998
+ 0.05414841488003723,
1999
+ 0.04965164884924884,
2000
+ 0.060055799782276154,
2001
+ 0.18231668293476103,
2002
+ 0.39867786407470646,
2003
+ 0.8772023963928218,
2004
+ 1.0
2005
+ ],
2006
+ "mask": [
2007
+ true,
2008
+ true,
2009
+ true,
2010
+ true,
2011
+ true,
2012
+ true,
2013
+ false
2014
+ ]
2015
+ },
2016
+ "proprio": {
2017
+ "mean": [
2018
+ 0.0,
2019
+ 0.0,
2020
+ 0.0,
2021
+ 0.0,
2022
+ 0.0,
2023
+ 0.0,
2024
+ 0.0
2025
+ ],
2026
+ "std": [
2027
+ 0.0,
2028
+ 0.0,
2029
+ 0.0,
2030
+ 0.0,
2031
+ 0.0,
2032
+ 0.0,
2033
+ 0.0
2034
+ ],
2035
+ "max": [
2036
+ 0.0,
2037
+ 0.0,
2038
+ 0.0,
2039
+ 0.0,
2040
+ 0.0,
2041
+ 0.0,
2042
+ 0.0
2043
+ ],
2044
+ "min": [
2045
+ 0.0,
2046
+ 0.0,
2047
+ 0.0,
2048
+ 0.0,
2049
+ 0.0,
2050
+ 0.0,
2051
+ 0.0
2052
+ ],
2053
+ "q01": [
2054
+ 0.0,
2055
+ 0.0,
2056
+ 0.0,
2057
+ 0.0,
2058
+ 0.0,
2059
+ 0.0,
2060
+ 0.0
2061
+ ],
2062
+ "q99": [
2063
+ 0.0,
2064
+ 0.0,
2065
+ 0.0,
2066
+ 0.0,
2067
+ 0.0,
2068
+ 0.0,
2069
+ 0.0
2070
+ ]
2071
+ },
2072
+ "num_transitions": 3948057,
2073
+ "num_trajectories": 5100
2074
+ },
2075
+ "ucsd_kitchen_dataset_converted_externally_to_rlds/0.1.0": {
2076
+ "action": {
2077
+ "mean": [
2078
+ 410.375732421875,
2079
+ 116.9518814086914,
2080
+ 192.35031127929688,
2081
+ -121.22441864013672,
2082
+ -33.84892654418945,
2083
+ 50.016136169433594,
2084
+ 0.741813600063324
2085
+ ],
2086
+ "std": [
2087
+ 122.81488037109375,
2088
+ 108.80094909667969,
2089
+ 130.30345153808594,
2090
+ 116.2820053100586,
2091
+ 27.62191390991211,
2092
+ 41.02091979980469,
2093
+ 0.4376337230205536
2094
+ ],
2095
+ "max": [
2096
+ 678.0,
2097
+ 400.0,
2098
+ 507.0,
2099
+ 180.00001525878906,
2100
+ 6.000013828277588,
2101
+ 116.99998474121094,
2102
+ 1.0
2103
+ ],
2104
+ "min": [
2105
+ 172.0,
2106
+ -166.0,
2107
+ -99.99999237060547,
2108
+ -180.00001525878906,
2109
+ -89.0,
2110
+ -96.00010681152344,
2111
+ 0.0
2112
+ ],
2113
+ "q01": [
2114
+ 200.00001052856445,
2115
+ -102.31004211425781,
2116
+ -94.99993370056153,
2117
+ -180.00001525878906,
2118
+ -88.00001525878906,
2119
+ -38.999977111816406,
2120
+ 0.0
2121
+ ],
2122
+ "q99": [
2123
+ 637.0,
2124
+ 368.30999999999995,
2125
+ 493.0,
2126
+ 180.00001525878906,
2127
+ 0.999983012676239,
2128
+ 105.00001525878906,
2129
+ 1.0
2130
+ ],
2131
+ "mask": [
2132
+ true,
2133
+ true,
2134
+ true,
2135
+ true,
2136
+ true,
2137
+ true,
2138
+ false
2139
+ ]
2140
+ },
2141
+ "proprio": {
2142
+ "mean": [
2143
+ 0.0,
2144
+ 0.0,
2145
+ 0.0,
2146
+ 0.0,
2147
+ 0.0,
2148
+ 0.0,
2149
+ 0.0
2150
+ ],
2151
+ "std": [
2152
+ 0.0,
2153
+ 0.0,
2154
+ 0.0,
2155
+ 0.0,
2156
+ 0.0,
2157
+ 0.0,
2158
+ 0.0
2159
+ ],
2160
+ "max": [
2161
+ 0.0,
2162
+ 0.0,
2163
+ 0.0,
2164
+ 0.0,
2165
+ 0.0,
2166
+ 0.0,
2167
+ 0.0
2168
+ ],
2169
+ "min": [
2170
+ 0.0,
2171
+ 0.0,
2172
+ 0.0,
2173
+ 0.0,
2174
+ 0.0,
2175
+ 0.0,
2176
+ 0.0
2177
+ ],
2178
+ "q01": [
2179
+ 0.0,
2180
+ 0.0,
2181
+ 0.0,
2182
+ 0.0,
2183
+ 0.0,
2184
+ 0.0,
2185
+ 0.0
2186
+ ],
2187
+ "q99": [
2188
+ 0.0,
2189
+ 0.0,
2190
+ 0.0,
2191
+ 0.0,
2192
+ 0.0,
2193
+ 0.0,
2194
+ 0.0
2195
+ ]
2196
+ },
2197
+ "num_transitions": 3970,
2198
+ "num_trajectories": 150
2199
+ },
2200
+ "austin_sailor_dataset_converted_externally_to_rlds/0.1.0": {
2201
+ "action": {
2202
+ "mean": [
2203
+ 0.011825386434793472,
2204
+ 0.0064610871486365795,
2205
+ 0.060236409306526184,
2206
+ 0.0,
2207
+ 0.0,
2208
+ 0.0016465834341943264,
2209
+ 0.5260950326919556
2210
+ ],
2211
+ "std": [
2212
+ 0.46348854899406433,
2213
+ 0.41240164637565613,
2214
+ 0.41186293959617615,
2215
+ 0.0,
2216
+ 0.0,
2217
+ 0.0578608438372612,
2218
+ 0.49893733859062195
2219
+ ],
2220
+ "max": [
2221
+ 1.0,
2222
+ 1.0,
2223
+ 1.0,
2224
+ 0.0,
2225
+ 0.0,
2226
+ 0.375,
2227
+ 1.0
2228
+ ],
2229
+ "min": [
2230
+ -1.0,
2231
+ -1.0,
2232
+ -1.0,
2233
+ 0.0,
2234
+ 0.0,
2235
+ -0.375,
2236
+ 0.0
2237
+ ],
2238
+ "q01": [
2239
+ -1.0,
2240
+ -0.9828571677207947,
2241
+ -0.6000000238418579,
2242
+ 0.0,
2243
+ 0.0,
2244
+ -0.17249999940395355,
2245
+ 0.0
2246
+ ],
2247
+ "q99": [
2248
+ 1.0,
2249
+ 0.9457142949104309,
2250
+ 1.0,
2251
+ 0.0,
2252
+ 0.0,
2253
+ 0.17892856895923615,
2254
+ 1.0
2255
+ ],
2256
+ "mask": [
2257
+ true,
2258
+ true,
2259
+ true,
2260
+ true,
2261
+ true,
2262
+ true,
2263
+ false
2264
+ ]
2265
+ },
2266
+ "proprio": {
2267
+ "mean": [
2268
+ 0.0,
2269
+ 0.0,
2270
+ 0.0,
2271
+ 0.0,
2272
+ 0.0,
2273
+ 0.0,
2274
+ 0.0
2275
+ ],
2276
+ "std": [
2277
+ 0.0,
2278
+ 0.0,
2279
+ 0.0,
2280
+ 0.0,
2281
+ 0.0,
2282
+ 0.0,
2283
+ 0.0
2284
+ ],
2285
+ "max": [
2286
+ 0.0,
2287
+ 0.0,
2288
+ 0.0,
2289
+ 0.0,
2290
+ 0.0,
2291
+ 0.0,
2292
+ 0.0
2293
+ ],
2294
+ "min": [
2295
+ 0.0,
2296
+ 0.0,
2297
+ 0.0,
2298
+ 0.0,
2299
+ 0.0,
2300
+ 0.0,
2301
+ 0.0
2302
+ ],
2303
+ "q01": [
2304
+ 0.0,
2305
+ 0.0,
2306
+ 0.0,
2307
+ 0.0,
2308
+ 0.0,
2309
+ 0.0,
2310
+ 0.0
2311
+ ],
2312
+ "q99": [
2313
+ 0.0,
2314
+ 0.0,
2315
+ 0.0,
2316
+ 0.0,
2317
+ 0.0,
2318
+ 0.0,
2319
+ 0.0
2320
+ ]
2321
+ },
2322
+ "num_transitions": 353094,
2323
+ "num_trajectories": 240
2324
+ },
2325
+ "austin_sirius_dataset_converted_externally_to_rlds/0.1.0": {
2326
+ "action": {
2327
+ "mean": [
2328
+ 0.077476866543293,
2329
+ 0.031955525279045105,
2330
+ 0.04244735836982727,
2331
+ 0.0,
2332
+ 0.0,
2333
+ -0.01603454165160656,
2334
+ 0.43260180950164795
2335
+ ],
2336
+ "std": [
2337
+ 0.3906330168247223,
2338
+ 0.2998153865337372,
2339
+ 0.2782270312309265,
2340
+ 0.0,
2341
+ 0.0,
2342
+ 0.08120641857385635,
2343
+ 0.49528202414512634
2344
+ ],
2345
+ "max": [
2346
+ 1.0002285242080688,
2347
+ 0.960608720779419,
2348
+ 1.105179786682129,
2349
+ 0.0,
2350
+ 0.0,
2351
+ 0.341785728931427,
2352
+ 1.0
2353
+ ],
2354
+ "min": [
2355
+ -1.0183025598526,
2356
+ -0.9800000190734863,
2357
+ -0.9774575233459473,
2358
+ 0.0,
2359
+ 0.0,
2360
+ -0.34607142210006714,
2361
+ 0.0
2362
+ ],
2363
+ "q01": [
2364
+ -0.780905865430832,
2365
+ -0.5667179036140442,
2366
+ -0.5254343223571777,
2367
+ 0.0,
2368
+ 0.0,
2369
+ -0.28495091378688814,
2370
+ 0.0
2371
+ ],
2372
+ "q99": [
2373
+ 0.9569637751579284,
2374
+ 0.6971374487876891,
2375
+ 0.8124888157844541,
2376
+ 0.0,
2377
+ 0.0,
2378
+ 0.1971428543329239,
2379
+ 1.0
2380
+ ],
2381
+ "mask": [
2382
+ true,
2383
+ true,
2384
+ true,
2385
+ true,
2386
+ true,
2387
+ true,
2388
+ false
2389
+ ]
2390
+ },
2391
+ "proprio": {
2392
+ "mean": [
2393
+ 0.0,
2394
+ 0.0,
2395
+ 0.0,
2396
+ 0.0,
2397
+ 0.0,
2398
+ 0.0,
2399
+ 0.0
2400
+ ],
2401
+ "std": [
2402
+ 0.0,
2403
+ 0.0,
2404
+ 0.0,
2405
+ 0.0,
2406
+ 0.0,
2407
+ 0.0,
2408
+ 0.0
2409
+ ],
2410
+ "max": [
2411
+ 0.0,
2412
+ 0.0,
2413
+ 0.0,
2414
+ 0.0,
2415
+ 0.0,
2416
+ 0.0,
2417
+ 0.0
2418
+ ],
2419
+ "min": [
2420
+ 0.0,
2421
+ 0.0,
2422
+ 0.0,
2423
+ 0.0,
2424
+ 0.0,
2425
+ 0.0,
2426
+ 0.0
2427
+ ],
2428
+ "q01": [
2429
+ 0.0,
2430
+ 0.0,
2431
+ 0.0,
2432
+ 0.0,
2433
+ 0.0,
2434
+ 0.0,
2435
+ 0.0
2436
+ ],
2437
+ "q99": [
2438
+ 0.0,
2439
+ 0.0,
2440
+ 0.0,
2441
+ 0.0,
2442
+ 0.0,
2443
+ 0.0,
2444
+ 0.0
2445
+ ]
2446
+ },
2447
+ "num_transitions": 279939,
2448
+ "num_trajectories": 559
2449
+ },
2450
+ "dlr_edan_shared_control_converted_externally_to_rlds/0.1.0": {
2451
+ "action": {
2452
+ "mean": [
2453
+ 0.0066478196531534195,
2454
+ -0.0007657355745323002,
2455
+ 0.006522845011204481,
2456
+ 0.0011679773451760411,
2457
+ -0.006395624950528145,
2458
+ -0.011903021484613419,
2459
+ 0.6985887289047241
2460
+ ],
2461
+ "std": [
2462
+ 0.021393585950136185,
2463
+ 0.018142299726605415,
2464
+ 0.03374377265572548,
2465
+ 0.01743541844189167,
2466
+ 0.03394372761249542,
2467
+ 0.04641878604888916,
2468
+ 0.45885783433914185
2469
+ ],
2470
+ "max": [
2471
+ 0.18991442024707794,
2472
+ 0.0739002525806427,
2473
+ 0.18064819276332855,
2474
+ 0.0866486132144928,
2475
+ 0.13464981317520142,
2476
+ 0.16910280287265778,
2477
+ 1.0
2478
+ ],
2479
+ "min": [
2480
+ -0.10054297000169754,
2481
+ -0.08427435159683228,
2482
+ -0.13533438742160797,
2483
+ -0.17556548118591309,
2484
+ -0.18485672771930695,
2485
+ -0.2680685818195343,
2486
+ 0.0
2487
+ ],
2488
+ "q01": [
2489
+ -0.02987122368067503,
2490
+ -0.06013262912631035,
2491
+ -0.08286409199237824,
2492
+ -0.05924444157630205,
2493
+ -0.15986866518855095,
2494
+ -0.15636983573436739,
2495
+ 0.0
2496
+ ],
2497
+ "q99": [
2498
+ 0.08832092039287087,
2499
+ 0.042126184627413736,
2500
+ 0.11311905644834042,
2501
+ 0.0643695573508739,
2502
+ 0.03941855944693088,
2503
+ 0.156646853685379,
2504
+ 1.0
2505
+ ],
2506
+ "mask": [
2507
+ true,
2508
+ true,
2509
+ true,
2510
+ true,
2511
+ true,
2512
+ true,
2513
+ false
2514
+ ]
2515
+ },
2516
+ "proprio": {
2517
+ "mean": [
2518
+ 0.0,
2519
+ 0.0,
2520
+ 0.0,
2521
+ 0.0,
2522
+ 0.0,
2523
+ 0.0,
2524
+ 0.0
2525
+ ],
2526
+ "std": [
2527
+ 0.0,
2528
+ 0.0,
2529
+ 0.0,
2530
+ 0.0,
2531
+ 0.0,
2532
+ 0.0,
2533
+ 0.0
2534
+ ],
2535
+ "max": [
2536
+ 0.0,
2537
+ 0.0,
2538
+ 0.0,
2539
+ 0.0,
2540
+ 0.0,
2541
+ 0.0,
2542
+ 0.0
2543
+ ],
2544
+ "min": [
2545
+ 0.0,
2546
+ 0.0,
2547
+ 0.0,
2548
+ 0.0,
2549
+ 0.0,
2550
+ 0.0,
2551
+ 0.0
2552
+ ],
2553
+ "q01": [
2554
+ 0.0,
2555
+ 0.0,
2556
+ 0.0,
2557
+ 0.0,
2558
+ 0.0,
2559
+ 0.0,
2560
+ 0.0
2561
+ ],
2562
+ "q99": [
2563
+ 0.0,
2564
+ 0.0,
2565
+ 0.0,
2566
+ 0.0,
2567
+ 0.0,
2568
+ 0.0,
2569
+ 0.0
2570
+ ]
2571
+ },
2572
+ "num_transitions": 8928,
2573
+ "num_trajectories": 104
2574
+ },
2575
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds/0.1.0": {
2576
+ "action": {
2577
+ "mean": [
2578
+ 0.5274373292922974,
2579
+ 0.028582017868757248,
2580
+ 0.18712472915649414,
2581
+ 1.2339569330215454,
2582
+ 0.03226622939109802,
2583
+ -1.4199472665786743,
2584
+ 0.5550631880760193
2585
+ ],
2586
+ "std": [
2587
+ 0.08108346909284592,
2588
+ 0.1116756722331047,
2589
+ 0.07747555524110794,
2590
+ 2.8737244606018066,
2591
+ 0.02774704433977604,
2592
+ 2.7678685188293457,
2593
+ 0.4969509243965149
2594
+ ],
2595
+ "max": [
2596
+ 0.6634981632232666,
2597
+ 0.23428471386432648,
2598
+ 0.4308285415172577,
2599
+ 3.1415927410125732,
2600
+ 0.13647015392780304,
2601
+ 3.141592502593994,
2602
+ 1.0
2603
+ ],
2604
+ "min": [
2605
+ 0.3071657121181488,
2606
+ -0.29754969477653503,
2607
+ 0.06578229367733002,
2608
+ -3.1415927410125732,
2609
+ -0.04584203287959099,
2610
+ -3.141592502593994,
2611
+ 0.0
2612
+ ],
2613
+ "q01": [
2614
+ 0.3148897051811218,
2615
+ -0.20317550599575043,
2616
+ 0.06785467118024827,
2617
+ -3.140952730178833,
2618
+ -0.029743434861302376,
2619
+ -3.141091251373291,
2620
+ 0.0
2621
+ ],
2622
+ "q99": [
2623
+ 0.6472805738449097,
2624
+ 0.20846802592277527,
2625
+ 0.36855655312538155,
2626
+ 3.1409926891326903,
2627
+ 0.11424950212240226,
2628
+ 3.1410969257354737,
2629
+ 1.0
2630
+ ],
2631
+ "mask": [
2632
+ true,
2633
+ true,
2634
+ true,
2635
+ true,
2636
+ true,
2637
+ true,
2638
+ false
2639
+ ]
2640
+ },
2641
+ "proprio": {
2642
+ "mean": [
2643
+ 0.0,
2644
+ 0.0,
2645
+ 0.0,
2646
+ 0.0,
2647
+ 0.0,
2648
+ 0.0,
2649
+ 0.0
2650
+ ],
2651
+ "std": [
2652
+ 0.0,
2653
+ 0.0,
2654
+ 0.0,
2655
+ 0.0,
2656
+ 0.0,
2657
+ 0.0,
2658
+ 0.0
2659
+ ],
2660
+ "max": [
2661
+ 0.0,
2662
+ 0.0,
2663
+ 0.0,
2664
+ 0.0,
2665
+ 0.0,
2666
+ 0.0,
2667
+ 0.0
2668
+ ],
2669
+ "min": [
2670
+ 0.0,
2671
+ 0.0,
2672
+ 0.0,
2673
+ 0.0,
2674
+ 0.0,
2675
+ 0.0,
2676
+ 0.0
2677
+ ],
2678
+ "q01": [
2679
+ 0.0,
2680
+ 0.0,
2681
+ 0.0,
2682
+ 0.0,
2683
+ 0.0,
2684
+ 0.0,
2685
+ 0.0
2686
+ ],
2687
+ "q99": [
2688
+ 0.0,
2689
+ 0.0,
2690
+ 0.0,
2691
+ 0.0,
2692
+ 0.0,
2693
+ 0.0,
2694
+ 0.0
2695
+ ]
2696
+ },
2697
+ "num_transitions": 146241,
2698
+ "num_trajectories": 631
2699
+ },
2700
+ "utaustin_mutex/0.1.0": {
2701
+ "action": {
2702
+ "mean": [
2703
+ 0.06176406517624855,
2704
+ -0.005005490034818649,
2705
+ 0.10216782987117767,
2706
+ -0.03314131125807762,
2707
+ 0.013895022682845592,
2708
+ -0.011317633092403412,
2709
+ 0.5038976669311523
2710
+ ],
2711
+ "std": [
2712
+ 0.187501460313797,
2713
+ 0.4468473196029663,
2714
+ 0.3792876601219177,
2715
+ 0.14097853004932404,
2716
+ 0.06453699618577957,
2717
+ 0.11765265464782715,
2718
+ 0.501045286655426
2719
+ ],
2720
+ "max": [
2721
+ 1.0,
2722
+ 1.0,
2723
+ 1.0,
2724
+ 0.375,
2725
+ 0.375,
2726
+ 0.375,
2727
+ 1.0
2728
+ ],
2729
+ "min": [
2730
+ -1.0,
2731
+ -1.0,
2732
+ -1.0,
2733
+ -0.375,
2734
+ -0.375,
2735
+ -0.375,
2736
+ 0.0
2737
+ ],
2738
+ "q01": [
2739
+ -0.4285714328289032,
2740
+ -0.9800000190734863,
2741
+ -0.5571428537368774,
2742
+ -0.375,
2743
+ -0.15642857551574707,
2744
+ -0.335357129573822,
2745
+ 0.0
2746
+ ],
2747
+ "q99": [
2748
+ 0.5914285778999329,
2749
+ 0.9714285731315613,
2750
+ 1.0,
2751
+ 0.3278571367263794,
2752
+ 0.207857146859169,
2753
+ 0.25607141852378845,
2754
+ 1.0
2755
+ ],
2756
+ "mask": [
2757
+ true,
2758
+ true,
2759
+ true,
2760
+ true,
2761
+ true,
2762
+ true,
2763
+ false
2764
+ ]
2765
+ },
2766
+ "proprio": {
2767
+ "mean": [
2768
+ 0.0,
2769
+ 0.0,
2770
+ 0.0,
2771
+ 0.0,
2772
+ 0.0,
2773
+ 0.0,
2774
+ 0.0
2775
+ ],
2776
+ "std": [
2777
+ 0.0,
2778
+ 0.0,
2779
+ 0.0,
2780
+ 0.0,
2781
+ 0.0,
2782
+ 0.0,
2783
+ 0.0
2784
+ ],
2785
+ "max": [
2786
+ 0.0,
2787
+ 0.0,
2788
+ 0.0,
2789
+ 0.0,
2790
+ 0.0,
2791
+ 0.0,
2792
+ 0.0
2793
+ ],
2794
+ "min": [
2795
+ 0.0,
2796
+ 0.0,
2797
+ 0.0,
2798
+ 0.0,
2799
+ 0.0,
2800
+ 0.0,
2801
+ 0.0
2802
+ ],
2803
+ "q01": [
2804
+ 0.0,
2805
+ 0.0,
2806
+ 0.0,
2807
+ 0.0,
2808
+ 0.0,
2809
+ 0.0,
2810
+ 0.0
2811
+ ],
2812
+ "q99": [
2813
+ 0.0,
2814
+ 0.0,
2815
+ 0.0,
2816
+ 0.0,
2817
+ 0.0,
2818
+ 0.0,
2819
+ 0.0
2820
+ ]
2821
+ },
2822
+ "num_transitions": 361883,
2823
+ "num_trajectories": 1500
2824
+ },
2825
+ "berkeley_fanuc_manipulation/0.1.0": {
2826
+ "action": {
2827
+ "mean": [
2828
+ 0.0007744057802483439,
2829
+ -0.00031240080716088414,
2830
+ -0.0015001941937953234,
2831
+ -0.0007515158504247665,
2832
+ -0.00015832878125365824,
2833
+ 0.00014327642566058785,
2834
+ 0.699295699596405
2835
+ ],
2836
+ "std": [
2837
+ 0.0034070133697241545,
2838
+ 0.00499219074845314,
2839
+ 0.005344326142221689,
2840
+ 0.007599010597914457,
2841
+ 0.004081932827830315,
2842
+ 0.008568963967263699,
2843
+ 0.45868709683418274
2844
+ ],
2845
+ "max": [
2846
+ 0.009999999776482582,
2847
+ 0.009999999776482582,
2848
+ 0.009999999776482582,
2849
+ 0.03490658476948738,
2850
+ 0.03490658476948738,
2851
+ 0.03490658476948738,
2852
+ 1.0
2853
+ ],
2854
+ "min": [
2855
+ -0.009999999776482582,
2856
+ -0.009999999776482582,
2857
+ -0.009999999776482582,
2858
+ -0.03490658476948738,
2859
+ -0.03490658476948738,
2860
+ -0.03490658476948738,
2861
+ 0.0
2862
+ ],
2863
+ "q01": [
2864
+ -0.009999999776482582,
2865
+ -0.009999999776482582,
2866
+ -0.009999999776482582,
2867
+ -0.03490658476948738,
2868
+ 0.0,
2869
+ -0.03490658476948738,
2870
+ 0.0
2871
+ ],
2872
+ "q99": [
2873
+ 0.009999999776482582,
2874
+ 0.009999999776482582,
2875
+ 0.009999999776482582,
2876
+ 0.03490658476948738,
2877
+ 0.0,
2878
+ 0.03490658476948738,
2879
+ 1.0
2880
+ ],
2881
+ "mask": [
2882
+ true,
2883
+ true,
2884
+ true,
2885
+ true,
2886
+ true,
2887
+ true,
2888
+ false
2889
+ ]
2890
+ },
2891
+ "proprio": {
2892
+ "mean": [
2893
+ 0.0,
2894
+ 0.0,
2895
+ 0.0,
2896
+ 0.0,
2897
+ 0.0,
2898
+ 0.0,
2899
+ 0.0
2900
+ ],
2901
+ "std": [
2902
+ 0.0,
2903
+ 0.0,
2904
+ 0.0,
2905
+ 0.0,
2906
+ 0.0,
2907
+ 0.0,
2908
+ 0.0
2909
+ ],
2910
+ "max": [
2911
+ 0.0,
2912
+ 0.0,
2913
+ 0.0,
2914
+ 0.0,
2915
+ 0.0,
2916
+ 0.0,
2917
+ 0.0
2918
+ ],
2919
+ "min": [
2920
+ 0.0,
2921
+ 0.0,
2922
+ 0.0,
2923
+ 0.0,
2924
+ 0.0,
2925
+ 0.0,
2926
+ 0.0
2927
+ ],
2928
+ "q01": [
2929
+ 0.0,
2930
+ 0.0,
2931
+ 0.0,
2932
+ 0.0,
2933
+ 0.0,
2934
+ 0.0,
2935
+ 0.0
2936
+ ],
2937
+ "q99": [
2938
+ 0.0,
2939
+ 0.0,
2940
+ 0.0,
2941
+ 0.0,
2942
+ 0.0,
2943
+ 0.0,
2944
+ 0.0
2945
+ ]
2946
+ },
2947
+ "num_transitions": 62613,
2948
+ "num_trajectories": 415
2949
+ },
2950
+ "cmu_stretch/0.1.0": {
2951
+ "action": {
2952
+ "mean": [
2953
+ 0.0003630445571616292,
2954
+ 0.0,
2955
+ 0.0016466928645968437,
2956
+ 0.0,
2957
+ 0.0,
2958
+ 0.0,
2959
+ 0.3987048268318176
2960
+ ],
2961
+ "std": [
2962
+ 0.004081855062395334,
2963
+ 0.0,
2964
+ 0.003774340031668544,
2965
+ 0.0,
2966
+ 0.0,
2967
+ 0.0,
2968
+ 0.489638090133667
2969
+ ],
2970
+ "max": [
2971
+ 0.02338407188653946,
2972
+ 0.0,
2973
+ 0.023404927924275398,
2974
+ 0.0,
2975
+ 0.0,
2976
+ 0.0,
2977
+ 1.0
2978
+ ],
2979
+ "min": [
2980
+ -0.019353797659277916,
2981
+ 0.0,
2982
+ -0.02019215188920498,
2983
+ 0.0,
2984
+ 0.0,
2985
+ 0.0,
2986
+ 0.0
2987
+ ],
2988
+ "q01": [
2989
+ -0.011175686959177256,
2990
+ 0.0,
2991
+ -0.0032206363626755773,
2992
+ 0.0,
2993
+ 0.0,
2994
+ 0.0,
2995
+ 0.0
2996
+ ],
2997
+ "q99": [
2998
+ 0.014501785952597848,
2999
+ 0.0,
3000
+ 0.015056106168776728,
3001
+ 0.0,
3002
+ 0.0,
3003
+ 0.0,
3004
+ 1.0
3005
+ ],
3006
+ "mask": [
3007
+ true,
3008
+ true,
3009
+ true,
3010
+ true,
3011
+ true,
3012
+ true,
3013
+ false
3014
+ ]
3015
+ },
3016
+ "proprio": {
3017
+ "mean": [
3018
+ 0.0,
3019
+ 0.0,
3020
+ 0.0,
3021
+ 0.0,
3022
+ 0.0,
3023
+ 0.0,
3024
+ 0.0
3025
+ ],
3026
+ "std": [
3027
+ 0.0,
3028
+ 0.0,
3029
+ 0.0,
3030
+ 0.0,
3031
+ 0.0,
3032
+ 0.0,
3033
+ 0.0
3034
+ ],
3035
+ "max": [
3036
+ 0.0,
3037
+ 0.0,
3038
+ 0.0,
3039
+ 0.0,
3040
+ 0.0,
3041
+ 0.0,
3042
+ 0.0
3043
+ ],
3044
+ "min": [
3045
+ 0.0,
3046
+ 0.0,
3047
+ 0.0,
3048
+ 0.0,
3049
+ 0.0,
3050
+ 0.0,
3051
+ 0.0
3052
+ ],
3053
+ "q01": [
3054
+ 0.0,
3055
+ 0.0,
3056
+ 0.0,
3057
+ 0.0,
3058
+ 0.0,
3059
+ 0.0,
3060
+ 0.0
3061
+ ],
3062
+ "q99": [
3063
+ 0.0,
3064
+ 0.0,
3065
+ 0.0,
3066
+ 0.0,
3067
+ 0.0,
3068
+ 0.0,
3069
+ 0.0
3070
+ ]
3071
+ },
3072
+ "num_transitions": 25016,
3073
+ "num_trajectories": 135
3074
+ },
3075
+ "bc_z/0.1.0": {
3076
+ "action": {
3077
+ "mean": [
3078
+ -0.009958645328879356,
3079
+ 0.0008958434336818755,
3080
+ 0.00499522453173995,
3081
+ 0.000297540333122015,
3082
+ -0.008734511211514473,
3083
+ -0.03068969026207924,
3084
+ 0.8344562649726868
3085
+ ],
3086
+ "std": [
3087
+ 0.030533093959093094,
3088
+ 0.0231416504830122,
3089
+ 0.020642085000872612,
3090
+ 0.04156165570020676,
3091
+ 0.04643021523952484,
3092
+ 0.07697845250368118,
3093
+ 0.36111101508140564
3094
+ ],
3095
+ "max": [
3096
+ 0.2165454924106598,
3097
+ 0.1251407265663147,
3098
+ 0.10772687941789627,
3099
+ 0.33544227480888367,
3100
+ 0.28117990493774414,
3101
+ 0.40614867210388184,
3102
+ 1.0
3103
+ ],
3104
+ "min": [
3105
+ -0.1677047461271286,
3106
+ -0.14630407094955444,
3107
+ -0.10066790133714676,
3108
+ -0.29421567916870117,
3109
+ -0.32101404666900635,
3110
+ -0.4635624885559082,
3111
+ 0.0
3112
+ ],
3113
+ "q01": [
3114
+ -0.09220654994249344,
3115
+ -0.06456145539879798,
3116
+ -0.049121275544166565,
3117
+ -0.11594625547528267,
3118
+ -0.14152548640966414,
3119
+ -0.2251061636209488,
3120
+ 0.0
3121
+ ],
3122
+ "q99": [
3123
+ 0.07628866866230968,
3124
+ 0.058019736707210584,
3125
+ 0.052540797740221024,
3126
+ 0.11740604028105736,
3127
+ 0.11703975558280955,
3128
+ 0.16729306846857078,
3129
+ 1.0
3130
+ ],
3131
+ "mask": [
3132
+ true,
3133
+ true,
3134
+ true,
3135
+ true,
3136
+ true,
3137
+ true,
3138
+ false
3139
+ ]
3140
+ },
3141
+ "proprio": {
3142
+ "mean": [
3143
+ 0.0,
3144
+ 0.0,
3145
+ 0.0,
3146
+ 0.0,
3147
+ 0.0,
3148
+ 0.0,
3149
+ 0.0
3150
+ ],
3151
+ "std": [
3152
+ 0.0,
3153
+ 0.0,
3154
+ 0.0,
3155
+ 0.0,
3156
+ 0.0,
3157
+ 0.0,
3158
+ 0.0
3159
+ ],
3160
+ "max": [
3161
+ 0.0,
3162
+ 0.0,
3163
+ 0.0,
3164
+ 0.0,
3165
+ 0.0,
3166
+ 0.0,
3167
+ 0.0
3168
+ ],
3169
+ "min": [
3170
+ 0.0,
3171
+ 0.0,
3172
+ 0.0,
3173
+ 0.0,
3174
+ 0.0,
3175
+ 0.0,
3176
+ 0.0
3177
+ ],
3178
+ "q01": [
3179
+ 0.0,
3180
+ 0.0,
3181
+ 0.0,
3182
+ 0.0,
3183
+ 0.0,
3184
+ 0.0,
3185
+ 0.0
3186
+ ],
3187
+ "q99": [
3188
+ 0.0,
3189
+ 0.0,
3190
+ 0.0,
3191
+ 0.0,
3192
+ 0.0,
3193
+ 0.0,
3194
+ 0.0
3195
+ ]
3196
+ },
3197
+ "num_transitions": 6015535,
3198
+ "num_trajectories": 43264
3199
+ },
3200
+ "fmb_dataset/1.0.0": {
3201
+ "action": {
3202
+ "mean": [
3203
+ 0.05902976542711258,
3204
+ -0.06476633995771408,
3205
+ -0.09787469357252121,
3206
+ 0.004325387068092823,
3207
+ 0.00028963759541511536,
3208
+ -0.04457257315516472,
3209
+ 0.7336440086364746
3210
+ ],
3211
+ "std": [
3212
+ 0.28809186816215515,
3213
+ 0.2820416986942291,
3214
+ 0.4626740515232086,
3215
+ 0.3266514539718628,
3216
+ 0.10842999070882797,
3217
+ 0.34400978684425354,
3218
+ 0.4435289800167084
3219
+ ],
3220
+ "max": [
3221
+ 1.399999976158142,
3222
+ 1.0,
3223
+ 1.399999976158142,
3224
+ 1.0,
3225
+ 1.0,
3226
+ 1.0,
3227
+ 1.0
3228
+ ],
3229
+ "min": [
3230
+ -1.399999976158142,
3231
+ -1.399999976158142,
3232
+ -1.0,
3233
+ -1.0,
3234
+ -1.0,
3235
+ -1.0,
3236
+ 0.0
3237
+ ],
3238
+ "q01": [
3239
+ -0.8257142901420593,
3240
+ -1.399999976158142,
3241
+ -1.0,
3242
+ -1.0,
3243
+ -0.3028571307659149,
3244
+ -1.0,
3245
+ 0.0
3246
+ ],
3247
+ "q99": [
3248
+ 1.0,
3249
+ 0.5257142782211304,
3250
+ 1.0,
3251
+ 1.0,
3252
+ 0.3400000035762787,
3253
+ 1.0,
3254
+ 1.0
3255
+ ],
3256
+ "mask": [
3257
+ true,
3258
+ true,
3259
+ true,
3260
+ true,
3261
+ true,
3262
+ true,
3263
+ false
3264
+ ]
3265
+ },
3266
+ "proprio": {
3267
+ "mean": [
3268
+ 0.0,
3269
+ 0.0,
3270
+ 0.0,
3271
+ 0.0,
3272
+ 0.0,
3273
+ 0.0,
3274
+ 0.0
3275
+ ],
3276
+ "std": [
3277
+ 0.0,
3278
+ 0.0,
3279
+ 0.0,
3280
+ 0.0,
3281
+ 0.0,
3282
+ 0.0,
3283
+ 0.0
3284
+ ],
3285
+ "max": [
3286
+ 0.0,
3287
+ 0.0,
3288
+ 0.0,
3289
+ 0.0,
3290
+ 0.0,
3291
+ 0.0,
3292
+ 0.0
3293
+ ],
3294
+ "min": [
3295
+ 0.0,
3296
+ 0.0,
3297
+ 0.0,
3298
+ 0.0,
3299
+ 0.0,
3300
+ 0.0,
3301
+ 0.0
3302
+ ],
3303
+ "q01": [
3304
+ 0.0,
3305
+ 0.0,
3306
+ 0.0,
3307
+ 0.0,
3308
+ 0.0,
3309
+ 0.0,
3310
+ 0.0
3311
+ ],
3312
+ "q99": [
3313
+ 0.0,
3314
+ 0.0,
3315
+ 0.0,
3316
+ 0.0,
3317
+ 0.0,
3318
+ 0.0,
3319
+ 0.0
3320
+ ]
3321
+ },
3322
+ "num_transitions": 1137459,
3323
+ "num_trajectories": 8612
3324
+ },
3325
+ "dobbe/0.0.1": {
3326
+ "action": {
3327
+ "mean": [
3328
+ -0.00011206958151888102,
3329
+ 0.0011229681549593806,
3330
+ -0.00010193959315074608,
3331
+ -7.37128357286565e-05,
3332
+ -0.0006753374473191798,
3333
+ -5.664441778208129e-05,
3334
+ 0.6318688988685608
3335
+ ],
3336
+ "std": [
3337
+ 0.042660679668188095,
3338
+ 0.04428431764245033,
3339
+ 0.12224890291690826,
3340
+ 0.005388470832258463,
3341
+ 0.011246936395764351,
3342
+ 0.006288259290158749,
3343
+ 0.3973240256309509
3344
+ ],
3345
+ "max": [
3346
+ 38.590423583984375,
3347
+ 17.932697296142578,
3348
+ 4.843764305114746,
3349
+ 1.4372116327285767,
3350
+ 0.4340403974056244,
3351
+ 1.2057193517684937,
3352
+ 0.9998947381973267
3353
+ ],
3354
+ "min": [
3355
+ -5.700923442840576,
3356
+ -21.605947494506836,
3357
+ -123.72489929199219,
3358
+ -1.7229845523834229,
3359
+ -0.4998578727245331,
3360
+ -0.8867913484573364,
3361
+ 1.4196479014572105e-06
3362
+ ],
3363
+ "q01": [
3364
+ -0.01119564864784479,
3365
+ -0.014266146533191203,
3366
+ -0.0071747214533388615,
3367
+ -0.009444301575422287,
3368
+ -0.03990109823644161,
3369
+ -0.017422311007976532,
3370
+ 4.003279136668425e-05
3371
+ ],
3372
+ "q99": [
3373
+ 0.01015154086053368,
3374
+ 0.017181577533483497,
3375
+ 0.007216989761218411,
3376
+ 0.010380979906767595,
3377
+ 0.03556173853576176,
3378
+ 0.018032474815845446,
3379
+ 0.9982578039169312
3380
+ ],
3381
+ "mask": [
3382
+ true,
3383
+ true,
3384
+ true,
3385
+ true,
3386
+ true,
3387
+ true,
3388
+ false
3389
+ ]
3390
+ },
3391
+ "proprio": {
3392
+ "mean": [
3393
+ 0.0,
3394
+ 0.0,
3395
+ 0.0,
3396
+ 0.0,
3397
+ 0.0,
3398
+ 0.0,
3399
+ 0.0
3400
+ ],
3401
+ "std": [
3402
+ 0.0,
3403
+ 0.0,
3404
+ 0.0,
3405
+ 0.0,
3406
+ 0.0,
3407
+ 0.0,
3408
+ 0.0
3409
+ ],
3410
+ "max": [
3411
+ 0.0,
3412
+ 0.0,
3413
+ 0.0,
3414
+ 0.0,
3415
+ 0.0,
3416
+ 0.0,
3417
+ 0.0
3418
+ ],
3419
+ "min": [
3420
+ 0.0,
3421
+ 0.0,
3422
+ 0.0,
3423
+ 0.0,
3424
+ 0.0,
3425
+ 0.0,
3426
+ 0.0
3427
+ ],
3428
+ "q01": [
3429
+ 0.0,
3430
+ 0.0,
3431
+ 0.0,
3432
+ 0.0,
3433
+ 0.0,
3434
+ 0.0,
3435
+ 0.0
3436
+ ],
3437
+ "q99": [
3438
+ 0.0,
3439
+ 0.0,
3440
+ 0.0,
3441
+ 0.0,
3442
+ 0.0,
3443
+ 0.0,
3444
+ 0.0
3445
+ ]
3446
+ },
3447
+ "num_transitions": 1139911,
3448
+ "num_trajectories": 5208
3449
+ },
3450
+ "droid/1.0.0": {
3451
+ "action": {
3452
+ "mean": [
3453
+ 0.027425529435276985,
3454
+ -0.0026820411439985037,
3455
+ 0.01595238223671913,
3456
+ 0.0035501928068697453,
3457
+ -0.030532635748386383,
3458
+ -0.006685464642941952,
3459
+ 0.5860344171524048
3460
+ ],
3461
+ "std": [
3462
+ 0.25387412309646606,
3463
+ 0.18426834046840668,
3464
+ 0.22532416880130768,
3465
+ 0.21757009625434875,
3466
+ 0.22572560608386993,
3467
+ 0.2867794930934906,
3468
+ 0.4287726879119873
3469
+ ],
3470
+ "max": [
3471
+ 0.9999998211860657,
3472
+ 0.999991774559021,
3473
+ 0.9999973177909851,
3474
+ 0.9999874830245972,
3475
+ 0.9999954104423523,
3476
+ 0.9999998807907104,
3477
+ 1.0
3478
+ ],
3479
+ "min": [
3480
+ -0.9999999403953552,
3481
+ -0.9999951124191284,
3482
+ -0.9999960660934448,
3483
+ -0.9999980330467224,
3484
+ -0.9999982118606567,
3485
+ -0.9999998807907104,
3486
+ 0.0
3487
+ ],
3488
+ "q01": [
3489
+ -0.7776297926902771,
3490
+ -0.5803514122962952,
3491
+ -0.5795090794563293,
3492
+ -0.6464047729969025,
3493
+ -0.7041108310222626,
3494
+ -0.8895104378461838,
3495
+ 0.0
3496
+ ],
3497
+ "q99": [
3498
+ 0.7597932070493698,
3499
+ 0.5726242214441299,
3500
+ 0.7351000607013702,
3501
+ 0.6705610305070877,
3502
+ 0.6464948207139969,
3503
+ 0.8897542208433151,
3504
+ 1.0
3505
+ ],
3506
+ "mask": [
3507
+ true,
3508
+ true,
3509
+ true,
3510
+ true,
3511
+ true,
3512
+ true,
3513
+ false
3514
+ ]
3515
+ },
3516
+ "proprio": {
3517
+ "mean": [
3518
+ 0.0,
3519
+ 0.0,
3520
+ 0.0,
3521
+ 0.0,
3522
+ 0.0,
3523
+ 0.0,
3524
+ 0.0
3525
+ ],
3526
+ "std": [
3527
+ 0.0,
3528
+ 0.0,
3529
+ 0.0,
3530
+ 0.0,
3531
+ 0.0,
3532
+ 0.0,
3533
+ 0.0
3534
+ ],
3535
+ "max": [
3536
+ 0.0,
3537
+ 0.0,
3538
+ 0.0,
3539
+ 0.0,
3540
+ 0.0,
3541
+ 0.0,
3542
+ 0.0
3543
+ ],
3544
+ "min": [
3545
+ 0.0,
3546
+ 0.0,
3547
+ 0.0,
3548
+ 0.0,
3549
+ 0.0,
3550
+ 0.0,
3551
+ 0.0
3552
+ ],
3553
+ "q01": [
3554
+ 0.0,
3555
+ 0.0,
3556
+ 0.0,
3557
+ 0.0,
3558
+ 0.0,
3559
+ 0.0,
3560
+ 0.0
3561
+ ],
3562
+ "q99": [
3563
+ 0.0,
3564
+ 0.0,
3565
+ 0.0,
3566
+ 0.0,
3567
+ 0.0,
3568
+ 0.0,
3569
+ 0.0
3570
+ ]
3571
+ },
3572
+ "num_transitions": 27044326,
3573
+ "num_trajectories": 92233
3574
+ },
3575
+ "rh20t_rlds/1.0.0": {
3576
+ "action": {
3577
+ "mean": [
3578
+ -5.332157638779582e+28,
3579
+ -1.5128827327837974e+29,
3580
+ -1.832736619079747e+28,
3581
+ 0.5735913515090942,
3582
+ -0.00847744569182396,
3583
+ -0.5566052198410034,
3584
+ 0.3186892569065094
3585
+ ],
3586
+ "std": [
3587
+ Infinity,
3588
+ Infinity,
3589
+ Infinity,
3590
+ 2.2581026554107666,
3591
+ 0.1548534482717514,
3592
+ 2.2581026554107666,
3593
+ 0.39917993545532227
3594
+ ],
3595
+ "max": [
3596
+ 7.582831568163597e+35,
3597
+ 7.557172735451728e+35,
3598
+ 2.2717764477020827e+27,
3599
+ 3.1415927410125732,
3600
+ 1.5116956233978271,
3601
+ 3.1415927410125732,
3602
+ 1.0
3603
+ ],
3604
+ "min": [
3605
+ -3.5543094244408723e+36,
3606
+ -8.723098019507117e+36,
3607
+ -9.648338287048974e+35,
3608
+ -3.1415927410125732,
3609
+ -1.5062522888183594,
3610
+ -3.1415927410125732,
3611
+ 0.0
3612
+ ],
3613
+ "q01": [
3614
+ 0.36028257966041566,
3615
+ -0.272584410905838,
3616
+ 0.005985925104469062,
3617
+ -3.1411514282226562,
3618
+ -0.5925320792198181,
3619
+ -3.1415159702301025,
3620
+ 0.0
3621
+ ],
3622
+ "q99": [
3623
+ 0.7534684538841248,
3624
+ 0.31738221645355225,
3625
+ 0.33061375379562374,
3626
+ 3.141425132751465,
3627
+ 0.47507260441780086,
3628
+ 3.141479730606079,
3629
+ 1.0
3630
+ ],
3631
+ "mask": [
3632
+ true,
3633
+ true,
3634
+ true,
3635
+ true,
3636
+ true,
3637
+ true,
3638
+ false
3639
+ ]
3640
+ },
3641
+ "proprio": {
3642
+ "mean": [
3643
+ 0.0,
3644
+ 0.0,
3645
+ 0.0,
3646
+ 0.0,
3647
+ 0.0,
3648
+ 0.0,
3649
+ 0.0
3650
+ ],
3651
+ "std": [
3652
+ 0.0,
3653
+ 0.0,
3654
+ 0.0,
3655
+ 0.0,
3656
+ 0.0,
3657
+ 0.0,
3658
+ 0.0
3659
+ ],
3660
+ "max": [
3661
+ 0.0,
3662
+ 0.0,
3663
+ 0.0,
3664
+ 0.0,
3665
+ 0.0,
3666
+ 0.0,
3667
+ 0.0
3668
+ ],
3669
+ "min": [
3670
+ 0.0,
3671
+ 0.0,
3672
+ 0.0,
3673
+ 0.0,
3674
+ 0.0,
3675
+ 0.0,
3676
+ 0.0
3677
+ ],
3678
+ "q01": [
3679
+ 0.0,
3680
+ 0.0,
3681
+ 0.0,
3682
+ 0.0,
3683
+ 0.0,
3684
+ 0.0,
3685
+ 0.0
3686
+ ],
3687
+ "q99": [
3688
+ 0.0,
3689
+ 0.0,
3690
+ 0.0,
3691
+ 0.0,
3692
+ 0.0,
3693
+ 0.0,
3694
+ 0.0
3695
+ ]
3696
+ },
3697
+ "num_transitions": 52644433,
3698
+ "num_trajectories": 104392
3699
+ }
3700
+ }
3701
+ }
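These per-dataset statistics follow the Open X-Embodiment-style normalization convention: action dimensions whose "mask" entry is true are rescaled into [-1, 1] with the 1st/99th percentiles ("q01"/"q99"), while the gripper dimension (mask false) is passed through; the processor's unnorm_key argument selects which dataset's entry to apply. A minimal un-normalization sketch under that assumption (the helper name unnormalize_action is illustrative, not part of the released API):

    import numpy as np

    def unnormalize_action(norm_action, stats):
        # norm_action: (..., 7) array in [-1, 1]; stats: one "action" entry of the
        # statistics above, e.g. statistics["bridge_orig/1.0.0"]["action"].
        q01 = np.asarray(stats["q01"], dtype=np.float32)
        q99 = np.asarray(stats["q99"], dtype=np.float32)
        mask = np.asarray(stats["mask"], dtype=bool)
        norm_action = np.asarray(norm_action, dtype=np.float32)
        # Affine map from [-1, 1] back to [q01, q99] for masked dims; pass the rest through.
        unnorm = 0.5 * (norm_action + 1.0) * (q99 - q01) + q01
        return np.where(mask, unnorm, norm_action)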
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<image>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "bos_token": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "eos_token": {
19
+ "content": "<eos>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<pad>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
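special_tokens_map.json registers "<image>" as an additional special token alongside the standard "<bos>", "<eos>", "<pad>", and "<unk>" entries. A quick sanity check that these resolve to single token ids, assuming the repository has been downloaded locally (the checkpoint path below is a placeholder):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./checkpoint_dir", trust_remote_code=True)
    for t in ["<image>", "<bos>", "<eos>", "<pad>", "<unk>"]:
        # Each declared special token should map to one id rather than being split into pieces.
        print(t, tok.convert_tokens_to_ids(t))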
test_huggingface.py ADDED
@@ -0,0 +1,35 @@
1
+ import os
2
+ import argparse
3
+ from pathlib import Path
4
+ import shutil
9
+ import torch
10
+ from PIL import Image
11
+ from transformers import AutoModel, AutoProcessor
12
+
13
+ parser = argparse.ArgumentParser("Huggingface AutoModel Testing")
14
+ parser.add_argument("--model_name_or_path", default="", help="pretrained model name or path.")
15
+ parser.add_argument("--num_images", type=int, default=1, help="num_images for testing.")
16
+
17
+ args = parser.parse_args()
18
+ if __name__ == "__main__":
19
+ model_name_or_path = Path(args.model_name_or_path)
20
+ processor = AutoProcessor.from_pretrained(args.model_name_or_path, trust_remote_code=True)
21
+ print(processor.statistics)
22
+
23
+ model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16).eval().cuda()
24
+
25
+ image = Image.open("example.png").convert("RGB")
26
+ images = [image] * args.num_images
27
+ prompt = "What action should the robot take to pick the cpu?"
28
+ inputs = processor(images=images, text=prompt, unnorm_key="bridge_orig/1.0.0", return_tensors="pt")
29
+ print(inputs)
30
+
31
+ generation_outputs = model.predict_action(inputs)
32
+ print(generation_outputs, processor.batch_decode(generation_outputs))
33
+
34
+ actions = processor.decode_actions(generation_outputs, unnorm_key="bridge_orig/1.0.0")
35
+ print(actions)
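A hypothetical invocation of the test script above, assuming the repository has been cloned locally: python test_huggingface.py --model_name_or_path ./checkpoint_dir --num_images 1 (the checkpoint path is a placeholder). Note that the script expects an example.png in the working directory and a CUDA device, since the model is moved to the GPU before predict_action is called.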
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2523a63c898ebf0a32c7282a2e459ef2c950a846c5f3172305089e4149b6b6c3
3
+ size 36157680
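tokenizer.json is committed as a Git LFS pointer (the version/oid/size triplet above) rather than as inline JSON; the roughly 36 MB payload is fetched by git lfs pull or by the Hub's standard download tooling.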
tokenizer_config.json ADDED
The diff for this file is too large to render.