add initial files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +71 -0
config.json +124 -0
config.yaml +322 -0
model.safetensors +3 -0
replay.mp4 +3 -0
train_config.json +282 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+replay.mp4 filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+---
+library_name: lerobot
+tags:
+- model_hub_mixin
+- pytorch_model_hub_mixin
+- robotics
+- dot
+license: apache-2.0
+datasets:
+- lerobot/aloha_sim_insertion_human
+pipeline_tag: robotics
+---
+# Model Card for "Decoder Only Transformer (DOT) Policy" for the ALOHA bimanual insert problem
+Read more about the model and implementation details in the [DOT Policy repository](https://github.com/IliaLarchenko/dot_policy).
+This model is trained using the [LeRobot library](https://huggingface.co/lerobot) and achieves state-of-the-art results on behavior cloning on the ALOHA bimanual insert dataset. It achieves a 29.6% success rate vs. 21% for the previous state-of-the-art model (ACT).
+This result is achieved without checkpoint selection and is easy to reproduce.
+You can use this model by installing LeRobot from [this branch](https://github.com/IliaLarchenko/lerobot/tree/dot_new_config).
+To train the model:
+```bash
+python lerobot/scripts/train.py \
+    --policy.type=dot \
+    --dataset.repo_id=lerobot/aloha_sim_insertion_human \
+    --env.type=aloha \
+    --env.task=AlohaInsertion-v0 \
+    --env.episode_length=500 \
+    --output_dir=outputs/train/pusht_aloha_insert \
+    --batch_size=24  \
+    --log_freq=1000 \
+    --eval_freq=10000 \
+    --save_freq=10000 \
+    --offline.steps=100000 \
+    --seed=100000 \
+    --wandb.enable=true \
+    --num_workers=24 \
+    --use_amp=true \
+    --device=cuda \
+    --policy.optimizer_lr=0.00003 \
+    --policy.optimizer_min_lr=0.00001 \
+    --policy.optimizer_lr_cycle_steps=100000 \
+    --policy.train_horizon=150 \
+    --policy.inference_horizon=100 \
+    --policy.lookback_obs_steps=30 \
+    --policy.lookback_aug=5 \
+    --policy.rescale_shape="[480,640]" \
+    --policy.alpha=0.98 \
+    --policy.train_alpha=0.99
+```
+To evaluate the model:
+```bash
+python lerobot/scripts/eval.py \
+    --policy.path=IliaLarchenko/dot_bimanual_insert \
+    --env.type=aloha \
+    --env.task=AlohaInsertion-v0 \
+    --env.episode_length=500 \
+    --eval.n_episodes=1000 \
+    --eval.batch_size=100 \
+    --seed=1000000
+```
+Model size:
+- Total parameters: 14.1M
+- Trainable parameters: 2.9M

config.json ADDED Viewed

	@@ -0,0 +1,124 @@

+{
+    "type": "dot",
+    "n_obs_steps": 3,
+    "normalization_mapping": {
+        "VISUAL": "MEAN_STD",
+        "STATE": "MIN_MAX",
+        "ENV": "MIN_MAX",
+        "ACTION": "MIN_MAX"
+    },
+    "input_features": {
+        "observation.images.top": {
+            "type": "VISUAL",
+            "shape": [
+                3,
+                480,
+                640
+            ]
+        },
+        "observation.state": {
+            "type": "STATE",
+            "shape": [
+                14
+            ]
+        }
+    },
+    "output_features": {
+        "action": {
+            "type": "ACTION",
+            "shape": [
+                14
+            ]
+        }
+    },
+    "train_horizon": 150,
+    "inference_horizon": 100,
+    "lookback_obs_steps": 30,
+    "lookback_aug": 5,
+    "override_dataset_stats": false,
+    "new_dataset_stats": {
+        "action": {
+            "max": [
+                512.0,
+                512.0
+            ],
+            "min": [
+                0.0,
+                0.0
+            ]
+        },
+        "observation.environment_state": {
+            "max": [
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0,
+                512.0
+            ],
+            "min": [
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0,
+                0.0
+            ]
+        },
+        "observation.state": {
+            "max": [
+                512.0,
+                512.0
+            ],
+            "min": [
+                0.0,
+                0.0
+            ]
+        }
+    },
+    "vision_backbone": "resnet18",
+    "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
+    "pre_norm": true,
+    "lora_rank": 20,
+    "merge_lora": false,
+    "dim_model": 128,
+    "n_heads": 8,
+    "dim_feedforward": 512,
+    "n_decoder_layers": 8,
+    "rescale_shape": [
+        480,
+        640
+    ],
+    "crop_scale": 1.0,
+    "state_noise": 0.01,
+    "noise_decay": 0.999995,
+    "dropout": 0.1,
+    "alpha": 0.98,
+    "train_alpha": 0.99,
+    "predict_every_n": 1,
+    "return_every_n": 1,
+    "optimizer_lr": 3e-05,
+    "optimizer_min_lr": 1e-05,
+    "optimizer_lr_cycle_steps": 100000,
+    "optimizer_weight_decay": 1e-05
+}

config.yaml ADDED Viewed

	@@ -0,0 +1,322 @@

+resume: false
+device: cuda
+use_amp: true
+seed: 100000
+dataset_repo_id: lerobot/aloha_sim_insertion_human
+video_backend: pyav
+training:
+  offline_steps: 100000
+  num_workers: 12
+  batch_size: 12
+  eval_freq: 10000
+  log_freq: 1000
+  save_checkpoint: true
+  save_freq: 10000
+  online_steps: 0
+  online_rollout_n_episodes: 1
+  online_rollout_batch_size: 1
+  online_steps_between_rollouts: 1
+  online_sampling_ratio: 0.5
+  online_env_seed: null
+  online_buffer_capacity: null
+  online_buffer_seed_size: 0
+  do_online_rollout_async: false
+  image_transforms:
+    enable: false
+    max_num_transforms: 3
+    random_order: false
+    brightness:
+      weight: 1
+      min_max:
+      - 0.8
+      - 1.2
+    contrast:
+      weight: 1
+      min_max:
+      - 0.8
+      - 1.2
+    saturation:
+      weight: 1
+      min_max:
+      - 0.5
+      - 1.5
+    hue:
+      weight: 1
+      min_max:
+      - -0.05
+      - 0.05
+    sharpness:
+      weight: 1
+      min_max:
+      - 0.8
+      - 1.2
+  save_model: true
+  grad_clip_norm: 50
+  lr: 3.0e-05
+  min_lr: 1.0e-05
+  lr_cycle_steps: 100000
+  weight_decay: 1.0e-05
+  delta_timestamps:
+    observation.images.top:
+    - -0.7
+    - -0.68
+    - -0.66
+    - -0.64
+    - -0.62
+    - -0.6
+    - -0.58
+    - -0.56
+    - -0.54
+    - -0.52
+    - -0.5
+    - -0.02
+    - 0.0
+    observation.state:
+    - -0.7
+    - -0.68
+    - -0.66
+    - -0.64
+    - -0.62
+    - -0.6
+    - -0.58
+    - -0.56
+    - -0.54
+    - -0.52
+    - -0.5
+    - -0.02
+    - 0.0
+    action:
+    - -0.7
+    - -0.68
+    - -0.66
+    - -0.64
+    - -0.62
+    - -0.6
+    - -0.58
+    - -0.56
+    - -0.54
+    - -0.52
+    - -0.5
+    - -0.02
+    - 0.0
+    - 0.02
+    - 0.04
+    - 0.06
+    - 0.08
+    - 0.1
+    - 0.12
+    - 0.14
+    - 0.16
+    - 0.18
+    - 0.2
+    - 0.22
+    - 0.24
+    - 0.26
+    - 0.28
+    - 0.3
+    - 0.32
+    - 0.34
+    - 0.36
+    - 0.38
+    - 0.4
+    - 0.42
+    - 0.44
+    - 0.46
+    - 0.48
+    - 0.5
+    - 0.52
+    - 0.54
+    - 0.56
+    - 0.58
+    - 0.6
+    - 0.62
+    - 0.64
+    - 0.66
+    - 0.68
+    - 0.7
+    - 0.72
+    - 0.74
+    - 0.76
+    - 0.78
+    - 0.8
+    - 0.82
+    - 0.84
+    - 0.86
+    - 0.88
+    - 0.9
+    - 0.92
+    - 0.94
+    - 0.96
+    - 0.98
+    - 1.0
+    - 1.02
+    - 1.04
+    - 1.06
+    - 1.08
+    - 1.1
+    - 1.12
+    - 1.14
+    - 1.16
+    - 1.18
+    - 1.2
+    - 1.22
+    - 1.24
+    - 1.26
+    - 1.28
+    - 1.3
+    - 1.32
+    - 1.34
+    - 1.36
+    - 1.38
+    - 1.4
+    - 1.42
+    - 1.44
+    - 1.46
+    - 1.48
+    - 1.5
+    - 1.52
+    - 1.54
+    - 1.56
+    - 1.58
+    - 1.6
+    - 1.62
+    - 1.64
+    - 1.66
+    - 1.68
+    - 1.7
+    - 1.72
+    - 1.74
+    - 1.76
+    - 1.78
+    - 1.8
+    - 1.82
+    - 1.84
+    - 1.86
+    - 1.88
+    - 1.9
+    - 1.92
+    - 1.94
+    - 1.96
+    - 1.98
+    - 2.0
+    - 2.02
+    - 2.04
+    - 2.06
+    - 2.08
+    - 2.1
+    - 2.12
+    - 2.14
+    - 2.16
+    - 2.18
+    - 2.2
+    - 2.22
+    - 2.24
+    - 2.26
+    - 2.28
+    - 2.3
+    - 2.32
+    - 2.34
+    - 2.36
+    - 2.38
+    - 2.4
+    - 2.42
+    - 2.44
+    - 2.46
+    - 2.48
+    - 2.5
+    - 2.52
+    - 2.54
+    - 2.56
+    - 2.58
+    - 2.6
+    - 2.62
+    - 2.64
+    - 2.66
+    - 2.68
+    - 2.7
+    - 2.72
+    - 2.74
+    - 2.76
+    - 2.78
+    - 2.8
+    - 2.82
+    - 2.84
+    - 2.86
+    - 2.88
+    - 2.9
+    - 2.92
+    - 2.94
+    - 2.96
+    - 2.98
+eval:
+  n_episodes: 50
+  batch_size: 10
+  use_async_envs: false
+wandb:
+  enable: true
+  disable_artifact: false
+  project: insert
+  notes: ''
+fps: 50
+env:
+  name: aloha
+  task: AlohaInsertion-v0
+  state_dim: 14
+  action_dim: 14
+  fps: ${fps}
+  episode_length: 500
+  gym:
+    obs_type: pixels_agent_pos
+    render_mode: rgb_array
+override_dataset_stats:
+  observation.images.top:
+    mean:
+    - - - 0.485
+    - - - 0.456
+    - - - 0.406
+    std:
+    - - - 0.229
+    - - - 0.224
+    - - - 0.225
+policy:
+  name: dot
+  n_obs_steps: 3
+  train_horizon: 150
+  inference_horizon: 100
+  lookback_obs_steps: 30
+  lookback_aug: 5
+  input_shapes:
+    observation.images.top:
+    - 3
+    - 480
+    - 640
+    observation.state:
+    - ${env.state_dim}
+  output_shapes:
+    action:
+    - ${env.action_dim}
+  input_normalization_modes:
+    observation.images.top: mean_std
+    observation.state: min_max
+  output_normalization_modes:
+    action: min_max
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  rescale_shape:
+  - 480
+  - 640
+  lora_rank: 20
+  merge_lora: true
+  crop_scale: 0.8
+  state_noise: 0.01
+  noise_decay: 0.999995
+  pre_norm: true
+  dim_model: 128
+  n_heads: 8
+  dim_feedforward: 512
+  n_decoder_layers: 8
+  dropout: 0.1
+  alpha: 0.98
+  train_alpha: 0.99
+  predict_every_n: 1
+  return_every_n: 1

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:706683a6b1c1c69f0b5cc577c9dcf08a8761ff30b1b25ab3511f7a0ab050ae5e
+size 56555664

replay.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff0d0e1c523e870ff2a57f9cd4823d07335973367c2e9e0ee71913b5894234e9
+size 202117

train_config.json ADDED Viewed

	@@ -0,0 +1,282 @@

+{
+    "dataset": {
+        "repo_id": "lerobot/aloha_sim_insertion_human",
+        "episodes": null,
+        "image_transforms": {
+            "enable": false,
+            "max_num_transforms": 3,
+            "random_order": false,
+            "tfs": {
+                "brightness": {
+                    "weight": 1.0,
+                    "type": "ColorJitter",
+                    "kwargs": {
+                        "brightness": [
+                            0.8,
+                            1.2
+                        ]
+                    }
+                },
+                "contrast": {
+                    "weight": 1.0,
+                    "type": "ColorJitter",
+                    "kwargs": {
+                        "contrast": [
+                            0.8,
+                            1.2
+                        ]
+                    }
+                },
+                "saturation": {
+                    "weight": 1.0,
+                    "type": "ColorJitter",
+                    "kwargs": {
+                        "saturation": [
+                            0.5,
+                            1.5
+                        ]
+                    }
+                },
+                "hue": {
+                    "weight": 1.0,
+                    "type": "ColorJitter",
+                    "kwargs": {
+                        "hue": [
+                            -0.05,
+                            0.05
+                        ]
+                    }
+                },
+                "sharpness": {
+                    "weight": 1.0,
+                    "type": "SharpnessJitter",
+                    "kwargs": {
+                        "sharpness": [
+                            0.5,
+                            1.5
+                        ]
+                    }
+                }
+            }
+        },
+        "local_files_only": false,
+        "use_imagenet_stats": true,
+        "video_backend": "pyav"
+    },
+    "env": {
+        "type": "aloha",
+        "task": "AlohaInsertion-v0",
+        "fps": 50,
+        "features": {
+            "action": {
+                "type": "ACTION",
+                "shape": [
+                    14
+                ]
+            },
+            "agent_pos": {
+                "type": "STATE",
+                "shape": [
+                    14
+                ]
+            },
+            "pixels/top": {
+                "type": "VISUAL",
+                "shape": [
+                    480,
+                    640,
+                    3
+                ]
+            }
+        },
+        "features_map": {
+            "action": "action",
+            "agent_pos": "observation.state",
+            "top": "observation.image.top",
+            "pixels/top": "observation.images.top"
+        },
+        "episode_length": 500,
+        "obs_type": "pixels_agent_pos",
+        "render_mode": "rgb_array"
+    },
+    "policy": {
+        "type": "dot",
+        "n_obs_steps": 3,
+        "normalization_mapping": {
+            "VISUAL": "MEAN_STD",
+            "STATE": "MIN_MAX",
+            "ENV": "MIN_MAX",
+            "ACTION": "MIN_MAX"
+        },
+        "input_features": {
+            "observation.images.top": {
+                "type": "VISUAL",
+                "shape": [
+                    3,
+                    480,
+                    640
+                ]
+            },
+            "observation.state": {
+                "type": "STATE",
+                "shape": [
+                    14
+                ]
+            }
+        },
+        "output_features": {
+            "action": {
+                "type": "ACTION",
+                "shape": [
+                    14
+                ]
+            }
+        },
+        "train_horizon": 150,
+        "inference_horizon": 100,
+        "lookback_obs_steps": 30,
+        "lookback_aug": 5,
+        "override_dataset_stats": false,
+        "new_dataset_stats": {
+            "action": {
+                "max": [
+                    512.0,
+                    512.0
+                ],
+                "min": [
+                    0.0,
+                    0.0
+                ]
+            },
+            "observation.environment_state": {
+                "max": [
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0,
+                    512.0
+                ],
+                "min": [
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0,
+                    0.0
+                ]
+            },
+            "observation.state": {
+                "max": [
+                    512.0,
+                    512.0
+                ],
+                "min": [
+                    0.0,
+                    0.0
+                ]
+            }
+        },
+        "vision_backbone": "resnet18",
+        "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
+        "pre_norm": true,
+        "lora_rank": 20,
+        "merge_lora": false,
+        "dim_model": 128,
+        "n_heads": 8,
+        "dim_feedforward": 512,
+        "n_decoder_layers": 8,
+        "rescale_shape": [
+            480,
+            640
+        ],
+        "crop_scale": 1.0,
+        "state_noise": 0.01,
+        "noise_decay": 0.999995,
+        "dropout": 0.1,
+        "alpha": 0.98,
+        "train_alpha": 0.99,
+        "predict_every_n": 1,
+        "return_every_n": 1,
+        "optimizer_lr": 3e-05,
+        "optimizer_min_lr": 1e-05,
+        "optimizer_lr_cycle_steps": 100000,
+        "optimizer_weight_decay": 1e-05
+    },
+    "output_dir": "outputs/train/pusht_aloha_insert",
+    "job_name": "aloha_dot",
+    "resume": false,
+    "device": "cuda",
+    "use_amp": true,
+    "seed": 100000,
+    "num_workers": 24,
+    "batch_size": 24,
+    "eval_freq": 10000,
+    "log_freq": 1000,
+    "save_checkpoint": true,
+    "save_freq": 10000,
+    "offline": {
+        "steps": 100000
+    },
+    "online": {
+        "steps": 0,
+        "rollout_n_episodes": 1,
+        "rollout_batch_size": 1,
+        "steps_between_rollouts": null,
+        "sampling_ratio": 0.5,
+        "env_seed": null,
+        "buffer_capacity": null,
+        "buffer_seed_size": 0,
+        "do_rollout_async": false
+    },
+    "use_policy_training_preset": true,
+    "optimizer": {
+        "type": "adamw",
+        "lr": 3e-05,
+        "weight_decay": 1e-05,
+        "grad_clip_norm": 10.0,
+        "betas": [
+            0.9,
+            0.999
+        ],
+        "eps": 1e-08
+    },
+    "scheduler": {
+        "type": "cosine_annealing",
+        "num_warmup_steps": 0,
+        "min_lr": 1e-05,
+        "T_max": 100000
+    },
+    "eval": {
+        "n_episodes": 50,
+        "batch_size": 50,
+        "use_async_envs": false
+    },
+    "wandb": {
+        "enable": true,
+        "disable_artifact": false,
+        "project": "insert",
+        "entity": null,
+        "notes": null
+    }
+}