jadechoghari committed on
Commit
bcc39cb
·
verified ·
1 Parent(s): 360a2e9

add initial files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. README.md +60 -0
  3. config.json +124 -0
  4. config.yaml +209 -0
  5. model.safetensors +3 -0
  6. replay.mp4 +3 -0
  7. train_config.json +284 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ replay.mp4 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: lerobot
3
+ tags:
4
+ - model_hub_mixin
5
+ - pytorch_model_hub_mixin
6
+ - robotics
7
+ - dot
8
+ license: apache-2.0
9
+ datasets:
10
+ - lerobot/pusht
11
+ pipeline_tag: robotics
12
+ ---
13
+
14
+ # Model Card for "Decoder Only Transformer (DOT) Policy" for PushT images dataset
15
+
16
+ Read more about the model and implementation details in the [DOT Policy repository](https://github.com/IliaLarchenko/dot_policy).
17
+
18
+ This model is trained using the [LeRobot library](https://huggingface.co/lerobot) and achieves state-of-the-art results on behavior cloning on the PushT images dataset. It achieves a 74.2% success rate (and 0.936 average max reward) vs. ~69% for the previous state-of-the-art models (Diffusion and VQ-BeT, which perform about the same).
19
+
20
+ This result is achieved without checkpoint selection and is easy to reproduce.
21
+
22
+ You can use this model by installing LeRobot from [this branch](https://github.com/IliaLarchenko/lerobot/tree/dot).
23
+
24
+ To train the model:
25
+
26
+ ```bash
27
+ python lerobot/scripts/train.py \
28
+ --policy.type=dot \
29
+ --dataset.repo_id=lerobot/pusht \
30
+ --env.type=pusht \
31
+ --env.task=PushT-v0 \
32
+ --output_dir=outputs/train/pusht_images \
33
+ --batch_size=24 \
34
+ --log_freq=1000 \
35
+ --eval_freq=10000 \
36
+ --save_freq=50000 \
37
+ --offline.steps=1000000 \
38
+ --seed=100000 \
39
+ --wandb.enable=true \
40
+ --num_workers=24 \
41
+ --use_amp=true \
42
+ --device=cuda \
43
+ --policy.return_every_n=2
44
+ ```
45
+
46
+ To evaluate the model:
47
+
48
+ ```bash
49
+ python lerobot/scripts/eval.py \
50
+ --policy.path=IliaLarchenko/dot_pusht_images \
51
+ --env.type=pusht \
52
+ --env.task=PushT-v0 \
53
+ --eval.n_episodes=1000 \
54
+ --eval.batch_size=100 \
55
+ --seed=1000000
56
+ ```
57
+
58
+ Model size:
59
+ - Total parameters: 14.1M
59
+ - Trainable parameters: 2.9M
config.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "dot",
3
+ "n_obs_steps": 3,
4
+ "normalization_mapping": {
5
+ "VISUAL": "MEAN_STD",
6
+ "STATE": "MIN_MAX",
7
+ "ENV": "MIN_MAX",
8
+ "ACTION": "MIN_MAX"
9
+ },
10
+ "input_features": {
11
+ "observation.image": {
12
+ "type": "VISUAL",
13
+ "shape": [
14
+ 3,
15
+ 96,
16
+ 96
17
+ ]
18
+ },
19
+ "observation.state": {
20
+ "type": "STATE",
21
+ "shape": [
22
+ 2
23
+ ]
24
+ }
25
+ },
26
+ "output_features": {
27
+ "action": {
28
+ "type": "ACTION",
29
+ "shape": [
30
+ 2
31
+ ]
32
+ }
33
+ },
34
+ "train_horizon": 20,
35
+ "inference_horizon": 20,
36
+ "lookback_obs_steps": 10,
37
+ "lookback_aug": 5,
38
+ "override_dataset_stats": false,
39
+ "new_dataset_stats": {
40
+ "action": {
41
+ "max": [
42
+ 512.0,
43
+ 512.0
44
+ ],
45
+ "min": [
46
+ 0.0,
47
+ 0.0
48
+ ]
49
+ },
50
+ "observation.environment_state": {
51
+ "max": [
52
+ 512.0,
53
+ 512.0,
54
+ 512.0,
55
+ 512.0,
56
+ 512.0,
57
+ 512.0,
58
+ 512.0,
59
+ 512.0,
60
+ 512.0,
61
+ 512.0,
62
+ 512.0,
63
+ 512.0,
64
+ 512.0,
65
+ 512.0,
66
+ 512.0,
67
+ 512.0
68
+ ],
69
+ "min": [
70
+ 0.0,
71
+ 0.0,
72
+ 0.0,
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0,
77
+ 0.0,
78
+ 0.0,
79
+ 0.0,
80
+ 0.0,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0
86
+ ]
87
+ },
88
+ "observation.state": {
89
+ "max": [
90
+ 512.0,
91
+ 512.0
92
+ ],
93
+ "min": [
94
+ 0.0,
95
+ 0.0
96
+ ]
97
+ }
98
+ },
99
+ "vision_backbone": "resnet18",
100
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
101
+ "pre_norm": true,
102
+ "lora_rank": 20,
103
+ "merge_lora": false,
104
+ "dim_model": 128,
105
+ "n_heads": 8,
106
+ "dim_feedforward": 512,
107
+ "n_decoder_layers": 8,
108
+ "rescale_shape": [
109
+ 96,
110
+ 96
111
+ ],
112
+ "crop_scale": 1.0,
113
+ "state_noise": 0.01,
114
+ "noise_decay": 0.999995,
115
+ "dropout": 0.1,
116
+ "alpha": 0.75,
117
+ "train_alpha": 0.9,
118
+ "predict_every_n": 1,
119
+ "return_every_n": 2,
120
+ "optimizer_lr": 0.0001,
121
+ "optimizer_min_lr": 0.0001,
122
+ "optimizer_lr_cycle_steps": 300000,
123
+ "optimizer_weight_decay": 1e-05
124
+ }
config.yaml ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resume: false
2
+ device: cuda
3
+ use_amp: true
4
+ seed: 100000
5
+ dataset_repo_id: lerobot/pusht
6
+ video_backend: pyav
7
+ training:
8
+ offline_steps: 1000000
9
+ num_workers: 24
10
+ batch_size: 24
11
+ eval_freq: 10000
12
+ log_freq: 1000
13
+ save_checkpoint: true
14
+ save_freq: 50000
15
+ online_steps: 0
16
+ online_rollout_n_episodes: 1
17
+ online_rollout_batch_size: 1
18
+ online_steps_between_rollouts: 1
19
+ online_sampling_ratio: 0.5
20
+ online_env_seed: null
21
+ online_buffer_capacity: null
22
+ online_buffer_seed_size: 0
23
+ do_online_rollout_async: false
24
+ image_transforms:
25
+ enable: false
26
+ max_num_transforms: 3
27
+ random_order: false
28
+ brightness:
29
+ weight: 1
30
+ min_max:
31
+ - 0.8
32
+ - 1.2
33
+ contrast:
34
+ weight: 1
35
+ min_max:
36
+ - 0.8
37
+ - 1.2
38
+ saturation:
39
+ weight: 1
40
+ min_max:
41
+ - 0.5
42
+ - 1.5
43
+ hue:
44
+ weight: 1
45
+ min_max:
46
+ - -0.05
47
+ - 0.05
48
+ sharpness:
49
+ weight: 1
50
+ min_max:
51
+ - 0.8
52
+ - 1.2
53
+ save_model: true
54
+ grad_clip_norm: 50
55
+ lr: 0.0001
56
+ min_lr: 0.0001
57
+ lr_cycle_steps: 300000
58
+ weight_decay: 1.0e-05
59
+ delta_timestamps:
60
+ observation.image:
61
+ - -1.5
62
+ - -1.4
63
+ - -1.3
64
+ - -1.2
65
+ - -1.1
66
+ - -1.0
67
+ - -0.9
68
+ - -0.8
69
+ - -0.7
70
+ - -0.6
71
+ - -0.5
72
+ - -0.1
73
+ - 0.0
74
+ observation.state:
75
+ - -1.5
76
+ - -1.4
77
+ - -1.3
78
+ - -1.2
79
+ - -1.1
80
+ - -1.0
81
+ - -0.9
82
+ - -0.8
83
+ - -0.7
84
+ - -0.6
85
+ - -0.5
86
+ - -0.1
87
+ - 0.0
88
+ action:
89
+ - -1.5
90
+ - -1.4
91
+ - -1.3
92
+ - -1.2
93
+ - -1.1
94
+ - -1.0
95
+ - -0.9
96
+ - -0.8
97
+ - -0.7
98
+ - -0.6
99
+ - -0.5
100
+ - -0.1
101
+ - 0.0
102
+ - 0.1
103
+ - 0.2
104
+ - 0.3
105
+ - 0.4
106
+ - 0.5
107
+ - 0.6
108
+ - 0.7
109
+ - 0.8
110
+ - 0.9
111
+ - 1.0
112
+ - 1.1
113
+ - 1.2
114
+ - 1.3
115
+ - 1.4
116
+ - 1.5
117
+ - 1.6
118
+ - 1.7
119
+ - 1.8
120
+ - 1.9
121
+ eval:
122
+ n_episodes: 100
123
+ batch_size: 100
124
+ use_async_envs: false
125
+ wandb:
126
+ enable: true
127
+ disable_artifact: false
128
+ project: lerobot
129
+ notes: ''
130
+ fps: 10
131
+ env:
132
+ name: pusht
133
+ task: PushT-v0
134
+ image_size: 96
135
+ state_dim: 2
136
+ action_dim: 2
137
+ fps: ${fps}
138
+ episode_length: 300
139
+ gym:
140
+ obs_type: pixels_agent_pos
141
+ render_mode: rgb_array
142
+ visualization_width: 384
143
+ visualization_height: 384
144
+ override_dataset_stats:
145
+ observation.image:
146
+ mean:
147
+ - - - 0.485
148
+ - - - 0.456
149
+ - - - 0.406
150
+ std:
151
+ - - - 0.229
152
+ - - - 0.224
153
+ - - - 0.225
154
+ observation.state:
155
+ min:
156
+ - 0.0
157
+ - 0.0
158
+ max:
159
+ - 512.0
160
+ - 512.0
161
+ action:
162
+ min:
163
+ - 0.0
164
+ - 0.0
165
+ max:
166
+ - 512.0
167
+ - 512.0
168
+ policy:
169
+ name: dot
170
+ n_obs_steps: 3
171
+ train_horizon: 20
172
+ inference_horizon: 20
173
+ lookback_obs_steps: 10
174
+ lookback_aug: 5
175
+ input_shapes:
176
+ observation.image:
177
+ - 3
178
+ - 96
179
+ - 96
180
+ observation.state:
181
+ - ${env.state_dim}
182
+ output_shapes:
183
+ action:
184
+ - ${env.action_dim}
185
+ input_normalization_modes:
186
+ observation.image: mean_std
187
+ observation.state: min_max
188
+ output_normalization_modes:
189
+ action: min_max
190
+ vision_backbone: resnet18
191
+ pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
192
+ rescale_shape:
193
+ - 96
194
+ - 96
195
+ lora_rank: 20
196
+ merge_lora: true
197
+ crop_scale: 0.8
198
+ state_noise: 0.01
199
+ noise_decay: 0.999995
200
+ pre_norm: true
201
+ dim_model: 128
202
+ n_heads: 8
203
+ dim_feedforward: 512
204
+ n_decoder_layers: 8
205
+ dropout: 0.1
206
+ alpha: 0.75
207
+ train_alpha: 0.9
208
+ predict_every_n: 1
209
+ return_every_n: 2
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f87cdd4cc31b979724c8bc0dde0076f5d533398d0a4f419edf5b961e38ee460
3
+ size 56412020
replay.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c125bf9bc41d5c9eba693a5f9d1781148b5629910429d7c9b477c65eb33d53e0
3
+ size 146456
train_config.json ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": {
3
+ "repo_id": "lerobot/pusht",
4
+ "episodes": null,
5
+ "image_transforms": {
6
+ "enable": false,
7
+ "max_num_transforms": 3,
8
+ "random_order": false,
9
+ "tfs": {
10
+ "brightness": {
11
+ "weight": 1.0,
12
+ "type": "ColorJitter",
13
+ "kwargs": {
14
+ "brightness": [
15
+ 0.8,
16
+ 1.2
17
+ ]
18
+ }
19
+ },
20
+ "contrast": {
21
+ "weight": 1.0,
22
+ "type": "ColorJitter",
23
+ "kwargs": {
24
+ "contrast": [
25
+ 0.8,
26
+ 1.2
27
+ ]
28
+ }
29
+ },
30
+ "saturation": {
31
+ "weight": 1.0,
32
+ "type": "ColorJitter",
33
+ "kwargs": {
34
+ "saturation": [
35
+ 0.5,
36
+ 1.5
37
+ ]
38
+ }
39
+ },
40
+ "hue": {
41
+ "weight": 1.0,
42
+ "type": "ColorJitter",
43
+ "kwargs": {
44
+ "hue": [
45
+ -0.05,
46
+ 0.05
47
+ ]
48
+ }
49
+ },
50
+ "sharpness": {
51
+ "weight": 1.0,
52
+ "type": "SharpnessJitter",
53
+ "kwargs": {
54
+ "sharpness": [
55
+ 0.5,
56
+ 1.5
57
+ ]
58
+ }
59
+ }
60
+ }
61
+ },
62
+ "local_files_only": false,
63
+ "use_imagenet_stats": true,
64
+ "video_backend": "pyav"
65
+ },
66
+ "env": {
67
+ "type": "pusht",
68
+ "task": "PushT-v0",
69
+ "fps": 10,
70
+ "features": {
71
+ "action": {
72
+ "type": "ACTION",
73
+ "shape": [
74
+ 2
75
+ ]
76
+ },
77
+ "agent_pos": {
78
+ "type": "STATE",
79
+ "shape": [
80
+ 2
81
+ ]
82
+ },
83
+ "pixels": {
84
+ "type": "VISUAL",
85
+ "shape": [
86
+ 384,
87
+ 384,
88
+ 3
89
+ ]
90
+ }
91
+ },
92
+ "features_map": {
93
+ "action": "action",
94
+ "agent_pos": "observation.state",
95
+ "environment_state": "observation.environment_state",
96
+ "pixels": "observation.image"
97
+ },
98
+ "episode_length": 300,
99
+ "obs_type": "pixels_agent_pos",
100
+ "render_mode": "rgb_array",
101
+ "visualization_width": 384,
102
+ "visualization_height": 384
103
+ },
104
+ "policy": {
105
+ "type": "dot",
106
+ "n_obs_steps": 3,
107
+ "normalization_mapping": {
108
+ "VISUAL": "MEAN_STD",
109
+ "STATE": "MIN_MAX",
110
+ "ENV": "MIN_MAX",
111
+ "ACTION": "MIN_MAX"
112
+ },
113
+ "input_features": {
114
+ "observation.image": {
115
+ "type": "VISUAL",
116
+ "shape": [
117
+ 3,
118
+ 96,
119
+ 96
120
+ ]
121
+ },
122
+ "observation.state": {
123
+ "type": "STATE",
124
+ "shape": [
125
+ 2
126
+ ]
127
+ }
128
+ },
129
+ "output_features": {
130
+ "action": {
131
+ "type": "ACTION",
132
+ "shape": [
133
+ 2
134
+ ]
135
+ }
136
+ },
137
+ "train_horizon": 20,
138
+ "inference_horizon": 20,
139
+ "lookback_obs_steps": 10,
140
+ "lookback_aug": 5,
141
+ "override_dataset_stats": false,
142
+ "new_dataset_stats": {
143
+ "action": {
144
+ "max": [
145
+ 512.0,
146
+ 512.0
147
+ ],
148
+ "min": [
149
+ 0.0,
150
+ 0.0
151
+ ]
152
+ },
153
+ "observation.environment_state": {
154
+ "max": [
155
+ 512.0,
156
+ 512.0,
157
+ 512.0,
158
+ 512.0,
159
+ 512.0,
160
+ 512.0,
161
+ 512.0,
162
+ 512.0,
163
+ 512.0,
164
+ 512.0,
165
+ 512.0,
166
+ 512.0,
167
+ 512.0,
168
+ 512.0,
169
+ 512.0,
170
+ 512.0
171
+ ],
172
+ "min": [
173
+ 0.0,
174
+ 0.0,
175
+ 0.0,
176
+ 0.0,
177
+ 0.0,
178
+ 0.0,
179
+ 0.0,
180
+ 0.0,
181
+ 0.0,
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0,
186
+ 0.0,
187
+ 0.0,
188
+ 0.0
189
+ ]
190
+ },
191
+ "observation.state": {
192
+ "max": [
193
+ 512.0,
194
+ 512.0
195
+ ],
196
+ "min": [
197
+ 0.0,
198
+ 0.0
199
+ ]
200
+ }
201
+ },
202
+ "vision_backbone": "resnet18",
203
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
204
+ "pre_norm": true,
205
+ "lora_rank": 20,
206
+ "merge_lora": false,
207
+ "dim_model": 128,
208
+ "n_heads": 8,
209
+ "dim_feedforward": 512,
210
+ "n_decoder_layers": 8,
211
+ "rescale_shape": [
212
+ 96,
213
+ 96
214
+ ],
215
+ "crop_scale": 1.0,
216
+ "state_noise": 0.01,
217
+ "noise_decay": 0.999995,
218
+ "dropout": 0.1,
219
+ "alpha": 0.75,
220
+ "train_alpha": 0.9,
221
+ "predict_every_n": 1,
222
+ "return_every_n": 2,
223
+ "optimizer_lr": 0.0001,
224
+ "optimizer_min_lr": 0.0001,
225
+ "optimizer_lr_cycle_steps": 300000,
226
+ "optimizer_weight_decay": 1e-05
227
+ },
228
+ "output_dir": "outputs/train/pusht_images",
229
+ "job_name": "pusht_dot",
230
+ "resume": false,
231
+ "device": "cuda",
232
+ "use_amp": true,
233
+ "seed": 100000,
234
+ "num_workers": 24,
235
+ "batch_size": 24,
236
+ "eval_freq": 10000,
237
+ "log_freq": 1000,
238
+ "save_checkpoint": true,
239
+ "save_freq": 50000,
240
+ "offline": {
241
+ "steps": 1000000
242
+ },
243
+ "online": {
244
+ "steps": 0,
245
+ "rollout_n_episodes": 1,
246
+ "rollout_batch_size": 1,
247
+ "steps_between_rollouts": null,
248
+ "sampling_ratio": 0.5,
249
+ "env_seed": null,
250
+ "buffer_capacity": null,
251
+ "buffer_seed_size": 0,
252
+ "do_rollout_async": false
253
+ },
254
+ "use_policy_training_preset": true,
255
+ "optimizer": {
256
+ "type": "adamw",
257
+ "lr": 0.0001,
258
+ "weight_decay": 1e-05,
259
+ "grad_clip_norm": 10.0,
260
+ "betas": [
261
+ 0.9,
262
+ 0.999
263
+ ],
264
+ "eps": 1e-08
265
+ },
266
+ "scheduler": {
267
+ "type": "cosine_annealing",
268
+ "num_warmup_steps": 0,
269
+ "min_lr": 0.0001,
270
+ "T_max": 300000
271
+ },
272
+ "eval": {
273
+ "n_episodes": 50,
274
+ "batch_size": 50,
275
+ "use_async_envs": false
276
+ },
277
+ "wandb": {
278
+ "enable": true,
279
+ "disable_artifact": false,
280
+ "project": "pusht",
281
+ "entity": null,
282
+ "notes": null
283
+ }
284
+ }