evgmaslov commited on
Commit
4b6787f
·
verified ·
1 Parent(s): 15174b7

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c41c39d9c3a8d36a83c6153b792c6e9513ee0da56d09c279e6e83bdc7e082274
3
  size 17314248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba2490278745370a0e92feab010428c3d46e49841f1ce40a8ddcf298824c4be
3
  size 17314248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e20e173755c7aa5618757be3132d099c1ffe3c13d7d69593b5143ffbe5529794
3
  size 34683834
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4efb73d606ea4b9b94666be2dbc85f0a8c60c04af8ea5b2fe019ecd2feeb4cfd
3
  size 34683834
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccebcf5733293da2f68a329f1452457cf7c7711cf93a243687f09350d477ae12
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec90b0f58085113a67da177dc5886f1d7d2c830d920310fbe2d270df4d2e0e6c
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cbcf0598523b56d5f4007caf4b30be22f0216f18a94e550ee53f76bb51e29ea
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11cbc571c0ea4ecfbd0bc7e22b86e1f554cb7f4c3811c527589eede8d6383d3c
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2096d81ed883fb030e6bcacd60fbf8324656acc54a93864ebc32cc4dc168f344
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71909fdec59845704cbc3f7a48087748da252f0311063596d2fb98b5e6dea843
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:229997ea7ae1a976ba7cdf7afc75d010f51443ed8a5ff599641a51d554b1bc1b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6238d47c062d6f056b0d2b1c5c05d8ce6a9725849934b38ed3caf5eba400d1f
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89d56f21be427e9d352f9200c72550da0726fb8591f2f2ab23a2f7669602b531
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:516eb9dd6a876ecc0e42a390244d9a3d6b93b342ffc744b38d719d5915f737c9
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8cf3f2633e09298d1cc0110cda1ea1de49e640c8195560e3b22f379c6169cd8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e97851b2d6745d832ea0f07f952215ad1038a884726609e20c1b61f5099d1d4
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e6e8d3c8f74a7d4a780a5b156d3d3eb25de240f02cdd1359b29ad8a6f8dd6f3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1276e795a5322ee56e33d91c08950330de1ca7a6fc188fd7eda4cffeecbbcc07
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40a0d5aa8e343ebcaa6ed3c99ba601e5f939b625317c6656b2cff2f5ed37cae6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d1616ffb7a967870e2e11d0459bca8e44cce795e5850676f198ba8b3dc642b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2425222312045271,
5
  "eval_steps": 500,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5107,6 +5107,1706 @@
5107
  "rewards/prompt_consistency_reward_2": 0.0,
5108
  "rewards/walls_orthogonality_reward_2": 0.0,
5109
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5110
  }
5111
  ],
5112
  "logging_steps": 1,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.32336297493936944,
5
  "eval_steps": 500,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5107
  "rewards/prompt_consistency_reward_2": 0.0,
5108
  "rewards/walls_orthogonality_reward_2": 0.0,
5109
  "step": 300
5110
+ },
5111
+ {
5112
+ "completion_length": 1201.0,
5113
+ "epoch": 0.2433306386418755,
5114
+ "grad_norm": 0.042805083096027374,
5115
+ "kl": 0.018200375139713287,
5116
+ "learning_rate": 6.053354890864995e-06,
5117
+ "loss": 0.0007,
5118
+ "reward": 0.3005952537059784,
5119
+ "reward_std": 0.23874062299728394,
5120
+ "rewards/answer_format_reward_2": 0.0,
5121
+ "rewards/common_format_reward": 0.3005952537059784,
5122
+ "rewards/doors_consistency_reward_2": 0.0,
5123
+ "rewards/geometry_consistency_reward_2": 0.0,
5124
+ "rewards/prompt_consistency_reward_2": 0.0,
5125
+ "rewards/walls_orthogonality_reward_2": 0.0,
5126
+ "step": 301
5127
+ },
5128
+ {
5129
+ "completion_length": 1202.71435546875,
5130
+ "epoch": 0.24413904607922393,
5131
+ "grad_norm": 0.028551368042826653,
5132
+ "kl": 0.004892417229712009,
5133
+ "learning_rate": 6.046887631366209e-06,
5134
+ "loss": 0.0002,
5135
+ "reward": 0.3528439402580261,
5136
+ "reward_std": 0.3776737153530121,
5137
+ "rewards/answer_format_reward_2": 0.034391533583402634,
5138
+ "rewards/common_format_reward": 0.318452388048172,
5139
+ "rewards/doors_consistency_reward_2": 0.0,
5140
+ "rewards/geometry_consistency_reward_2": 0.0,
5141
+ "rewards/prompt_consistency_reward_2": 0.0,
5142
+ "rewards/walls_orthogonality_reward_2": 0.0,
5143
+ "step": 302
5144
+ },
5145
+ {
5146
+ "completion_length": 1226.9285888671875,
5147
+ "epoch": 0.24494745351657235,
5148
+ "grad_norm": 0.03154255077242851,
5149
+ "kl": 0.027800392359495163,
5150
+ "learning_rate": 6.040420371867421e-06,
5151
+ "loss": 0.0011,
5152
+ "reward": 0.2857142984867096,
5153
+ "reward_std": 0.27040985226631165,
5154
+ "rewards/answer_format_reward_2": 0.0,
5155
+ "rewards/common_format_reward": 0.2857142984867096,
5156
+ "rewards/doors_consistency_reward_2": 0.0,
5157
+ "rewards/geometry_consistency_reward_2": 0.0,
5158
+ "rewards/prompt_consistency_reward_2": 0.0,
5159
+ "rewards/walls_orthogonality_reward_2": 0.0,
5160
+ "step": 303
5161
+ },
5162
+ {
5163
+ "completion_length": 1404.71435546875,
5164
+ "epoch": 0.24575586095392077,
5165
+ "grad_norm": 0.03344067931175232,
5166
+ "kl": 0.00035073162871412933,
5167
+ "learning_rate": 6.033953112368633e-06,
5168
+ "loss": 0.0,
5169
+ "reward": 0.3095238208770752,
5170
+ "reward_std": 0.2419729083776474,
5171
+ "rewards/answer_format_reward_2": 0.0,
5172
+ "rewards/common_format_reward": 0.3095238208770752,
5173
+ "rewards/doors_consistency_reward_2": 0.0,
5174
+ "rewards/geometry_consistency_reward_2": 0.0,
5175
+ "rewards/prompt_consistency_reward_2": 0.0,
5176
+ "rewards/walls_orthogonality_reward_2": 0.0,
5177
+ "step": 304
5178
+ },
5179
+ {
5180
+ "completion_length": 1265.642822265625,
5181
+ "epoch": 0.2465642683912692,
5182
+ "grad_norm": 0.016976112499833107,
5183
+ "kl": 0.00034852040698751807,
5184
+ "learning_rate": 6.027485852869846e-06,
5185
+ "loss": 0.0,
5186
+ "reward": 0.3154762089252472,
5187
+ "reward_std": 0.2720959186553955,
5188
+ "rewards/answer_format_reward_2": 0.0,
5189
+ "rewards/common_format_reward": 0.3154762089252472,
5190
+ "rewards/doors_consistency_reward_2": 0.0,
5191
+ "rewards/geometry_consistency_reward_2": 0.0,
5192
+ "rewards/prompt_consistency_reward_2": 0.0,
5193
+ "rewards/walls_orthogonality_reward_2": 0.0,
5194
+ "step": 305
5195
+ },
5196
+ {
5197
+ "completion_length": 1374.357177734375,
5198
+ "epoch": 0.24737267582861763,
5199
+ "grad_norm": 0.057332366704940796,
5200
+ "kl": 0.028435252606868744,
5201
+ "learning_rate": 6.0210185933710585e-06,
5202
+ "loss": 0.0011,
5203
+ "reward": 0.3333333432674408,
5204
+ "reward_std": 0.24095678329467773,
5205
+ "rewards/answer_format_reward_2": 0.0,
5206
+ "rewards/common_format_reward": 0.3333333432674408,
5207
+ "rewards/doors_consistency_reward_2": 0.0,
5208
+ "rewards/geometry_consistency_reward_2": 0.0,
5209
+ "rewards/prompt_consistency_reward_2": 0.0,
5210
+ "rewards/walls_orthogonality_reward_2": 0.0,
5211
+ "step": 306
5212
+ },
5213
+ {
5214
+ "completion_length": 1381.642822265625,
5215
+ "epoch": 0.24818108326596605,
5216
+ "grad_norm": 0.020299529656767845,
5217
+ "kl": 0.0003246701671741903,
5218
+ "learning_rate": 6.0145513338722715e-06,
5219
+ "loss": 0.0,
5220
+ "reward": 0.3333333432674408,
5221
+ "reward_std": 0.25516438484191895,
5222
+ "rewards/answer_format_reward_2": 0.0,
5223
+ "rewards/common_format_reward": 0.3333333432674408,
5224
+ "rewards/doors_consistency_reward_2": 0.0,
5225
+ "rewards/geometry_consistency_reward_2": 0.0,
5226
+ "rewards/prompt_consistency_reward_2": 0.0,
5227
+ "rewards/walls_orthogonality_reward_2": 0.0,
5228
+ "step": 307
5229
+ },
5230
+ {
5231
+ "completion_length": 1389.047607421875,
5232
+ "epoch": 0.24898949070331447,
5233
+ "grad_norm": 0.025671424344182014,
5234
+ "kl": 0.015724550932645798,
5235
+ "learning_rate": 6.0080840743734845e-06,
5236
+ "loss": 0.0006,
5237
+ "reward": 0.3303571343421936,
5238
+ "reward_std": 0.23312142491340637,
5239
+ "rewards/answer_format_reward_2": 0.0,
5240
+ "rewards/common_format_reward": 0.3303571343421936,
5241
+ "rewards/doors_consistency_reward_2": 0.0,
5242
+ "rewards/geometry_consistency_reward_2": 0.0,
5243
+ "rewards/prompt_consistency_reward_2": 0.0,
5244
+ "rewards/walls_orthogonality_reward_2": 0.0,
5245
+ "step": 308
5246
+ },
5247
+ {
5248
+ "completion_length": 1175.8809814453125,
5249
+ "epoch": 0.2497978981406629,
5250
+ "grad_norm": 0.04253083094954491,
5251
+ "kl": 0.01416640356183052,
5252
+ "learning_rate": 6.001616814874697e-06,
5253
+ "loss": 0.0006,
5254
+ "reward": 0.3630952537059784,
5255
+ "reward_std": 0.28659990429878235,
5256
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5257
+ "rewards/common_format_reward": 0.3511904776096344,
5258
+ "rewards/doors_consistency_reward_2": 0.0,
5259
+ "rewards/geometry_consistency_reward_2": 0.0,
5260
+ "rewards/prompt_consistency_reward_2": 0.0,
5261
+ "rewards/walls_orthogonality_reward_2": 0.0,
5262
+ "step": 309
5263
+ },
5264
+ {
5265
+ "completion_length": 1261.3333740234375,
5266
+ "epoch": 0.25060630557801133,
5267
+ "grad_norm": 0.023210233077406883,
5268
+ "kl": 0.01728440821170807,
5269
+ "learning_rate": 5.995149555375909e-06,
5270
+ "loss": 0.0007,
5271
+ "reward": 0.3422619104385376,
5272
+ "reward_std": 0.21419870853424072,
5273
+ "rewards/answer_format_reward_2": 0.0,
5274
+ "rewards/common_format_reward": 0.3422619104385376,
5275
+ "rewards/doors_consistency_reward_2": 0.0,
5276
+ "rewards/geometry_consistency_reward_2": 0.0,
5277
+ "rewards/prompt_consistency_reward_2": 0.0,
5278
+ "rewards/walls_orthogonality_reward_2": 0.0,
5279
+ "step": 310
5280
+ },
5281
+ {
5282
+ "completion_length": 1486.59521484375,
5283
+ "epoch": 0.2514147130153597,
5284
+ "grad_norm": 0.01862664520740509,
5285
+ "kl": 0.006662237923592329,
5286
+ "learning_rate": 5.988682295877122e-06,
5287
+ "loss": 0.0003,
5288
+ "reward": 0.4375,
5289
+ "reward_std": 0.27554935216903687,
5290
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5291
+ "rewards/common_format_reward": 0.4255952537059784,
5292
+ "rewards/doors_consistency_reward_2": 0.0,
5293
+ "rewards/geometry_consistency_reward_2": 0.0,
5294
+ "rewards/prompt_consistency_reward_2": 0.0,
5295
+ "rewards/walls_orthogonality_reward_2": 0.0,
5296
+ "step": 311
5297
+ },
5298
+ {
5299
+ "completion_length": 1378.90478515625,
5300
+ "epoch": 0.25222312045270817,
5301
+ "grad_norm": 0.021754320710897446,
5302
+ "kl": 0.006686628330498934,
5303
+ "learning_rate": 5.982215036378335e-06,
5304
+ "loss": 0.0003,
5305
+ "reward": 0.2767857313156128,
5306
+ "reward_std": 0.2382291704416275,
5307
+ "rewards/answer_format_reward_2": 0.0,
5308
+ "rewards/common_format_reward": 0.2767857313156128,
5309
+ "rewards/doors_consistency_reward_2": 0.0,
5310
+ "rewards/geometry_consistency_reward_2": 0.0,
5311
+ "rewards/prompt_consistency_reward_2": 0.0,
5312
+ "rewards/walls_orthogonality_reward_2": 0.0,
5313
+ "step": 312
5314
+ },
5315
+ {
5316
+ "completion_length": 1350.952392578125,
5317
+ "epoch": 0.25303152789005656,
5318
+ "grad_norm": 0.01587211713194847,
5319
+ "kl": 0.007829620502889156,
5320
+ "learning_rate": 5.975747776879547e-06,
5321
+ "loss": 0.0003,
5322
+ "reward": 0.2708333432674408,
5323
+ "reward_std": 0.27112093567848206,
5324
+ "rewards/answer_format_reward_2": 0.0,
5325
+ "rewards/common_format_reward": 0.2708333432674408,
5326
+ "rewards/doors_consistency_reward_2": 0.0,
5327
+ "rewards/geometry_consistency_reward_2": 0.0,
5328
+ "rewards/prompt_consistency_reward_2": 0.0,
5329
+ "rewards/walls_orthogonality_reward_2": 0.0,
5330
+ "step": 313
5331
+ },
5332
+ {
5333
+ "completion_length": 1411.5714111328125,
5334
+ "epoch": 0.253839935327405,
5335
+ "grad_norm": 0.036893732845783234,
5336
+ "kl": 0.020039239898324013,
5337
+ "learning_rate": 5.96928051738076e-06,
5338
+ "loss": 0.0008,
5339
+ "reward": 0.261904776096344,
5340
+ "reward_std": 0.25443580746650696,
5341
+ "rewards/answer_format_reward_2": 0.0,
5342
+ "rewards/common_format_reward": 0.261904776096344,
5343
+ "rewards/doors_consistency_reward_2": 0.0,
5344
+ "rewards/geometry_consistency_reward_2": 0.0,
5345
+ "rewards/prompt_consistency_reward_2": 0.0,
5346
+ "rewards/walls_orthogonality_reward_2": 0.0,
5347
+ "step": 314
5348
+ },
5349
+ {
5350
+ "completion_length": 1321.4761962890625,
5351
+ "epoch": 0.25464834276475345,
5352
+ "grad_norm": 0.03680587187409401,
5353
+ "kl": 0.007459270767867565,
5354
+ "learning_rate": 5.962813257881972e-06,
5355
+ "loss": 0.0003,
5356
+ "reward": 0.3303571343421936,
5357
+ "reward_std": 0.22571614384651184,
5358
+ "rewards/answer_format_reward_2": 0.0,
5359
+ "rewards/common_format_reward": 0.3303571343421936,
5360
+ "rewards/doors_consistency_reward_2": 0.0,
5361
+ "rewards/geometry_consistency_reward_2": 0.0,
5362
+ "rewards/prompt_consistency_reward_2": 0.0,
5363
+ "rewards/walls_orthogonality_reward_2": 0.0,
5364
+ "step": 315
5365
+ },
5366
+ {
5367
+ "completion_length": 1151.3333740234375,
5368
+ "epoch": 0.25545675020210185,
5369
+ "grad_norm": 0.020981987938284874,
5370
+ "kl": 0.009367621503770351,
5371
+ "learning_rate": 5.956345998383184e-06,
5372
+ "loss": 0.0004,
5373
+ "reward": 0.3154762089252472,
5374
+ "reward_std": 0.27415454387664795,
5375
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5376
+ "rewards/common_format_reward": 0.3035714328289032,
5377
+ "rewards/doors_consistency_reward_2": 0.0,
5378
+ "rewards/geometry_consistency_reward_2": 0.0,
5379
+ "rewards/prompt_consistency_reward_2": 0.0,
5380
+ "rewards/walls_orthogonality_reward_2": 0.0,
5381
+ "step": 316
5382
+ },
5383
+ {
5384
+ "completion_length": 1168.8809814453125,
5385
+ "epoch": 0.2562651576394503,
5386
+ "grad_norm": 0.019157856702804565,
5387
+ "kl": 0.0004043486842419952,
5388
+ "learning_rate": 5.949878738884398e-06,
5389
+ "loss": 0.0,
5390
+ "reward": 0.2767857313156128,
5391
+ "reward_std": 0.26202771067619324,
5392
+ "rewards/answer_format_reward_2": 0.0,
5393
+ "rewards/common_format_reward": 0.2767857313156128,
5394
+ "rewards/doors_consistency_reward_2": 0.0,
5395
+ "rewards/geometry_consistency_reward_2": 0.0,
5396
+ "rewards/prompt_consistency_reward_2": 0.0,
5397
+ "rewards/walls_orthogonality_reward_2": 0.0,
5398
+ "step": 317
5399
+ },
5400
+ {
5401
+ "completion_length": 1251.90478515625,
5402
+ "epoch": 0.2570735650767987,
5403
+ "grad_norm": 0.025573352351784706,
5404
+ "kl": 0.0004263351147528738,
5405
+ "learning_rate": 5.9434114793856104e-06,
5406
+ "loss": 0.0,
5407
+ "reward": 0.3214285671710968,
5408
+ "reward_std": 0.3095901608467102,
5409
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5410
+ "rewards/common_format_reward": 0.3095238208770752,
5411
+ "rewards/doors_consistency_reward_2": 0.0,
5412
+ "rewards/geometry_consistency_reward_2": 0.0,
5413
+ "rewards/prompt_consistency_reward_2": 0.0,
5414
+ "rewards/walls_orthogonality_reward_2": 0.0,
5415
+ "step": 318
5416
+ },
5417
+ {
5418
+ "completion_length": 1359.357177734375,
5419
+ "epoch": 0.25788197251414713,
5420
+ "grad_norm": 0.03138462454080582,
5421
+ "kl": 0.010758555494248867,
5422
+ "learning_rate": 5.936944219886823e-06,
5423
+ "loss": 0.0004,
5424
+ "reward": 0.2946428656578064,
5425
+ "reward_std": 0.2812837064266205,
5426
+ "rewards/answer_format_reward_2": 0.0,
5427
+ "rewards/common_format_reward": 0.2946428656578064,
5428
+ "rewards/doors_consistency_reward_2": 0.0,
5429
+ "rewards/geometry_consistency_reward_2": 0.0,
5430
+ "rewards/prompt_consistency_reward_2": 0.0,
5431
+ "rewards/walls_orthogonality_reward_2": 0.0,
5432
+ "step": 319
5433
+ },
5434
+ {
5435
+ "completion_length": 1445.761962890625,
5436
+ "epoch": 0.2586903799514956,
5437
+ "grad_norm": 0.017592772841453552,
5438
+ "kl": 0.009795819409191608,
5439
+ "learning_rate": 5.930476960388035e-06,
5440
+ "loss": 0.0004,
5441
+ "reward": 0.3273809552192688,
5442
+ "reward_std": 0.2394365668296814,
5443
+ "rewards/answer_format_reward_2": 0.0,
5444
+ "rewards/common_format_reward": 0.3273809552192688,
5445
+ "rewards/doors_consistency_reward_2": 0.0,
5446
+ "rewards/geometry_consistency_reward_2": 0.0,
5447
+ "rewards/prompt_consistency_reward_2": 0.0,
5448
+ "rewards/walls_orthogonality_reward_2": 0.0,
5449
+ "step": 320
5450
+ },
5451
+ {
5452
+ "completion_length": 1475.09521484375,
5453
+ "epoch": 0.25949878738884397,
5454
+ "grad_norm": 0.024020789191126823,
5455
+ "kl": 0.007067197933793068,
5456
+ "learning_rate": 5.924009700889248e-06,
5457
+ "loss": 0.0003,
5458
+ "reward": 0.3154762089252472,
5459
+ "reward_std": 0.27380135655403137,
5460
+ "rewards/answer_format_reward_2": 0.0,
5461
+ "rewards/common_format_reward": 0.3154762089252472,
5462
+ "rewards/doors_consistency_reward_2": 0.0,
5463
+ "rewards/geometry_consistency_reward_2": 0.0,
5464
+ "rewards/prompt_consistency_reward_2": 0.0,
5465
+ "rewards/walls_orthogonality_reward_2": 0.0,
5466
+ "step": 321
5467
+ },
5468
+ {
5469
+ "completion_length": 1418.3333740234375,
5470
+ "epoch": 0.2603071948261924,
5471
+ "grad_norm": 0.016318559646606445,
5472
+ "kl": 0.013767200522124767,
5473
+ "learning_rate": 5.917542441390461e-06,
5474
+ "loss": 0.0006,
5475
+ "reward": 0.25,
5476
+ "reward_std": 0.23760852217674255,
5477
+ "rewards/answer_format_reward_2": 0.0,
5478
+ "rewards/common_format_reward": 0.25,
5479
+ "rewards/doors_consistency_reward_2": 0.0,
5480
+ "rewards/geometry_consistency_reward_2": 0.0,
5481
+ "rewards/prompt_consistency_reward_2": 0.0,
5482
+ "rewards/walls_orthogonality_reward_2": 0.0,
5483
+ "step": 322
5484
+ },
5485
+ {
5486
+ "completion_length": 1212.09521484375,
5487
+ "epoch": 0.2611156022635408,
5488
+ "grad_norm": 0.02755824476480484,
5489
+ "kl": 0.000511774851474911,
5490
+ "learning_rate": 5.911075181891673e-06,
5491
+ "loss": 0.0,
5492
+ "reward": 0.3214285671710968,
5493
+ "reward_std": 0.24325017631053925,
5494
+ "rewards/answer_format_reward_2": 0.0,
5495
+ "rewards/common_format_reward": 0.3214285671710968,
5496
+ "rewards/doors_consistency_reward_2": 0.0,
5497
+ "rewards/geometry_consistency_reward_2": 0.0,
5498
+ "rewards/prompt_consistency_reward_2": 0.0,
5499
+ "rewards/walls_orthogonality_reward_2": 0.0,
5500
+ "step": 323
5501
+ },
5502
+ {
5503
+ "completion_length": 1395.952392578125,
5504
+ "epoch": 0.26192400970088925,
5505
+ "grad_norm": 0.023585868999361992,
5506
+ "kl": 0.004406272899359465,
5507
+ "learning_rate": 5.904607922392886e-06,
5508
+ "loss": 0.0002,
5509
+ "reward": 0.3035714328289032,
5510
+ "reward_std": 0.25467973947525024,
5511
+ "rewards/answer_format_reward_2": 0.0,
5512
+ "rewards/common_format_reward": 0.3035714328289032,
5513
+ "rewards/doors_consistency_reward_2": 0.0,
5514
+ "rewards/geometry_consistency_reward_2": 0.0,
5515
+ "rewards/prompt_consistency_reward_2": 0.0,
5516
+ "rewards/walls_orthogonality_reward_2": 0.0,
5517
+ "step": 324
5518
+ },
5519
+ {
5520
+ "completion_length": 1261.666748046875,
5521
+ "epoch": 0.2627324171382377,
5522
+ "grad_norm": 0.01840096525847912,
5523
+ "kl": 0.0038345877546817064,
5524
+ "learning_rate": 5.898140662894098e-06,
5525
+ "loss": 0.0002,
5526
+ "reward": 0.29023367166519165,
5527
+ "reward_std": 0.34168222546577454,
5528
+ "rewards/answer_format_reward_2": 0.02380952425301075,
5529
+ "rewards/common_format_reward": 0.2470238208770752,
5530
+ "rewards/doors_consistency_reward_2": 0.0,
5531
+ "rewards/geometry_consistency_reward_2": 0.0,
5532
+ "rewards/prompt_consistency_reward_2": 0.0,
5533
+ "rewards/walls_orthogonality_reward_2": 0.019400352612137794,
5534
+ "step": 325
5535
+ },
5536
+ {
5537
+ "completion_length": 1238.90478515625,
5538
+ "epoch": 0.2635408245755861,
5539
+ "grad_norm": 0.04081631824374199,
5540
+ "kl": 0.009143279865384102,
5541
+ "learning_rate": 5.89167340339531e-06,
5542
+ "loss": 0.0004,
5543
+ "reward": 0.2440476268529892,
5544
+ "reward_std": 0.19615037739276886,
5545
+ "rewards/answer_format_reward_2": 0.0,
5546
+ "rewards/common_format_reward": 0.2440476268529892,
5547
+ "rewards/doors_consistency_reward_2": 0.0,
5548
+ "rewards/geometry_consistency_reward_2": 0.0,
5549
+ "rewards/prompt_consistency_reward_2": 0.0,
5550
+ "rewards/walls_orthogonality_reward_2": 0.0,
5551
+ "step": 326
5552
+ },
5553
+ {
5554
+ "completion_length": 1241.452392578125,
5555
+ "epoch": 0.26434923201293453,
5556
+ "grad_norm": 0.024989062920212746,
5557
+ "kl": 0.02819332852959633,
5558
+ "learning_rate": 5.885206143896524e-06,
5559
+ "loss": 0.0011,
5560
+ "reward": 0.2976190447807312,
5561
+ "reward_std": 0.2215328961610794,
5562
+ "rewards/answer_format_reward_2": 0.0,
5563
+ "rewards/common_format_reward": 0.2976190447807312,
5564
+ "rewards/doors_consistency_reward_2": 0.0,
5565
+ "rewards/geometry_consistency_reward_2": 0.0,
5566
+ "rewards/prompt_consistency_reward_2": 0.0,
5567
+ "rewards/walls_orthogonality_reward_2": 0.0,
5568
+ "step": 327
5569
+ },
5570
+ {
5571
+ "completion_length": 1322.09521484375,
5572
+ "epoch": 0.2651576394502829,
5573
+ "grad_norm": 0.04035574197769165,
5574
+ "kl": 0.06031531095504761,
5575
+ "learning_rate": 5.878738884397736e-06,
5576
+ "loss": 0.0024,
5577
+ "reward": 0.3085317313671112,
5578
+ "reward_std": 0.3752860426902771,
5579
+ "rewards/answer_format_reward_2": 0.02380952425301075,
5580
+ "rewards/common_format_reward": 0.2678571343421936,
5581
+ "rewards/doors_consistency_reward_2": 0.0,
5582
+ "rewards/geometry_consistency_reward_2": 0.0,
5583
+ "rewards/prompt_consistency_reward_2": 0.0,
5584
+ "rewards/walls_orthogonality_reward_2": 0.0168650783598423,
5585
+ "step": 328
5586
+ },
5587
+ {
5588
+ "completion_length": 1438.0,
5589
+ "epoch": 0.26596604688763137,
5590
+ "grad_norm": 0.041567374020814896,
5591
+ "kl": 0.0012564590433612466,
5592
+ "learning_rate": 5.8722716248989485e-06,
5593
+ "loss": 0.0001,
5594
+ "reward": 0.3363095223903656,
5595
+ "reward_std": 0.23071452975273132,
5596
+ "rewards/answer_format_reward_2": 0.0,
5597
+ "rewards/common_format_reward": 0.3363095223903656,
5598
+ "rewards/doors_consistency_reward_2": 0.0,
5599
+ "rewards/geometry_consistency_reward_2": 0.0,
5600
+ "rewards/prompt_consistency_reward_2": 0.0,
5601
+ "rewards/walls_orthogonality_reward_2": 0.0,
5602
+ "step": 329
5603
+ },
5604
+ {
5605
+ "completion_length": 1211.857177734375,
5606
+ "epoch": 0.2667744543249798,
5607
+ "grad_norm": 0.023597463965415955,
5608
+ "kl": 0.0003930572129320353,
5609
+ "learning_rate": 5.8658043654001615e-06,
5610
+ "loss": 0.0,
5611
+ "reward": 0.2678571343421936,
5612
+ "reward_std": 0.2533595860004425,
5613
+ "rewards/answer_format_reward_2": 0.0,
5614
+ "rewards/common_format_reward": 0.2678571343421936,
5615
+ "rewards/doors_consistency_reward_2": 0.0,
5616
+ "rewards/geometry_consistency_reward_2": 0.0,
5617
+ "rewards/prompt_consistency_reward_2": 0.0,
5618
+ "rewards/walls_orthogonality_reward_2": 0.0,
5619
+ "step": 330
5620
+ },
5621
+ {
5622
+ "completion_length": 1379.7857666015625,
5623
+ "epoch": 0.2675828617623282,
5624
+ "grad_norm": 0.056616153568029404,
5625
+ "kl": 0.02393675595521927,
5626
+ "learning_rate": 5.859337105901374e-06,
5627
+ "loss": 0.001,
5628
+ "reward": 0.2857142984867096,
5629
+ "reward_std": 0.25488919019699097,
5630
+ "rewards/answer_format_reward_2": 0.0,
5631
+ "rewards/common_format_reward": 0.2857142984867096,
5632
+ "rewards/doors_consistency_reward_2": 0.0,
5633
+ "rewards/geometry_consistency_reward_2": 0.0,
5634
+ "rewards/prompt_consistency_reward_2": 0.0,
5635
+ "rewards/walls_orthogonality_reward_2": 0.0,
5636
+ "step": 331
5637
+ },
5638
+ {
5639
+ "completion_length": 1400.666748046875,
5640
+ "epoch": 0.26839126919967665,
5641
+ "grad_norm": 0.04419691488146782,
5642
+ "kl": 0.027386026456952095,
5643
+ "learning_rate": 5.852869846402587e-06,
5644
+ "loss": 0.0011,
5645
+ "reward": 0.3660714328289032,
5646
+ "reward_std": 0.2553485929965973,
5647
+ "rewards/answer_format_reward_2": 0.0,
5648
+ "rewards/common_format_reward": 0.3660714328289032,
5649
+ "rewards/doors_consistency_reward_2": 0.0,
5650
+ "rewards/geometry_consistency_reward_2": 0.0,
5651
+ "rewards/prompt_consistency_reward_2": 0.0,
5652
+ "rewards/walls_orthogonality_reward_2": 0.0,
5653
+ "step": 332
5654
+ },
5655
+ {
5656
+ "completion_length": 1202.4285888671875,
5657
+ "epoch": 0.26919967663702504,
5658
+ "grad_norm": 0.01960425078868866,
5659
+ "kl": 0.00044838967733085155,
5660
+ "learning_rate": 5.8464025869038e-06,
5661
+ "loss": 0.0,
5662
+ "reward": 0.324404776096344,
5663
+ "reward_std": 0.28681516647338867,
5664
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5665
+ "rewards/common_format_reward": 0.3125,
5666
+ "rewards/doors_consistency_reward_2": 0.0,
5667
+ "rewards/geometry_consistency_reward_2": 0.0,
5668
+ "rewards/prompt_consistency_reward_2": 0.0,
5669
+ "rewards/walls_orthogonality_reward_2": 0.0,
5670
+ "step": 333
5671
+ },
5672
+ {
5673
+ "completion_length": 1462.3095703125,
5674
+ "epoch": 0.2700080840743735,
5675
+ "grad_norm": 0.045769017189741135,
5676
+ "kl": 0.02239864319562912,
5677
+ "learning_rate": 5.839935327405012e-06,
5678
+ "loss": 0.0009,
5679
+ "reward": 0.3273809552192688,
5680
+ "reward_std": 0.23631471395492554,
5681
+ "rewards/answer_format_reward_2": 0.0,
5682
+ "rewards/common_format_reward": 0.3273809552192688,
5683
+ "rewards/doors_consistency_reward_2": 0.0,
5684
+ "rewards/geometry_consistency_reward_2": 0.0,
5685
+ "rewards/prompt_consistency_reward_2": 0.0,
5686
+ "rewards/walls_orthogonality_reward_2": 0.0,
5687
+ "step": 334
5688
+ },
5689
+ {
5690
+ "completion_length": 1103.357177734375,
5691
+ "epoch": 0.27081649151172194,
5692
+ "grad_norm": 0.057885535061359406,
5693
+ "kl": 0.0005538546247407794,
5694
+ "learning_rate": 5.833468067906224e-06,
5695
+ "loss": 0.0,
5696
+ "reward": 0.3005952537059784,
5697
+ "reward_std": 0.22661477327346802,
5698
+ "rewards/answer_format_reward_2": 0.0,
5699
+ "rewards/common_format_reward": 0.3005952537059784,
5700
+ "rewards/doors_consistency_reward_2": 0.0,
5701
+ "rewards/geometry_consistency_reward_2": 0.0,
5702
+ "rewards/prompt_consistency_reward_2": 0.0,
5703
+ "rewards/walls_orthogonality_reward_2": 0.0,
5704
+ "step": 335
5705
+ },
5706
+ {
5707
+ "completion_length": 1496.0,
5708
+ "epoch": 0.2716248989490703,
5709
+ "grad_norm": 0.05171234533190727,
5710
+ "kl": 0.028659667819738388,
5711
+ "learning_rate": 5.827000808407437e-06,
5712
+ "loss": 0.0011,
5713
+ "reward": 0.369047611951828,
5714
+ "reward_std": 0.2855287790298462,
5715
+ "rewards/answer_format_reward_2": 0.0,
5716
+ "rewards/common_format_reward": 0.369047611951828,
5717
+ "rewards/doors_consistency_reward_2": 0.0,
5718
+ "rewards/geometry_consistency_reward_2": 0.0,
5719
+ "rewards/prompt_consistency_reward_2": 0.0,
5720
+ "rewards/walls_orthogonality_reward_2": 0.0,
5721
+ "step": 336
5722
+ },
5723
+ {
5724
+ "completion_length": 1343.666748046875,
5725
+ "epoch": 0.27243330638641877,
5726
+ "grad_norm": 0.019201209768652916,
5727
+ "kl": 0.0077083176001906395,
5728
+ "learning_rate": 5.82053354890865e-06,
5729
+ "loss": 0.0003,
5730
+ "reward": 0.3571428656578064,
5731
+ "reward_std": 0.2777146100997925,
5732
+ "rewards/answer_format_reward_2": 0.0,
5733
+ "rewards/common_format_reward": 0.3571428656578064,
5734
+ "rewards/doors_consistency_reward_2": 0.0,
5735
+ "rewards/geometry_consistency_reward_2": 0.0,
5736
+ "rewards/prompt_consistency_reward_2": 0.0,
5737
+ "rewards/walls_orthogonality_reward_2": 0.0,
5738
+ "step": 337
5739
+ },
5740
+ {
5741
+ "completion_length": 1425.7381591796875,
5742
+ "epoch": 0.27324171382376716,
5743
+ "grad_norm": 0.024142196401953697,
5744
+ "kl": 0.00800237338989973,
5745
+ "learning_rate": 5.814066289409862e-06,
5746
+ "loss": 0.0003,
5747
+ "reward": 0.3422619104385376,
5748
+ "reward_std": 0.2588088810443878,
5749
+ "rewards/answer_format_reward_2": 0.0,
5750
+ "rewards/common_format_reward": 0.3422619104385376,
5751
+ "rewards/doors_consistency_reward_2": 0.0,
5752
+ "rewards/geometry_consistency_reward_2": 0.0,
5753
+ "rewards/prompt_consistency_reward_2": 0.0,
5754
+ "rewards/walls_orthogonality_reward_2": 0.0,
5755
+ "step": 338
5756
+ },
5757
+ {
5758
+ "completion_length": 1209.8809814453125,
5759
+ "epoch": 0.2740501212611156,
5760
+ "grad_norm": 0.020528865978121758,
5761
+ "kl": 0.0005083674332126975,
5762
+ "learning_rate": 5.807599029911075e-06,
5763
+ "loss": 0.0,
5764
+ "reward": 0.2529762089252472,
5765
+ "reward_std": 0.23409408330917358,
5766
+ "rewards/answer_format_reward_2": 0.0,
5767
+ "rewards/common_format_reward": 0.2529762089252472,
5768
+ "rewards/doors_consistency_reward_2": 0.0,
5769
+ "rewards/geometry_consistency_reward_2": 0.0,
5770
+ "rewards/prompt_consistency_reward_2": 0.0,
5771
+ "rewards/walls_orthogonality_reward_2": 0.0,
5772
+ "step": 339
5773
+ },
5774
+ {
5775
+ "completion_length": 1282.2381591796875,
5776
+ "epoch": 0.274858528698464,
5777
+ "grad_norm": 0.02704017236828804,
5778
+ "kl": 0.009719254449009895,
5779
+ "learning_rate": 5.8011317704122874e-06,
5780
+ "loss": 0.0004,
5781
+ "reward": 0.442460298538208,
5782
+ "reward_std": 0.3546527028083801,
5783
+ "rewards/answer_format_reward_2": 0.02380952425301075,
5784
+ "rewards/common_format_reward": 0.3988095223903656,
5785
+ "rewards/doors_consistency_reward_2": 0.0,
5786
+ "rewards/geometry_consistency_reward_2": 0.0,
5787
+ "rewards/prompt_consistency_reward_2": 0.0,
5788
+ "rewards/walls_orthogonality_reward_2": 0.01984127052128315,
5789
+ "step": 340
5790
+ },
5791
+ {
5792
+ "completion_length": 1232.761962890625,
5793
+ "epoch": 0.27566693613581245,
5794
+ "grad_norm": 0.01856859214603901,
5795
+ "kl": 0.0005721814231947064,
5796
+ "learning_rate": 5.7946645109135e-06,
5797
+ "loss": 0.0,
5798
+ "reward": 0.2738095223903656,
5799
+ "reward_std": 0.2506893277168274,
5800
+ "rewards/answer_format_reward_2": 0.0,
5801
+ "rewards/common_format_reward": 0.2738095223903656,
5802
+ "rewards/doors_consistency_reward_2": 0.0,
5803
+ "rewards/geometry_consistency_reward_2": 0.0,
5804
+ "rewards/prompt_consistency_reward_2": 0.0,
5805
+ "rewards/walls_orthogonality_reward_2": 0.0,
5806
+ "step": 341
5807
+ },
5808
+ {
5809
+ "completion_length": 1493.1429443359375,
5810
+ "epoch": 0.2764753435731609,
5811
+ "grad_norm": 0.026749614626169205,
5812
+ "kl": 0.005409681238234043,
5813
+ "learning_rate": 5.7881972514147135e-06,
5814
+ "loss": 0.0002,
5815
+ "reward": 0.2827380895614624,
5816
+ "reward_std": 0.22369587421417236,
5817
+ "rewards/answer_format_reward_2": 0.0,
5818
+ "rewards/common_format_reward": 0.2827380895614624,
5819
+ "rewards/doors_consistency_reward_2": 0.0,
5820
+ "rewards/geometry_consistency_reward_2": 0.0,
5821
+ "rewards/prompt_consistency_reward_2": 0.0,
5822
+ "rewards/walls_orthogonality_reward_2": 0.0,
5823
+ "step": 342
5824
+ },
5825
+ {
5826
+ "completion_length": 1273.452392578125,
5827
+ "epoch": 0.2772837510105093,
5828
+ "grad_norm": 0.02593490295112133,
5829
+ "kl": 0.016275504603981972,
5830
+ "learning_rate": 5.781729991915926e-06,
5831
+ "loss": 0.0007,
5832
+ "reward": 0.2976190447807312,
5833
+ "reward_std": 0.2699171006679535,
5834
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5835
+ "rewards/common_format_reward": 0.2857142984867096,
5836
+ "rewards/doors_consistency_reward_2": 0.0,
5837
+ "rewards/geometry_consistency_reward_2": 0.0,
5838
+ "rewards/prompt_consistency_reward_2": 0.0,
5839
+ "rewards/walls_orthogonality_reward_2": 0.0,
5840
+ "step": 343
5841
+ },
5842
+ {
5843
+ "completion_length": 1367.5238037109375,
5844
+ "epoch": 0.27809215844785773,
5845
+ "grad_norm": 0.026104379445314407,
5846
+ "kl": 0.0004413637798279524,
5847
+ "learning_rate": 5.775262732417138e-06,
5848
+ "loss": 0.0,
5849
+ "reward": 0.318452388048172,
5850
+ "reward_std": 0.29804766178131104,
5851
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5852
+ "rewards/common_format_reward": 0.306547611951828,
5853
+ "rewards/doors_consistency_reward_2": 0.0,
5854
+ "rewards/geometry_consistency_reward_2": 0.0,
5855
+ "rewards/prompt_consistency_reward_2": 0.0,
5856
+ "rewards/walls_orthogonality_reward_2": 0.0,
5857
+ "step": 344
5858
+ },
5859
+ {
5860
+ "completion_length": 1284.952392578125,
5861
+ "epoch": 0.2789005658852061,
5862
+ "grad_norm": 0.034858983010053635,
5863
+ "kl": 0.015232698991894722,
5864
+ "learning_rate": 5.76879547291835e-06,
5865
+ "loss": 0.0006,
5866
+ "reward": 0.324404776096344,
5867
+ "reward_std": 0.2649897336959839,
5868
+ "rewards/answer_format_reward_2": 0.0,
5869
+ "rewards/common_format_reward": 0.324404776096344,
5870
+ "rewards/doors_consistency_reward_2": 0.0,
5871
+ "rewards/geometry_consistency_reward_2": 0.0,
5872
+ "rewards/prompt_consistency_reward_2": 0.0,
5873
+ "rewards/walls_orthogonality_reward_2": 0.0,
5874
+ "step": 345
5875
+ },
5876
+ {
5877
+ "completion_length": 1303.7857666015625,
5878
+ "epoch": 0.27970897332255457,
5879
+ "grad_norm": 0.017450712621212006,
5880
+ "kl": 0.0005410314770415425,
5881
+ "learning_rate": 5.762328213419563e-06,
5882
+ "loss": 0.0,
5883
+ "reward": 0.313492089509964,
5884
+ "reward_std": 0.3222876489162445,
5885
+ "rewards/answer_format_reward_2": 0.02380952425301075,
5886
+ "rewards/common_format_reward": 0.2857142984867096,
5887
+ "rewards/doors_consistency_reward_2": 0.0,
5888
+ "rewards/geometry_consistency_reward_2": 0.0,
5889
+ "rewards/prompt_consistency_reward_2": 0.0,
5890
+ "rewards/walls_orthogonality_reward_2": 0.003968254197388887,
5891
+ "step": 346
5892
+ },
5893
+ {
5894
+ "completion_length": 1071.71435546875,
5895
+ "epoch": 0.280517380759903,
5896
+ "grad_norm": 0.02490563876926899,
5897
+ "kl": 0.02408970706164837,
5898
+ "learning_rate": 5.755860953920776e-06,
5899
+ "loss": 0.001,
5900
+ "reward": 0.379676878452301,
5901
+ "reward_std": 0.32025134563446045,
5902
+ "rewards/answer_format_reward_2": 0.02380952425301075,
5903
+ "rewards/common_format_reward": 0.3363095223903656,
5904
+ "rewards/doors_consistency_reward_2": 0.0,
5905
+ "rewards/geometry_consistency_reward_2": 0.0,
5906
+ "rewards/prompt_consistency_reward_2": 0.0,
5907
+ "rewards/walls_orthogonality_reward_2": 0.019557824358344078,
5908
+ "step": 347
5909
+ },
5910
+ {
5911
+ "completion_length": 1339.5,
5912
+ "epoch": 0.2813257881972514,
5913
+ "grad_norm": 0.021240364760160446,
5914
+ "kl": 0.016705665737390518,
5915
+ "learning_rate": 5.749393694421988e-06,
5916
+ "loss": 0.0007,
5917
+ "reward": 0.3541666865348816,
5918
+ "reward_std": 0.2655835449695587,
5919
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5920
+ "rewards/common_format_reward": 0.3422619104385376,
5921
+ "rewards/doors_consistency_reward_2": 0.0,
5922
+ "rewards/geometry_consistency_reward_2": 0.0,
5923
+ "rewards/prompt_consistency_reward_2": 0.0,
5924
+ "rewards/walls_orthogonality_reward_2": 0.0,
5925
+ "step": 348
5926
+ },
5927
+ {
5928
+ "completion_length": 1303.59521484375,
5929
+ "epoch": 0.28213419563459985,
5930
+ "grad_norm": 0.019825851544737816,
5931
+ "kl": 0.00042840460082516074,
5932
+ "learning_rate": 5.742926434923201e-06,
5933
+ "loss": 0.0,
5934
+ "reward": 0.3790532946586609,
5935
+ "reward_std": 0.44379788637161255,
5936
+ "rewards/answer_format_reward_2": 0.04444444552063942,
5937
+ "rewards/common_format_reward": 0.3125,
5938
+ "rewards/doors_consistency_reward_2": 0.0,
5939
+ "rewards/geometry_consistency_reward_2": 0.0,
5940
+ "rewards/prompt_consistency_reward_2": 0.0,
5941
+ "rewards/walls_orthogonality_reward_2": 0.02210884355008602,
5942
+ "step": 349
5943
+ },
5944
+ {
5945
+ "completion_length": 1261.2857666015625,
5946
+ "epoch": 0.28294260307194824,
5947
+ "grad_norm": 0.03553672879934311,
5948
+ "kl": 0.01145523227751255,
5949
+ "learning_rate": 5.736459175424413e-06,
5950
+ "loss": 0.0005,
5951
+ "reward": 0.3221372067928314,
5952
+ "reward_std": 0.2752075493335724,
5953
+ "rewards/answer_format_reward_2": 0.009637188166379929,
5954
+ "rewards/common_format_reward": 0.3125,
5955
+ "rewards/doors_consistency_reward_2": 0.0,
5956
+ "rewards/geometry_consistency_reward_2": 0.0,
5957
+ "rewards/prompt_consistency_reward_2": 0.0,
5958
+ "rewards/walls_orthogonality_reward_2": 0.0,
5959
+ "step": 350
5960
+ },
5961
+ {
5962
+ "completion_length": 1319.357177734375,
5963
+ "epoch": 0.2837510105092967,
5964
+ "grad_norm": 0.029814794659614563,
5965
+ "kl": 0.018817100673913956,
5966
+ "learning_rate": 5.729991915925626e-06,
5967
+ "loss": 0.0008,
5968
+ "reward": 0.4375,
5969
+ "reward_std": 0.5767967700958252,
5970
+ "rewards/answer_format_reward_2": 0.0357142873108387,
5971
+ "rewards/common_format_reward": 0.306547611951828,
5972
+ "rewards/doors_consistency_reward_2": 0.02380952425301075,
5973
+ "rewards/geometry_consistency_reward_2": 0.02380952425301075,
5974
+ "rewards/prompt_consistency_reward_2": 0.02380952425301075,
5975
+ "rewards/walls_orthogonality_reward_2": 0.02380952425301075,
5976
+ "step": 351
5977
+ },
5978
+ {
5979
+ "completion_length": 1127.857177734375,
5980
+ "epoch": 0.28455941794664513,
5981
+ "grad_norm": 0.03532763198018074,
5982
+ "kl": 0.01532988715916872,
5983
+ "learning_rate": 5.723524656426839e-06,
5984
+ "loss": 0.0006,
5985
+ "reward": 0.3005952537059784,
5986
+ "reward_std": 0.28457164764404297,
5987
+ "rewards/answer_format_reward_2": 0.011904762126505375,
5988
+ "rewards/common_format_reward": 0.2886904776096344,
5989
+ "rewards/doors_consistency_reward_2": 0.0,
5990
+ "rewards/geometry_consistency_reward_2": 0.0,
5991
+ "rewards/prompt_consistency_reward_2": 0.0,
5992
+ "rewards/walls_orthogonality_reward_2": 0.0,
5993
+ "step": 352
5994
+ },
5995
+ {
5996
+ "completion_length": 1293.5,
5997
+ "epoch": 0.2853678253839935,
5998
+ "grad_norm": 0.041730720549821854,
5999
+ "kl": 0.019338881596922874,
6000
+ "learning_rate": 5.7170573969280515e-06,
6001
+ "loss": 0.0008,
6002
+ "reward": 0.2738095223903656,
6003
+ "reward_std": 0.25630414485931396,
6004
+ "rewards/answer_format_reward_2": 0.0,
6005
+ "rewards/common_format_reward": 0.2738095223903656,
6006
+ "rewards/doors_consistency_reward_2": 0.0,
6007
+ "rewards/geometry_consistency_reward_2": 0.0,
6008
+ "rewards/prompt_consistency_reward_2": 0.0,
6009
+ "rewards/walls_orthogonality_reward_2": 0.0,
6010
+ "step": 353
6011
+ },
6012
+ {
6013
+ "completion_length": 1185.0238037109375,
6014
+ "epoch": 0.28617623282134197,
6015
+ "grad_norm": 0.02626994252204895,
6016
+ "kl": 0.0004271938814781606,
6017
+ "learning_rate": 5.710590137429264e-06,
6018
+ "loss": 0.0,
6019
+ "reward": 0.2797619104385376,
6020
+ "reward_std": 0.2707957327365875,
6021
+ "rewards/answer_format_reward_2": 0.011904762126505375,
6022
+ "rewards/common_format_reward": 0.2678571343421936,
6023
+ "rewards/doors_consistency_reward_2": 0.0,
6024
+ "rewards/geometry_consistency_reward_2": 0.0,
6025
+ "rewards/prompt_consistency_reward_2": 0.0,
6026
+ "rewards/walls_orthogonality_reward_2": 0.0,
6027
+ "step": 354
6028
+ },
6029
+ {
6030
+ "completion_length": 1415.9285888671875,
6031
+ "epoch": 0.28698464025869036,
6032
+ "grad_norm": 0.02469261735677719,
6033
+ "kl": 0.004333313088864088,
6034
+ "learning_rate": 5.704122877930477e-06,
6035
+ "loss": 0.0002,
6036
+ "reward": 0.2916666865348816,
6037
+ "reward_std": 0.2779538035392761,
6038
+ "rewards/answer_format_reward_2": 0.0,
6039
+ "rewards/common_format_reward": 0.2916666865348816,
6040
+ "rewards/doors_consistency_reward_2": 0.0,
6041
+ "rewards/geometry_consistency_reward_2": 0.0,
6042
+ "rewards/prompt_consistency_reward_2": 0.0,
6043
+ "rewards/walls_orthogonality_reward_2": 0.0,
6044
+ "step": 355
6045
+ },
6046
+ {
6047
+ "completion_length": 1332.09521484375,
6048
+ "epoch": 0.2877930476960388,
6049
+ "grad_norm": 0.023570047691464424,
6050
+ "kl": 0.015893612056970596,
6051
+ "learning_rate": 5.69765561843169e-06,
6052
+ "loss": 0.0006,
6053
+ "reward": 0.3511904776096344,
6054
+ "reward_std": 0.26069581508636475,
6055
+ "rewards/answer_format_reward_2": 0.0,
6056
+ "rewards/common_format_reward": 0.3511904776096344,
6057
+ "rewards/doors_consistency_reward_2": 0.0,
6058
+ "rewards/geometry_consistency_reward_2": 0.0,
6059
+ "rewards/prompt_consistency_reward_2": 0.0,
6060
+ "rewards/walls_orthogonality_reward_2": 0.0,
6061
+ "step": 356
6062
+ },
6063
+ {
6064
+ "completion_length": 1263.666748046875,
6065
+ "epoch": 0.28860145513338725,
6066
+ "grad_norm": 0.018288280814886093,
6067
+ "kl": 0.004468055441975594,
6068
+ "learning_rate": 5.691188358932902e-06,
6069
+ "loss": 0.0002,
6070
+ "reward": 0.3779762089252472,
6071
+ "reward_std": 0.24179227650165558,
6072
+ "rewards/answer_format_reward_2": 0.0,
6073
+ "rewards/common_format_reward": 0.3779762089252472,
6074
+ "rewards/doors_consistency_reward_2": 0.0,
6075
+ "rewards/geometry_consistency_reward_2": 0.0,
6076
+ "rewards/prompt_consistency_reward_2": 0.0,
6077
+ "rewards/walls_orthogonality_reward_2": 0.0,
6078
+ "step": 357
6079
+ },
6080
+ {
6081
+ "completion_length": 1370.09521484375,
6082
+ "epoch": 0.28940986257073564,
6083
+ "grad_norm": 0.026647189632058144,
6084
+ "kl": 0.009908330626785755,
6085
+ "learning_rate": 5.684721099434115e-06,
6086
+ "loss": 0.0004,
6087
+ "reward": 0.3095238208770752,
6088
+ "reward_std": 0.23659761250019073,
6089
+ "rewards/answer_format_reward_2": 0.0,
6090
+ "rewards/common_format_reward": 0.3095238208770752,
6091
+ "rewards/doors_consistency_reward_2": 0.0,
6092
+ "rewards/geometry_consistency_reward_2": 0.0,
6093
+ "rewards/prompt_consistency_reward_2": 0.0,
6094
+ "rewards/walls_orthogonality_reward_2": 0.0,
6095
+ "step": 358
6096
+ },
6097
+ {
6098
+ "completion_length": 1217.90478515625,
6099
+ "epoch": 0.2902182700080841,
6100
+ "grad_norm": 0.016880203038454056,
6101
+ "kl": 0.00048451582551933825,
6102
+ "learning_rate": 5.678253839935327e-06,
6103
+ "loss": 0.0,
6104
+ "reward": 0.3154762089252472,
6105
+ "reward_std": 0.23783370852470398,
6106
+ "rewards/answer_format_reward_2": 0.0,
6107
+ "rewards/common_format_reward": 0.3154762089252472,
6108
+ "rewards/doors_consistency_reward_2": 0.0,
6109
+ "rewards/geometry_consistency_reward_2": 0.0,
6110
+ "rewards/prompt_consistency_reward_2": 0.0,
6111
+ "rewards/walls_orthogonality_reward_2": 0.0,
6112
+ "step": 359
6113
+ },
6114
+ {
6115
+ "completion_length": 1302.6905517578125,
6116
+ "epoch": 0.2910266774454325,
6117
+ "grad_norm": 0.06579333543777466,
6118
+ "kl": 0.016222143545746803,
6119
+ "learning_rate": 5.671786580436539e-06,
6120
+ "loss": 0.0006,
6121
+ "reward": 0.3214285671710968,
6122
+ "reward_std": 0.2569543421268463,
6123
+ "rewards/answer_format_reward_2": 0.0,
6124
+ "rewards/common_format_reward": 0.3214285671710968,
6125
+ "rewards/doors_consistency_reward_2": 0.0,
6126
+ "rewards/geometry_consistency_reward_2": 0.0,
6127
+ "rewards/prompt_consistency_reward_2": 0.0,
6128
+ "rewards/walls_orthogonality_reward_2": 0.0,
6129
+ "step": 360
6130
+ },
6131
+ {
6132
+ "completion_length": 1430.5714111328125,
6133
+ "epoch": 0.2918350848827809,
6134
+ "grad_norm": 0.02659917064011097,
6135
+ "kl": 0.014950796961784363,
6136
+ "learning_rate": 5.665319320937753e-06,
6137
+ "loss": 0.0006,
6138
+ "reward": 0.3333333432674408,
6139
+ "reward_std": 0.2614738345146179,
6140
+ "rewards/answer_format_reward_2": 0.0,
6141
+ "rewards/common_format_reward": 0.3333333432674408,
6142
+ "rewards/doors_consistency_reward_2": 0.0,
6143
+ "rewards/geometry_consistency_reward_2": 0.0,
6144
+ "rewards/prompt_consistency_reward_2": 0.0,
6145
+ "rewards/walls_orthogonality_reward_2": 0.0,
6146
+ "step": 361
6147
+ },
6148
+ {
6149
+ "completion_length": 1377.952392578125,
6150
+ "epoch": 0.2926434923201294,
6151
+ "grad_norm": 0.026068588718771935,
6152
+ "kl": 0.013599374331533909,
6153
+ "learning_rate": 5.658852061438965e-06,
6154
+ "loss": 0.0005,
6155
+ "reward": 0.306547611951828,
6156
+ "reward_std": 0.26628854870796204,
6157
+ "rewards/answer_format_reward_2": 0.0,
6158
+ "rewards/common_format_reward": 0.306547611951828,
6159
+ "rewards/doors_consistency_reward_2": 0.0,
6160
+ "rewards/geometry_consistency_reward_2": 0.0,
6161
+ "rewards/prompt_consistency_reward_2": 0.0,
6162
+ "rewards/walls_orthogonality_reward_2": 0.0,
6163
+ "step": 362
6164
+ },
6165
+ {
6166
+ "completion_length": 1482.0714111328125,
6167
+ "epoch": 0.29345189975747776,
6168
+ "grad_norm": 0.025500476360321045,
6169
+ "kl": 0.0111153619363904,
6170
+ "learning_rate": 5.6523848019401775e-06,
6171
+ "loss": 0.0004,
6172
+ "reward": 0.3720238208770752,
6173
+ "reward_std": 0.21637828648090363,
6174
+ "rewards/answer_format_reward_2": 0.0,
6175
+ "rewards/common_format_reward": 0.3720238208770752,
6176
+ "rewards/doors_consistency_reward_2": 0.0,
6177
+ "rewards/geometry_consistency_reward_2": 0.0,
6178
+ "rewards/prompt_consistency_reward_2": 0.0,
6179
+ "rewards/walls_orthogonality_reward_2": 0.0,
6180
+ "step": 363
6181
+ },
6182
+ {
6183
+ "completion_length": 1163.5714111328125,
6184
+ "epoch": 0.2942603071948262,
6185
+ "grad_norm": 0.02546641230583191,
6186
+ "kl": 0.0005430954042822123,
6187
+ "learning_rate": 5.6459175424413905e-06,
6188
+ "loss": 0.0,
6189
+ "reward": 0.2767857313156128,
6190
+ "reward_std": 0.25442835688591003,
6191
+ "rewards/answer_format_reward_2": 0.0,
6192
+ "rewards/common_format_reward": 0.2767857313156128,
6193
+ "rewards/doors_consistency_reward_2": 0.0,
6194
+ "rewards/geometry_consistency_reward_2": 0.0,
6195
+ "rewards/prompt_consistency_reward_2": 0.0,
6196
+ "rewards/walls_orthogonality_reward_2": 0.0,
6197
+ "step": 364
6198
+ },
6199
+ {
6200
+ "completion_length": 1226.7381591796875,
6201
+ "epoch": 0.2950687146321746,
6202
+ "grad_norm": 0.04306091368198395,
6203
+ "kl": 0.0006244779797270894,
6204
+ "learning_rate": 5.639450282942603e-06,
6205
+ "loss": 0.0,
6206
+ "reward": 0.3482142984867096,
6207
+ "reward_std": 0.27509331703186035,
6208
+ "rewards/answer_format_reward_2": 0.011904762126505375,
6209
+ "rewards/common_format_reward": 0.3363095223903656,
6210
+ "rewards/doors_consistency_reward_2": 0.0,
6211
+ "rewards/geometry_consistency_reward_2": 0.0,
6212
+ "rewards/prompt_consistency_reward_2": 0.0,
6213
+ "rewards/walls_orthogonality_reward_2": 0.0,
6214
+ "step": 365
6215
+ },
6216
+ {
6217
+ "completion_length": 1086.1190185546875,
6218
+ "epoch": 0.29587712206952305,
6219
+ "grad_norm": 0.019724708050489426,
6220
+ "kl": 0.0022440270986407995,
6221
+ "learning_rate": 5.632983023443816e-06,
6222
+ "loss": 0.0001,
6223
+ "reward": 0.3392857313156128,
6224
+ "reward_std": 0.27470487356185913,
6225
+ "rewards/answer_format_reward_2": 0.0,
6226
+ "rewards/common_format_reward": 0.3392857313156128,
6227
+ "rewards/doors_consistency_reward_2": 0.0,
6228
+ "rewards/geometry_consistency_reward_2": 0.0,
6229
+ "rewards/prompt_consistency_reward_2": 0.0,
6230
+ "rewards/walls_orthogonality_reward_2": 0.0,
6231
+ "step": 366
6232
+ },
6233
+ {
6234
+ "completion_length": 1142.952392578125,
6235
+ "epoch": 0.29668552950687144,
6236
+ "grad_norm": 0.019828204065561295,
6237
+ "kl": 0.010727436281740665,
6238
+ "learning_rate": 5.626515763945028e-06,
6239
+ "loss": 0.0004,
6240
+ "reward": 0.2708333432674408,
6241
+ "reward_std": 0.2589353024959564,
6242
+ "rewards/answer_format_reward_2": 0.0,
6243
+ "rewards/common_format_reward": 0.2708333432674408,
6244
+ "rewards/doors_consistency_reward_2": 0.0,
6245
+ "rewards/geometry_consistency_reward_2": 0.0,
6246
+ "rewards/prompt_consistency_reward_2": 0.0,
6247
+ "rewards/walls_orthogonality_reward_2": 0.0,
6248
+ "step": 367
6249
+ },
6250
+ {
6251
+ "completion_length": 1525.357177734375,
6252
+ "epoch": 0.2974939369442199,
6253
+ "grad_norm": 0.023398665711283684,
6254
+ "kl": 0.017353033646941185,
6255
+ "learning_rate": 5.620048504446241e-06,
6256
+ "loss": 0.0007,
6257
+ "reward": 0.36446887254714966,
6258
+ "reward_std": 0.35044190287590027,
6259
+ "rewards/answer_format_reward_2": 0.02380952425301075,
6260
+ "rewards/common_format_reward": 0.3214285671710968,
6261
+ "rewards/doors_consistency_reward_2": 0.0,
6262
+ "rewards/geometry_consistency_reward_2": 0.0,
6263
+ "rewards/prompt_consistency_reward_2": 0.0,
6264
+ "rewards/walls_orthogonality_reward_2": 0.01923076994717121,
6265
+ "step": 368
6266
+ },
6267
+ {
6268
+ "completion_length": 1356.21435546875,
6269
+ "epoch": 0.29830234438156833,
6270
+ "grad_norm": 0.026468202471733093,
6271
+ "kl": 0.02186056226491928,
6272
+ "learning_rate": 5.613581244947453e-06,
6273
+ "loss": 0.0009,
6274
+ "reward": 0.3779762089252472,
6275
+ "reward_std": 0.27012985944747925,
6276
+ "rewards/answer_format_reward_2": 0.011904762126505375,
6277
+ "rewards/common_format_reward": 0.3660714328289032,
6278
+ "rewards/doors_consistency_reward_2": 0.0,
6279
+ "rewards/geometry_consistency_reward_2": 0.0,
6280
+ "rewards/prompt_consistency_reward_2": 0.0,
6281
+ "rewards/walls_orthogonality_reward_2": 0.0,
6282
+ "step": 369
6283
+ },
6284
+ {
6285
+ "completion_length": 1432.7381591796875,
6286
+ "epoch": 0.2991107518189167,
6287
+ "grad_norm": 0.01999063789844513,
6288
+ "kl": 0.008162550628185272,
6289
+ "learning_rate": 5.607113985448665e-06,
6290
+ "loss": 0.0003,
6291
+ "reward": 0.369047611951828,
6292
+ "reward_std": 0.28184574842453003,
6293
+ "rewards/answer_format_reward_2": 0.0,
6294
+ "rewards/common_format_reward": 0.369047611951828,
6295
+ "rewards/doors_consistency_reward_2": 0.0,
6296
+ "rewards/geometry_consistency_reward_2": 0.0,
6297
+ "rewards/prompt_consistency_reward_2": 0.0,
6298
+ "rewards/walls_orthogonality_reward_2": 0.0,
6299
+ "step": 370
6300
+ },
6301
+ {
6302
+ "completion_length": 1271.4761962890625,
6303
+ "epoch": 0.29991915925626517,
6304
+ "grad_norm": 0.017468102276325226,
6305
+ "kl": 0.00043145468225702643,
6306
+ "learning_rate": 5.600646725949879e-06,
6307
+ "loss": 0.0,
6308
+ "reward": 0.2827380895614624,
6309
+ "reward_std": 0.2799534201622009,
6310
+ "rewards/answer_format_reward_2": 0.0,
6311
+ "rewards/common_format_reward": 0.2827380895614624,
6312
+ "rewards/doors_consistency_reward_2": 0.0,
6313
+ "rewards/geometry_consistency_reward_2": 0.0,
6314
+ "rewards/prompt_consistency_reward_2": 0.0,
6315
+ "rewards/walls_orthogonality_reward_2": 0.0,
6316
+ "step": 371
6317
+ },
6318
+ {
6319
+ "completion_length": 1280.4761962890625,
6320
+ "epoch": 0.30072756669361356,
6321
+ "grad_norm": 0.03255929425358772,
6322
+ "kl": 0.01609964482486248,
6323
+ "learning_rate": 5.594179466451091e-06,
6324
+ "loss": 0.0006,
6325
+ "reward": 0.306547611951828,
6326
+ "reward_std": 0.24303922057151794,
6327
+ "rewards/answer_format_reward_2": 0.0,
6328
+ "rewards/common_format_reward": 0.306547611951828,
6329
+ "rewards/doors_consistency_reward_2": 0.0,
6330
+ "rewards/geometry_consistency_reward_2": 0.0,
6331
+ "rewards/prompt_consistency_reward_2": 0.0,
6332
+ "rewards/walls_orthogonality_reward_2": 0.0,
6333
+ "step": 372
6334
+ },
6335
+ {
6336
+ "completion_length": 1195.857177734375,
6337
+ "epoch": 0.301535974130962,
6338
+ "grad_norm": 0.07637374103069305,
6339
+ "kl": 0.014363820664584637,
6340
+ "learning_rate": 5.587712206952303e-06,
6341
+ "loss": 0.0006,
6342
+ "reward": 0.4226190447807312,
6343
+ "reward_std": 0.2839301526546478,
6344
+ "rewards/answer_format_reward_2": 0.0,
6345
+ "rewards/common_format_reward": 0.4226190447807312,
6346
+ "rewards/doors_consistency_reward_2": 0.0,
6347
+ "rewards/geometry_consistency_reward_2": 0.0,
6348
+ "rewards/prompt_consistency_reward_2": 0.0,
6349
+ "rewards/walls_orthogonality_reward_2": 0.0,
6350
+ "step": 373
6351
+ },
6352
+ {
6353
+ "completion_length": 1310.1190185546875,
6354
+ "epoch": 0.30234438156831045,
6355
+ "grad_norm": 0.020058052614331245,
6356
+ "kl": 0.0005297682364471257,
6357
+ "learning_rate": 5.581244947453516e-06,
6358
+ "loss": 0.0,
6359
+ "reward": 0.375,
6360
+ "reward_std": 0.25024354457855225,
6361
+ "rewards/answer_format_reward_2": 0.0,
6362
+ "rewards/common_format_reward": 0.375,
6363
+ "rewards/doors_consistency_reward_2": 0.0,
6364
+ "rewards/geometry_consistency_reward_2": 0.0,
6365
+ "rewards/prompt_consistency_reward_2": 0.0,
6366
+ "rewards/walls_orthogonality_reward_2": 0.0,
6367
+ "step": 374
6368
+ },
6369
+ {
6370
+ "completion_length": 1246.8095703125,
6371
+ "epoch": 0.30315278900565884,
6372
+ "grad_norm": 0.028393249958753586,
6373
+ "kl": 0.011599484831094742,
6374
+ "learning_rate": 5.5747776879547286e-06,
6375
+ "loss": 0.0005,
6376
+ "reward": 0.3273809552192688,
6377
+ "reward_std": 0.1949739009141922,
6378
+ "rewards/answer_format_reward_2": 0.0,
6379
+ "rewards/common_format_reward": 0.3273809552192688,
6380
+ "rewards/doors_consistency_reward_2": 0.0,
6381
+ "rewards/geometry_consistency_reward_2": 0.0,
6382
+ "rewards/prompt_consistency_reward_2": 0.0,
6383
+ "rewards/walls_orthogonality_reward_2": 0.0,
6384
+ "step": 375
6385
+ },
6386
+ {
6387
+ "completion_length": 1464.452392578125,
6388
+ "epoch": 0.3039611964430073,
6389
+ "grad_norm": 0.028903203085064888,
6390
+ "kl": 0.015113821253180504,
6391
+ "learning_rate": 5.5683104284559416e-06,
6392
+ "loss": 0.0006,
6393
+ "reward": 0.41921770572662354,
6394
+ "reward_std": 0.3666028380393982,
6395
+ "rewards/answer_format_reward_2": 0.02380952425301075,
6396
+ "rewards/common_format_reward": 0.375,
6397
+ "rewards/doors_consistency_reward_2": 0.0,
6398
+ "rewards/geometry_consistency_reward_2": 0.0,
6399
+ "rewards/prompt_consistency_reward_2": 0.0,
6400
+ "rewards/walls_orthogonality_reward_2": 0.020408164709806442,
6401
+ "step": 376
6402
+ },
6403
+ {
6404
+ "completion_length": 1406.6190185546875,
6405
+ "epoch": 0.3047696038803557,
6406
+ "grad_norm": 0.03880228474736214,
6407
+ "kl": 0.01875283755362034,
6408
+ "learning_rate": 5.561843168957155e-06,
6409
+ "loss": 0.0008,
6410
+ "reward": 0.2351190447807312,
6411
+ "reward_std": 0.2147950530052185,
6412
+ "rewards/answer_format_reward_2": 0.0,
6413
+ "rewards/common_format_reward": 0.2351190447807312,
6414
+ "rewards/doors_consistency_reward_2": 0.0,
6415
+ "rewards/geometry_consistency_reward_2": 0.0,
6416
+ "rewards/prompt_consistency_reward_2": 0.0,
6417
+ "rewards/walls_orthogonality_reward_2": 0.0,
6418
+ "step": 377
6419
+ },
6420
+ {
6421
+ "completion_length": 1347.1190185546875,
6422
+ "epoch": 0.3055780113177041,
6423
+ "grad_norm": 0.0395733080804348,
6424
+ "kl": 0.013636160641908646,
6425
+ "learning_rate": 5.555375909458367e-06,
6426
+ "loss": 0.0005,
6427
+ "reward": 0.318452388048172,
6428
+ "reward_std": 0.32642748951911926,
6429
+ "rewards/answer_format_reward_2": 0.011904762126505375,
6430
+ "rewards/common_format_reward": 0.306547611951828,
6431
+ "rewards/doors_consistency_reward_2": 0.0,
6432
+ "rewards/geometry_consistency_reward_2": 0.0,
6433
+ "rewards/prompt_consistency_reward_2": 0.0,
6434
+ "rewards/walls_orthogonality_reward_2": 0.0,
6435
+ "step": 378
6436
+ },
6437
+ {
6438
+ "completion_length": 1279.452392578125,
6439
+ "epoch": 0.30638641875505257,
6440
+ "grad_norm": 0.032380059361457825,
6441
+ "kl": 0.004963691346347332,
6442
+ "learning_rate": 5.548908649959579e-06,
6443
+ "loss": 0.0002,
6444
+ "reward": 0.3125,
6445
+ "reward_std": 0.2651495337486267,
6446
+ "rewards/answer_format_reward_2": 0.0,
6447
+ "rewards/common_format_reward": 0.3125,
6448
+ "rewards/doors_consistency_reward_2": 0.0,
6449
+ "rewards/geometry_consistency_reward_2": 0.0,
6450
+ "rewards/prompt_consistency_reward_2": 0.0,
6451
+ "rewards/walls_orthogonality_reward_2": 0.0,
6452
+ "step": 379
6453
+ },
6454
+ {
6455
+ "completion_length": 1589.0238037109375,
6456
+ "epoch": 0.30719482619240096,
6457
+ "grad_norm": 0.019695742055773735,
6458
+ "kl": 0.011754564009606838,
6459
+ "learning_rate": 5.542441390460792e-06,
6460
+ "loss": 0.0005,
6461
+ "reward": 0.2976190447807312,
6462
+ "reward_std": 0.2735596299171448,
6463
+ "rewards/answer_format_reward_2": 0.0,
6464
+ "rewards/common_format_reward": 0.2976190447807312,
6465
+ "rewards/doors_consistency_reward_2": 0.0,
6466
+ "rewards/geometry_consistency_reward_2": 0.0,
6467
+ "rewards/prompt_consistency_reward_2": 0.0,
6468
+ "rewards/walls_orthogonality_reward_2": 0.0,
6469
+ "step": 380
6470
+ },
6471
+ {
6472
+ "completion_length": 993.8095703125,
6473
+ "epoch": 0.3080032336297494,
6474
+ "grad_norm": 0.029455509036779404,
6475
+ "kl": 0.01250517275184393,
6476
+ "learning_rate": 5.535974130962005e-06,
6477
+ "loss": 0.0005,
6478
+ "reward": 0.3035714328289032,
6479
+ "reward_std": 0.281058669090271,
6480
+ "rewards/answer_format_reward_2": 0.0,
6481
+ "rewards/common_format_reward": 0.3035714328289032,
6482
+ "rewards/doors_consistency_reward_2": 0.0,
6483
+ "rewards/geometry_consistency_reward_2": 0.0,
6484
+ "rewards/prompt_consistency_reward_2": 0.0,
6485
+ "rewards/walls_orthogonality_reward_2": 0.0,
6486
+ "step": 381
6487
+ },
6488
+ {
6489
+ "completion_length": 1344.8095703125,
6490
+ "epoch": 0.3088116410670978,
6491
+ "grad_norm": 0.02273244597017765,
6492
+ "kl": 0.005033253692090511,
6493
+ "learning_rate": 5.529506871463217e-06,
6494
+ "loss": 0.0002,
6495
+ "reward": 0.2678571343421936,
6496
+ "reward_std": 0.19896648824214935,
6497
+ "rewards/answer_format_reward_2": 0.0,
6498
+ "rewards/common_format_reward": 0.2678571343421936,
6499
+ "rewards/doors_consistency_reward_2": 0.0,
6500
+ "rewards/geometry_consistency_reward_2": 0.0,
6501
+ "rewards/prompt_consistency_reward_2": 0.0,
6502
+ "rewards/walls_orthogonality_reward_2": 0.0,
6503
+ "step": 382
6504
+ },
6505
+ {
6506
+ "completion_length": 959.1190795898438,
6507
+ "epoch": 0.30962004850444624,
6508
+ "grad_norm": 0.027015283703804016,
6509
+ "kl": 0.0007688857731409371,
6510
+ "learning_rate": 5.52303961196443e-06,
6511
+ "loss": 0.0,
6512
+ "reward": 0.2738095223903656,
6513
+ "reward_std": 0.2390180379152298,
6514
+ "rewards/answer_format_reward_2": 0.0,
6515
+ "rewards/common_format_reward": 0.2738095223903656,
6516
+ "rewards/doors_consistency_reward_2": 0.0,
6517
+ "rewards/geometry_consistency_reward_2": 0.0,
6518
+ "rewards/prompt_consistency_reward_2": 0.0,
6519
+ "rewards/walls_orthogonality_reward_2": 0.0,
6520
+ "step": 383
6521
+ },
6522
+ {
6523
+ "completion_length": 1279.0238037109375,
6524
+ "epoch": 0.3104284559417947,
6525
+ "grad_norm": 0.02247336506843567,
6526
+ "kl": 0.011627847328782082,
6527
+ "learning_rate": 5.516572352465642e-06,
6528
+ "loss": 0.0005,
6529
+ "reward": 0.3095238208770752,
6530
+ "reward_std": 0.27070754766464233,
6531
+ "rewards/answer_format_reward_2": 0.0,
6532
+ "rewards/common_format_reward": 0.3095238208770752,
6533
+ "rewards/doors_consistency_reward_2": 0.0,
6534
+ "rewards/geometry_consistency_reward_2": 0.0,
6535
+ "rewards/prompt_consistency_reward_2": 0.0,
6536
+ "rewards/walls_orthogonality_reward_2": 0.0,
6537
+ "step": 384
6538
+ },
6539
+ {
6540
+ "completion_length": 1413.8095703125,
6541
+ "epoch": 0.3112368633791431,
6542
+ "grad_norm": 0.04035022109746933,
6543
+ "kl": 0.010610965080559254,
6544
+ "learning_rate": 5.5101050929668545e-06,
6545
+ "loss": 0.0004,
6546
+ "reward": 0.3273809552192688,
6547
+ "reward_std": 0.28236716985702515,
6548
+ "rewards/answer_format_reward_2": 0.0,
6549
+ "rewards/common_format_reward": 0.3273809552192688,
6550
+ "rewards/doors_consistency_reward_2": 0.0,
6551
+ "rewards/geometry_consistency_reward_2": 0.0,
6552
+ "rewards/prompt_consistency_reward_2": 0.0,
6553
+ "rewards/walls_orthogonality_reward_2": 0.0,
6554
+ "step": 385
6555
+ },
6556
+ {
6557
+ "completion_length": 1480.0238037109375,
6558
+ "epoch": 0.3120452708164915,
6559
+ "grad_norm": 0.014338729903101921,
6560
+ "kl": 0.0005015616188757122,
6561
+ "learning_rate": 5.503637833468068e-06,
6562
+ "loss": 0.0,
6563
+ "reward": 0.3303571343421936,
6564
+ "reward_std": 0.2671121656894684,
6565
+ "rewards/answer_format_reward_2": 0.0,
6566
+ "rewards/common_format_reward": 0.3303571343421936,
6567
+ "rewards/doors_consistency_reward_2": 0.0,
6568
+ "rewards/geometry_consistency_reward_2": 0.0,
6569
+ "rewards/prompt_consistency_reward_2": 0.0,
6570
+ "rewards/walls_orthogonality_reward_2": 0.0,
6571
+ "step": 386
6572
+ },
6573
+ {
6574
+ "completion_length": 1339.40478515625,
6575
+ "epoch": 0.3128536782538399,
6576
+ "grad_norm": 0.020375793799757957,
6577
+ "kl": 0.0004927106201648712,
6578
+ "learning_rate": 5.4971705739692805e-06,
6579
+ "loss": 0.0,
6580
+ "reward": 0.2678571343421936,
6581
+ "reward_std": 0.2276732325553894,
6582
+ "rewards/answer_format_reward_2": 0.0,
6583
+ "rewards/common_format_reward": 0.2678571343421936,
6584
+ "rewards/doors_consistency_reward_2": 0.0,
6585
+ "rewards/geometry_consistency_reward_2": 0.0,
6586
+ "rewards/prompt_consistency_reward_2": 0.0,
6587
+ "rewards/walls_orthogonality_reward_2": 0.0,
6588
+ "step": 387
6589
+ },
6590
+ {
6591
+ "completion_length": 1249.4285888671875,
6592
+ "epoch": 0.31366208569118836,
6593
+ "grad_norm": 0.02535524219274521,
6594
+ "kl": 0.0037072785198688507,
6595
+ "learning_rate": 5.490703314470493e-06,
6596
+ "loss": 0.0001,
6597
+ "reward": 0.3005952537059784,
6598
+ "reward_std": 0.24646306037902832,
6599
+ "rewards/answer_format_reward_2": 0.0,
6600
+ "rewards/common_format_reward": 0.3005952537059784,
6601
+ "rewards/doors_consistency_reward_2": 0.0,
6602
+ "rewards/geometry_consistency_reward_2": 0.0,
6603
+ "rewards/prompt_consistency_reward_2": 0.0,
6604
+ "rewards/walls_orthogonality_reward_2": 0.0,
6605
+ "step": 388
6606
+ },
6607
+ {
6608
+ "completion_length": 1382.452392578125,
6609
+ "epoch": 0.3144704931285368,
6610
+ "grad_norm": 0.016909273341298103,
6611
+ "kl": 0.0005055685760453343,
6612
+ "learning_rate": 5.484236054971706e-06,
6613
+ "loss": 0.0,
6614
+ "reward": 0.2827380895614624,
6615
+ "reward_std": 0.23614071309566498,
6616
+ "rewards/answer_format_reward_2": 0.0,
6617
+ "rewards/common_format_reward": 0.2827380895614624,
6618
+ "rewards/doors_consistency_reward_2": 0.0,
6619
+ "rewards/geometry_consistency_reward_2": 0.0,
6620
+ "rewards/prompt_consistency_reward_2": 0.0,
6621
+ "rewards/walls_orthogonality_reward_2": 0.0,
6622
+ "step": 389
6623
+ },
6624
+ {
6625
+ "completion_length": 1209.2857666015625,
6626
+ "epoch": 0.3152789005658852,
6627
+ "grad_norm": 0.020957674831151962,
6628
+ "kl": 0.0004966093110851943,
6629
+ "learning_rate": 5.477768795472918e-06,
6630
+ "loss": 0.0,
6631
+ "reward": 0.3125,
6632
+ "reward_std": 0.25818145275115967,
6633
+ "rewards/answer_format_reward_2": 0.0,
6634
+ "rewards/common_format_reward": 0.3125,
6635
+ "rewards/doors_consistency_reward_2": 0.0,
6636
+ "rewards/geometry_consistency_reward_2": 0.0,
6637
+ "rewards/prompt_consistency_reward_2": 0.0,
6638
+ "rewards/walls_orthogonality_reward_2": 0.0,
6639
+ "step": 390
6640
+ },
6641
+ {
6642
+ "completion_length": 1385.7857666015625,
6643
+ "epoch": 0.31608730800323365,
6644
+ "grad_norm": 0.018930140882730484,
6645
+ "kl": 0.006610245909541845,
6646
+ "learning_rate": 5.471301535974131e-06,
6647
+ "loss": 0.0003,
6648
+ "reward": 0.306547611951828,
6649
+ "reward_std": 0.21854481101036072,
6650
+ "rewards/answer_format_reward_2": 0.0,
6651
+ "rewards/common_format_reward": 0.306547611951828,
6652
+ "rewards/doors_consistency_reward_2": 0.0,
6653
+ "rewards/geometry_consistency_reward_2": 0.0,
6654
+ "rewards/prompt_consistency_reward_2": 0.0,
6655
+ "rewards/walls_orthogonality_reward_2": 0.0,
6656
+ "step": 391
6657
+ },
6658
+ {
6659
+ "completion_length": 1267.8333740234375,
6660
+ "epoch": 0.31689571544058204,
6661
+ "grad_norm": 0.04813304543495178,
6662
+ "kl": 0.018240293487906456,
6663
+ "learning_rate": 5.464834276475343e-06,
6664
+ "loss": 0.0007,
6665
+ "reward": 0.3918651044368744,
6666
+ "reward_std": 0.26154178380966187,
6667
+ "rewards/answer_format_reward_2": 0.007936508394777775,
6668
+ "rewards/common_format_reward": 0.3839285671710968,
6669
+ "rewards/doors_consistency_reward_2": 0.0,
6670
+ "rewards/geometry_consistency_reward_2": 0.0,
6671
+ "rewards/prompt_consistency_reward_2": 0.0,
6672
+ "rewards/walls_orthogonality_reward_2": 0.0,
6673
+ "step": 392
6674
+ },
6675
+ {
6676
+ "completion_length": 1191.857177734375,
6677
+ "epoch": 0.3177041228779305,
6678
+ "grad_norm": 0.03435326740145683,
6679
+ "kl": 0.009386842139065266,
6680
+ "learning_rate": 5.458367016976556e-06,
6681
+ "loss": 0.0004,
6682
+ "reward": 0.386904776096344,
6683
+ "reward_std": 0.2864503562450409,
6684
+ "rewards/answer_format_reward_2": 0.011904762126505375,
6685
+ "rewards/common_format_reward": 0.375,
6686
+ "rewards/doors_consistency_reward_2": 0.0,
6687
+ "rewards/geometry_consistency_reward_2": 0.0,
6688
+ "rewards/prompt_consistency_reward_2": 0.0,
6689
+ "rewards/walls_orthogonality_reward_2": 0.0,
6690
+ "step": 393
6691
+ },
6692
+ {
6693
+ "completion_length": 1278.8095703125,
6694
+ "epoch": 0.3185125303152789,
6695
+ "grad_norm": 0.030485305935144424,
6696
+ "kl": 0.009701346978545189,
6697
+ "learning_rate": 5.451899757477768e-06,
6698
+ "loss": 0.0004,
6699
+ "reward": 0.4136904776096344,
6700
+ "reward_std": 0.23172339797019958,
6701
+ "rewards/answer_format_reward_2": 0.0,
6702
+ "rewards/common_format_reward": 0.4136904776096344,
6703
+ "rewards/doors_consistency_reward_2": 0.0,
6704
+ "rewards/geometry_consistency_reward_2": 0.0,
6705
+ "rewards/prompt_consistency_reward_2": 0.0,
6706
+ "rewards/walls_orthogonality_reward_2": 0.0,
6707
+ "step": 394
6708
+ },
6709
+ {
6710
+ "completion_length": 1435.0,
6711
+ "epoch": 0.3193209377526273,
6712
+ "grad_norm": 0.013523245230317116,
6713
+ "kl": 0.0025565337855368853,
6714
+ "learning_rate": 5.44543249797898e-06,
6715
+ "loss": 0.0001,
6716
+ "reward": 0.3988095223903656,
6717
+ "reward_std": 0.22906598448753357,
6718
+ "rewards/answer_format_reward_2": 0.0,
6719
+ "rewards/common_format_reward": 0.3988095223903656,
6720
+ "rewards/doors_consistency_reward_2": 0.0,
6721
+ "rewards/geometry_consistency_reward_2": 0.0,
6722
+ "rewards/prompt_consistency_reward_2": 0.0,
6723
+ "rewards/walls_orthogonality_reward_2": 0.0,
6724
+ "step": 395
6725
+ },
6726
+ {
6727
+ "completion_length": 1240.8809814453125,
6728
+ "epoch": 0.32012934518997577,
6729
+ "grad_norm": 0.019103923812508583,
6730
+ "kl": 0.0023283306509256363,
6731
+ "learning_rate": 5.438965238480194e-06,
6732
+ "loss": 0.0001,
6733
+ "reward": 0.306547611951828,
6734
+ "reward_std": 0.24480070173740387,
6735
+ "rewards/answer_format_reward_2": 0.0,
6736
+ "rewards/common_format_reward": 0.306547611951828,
6737
+ "rewards/doors_consistency_reward_2": 0.0,
6738
+ "rewards/geometry_consistency_reward_2": 0.0,
6739
+ "rewards/prompt_consistency_reward_2": 0.0,
6740
+ "rewards/walls_orthogonality_reward_2": 0.0,
6741
+ "step": 396
6742
+ },
6743
+ {
6744
+ "completion_length": 1381.5,
6745
+ "epoch": 0.32093775262732416,
6746
+ "grad_norm": 0.023194076493382454,
6747
+ "kl": 0.013442360796034336,
6748
+ "learning_rate": 5.432497978981406e-06,
6749
+ "loss": 0.0005,
6750
+ "reward": 0.3541666865348816,
6751
+ "reward_std": 0.22634661197662354,
6752
+ "rewards/answer_format_reward_2": 0.0,
6753
+ "rewards/common_format_reward": 0.3541666865348816,
6754
+ "rewards/doors_consistency_reward_2": 0.0,
6755
+ "rewards/geometry_consistency_reward_2": 0.0,
6756
+ "rewards/prompt_consistency_reward_2": 0.0,
6757
+ "rewards/walls_orthogonality_reward_2": 0.0,
6758
+ "step": 397
6759
+ },
6760
+ {
6761
+ "completion_length": 1351.761962890625,
6762
+ "epoch": 0.3217461600646726,
6763
+ "grad_norm": 0.019812535494565964,
6764
+ "kl": 0.005300784949213266,
6765
+ "learning_rate": 5.426030719482619e-06,
6766
+ "loss": 0.0002,
6767
+ "reward": 0.3214285671710968,
6768
+ "reward_std": 0.26315417885780334,
6769
+ "rewards/answer_format_reward_2": 0.011904762126505375,
6770
+ "rewards/common_format_reward": 0.3095238208770752,
6771
+ "rewards/doors_consistency_reward_2": 0.0,
6772
+ "rewards/geometry_consistency_reward_2": 0.0,
6773
+ "rewards/prompt_consistency_reward_2": 0.0,
6774
+ "rewards/walls_orthogonality_reward_2": 0.0,
6775
+ "step": 398
6776
+ },
6777
+ {
6778
+ "completion_length": 1339.047607421875,
6779
+ "epoch": 0.322554567502021,
6780
+ "grad_norm": 0.043939027935266495,
6781
+ "kl": 0.008818179368972778,
6782
+ "learning_rate": 5.419563459983832e-06,
6783
+ "loss": 0.0004,
6784
+ "reward": 0.3363095223903656,
6785
+ "reward_std": 0.23655691742897034,
6786
+ "rewards/answer_format_reward_2": 0.0,
6787
+ "rewards/common_format_reward": 0.3363095223903656,
6788
+ "rewards/doors_consistency_reward_2": 0.0,
6789
+ "rewards/geometry_consistency_reward_2": 0.0,
6790
+ "rewards/prompt_consistency_reward_2": 0.0,
6791
+ "rewards/walls_orthogonality_reward_2": 0.0,
6792
+ "step": 399
6793
+ },
6794
+ {
6795
+ "completion_length": 1398.0238037109375,
6796
+ "epoch": 0.32336297493936944,
6797
+ "grad_norm": 0.017300043255090714,
6798
+ "kl": 0.006722276099026203,
6799
+ "learning_rate": 5.413096200485044e-06,
6800
+ "loss": 0.0003,
6801
+ "reward": 0.3452380895614624,
6802
+ "reward_std": 0.25769612193107605,
6803
+ "rewards/answer_format_reward_2": 0.0,
6804
+ "rewards/common_format_reward": 0.3452380895614624,
6805
+ "rewards/doors_consistency_reward_2": 0.0,
6806
+ "rewards/geometry_consistency_reward_2": 0.0,
6807
+ "rewards/prompt_consistency_reward_2": 0.0,
6808
+ "rewards/walls_orthogonality_reward_2": 0.0,
6809
+ "step": 400
6810
  }
6811
  ],
6812
  "logging_steps": 1,