Jerry Ji committed on
Commit
37290e2
·
1 Parent(s): 824c9d3

Training in progress, epoch 0

Browse files
README.md CHANGED
@@ -15,15 +15,15 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.5263
19
- - Rewards/chosen: -0.1493
20
- - Rewards/rejected: -0.8998
21
- - Rewards/accuracies: 0.7480
22
- - Rewards/margins: 0.7505
23
- - Logps/rejected: -228.2820
24
- - Logps/chosen: -266.1538
25
- - Logits/rejected: -1.9412
26
- - Logits/chosen: -2.0663
27
 
28
  ## Model description
29
 
@@ -54,15 +54,13 @@ The following hyperparameters were used during training:
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
55
  - lr_scheduler_type: linear
56
  - lr_scheduler_warmup_ratio: 0.1
57
- - num_epochs: 3
58
 
59
  ### Training results
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
- | 0.5506 | 1.0 | 968 | 0.5556 | -0.1128 | -0.6425 | 0.7120 | 0.5297 | -225.7089 | -265.7884 | -1.9914 | -2.1123 |
64
- | 0.545 | 2.0 | 1937 | 0.5313 | -0.1468 | -0.8623 | 0.7440 | 0.7156 | -227.9077 | -266.1287 | -1.9506 | -2.0746 |
65
- | 0.5342 | 3.0 | 2904 | 0.5263 | -0.1493 | -0.8998 | 0.7480 | 0.7505 | -228.2820 | -266.1538 | -1.9412 | -2.0663 |
66
 
67
 
68
  ### Framework versions
 
15
 
16
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.6642
19
+ - Rewards/chosen: 0.1042
20
+ - Rewards/rejected: 0.0401
21
+ - Rewards/accuracies: 0.6480
22
+ - Rewards/margins: 0.0641
23
+ - Logps/rejected: -230.4560
24
+ - Logps/chosen: -278.6917
25
+ - Logits/rejected: -2.3987
26
+ - Logits/chosen: -2.4597
27
 
28
  ## Model description
29
 
 
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
55
  - lr_scheduler_type: linear
56
  - lr_scheduler_warmup_ratio: 0.1
57
+ - num_epochs: 1
58
 
59
  ### Training results
60
 
61
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
62
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
63
+ | 0.661 | 1.0 | 968 | 0.6642 | 0.1042 | 0.0401 | 0.6480 | 0.0641 | -230.4560 | -278.6917 | -2.3987 | -2.4597 |
 
 
64
 
65
 
66
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02667e5df3404888a505254c4b1c0474808ae97727b1e58368d1138042ddc366
3
  size 109086672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5355fdbd512d44198186b663cbaf8edc1b33b367aa093b0be23eee7161c1b84
3
  size 109086672
all_results.json CHANGED
@@ -1,8 +1,21 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.6731863415930882,
4
- "train_runtime": 27311.2139,
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "train_samples": 61966,
6
- "train_samples_per_second": 2.269,
7
  "train_steps_per_second": 0.035
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": -2.4597132205963135,
4
+ "eval_logits/rejected": -2.398695468902588,
5
+ "eval_logps/chosen": -278.69171142578125,
6
+ "eval_logps/rejected": -230.4560089111328,
7
+ "eval_loss": 0.6642152070999146,
8
+ "eval_rewards/accuracies": 0.6480000019073486,
9
+ "eval_rewards/chosen": 0.10415761172771454,
10
+ "eval_rewards/margins": 0.06405296921730042,
11
+ "eval_rewards/rejected": 0.04010463133454323,
12
+ "eval_runtime": 444.8959,
13
+ "eval_samples": 2000,
14
+ "eval_samples_per_second": 4.495,
15
+ "eval_steps_per_second": 0.281,
16
+ "train_loss": 0.6728762634529555,
17
+ "train_runtime": 27528.1814,
18
  "train_samples": 61966,
19
+ "train_samples_per_second": 2.251,
20
  "train_steps_per_second": 0.035
21
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_logits/chosen": -2.0662753582000732,
4
- "eval_logits/rejected": -1.9411602020263672,
5
- "eval_logps/chosen": -266.15380859375,
6
- "eval_logps/rejected": -228.28196716308594,
7
- "eval_loss": 0.5263338685035706,
8
- "eval_rewards/accuracies": 0.7480000257492065,
9
- "eval_rewards/chosen": -0.14929771423339844,
10
- "eval_rewards/margins": 0.7504671812057495,
11
- "eval_rewards/rejected": -0.899764895439148,
12
- "eval_runtime": 443.1683,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 4.513,
15
- "eval_steps_per_second": 0.282
16
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_logits/chosen": -2.4597132205963135,
4
+ "eval_logits/rejected": -2.398695468902588,
5
+ "eval_logps/chosen": -278.69171142578125,
6
+ "eval_logps/rejected": -230.4560089111328,
7
+ "eval_loss": 0.6642152070999146,
8
+ "eval_rewards/accuracies": 0.6480000019073486,
9
+ "eval_rewards/chosen": 0.10415761172771454,
10
+ "eval_rewards/margins": 0.06405296921730042,
11
+ "eval_rewards/rejected": 0.04010463133454323,
12
+ "eval_runtime": 444.8959,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 4.495,
15
+ "eval_steps_per_second": 0.281
16
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.6731863415930882,
4
- "train_runtime": 27311.2139,
5
  "train_samples": 61966,
6
- "train_samples_per_second": 2.269,
7
  "train_steps_per_second": 0.035
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.6728762634529555,
4
+ "train_runtime": 27528.1814,
5
  "train_samples": 61966,
6
+ "train_samples_per_second": 2.251,
7
  "train_steps_per_second": 0.035
8
  }
trainer_state.json CHANGED
@@ -25,1370 +25,1370 @@
25
  {
26
  "epoch": 0.01,
27
  "learning_rate": 5.154639175257731e-08,
28
- "logits/chosen": -2.2234437465667725,
29
- "logits/rejected": -2.180982828140259,
30
- "logps/chosen": -284.7386474609375,
31
- "logps/rejected": -205.97119140625,
32
- "loss": 0.6935,
33
- "rewards/accuracies": 0.4166666567325592,
34
- "rewards/chosen": -0.0011506013106554747,
35
- "rewards/margins": -0.0007981713279150426,
36
- "rewards/rejected": -0.0003524304192978889,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.02,
41
  "learning_rate": 1.0309278350515462e-07,
42
- "logits/chosen": -2.335000514984131,
43
- "logits/rejected": -2.2123830318450928,
44
- "logps/chosen": -320.8105773925781,
45
- "logps/rejected": -248.3818817138672,
46
- "loss": 0.6931,
47
  "rewards/accuracies": 0.512499988079071,
48
- "rewards/chosen": 0.0012877475237473845,
49
- "rewards/margins": -0.0011181291192770004,
50
- "rewards/rejected": 0.0024058762937784195,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.03,
55
  "learning_rate": 1.5463917525773197e-07,
56
- "logits/chosen": -2.3392958641052246,
57
- "logits/rejected": -2.3039257526397705,
58
- "logps/chosen": -268.9768371582031,
59
- "logps/rejected": -227.0941162109375,
60
- "loss": 0.6909,
61
- "rewards/accuracies": 0.550000011920929,
62
- "rewards/chosen": -0.0020196367986500263,
63
- "rewards/margins": 0.0026352328713983297,
64
- "rewards/rejected": -0.004654868971556425,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.04,
69
  "learning_rate": 2.0618556701030925e-07,
70
- "logits/chosen": -2.3389391899108887,
71
- "logits/rejected": -2.329983711242676,
72
- "logps/chosen": -308.53192138671875,
73
- "logps/rejected": -253.8923797607422,
74
- "loss": 0.6926,
75
- "rewards/accuracies": 0.4937500059604645,
76
- "rewards/chosen": 0.0003858007548842579,
77
- "rewards/margins": 0.0030613162089139223,
78
- "rewards/rejected": -0.002675515366718173,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.05,
83
  "learning_rate": 2.5773195876288655e-07,
84
- "logits/chosen": -2.251640796661377,
85
- "logits/rejected": -2.236237049102783,
86
- "logps/chosen": -297.795166015625,
87
- "logps/rejected": -227.2183380126953,
88
- "loss": 0.6932,
89
- "rewards/accuracies": 0.518750011920929,
90
- "rewards/chosen": 0.0022513591684401035,
91
- "rewards/margins": 0.0027331984601914883,
92
- "rewards/rejected": -0.00048183900071308017,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.06,
97
  "learning_rate": 3.0927835051546394e-07,
98
- "logits/chosen": -2.1671040058135986,
99
- "logits/rejected": -2.3377814292907715,
100
- "logps/chosen": -256.554443359375,
101
- "logps/rejected": -229.49887084960938,
102
- "loss": 0.6923,
103
- "rewards/accuracies": 0.543749988079071,
104
- "rewards/chosen": -0.0005498790997080505,
105
- "rewards/margins": 0.0022342861630022526,
106
- "rewards/rejected": -0.002784165320917964,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.07,
111
  "learning_rate": 3.608247422680412e-07,
112
- "logits/chosen": -2.343254804611206,
113
- "logits/rejected": -2.281353712081909,
114
- "logps/chosen": -313.9508056640625,
115
- "logps/rejected": -252.5953369140625,
116
- "loss": 0.6931,
117
- "rewards/accuracies": 0.4937500059604645,
118
- "rewards/chosen": -0.0012313572224229574,
119
- "rewards/margins": -0.0001091135636670515,
120
- "rewards/rejected": -0.0011222433531656861,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.08,
125
  "learning_rate": 4.123711340206185e-07,
126
- "logits/chosen": -2.337360382080078,
127
- "logits/rejected": -2.3014636039733887,
128
- "logps/chosen": -302.96966552734375,
129
- "logps/rejected": -243.86474609375,
130
- "loss": 0.6929,
131
- "rewards/accuracies": 0.4937500059604645,
132
- "rewards/chosen": 0.00042300819768570364,
133
- "rewards/margins": -0.005995759274810553,
134
- "rewards/rejected": 0.0064187683165073395,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.09,
139
  "learning_rate": 4.639175257731959e-07,
140
- "logits/chosen": -2.259093761444092,
141
- "logits/rejected": -2.2963151931762695,
142
- "logps/chosen": -270.1607360839844,
143
- "logps/rejected": -216.63967895507812,
144
- "loss": 0.69,
145
- "rewards/accuracies": 0.5375000238418579,
146
- "rewards/chosen": 0.010549941100180149,
147
- "rewards/margins": 0.00999419204890728,
148
- "rewards/rejected": 0.0005557489348575473,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.1,
153
  "learning_rate": 4.982778415614236e-07,
154
- "logits/chosen": -2.1673099994659424,
155
- "logits/rejected": -2.2734127044677734,
156
- "logps/chosen": -274.7733154296875,
157
- "logps/rejected": -226.4468994140625,
158
- "loss": 0.6898,
159
- "rewards/accuracies": 0.5062500238418579,
160
- "rewards/chosen": 0.004620480351150036,
161
- "rewards/margins": 0.004921785555779934,
162
- "rewards/rejected": -0.00030130503000691533,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.11,
167
  "learning_rate": 4.925373134328357e-07,
168
- "logits/chosen": -2.271514415740967,
169
- "logits/rejected": -2.197758197784424,
170
- "logps/chosen": -274.6520690917969,
171
- "logps/rejected": -232.5465850830078,
172
- "loss": 0.6884,
173
- "rewards/accuracies": 0.543749988079071,
174
- "rewards/chosen": 0.012732337228953838,
175
- "rewards/margins": 0.013689815998077393,
176
- "rewards/rejected": -0.0009574781870469451,
177
  "step": 110
178
  },
179
  {
180
  "epoch": 0.12,
181
  "learning_rate": 4.867967853042479e-07,
182
- "logits/chosen": -2.255197763442993,
183
- "logits/rejected": -2.3222763538360596,
184
- "logps/chosen": -319.30975341796875,
185
- "logps/rejected": -235.72726440429688,
186
- "loss": 0.688,
187
- "rewards/accuracies": 0.5625,
188
- "rewards/chosen": 0.017383214086294174,
189
- "rewards/margins": 0.011506976559758186,
190
- "rewards/rejected": 0.005876240320503712,
191
  "step": 120
192
  },
193
  {
194
  "epoch": 0.13,
195
  "learning_rate": 4.810562571756601e-07,
196
- "logits/chosen": -2.3218114376068115,
197
- "logits/rejected": -2.3772194385528564,
198
- "logps/chosen": -296.2359313964844,
199
- "logps/rejected": -245.52505493164062,
200
- "loss": 0.6885,
201
- "rewards/accuracies": 0.5249999761581421,
202
- "rewards/chosen": 0.014686869457364082,
203
- "rewards/margins": 0.0065305838361382484,
204
- "rewards/rejected": 0.008156285621225834,
205
  "step": 130
206
  },
207
  {
208
  "epoch": 0.14,
209
  "learning_rate": 4.753157290470723e-07,
210
- "logits/chosen": -2.363065004348755,
211
- "logits/rejected": -2.310908794403076,
212
- "logps/chosen": -301.93017578125,
213
- "logps/rejected": -239.286865234375,
214
- "loss": 0.6881,
215
- "rewards/accuracies": 0.543749988079071,
216
- "rewards/chosen": 0.011356567032635212,
217
- "rewards/margins": 0.009571002796292305,
218
- "rewards/rejected": 0.0017855638870969415,
219
  "step": 140
220
  },
221
  {
222
  "epoch": 0.15,
223
  "learning_rate": 4.6957520091848447e-07,
224
- "logits/chosen": -2.253392457962036,
225
- "logits/rejected": -2.3485019207000732,
226
- "logps/chosen": -284.38043212890625,
227
- "logps/rejected": -259.68096923828125,
228
- "loss": 0.6862,
229
- "rewards/accuracies": 0.606249988079071,
230
- "rewards/chosen": 0.022072453051805496,
231
- "rewards/margins": 0.016027750447392464,
232
- "rewards/rejected": 0.006044704467058182,
233
  "step": 150
234
  },
235
  {
236
  "epoch": 0.17,
237
  "learning_rate": 4.6383467278989666e-07,
238
- "logits/chosen": -2.3608505725860596,
239
- "logits/rejected": -2.442317008972168,
240
- "logps/chosen": -286.76446533203125,
241
- "logps/rejected": -221.6781005859375,
242
- "loss": 0.6852,
243
- "rewards/accuracies": 0.59375,
244
- "rewards/chosen": 0.01910669170320034,
245
- "rewards/margins": 0.014247800223529339,
246
- "rewards/rejected": 0.004858892410993576,
247
  "step": 160
248
  },
249
  {
250
  "epoch": 0.18,
251
  "learning_rate": 4.580941446613088e-07,
252
- "logits/chosen": -2.322056531906128,
253
- "logits/rejected": -2.3340067863464355,
254
- "logps/chosen": -301.5868225097656,
255
- "logps/rejected": -239.265869140625,
256
- "loss": 0.6834,
257
- "rewards/accuracies": 0.5625,
258
- "rewards/chosen": 0.019180649891495705,
259
- "rewards/margins": 0.011183517053723335,
260
- "rewards/rejected": 0.00799713283777237,
261
  "step": 170
262
  },
263
  {
264
  "epoch": 0.19,
265
  "learning_rate": 4.52353616532721e-07,
266
- "logits/chosen": -2.347207546234131,
267
- "logits/rejected": -2.324052572250366,
268
- "logps/chosen": -257.86639404296875,
269
- "logps/rejected": -214.54440307617188,
270
- "loss": 0.6857,
271
- "rewards/accuracies": 0.5562499761581421,
272
- "rewards/chosen": 0.01751137152314186,
273
- "rewards/margins": 0.011507970280945301,
274
- "rewards/rejected": 0.006003401707857847,
275
  "step": 180
276
  },
277
  {
278
  "epoch": 0.2,
279
  "learning_rate": 4.4661308840413316e-07,
280
- "logits/chosen": -2.2652053833007812,
281
- "logits/rejected": -2.2009899616241455,
282
- "logps/chosen": -253.9468536376953,
283
- "logps/rejected": -206.35702514648438,
284
- "loss": 0.6837,
285
- "rewards/accuracies": 0.637499988079071,
286
- "rewards/chosen": 0.028762493282556534,
287
- "rewards/margins": 0.02420884743332863,
288
- "rewards/rejected": 0.004553645849227905,
289
  "step": 190
290
  },
291
  {
292
  "epoch": 0.21,
293
  "learning_rate": 4.408725602755453e-07,
294
- "logits/chosen": -2.284376621246338,
295
- "logits/rejected": -2.2875559329986572,
296
- "logps/chosen": -261.45037841796875,
297
- "logps/rejected": -195.525634765625,
298
- "loss": 0.6833,
299
- "rewards/accuracies": 0.543749988079071,
300
- "rewards/chosen": 0.03126645088195801,
301
- "rewards/margins": 0.016625383868813515,
302
- "rewards/rejected": 0.014641067013144493,
303
  "step": 200
304
  },
305
  {
306
  "epoch": 0.22,
307
  "learning_rate": 4.351320321469575e-07,
308
- "logits/chosen": -2.184406280517578,
309
- "logits/rejected": -2.1959569454193115,
310
- "logps/chosen": -302.328857421875,
311
- "logps/rejected": -218.5536651611328,
312
- "loss": 0.6828,
313
- "rewards/accuracies": 0.606249988079071,
314
- "rewards/chosen": 0.03549647331237793,
315
- "rewards/margins": 0.02051473781466484,
316
- "rewards/rejected": 0.014981737360358238,
317
  "step": 210
318
  },
319
  {
320
  "epoch": 0.23,
321
  "learning_rate": 4.2939150401836967e-07,
322
- "logits/chosen": -2.2145161628723145,
323
- "logits/rejected": -2.215642213821411,
324
- "logps/chosen": -269.46368408203125,
325
- "logps/rejected": -235.6923065185547,
326
- "loss": 0.6814,
327
- "rewards/accuracies": 0.6000000238418579,
328
- "rewards/chosen": 0.03645281121134758,
329
- "rewards/margins": 0.0235856045037508,
330
- "rewards/rejected": 0.012867207638919353,
331
  "step": 220
332
  },
333
  {
334
  "epoch": 0.24,
335
  "learning_rate": 4.236509758897818e-07,
336
- "logits/chosen": -2.214348554611206,
337
- "logits/rejected": -2.185147762298584,
338
- "logps/chosen": -271.40472412109375,
339
- "logps/rejected": -242.63973999023438,
340
  "loss": 0.6826,
341
- "rewards/accuracies": 0.612500011920929,
342
- "rewards/chosen": 0.03270721808075905,
343
- "rewards/margins": 0.026937326416373253,
344
- "rewards/rejected": 0.005769887939095497,
345
  "step": 230
346
  },
347
  {
348
  "epoch": 0.25,
349
  "learning_rate": 4.17910447761194e-07,
350
- "logits/chosen": -2.3055496215820312,
351
- "logits/rejected": -2.2679781913757324,
352
- "logps/chosen": -309.5468444824219,
353
- "logps/rejected": -221.61196899414062,
354
- "loss": 0.6836,
355
- "rewards/accuracies": 0.574999988079071,
356
- "rewards/chosen": 0.03591131418943405,
357
- "rewards/margins": 0.013077683746814728,
358
- "rewards/rejected": 0.022833632305264473,
359
  "step": 240
360
  },
361
  {
362
  "epoch": 0.26,
363
  "learning_rate": 4.121699196326062e-07,
364
- "logits/chosen": -2.306344509124756,
365
- "logits/rejected": -2.2909655570983887,
366
- "logps/chosen": -272.93988037109375,
367
- "logps/rejected": -237.33169555664062,
368
- "loss": 0.6828,
369
- "rewards/accuracies": 0.543749988079071,
370
- "rewards/chosen": 0.03549710661172867,
371
- "rewards/margins": 0.014110283926129341,
372
- "rewards/rejected": 0.021386824548244476,
373
  "step": 250
374
  },
375
  {
376
  "epoch": 0.27,
377
  "learning_rate": 4.0642939150401836e-07,
378
- "logits/chosen": -2.345346212387085,
379
- "logits/rejected": -2.3187355995178223,
380
- "logps/chosen": -270.4668884277344,
381
- "logps/rejected": -221.8577880859375,
382
- "loss": 0.6809,
383
- "rewards/accuracies": 0.6312500238418579,
384
- "rewards/chosen": 0.046544916927814484,
385
- "rewards/margins": 0.03206244856119156,
386
- "rewards/rejected": 0.014482468366622925,
387
  "step": 260
388
  },
389
  {
390
  "epoch": 0.28,
391
  "learning_rate": 4.006888633754305e-07,
392
- "logits/chosen": -2.38493013381958,
393
- "logits/rejected": -2.3552451133728027,
394
- "logps/chosen": -284.3419494628906,
395
- "logps/rejected": -232.5235137939453,
396
- "loss": 0.6807,
397
- "rewards/accuracies": 0.581250011920929,
398
- "rewards/chosen": 0.05182330682873726,
399
- "rewards/margins": 0.03193196654319763,
400
- "rewards/rejected": 0.019891340285539627,
401
  "step": 270
402
  },
403
  {
404
  "epoch": 0.29,
405
  "learning_rate": 3.949483352468427e-07,
406
- "logits/chosen": -2.308295488357544,
407
- "logits/rejected": -2.2600932121276855,
408
- "logps/chosen": -293.229736328125,
409
- "logps/rejected": -236.45321655273438,
410
- "loss": 0.6789,
411
- "rewards/accuracies": 0.606249988079071,
412
- "rewards/chosen": 0.05196481943130493,
413
- "rewards/margins": 0.03805174678564072,
414
- "rewards/rejected": 0.013913074508309364,
415
  "step": 280
416
  },
417
  {
418
  "epoch": 0.3,
419
  "learning_rate": 3.8920780711825487e-07,
420
- "logits/chosen": -2.2781283855438232,
421
- "logits/rejected": -2.368569850921631,
422
- "logps/chosen": -278.46221923828125,
423
- "logps/rejected": -227.41943359375,
424
- "loss": 0.6791,
425
- "rewards/accuracies": 0.6312500238418579,
426
- "rewards/chosen": 0.05021747201681137,
427
- "rewards/margins": 0.02508280798792839,
428
- "rewards/rejected": 0.02513466402888298,
429
  "step": 290
430
  },
431
  {
432
  "epoch": 0.31,
433
  "learning_rate": 3.83467278989667e-07,
434
- "logits/chosen": -2.265779495239258,
435
- "logits/rejected": -2.2051727771759033,
436
- "logps/chosen": -254.264404296875,
437
- "logps/rejected": -221.9779510498047,
438
- "loss": 0.6783,
439
- "rewards/accuracies": 0.643750011920929,
440
- "rewards/chosen": 0.05069579556584358,
441
- "rewards/margins": 0.03272496536374092,
442
- "rewards/rejected": 0.01797083020210266,
443
  "step": 300
444
  },
445
  {
446
  "epoch": 0.32,
447
  "learning_rate": 3.777267508610792e-07,
448
- "logits/chosen": -2.3235151767730713,
449
- "logits/rejected": -2.3737473487854004,
450
- "logps/chosen": -306.2057189941406,
451
- "logps/rejected": -257.5906066894531,
452
- "loss": 0.6789,
453
- "rewards/accuracies": 0.5687500238418579,
454
- "rewards/chosen": 0.05037818104028702,
455
- "rewards/margins": 0.01740873232483864,
456
- "rewards/rejected": 0.03296944126486778,
457
  "step": 310
458
  },
459
  {
460
  "epoch": 0.33,
461
  "learning_rate": 3.7198622273249137e-07,
462
- "logits/chosen": -2.234135389328003,
463
- "logits/rejected": -2.210972309112549,
464
- "logps/chosen": -251.83740234375,
465
- "logps/rejected": -192.99771118164062,
466
- "loss": 0.6743,
467
- "rewards/accuracies": 0.6625000238418579,
468
- "rewards/chosen": 0.06514623761177063,
469
- "rewards/margins": 0.04524427652359009,
470
- "rewards/rejected": 0.01990196295082569,
471
  "step": 320
472
  },
473
  {
474
  "epoch": 0.34,
475
  "learning_rate": 3.662456946039035e-07,
476
- "logits/chosen": -2.257673740386963,
477
- "logits/rejected": -2.2867014408111572,
478
- "logps/chosen": -312.2060546875,
479
- "logps/rejected": -239.00277709960938,
480
  "loss": 0.6761,
481
- "rewards/accuracies": 0.637499988079071,
482
- "rewards/chosen": 0.06925204396247864,
483
- "rewards/margins": 0.04682812839746475,
484
- "rewards/rejected": 0.022423917427659035,
485
  "step": 330
486
  },
487
  {
488
  "epoch": 0.35,
489
  "learning_rate": 3.605051664753157e-07,
490
- "logits/chosen": -2.196643590927124,
491
- "logits/rejected": -2.129664182662964,
492
- "logps/chosen": -244.24270629882812,
493
- "logps/rejected": -238.7809295654297,
494
- "loss": 0.6776,
495
- "rewards/accuracies": 0.625,
496
- "rewards/chosen": 0.05914956331253052,
497
- "rewards/margins": 0.02900281921029091,
498
- "rewards/rejected": 0.03014674223959446,
499
  "step": 340
500
  },
501
  {
502
  "epoch": 0.36,
503
  "learning_rate": 3.547646383467279e-07,
504
- "logits/chosen": -2.365565538406372,
505
- "logits/rejected": -2.3723063468933105,
506
- "logps/chosen": -313.76263427734375,
507
- "logps/rejected": -248.08090209960938,
508
- "loss": 0.6771,
509
- "rewards/accuracies": 0.606249988079071,
510
- "rewards/chosen": 0.07412128150463104,
511
- "rewards/margins": 0.048134543001651764,
512
- "rewards/rejected": 0.025986725464463234,
513
  "step": 350
514
  },
515
  {
516
  "epoch": 0.37,
517
  "learning_rate": 3.4902411021814007e-07,
518
- "logits/chosen": -2.2278995513916016,
519
- "logits/rejected": -2.259129524230957,
520
- "logps/chosen": -303.2668151855469,
521
- "logps/rejected": -249.9300537109375,
522
- "loss": 0.6727,
523
- "rewards/accuracies": 0.606249988079071,
524
- "rewards/chosen": 0.06270809471607208,
525
- "rewards/margins": 0.04535229504108429,
526
- "rewards/rejected": 0.017355797812342644,
527
  "step": 360
528
  },
529
  {
530
  "epoch": 0.38,
531
  "learning_rate": 3.432835820895522e-07,
532
- "logits/chosen": -2.3697922229766846,
533
- "logits/rejected": -2.3227946758270264,
534
- "logps/chosen": -314.5289611816406,
535
- "logps/rejected": -270.68231201171875,
536
  "loss": 0.6759,
537
- "rewards/accuracies": 0.5562499761581421,
538
- "rewards/chosen": 0.07028704136610031,
539
- "rewards/margins": 0.0307574775069952,
540
- "rewards/rejected": 0.03952956199645996,
541
  "step": 370
542
  },
543
  {
544
  "epoch": 0.39,
545
  "learning_rate": 3.375430539609644e-07,
546
- "logits/chosen": -2.321040630340576,
547
- "logits/rejected": -2.2491185665130615,
548
- "logps/chosen": -291.9346618652344,
549
- "logps/rejected": -239.68917846679688,
550
- "loss": 0.6778,
551
- "rewards/accuracies": 0.625,
552
- "rewards/chosen": 0.0727432444691658,
553
- "rewards/margins": 0.03435206040740013,
554
- "rewards/rejected": 0.03839118406176567,
555
  "step": 380
556
  },
557
  {
558
  "epoch": 0.4,
559
  "learning_rate": 3.3180252583237657e-07,
560
- "logits/chosen": -2.2968392372131348,
561
- "logits/rejected": -2.2632501125335693,
562
- "logps/chosen": -278.10028076171875,
563
- "logps/rejected": -237.16793823242188,
564
- "loss": 0.6712,
565
- "rewards/accuracies": 0.6499999761581421,
566
- "rewards/chosen": 0.06786436587572098,
567
- "rewards/margins": 0.05375425145030022,
568
- "rewards/rejected": 0.014110115356743336,
569
  "step": 390
570
  },
571
  {
572
  "epoch": 0.41,
573
  "learning_rate": 3.260619977037887e-07,
574
- "logits/chosen": -2.236884593963623,
575
- "logits/rejected": -2.2387681007385254,
576
- "logps/chosen": -263.4674072265625,
577
- "logps/rejected": -213.8645477294922,
578
- "loss": 0.6732,
579
- "rewards/accuracies": 0.6625000238418579,
580
- "rewards/chosen": 0.06493643671274185,
581
- "rewards/margins": 0.044694311916828156,
582
- "rewards/rejected": 0.0202421136200428,
583
  "step": 400
584
  },
585
  {
586
  "epoch": 0.42,
587
  "learning_rate": 3.203214695752009e-07,
588
- "logits/chosen": -2.277587890625,
589
- "logits/rejected": -2.291820526123047,
590
- "logps/chosen": -268.8821105957031,
591
- "logps/rejected": -252.8975830078125,
592
- "loss": 0.6678,
593
- "rewards/accuracies": 0.637499988079071,
594
- "rewards/chosen": 0.08244398236274719,
595
- "rewards/margins": 0.05904413014650345,
596
- "rewards/rejected": 0.023399867117404938,
597
  "step": 410
598
  },
599
  {
600
  "epoch": 0.43,
601
  "learning_rate": 3.145809414466131e-07,
602
- "logits/chosen": -2.3052468299865723,
603
- "logits/rejected": -2.2499592304229736,
604
- "logps/chosen": -252.5208282470703,
605
- "logps/rejected": -204.4167938232422,
606
- "loss": 0.675,
607
- "rewards/accuracies": 0.637499988079071,
608
- "rewards/chosen": 0.07269565761089325,
609
- "rewards/margins": 0.04640679806470871,
610
- "rewards/rejected": 0.02628885768353939,
611
  "step": 420
612
  },
613
  {
614
  "epoch": 0.44,
615
  "learning_rate": 3.0884041331802526e-07,
616
- "logits/chosen": -2.3479931354522705,
617
- "logits/rejected": -2.3252017498016357,
618
- "logps/chosen": -263.6164245605469,
619
- "logps/rejected": -241.0965576171875,
620
- "loss": 0.6737,
621
- "rewards/accuracies": 0.6312500238418579,
622
- "rewards/chosen": 0.07568270713090897,
623
- "rewards/margins": 0.04158180207014084,
624
- "rewards/rejected": 0.03410089388489723,
625
  "step": 430
626
  },
627
  {
628
  "epoch": 0.45,
629
  "learning_rate": 3.030998851894374e-07,
630
- "logits/chosen": -2.2860944271087646,
631
- "logits/rejected": -2.3197929859161377,
632
- "logps/chosen": -286.6674499511719,
633
- "logps/rejected": -247.6437225341797,
634
- "loss": 0.6686,
635
- "rewards/accuracies": 0.699999988079071,
636
- "rewards/chosen": 0.08281053602695465,
637
- "rewards/margins": 0.06470286101102829,
638
- "rewards/rejected": 0.018107673153281212,
639
  "step": 440
640
  },
641
  {
642
  "epoch": 0.46,
643
  "learning_rate": 2.973593570608496e-07,
644
- "logits/chosen": -2.2066447734832764,
645
- "logits/rejected": -2.3158316612243652,
646
- "logps/chosen": -276.1333923339844,
647
- "logps/rejected": -230.3758544921875,
648
- "loss": 0.6778,
649
- "rewards/accuracies": 0.6312500238418579,
650
- "rewards/chosen": 0.07531363517045975,
651
- "rewards/margins": 0.04221782088279724,
652
- "rewards/rejected": 0.033095818012952805,
653
  "step": 450
654
  },
655
  {
656
  "epoch": 0.47,
657
  "learning_rate": 2.9161882893226177e-07,
658
- "logits/chosen": -2.2777111530303955,
659
- "logits/rejected": -2.341663122177124,
660
- "logps/chosen": -273.2806701660156,
661
- "logps/rejected": -222.5447235107422,
662
- "loss": 0.6693,
663
- "rewards/accuracies": 0.5687500238418579,
664
- "rewards/chosen": 0.07596820592880249,
665
- "rewards/margins": 0.0409797765314579,
666
- "rewards/rejected": 0.03498842567205429,
667
  "step": 460
668
  },
669
  {
670
  "epoch": 0.49,
671
  "learning_rate": 2.858783008036739e-07,
672
- "logits/chosen": -2.2646453380584717,
673
- "logits/rejected": -2.277526378631592,
674
- "logps/chosen": -249.056396484375,
675
- "logps/rejected": -215.5797882080078,
676
- "loss": 0.6676,
677
- "rewards/accuracies": 0.675000011920929,
678
- "rewards/chosen": 0.07929818332195282,
679
- "rewards/margins": 0.05759881064295769,
680
- "rewards/rejected": 0.021699368953704834,
681
  "step": 470
682
  },
683
  {
684
  "epoch": 0.5,
685
  "learning_rate": 2.801377726750861e-07,
686
- "logits/chosen": -2.2957892417907715,
687
- "logits/rejected": -2.2723708152770996,
688
- "logps/chosen": -289.49652099609375,
689
- "logps/rejected": -231.6498565673828,
690
- "loss": 0.6708,
691
- "rewards/accuracies": 0.6312500238418579,
692
- "rewards/chosen": 0.08457117527723312,
693
- "rewards/margins": 0.06456024944782257,
694
- "rewards/rejected": 0.020010938867926598,
695
  "step": 480
696
  },
697
  {
698
  "epoch": 0.51,
699
  "learning_rate": 2.743972445464983e-07,
700
- "logits/chosen": -2.4455032348632812,
701
- "logits/rejected": -2.266815662384033,
702
- "logps/chosen": -293.1993103027344,
703
- "logps/rejected": -243.9182891845703,
704
- "loss": 0.6673,
705
- "rewards/accuracies": 0.6937500238418579,
706
- "rewards/chosen": 0.10721077769994736,
707
- "rewards/margins": 0.08375977724790573,
708
- "rewards/rejected": 0.02345099486410618,
709
  "step": 490
710
  },
711
  {
712
  "epoch": 0.52,
713
  "learning_rate": 2.686567164179104e-07,
714
- "logits/chosen": -2.278409242630005,
715
- "logits/rejected": -2.294983386993408,
716
- "logps/chosen": -254.94808959960938,
717
- "logps/rejected": -221.7699737548828,
718
- "loss": 0.6665,
719
- "rewards/accuracies": 0.6312500238418579,
720
- "rewards/chosen": 0.08222482353448868,
721
- "rewards/margins": 0.05339335650205612,
722
- "rewards/rejected": 0.028831467032432556,
723
  "step": 500
724
  },
725
  {
726
  "epoch": 0.53,
727
  "learning_rate": 2.629161882893226e-07,
728
- "logits/chosen": -2.2024474143981934,
729
- "logits/rejected": -2.249354600906372,
730
- "logps/chosen": -310.412109375,
731
- "logps/rejected": -256.75201416015625,
732
- "loss": 0.6669,
733
- "rewards/accuracies": 0.625,
734
- "rewards/chosen": 0.07344520837068558,
735
- "rewards/margins": 0.046752505004405975,
736
- "rewards/rejected": 0.026692699640989304,
737
  "step": 510
738
  },
739
  {
740
  "epoch": 0.54,
741
  "learning_rate": 2.571756601607348e-07,
742
- "logits/chosen": -2.3371269702911377,
743
- "logits/rejected": -2.3524794578552246,
744
- "logps/chosen": -278.16058349609375,
745
- "logps/rejected": -244.1069793701172,
746
- "loss": 0.6704,
747
  "rewards/accuracies": 0.6499999761581421,
748
- "rewards/chosen": 0.08704034984111786,
749
- "rewards/margins": 0.061678241938352585,
750
- "rewards/rejected": 0.025362113490700722,
751
  "step": 520
752
  },
753
  {
754
  "epoch": 0.55,
755
  "learning_rate": 2.5143513203214697e-07,
756
- "logits/chosen": -2.2432773113250732,
757
- "logits/rejected": -2.250980854034424,
758
- "logps/chosen": -242.64340209960938,
759
- "logps/rejected": -224.147216796875,
760
- "loss": 0.6702,
761
- "rewards/accuracies": 0.6312500238418579,
762
- "rewards/chosen": 0.07376811653375626,
763
- "rewards/margins": 0.05427448824048042,
764
- "rewards/rejected": 0.019493628293275833,
765
  "step": 530
766
  },
767
  {
768
  "epoch": 0.56,
769
  "learning_rate": 2.456946039035591e-07,
770
- "logits/chosen": -2.300306797027588,
771
- "logits/rejected": -2.2716238498687744,
772
- "logps/chosen": -288.2474670410156,
773
- "logps/rejected": -240.3512725830078,
774
- "loss": 0.6671,
775
- "rewards/accuracies": 0.6499999761581421,
776
- "rewards/chosen": 0.10111168771982193,
777
- "rewards/margins": 0.056197118014097214,
778
- "rewards/rejected": 0.044914569705724716,
779
  "step": 540
780
  },
781
  {
782
  "epoch": 0.57,
783
  "learning_rate": 2.399540757749713e-07,
784
- "logits/chosen": -2.3355793952941895,
785
- "logits/rejected": -2.1937708854675293,
786
- "logps/chosen": -265.01416015625,
787
- "logps/rejected": -230.23007202148438,
788
- "loss": 0.6683,
789
- "rewards/accuracies": 0.675000011920929,
790
- "rewards/chosen": 0.08131198585033417,
791
- "rewards/margins": 0.05895137041807175,
792
- "rewards/rejected": 0.022360611706972122,
793
  "step": 550
794
  },
795
  {
796
  "epoch": 0.58,
797
  "learning_rate": 2.3421354764638345e-07,
798
- "logits/chosen": -2.319580554962158,
799
- "logits/rejected": -2.283818244934082,
800
- "logps/chosen": -302.03167724609375,
801
- "logps/rejected": -251.99624633789062,
802
- "loss": 0.6717,
803
- "rewards/accuracies": 0.6000000238418579,
804
- "rewards/chosen": 0.09797920286655426,
805
- "rewards/margins": 0.049965519458055496,
806
- "rewards/rejected": 0.048013679683208466,
807
  "step": 560
808
  },
809
  {
810
  "epoch": 0.59,
811
  "learning_rate": 2.2847301951779563e-07,
812
- "logits/chosen": -2.2482268810272217,
813
- "logits/rejected": -2.4002277851104736,
814
- "logps/chosen": -268.6507873535156,
815
- "logps/rejected": -223.710693359375,
816
- "loss": 0.6676,
817
- "rewards/accuracies": 0.612500011920929,
818
- "rewards/chosen": 0.08277994394302368,
819
- "rewards/margins": 0.05562227964401245,
820
- "rewards/rejected": 0.027157653123140335,
821
  "step": 570
822
  },
823
  {
824
  "epoch": 0.6,
825
  "learning_rate": 2.227324913892078e-07,
826
- "logits/chosen": -2.2995638847351074,
827
- "logits/rejected": -2.223440647125244,
828
- "logps/chosen": -299.4001159667969,
829
- "logps/rejected": -236.94857788085938,
830
- "loss": 0.6613,
831
- "rewards/accuracies": 0.668749988079071,
832
- "rewards/chosen": 0.10369547456502914,
833
- "rewards/margins": 0.08046281337738037,
834
- "rewards/rejected": 0.023232655599713326,
835
  "step": 580
836
  },
837
  {
838
  "epoch": 0.61,
839
  "learning_rate": 2.1699196326061998e-07,
840
- "logits/chosen": -2.2583508491516113,
841
- "logits/rejected": -2.231132984161377,
842
- "logps/chosen": -253.7607421875,
843
- "logps/rejected": -218.61239624023438,
844
- "loss": 0.6683,
845
- "rewards/accuracies": 0.65625,
846
- "rewards/chosen": 0.07318417727947235,
847
- "rewards/margins": 0.04799889028072357,
848
- "rewards/rejected": 0.025185290724039078,
849
  "step": 590
850
  },
851
  {
852
  "epoch": 0.62,
853
  "learning_rate": 2.1125143513203214e-07,
854
- "logits/chosen": -2.3192005157470703,
855
- "logits/rejected": -2.2510247230529785,
856
- "logps/chosen": -256.56060791015625,
857
- "logps/rejected": -206.32177734375,
858
- "loss": 0.6695,
859
- "rewards/accuracies": 0.6187499761581421,
860
- "rewards/chosen": 0.07588864117860794,
861
- "rewards/margins": 0.05236497521400452,
862
- "rewards/rejected": 0.023523656651377678,
863
  "step": 600
864
  },
865
  {
866
  "epoch": 0.63,
867
  "learning_rate": 2.055109070034443e-07,
868
- "logits/chosen": -2.3051934242248535,
869
- "logits/rejected": -2.3035061359405518,
870
- "logps/chosen": -266.5325622558594,
871
- "logps/rejected": -223.85031127929688,
872
- "loss": 0.6668,
873
- "rewards/accuracies": 0.6875,
874
- "rewards/chosen": 0.0917385071516037,
875
- "rewards/margins": 0.0632014125585556,
876
- "rewards/rejected": 0.028537089005112648,
877
  "step": 610
878
  },
879
  {
880
  "epoch": 0.64,
881
  "learning_rate": 1.997703788748565e-07,
882
- "logits/chosen": -2.337707042694092,
883
- "logits/rejected": -2.2823574542999268,
884
- "logps/chosen": -313.8081359863281,
885
- "logps/rejected": -249.5789337158203,
886
- "loss": 0.6586,
887
- "rewards/accuracies": 0.6812499761581421,
888
- "rewards/chosen": 0.10711432993412018,
889
- "rewards/margins": 0.07846088707447052,
890
- "rewards/rejected": 0.028653452172875404,
891
  "step": 620
892
  },
893
  {
894
  "epoch": 0.65,
895
  "learning_rate": 1.9402985074626865e-07,
896
- "logits/chosen": -2.2064993381500244,
897
- "logits/rejected": -2.2465977668762207,
898
- "logps/chosen": -259.2158508300781,
899
- "logps/rejected": -240.3513641357422,
900
- "loss": 0.6659,
901
- "rewards/accuracies": 0.65625,
902
- "rewards/chosen": 0.09927239269018173,
903
- "rewards/margins": 0.06107243150472641,
904
- "rewards/rejected": 0.038199953734874725,
905
  "step": 630
906
  },
907
  {
908
  "epoch": 0.66,
909
  "learning_rate": 1.8828932261768083e-07,
910
- "logits/chosen": -2.289741277694702,
911
- "logits/rejected": -2.238556146621704,
912
- "logps/chosen": -266.5019226074219,
913
- "logps/rejected": -217.8599853515625,
914
- "loss": 0.6612,
915
- "rewards/accuracies": 0.625,
916
- "rewards/chosen": 0.09409500658512115,
917
- "rewards/margins": 0.07514993846416473,
918
- "rewards/rejected": 0.018945056945085526,
919
  "step": 640
920
  },
921
  {
922
  "epoch": 0.67,
923
  "learning_rate": 1.82548794489093e-07,
924
- "logits/chosen": -2.3345632553100586,
925
- "logits/rejected": -2.311061382293701,
926
- "logps/chosen": -284.6984558105469,
927
- "logps/rejected": -232.80517578125,
928
- "loss": 0.6641,
929
- "rewards/accuracies": 0.612500011920929,
930
- "rewards/chosen": 0.10377562046051025,
931
- "rewards/margins": 0.07344510406255722,
932
- "rewards/rejected": 0.03033052384853363,
933
  "step": 650
934
  },
935
  {
936
  "epoch": 0.68,
937
  "learning_rate": 1.7680826636050515e-07,
938
- "logits/chosen": -2.33508563041687,
939
- "logits/rejected": -2.27583646774292,
940
- "logps/chosen": -279.7930603027344,
941
- "logps/rejected": -233.23367309570312,
942
  "loss": 0.6608,
943
- "rewards/accuracies": 0.699999988079071,
944
- "rewards/chosen": 0.11143641173839569,
945
- "rewards/margins": 0.07680504024028778,
946
- "rewards/rejected": 0.03463137149810791,
947
  "step": 660
948
  },
949
  {
950
  "epoch": 0.69,
951
  "learning_rate": 1.7106773823191734e-07,
952
- "logits/chosen": -2.285081386566162,
953
- "logits/rejected": -2.2734663486480713,
954
- "logps/chosen": -295.72637939453125,
955
- "logps/rejected": -240.38070678710938,
956
- "loss": 0.6628,
957
- "rewards/accuracies": 0.606249988079071,
958
- "rewards/chosen": 0.0983828604221344,
959
- "rewards/margins": 0.055043578147888184,
960
- "rewards/rejected": 0.04333927482366562,
961
  "step": 670
962
  },
963
  {
964
  "epoch": 0.7,
965
  "learning_rate": 1.653272101033295e-07,
966
- "logits/chosen": -2.3419528007507324,
967
- "logits/rejected": -2.2720932960510254,
968
- "logps/chosen": -289.7131652832031,
969
- "logps/rejected": -230.31863403320312,
970
- "loss": 0.673,
971
- "rewards/accuracies": 0.59375,
972
- "rewards/chosen": 0.09808371216058731,
973
- "rewards/margins": 0.03939511626958847,
974
- "rewards/rejected": 0.05868858844041824,
975
  "step": 680
976
  },
977
  {
978
  "epoch": 0.71,
979
  "learning_rate": 1.5958668197474169e-07,
980
- "logits/chosen": -2.3712170124053955,
981
- "logits/rejected": -2.3621950149536133,
982
- "logps/chosen": -268.20367431640625,
983
- "logps/rejected": -229.4346160888672,
984
- "loss": 0.6658,
985
- "rewards/accuracies": 0.606249988079071,
986
- "rewards/chosen": 0.09442739933729172,
987
- "rewards/margins": 0.0633876845240593,
988
- "rewards/rejected": 0.031039711087942123,
989
  "step": 690
990
  },
991
  {
992
  "epoch": 0.72,
993
  "learning_rate": 1.5384615384615385e-07,
994
- "logits/chosen": -2.2595605850219727,
995
- "logits/rejected": -2.2577292919158936,
996
- "logps/chosen": -282.4584655761719,
997
- "logps/rejected": -222.5489959716797,
998
- "loss": 0.6655,
999
- "rewards/accuracies": 0.637499988079071,
1000
- "rewards/chosen": 0.10156550258398056,
1001
- "rewards/margins": 0.077473945915699,
1002
- "rewards/rejected": 0.024091556668281555,
1003
  "step": 700
1004
  },
1005
  {
1006
  "epoch": 0.73,
1007
  "learning_rate": 1.4810562571756603e-07,
1008
- "logits/chosen": -2.3338966369628906,
1009
- "logits/rejected": -2.2043213844299316,
1010
- "logps/chosen": -272.3428649902344,
1011
- "logps/rejected": -208.0294952392578,
1012
- "loss": 0.6677,
1013
- "rewards/accuracies": 0.6499999761581421,
1014
- "rewards/chosen": 0.09887900203466415,
1015
- "rewards/margins": 0.07613282650709152,
1016
- "rewards/rejected": 0.02274617925286293,
1017
  "step": 710
1018
  },
1019
  {
1020
  "epoch": 0.74,
1021
  "learning_rate": 1.423650975889782e-07,
1022
- "logits/chosen": -2.323683500289917,
1023
- "logits/rejected": -2.339221954345703,
1024
- "logps/chosen": -303.25946044921875,
1025
- "logps/rejected": -259.43353271484375,
1026
- "loss": 0.6673,
1027
- "rewards/accuracies": 0.6187499761581421,
1028
- "rewards/chosen": 0.11013475805521011,
1029
- "rewards/margins": 0.04143274575471878,
1030
- "rewards/rejected": 0.06870199739933014,
1031
  "step": 720
1032
  },
1033
  {
1034
  "epoch": 0.75,
1035
  "learning_rate": 1.3662456946039035e-07,
1036
- "logits/chosen": -2.30271577835083,
1037
- "logits/rejected": -2.2857494354248047,
1038
- "logps/chosen": -270.1637878417969,
1039
- "logps/rejected": -252.5942840576172,
1040
- "loss": 0.6643,
1041
- "rewards/accuracies": 0.6625000238418579,
1042
- "rewards/chosen": 0.10494796186685562,
1043
- "rewards/margins": 0.06293109059333801,
1044
- "rewards/rejected": 0.0420168861746788,
1045
  "step": 730
1046
  },
1047
  {
1048
  "epoch": 0.76,
1049
  "learning_rate": 1.3088404133180254e-07,
1050
- "logits/chosen": -2.2147629261016846,
1051
- "logits/rejected": -2.2660574913024902,
1052
- "logps/chosen": -276.7433776855469,
1053
- "logps/rejected": -199.24557495117188,
1054
- "loss": 0.6643,
1055
- "rewards/accuracies": 0.6499999761581421,
1056
- "rewards/chosen": 0.10866482555866241,
1057
- "rewards/margins": 0.08003364503383636,
1058
- "rewards/rejected": 0.02863118425011635,
1059
  "step": 740
1060
  },
1061
  {
1062
  "epoch": 0.77,
1063
  "learning_rate": 1.251435132032147e-07,
1064
- "logits/chosen": -2.204407215118408,
1065
- "logits/rejected": -2.2218000888824463,
1066
- "logps/chosen": -269.03546142578125,
1067
- "logps/rejected": -220.9041748046875,
1068
- "loss": 0.6651,
1069
- "rewards/accuracies": 0.643750011920929,
1070
- "rewards/chosen": 0.10271243005990982,
1071
- "rewards/margins": 0.04787519946694374,
1072
- "rewards/rejected": 0.054837245494127274,
1073
  "step": 750
1074
  },
1075
  {
1076
  "epoch": 0.78,
1077
  "learning_rate": 1.1940298507462686e-07,
1078
- "logits/chosen": -2.232849597930908,
1079
- "logits/rejected": -2.2518832683563232,
1080
- "logps/chosen": -267.8799743652344,
1081
- "logps/rejected": -249.4871826171875,
1082
- "loss": 0.6674,
1083
- "rewards/accuracies": 0.6625000238418579,
1084
- "rewards/chosen": 0.0854310542345047,
1085
- "rewards/margins": 0.05483313649892807,
1086
- "rewards/rejected": 0.030597921460866928,
1087
  "step": 760
1088
  },
1089
  {
1090
  "epoch": 0.8,
1091
  "learning_rate": 1.1366245694603903e-07,
1092
- "logits/chosen": -2.292814254760742,
1093
- "logits/rejected": -2.2077105045318604,
1094
- "logps/chosen": -273.19989013671875,
1095
- "logps/rejected": -238.66531372070312,
1096
- "loss": 0.6601,
1097
- "rewards/accuracies": 0.6499999761581421,
1098
- "rewards/chosen": 0.11321671307086945,
1099
- "rewards/margins": 0.07481059432029724,
1100
- "rewards/rejected": 0.038406118750572205,
1101
  "step": 770
1102
  },
1103
  {
1104
  "epoch": 0.81,
1105
  "learning_rate": 1.079219288174512e-07,
1106
- "logits/chosen": -2.350830078125,
1107
- "logits/rejected": -2.325340747833252,
1108
- "logps/chosen": -290.97967529296875,
1109
- "logps/rejected": -236.14697265625,
1110
- "loss": 0.6632,
1111
- "rewards/accuracies": 0.625,
1112
- "rewards/chosen": 0.09706258773803711,
1113
- "rewards/margins": 0.07061664760112762,
1114
- "rewards/rejected": 0.026445943862199783,
1115
  "step": 780
1116
  },
1117
  {
1118
  "epoch": 0.82,
1119
  "learning_rate": 1.0218140068886336e-07,
1120
- "logits/chosen": -2.2678112983703613,
1121
- "logits/rejected": -2.2860140800476074,
1122
- "logps/chosen": -270.32318115234375,
1123
- "logps/rejected": -221.13662719726562,
1124
- "loss": 0.6548,
1125
- "rewards/accuracies": 0.6812499761581421,
1126
- "rewards/chosen": 0.12243938446044922,
1127
- "rewards/margins": 0.08886651694774628,
1128
- "rewards/rejected": 0.033572882413864136,
1129
  "step": 790
1130
  },
1131
  {
1132
  "epoch": 0.83,
1133
  "learning_rate": 9.644087256027554e-08,
1134
- "logits/chosen": -2.272566080093384,
1135
- "logits/rejected": -2.293915271759033,
1136
- "logps/chosen": -284.6838073730469,
1137
- "logps/rejected": -243.53854370117188,
1138
- "loss": 0.6637,
1139
- "rewards/accuracies": 0.612500011920929,
1140
- "rewards/chosen": 0.10790219157934189,
1141
- "rewards/margins": 0.0468364879488945,
1142
- "rewards/rejected": 0.06106570363044739,
1143
  "step": 800
1144
  },
1145
  {
1146
  "epoch": 0.84,
1147
  "learning_rate": 9.070034443168771e-08,
1148
- "logits/chosen": -2.283632755279541,
1149
- "logits/rejected": -2.2884037494659424,
1150
- "logps/chosen": -269.58624267578125,
1151
- "logps/rejected": -230.63101196289062,
1152
- "loss": 0.6621,
1153
- "rewards/accuracies": 0.6937500238418579,
1154
- "rewards/chosen": 0.09132520109415054,
1155
- "rewards/margins": 0.06427180022001266,
1156
- "rewards/rejected": 0.02705339714884758,
1157
  "step": 810
1158
  },
1159
  {
1160
  "epoch": 0.85,
1161
  "learning_rate": 8.495981630309988e-08,
1162
- "logits/chosen": -2.366037130355835,
1163
- "logits/rejected": -2.3434507846832275,
1164
- "logps/chosen": -302.0915832519531,
1165
- "logps/rejected": -228.17526245117188,
1166
- "loss": 0.6629,
1167
- "rewards/accuracies": 0.6937500238418579,
1168
- "rewards/chosen": 0.1286555975675583,
1169
- "rewards/margins": 0.09007199853658676,
1170
- "rewards/rejected": 0.038583606481552124,
1171
  "step": 820
1172
  },
1173
  {
1174
  "epoch": 0.86,
1175
  "learning_rate": 7.921928817451206e-08,
1176
- "logits/chosen": -2.3419971466064453,
1177
- "logits/rejected": -2.2245805263519287,
1178
- "logps/chosen": -287.5382995605469,
1179
- "logps/rejected": -222.556640625,
1180
- "loss": 0.6582,
1181
  "rewards/accuracies": 0.675000011920929,
1182
- "rewards/chosen": 0.1244380846619606,
1183
- "rewards/margins": 0.0811501294374466,
1184
- "rewards/rejected": 0.043287962675094604,
1185
  "step": 830
1186
  },
1187
  {
1188
  "epoch": 0.87,
1189
  "learning_rate": 7.347876004592423e-08,
1190
- "logits/chosen": -2.258993625640869,
1191
- "logits/rejected": -2.2267391681671143,
1192
- "logps/chosen": -258.35870361328125,
1193
- "logps/rejected": -217.0255584716797,
1194
- "loss": 0.6722,
1195
- "rewards/accuracies": 0.6625000238418579,
1196
- "rewards/chosen": 0.10194462537765503,
1197
- "rewards/margins": 0.06905193626880646,
1198
- "rewards/rejected": 0.03289269283413887,
1199
  "step": 840
1200
  },
1201
  {
1202
  "epoch": 0.88,
1203
  "learning_rate": 6.773823191733639e-08,
1204
- "logits/chosen": -2.2829337120056152,
1205
- "logits/rejected": -2.3869950771331787,
1206
- "logps/chosen": -262.0351257324219,
1207
- "logps/rejected": -231.09884643554688,
1208
- "loss": 0.6654,
1209
- "rewards/accuracies": 0.65625,
1210
- "rewards/chosen": 0.09651964902877808,
1211
- "rewards/margins": 0.05541490390896797,
1212
- "rewards/rejected": 0.041104745119810104,
1213
  "step": 850
1214
  },
1215
  {
1216
  "epoch": 0.89,
1217
  "learning_rate": 6.199770378874856e-08,
1218
- "logits/chosen": -2.4058268070220947,
1219
- "logits/rejected": -2.3328609466552734,
1220
- "logps/chosen": -295.76080322265625,
1221
- "logps/rejected": -270.1774597167969,
1222
- "loss": 0.6695,
1223
- "rewards/accuracies": 0.6625000238418579,
1224
- "rewards/chosen": 0.10888679325580597,
1225
- "rewards/margins": 0.06958366930484772,
1226
- "rewards/rejected": 0.039303116500377655,
1227
  "step": 860
1228
  },
1229
  {
1230
  "epoch": 0.9,
1231
  "learning_rate": 5.6257175660160735e-08,
1232
- "logits/chosen": -2.245914936065674,
1233
- "logits/rejected": -2.2437186241149902,
1234
- "logps/chosen": -312.9712829589844,
1235
- "logps/rejected": -237.42507934570312,
1236
- "loss": 0.6657,
1237
- "rewards/accuracies": 0.581250011920929,
1238
- "rewards/chosen": 0.1000402420759201,
1239
- "rewards/margins": 0.05334781855344772,
1240
- "rewards/rejected": 0.04669243097305298,
1241
  "step": 870
1242
  },
1243
  {
1244
  "epoch": 0.91,
1245
  "learning_rate": 5.05166475315729e-08,
1246
- "logits/chosen": -2.358290195465088,
1247
- "logits/rejected": -2.312929153442383,
1248
- "logps/chosen": -291.44171142578125,
1249
- "logps/rejected": -240.0635986328125,
1250
- "loss": 0.6646,
1251
- "rewards/accuracies": 0.6812499761581421,
1252
- "rewards/chosen": 0.10663672536611557,
1253
- "rewards/margins": 0.06855263561010361,
1254
- "rewards/rejected": 0.03808409348130226,
1255
  "step": 880
1256
  },
1257
  {
1258
  "epoch": 0.92,
1259
  "learning_rate": 4.477611940298507e-08,
1260
- "logits/chosen": -2.3123860359191895,
1261
- "logits/rejected": -2.3553194999694824,
1262
- "logps/chosen": -285.9127502441406,
1263
- "logps/rejected": -235.38778686523438,
1264
- "loss": 0.6682,
1265
- "rewards/accuracies": 0.706250011920929,
1266
- "rewards/chosen": 0.12196414172649384,
1267
- "rewards/margins": 0.09207607805728912,
1268
- "rewards/rejected": 0.029888058081269264,
1269
  "step": 890
1270
  },
1271
  {
1272
  "epoch": 0.93,
1273
  "learning_rate": 3.903559127439724e-08,
1274
- "logits/chosen": -2.3270044326782227,
1275
- "logits/rejected": -2.1938259601593018,
1276
- "logps/chosen": -272.7822265625,
1277
- "logps/rejected": -211.4164581298828,
1278
- "loss": 0.6597,
1279
- "rewards/accuracies": 0.6812499761581421,
1280
- "rewards/chosen": 0.11634738743305206,
1281
- "rewards/margins": 0.08976142108440399,
1282
- "rewards/rejected": 0.02658594585955143,
1283
  "step": 900
1284
  },
1285
  {
1286
  "epoch": 0.94,
1287
  "learning_rate": 3.3295063145809414e-08,
1288
- "logits/chosen": -2.29099702835083,
1289
- "logits/rejected": -2.344202756881714,
1290
- "logps/chosen": -238.25253295898438,
1291
- "logps/rejected": -206.76351928710938,
1292
- "loss": 0.661,
1293
- "rewards/accuracies": 0.668749988079071,
1294
- "rewards/chosen": 0.10054856538772583,
1295
- "rewards/margins": 0.07191120088100433,
1296
- "rewards/rejected": 0.02863735891878605,
1297
  "step": 910
1298
  },
1299
  {
1300
  "epoch": 0.95,
1301
  "learning_rate": 2.755453501722158e-08,
1302
- "logits/chosen": -2.375382423400879,
1303
- "logits/rejected": -2.3675310611724854,
1304
- "logps/chosen": -281.5444641113281,
1305
- "logps/rejected": -225.140625,
1306
- "loss": 0.6634,
1307
- "rewards/accuracies": 0.59375,
1308
- "rewards/chosen": 0.10897977650165558,
1309
- "rewards/margins": 0.05989484861493111,
1310
- "rewards/rejected": 0.04908492788672447,
1311
  "step": 920
1312
  },
1313
  {
1314
  "epoch": 0.96,
1315
  "learning_rate": 2.1814006888633754e-08,
1316
- "logits/chosen": -2.2819228172302246,
1317
- "logits/rejected": -2.254305839538574,
1318
- "logps/chosen": -256.4382629394531,
1319
- "logps/rejected": -203.33737182617188,
1320
- "loss": 0.6616,
1321
- "rewards/accuracies": 0.706250011920929,
1322
- "rewards/chosen": 0.10740064084529877,
1323
- "rewards/margins": 0.07745448499917984,
1324
- "rewards/rejected": 0.029946163296699524,
1325
  "step": 930
1326
  },
1327
  {
1328
  "epoch": 0.97,
1329
  "learning_rate": 1.6073478760045924e-08,
1330
- "logits/chosen": -2.315403938293457,
1331
- "logits/rejected": -2.311166286468506,
1332
- "logps/chosen": -271.6436462402344,
1333
- "logps/rejected": -231.6937713623047,
1334
- "loss": 0.6646,
1335
- "rewards/accuracies": 0.675000011920929,
1336
- "rewards/chosen": 0.10409040749073029,
1337
- "rewards/margins": 0.06160085275769234,
1338
- "rewards/rejected": 0.04248954728245735,
1339
  "step": 940
1340
  },
1341
  {
1342
  "epoch": 0.98,
1343
  "learning_rate": 1.0332950631458094e-08,
1344
- "logits/chosen": -2.3142313957214355,
1345
- "logits/rejected": -2.278623342514038,
1346
- "logps/chosen": -282.9390869140625,
1347
- "logps/rejected": -233.07308959960938,
1348
- "loss": 0.6638,
1349
  "rewards/accuracies": 0.6812499761581421,
1350
- "rewards/chosen": 0.10391966998577118,
1351
- "rewards/margins": 0.07248817384243011,
1352
- "rewards/rejected": 0.03143150731921196,
1353
  "step": 950
1354
  },
1355
  {
1356
  "epoch": 0.99,
1357
  "learning_rate": 4.592422502870264e-09,
1358
- "logits/chosen": -2.2512423992156982,
1359
- "logits/rejected": -2.2343735694885254,
1360
- "logps/chosen": -281.0455322265625,
1361
- "logps/rejected": -240.0409393310547,
1362
- "loss": 0.6611,
1363
- "rewards/accuracies": 0.6312500238418579,
1364
- "rewards/chosen": 0.10249260812997818,
1365
- "rewards/margins": 0.06933777779340744,
1366
- "rewards/rejected": 0.03315482288599014,
1367
  "step": 960
1368
  },
1369
  {
1370
  "epoch": 1.0,
1371
- "eval_logits/chosen": -2.45943021774292,
1372
- "eval_logits/rejected": -2.39821720123291,
1373
- "eval_logps/chosen": -278.6921081542969,
1374
- "eval_logps/rejected": -230.4866180419922,
1375
- "eval_loss": 0.6637352705001831,
1376
- "eval_rewards/accuracies": 0.6520000100135803,
1377
- "eval_rewards/chosen": 0.1041216179728508,
1378
- "eval_rewards/margins": 0.06707857549190521,
1379
- "eval_rewards/rejected": 0.03704305365681648,
1380
- "eval_runtime": 435.7118,
1381
- "eval_samples_per_second": 4.59,
1382
- "eval_steps_per_second": 0.287,
1383
  "step": 968
1384
  },
1385
  {
1386
  "epoch": 1.0,
1387
  "step": 968,
1388
  "total_flos": 0.0,
1389
- "train_loss": 0.6731863415930882,
1390
- "train_runtime": 27311.2139,
1391
- "train_samples_per_second": 2.269,
1392
  "train_steps_per_second": 0.035
1393
  }
1394
  ],
 
25
  {
26
  "epoch": 0.01,
27
  "learning_rate": 5.154639175257731e-08,
28
+ "logits/chosen": -2.223740339279175,
29
+ "logits/rejected": -2.180643081665039,
30
+ "logps/chosen": -284.7340087890625,
31
+ "logps/rejected": -205.98194885253906,
32
+ "loss": 0.694,
33
+ "rewards/accuracies": 0.4305555522441864,
34
+ "rewards/chosen": -0.0006893649115227163,
35
+ "rewards/margins": 0.0007374237175099552,
36
+ "rewards/rejected": -0.0014267880469560623,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.02,
41
  "learning_rate": 1.0309278350515462e-07,
42
+ "logits/chosen": -2.33476185798645,
43
+ "logits/rejected": -2.2125375270843506,
44
+ "logps/chosen": -320.8204040527344,
45
+ "logps/rejected": -248.4267120361328,
46
+ "loss": 0.692,
47
  "rewards/accuracies": 0.512499988079071,
48
+ "rewards/chosen": 0.0003039050498045981,
49
+ "rewards/margins": 0.0023796656168997288,
50
+ "rewards/rejected": -0.0020757606253027916,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.03,
55
  "learning_rate": 1.5463917525773197e-07,
56
+ "logits/chosen": -2.339370012283325,
57
+ "logits/rejected": -2.304020404815674,
58
+ "logps/chosen": -268.95074462890625,
59
+ "logps/rejected": -227.067626953125,
60
+ "loss": 0.6921,
61
+ "rewards/accuracies": 0.46875,
62
+ "rewards/chosen": 0.0005883350968360901,
63
+ "rewards/margins": 0.002594549907371402,
64
+ "rewards/rejected": -0.0020062148105353117,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.04,
69
  "learning_rate": 2.0618556701030925e-07,
70
+ "logits/chosen": -2.3392791748046875,
71
+ "logits/rejected": -2.3300938606262207,
72
+ "logps/chosen": -308.5113220214844,
73
+ "logps/rejected": -253.8385467529297,
74
+ "loss": 0.6945,
75
+ "rewards/accuracies": 0.44999998807907104,
76
+ "rewards/chosen": 0.0024464379530400038,
77
+ "rewards/margins": -0.00025889737298712134,
78
+ "rewards/rejected": 0.0027053358498960733,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.05,
83
  "learning_rate": 2.5773195876288655e-07,
84
+ "logits/chosen": -2.251412868499756,
85
+ "logits/rejected": -2.2359275817871094,
86
+ "logps/chosen": -297.78375244140625,
87
+ "logps/rejected": -227.23556518554688,
88
+ "loss": 0.6922,
89
+ "rewards/accuracies": 0.5375000238418579,
90
+ "rewards/chosen": 0.0033915191888809204,
91
+ "rewards/margins": 0.0055986023508012295,
92
+ "rewards/rejected": -0.0022070836275815964,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.06,
97
  "learning_rate": 3.0927835051546394e-07,
98
+ "logits/chosen": -2.167163848876953,
99
+ "logits/rejected": -2.3376193046569824,
100
+ "logps/chosen": -256.54510498046875,
101
+ "logps/rejected": -229.5459747314453,
102
+ "loss": 0.6917,
103
+ "rewards/accuracies": 0.4937500059604645,
104
+ "rewards/chosen": 0.000388039683457464,
105
+ "rewards/margins": 0.007883811369538307,
106
+ "rewards/rejected": -0.0074957734905183315,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.07,
111
  "learning_rate": 3.608247422680412e-07,
112
+ "logits/chosen": -2.3430614471435547,
113
+ "logits/rejected": -2.281782627105713,
114
+ "logps/chosen": -313.92608642578125,
115
+ "logps/rejected": -252.57284545898438,
116
+ "loss": 0.6924,
117
+ "rewards/accuracies": 0.46875,
118
+ "rewards/chosen": 0.0012417413527145982,
119
+ "rewards/margins": 0.0001173208438558504,
120
+ "rewards/rejected": 0.0011244199704378843,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.08,
125
  "learning_rate": 4.123711340206185e-07,
126
+ "logits/chosen": -2.337070941925049,
127
+ "logits/rejected": -2.3018112182617188,
128
+ "logps/chosen": -302.9524841308594,
129
+ "logps/rejected": -243.9047088623047,
130
+ "loss": 0.6916,
131
+ "rewards/accuracies": 0.518750011920929,
132
+ "rewards/chosen": 0.0021400884725153446,
133
+ "rewards/margins": -0.0002812549355439842,
134
+ "rewards/rejected": 0.002421343233436346,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.09,
139
  "learning_rate": 4.639175257731959e-07,
140
+ "logits/chosen": -2.259251356124878,
141
+ "logits/rejected": -2.2963995933532715,
142
+ "logps/chosen": -270.1668395996094,
143
+ "logps/rejected": -216.64822387695312,
144
+ "loss": 0.6913,
145
+ "rewards/accuracies": 0.5874999761581421,
146
+ "rewards/chosen": 0.009941437281668186,
147
+ "rewards/margins": 0.010241752490401268,
148
+ "rewards/rejected": -0.00030031436472199857,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.1,
153
  "learning_rate": 4.982778415614236e-07,
154
+ "logits/chosen": -2.1677582263946533,
155
+ "logits/rejected": -2.2741990089416504,
156
+ "logps/chosen": -274.75836181640625,
157
+ "logps/rejected": -226.3966064453125,
158
+ "loss": 0.6901,
159
+ "rewards/accuracies": 0.5375000238418579,
160
+ "rewards/chosen": 0.006115993484854698,
161
+ "rewards/margins": 0.0013887921813875437,
162
+ "rewards/rejected": 0.0047272020019590855,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.11,
167
  "learning_rate": 4.925373134328357e-07,
168
+ "logits/chosen": -2.271916389465332,
169
+ "logits/rejected": -2.197857141494751,
170
+ "logps/chosen": -274.72113037109375,
171
+ "logps/rejected": -232.5464324951172,
172
+ "loss": 0.6886,
173
+ "rewards/accuracies": 0.5375000238418579,
174
+ "rewards/chosen": 0.005831545684486628,
175
+ "rewards/margins": 0.0067709460854530334,
176
+ "rewards/rejected": -0.000939400284551084,
177
  "step": 110
178
  },
179
  {
180
  "epoch": 0.12,
181
  "learning_rate": 4.867967853042479e-07,
182
+ "logits/chosen": -2.2548232078552246,
183
+ "logits/rejected": -2.322075366973877,
184
+ "logps/chosen": -319.34521484375,
185
+ "logps/rejected": -235.76535034179688,
186
+ "loss": 0.689,
187
+ "rewards/accuracies": 0.574999988079071,
188
+ "rewards/chosen": 0.013832703232765198,
189
+ "rewards/margins": 0.01176449190825224,
190
+ "rewards/rejected": 0.002068211790174246,
191
  "step": 120
192
  },
193
  {
194
  "epoch": 0.13,
195
  "learning_rate": 4.810562571756601e-07,
196
+ "logits/chosen": -2.32174015045166,
197
+ "logits/rejected": -2.3775150775909424,
198
+ "logps/chosen": -296.20733642578125,
199
+ "logps/rejected": -245.56655883789062,
200
+ "loss": 0.6875,
201
+ "rewards/accuracies": 0.5687500238418579,
202
+ "rewards/chosen": 0.017552796751260757,
203
+ "rewards/margins": 0.013545483350753784,
204
+ "rewards/rejected": 0.004007314797490835,
205
  "step": 130
206
  },
207
  {
208
  "epoch": 0.14,
209
  "learning_rate": 4.753157290470723e-07,
210
+ "logits/chosen": -2.3627283573150635,
211
+ "logits/rejected": -2.310948133468628,
212
+ "logps/chosen": -301.9321594238281,
213
+ "logps/rejected": -239.2898406982422,
214
+ "loss": 0.688,
215
+ "rewards/accuracies": 0.46875,
216
+ "rewards/chosen": 0.011156091466546059,
217
+ "rewards/margins": 0.009668431244790554,
218
+ "rewards/rejected": 0.0014876595232635736,
219
  "step": 140
220
  },
221
  {
222
  "epoch": 0.15,
223
  "learning_rate": 4.6957520091848447e-07,
224
+ "logits/chosen": -2.2531113624572754,
225
+ "logits/rejected": -2.348215341567993,
226
+ "logps/chosen": -284.4292907714844,
227
+ "logps/rejected": -259.6882019042969,
228
+ "loss": 0.6858,
229
+ "rewards/accuracies": 0.5625,
230
+ "rewards/chosen": 0.017186133190989494,
231
+ "rewards/margins": 0.011862866580486298,
232
+ "rewards/rejected": 0.005323265679180622,
233
  "step": 150
234
  },
235
  {
236
  "epoch": 0.17,
237
  "learning_rate": 4.6383467278989666e-07,
238
+ "logits/chosen": -2.361238956451416,
239
+ "logits/rejected": -2.4430744647979736,
240
+ "logps/chosen": -286.7644348144531,
241
+ "logps/rejected": -221.6837158203125,
242
+ "loss": 0.6857,
243
+ "rewards/accuracies": 0.6187499761581421,
244
+ "rewards/chosen": 0.01911218836903572,
245
+ "rewards/margins": 0.014816234819591045,
246
+ "rewards/rejected": 0.00429595448076725,
247
  "step": 160
248
  },
249
  {
250
  "epoch": 0.18,
251
  "learning_rate": 4.580941446613088e-07,
252
+ "logits/chosen": -2.32244610786438,
253
+ "logits/rejected": -2.3339757919311523,
254
+ "logps/chosen": -301.54693603515625,
255
+ "logps/rejected": -239.26095581054688,
256
+ "loss": 0.6839,
257
+ "rewards/accuracies": 0.5874999761581421,
258
+ "rewards/chosen": 0.023171866312623024,
259
+ "rewards/margins": 0.014685508795082569,
260
+ "rewards/rejected": 0.00848635844886303,
261
  "step": 170
262
  },
263
  {
264
  "epoch": 0.19,
265
  "learning_rate": 4.52353616532721e-07,
266
+ "logits/chosen": -2.347285032272339,
267
+ "logits/rejected": -2.3244121074676514,
268
+ "logps/chosen": -257.841552734375,
269
+ "logps/rejected": -214.5565643310547,
270
+ "loss": 0.6864,
271
+ "rewards/accuracies": 0.5625,
272
+ "rewards/chosen": 0.019994111731648445,
273
+ "rewards/margins": 0.01520625315606594,
274
+ "rewards/rejected": 0.004787858575582504,
275
  "step": 180
276
  },
277
  {
278
  "epoch": 0.2,
279
  "learning_rate": 4.4661308840413316e-07,
280
+ "logits/chosen": -2.2657313346862793,
281
+ "logits/rejected": -2.201254367828369,
282
+ "logps/chosen": -253.98916625976562,
283
+ "logps/rejected": -206.3340301513672,
284
+ "loss": 0.6833,
285
+ "rewards/accuracies": 0.5874999761581421,
286
+ "rewards/chosen": 0.0245305635035038,
287
+ "rewards/margins": 0.017677443102002144,
288
+ "rewards/rejected": 0.006853120867162943,
289
  "step": 190
290
  },
291
  {
292
  "epoch": 0.21,
293
  "learning_rate": 4.408725602755453e-07,
294
+ "logits/chosen": -2.284461498260498,
295
+ "logits/rejected": -2.2873706817626953,
296
+ "logps/chosen": -261.44427490234375,
297
+ "logps/rejected": -195.59422302246094,
298
+ "loss": 0.6835,
299
+ "rewards/accuracies": 0.6000000238418579,
300
+ "rewards/chosen": 0.03187788277864456,
301
+ "rewards/margins": 0.024095263332128525,
302
+ "rewards/rejected": 0.007782619446516037,
303
  "step": 200
304
  },
305
  {
306
  "epoch": 0.22,
307
  "learning_rate": 4.351320321469575e-07,
308
+ "logits/chosen": -2.18426513671875,
309
+ "logits/rejected": -2.1963071823120117,
310
+ "logps/chosen": -302.31195068359375,
311
+ "logps/rejected": -218.6005401611328,
312
+ "loss": 0.6815,
313
+ "rewards/accuracies": 0.612500011920929,
314
+ "rewards/chosen": 0.03718667849898338,
315
+ "rewards/margins": 0.026892077177762985,
316
+ "rewards/rejected": 0.010294605046510696,
317
  "step": 210
318
  },
319
  {
320
  "epoch": 0.23,
321
  "learning_rate": 4.2939150401836967e-07,
322
+ "logits/chosen": -2.2150394916534424,
323
+ "logits/rejected": -2.2160990238189697,
324
+ "logps/chosen": -269.44769287109375,
325
+ "logps/rejected": -235.6748504638672,
326
+ "loss": 0.6801,
327
+ "rewards/accuracies": 0.59375,
328
+ "rewards/chosen": 0.038056183606386185,
329
+ "rewards/margins": 0.023441683501005173,
330
+ "rewards/rejected": 0.014614498242735863,
331
  "step": 220
332
  },
333
  {
334
  "epoch": 0.24,
335
  "learning_rate": 4.236509758897818e-07,
336
+ "logits/chosen": -2.2152469158172607,
337
+ "logits/rejected": -2.1862380504608154,
338
+ "logps/chosen": -271.4049377441406,
339
+ "logps/rejected": -242.6397247314453,
340
  "loss": 0.6826,
341
+ "rewards/accuracies": 0.637499988079071,
342
+ "rewards/chosen": 0.03268683701753616,
343
+ "rewards/margins": 0.026912549510598183,
344
+ "rewards/rejected": 0.0057742842473089695,
345
  "step": 230
346
  },
347
  {
348
  "epoch": 0.25,
349
  "learning_rate": 4.17910447761194e-07,
350
+ "logits/chosen": -2.3059380054473877,
351
+ "logits/rejected": -2.2681984901428223,
352
+ "logps/chosen": -309.55499267578125,
353
+ "logps/rejected": -221.61703491210938,
354
+ "loss": 0.6827,
355
+ "rewards/accuracies": 0.5562499761581421,
356
+ "rewards/chosen": 0.03509462997317314,
357
+ "rewards/margins": 0.012767216190695763,
358
+ "rewards/rejected": 0.02232741378247738,
359
  "step": 240
360
  },
361
  {
362
  "epoch": 0.26,
363
  "learning_rate": 4.121699196326062e-07,
364
+ "logits/chosen": -2.307035446166992,
365
+ "logits/rejected": -2.2920923233032227,
366
+ "logps/chosen": -272.9412841796875,
367
+ "logps/rejected": -237.314208984375,
368
+ "loss": 0.6824,
369
+ "rewards/accuracies": 0.550000011920929,
370
+ "rewards/chosen": 0.03535359352827072,
371
+ "rewards/margins": 0.012216273695230484,
372
+ "rewards/rejected": 0.023137323558330536,
373
  "step": 250
374
  },
375
  {
376
  "epoch": 0.27,
377
  "learning_rate": 4.0642939150401836e-07,
378
+ "logits/chosen": -2.3456673622131348,
379
+ "logits/rejected": -2.3194832801818848,
380
+ "logps/chosen": -270.475341796875,
381
+ "logps/rejected": -221.84536743164062,
382
+ "loss": 0.6805,
383
+ "rewards/accuracies": 0.6187499761581421,
384
+ "rewards/chosen": 0.04569912329316139,
385
+ "rewards/margins": 0.029975151643157005,
386
+ "rewards/rejected": 0.015723969787359238,
387
  "step": 260
388
  },
389
  {
390
  "epoch": 0.28,
391
  "learning_rate": 4.006888633754305e-07,
392
+ "logits/chosen": -2.385854721069336,
393
+ "logits/rejected": -2.3556528091430664,
394
+ "logps/chosen": -284.36029052734375,
395
+ "logps/rejected": -232.5426788330078,
396
+ "loss": 0.6793,
397
+ "rewards/accuracies": 0.5874999761581421,
398
+ "rewards/chosen": 0.04998317360877991,
399
+ "rewards/margins": 0.032010577619075775,
400
+ "rewards/rejected": 0.017972594127058983,
401
  "step": 270
402
  },
403
  {
404
  "epoch": 0.29,
405
  "learning_rate": 3.949483352468427e-07,
406
+ "logits/chosen": -2.308225154876709,
407
+ "logits/rejected": -2.259629726409912,
408
+ "logps/chosen": -293.1715087890625,
409
+ "logps/rejected": -236.4293975830078,
410
+ "loss": 0.6771,
411
+ "rewards/accuracies": 0.581250011920929,
412
+ "rewards/chosen": 0.057786036282777786,
413
+ "rewards/margins": 0.04149205610156059,
414
+ "rewards/rejected": 0.016293983906507492,
415
  "step": 280
416
  },
417
  {
418
  "epoch": 0.3,
419
  "learning_rate": 3.8920780711825487e-07,
420
+ "logits/chosen": -2.278501033782959,
421
+ "logits/rejected": -2.369293689727783,
422
+ "logps/chosen": -278.4786376953125,
423
+ "logps/rejected": -227.40927124023438,
424
+ "loss": 0.6792,
425
+ "rewards/accuracies": 0.5625,
426
+ "rewards/chosen": 0.0485750176012516,
427
+ "rewards/margins": 0.02242155373096466,
428
+ "rewards/rejected": 0.02615346387028694,
429
  "step": 290
430
  },
431
  {
432
  "epoch": 0.31,
433
  "learning_rate": 3.83467278989667e-07,
434
+ "logits/chosen": -2.2661235332489014,
435
+ "logits/rejected": -2.205644130706787,
436
+ "logps/chosen": -254.183837890625,
437
+ "logps/rejected": -221.9667510986328,
438
+ "loss": 0.6772,
439
+ "rewards/accuracies": 0.6499999761581421,
440
+ "rewards/chosen": 0.05874975398182869,
441
+ "rewards/margins": 0.03965791314840317,
442
+ "rewards/rejected": 0.019091838970780373,
443
  "step": 300
444
  },
445
  {
446
  "epoch": 0.32,
447
  "learning_rate": 3.777267508610792e-07,
448
+ "logits/chosen": -2.32353138923645,
449
+ "logits/rejected": -2.3743112087249756,
450
+ "logps/chosen": -306.22711181640625,
451
+ "logps/rejected": -257.60980224609375,
452
+ "loss": 0.6783,
453
+ "rewards/accuracies": 0.59375,
454
+ "rewards/chosen": 0.04823786020278931,
455
+ "rewards/margins": 0.017192820087075233,
456
+ "rewards/rejected": 0.03104504384100437,
457
  "step": 310
458
  },
459
  {
460
  "epoch": 0.33,
461
  "learning_rate": 3.7198622273249137e-07,
462
+ "logits/chosen": -2.234679698944092,
463
+ "logits/rejected": -2.211430788040161,
464
+ "logps/chosen": -251.83053588867188,
465
+ "logps/rejected": -193.01544189453125,
466
+ "loss": 0.6739,
467
+ "rewards/accuracies": 0.637499988079071,
468
+ "rewards/chosen": 0.06583289802074432,
469
+ "rewards/margins": 0.047706056386232376,
470
+ "rewards/rejected": 0.018126841634511948,
471
  "step": 320
472
  },
473
  {
474
  "epoch": 0.34,
475
  "learning_rate": 3.662456946039035e-07,
476
+ "logits/chosen": -2.259127140045166,
477
+ "logits/rejected": -2.287956714630127,
478
+ "logps/chosen": -312.1918029785156,
479
+ "logps/rejected": -239.03530883789062,
480
  "loss": 0.6761,
481
+ "rewards/accuracies": 0.65625,
482
+ "rewards/chosen": 0.07068151980638504,
483
+ "rewards/margins": 0.051512353122234344,
484
+ "rewards/rejected": 0.0191691592335701,
485
  "step": 330
486
  },
487
  {
488
  "epoch": 0.35,
489
  "learning_rate": 3.605051664753157e-07,
490
+ "logits/chosen": -2.197277784347534,
491
+ "logits/rejected": -2.13037109375,
492
+ "logps/chosen": -244.2609100341797,
493
+ "logps/rejected": -238.80953979492188,
494
+ "loss": 0.6788,
495
+ "rewards/accuracies": 0.581250011920929,
496
+ "rewards/chosen": 0.05732797831296921,
497
+ "rewards/margins": 0.030042264610528946,
498
+ "rewards/rejected": 0.027285713702440262,
499
  "step": 340
500
  },
501
  {
502
  "epoch": 0.36,
503
  "learning_rate": 3.547646383467279e-07,
504
+ "logits/chosen": -2.365830421447754,
505
+ "logits/rejected": -2.3728528022766113,
506
+ "logps/chosen": -313.7022705078125,
507
+ "logps/rejected": -248.090087890625,
508
+ "loss": 0.6746,
509
+ "rewards/accuracies": 0.643750011920929,
510
+ "rewards/chosen": 0.08016298711299896,
511
+ "rewards/margins": 0.05509548634290695,
512
+ "rewards/rejected": 0.025067497044801712,
513
  "step": 350
514
  },
515
  {
516
  "epoch": 0.37,
517
  "learning_rate": 3.4902411021814007e-07,
518
+ "logits/chosen": -2.22756290435791,
519
+ "logits/rejected": -2.259359121322632,
520
+ "logps/chosen": -303.25250244140625,
521
+ "logps/rejected": -249.8985595703125,
522
+ "loss": 0.6723,
523
+ "rewards/accuracies": 0.643750011920929,
524
+ "rewards/chosen": 0.06414168328046799,
525
+ "rewards/margins": 0.04363773763179779,
526
+ "rewards/rejected": 0.020503941923379898,
527
  "step": 360
528
  },
529
  {
530
  "epoch": 0.38,
531
  "learning_rate": 3.432835820895522e-07,
532
+ "logits/chosen": -2.3700273036956787,
533
+ "logits/rejected": -2.3231639862060547,
534
+ "logps/chosen": -314.5257263183594,
535
+ "logps/rejected": -270.7105712890625,
536
  "loss": 0.6759,
537
+ "rewards/accuracies": 0.612500011920929,
538
+ "rewards/chosen": 0.07061124593019485,
539
+ "rewards/margins": 0.03391130641102791,
540
+ "rewards/rejected": 0.03669993579387665,
541
  "step": 370
542
  },
543
  {
544
  "epoch": 0.39,
545
  "learning_rate": 3.375430539609644e-07,
546
+ "logits/chosen": -2.3212878704071045,
547
+ "logits/rejected": -2.249602794647217,
548
+ "logps/chosen": -291.92474365234375,
549
+ "logps/rejected": -239.6724395751953,
550
+ "loss": 0.677,
551
+ "rewards/accuracies": 0.6000000238418579,
552
+ "rewards/chosen": 0.07373902946710587,
553
+ "rewards/margins": 0.03367278352379799,
554
+ "rewards/rejected": 0.04006624594330788,
555
  "step": 380
556
  },
557
  {
558
  "epoch": 0.4,
559
  "learning_rate": 3.3180252583237657e-07,
560
+ "logits/chosen": -2.297023057937622,
561
+ "logits/rejected": -2.264172077178955,
562
+ "logps/chosen": -278.0927734375,
563
+ "logps/rejected": -237.13436889648438,
564
+ "loss": 0.6722,
565
+ "rewards/accuracies": 0.668749988079071,
566
+ "rewards/chosen": 0.0686158686876297,
567
+ "rewards/margins": 0.051144860684871674,
568
+ "rewards/rejected": 0.01747100241482258,
569
  "step": 390
570
  },
571
  {
572
  "epoch": 0.41,
573
  "learning_rate": 3.260619977037887e-07,
574
+ "logits/chosen": -2.237035036087036,
575
+ "logits/rejected": -2.2392399311065674,
576
+ "logps/chosen": -263.4399108886719,
577
+ "logps/rejected": -213.87451171875,
578
+ "loss": 0.6707,
579
+ "rewards/accuracies": 0.6812499761581421,
580
+ "rewards/chosen": 0.06768475472927094,
581
+ "rewards/margins": 0.048441771417856216,
582
+ "rewards/rejected": 0.019242987036705017,
583
  "step": 400
584
  },
585
  {
586
  "epoch": 0.42,
587
  "learning_rate": 3.203214695752009e-07,
588
+ "logits/chosen": -2.2776081562042236,
589
+ "logits/rejected": -2.2924447059631348,
590
+ "logps/chosen": -268.8953857421875,
591
+ "logps/rejected": -252.852294921875,
592
+ "loss": 0.6673,
593
+ "rewards/accuracies": 0.6312500238418579,
594
+ "rewards/chosen": 0.08111406862735748,
595
+ "rewards/margins": 0.05318716913461685,
596
+ "rewards/rejected": 0.027926897630095482,
597
  "step": 410
598
  },
599
  {
600
  "epoch": 0.43,
601
  "learning_rate": 3.145809414466131e-07,
602
+ "logits/chosen": -2.3054046630859375,
603
+ "logits/rejected": -2.2502362728118896,
604
+ "logps/chosen": -252.5205841064453,
605
+ "logps/rejected": -204.43344116210938,
606
+ "loss": 0.6749,
607
+ "rewards/accuracies": 0.606249988079071,
608
+ "rewards/chosen": 0.07272285223007202,
609
+ "rewards/margins": 0.04809904843568802,
610
+ "rewards/rejected": 0.024623800069093704,
611
  "step": 420
612
  },
613
  {
614
  "epoch": 0.44,
615
  "learning_rate": 3.0884041331802526e-07,
616
+ "logits/chosen": -2.3482632637023926,
617
+ "logits/rejected": -2.3258707523345947,
618
+ "logps/chosen": -263.67095947265625,
619
+ "logps/rejected": -241.14047241210938,
620
+ "loss": 0.6741,
621
+ "rewards/accuracies": 0.6625000238418579,
622
+ "rewards/chosen": 0.07022975385189056,
623
+ "rewards/margins": 0.04051927849650383,
624
+ "rewards/rejected": 0.029710477218031883,
625
  "step": 430
626
  },
627
  {
628
  "epoch": 0.45,
629
  "learning_rate": 3.030998851894374e-07,
630
+ "logits/chosen": -2.286533832550049,
631
+ "logits/rejected": -2.320568084716797,
632
+ "logps/chosen": -286.72894287109375,
633
+ "logps/rejected": -247.65542602539062,
634
+ "loss": 0.6705,
635
+ "rewards/accuracies": 0.668749988079071,
636
+ "rewards/chosen": 0.07666246592998505,
637
+ "rewards/margins": 0.05972421169281006,
638
+ "rewards/rejected": 0.01693824864923954,
639
  "step": 440
640
  },
641
  {
642
  "epoch": 0.46,
643
  "learning_rate": 2.973593570608496e-07,
644
+ "logits/chosen": -2.206477642059326,
645
+ "logits/rejected": -2.315464496612549,
646
+ "logps/chosen": -276.1682434082031,
647
+ "logps/rejected": -230.3959197998047,
648
+ "loss": 0.678,
649
+ "rewards/accuracies": 0.6187499761581421,
650
+ "rewards/chosen": 0.0718303695321083,
651
+ "rewards/margins": 0.04074189439415932,
652
+ "rewards/rejected": 0.03108847141265869,
653
  "step": 450
654
  },
655
  {
656
  "epoch": 0.47,
657
  "learning_rate": 2.9161882893226177e-07,
658
+ "logits/chosen": -2.277815103530884,
659
+ "logits/rejected": -2.342268705368042,
660
+ "logps/chosen": -273.23773193359375,
661
+ "logps/rejected": -222.5966796875,
662
+ "loss": 0.6662,
663
+ "rewards/accuracies": 0.6187499761581421,
664
+ "rewards/chosen": 0.0802597850561142,
665
+ "rewards/margins": 0.050464123487472534,
666
+ "rewards/rejected": 0.029795657843351364,
667
  "step": 460
668
  },
669
  {
670
  "epoch": 0.49,
671
  "learning_rate": 2.858783008036739e-07,
672
+ "logits/chosen": -2.2656216621398926,
673
+ "logits/rejected": -2.2778594493865967,
674
+ "logps/chosen": -248.9929656982422,
675
+ "logps/rejected": -215.5894012451172,
676
+ "loss": 0.6669,
677
+ "rewards/accuracies": 0.637499988079071,
678
+ "rewards/chosen": 0.08564073592424393,
679
+ "rewards/margins": 0.06490761041641235,
680
+ "rewards/rejected": 0.020733121782541275,
681
  "step": 470
682
  },
683
  {
684
  "epoch": 0.5,
685
  "learning_rate": 2.801377726750861e-07,
686
+ "logits/chosen": -2.2962255477905273,
687
+ "logits/rejected": -2.27239727973938,
688
+ "logps/chosen": -289.5277404785156,
689
+ "logps/rejected": -231.601318359375,
690
+ "loss": 0.6713,
691
+ "rewards/accuracies": 0.6187499761581421,
692
+ "rewards/chosen": 0.08144901692867279,
693
+ "rewards/margins": 0.05658548325300217,
694
+ "rewards/rejected": 0.024863524362444878,
695
  "step": 480
696
  },
697
  {
698
  "epoch": 0.51,
699
  "learning_rate": 2.743972445464983e-07,
700
+ "logits/chosen": -2.445746660232544,
701
+ "logits/rejected": -2.267007827758789,
702
+ "logps/chosen": -293.1885986328125,
703
+ "logps/rejected": -243.8875274658203,
704
+ "loss": 0.6676,
705
+ "rewards/accuracies": 0.6625000238418579,
706
+ "rewards/chosen": 0.10828351974487305,
707
+ "rewards/margins": 0.08175922185182571,
708
+ "rewards/rejected": 0.02652430161833763,
709
  "step": 490
710
  },
711
  {
712
  "epoch": 0.52,
713
  "learning_rate": 2.686567164179104e-07,
714
+ "logits/chosen": -2.278276205062866,
715
+ "logits/rejected": -2.295633316040039,
716
+ "logps/chosen": -254.94760131835938,
717
+ "logps/rejected": -221.79452514648438,
718
+ "loss": 0.6672,
719
+ "rewards/accuracies": 0.6499999761581421,
720
+ "rewards/chosen": 0.08227074891328812,
721
+ "rewards/margins": 0.055896710604429245,
722
+ "rewards/rejected": 0.026374032720923424,
723
  "step": 500
724
  },
725
  {
726
  "epoch": 0.53,
727
  "learning_rate": 2.629161882893226e-07,
728
+ "logits/chosen": -2.202611207962036,
729
+ "logits/rejected": -2.2495861053466797,
730
+ "logps/chosen": -310.4443664550781,
731
+ "logps/rejected": -256.72406005859375,
732
+ "loss": 0.6666,
733
+ "rewards/accuracies": 0.606249988079071,
734
+ "rewards/chosen": 0.07021793723106384,
735
+ "rewards/margins": 0.040728576481342316,
736
+ "rewards/rejected": 0.02948935702443123,
737
  "step": 510
738
  },
739
  {
740
  "epoch": 0.54,
741
  "learning_rate": 2.571756601607348e-07,
742
+ "logits/chosen": -2.3376307487487793,
743
+ "logits/rejected": -2.352074146270752,
744
+ "logps/chosen": -278.10504150390625,
745
+ "logps/rejected": -244.0722198486328,
746
+ "loss": 0.6697,
747
  "rewards/accuracies": 0.6499999761581421,
748
+ "rewards/chosen": 0.0925985723733902,
749
+ "rewards/margins": 0.0637633204460144,
750
+ "rewards/rejected": 0.028835251927375793,
751
  "step": 520
752
  },
753
  {
754
  "epoch": 0.55,
755
  "learning_rate": 2.5143513203214697e-07,
756
+ "logits/chosen": -2.243332624435425,
757
+ "logits/rejected": -2.2513413429260254,
758
+ "logps/chosen": -242.59439086914062,
759
+ "logps/rejected": -224.13259887695312,
760
+ "loss": 0.6716,
761
+ "rewards/accuracies": 0.6187499761581421,
762
+ "rewards/chosen": 0.07866770029067993,
763
+ "rewards/margins": 0.057711243629455566,
764
+ "rewards/rejected": 0.020956454798579216,
765
  "step": 530
766
  },
767
  {
768
  "epoch": 0.56,
769
  "learning_rate": 2.456946039035591e-07,
770
+ "logits/chosen": -2.300567150115967,
771
+ "logits/rejected": -2.271827220916748,
772
+ "logps/chosen": -288.2174377441406,
773
+ "logps/rejected": -240.34439086914062,
774
+ "loss": 0.6682,
775
+ "rewards/accuracies": 0.65625,
776
+ "rewards/chosen": 0.10411250591278076,
777
+ "rewards/margins": 0.05851038545370102,
778
+ "rewards/rejected": 0.04560210928320885,
779
  "step": 540
780
  },
781
  {
782
  "epoch": 0.57,
783
  "learning_rate": 2.399540757749713e-07,
784
+ "logits/chosen": -2.3359756469726562,
785
+ "logits/rejected": -2.194058895111084,
786
+ "logps/chosen": -265.052001953125,
787
+ "logps/rejected": -230.23605346679688,
788
+ "loss": 0.6686,
789
+ "rewards/accuracies": 0.65625,
790
+ "rewards/chosen": 0.0775262787938118,
791
+ "rewards/margins": 0.05575944110751152,
792
+ "rewards/rejected": 0.021766824647784233,
793
  "step": 550
794
  },
795
  {
796
  "epoch": 0.58,
797
  "learning_rate": 2.3421354764638345e-07,
798
+ "logits/chosen": -2.3195242881774902,
799
+ "logits/rejected": -2.283975124359131,
800
+ "logps/chosen": -302.0104064941406,
801
+ "logps/rejected": -252.0124053955078,
802
+ "loss": 0.6708,
803
+ "rewards/accuracies": 0.612500011920929,
804
+ "rewards/chosen": 0.10010389983654022,
805
+ "rewards/margins": 0.053703296929597855,
806
+ "rewards/rejected": 0.04640059918165207,
807
  "step": 560
808
  },
809
  {
810
  "epoch": 0.59,
811
  "learning_rate": 2.2847301951779563e-07,
812
+ "logits/chosen": -2.2481091022491455,
813
+ "logits/rejected": -2.400871515274048,
814
+ "logps/chosen": -268.6519775390625,
815
+ "logps/rejected": -223.69882202148438,
816
+ "loss": 0.6654,
817
+ "rewards/accuracies": 0.65625,
818
+ "rewards/chosen": 0.0826568529009819,
819
+ "rewards/margins": 0.05431235954165459,
820
+ "rewards/rejected": 0.028344491496682167,
821
  "step": 570
822
  },
823
  {
824
  "epoch": 0.6,
825
  "learning_rate": 2.227324913892078e-07,
826
+ "logits/chosen": -2.299408197402954,
827
+ "logits/rejected": -2.22338604927063,
828
+ "logps/chosen": -299.3912353515625,
829
+ "logps/rejected": -236.9815216064453,
830
+ "loss": 0.661,
831
+ "rewards/accuracies": 0.6812499761581421,
832
+ "rewards/chosen": 0.10458721220493317,
833
+ "rewards/margins": 0.08465038239955902,
834
+ "rewards/rejected": 0.019936833530664444,
835
  "step": 580
836
  },
837
  {
838
  "epoch": 0.61,
839
  "learning_rate": 2.1699196326061998e-07,
840
+ "logits/chosen": -2.2584633827209473,
841
+ "logits/rejected": -2.2311649322509766,
842
+ "logps/chosen": -253.76913452148438,
843
+ "logps/rejected": -218.6166534423828,
844
+ "loss": 0.6687,
845
+ "rewards/accuracies": 0.643750011920929,
846
+ "rewards/chosen": 0.07234074175357819,
847
+ "rewards/margins": 0.04758009687066078,
848
+ "rewards/rejected": 0.024760644882917404,
849
  "step": 590
850
  },
851
  {
852
  "epoch": 0.62,
853
  "learning_rate": 2.1125143513203214e-07,
854
+ "logits/chosen": -2.318943738937378,
855
+ "logits/rejected": -2.2511682510375977,
856
+ "logps/chosen": -256.5652770996094,
857
+ "logps/rejected": -206.35586547851562,
858
+ "loss": 0.669,
859
+ "rewards/accuracies": 0.643750011920929,
860
+ "rewards/chosen": 0.07542125880718231,
861
+ "rewards/margins": 0.0553053617477417,
862
+ "rewards/rejected": 0.020115893334150314,
863
  "step": 600
864
  },
865
  {
866
  "epoch": 0.63,
867
  "learning_rate": 2.055109070034443e-07,
868
+ "logits/chosen": -2.3058714866638184,
869
+ "logits/rejected": -2.304198741912842,
870
+ "logps/chosen": -266.4674987792969,
871
+ "logps/rejected": -223.82711791992188,
872
+ "loss": 0.6677,
873
+ "rewards/accuracies": 0.699999988079071,
874
+ "rewards/chosen": 0.09824246913194656,
875
+ "rewards/margins": 0.06738617271184921,
876
+ "rewards/rejected": 0.03085630014538765,
877
  "step": 610
878
  },
879
  {
880
  "epoch": 0.64,
881
  "learning_rate": 1.997703788748565e-07,
882
+ "logits/chosen": -2.337787389755249,
883
+ "logits/rejected": -2.2819180488586426,
884
+ "logps/chosen": -313.7826232910156,
885
+ "logps/rejected": -249.5704803466797,
886
+ "loss": 0.6582,
887
+ "rewards/accuracies": 0.668749988079071,
888
+ "rewards/chosen": 0.10966908931732178,
889
+ "rewards/margins": 0.08016980439424515,
890
+ "rewards/rejected": 0.029499292373657227,
891
  "step": 620
892
  },
893
  {
894
  "epoch": 0.65,
895
  "learning_rate": 1.9402985074626865e-07,
896
+ "logits/chosen": -2.2067112922668457,
897
+ "logits/rejected": -2.246953010559082,
898
+ "logps/chosen": -259.2144775390625,
899
+ "logps/rejected": -240.3810272216797,
900
+ "loss": 0.6653,
901
+ "rewards/accuracies": 0.6875,
902
+ "rewards/chosen": 0.09941162168979645,
903
+ "rewards/margins": 0.06417630612850189,
904
+ "rewards/rejected": 0.035235337913036346,
905
  "step": 630
906
  },
907
  {
908
  "epoch": 0.66,
909
  "learning_rate": 1.8828932261768083e-07,
910
+ "logits/chosen": -2.2894420623779297,
911
+ "logits/rejected": -2.2385382652282715,
912
+ "logps/chosen": -266.48992919921875,
913
+ "logps/rejected": -217.8952178955078,
914
+ "loss": 0.661,
915
+ "rewards/accuracies": 0.65625,
916
+ "rewards/chosen": 0.095299132168293,
917
+ "rewards/margins": 0.07987986505031586,
918
+ "rewards/rejected": 0.01541926246136427,
919
  "step": 640
920
  },
921
  {
922
  "epoch": 0.67,
923
  "learning_rate": 1.82548794489093e-07,
924
+ "logits/chosen": -2.33485746383667,
925
+ "logits/rejected": -2.3108019828796387,
926
+ "logps/chosen": -284.7020568847656,
927
+ "logps/rejected": -232.82080078125,
928
+ "loss": 0.664,
929
+ "rewards/accuracies": 0.6312500238418579,
930
+ "rewards/chosen": 0.10341651737689972,
931
+ "rewards/margins": 0.07464977353811264,
932
+ "rewards/rejected": 0.028766745701432228,
933
  "step": 650
934
  },
935
  {
936
  "epoch": 0.68,
937
  "learning_rate": 1.7680826636050515e-07,
938
+ "logits/chosen": -2.3347816467285156,
939
+ "logits/rejected": -2.2758853435516357,
940
+ "logps/chosen": -279.80059814453125,
941
+ "logps/rejected": -233.2425994873047,
942
  "loss": 0.6608,
943
+ "rewards/accuracies": 0.6812499761581421,
944
+ "rewards/chosen": 0.11068934202194214,
945
+ "rewards/margins": 0.07695071399211884,
946
+ "rewards/rejected": 0.0337386280298233,
947
  "step": 660
948
  },
949
  {
950
  "epoch": 0.69,
951
  "learning_rate": 1.7106773823191734e-07,
952
+ "logits/chosen": -2.2854952812194824,
953
+ "logits/rejected": -2.273536205291748,
954
+ "logps/chosen": -295.6964416503906,
955
+ "logps/rejected": -240.4071502685547,
956
+ "loss": 0.6615,
957
+ "rewards/accuracies": 0.6187499761581421,
958
+ "rewards/chosen": 0.1013779416680336,
959
+ "rewards/margins": 0.060683172196149826,
960
+ "rewards/rejected": 0.04069476202130318,
961
  "step": 670
962
  },
963
  {
964
  "epoch": 0.7,
965
  "learning_rate": 1.653272101033295e-07,
966
+ "logits/chosen": -2.34243106842041,
967
+ "logits/rejected": -2.2720611095428467,
968
+ "logps/chosen": -289.71722412109375,
969
+ "logps/rejected": -230.321533203125,
970
+ "loss": 0.6729,
971
+ "rewards/accuracies": 0.606249988079071,
972
+ "rewards/chosen": 0.09767869859933853,
973
+ "rewards/margins": 0.039280109107494354,
974
+ "rewards/rejected": 0.05839858204126358,
975
  "step": 680
976
  },
977
  {
978
  "epoch": 0.71,
979
  "learning_rate": 1.5958668197474169e-07,
980
+ "logits/chosen": -2.371598482131958,
981
+ "logits/rejected": -2.362656354904175,
982
+ "logps/chosen": -268.17828369140625,
983
+ "logps/rejected": -229.41232299804688,
984
+ "loss": 0.6659,
985
+ "rewards/accuracies": 0.6000000238418579,
986
+ "rewards/chosen": 0.0969640463590622,
987
+ "rewards/margins": 0.06369610875844955,
988
+ "rewards/rejected": 0.033267926424741745,
989
  "step": 690
990
  },
991
  {
992
  "epoch": 0.72,
993
  "learning_rate": 1.5384615384615385e-07,
994
+ "logits/chosen": -2.2588796615600586,
995
+ "logits/rejected": -2.2576823234558105,
996
+ "logps/chosen": -282.4342041015625,
997
+ "logps/rejected": -222.56381225585938,
998
+ "loss": 0.664,
999
+ "rewards/accuracies": 0.65625,
1000
+ "rewards/chosen": 0.10399500280618668,
1001
+ "rewards/margins": 0.08138440549373627,
1002
+ "rewards/rejected": 0.0226106159389019,
1003
  "step": 700
1004
  },
1005
  {
1006
  "epoch": 0.73,
1007
  "learning_rate": 1.4810562571756603e-07,
1008
+ "logits/chosen": -2.3341283798217773,
1009
+ "logits/rejected": -2.2046780586242676,
1010
+ "logps/chosen": -272.2647399902344,
1011
+ "logps/rejected": -208.01364135742188,
1012
+ "loss": 0.666,
1013
+ "rewards/accuracies": 0.699999988079071,
1014
+ "rewards/chosen": 0.10669133812189102,
1015
+ "rewards/margins": 0.08235933631658554,
1016
+ "rewards/rejected": 0.02433200553059578,
1017
  "step": 710
1018
  },
1019
  {
1020
  "epoch": 0.74,
1021
  "learning_rate": 1.423650975889782e-07,
1022
+ "logits/chosen": -2.323979139328003,
1023
+ "logits/rejected": -2.340238094329834,
1024
+ "logps/chosen": -303.2074279785156,
1025
+ "logps/rejected": -259.44268798828125,
1026
+ "loss": 0.6667,
1027
+ "rewards/accuracies": 0.625,
1028
+ "rewards/chosen": 0.11533965170383453,
1029
+ "rewards/margins": 0.047552816569805145,
1030
+ "rewards/rejected": 0.06778682768344879,
1031
  "step": 720
1032
  },
1033
  {
1034
  "epoch": 0.75,
1035
  "learning_rate": 1.3662456946039035e-07,
1036
+ "logits/chosen": -2.3031513690948486,
1037
+ "logits/rejected": -2.28584623336792,
1038
+ "logps/chosen": -270.1670837402344,
1039
+ "logps/rejected": -252.5519256591797,
1040
+ "loss": 0.6642,
1041
+ "rewards/accuracies": 0.668749988079071,
1042
+ "rewards/chosen": 0.10461707413196564,
1043
+ "rewards/margins": 0.058367032557725906,
1044
+ "rewards/rejected": 0.04625004902482033,
1045
  "step": 730
1046
  },
1047
  {
1048
  "epoch": 0.76,
1049
  "learning_rate": 1.3088404133180254e-07,
1050
+ "logits/chosen": -2.2157022953033447,
1051
+ "logits/rejected": -2.2670745849609375,
1052
+ "logps/chosen": -276.71240234375,
1053
+ "logps/rejected": -199.2496795654297,
1054
+ "loss": 0.6635,
1055
+ "rewards/accuracies": 0.668749988079071,
1056
+ "rewards/chosen": 0.11176248639822006,
1057
+ "rewards/margins": 0.08353973925113678,
1058
+ "rewards/rejected": 0.02822275087237358,
1059
  "step": 740
1060
  },
1061
  {
1062
  "epoch": 0.77,
1063
  "learning_rate": 1.251435132032147e-07,
1064
+ "logits/chosen": -2.2043914794921875,
1065
+ "logits/rejected": -2.221619129180908,
1066
+ "logps/chosen": -269.0702819824219,
1067
+ "logps/rejected": -220.8921356201172,
1068
+ "loss": 0.665,
1069
+ "rewards/accuracies": 0.6312500238418579,
1070
+ "rewards/chosen": 0.09922349452972412,
1071
+ "rewards/margins": 0.04318443313241005,
1072
+ "rewards/rejected": 0.05603905767202377,
1073
  "step": 750
1074
  },
1075
  {
1076
  "epoch": 0.78,
1077
  "learning_rate": 1.1940298507462686e-07,
1078
+ "logits/chosen": -2.232959270477295,
1079
+ "logits/rejected": -2.2529525756835938,
1080
+ "logps/chosen": -267.9338684082031,
1081
+ "logps/rejected": -249.4876251220703,
1082
+ "loss": 0.6684,
1083
+ "rewards/accuracies": 0.625,
1084
+ "rewards/chosen": 0.08004304021596909,
1085
+ "rewards/margins": 0.04949140548706055,
1086
+ "rewards/rejected": 0.030551627278327942,
1087
  "step": 760
1088
  },
1089
  {
1090
  "epoch": 0.8,
1091
  "learning_rate": 1.1366245694603903e-07,
1092
+ "logits/chosen": -2.293257236480713,
1093
+ "logits/rejected": -2.2078585624694824,
1094
+ "logps/chosen": -273.19671630859375,
1095
+ "logps/rejected": -238.57858276367188,
1096
+ "loss": 0.661,
1097
+ "rewards/accuracies": 0.625,
1098
+ "rewards/chosen": 0.11353409290313721,
1099
+ "rewards/margins": 0.06645722687244415,
1100
+ "rewards/rejected": 0.04707685858011246,
1101
  "step": 770
1102
  },
1103
  {
1104
  "epoch": 0.81,
1105
  "learning_rate": 1.079219288174512e-07,
1106
+ "logits/chosen": -2.3507869243621826,
1107
+ "logits/rejected": -2.325718879699707,
1108
+ "logps/chosen": -290.9693298339844,
1109
+ "logps/rejected": -236.1486358642578,
1110
+ "loss": 0.6633,
1111
+ "rewards/accuracies": 0.675000011920929,
1112
+ "rewards/chosen": 0.0980958342552185,
1113
+ "rewards/margins": 0.07181811332702637,
1114
+ "rewards/rejected": 0.026277724653482437,
1115
  "step": 780
1116
  },
1117
  {
1118
  "epoch": 0.82,
1119
  "learning_rate": 1.0218140068886336e-07,
1120
+ "logits/chosen": -2.268038272857666,
1121
+ "logits/rejected": -2.286581516265869,
1122
+ "logps/chosen": -270.3387451171875,
1123
+ "logps/rejected": -221.06356811523438,
1124
+ "loss": 0.6564,
1125
+ "rewards/accuracies": 0.6625000238418579,
1126
+ "rewards/chosen": 0.12088136374950409,
1127
+ "rewards/margins": 0.080001600086689,
1128
+ "rewards/rejected": 0.040879763662815094,
1129
  "step": 790
1130
  },
1131
  {
1132
  "epoch": 0.83,
1133
  "learning_rate": 9.644087256027554e-08,
1134
+ "logits/chosen": -2.272735118865967,
1135
+ "logits/rejected": -2.2941083908081055,
1136
+ "logps/chosen": -284.6488952636719,
1137
+ "logps/rejected": -243.56796264648438,
1138
+ "loss": 0.6639,
1139
+ "rewards/accuracies": 0.606249988079071,
1140
+ "rewards/chosen": 0.1113913282752037,
1141
+ "rewards/margins": 0.05327050760388374,
1142
+ "rewards/rejected": 0.05812082439661026,
1143
  "step": 800
1144
  },
1145
  {
1146
  "epoch": 0.84,
1147
  "learning_rate": 9.070034443168771e-08,
1148
+ "logits/chosen": -2.2838375568389893,
1149
+ "logits/rejected": -2.289247751235962,
1150
+ "logps/chosen": -269.5845642089844,
1151
+ "logps/rejected": -230.6207275390625,
1152
+ "loss": 0.6617,
1153
+ "rewards/accuracies": 0.6312500238418579,
1154
+ "rewards/chosen": 0.09149408340454102,
1155
+ "rewards/margins": 0.06341233849525452,
1156
+ "rewards/rejected": 0.02808173932135105,
1157
  "step": 810
1158
  },
1159
  {
1160
  "epoch": 0.85,
1161
  "learning_rate": 8.495981630309988e-08,
1162
+ "logits/chosen": -2.365980863571167,
1163
+ "logits/rejected": -2.3436598777770996,
1164
+ "logps/chosen": -302.0718688964844,
1165
+ "logps/rejected": -228.1407470703125,
1166
+ "loss": 0.6623,
1167
+ "rewards/accuracies": 0.7124999761581421,
1168
+ "rewards/chosen": 0.13062262535095215,
1169
+ "rewards/margins": 0.08858474344015121,
1170
+ "rewards/rejected": 0.04203786700963974,
1171
  "step": 820
1172
  },
1173
  {
1174
  "epoch": 0.86,
1175
  "learning_rate": 7.921928817451206e-08,
1176
+ "logits/chosen": -2.342413902282715,
1177
+ "logits/rejected": -2.2254080772399902,
1178
+ "logps/chosen": -287.4922180175781,
1179
+ "logps/rejected": -222.5606231689453,
1180
+ "loss": 0.6565,
1181
  "rewards/accuracies": 0.675000011920929,
1182
+ "rewards/chosen": 0.12904855608940125,
1183
+ "rewards/margins": 0.08615640550851822,
1184
+ "rewards/rejected": 0.04289213940501213,
1185
  "step": 830
1186
  },
1187
  {
1188
  "epoch": 0.87,
1189
  "learning_rate": 7.347876004592423e-08,
1190
+ "logits/chosen": -2.259397029876709,
1191
+ "logits/rejected": -2.227036476135254,
1192
+ "logps/chosen": -258.3423767089844,
1193
+ "logps/rejected": -216.99606323242188,
1194
+ "loss": 0.6714,
1195
+ "rewards/accuracies": 0.6499999761581421,
1196
+ "rewards/chosen": 0.10358164459466934,
1197
+ "rewards/margins": 0.06773830950260162,
1198
+ "rewards/rejected": 0.03584333881735802,
1199
  "step": 840
1200
  },
1201
  {
1202
  "epoch": 0.88,
1203
  "learning_rate": 6.773823191733639e-08,
1204
+ "logits/chosen": -2.2834537029266357,
1205
+ "logits/rejected": -2.3872971534729004,
1206
+ "logps/chosen": -262.05084228515625,
1207
+ "logps/rejected": -231.11306762695312,
1208
+ "loss": 0.6647,
1209
+ "rewards/accuracies": 0.625,
1210
+ "rewards/chosen": 0.09495140612125397,
1211
+ "rewards/margins": 0.055265575647354126,
1212
+ "rewards/rejected": 0.03968583419919014,
1213
  "step": 850
1214
  },
1215
  {
1216
  "epoch": 0.89,
1217
  "learning_rate": 6.199770378874856e-08,
1218
+ "logits/chosen": -2.4065003395080566,
1219
+ "logits/rejected": -2.3337345123291016,
1220
+ "logps/chosen": -295.71478271484375,
1221
+ "logps/rejected": -270.1822814941406,
1222
+ "loss": 0.6693,
1223
+ "rewards/accuracies": 0.65625,
1224
+ "rewards/chosen": 0.11348612606525421,
1225
+ "rewards/margins": 0.07466179132461548,
1226
+ "rewards/rejected": 0.03882431983947754,
1227
  "step": 860
1228
  },
1229
  {
1230
  "epoch": 0.9,
1231
  "learning_rate": 5.6257175660160735e-08,
1232
+ "logits/chosen": -2.2463555335998535,
1233
+ "logits/rejected": -2.2443947792053223,
1234
+ "logps/chosen": -312.9588317871094,
1235
+ "logps/rejected": -237.4109344482422,
1236
+ "loss": 0.6644,
1237
+ "rewards/accuracies": 0.59375,
1238
+ "rewards/chosen": 0.10128283500671387,
1239
+ "rewards/margins": 0.053178369998931885,
1240
+ "rewards/rejected": 0.04810447618365288,
1241
  "step": 870
1242
  },
1243
  {
1244
  "epoch": 0.91,
1245
  "learning_rate": 5.05166475315729e-08,
1246
+ "logits/chosen": -2.358501434326172,
1247
+ "logits/rejected": -2.313483715057373,
1248
+ "logps/chosen": -291.43377685546875,
1249
+ "logps/rejected": -240.09054565429688,
1250
+ "loss": 0.6632,
1251
+ "rewards/accuracies": 0.675000011920929,
1252
+ "rewards/chosen": 0.10742716491222382,
1253
+ "rewards/margins": 0.07204015552997589,
1254
+ "rewards/rejected": 0.03538701683282852,
1255
  "step": 880
1256
  },
1257
  {
1258
  "epoch": 0.92,
1259
  "learning_rate": 4.477611940298507e-08,
1260
+ "logits/chosen": -2.313149929046631,
1261
+ "logits/rejected": -2.3558261394500732,
1262
+ "logps/chosen": -285.90643310546875,
1263
+ "logps/rejected": -235.43051147460938,
1264
+ "loss": 0.6666,
1265
+ "rewards/accuracies": 0.6625000238418579,
1266
+ "rewards/chosen": 0.12259715795516968,
1267
+ "rewards/margins": 0.09698096662759781,
1268
+ "rewards/rejected": 0.02561618760228157,
1269
  "step": 890
1270
  },
1271
  {
1272
  "epoch": 0.93,
1273
  "learning_rate": 3.903559127439724e-08,
1274
+ "logits/chosen": -2.3278651237487793,
1275
+ "logits/rejected": -2.195068836212158,
1276
+ "logps/chosen": -272.7381896972656,
1277
+ "logps/rejected": -211.40640258789062,
1278
+ "loss": 0.658,
1279
+ "rewards/accuracies": 0.6937500238418579,
1280
+ "rewards/chosen": 0.1207551583647728,
1281
+ "rewards/margins": 0.09316142648458481,
1282
+ "rewards/rejected": 0.027593741193413734,
1283
  "step": 900
1284
  },
1285
  {
1286
  "epoch": 0.94,
1287
  "learning_rate": 3.3295063145809414e-08,
1288
+ "logits/chosen": -2.290696859359741,
1289
+ "logits/rejected": -2.3440823554992676,
1290
+ "logps/chosen": -238.2651824951172,
1291
+ "logps/rejected": -206.77969360351562,
1292
+ "loss": 0.6616,
1293
+ "rewards/accuracies": 0.6499999761581421,
1294
+ "rewards/chosen": 0.09928463399410248,
1295
+ "rewards/margins": 0.07226204872131348,
1296
+ "rewards/rejected": 0.027022594586014748,
1297
  "step": 910
1298
  },
1299
  {
1300
  "epoch": 0.95,
1301
  "learning_rate": 2.755453501722158e-08,
1302
+ "logits/chosen": -2.375807762145996,
1303
+ "logits/rejected": -2.367743730545044,
1304
+ "logps/chosen": -281.56195068359375,
1305
+ "logps/rejected": -225.125244140625,
1306
+ "loss": 0.662,
1307
+ "rewards/accuracies": 0.581250011920929,
1308
+ "rewards/chosen": 0.1072310209274292,
1309
+ "rewards/margins": 0.056608647108078,
1310
+ "rewards/rejected": 0.050622373819351196,
1311
  "step": 920
1312
  },
1313
  {
1314
  "epoch": 0.96,
1315
  "learning_rate": 2.1814006888633754e-08,
1316
+ "logits/chosen": -2.281919002532959,
1317
+ "logits/rejected": -2.254122734069824,
1318
+ "logps/chosen": -256.39105224609375,
1319
+ "logps/rejected": -203.3081817626953,
1320
+ "loss": 0.6617,
1321
+ "rewards/accuracies": 0.643750011920929,
1322
+ "rewards/chosen": 0.11211923509836197,
1323
+ "rewards/margins": 0.07925260812044144,
1324
+ "rewards/rejected": 0.03286661207675934,
1325
  "step": 930
1326
  },
1327
  {
1328
  "epoch": 0.97,
1329
  "learning_rate": 1.6073478760045924e-08,
1330
+ "logits/chosen": -2.316282272338867,
1331
+ "logits/rejected": -2.3123340606689453,
1332
+ "logps/chosen": -271.6207580566406,
1333
+ "logps/rejected": -231.7317352294922,
1334
+ "loss": 0.6626,
1335
+ "rewards/accuracies": 0.706250011920929,
1336
+ "rewards/chosen": 0.10637687146663666,
1337
+ "rewards/margins": 0.06768520176410675,
1338
+ "rewards/rejected": 0.0386916846036911,
1339
  "step": 940
1340
  },
1341
  {
1342
  "epoch": 0.98,
1343
  "learning_rate": 1.0332950631458094e-08,
1344
+ "logits/chosen": -2.3146958351135254,
1345
+ "logits/rejected": -2.2793381214141846,
1346
+ "logps/chosen": -282.83270263671875,
1347
+ "logps/rejected": -233.0804443359375,
1348
+ "loss": 0.6612,
1349
  "rewards/accuracies": 0.6812499761581421,
1350
+ "rewards/chosen": 0.11455857753753662,
1351
+ "rewards/margins": 0.0838586837053299,
1352
+ "rewards/rejected": 0.030699897557497025,
1353
  "step": 950
1354
  },
1355
  {
1356
  "epoch": 0.99,
1357
  "learning_rate": 4.592422502870264e-09,
1358
+ "logits/chosen": -2.251638889312744,
1359
+ "logits/rejected": -2.234907627105713,
1360
+ "logps/chosen": -281.0075378417969,
1361
+ "logps/rejected": -239.98049926757812,
1362
+ "loss": 0.661,
1363
+ "rewards/accuracies": 0.65625,
1364
+ "rewards/chosen": 0.1062885969877243,
1365
+ "rewards/margins": 0.06708581745624542,
1366
+ "rewards/rejected": 0.03920278698205948,
1367
  "step": 960
1368
  },
1369
  {
1370
  "epoch": 1.0,
1371
+ "eval_logits/chosen": -2.4597132205963135,
1372
+ "eval_logits/rejected": -2.398695468902588,
1373
+ "eval_logps/chosen": -278.69171142578125,
1374
+ "eval_logps/rejected": -230.4560089111328,
1375
+ "eval_loss": 0.6642152070999146,
1376
+ "eval_rewards/accuracies": 0.6480000019073486,
1377
+ "eval_rewards/chosen": 0.10415761172771454,
1378
+ "eval_rewards/margins": 0.06405296921730042,
1379
+ "eval_rewards/rejected": 0.04010463133454323,
1380
+ "eval_runtime": 443.9432,
1381
+ "eval_samples_per_second": 4.505,
1382
+ "eval_steps_per_second": 0.282,
1383
  "step": 968
1384
  },
1385
  {
1386
  "epoch": 1.0,
1387
  "step": 968,
1388
  "total_flos": 0.0,
1389
+ "train_loss": 0.6728762634529555,
1390
+ "train_runtime": 27528.1814,
1391
+ "train_samples_per_second": 2.251,
1392
  "train_steps_per_second": 0.035
1393
  }
1394
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e17c7a7d2c11078dfab2a74b3be402684b85c57187f1c9e190575380bb18b7e5
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc6fa65238373edb8c038b73d0de99649ac0d248e697a0222bd24510217b308
3
  size 4792