selectorseb commited on
Commit
2354d04
·
verified ·
1 Parent(s): 3f77388

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -23,11 +23,11 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "up_proj",
27
- "gate_proj",
28
  "o_proj",
29
- "down_proj",
30
  "k_proj",
 
 
 
31
  "v_proj",
32
  "q_proj"
33
  ],
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "o_proj",
 
27
  "k_proj",
28
+ "up_proj",
29
+ "down_proj",
30
+ "gate_proj",
31
  "v_proj",
32
  "q_proj"
33
  ],
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d04441d0efe62beb5505e17968d1f070584e9b09d918e82b35df3e76aa9e9bef
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f803ca1ea6e7a15322303eb997266e3c083ce8bea91ba5d1fc507f3521a61248
3
  size 83946192
checkpoint-10/adapter_config.json CHANGED
@@ -23,11 +23,11 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "up_proj",
27
- "gate_proj",
28
  "o_proj",
29
- "down_proj",
30
  "k_proj",
 
 
 
31
  "v_proj",
32
  "q_proj"
33
  ],
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "o_proj",
 
27
  "k_proj",
28
+ "up_proj",
29
+ "down_proj",
30
+ "gate_proj",
31
  "v_proj",
32
  "q_proj"
33
  ],
checkpoint-10/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4303b93069b3d1dcda75f3d73a6bbe59b0c1ed1f66c46f47e2af45c622182675
3
  size 6388184376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4543cfdedc0fcc5febfb1f61b6cd94a9a1232990c59af06db637d601072a09ff
3
  size 6388184376
checkpoint-10/global_step10/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8d20edc76bb73b85283833642c85ce0b890198fa3a9ba4e2bdcc3d6a4789df1
3
  size 13111840636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4512f1d11620f0d39a231b5f06d9c7b99c84b1b64ded7b452c7b149534a495a
3
  size 13111840636
checkpoint-10/global_step10/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c241a7950ad3e86057e0debe8807dacf6eb43108a369072aff9c6379da5c302
3
  size 2185646124
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4bb49133a4b11d3d18aca6bc3a05aa800bc2a934b7227bd2cb77f3c0cc92c3e
3
  size 2185646124
checkpoint-10/trainer_state.json CHANGED
@@ -10,37 +10,37 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 22.02356719970703,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
- "grad_norm": 21.681835174560547,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
- "grad_norm": 12.661576271057129,
28
  "learning_rate": 1.9458172417006347e-05,
29
- "loss": 1.1806,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
- "grad_norm": 9.024881362915039,
35
  "learning_rate": 1.879473751206489e-05,
36
- "loss": 1.0682,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
- "grad_norm": 8.294164657592773,
42
  "learning_rate": 1.789140509396394e-05,
43
- "loss": 0.9835,
44
  "step": 5
45
  },
46
  {
@@ -59,47 +59,47 @@
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
- "eval_loss": 0.8951861262321472,
63
- "eval_perplexity": 1.010767880290677,
64
- "eval_runtime": 0.8904,
65
- "eval_samples_per_second": 1.123,
66
- "eval_steps_per_second": 1.123,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
- "grad_norm": 8.226706504821777,
73
  "learning_rate": 1.6772815716257414e-05,
74
- "loss": 0.8936,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
- "grad_norm": 8.382328987121582,
80
  "learning_rate": 1.5469481581224274e-05,
81
- "loss": 0.8304,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
- "grad_norm": 7.861330986022949,
87
  "learning_rate": 1.4016954246529697e-05,
88
- "loss": 0.7703,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
- "grad_norm": 7.340066909790039,
94
  "learning_rate": 1.2454854871407993e-05,
95
- "loss": 0.7171,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
- "grad_norm": 6.915948390960693,
101
  "learning_rate": 1.0825793454723325e-05,
102
- "loss": 0.6641,
103
  "step": 10
104
  },
105
  {
@@ -118,11 +118,11 @@
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
- "eval_loss": 0.6229805946350098,
122
- "eval_perplexity": 1.0074814049420708,
123
- "eval_runtime": 0.8828,
124
- "eval_samples_per_second": 1.133,
125
- "eval_steps_per_second": 1.133,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 21.60628890991211,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
+ "grad_norm": 21.794620513916016,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
+ "grad_norm": 13.444245338439941,
28
  "learning_rate": 1.9458172417006347e-05,
29
+ "loss": 1.1815,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
+ "grad_norm": 9.568578720092773,
35
  "learning_rate": 1.879473751206489e-05,
36
+ "loss": 1.0718,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
+ "grad_norm": 8.503974914550781,
42
  "learning_rate": 1.789140509396394e-05,
43
+ "loss": 0.9886,
44
  "step": 5
45
  },
46
  {
 
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
+ "eval_loss": 0.897951602935791,
63
+ "eval_perplexity": 1.0108013242189644,
64
+ "eval_runtime": 0.885,
65
+ "eval_samples_per_second": 1.13,
66
+ "eval_steps_per_second": 1.13,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
+ "grad_norm": 8.138020515441895,
73
  "learning_rate": 1.6772815716257414e-05,
74
+ "loss": 0.8967,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
+ "grad_norm": 8.324158668518066,
80
  "learning_rate": 1.5469481581224274e-05,
81
+ "loss": 0.8374,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
+ "grad_norm": 8.041045188903809,
87
  "learning_rate": 1.4016954246529697e-05,
88
+ "loss": 0.7753,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
+ "grad_norm": 7.898632526397705,
94
  "learning_rate": 1.2454854871407993e-05,
95
+ "loss": 0.7203,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
+ "grad_norm": 6.87092924118042,
101
  "learning_rate": 1.0825793454723325e-05,
102
+ "loss": 0.6677,
103
  "step": 10
104
  },
105
  {
 
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
+ "eval_loss": 0.627693235874176,
122
+ "eval_perplexity": 1.0075382125074093,
123
+ "eval_runtime": 0.878,
124
+ "eval_samples_per_second": 1.139,
125
+ "eval_steps_per_second": 1.139,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  }
checkpoint-10/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67586136f4609abc9357461142a2bba22693796be0e1eef7d79dfa07d236f385
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f11e825a5b1e7bd17e78d2e5d730d3c993fdb3d39a70eb784fa37a52c0535935
3
  size 6648
checkpoint-15/adapter_config.json CHANGED
@@ -23,11 +23,11 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "up_proj",
27
- "gate_proj",
28
  "o_proj",
29
- "down_proj",
30
  "k_proj",
 
 
 
31
  "v_proj",
32
  "q_proj"
33
  ],
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "o_proj",
 
27
  "k_proj",
28
+ "up_proj",
29
+ "down_proj",
30
+ "gate_proj",
31
  "v_proj",
32
  "q_proj"
33
  ],
checkpoint-15/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f680bd627283c65ac4facc923e2f2ba913b34cb597f5e753ad30487d47c3abbe
3
  size 6388184376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26d67a87994a8d25f0c7632ccc9143f98c14dce3496c77ce95a3129ebd5167f0
3
  size 6388184376
checkpoint-15/global_step15/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ec791a4120d29d1533afb70f0552bbf6c33ad37acc48954401b097fa6797121
3
  size 13111840636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d0e07725d5bfd8c325d61d3b0ba6650b88010dbd1ad034c55f93edbb07cfb9d
3
  size 13111840636
checkpoint-15/global_step15/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28cd286dc47facbb1a1289b23b9470509461941fff3768ef5a9089b21764e4d2
3
  size 2185646124
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aa9c5e4eda56c648fe24e82060d791f7edd40024d239744fa3dc89fb7292b16
3
  size 2185646124
checkpoint-15/trainer_state.json CHANGED
@@ -10,37 +10,37 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 22.02356719970703,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
- "grad_norm": 21.681835174560547,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
- "grad_norm": 12.661576271057129,
28
  "learning_rate": 1.9458172417006347e-05,
29
- "loss": 1.1806,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
- "grad_norm": 9.024881362915039,
35
  "learning_rate": 1.879473751206489e-05,
36
- "loss": 1.0682,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
- "grad_norm": 8.294164657592773,
42
  "learning_rate": 1.789140509396394e-05,
43
- "loss": 0.9835,
44
  "step": 5
45
  },
46
  {
@@ -59,47 +59,47 @@
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
- "eval_loss": 0.8951861262321472,
63
- "eval_perplexity": 1.010767880290677,
64
- "eval_runtime": 0.8904,
65
- "eval_samples_per_second": 1.123,
66
- "eval_steps_per_second": 1.123,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
- "grad_norm": 8.226706504821777,
73
  "learning_rate": 1.6772815716257414e-05,
74
- "loss": 0.8936,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
- "grad_norm": 8.382328987121582,
80
  "learning_rate": 1.5469481581224274e-05,
81
- "loss": 0.8304,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
- "grad_norm": 7.861330986022949,
87
  "learning_rate": 1.4016954246529697e-05,
88
- "loss": 0.7703,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
- "grad_norm": 7.340066909790039,
94
  "learning_rate": 1.2454854871407993e-05,
95
- "loss": 0.7171,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
- "grad_norm": 6.915948390960693,
101
  "learning_rate": 1.0825793454723325e-05,
102
- "loss": 0.6641,
103
  "step": 10
104
  },
105
  {
@@ -118,47 +118,47 @@
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
- "eval_loss": 0.6229805946350098,
122
- "eval_perplexity": 1.0074814049420708,
123
- "eval_runtime": 0.8828,
124
- "eval_samples_per_second": 1.133,
125
- "eval_steps_per_second": 1.133,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  },
129
  {
130
  "epoch": 11.0,
131
- "grad_norm": 6.680131912231445,
132
  "learning_rate": 9.174206545276678e-06,
133
- "loss": 0.6224,
134
  "step": 11
135
  },
136
  {
137
  "epoch": 12.0,
138
- "grad_norm": 5.9928297996521,
139
  "learning_rate": 7.545145128592009e-06,
140
- "loss": 0.5933,
141
  "step": 12
142
  },
143
  {
144
  "epoch": 13.0,
145
- "grad_norm": 5.214223861694336,
146
  "learning_rate": 5.983045753470308e-06,
147
- "loss": 0.5704,
148
  "step": 13
149
  },
150
  {
151
  "epoch": 14.0,
152
- "grad_norm": 4.801523685455322,
153
  "learning_rate": 4.530518418775734e-06,
154
- "loss": 0.5552,
155
  "step": 14
156
  },
157
  {
158
  "epoch": 15.0,
159
- "grad_norm": 4.650667667388916,
160
  "learning_rate": 3.2271842837425917e-06,
161
- "loss": 0.545,
162
  "step": 15
163
  },
164
  {
@@ -177,11 +177,11 @@
177
  "eval_accuracy_total_num_<|stop|>": 2,
178
  "eval_first_token_param_values": 0.9,
179
  "eval_first_token_param_values_total": 10,
180
- "eval_loss": 0.5376743674278259,
181
- "eval_perplexity": 1.006453660117816,
182
- "eval_runtime": 0.8811,
183
- "eval_samples_per_second": 1.135,
184
- "eval_steps_per_second": 1.135,
185
  "eval_total_number_first_token": 2,
186
  "step": 15
187
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 21.60628890991211,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
+ "grad_norm": 21.794620513916016,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
+ "grad_norm": 13.444245338439941,
28
  "learning_rate": 1.9458172417006347e-05,
29
+ "loss": 1.1815,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
+ "grad_norm": 9.568578720092773,
35
  "learning_rate": 1.879473751206489e-05,
36
+ "loss": 1.0718,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
+ "grad_norm": 8.503974914550781,
42
  "learning_rate": 1.789140509396394e-05,
43
+ "loss": 0.9886,
44
  "step": 5
45
  },
46
  {
 
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
+ "eval_loss": 0.897951602935791,
63
+ "eval_perplexity": 1.0108013242189644,
64
+ "eval_runtime": 0.885,
65
+ "eval_samples_per_second": 1.13,
66
+ "eval_steps_per_second": 1.13,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
+ "grad_norm": 8.138020515441895,
73
  "learning_rate": 1.6772815716257414e-05,
74
+ "loss": 0.8967,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
+ "grad_norm": 8.324158668518066,
80
  "learning_rate": 1.5469481581224274e-05,
81
+ "loss": 0.8374,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
+ "grad_norm": 8.041045188903809,
87
  "learning_rate": 1.4016954246529697e-05,
88
+ "loss": 0.7753,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
+ "grad_norm": 7.898632526397705,
94
  "learning_rate": 1.2454854871407993e-05,
95
+ "loss": 0.7203,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
+ "grad_norm": 6.87092924118042,
101
  "learning_rate": 1.0825793454723325e-05,
102
+ "loss": 0.6677,
103
  "step": 10
104
  },
105
  {
 
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
+ "eval_loss": 0.627693235874176,
122
+ "eval_perplexity": 1.0075382125074093,
123
+ "eval_runtime": 0.878,
124
+ "eval_samples_per_second": 1.139,
125
+ "eval_steps_per_second": 1.139,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  },
129
  {
130
  "epoch": 11.0,
131
+ "grad_norm": 6.593441009521484,
132
  "learning_rate": 9.174206545276678e-06,
133
+ "loss": 0.6275,
134
  "step": 11
135
  },
136
  {
137
  "epoch": 12.0,
138
+ "grad_norm": 6.028723239898682,
139
  "learning_rate": 7.545145128592009e-06,
140
+ "loss": 0.5971,
141
  "step": 12
142
  },
143
  {
144
  "epoch": 13.0,
145
+ "grad_norm": 5.46033239364624,
146
  "learning_rate": 5.983045753470308e-06,
147
+ "loss": 0.5745,
148
  "step": 13
149
  },
150
  {
151
  "epoch": 14.0,
152
+ "grad_norm": 4.930461883544922,
153
  "learning_rate": 4.530518418775734e-06,
154
+ "loss": 0.5558,
155
  "step": 14
156
  },
157
  {
158
  "epoch": 15.0,
159
+ "grad_norm": 4.64890193939209,
160
  "learning_rate": 3.2271842837425917e-06,
161
+ "loss": 0.5455,
162
  "step": 15
163
  },
164
  {
 
177
  "eval_accuracy_total_num_<|stop|>": 2,
178
  "eval_first_token_param_values": 0.9,
179
  "eval_first_token_param_values_total": 10,
180
+ "eval_loss": 0.5380075573921204,
181
+ "eval_perplexity": 1.0064576742544533,
182
+ "eval_runtime": 0.8779,
183
+ "eval_samples_per_second": 1.139,
184
+ "eval_steps_per_second": 1.139,
185
  "eval_total_number_first_token": 2,
186
  "step": 15
187
  }
checkpoint-15/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67586136f4609abc9357461142a2bba22693796be0e1eef7d79dfa07d236f385
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f11e825a5b1e7bd17e78d2e5d730d3c993fdb3d39a70eb784fa37a52c0535935
3
  size 6648
checkpoint-20/adapter_config.json CHANGED
@@ -23,11 +23,11 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "up_proj",
27
- "gate_proj",
28
  "o_proj",
29
- "down_proj",
30
  "k_proj",
 
 
 
31
  "v_proj",
32
  "q_proj"
33
  ],
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "o_proj",
 
27
  "k_proj",
28
+ "up_proj",
29
+ "down_proj",
30
+ "gate_proj",
31
  "v_proj",
32
  "q_proj"
33
  ],
checkpoint-20/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ee2c9bf1fd62002efcd446cc6861c0eac27db5f8271388127187ab30b6b9e7a
3
  size 6388184376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da1f9ccdca20706dfbff0f0b107be0aff7333231031881696d0ccbc9343a107
3
  size 6388184376
checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e69da5ee43cefb69af895104ab25bf7300c33a5865e64407b330350490dbbbe4
3
  size 13111840636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb5a5b854f34b1f12423ae9102619fb0608bdd4853641f40fd40c06a1ad51dd
3
  size 13111840636
checkpoint-20/global_step20/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31b69268e5911c4d0180e1b34274f3728e65f6379fe8de91e5821d4d120ffb44
3
  size 2185646124
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb70443d8223302e5d4013f1a070cb79efc19fab47ded52e758969f2ef9d331
3
  size 2185646124
checkpoint-20/trainer_state.json CHANGED
@@ -10,37 +10,37 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 22.02356719970703,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
- "grad_norm": 21.681835174560547,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
- "grad_norm": 12.661576271057129,
28
  "learning_rate": 1.9458172417006347e-05,
29
- "loss": 1.1806,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
- "grad_norm": 9.024881362915039,
35
  "learning_rate": 1.879473751206489e-05,
36
- "loss": 1.0682,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
- "grad_norm": 8.294164657592773,
42
  "learning_rate": 1.789140509396394e-05,
43
- "loss": 0.9835,
44
  "step": 5
45
  },
46
  {
@@ -59,47 +59,47 @@
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
- "eval_loss": 0.8951861262321472,
63
- "eval_perplexity": 1.010767880290677,
64
- "eval_runtime": 0.8904,
65
- "eval_samples_per_second": 1.123,
66
- "eval_steps_per_second": 1.123,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
- "grad_norm": 8.226706504821777,
73
  "learning_rate": 1.6772815716257414e-05,
74
- "loss": 0.8936,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
- "grad_norm": 8.382328987121582,
80
  "learning_rate": 1.5469481581224274e-05,
81
- "loss": 0.8304,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
- "grad_norm": 7.861330986022949,
87
  "learning_rate": 1.4016954246529697e-05,
88
- "loss": 0.7703,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
- "grad_norm": 7.340066909790039,
94
  "learning_rate": 1.2454854871407993e-05,
95
- "loss": 0.7171,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
- "grad_norm": 6.915948390960693,
101
  "learning_rate": 1.0825793454723325e-05,
102
- "loss": 0.6641,
103
  "step": 10
104
  },
105
  {
@@ -118,47 +118,47 @@
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
- "eval_loss": 0.6229805946350098,
122
- "eval_perplexity": 1.0074814049420708,
123
- "eval_runtime": 0.8828,
124
- "eval_samples_per_second": 1.133,
125
- "eval_steps_per_second": 1.133,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  },
129
  {
130
  "epoch": 11.0,
131
- "grad_norm": 6.680131912231445,
132
  "learning_rate": 9.174206545276678e-06,
133
- "loss": 0.6224,
134
  "step": 11
135
  },
136
  {
137
  "epoch": 12.0,
138
- "grad_norm": 5.9928297996521,
139
  "learning_rate": 7.545145128592009e-06,
140
- "loss": 0.5933,
141
  "step": 12
142
  },
143
  {
144
  "epoch": 13.0,
145
- "grad_norm": 5.214223861694336,
146
  "learning_rate": 5.983045753470308e-06,
147
- "loss": 0.5704,
148
  "step": 13
149
  },
150
  {
151
  "epoch": 14.0,
152
- "grad_norm": 4.801523685455322,
153
  "learning_rate": 4.530518418775734e-06,
154
- "loss": 0.5552,
155
  "step": 14
156
  },
157
  {
158
  "epoch": 15.0,
159
- "grad_norm": 4.650667667388916,
160
  "learning_rate": 3.2271842837425917e-06,
161
- "loss": 0.545,
162
  "step": 15
163
  },
164
  {
@@ -177,47 +177,47 @@
177
  "eval_accuracy_total_num_<|stop|>": 2,
178
  "eval_first_token_param_values": 0.9,
179
  "eval_first_token_param_values_total": 10,
180
- "eval_loss": 0.5376743674278259,
181
- "eval_perplexity": 1.006453660117816,
182
- "eval_runtime": 0.8811,
183
- "eval_samples_per_second": 1.135,
184
- "eval_steps_per_second": 1.135,
185
  "eval_total_number_first_token": 2,
186
  "step": 15
187
  },
188
  {
189
  "epoch": 16.0,
190
- "grad_norm": 4.648330211639404,
191
  "learning_rate": 2.1085949060360654e-06,
192
- "loss": 0.5376,
193
  "step": 16
194
  },
195
  {
196
  "epoch": 17.0,
197
- "grad_norm": 4.580282211303711,
198
  "learning_rate": 1.2052624879351105e-06,
199
- "loss": 0.5339,
200
  "step": 17
201
  },
202
  {
203
  "epoch": 18.0,
204
- "grad_norm": 4.567606449127197,
205
  "learning_rate": 5.418275829936537e-07,
206
- "loss": 0.5301,
207
  "step": 18
208
  },
209
  {
210
  "epoch": 19.0,
211
- "grad_norm": 4.5482001304626465,
212
  "learning_rate": 1.3638696597277678e-07,
213
- "loss": 0.528,
214
  "step": 19
215
  },
216
  {
217
  "epoch": 20.0,
218
- "grad_norm": 4.52694034576416,
219
  "learning_rate": 0.0,
220
- "loss": 0.5258,
221
  "step": 20
222
  },
223
  {
@@ -236,11 +236,11 @@
236
  "eval_accuracy_total_num_<|stop|>": 2,
237
  "eval_first_token_param_values": 0.9,
238
  "eval_first_token_param_values_total": 10,
239
- "eval_loss": 0.5257245302200317,
240
- "eval_perplexity": 1.0063097771319383,
241
- "eval_runtime": 0.8846,
242
- "eval_samples_per_second": 1.131,
243
- "eval_steps_per_second": 1.131,
244
  "eval_total_number_first_token": 2,
245
  "step": 20
246
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 21.60628890991211,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
+ "grad_norm": 21.794620513916016,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
+ "grad_norm": 13.444245338439941,
28
  "learning_rate": 1.9458172417006347e-05,
29
+ "loss": 1.1815,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
+ "grad_norm": 9.568578720092773,
35
  "learning_rate": 1.879473751206489e-05,
36
+ "loss": 1.0718,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
+ "grad_norm": 8.503974914550781,
42
  "learning_rate": 1.789140509396394e-05,
43
+ "loss": 0.9886,
44
  "step": 5
45
  },
46
  {
 
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
+ "eval_loss": 0.897951602935791,
63
+ "eval_perplexity": 1.0108013242189644,
64
+ "eval_runtime": 0.885,
65
+ "eval_samples_per_second": 1.13,
66
+ "eval_steps_per_second": 1.13,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
+ "grad_norm": 8.138020515441895,
73
  "learning_rate": 1.6772815716257414e-05,
74
+ "loss": 0.8967,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
+ "grad_norm": 8.324158668518066,
80
  "learning_rate": 1.5469481581224274e-05,
81
+ "loss": 0.8374,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
+ "grad_norm": 8.041045188903809,
87
  "learning_rate": 1.4016954246529697e-05,
88
+ "loss": 0.7753,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
+ "grad_norm": 7.898632526397705,
94
  "learning_rate": 1.2454854871407993e-05,
95
+ "loss": 0.7203,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
+ "grad_norm": 6.87092924118042,
101
  "learning_rate": 1.0825793454723325e-05,
102
+ "loss": 0.6677,
103
  "step": 10
104
  },
105
  {
 
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
+ "eval_loss": 0.627693235874176,
122
+ "eval_perplexity": 1.0075382125074093,
123
+ "eval_runtime": 0.878,
124
+ "eval_samples_per_second": 1.139,
125
+ "eval_steps_per_second": 1.139,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  },
129
  {
130
  "epoch": 11.0,
131
+ "grad_norm": 6.593441009521484,
132
  "learning_rate": 9.174206545276678e-06,
133
+ "loss": 0.6275,
134
  "step": 11
135
  },
136
  {
137
  "epoch": 12.0,
138
+ "grad_norm": 6.028723239898682,
139
  "learning_rate": 7.545145128592009e-06,
140
+ "loss": 0.5971,
141
  "step": 12
142
  },
143
  {
144
  "epoch": 13.0,
145
+ "grad_norm": 5.46033239364624,
146
  "learning_rate": 5.983045753470308e-06,
147
+ "loss": 0.5745,
148
  "step": 13
149
  },
150
  {
151
  "epoch": 14.0,
152
+ "grad_norm": 4.930461883544922,
153
  "learning_rate": 4.530518418775734e-06,
154
+ "loss": 0.5558,
155
  "step": 14
156
  },
157
  {
158
  "epoch": 15.0,
159
+ "grad_norm": 4.64890193939209,
160
  "learning_rate": 3.2271842837425917e-06,
161
+ "loss": 0.5455,
162
  "step": 15
163
  },
164
  {
 
177
  "eval_accuracy_total_num_<|stop|>": 2,
178
  "eval_first_token_param_values": 0.9,
179
  "eval_first_token_param_values_total": 10,
180
+ "eval_loss": 0.5380075573921204,
181
+ "eval_perplexity": 1.0064576742544533,
182
+ "eval_runtime": 0.8779,
183
+ "eval_samples_per_second": 1.139,
184
+ "eval_steps_per_second": 1.139,
185
  "eval_total_number_first_token": 2,
186
  "step": 15
187
  },
188
  {
189
  "epoch": 16.0,
190
+ "grad_norm": 4.5582594871521,
191
  "learning_rate": 2.1085949060360654e-06,
192
+ "loss": 0.5374,
193
  "step": 16
194
  },
195
  {
196
  "epoch": 17.0,
197
+ "grad_norm": 4.519166946411133,
198
  "learning_rate": 1.2052624879351105e-06,
199
+ "loss": 0.5316,
200
  "step": 17
201
  },
202
  {
203
  "epoch": 18.0,
204
+ "grad_norm": 4.582608699798584,
205
  "learning_rate": 5.418275829936537e-07,
206
+ "loss": 0.5291,
207
  "step": 18
208
  },
209
  {
210
  "epoch": 19.0,
211
+ "grad_norm": 4.5061774253845215,
212
  "learning_rate": 1.3638696597277678e-07,
213
+ "loss": 0.5265,
214
  "step": 19
215
  },
216
  {
217
  "epoch": 20.0,
218
+ "grad_norm": 4.478773593902588,
219
  "learning_rate": 0.0,
220
+ "loss": 0.5249,
221
  "step": 20
222
  },
223
  {
 
236
  "eval_accuracy_total_num_<|stop|>": 2,
237
  "eval_first_token_param_values": 0.9,
238
  "eval_first_token_param_values_total": 10,
239
+ "eval_loss": 0.5260699987411499,
240
+ "eval_perplexity": 1.0063139350239472,
241
+ "eval_runtime": 0.8774,
242
+ "eval_samples_per_second": 1.14,
243
+ "eval_steps_per_second": 1.14,
244
  "eval_total_number_first_token": 2,
245
  "step": 20
246
  }
checkpoint-20/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67586136f4609abc9357461142a2bba22693796be0e1eef7d79dfa07d236f385
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f11e825a5b1e7bd17e78d2e5d730d3c993fdb3d39a70eb784fa37a52c0535935
3
  size 6648
trainer_state.json CHANGED
@@ -10,37 +10,37 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 22.02356719970703,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
- "grad_norm": 21.681835174560547,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
- "grad_norm": 12.661576271057129,
28
  "learning_rate": 1.9458172417006347e-05,
29
- "loss": 1.1806,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
- "grad_norm": 9.024881362915039,
35
  "learning_rate": 1.879473751206489e-05,
36
- "loss": 1.0682,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
- "grad_norm": 8.294164657592773,
42
  "learning_rate": 1.789140509396394e-05,
43
- "loss": 0.9835,
44
  "step": 5
45
  },
46
  {
@@ -59,47 +59,47 @@
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
- "eval_loss": 0.8951861262321472,
63
- "eval_perplexity": 1.010767880290677,
64
- "eval_runtime": 0.8904,
65
- "eval_samples_per_second": 1.123,
66
- "eval_steps_per_second": 1.123,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
- "grad_norm": 8.226706504821777,
73
  "learning_rate": 1.6772815716257414e-05,
74
- "loss": 0.8936,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
- "grad_norm": 8.382328987121582,
80
  "learning_rate": 1.5469481581224274e-05,
81
- "loss": 0.8304,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
- "grad_norm": 7.861330986022949,
87
  "learning_rate": 1.4016954246529697e-05,
88
- "loss": 0.7703,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
- "grad_norm": 7.340066909790039,
94
  "learning_rate": 1.2454854871407993e-05,
95
- "loss": 0.7171,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
- "grad_norm": 6.915948390960693,
101
  "learning_rate": 1.0825793454723325e-05,
102
- "loss": 0.6641,
103
  "step": 10
104
  },
105
  {
@@ -118,47 +118,47 @@
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
- "eval_loss": 0.6229805946350098,
122
- "eval_perplexity": 1.0074814049420708,
123
- "eval_runtime": 0.8828,
124
- "eval_samples_per_second": 1.133,
125
- "eval_steps_per_second": 1.133,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  },
129
  {
130
  "epoch": 11.0,
131
- "grad_norm": 6.680131912231445,
132
  "learning_rate": 9.174206545276678e-06,
133
- "loss": 0.6224,
134
  "step": 11
135
  },
136
  {
137
  "epoch": 12.0,
138
- "grad_norm": 5.9928297996521,
139
  "learning_rate": 7.545145128592009e-06,
140
- "loss": 0.5933,
141
  "step": 12
142
  },
143
  {
144
  "epoch": 13.0,
145
- "grad_norm": 5.214223861694336,
146
  "learning_rate": 5.983045753470308e-06,
147
- "loss": 0.5704,
148
  "step": 13
149
  },
150
  {
151
  "epoch": 14.0,
152
- "grad_norm": 4.801523685455322,
153
  "learning_rate": 4.530518418775734e-06,
154
- "loss": 0.5552,
155
  "step": 14
156
  },
157
  {
158
  "epoch": 15.0,
159
- "grad_norm": 4.650667667388916,
160
  "learning_rate": 3.2271842837425917e-06,
161
- "loss": 0.545,
162
  "step": 15
163
  },
164
  {
@@ -177,47 +177,47 @@
177
  "eval_accuracy_total_num_<|stop|>": 2,
178
  "eval_first_token_param_values": 0.9,
179
  "eval_first_token_param_values_total": 10,
180
- "eval_loss": 0.5376743674278259,
181
- "eval_perplexity": 1.006453660117816,
182
- "eval_runtime": 0.8811,
183
- "eval_samples_per_second": 1.135,
184
- "eval_steps_per_second": 1.135,
185
  "eval_total_number_first_token": 2,
186
  "step": 15
187
  },
188
  {
189
  "epoch": 16.0,
190
- "grad_norm": 4.648330211639404,
191
  "learning_rate": 2.1085949060360654e-06,
192
- "loss": 0.5376,
193
  "step": 16
194
  },
195
  {
196
  "epoch": 17.0,
197
- "grad_norm": 4.580282211303711,
198
  "learning_rate": 1.2052624879351105e-06,
199
- "loss": 0.5339,
200
  "step": 17
201
  },
202
  {
203
  "epoch": 18.0,
204
- "grad_norm": 4.567606449127197,
205
  "learning_rate": 5.418275829936537e-07,
206
- "loss": 0.5301,
207
  "step": 18
208
  },
209
  {
210
  "epoch": 19.0,
211
- "grad_norm": 4.5482001304626465,
212
  "learning_rate": 1.3638696597277678e-07,
213
- "loss": 0.528,
214
  "step": 19
215
  },
216
  {
217
  "epoch": 20.0,
218
- "grad_norm": 4.52694034576416,
219
  "learning_rate": 0.0,
220
- "loss": 0.5258,
221
  "step": 20
222
  },
223
  {
@@ -236,11 +236,11 @@
236
  "eval_accuracy_total_num_<|stop|>": 2,
237
  "eval_first_token_param_values": 0.9,
238
  "eval_first_token_param_values_total": 10,
239
- "eval_loss": 0.5257245302200317,
240
- "eval_perplexity": 1.0063097771319383,
241
- "eval_runtime": 0.8846,
242
- "eval_samples_per_second": 1.131,
243
- "eval_steps_per_second": 1.131,
244
  "eval_total_number_first_token": 2,
245
  "step": 20
246
  },
@@ -248,10 +248,10 @@
248
  "epoch": 20.0,
249
  "step": 20,
250
  "total_flos": 7935331915530240.0,
251
- "train_loss": 0.7579435586929322,
252
- "train_runtime": 380.9512,
253
- "train_samples_per_second": 0.053,
254
- "train_steps_per_second": 0.053
255
  }
256
  ],
257
  "logging_steps": 1.0,
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 21.60628890991211,
14
  "learning_rate": 2e-05,
15
  "loss": 1.2546,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 2.0,
20
+ "grad_norm": 21.794620513916016,
21
  "learning_rate": 1.9863613034027224e-05,
22
  "loss": 1.2546,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 3.0,
27
+ "grad_norm": 13.444245338439941,
28
  "learning_rate": 1.9458172417006347e-05,
29
+ "loss": 1.1815,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 4.0,
34
+ "grad_norm": 9.568578720092773,
35
  "learning_rate": 1.879473751206489e-05,
36
+ "loss": 1.0718,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 5.0,
41
+ "grad_norm": 8.503974914550781,
42
  "learning_rate": 1.789140509396394e-05,
43
+ "loss": 0.9886,
44
  "step": 5
45
  },
46
  {
 
59
  "eval_accuracy_total_num_<|stop|>": 2,
60
  "eval_first_token_param_values": 0.9,
61
  "eval_first_token_param_values_total": 10,
62
+ "eval_loss": 0.897951602935791,
63
+ "eval_perplexity": 1.0108013242189644,
64
+ "eval_runtime": 0.885,
65
+ "eval_samples_per_second": 1.13,
66
+ "eval_steps_per_second": 1.13,
67
  "eval_total_number_first_token": 2,
68
  "step": 5
69
  },
70
  {
71
  "epoch": 6.0,
72
+ "grad_norm": 8.138020515441895,
73
  "learning_rate": 1.6772815716257414e-05,
74
+ "loss": 0.8967,
75
  "step": 6
76
  },
77
  {
78
  "epoch": 7.0,
79
+ "grad_norm": 8.324158668518066,
80
  "learning_rate": 1.5469481581224274e-05,
81
+ "loss": 0.8374,
82
  "step": 7
83
  },
84
  {
85
  "epoch": 8.0,
86
+ "grad_norm": 8.041045188903809,
87
  "learning_rate": 1.4016954246529697e-05,
88
+ "loss": 0.7753,
89
  "step": 8
90
  },
91
  {
92
  "epoch": 9.0,
93
+ "grad_norm": 7.898632526397705,
94
  "learning_rate": 1.2454854871407993e-05,
95
+ "loss": 0.7203,
96
  "step": 9
97
  },
98
  {
99
  "epoch": 10.0,
100
+ "grad_norm": 6.87092924118042,
101
  "learning_rate": 1.0825793454723325e-05,
102
+ "loss": 0.6677,
103
  "step": 10
104
  },
105
  {
 
118
  "eval_accuracy_total_num_<|stop|>": 2,
119
  "eval_first_token_param_values": 0.9,
120
  "eval_first_token_param_values_total": 10,
121
+ "eval_loss": 0.627693235874176,
122
+ "eval_perplexity": 1.0075382125074093,
123
+ "eval_runtime": 0.878,
124
+ "eval_samples_per_second": 1.139,
125
+ "eval_steps_per_second": 1.139,
126
  "eval_total_number_first_token": 2,
127
  "step": 10
128
  },
129
  {
130
  "epoch": 11.0,
131
+ "grad_norm": 6.593441009521484,
132
  "learning_rate": 9.174206545276678e-06,
133
+ "loss": 0.6275,
134
  "step": 11
135
  },
136
  {
137
  "epoch": 12.0,
138
+ "grad_norm": 6.028723239898682,
139
  "learning_rate": 7.545145128592009e-06,
140
+ "loss": 0.5971,
141
  "step": 12
142
  },
143
  {
144
  "epoch": 13.0,
145
+ "grad_norm": 5.46033239364624,
146
  "learning_rate": 5.983045753470308e-06,
147
+ "loss": 0.5745,
148
  "step": 13
149
  },
150
  {
151
  "epoch": 14.0,
152
+ "grad_norm": 4.930461883544922,
153
  "learning_rate": 4.530518418775734e-06,
154
+ "loss": 0.5558,
155
  "step": 14
156
  },
157
  {
158
  "epoch": 15.0,
159
+ "grad_norm": 4.64890193939209,
160
  "learning_rate": 3.2271842837425917e-06,
161
+ "loss": 0.5455,
162
  "step": 15
163
  },
164
  {
 
177
  "eval_accuracy_total_num_<|stop|>": 2,
178
  "eval_first_token_param_values": 0.9,
179
  "eval_first_token_param_values_total": 10,
180
+ "eval_loss": 0.5380075573921204,
181
+ "eval_perplexity": 1.0064576742544533,
182
+ "eval_runtime": 0.8779,
183
+ "eval_samples_per_second": 1.139,
184
+ "eval_steps_per_second": 1.139,
185
  "eval_total_number_first_token": 2,
186
  "step": 15
187
  },
188
  {
189
  "epoch": 16.0,
190
+ "grad_norm": 4.5582594871521,
191
  "learning_rate": 2.1085949060360654e-06,
192
+ "loss": 0.5374,
193
  "step": 16
194
  },
195
  {
196
  "epoch": 17.0,
197
+ "grad_norm": 4.519166946411133,
198
  "learning_rate": 1.2052624879351105e-06,
199
+ "loss": 0.5316,
200
  "step": 17
201
  },
202
  {
203
  "epoch": 18.0,
204
+ "grad_norm": 4.582608699798584,
205
  "learning_rate": 5.418275829936537e-07,
206
+ "loss": 0.5291,
207
  "step": 18
208
  },
209
  {
210
  "epoch": 19.0,
211
+ "grad_norm": 4.5061774253845215,
212
  "learning_rate": 1.3638696597277678e-07,
213
+ "loss": 0.5265,
214
  "step": 19
215
  },
216
  {
217
  "epoch": 20.0,
218
+ "grad_norm": 4.478773593902588,
219
  "learning_rate": 0.0,
220
+ "loss": 0.5249,
221
  "step": 20
222
  },
223
  {
 
236
  "eval_accuracy_total_num_<|stop|>": 2,
237
  "eval_first_token_param_values": 0.9,
238
  "eval_first_token_param_values_total": 10,
239
+ "eval_loss": 0.5260699987411499,
240
+ "eval_perplexity": 1.0063139350239472,
241
+ "eval_runtime": 0.8774,
242
+ "eval_samples_per_second": 1.14,
243
+ "eval_steps_per_second": 1.14,
244
  "eval_total_number_first_token": 2,
245
  "step": 20
246
  },
 
248
  "epoch": 20.0,
249
  "step": 20,
250
  "total_flos": 7935331915530240.0,
251
+ "train_loss": 0.7599329292774201,
252
+ "train_runtime": 382.8027,
253
+ "train_samples_per_second": 0.052,
254
+ "train_steps_per_second": 0.052
255
  }
256
  ],
257
  "logging_steps": 1.0,