CodeIsAbstract committed on
Commit
8af0cfd
·
verified ·
1 Parent(s): c3f386f

Upload fine-tuned model

Browse files
Files changed (7) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scaler.pt +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +85 -131
  7. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25c1ab1d72c24bfe28dcd99393d47713be6eac6f12ab34a9cfaffff70d0f94c7
3
  size 4943274328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5ae1252eb009de5f4ffae322b888fd760512993d43e9d623a032eaf86042404
3
  size 4943274328
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72797521203759b3fa484f53336a7892a7039282c777c88bfe269ff51ac8884f
3
  size 2510808826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:153d74c362568635276812279c9b2262cfde40fddaa6794f820202bd3a4a2b37
3
  size 2510808826
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9899ccda7f0d8d9511991180b93aab508ce6e8489de708c88ad1188e7e1d90d6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
3
  size 14244
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2499e0399fbf93134f32089f43a54b542db105fd8163905b5ca10492c93f08c
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc401fda741476ac85311d689253b63b3cab7dd7e757753c00d7432b308b4a77
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:502fc0646817f7b28f50f0797fbf78aca9985ff1902e46adf8c295619f5e8837
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:973c2ea88782fa3ec32fbffa7f9f3938091abd7d9b495e680068b46093ff8ffb
3
  size 1064
trainer_state.json CHANGED
@@ -2,172 +2,126 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9835082458770614,
6
- "eval_steps": 25,
7
- "global_step": 166,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.13193403298350825,
14
- "grad_norm": 119.62747955322266,
15
- "learning_rate": 1.6000000000000003e-05,
16
- "loss": 60.0685,
17
- "step": 11
18
  },
19
  {
20
- "epoch": 0.2638680659670165,
21
- "grad_norm": 76.251220703125,
22
- "learning_rate": 1.983619906947144e-05,
23
- "loss": 53.8545,
24
- "step": 22
25
- },
26
- {
27
- "epoch": 0.29985007496251875,
28
- "eval_loss": 3.3249454498291016,
29
- "eval_runtime": 89.9708,
30
- "eval_samples_per_second": 5.557,
31
- "eval_steps_per_second": 1.856,
32
- "step": 25
33
- },
34
- {
35
- "epoch": 0.39580209895052476,
36
- "grad_norm": 67.91344451904297,
37
- "learning_rate": 1.9199794436588244e-05,
38
- "loss": 52.8293,
39
- "step": 33
40
- },
41
- {
42
- "epoch": 0.527736131934033,
43
- "grad_norm": 75.1031265258789,
44
- "learning_rate": 1.811377838556573e-05,
45
- "loss": 52.3377,
46
- "step": 44
47
  },
48
  {
49
- "epoch": 0.5997001499250375,
50
- "eval_loss": 3.2520751953125,
51
- "eval_runtime": 89.7398,
52
- "eval_samples_per_second": 5.572,
53
- "eval_steps_per_second": 1.861,
54
- "step": 50
55
  },
56
  {
57
- "epoch": 0.6596701649175413,
58
- "grad_norm": 70.66634368896484,
59
- "learning_rate": 1.6631226582407954e-05,
60
- "loss": 52.2969,
61
- "step": 55
62
  },
63
  {
64
- "epoch": 0.7916041979010495,
65
- "grad_norm": 74.07559204101562,
66
- "learning_rate": 1.4824594148071936e-05,
67
- "loss": 51.8169,
68
- "step": 66
69
  },
70
  {
71
- "epoch": 0.8995502248875562,
72
- "eval_loss": 3.2324743270874023,
73
- "eval_runtime": 90.1521,
74
- "eval_samples_per_second": 5.546,
75
- "eval_steps_per_second": 1.852,
76
- "step": 75
77
  },
78
  {
79
- "epoch": 0.9235382308845578,
80
- "grad_norm": 72.83753967285156,
81
- "learning_rate": 1.2782174639164528e-05,
82
- "loss": 51.5118,
83
- "step": 77
84
  },
85
  {
86
- "epoch": 1.047976011994003,
87
- "grad_norm": 79.93111419677734,
88
- "learning_rate": 1.0603784974222862e-05,
89
- "loss": 48.0834,
90
- "step": 88
91
  },
92
  {
93
- "epoch": 1.1799100449775113,
94
- "grad_norm": 89.3459243774414,
95
- "learning_rate": 8.395887191422397e-06,
96
- "loss": 50.378,
97
- "step": 99
98
  },
99
  {
100
- "epoch": 1.191904047976012,
101
- "eval_loss": 3.2021567821502686,
102
- "eval_runtime": 90.5661,
103
- "eval_samples_per_second": 5.521,
104
- "eval_steps_per_second": 1.844,
105
- "step": 100
106
  },
107
  {
108
- "epoch": 1.3118440779610194,
109
- "grad_norm": 69.37840270996094,
110
- "learning_rate": 6.266385446673791e-06,
111
- "loss": 50.3322,
112
- "step": 110
113
  },
114
  {
115
- "epoch": 1.4437781109445278,
116
- "grad_norm": 65.35021209716797,
117
- "learning_rate": 4.319352532688444e-06,
118
- "loss": 50.1113,
119
- "step": 121
 
120
  },
121
  {
122
- "epoch": 1.4917541229385307,
123
- "eval_loss": 3.19496488571167,
124
- "eval_runtime": 90.6364,
125
- "eval_samples_per_second": 5.517,
126
- "eval_steps_per_second": 1.843,
127
- "step": 125
128
- },
129
- {
130
- "epoch": 1.575712143928036,
131
- "grad_norm": 70.10005187988281,
132
- "learning_rate": 2.6499436440367165e-06,
133
- "loss": 50.1493,
134
- "step": 132
135
- },
136
- {
137
- "epoch": 1.707646176911544,
138
- "grad_norm": 68.84884643554688,
139
- "learning_rate": 1.339745962155613e-06,
140
- "loss": 50.1588,
141
- "step": 143
142
- },
143
- {
144
- "epoch": 1.7916041979010495,
145
- "eval_loss": 3.176970958709717,
146
- "eval_runtime": 89.1443,
147
- "eval_samples_per_second": 5.609,
148
- "eval_steps_per_second": 1.873,
149
- "step": 150
150
  },
151
  {
152
- "epoch": 1.8395802098950524,
153
- "grad_norm": 84.58167266845703,
154
- "learning_rate": 4.5279133491454406e-07,
155
- "loss": 49.9598,
156
- "step": 154
157
  },
158
  {
159
- "epoch": 1.9715142428785608,
160
- "grad_norm": 62.91596984863281,
161
- "learning_rate": 3.242691865790071e-08,
162
- "loss": 49.826,
163
- "step": 165
164
  }
165
  ],
166
- "logging_steps": 11,
167
- "max_steps": 166,
168
  "num_input_tokens_seen": 0,
169
- "num_train_epochs": 2,
170
- "save_steps": 50,
171
  "stateful_callbacks": {
172
  "TrainerControl": {
173
  "args": {
@@ -180,7 +134,7 @@
180
  "attributes": {}
181
  }
182
  },
183
- "total_flos": 2.3724710537723904e+17,
184
  "train_batch_size": 3,
185
  "trial_name": null,
186
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.96,
6
+ "eval_steps": 10,
7
+ "global_step": 27,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.21333333333333335,
14
+ "grad_norm": NaN,
15
+ "learning_rate": 0.0,
16
+ "loss": 63.3011,
17
+ "step": 2
18
  },
19
  {
20
+ "epoch": 0.4266666666666667,
21
+ "grad_norm": Infinity,
22
+ "learning_rate": 0.0,
23
+ "loss": 61.3878,
24
+ "step": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  },
26
  {
27
+ "epoch": 0.64,
28
+ "grad_norm": 611.62060546875,
29
+ "learning_rate": 2.0000000000000003e-06,
30
+ "loss": 64.6879,
31
+ "step": 6
 
32
  },
33
  {
34
+ "epoch": 0.8533333333333334,
35
+ "grad_norm": 343.85888671875,
36
+ "learning_rate": 6e-06,
37
+ "loss": 59.1893,
38
+ "step": 8
39
  },
40
  {
41
+ "epoch": 1.1066666666666667,
42
+ "grad_norm": 286.243408203125,
43
+ "learning_rate": 1e-05,
44
+ "loss": 61.2499,
45
+ "step": 10
46
  },
47
  {
48
+ "epoch": 1.1066666666666667,
49
+ "eval_loss": 3.3731160163879395,
50
+ "eval_runtime": 3.4128,
51
+ "eval_samples_per_second": 5.86,
52
+ "eval_steps_per_second": 2.051,
53
+ "step": 10
54
  },
55
  {
56
+ "epoch": 1.32,
57
+ "grad_norm": 289.30523681640625,
58
+ "learning_rate": 1.4e-05,
59
+ "loss": 55.2484,
60
+ "step": 12
61
  },
62
  {
63
+ "epoch": 1.5333333333333332,
64
+ "grad_norm": 334.54632568359375,
65
+ "learning_rate": 1.8e-05,
66
+ "loss": 56.5117,
67
+ "step": 14
68
  },
69
  {
70
+ "epoch": 1.7466666666666666,
71
+ "grad_norm": 328.0875549316406,
72
+ "learning_rate": 1.982973099683902e-05,
73
+ "loss": 54.6102,
74
+ "step": 16
75
  },
76
  {
77
+ "epoch": 1.96,
78
+ "grad_norm": 249.81207275390625,
79
+ "learning_rate": 1.8502171357296144e-05,
80
+ "loss": 55.2078,
81
+ "step": 18
 
82
  },
83
  {
84
+ "epoch": 2.2133333333333334,
85
+ "grad_norm": 240.95912170410156,
86
+ "learning_rate": 1.6026346363792565e-05,
87
+ "loss": 56.2904,
88
+ "step": 20
89
  },
90
  {
91
+ "epoch": 2.2133333333333334,
92
+ "eval_loss": 3.224900007247925,
93
+ "eval_runtime": 3.438,
94
+ "eval_samples_per_second": 5.817,
95
+ "eval_steps_per_second": 2.036,
96
+ "step": 20
97
  },
98
  {
99
+ "epoch": 2.4266666666666667,
100
+ "grad_norm": 274.64892578125,
101
+ "learning_rate": 1.2736629900720832e-05,
102
+ "loss": 52.4623,
103
+ "step": 22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  },
105
  {
106
+ "epoch": 2.64,
107
+ "grad_norm": 275.81597900390625,
108
+ "learning_rate": 9.07731640536698e-06,
109
+ "loss": 53.445,
110
+ "step": 24
111
  },
112
  {
113
+ "epoch": 2.8533333333333335,
114
+ "grad_norm": 252.26492309570312,
115
+ "learning_rate": 5.542616442234618e-06,
116
+ "loss": 53.8146,
117
+ "step": 26
118
  }
119
  ],
120
+ "logging_steps": 2,
121
+ "max_steps": 27,
122
  "num_input_tokens_seen": 0,
123
+ "num_train_epochs": 3,
124
+ "save_steps": 10,
125
  "stateful_callbacks": {
126
  "TrainerControl": {
127
  "args": {
 
134
  "attributes": {}
135
  }
136
  },
137
+ "total_flos": 3.910272351731712e+16,
138
  "train_batch_size": 3,
139
  "trial_name": null,
140
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16d296347945cb67bfa6c7229aa45967a046d5ff07369d05446b90a909c9721c
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8fc01af08c4c1686a979c57f0a67b791f460dc8eb9b636ec98728fcd1562cee
3
  size 5368