CodeIsAbstract commited on
Commit
681cae0
·
verified ·
1 Parent(s): 352031f

Upload fine-tuned model

Browse files
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80090c0ec5dfe7066377a3cc545ac216fd6741d8d88895f37ecff26a25a72237
3
  size 4961852416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb37d1d98e407541ac5be48089f1dbd65004ec4e7ee30371a1fa3beb3edf5978
3
  size 4961852416
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0656a1cc2e04850b6ef2963e0250cd620faecdbf88b58071d4c3757934aa8a43
3
  size 4983111176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:635ab5605f4deaba09610a2821eaf9f0c43f07e8b28e26f02bfb355b32f4aa67
3
  size 4983111176
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16511ee7337aa20595cd6e4025396bb209c1e34dee938838ec739fb48a7dcf0a
3
  size 4945374704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8879fd1f45aff078cc7e8485e6220488fa7cbce98eb7f5bc2dd78a6a37bff06
3
  size 4945374704
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b900f38794888826263302802614491ebb47c32f22ddc42bbb09e072e8680f3c
3
  size 394002560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c9b6f3523afb0a206fc5faba27884c901665eabe8582d441f7edb814c161048
3
  size 394002560
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:285abb08886d14c329c3145cd21092c92a575d4647731c4c6bacdc52838df314
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbac4f1ced44f35e7f915b7cb8080e25aa99a3afe0491b4e7972091465d885e3
3
  size 14244
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34d01644342e74e4c6b26bcf6157d30b6c7b4ae37f6998df0a4f5e710352b706
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:431d359aac953011e8b3e0aa259323346617a3406f926e81cc0a0de1f9b0c6eb
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42ac1193d9afd5e9e6a069954ba46bddc93f719e87099011fc640726439b771c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7de6d366951b9cf0a0e3642f86d07125dd936b019bbfd2f2b8a0cccee56b9442
3
  size 1064
special_tokens_map.json CHANGED
@@ -13,13 +13,7 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": {
17
- "content": "<|endoftext|>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
  "unk_token": {
24
  "content": "<unk>",
25
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": "<|endoftext|>",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<unk>",
19
  "lstrip": false,
tokenizer_config.json CHANGED
@@ -120,6 +120,7 @@
120
  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
 
123
  "legacy": false,
124
  "model_max_length": 131072,
125
  "pad_token": "<|endoftext|>",
 
120
  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
+ "extra_special_tokens": {},
124
  "legacy": false,
125
  "model_max_length": 131072,
126
  "pad_token": "<|endoftext|>",
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 10,
7
- "global_step": 625,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -998,387 +998,6 @@
998
  "eval_samples_per_second": 4.539,
999
  "eval_steps_per_second": 4.539,
1000
  "step": 450
1001
- },
1002
- {
1003
- "epoch": 0.728,
1004
- "grad_norm": 4.757038593292236,
1005
- "learning_rate": 3.657301224942098e-06,
1006
- "loss": 3.7008,
1007
- "step": 455
1008
- },
1009
- {
1010
- "epoch": 0.736,
1011
- "grad_norm": 4.461075305938721,
1012
- "learning_rate": 3.4619278695411495e-06,
1013
- "loss": 3.651,
1014
- "step": 460
1015
- },
1016
- {
1017
- "epoch": 0.736,
1018
- "eval_loss": 3.6577467918395996,
1019
- "eval_runtime": 110.5987,
1020
- "eval_samples_per_second": 4.521,
1021
- "eval_steps_per_second": 4.521,
1022
- "step": 460
1023
- },
1024
- {
1025
- "epoch": 0.744,
1026
- "grad_norm": 3.8970530033111572,
1027
- "learning_rate": 3.2708194805996252e-06,
1028
- "loss": 3.6794,
1029
- "step": 465
1030
- },
1031
- {
1032
- "epoch": 0.752,
1033
- "grad_norm": 4.632080554962158,
1034
- "learning_rate": 3.0841007234397655e-06,
1035
- "loss": 3.6475,
1036
- "step": 470
1037
- },
1038
- {
1039
- "epoch": 0.752,
1040
- "eval_loss": 3.661729574203491,
1041
- "eval_runtime": 112.6803,
1042
- "eval_samples_per_second": 4.437,
1043
- "eval_steps_per_second": 4.437,
1044
- "step": 470
1045
- },
1046
- {
1047
- "epoch": 0.76,
1048
- "grad_norm": 5.098201751708984,
1049
- "learning_rate": 2.901893399904797e-06,
1050
- "loss": 3.6647,
1051
- "step": 475
1052
- },
1053
- {
1054
- "epoch": 0.768,
1055
- "grad_norm": 4.249239921569824,
1056
- "learning_rate": 2.724316368904201e-06,
1057
- "loss": 3.694,
1058
- "step": 480
1059
- },
1060
- {
1061
- "epoch": 0.768,
1062
- "eval_loss": 3.652348041534424,
1063
- "eval_runtime": 111.7392,
1064
- "eval_samples_per_second": 4.475,
1065
- "eval_steps_per_second": 4.475,
1066
- "step": 480
1067
- },
1068
- {
1069
- "epoch": 0.776,
1070
- "grad_norm": 4.952099800109863,
1071
- "learning_rate": 2.5514854688787406e-06,
1072
- "loss": 3.6771,
1073
- "step": 485
1074
- },
1075
- {
1076
- "epoch": 0.784,
1077
- "grad_norm": 4.995110034942627,
1078
- "learning_rate": 2.383513442235812e-06,
1079
- "loss": 3.6666,
1080
- "step": 490
1081
- },
1082
- {
1083
- "epoch": 0.784,
1084
- "eval_loss": 3.653099298477173,
1085
- "eval_runtime": 111.5607,
1086
- "eval_samples_per_second": 4.482,
1087
- "eval_steps_per_second": 4.482,
1088
- "step": 490
1089
- },
1090
- {
1091
- "epoch": 0.792,
1092
- "grad_norm": 5.493462085723877,
1093
- "learning_rate": 2.2205098618044584e-06,
1094
- "loss": 3.6716,
1095
- "step": 495
1096
- },
1097
- {
1098
- "epoch": 0.8,
1099
- "grad_norm": 4.578746795654297,
1100
- "learning_rate": 2.0625810593579286e-06,
1101
- "loss": 3.64,
1102
- "step": 500
1103
- },
1104
- {
1105
- "epoch": 0.8,
1106
- "eval_loss": 3.6576831340789795,
1107
- "eval_runtime": 109.6662,
1108
- "eval_samples_per_second": 4.559,
1109
- "eval_steps_per_second": 4.559,
1110
- "step": 500
1111
- },
1112
- {
1113
- "epoch": 0.808,
1114
- "grad_norm": 4.769631385803223,
1115
- "learning_rate": 1.9098300562505266e-06,
1116
- "loss": 3.6298,
1117
- "step": 505
1118
- },
1119
- {
1120
- "epoch": 0.816,
1121
- "grad_norm": 5.270910263061523,
1122
- "learning_rate": 1.7623564962139061e-06,
1123
- "loss": 3.6375,
1124
- "step": 510
1125
- },
1126
- {
1127
- "epoch": 0.816,
1128
- "eval_loss": 3.658618211746216,
1129
- "eval_runtime": 112.7334,
1130
- "eval_samples_per_second": 4.435,
1131
- "eval_steps_per_second": 4.435,
1132
- "step": 510
1133
- },
1134
- {
1135
- "epoch": 0.824,
1136
- "grad_norm": 4.440089225769043,
1137
- "learning_rate": 1.6202565803566917e-06,
1138
- "loss": 3.6672,
1139
- "step": 515
1140
- },
1141
- {
1142
- "epoch": 0.832,
1143
- "grad_norm": 4.2689337730407715,
1144
- "learning_rate": 1.4836230044098164e-06,
1145
- "loss": 3.6544,
1146
- "step": 520
1147
- },
1148
- {
1149
- "epoch": 0.832,
1150
- "eval_loss": 3.6577441692352295,
1151
- "eval_runtime": 111.1537,
1152
- "eval_samples_per_second": 4.498,
1153
- "eval_steps_per_second": 4.498,
1154
- "step": 520
1155
- },
1156
- {
1157
- "epoch": 0.84,
1158
- "grad_norm": 7.282012939453125,
1159
- "learning_rate": 1.352544898258511e-06,
1160
- "loss": 3.6416,
1161
- "step": 525
1162
- },
1163
- {
1164
- "epoch": 0.848,
1165
- "grad_norm": 5.834847927093506,
1166
- "learning_rate": 1.2271077678004084e-06,
1167
- "loss": 3.653,
1168
- "step": 530
1169
- },
1170
- {
1171
- "epoch": 0.848,
1172
- "eval_loss": 3.6557505130767822,
1173
- "eval_runtime": 111.4698,
1174
- "eval_samples_per_second": 4.486,
1175
- "eval_steps_per_second": 4.486,
1176
- "step": 530
1177
- },
1178
- {
1179
- "epoch": 0.856,
1180
- "grad_norm": 4.881261825561523,
1181
- "learning_rate": 1.1073934391676666e-06,
1182
- "loss": 3.6859,
1183
- "step": 535
1184
- },
1185
- {
1186
- "epoch": 0.864,
1187
- "grad_norm": 4.997443675994873,
1188
- "learning_rate": 9.934800053494886e-07,
1189
- "loss": 3.6678,
1190
- "step": 540
1191
- },
1192
- {
1193
- "epoch": 0.864,
1194
- "eval_loss": 3.6547605991363525,
1195
- "eval_runtime": 112.6768,
1196
- "eval_samples_per_second": 4.437,
1197
- "eval_steps_per_second": 4.437,
1198
- "step": 540
1199
- },
1200
- {
1201
- "epoch": 0.872,
1202
- "grad_norm": 5.601071834564209,
1203
- "learning_rate": 8.854417752499112e-07,
1204
- "loss": 3.6535,
1205
- "step": 545
1206
- },
1207
- {
1208
- "epoch": 0.88,
1209
- "grad_norm": 5.277810573577881,
1210
- "learning_rate": 7.833492252140284e-07,
1211
- "loss": 3.6455,
1212
- "step": 550
1213
- },
1214
- {
1215
- "epoch": 0.88,
1216
- "eval_loss": 3.6565444469451904,
1217
- "eval_runtime": 111.5954,
1218
- "eval_samples_per_second": 4.48,
1219
- "eval_steps_per_second": 4.48,
1220
- "step": 550
1221
- },
1222
- {
1223
- "epoch": 0.888,
1224
- "grad_norm": 4.55835485458374,
1225
- "learning_rate": 6.872689530543087e-07,
1226
- "loss": 3.6681,
1227
- "step": 555
1228
- },
1229
- {
1230
- "epoch": 0.896,
1231
- "grad_norm": 4.985684871673584,
1232
- "learning_rate": 5.972636346069949e-07,
1233
- "loss": 3.6437,
1234
- "step": 560
1235
- },
1236
- {
1237
- "epoch": 0.896,
1238
- "eval_loss": 3.656482696533203,
1239
- "eval_runtime": 111.9622,
1240
- "eval_samples_per_second": 4.466,
1241
- "eval_steps_per_second": 4.466,
1242
- "step": 560
1243
- },
1244
- {
1245
- "epoch": 0.904,
1246
- "grad_norm": 5.561159133911133,
1247
- "learning_rate": 5.133919828468992e-07,
1248
- "loss": 3.6635,
1249
- "step": 565
1250
- },
1251
- {
1252
- "epoch": 0.912,
1253
- "grad_norm": 5.3784003257751465,
1254
- "learning_rate": 4.357087095873136e-07,
1255
- "loss": 3.6519,
1256
- "step": 570
1257
- },
1258
- {
1259
- "epoch": 0.912,
1260
- "eval_loss": 3.656426429748535,
1261
- "eval_runtime": 110.8291,
1262
- "eval_samples_per_second": 4.511,
1263
- "eval_steps_per_second": 4.511,
1264
- "step": 570
1265
- },
1266
- {
1267
- "epoch": 0.92,
1268
- "grad_norm": 5.003520965576172,
1269
- "learning_rate": 3.6426448978995054e-07,
1270
- "loss": 3.6271,
1271
- "step": 575
1272
- },
1273
- {
1274
- "epoch": 0.928,
1275
- "grad_norm": 4.7706403732299805,
1276
- "learning_rate": 2.9910592850826983e-07,
1277
- "loss": 3.7089,
1278
- "step": 580
1279
- },
1280
- {
1281
- "epoch": 0.928,
1282
- "eval_loss": 3.6552963256835938,
1283
- "eval_runtime": 111.738,
1284
- "eval_samples_per_second": 4.475,
1285
- "eval_steps_per_second": 4.475,
1286
- "step": 580
1287
- },
1288
- {
1289
- "epoch": 0.936,
1290
- "grad_norm": 5.243898391723633,
1291
- "learning_rate": 2.402755304856974e-07,
1292
- "loss": 3.6502,
1293
- "step": 585
1294
- },
1295
- {
1296
- "epoch": 0.944,
1297
- "grad_norm": 5.9423298835754395,
1298
- "learning_rate": 1.8781167242860276e-07,
1299
- "loss": 3.6603,
1300
- "step": 590
1301
- },
1302
- {
1303
- "epoch": 0.944,
1304
- "eval_loss": 3.656373977661133,
1305
- "eval_runtime": 110.7007,
1306
- "eval_samples_per_second": 4.517,
1307
- "eval_steps_per_second": 4.517,
1308
- "step": 590
1309
- },
1310
- {
1311
- "epoch": 0.952,
1312
- "grad_norm": 4.393427848815918,
1313
- "learning_rate": 1.4174857797209951e-07,
1314
- "loss": 3.6765,
1315
- "step": 595
1316
- },
1317
- {
1318
- "epoch": 0.96,
1319
- "grad_norm": 4.435468673706055,
1320
- "learning_rate": 1.0211629535501811e-07,
1321
- "loss": 3.6715,
1322
- "step": 600
1323
- },
1324
- {
1325
- "epoch": 0.96,
1326
- "eval_loss": 3.6557369232177734,
1327
- "eval_runtime": 111.0884,
1328
- "eval_samples_per_second": 4.501,
1329
- "eval_steps_per_second": 4.501,
1330
- "step": 600
1331
- },
1332
- {
1333
- "epoch": 0.968,
1334
- "grad_norm": 4.849489212036133,
1335
- "learning_rate": 6.894067781860636e-08,
1336
- "loss": 3.6554,
1337
- "step": 605
1338
- },
1339
- {
1340
- "epoch": 0.976,
1341
- "grad_norm": 6.290756702423096,
1342
- "learning_rate": 4.2243366741735457e-08,
1343
- "loss": 3.6473,
1344
- "step": 610
1345
- },
1346
- {
1347
- "epoch": 0.976,
1348
- "eval_loss": 3.6551318168640137,
1349
- "eval_runtime": 111.5978,
1350
- "eval_samples_per_second": 4.48,
1351
- "eval_steps_per_second": 4.48,
1352
- "step": 610
1353
- },
1354
- {
1355
- "epoch": 0.984,
1356
- "grad_norm": 5.660240650177002,
1357
- "learning_rate": 2.2041777523627018e-08,
1358
- "loss": 3.6221,
1359
- "step": 615
1360
- },
1361
- {
1362
- "epoch": 0.992,
1363
- "grad_norm": 5.215294361114502,
1364
- "learning_rate": 8.349088223306157e-09,
1365
- "loss": 3.6463,
1366
- "step": 620
1367
- },
1368
- {
1369
- "epoch": 0.992,
1370
- "eval_loss": 3.655787944793701,
1371
- "eval_runtime": 111.9272,
1372
- "eval_samples_per_second": 4.467,
1373
- "eval_steps_per_second": 4.467,
1374
- "step": 620
1375
- },
1376
- {
1377
- "epoch": 1.0,
1378
- "grad_norm": 5.055933475494385,
1379
- "learning_rate": 1.1742309631845861e-09,
1380
- "loss": 3.6125,
1381
- "step": 625
1382
  }
1383
  ],
1384
  "logging_steps": 5,
@@ -1393,12 +1012,12 @@
1393
  "should_evaluate": false,
1394
  "should_log": false,
1395
  "should_save": true,
1396
- "should_training_stop": true
1397
  },
1398
  "attributes": {}
1399
  }
1400
  },
1401
- "total_flos": 6.8614575095808e+17,
1402
  "train_batch_size": 1,
1403
  "trial_name": null,
1404
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.72,
6
  "eval_steps": 10,
7
+ "global_step": 450,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
998
  "eval_samples_per_second": 4.539,
999
  "eval_steps_per_second": 4.539,
1000
  "step": 450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1001
  }
1002
  ],
1003
  "logging_steps": 5,
 
1012
  "should_evaluate": false,
1013
  "should_log": false,
1014
  "should_save": true,
1015
+ "should_training_stop": false
1016
  },
1017
  "attributes": {}
1018
  }
1019
  },
1020
+ "total_flos": 4.940249406898176e+17,
1021
  "train_batch_size": 1,
1022
  "trial_name": null,
1023
  "trial_params": null