SystemAdmin123 committed
Commit 77e6e64 · verified · 1 Parent(s): 68d0d0e

Training in progress, step 2000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5bcd41d10593de3cdb9d40fb311ae2ff3910a2b47dbbe8786beb51948f14e6ff
+ oid sha256:39b65f1f5bdc8e68c678fe87b09e42b54b9eba1640a82fd33222297da0ee47b3
  size 2066752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d350466da0c8d34450032ea9f33f3752c1be3a599d9885109e18618f6872f48a
+ oid sha256:ef4e35f5dcad96cb1089fe9a38ce911a58e849647e472ddf5a739d5e6986aa33
  size 2162798
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ced0ac0d077b41bd2987add3782b7ce1140142ac3cddaf433babda96674c50fb
+ oid sha256:19ab3d6cfcb43de67f16e412d0cb4f86309db602f8242d16f2b203a0212d6cbb
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9ed6aad8025a80b776f2d50234fd05b8c1e2e758d3d427458fe15ed9bc7f733a
+ oid sha256:c88b3aeb8ec2bf995149291b90b69667d3f268ff2f13afbeab1a220b8cc27590
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.47351287363125183,
+ "epoch": 0.5918910920390648,
  "eval_steps": 200,
- "global_step": 1600,
+ "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1199,6 +1199,302 @@
  "eval_samples_per_second": 75.613,
  "eval_steps_per_second": 18.928,
  "step": 1600
+ },
+ {
+ "epoch": 0.47647232909144716,
+ "grad_norm": 0.52734375,
+ "learning_rate": 6.165528950410884e-05,
+ "loss": 8.6937,
+ "step": 1610
+ },
+ {
+ "epoch": 0.4794317845516425,
+ "grad_norm": 0.486328125,
+ "learning_rate": 6.0437013114095195e-05,
+ "loss": 8.6631,
+ "step": 1620
+ },
+ {
+ "epoch": 0.4823912400118378,
+ "grad_norm": 0.51953125,
+ "learning_rate": 5.922565910122967e-05,
+ "loss": 8.696,
+ "step": 1630
+ },
+ {
+ "epoch": 0.48535069547203313,
+ "grad_norm": 0.75,
+ "learning_rate": 5.8021439417389444e-05,
+ "loss": 8.6176,
+ "step": 1640
+ },
+ {
+ "epoch": 0.48831015093222846,
+ "grad_norm": 0.984375,
+ "learning_rate": 5.6824564766150726e-05,
+ "loss": 8.7082,
+ "step": 1650
+ },
+ {
+ "epoch": 0.4912696063924238,
+ "grad_norm": 0.470703125,
+ "learning_rate": 5.563524456592163e-05,
+ "loss": 8.6952,
+ "step": 1660
+ },
+ {
+ "epoch": 0.4942290618526191,
+ "grad_norm": 0.5,
+ "learning_rate": 5.4453686913300074e-05,
+ "loss": 8.678,
+ "step": 1670
+ },
+ {
+ "epoch": 0.49718851731281444,
+ "grad_norm": 0.5625,
+ "learning_rate": 5.328009854666303e-05,
+ "loss": 8.6815,
+ "step": 1680
+ },
+ {
+ "epoch": 0.5001479727730098,
+ "grad_norm": 0.703125,
+ "learning_rate": 5.2114684809993044e-05,
+ "loss": 8.6626,
+ "step": 1690
+ },
+ {
+ "epoch": 0.5031074282332051,
+ "grad_norm": 1.0390625,
+ "learning_rate": 5.095764961694922e-05,
+ "loss": 8.7641,
+ "step": 1700
+ },
+ {
+ "epoch": 0.5060668836934004,
+ "grad_norm": 0.515625,
+ "learning_rate": 4.980919541518796e-05,
+ "loss": 8.6364,
+ "step": 1710
+ },
+ {
+ "epoch": 0.5090263391535957,
+ "grad_norm": 0.482421875,
+ "learning_rate": 4.866952315094088e-05,
+ "loss": 8.689,
+ "step": 1720
+ },
+ {
+ "epoch": 0.511985794613791,
+ "grad_norm": 0.51953125,
+ "learning_rate": 4.753883223385467e-05,
+ "loss": 8.7382,
+ "step": 1730
+ },
+ {
+ "epoch": 0.5149452500739864,
+ "grad_norm": 0.62109375,
+ "learning_rate": 4.6417320502100316e-05,
+ "loss": 8.6902,
+ "step": 1740
+ },
+ {
+ "epoch": 0.5179047055341817,
+ "grad_norm": 1.0859375,
+ "learning_rate": 4.530518418775733e-05,
+ "loss": 8.6841,
+ "step": 1750
+ },
+ {
+ "epoch": 0.520864160994377,
+ "grad_norm": 0.478515625,
+ "learning_rate": 4.4202617882478405e-05,
+ "loss": 8.708,
+ "step": 1760
+ },
+ {
+ "epoch": 0.5238236164545723,
+ "grad_norm": 0.486328125,
+ "learning_rate": 4.310981450344189e-05,
+ "loss": 8.6534,
+ "step": 1770
+ },
+ {
+ "epoch": 0.5267830719147677,
+ "grad_norm": 0.5234375,
+ "learning_rate": 4.2026965259596666e-05,
+ "loss": 8.6607,
+ "step": 1780
+ },
+ {
+ "epoch": 0.529742527374963,
+ "grad_norm": 0.8515625,
+ "learning_rate": 4.0954259618206295e-05,
+ "loss": 8.6611,
+ "step": 1790
+ },
+ {
+ "epoch": 0.5327019828351583,
+ "grad_norm": 1.8984375,
+ "learning_rate": 3.9891885271697496e-05,
+ "loss": 8.6325,
+ "step": 1800
+ },
+ {
+ "epoch": 0.5327019828351583,
+ "eval_loss": 8.694791793823242,
+ "eval_runtime": 14.7074,
+ "eval_samples_per_second": 102.126,
+ "eval_steps_per_second": 25.565,
+ "step": 1800
+ },
+ {
+ "epoch": 0.5356614382953536,
+ "grad_norm": 0.50390625,
+ "learning_rate": 3.884002810481958e-05,
+ "loss": 8.6837,
+ "step": 1810
+ },
+ {
+ "epoch": 0.538620893755549,
+ "grad_norm": 0.490234375,
+ "learning_rate": 3.779887216211995e-05,
+ "loss": 8.6631,
+ "step": 1820
+ },
+ {
+ "epoch": 0.5415803492157443,
+ "grad_norm": 0.5546875,
+ "learning_rate": 3.676859961574162e-05,
+ "loss": 8.6576,
+ "step": 1830
+ },
+ {
+ "epoch": 0.5445398046759397,
+ "grad_norm": 0.69921875,
+ "learning_rate": 3.574939073354838e-05,
+ "loss": 8.7047,
+ "step": 1840
+ },
+ {
+ "epoch": 0.5474992601361349,
+ "grad_norm": 1.0546875,
+ "learning_rate": 3.4741423847583134e-05,
+ "loss": 8.7234,
+ "step": 1850
+ },
+ {
+ "epoch": 0.5504587155963303,
+ "grad_norm": 0.48046875,
+ "learning_rate": 3.3744875322865034e-05,
+ "loss": 8.7565,
+ "step": 1860
+ },
+ {
+ "epoch": 0.5534181710565256,
+ "grad_norm": 0.4609375,
+ "learning_rate": 3.275991952653054e-05,
+ "loss": 8.6812,
+ "step": 1870
+ },
+ {
+ "epoch": 0.556377626516721,
+ "grad_norm": 0.54296875,
+ "learning_rate": 3.178672879732435e-05,
+ "loss": 8.7074,
+ "step": 1880
+ },
+ {
+ "epoch": 0.5593370819769162,
+ "grad_norm": 0.6875,
+ "learning_rate": 3.0825473415445074e-05,
+ "loss": 8.6826,
+ "step": 1890
+ },
+ {
+ "epoch": 0.5622965374371116,
+ "grad_norm": 1.3359375,
+ "learning_rate": 2.9876321572751144e-05,
+ "loss": 8.7359,
+ "step": 1900
+ },
+ {
+ "epoch": 0.5652559928973069,
+ "grad_norm": 0.49609375,
+ "learning_rate": 2.8939439343332086e-05,
+ "loss": 8.6599,
+ "step": 1910
+ },
+ {
+ "epoch": 0.5682154483575023,
+ "grad_norm": 0.53515625,
+ "learning_rate": 2.8014990654450325e-05,
+ "loss": 8.631,
+ "step": 1920
+ },
+ {
+ "epoch": 0.5711749038176975,
+ "grad_norm": 0.55859375,
+ "learning_rate": 2.7103137257858868e-05,
+ "loss": 8.6579,
+ "step": 1930
+ },
+ {
+ "epoch": 0.5741343592778929,
+ "grad_norm": 0.59765625,
+ "learning_rate": 2.6204038701499056e-05,
+ "loss": 8.7039,
+ "step": 1940
+ },
+ {
+ "epoch": 0.5770938147380882,
+ "grad_norm": 1.2109375,
+ "learning_rate": 2.5317852301584643e-05,
+ "loss": 8.6511,
+ "step": 1950
+ },
+ {
+ "epoch": 0.5800532701982836,
+ "grad_norm": 0.4921875,
+ "learning_rate": 2.4444733115075823e-05,
+ "loss": 8.6733,
+ "step": 1960
+ },
+ {
+ "epoch": 0.5830127256584788,
+ "grad_norm": 0.48828125,
+ "learning_rate": 2.3584833912548888e-05,
+ "loss": 8.6404,
+ "step": 1970
+ },
+ {
+ "epoch": 0.5859721811186742,
+ "grad_norm": 0.53515625,
+ "learning_rate": 2.2738305151465645e-05,
+ "loss": 8.6784,
+ "step": 1980
+ },
+ {
+ "epoch": 0.5889316365788695,
+ "grad_norm": 0.67578125,
+ "learning_rate": 2.190529494984782e-05,
+ "loss": 8.6897,
+ "step": 1990
+ },
+ {
+ "epoch": 0.5918910920390648,
+ "grad_norm": 1.3046875,
+ "learning_rate": 2.1085949060360654e-05,
+ "loss": 8.6338,
+ "step": 2000
+ },
+ {
+ "epoch": 0.5918910920390648,
+ "eval_loss": 8.694610595703125,
+ "eval_runtime": 15.9292,
+ "eval_samples_per_second": 94.292,
+ "eval_steps_per_second": 23.604,
+ "step": 2000
  }
  ],
  "logging_steps": 10,
@@ -1218,7 +1514,7 @@
  "attributes": {}
  }
  },
- "total_flos": 41018144587776.0,
+ "total_flos": 51272680734720.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null