[
  {
    "loss": 7.1456,
    "grad_norm": 4.7170538902282715,
    "learning_rate": 3.824091778202677e-05,
    "epoch": 0.03824091778202677,
    "step": 20
  },
  {
    "loss": 7.1158,
    "grad_norm": 4.5134358406066895,
    "learning_rate": 7.648183556405354e-05,
    "epoch": 0.07648183556405354,
    "step": 40
  },
  {
    "loss": 7.0575,
    "grad_norm": 3.9098806381225586,
    "learning_rate": 0.0001147227533460803,
    "epoch": 0.1147227533460803,
    "step": 60
  },
  {
    "loss": 6.9743,
    "grad_norm": 3.2200050354003906,
    "learning_rate": 0.00015296367112810707,
    "epoch": 0.15296367112810708,
    "step": 80
  },
  {
    "loss": 6.874,
    "grad_norm": 2.5479934215545654,
    "learning_rate": 0.00019120458891013384,
    "epoch": 0.19120458891013384,
    "step": 100
  },
  {
    "loss": 6.7278,
    "grad_norm": 2.168301820755005,
    "learning_rate": 0.0002294455066921606,
    "epoch": 0.2294455066921606,
    "step": 120
  },
  {
    "loss": 6.5778,
    "grad_norm": 2.0083394050598145,
    "learning_rate": 0.0002676864244741874,
    "epoch": 0.2676864244741874,
    "step": 140
  },
  {
    "loss": 6.4192,
    "grad_norm": 1.8298897743225098,
    "learning_rate": 0.00030592734225621415,
    "epoch": 0.30592734225621415,
    "step": 160
  },
  {
    "loss": 6.2416,
    "grad_norm": 1.782423734664917,
    "learning_rate": 0.00034416826003824094,
    "epoch": 0.3441682600382409,
    "step": 180
  },
  {
    "loss": 6.0624,
    "grad_norm": 1.8139146566390991,
    "learning_rate": 0.0003824091778202677,
    "epoch": 0.3824091778202677,
    "step": 200
  },
  {
    "loss": 5.9686,
    "grad_norm": 1.7659958600997925,
    "learning_rate": 0.0004206500956022944,
    "epoch": 0.42065009560229444,
    "step": 220
  },
  {
    "loss": 5.8142,
    "grad_norm": 1.8660094738006592,
    "learning_rate": 0.0004588910133843212,
    "epoch": 0.4588910133843212,
    "step": 240
  },
  {
    "loss": 5.6944,
    "grad_norm": 1.831566333770752,
    "learning_rate": 0.0004971319311663481,
    "epoch": 0.497131931166348,
    "step": 260
  },
  {
    "loss": 5.6101,
    "grad_norm": 1.7546241283416748,
    "learning_rate": 0.0005353728489483748,
    "epoch": 0.5353728489483748,
    "step": 280
  },
  {
    "loss": 5.5192,
    "grad_norm": 1.8890600204467773,
    "learning_rate": 0.0005736137667304016,
    "epoch": 0.5736137667304015,
    "step": 300
  },
  {
    "loss": 5.3822,
    "grad_norm": 1.7542874813079834,
    "learning_rate": 0.0006118546845124283,
    "epoch": 0.6118546845124283,
    "step": 320
  },
  {
    "loss": 5.3236,
    "grad_norm": 1.8762731552124023,
    "learning_rate": 0.000650095602294455,
    "epoch": 0.6500956022944551,
    "step": 340
  },
  {
    "loss": 5.2483,
    "grad_norm": 1.886903166770935,
    "learning_rate": 0.0006883365200764819,
    "epoch": 0.6883365200764818,
    "step": 360
  },
  {
    "loss": 5.1245,
    "grad_norm": 1.9873583316802979,
    "learning_rate": 0.0007265774378585086,
    "epoch": 0.7265774378585086,
    "step": 380
  },
  {
    "loss": 5.0771,
    "grad_norm": 1.953506350517273,
    "learning_rate": 0.0007648183556405354,
    "epoch": 0.7648183556405354,
    "step": 400
  },
  {
    "loss": 5.0354,
    "grad_norm": 1.851192831993103,
    "learning_rate": 0.0008030592734225621,
    "epoch": 0.8030592734225621,
    "step": 420
  },
  {
    "loss": 4.9532,
    "grad_norm": 1.861971139907837,
    "learning_rate": 0.0008413001912045888,
    "epoch": 0.8413001912045889,
    "step": 440
  },
  {
    "loss": 4.8698,
    "grad_norm": 1.9388970136642456,
    "learning_rate": 0.0008795411089866157,
    "epoch": 0.8795411089866156,
    "step": 460
  },
  {
    "loss": 4.847,
    "grad_norm": 1.919184684753418,
    "learning_rate": 0.0009177820267686424,
    "epoch": 0.9177820267686424,
    "step": 480
  },
  {
    "loss": 4.7754,
    "grad_norm": 1.8886795043945312,
    "learning_rate": 0.0009560229445506692,
    "epoch": 0.9560229445506692,
    "step": 500
  },
  {
    "loss": 4.6728,
    "grad_norm": 1.9832813739776611,
    "learning_rate": 0.0009942638623326961,
    "epoch": 0.994263862332696,
    "step": 520
  },
  {
    "eval_loss": 4.345639228820801,
    "eval_accuracy": 0.15042045072317525,
    "eval_runtime": 211.2136,
    "eval_samples_per_second": 70.379,
    "eval_steps_per_second": 70.379,
    "epoch": 1.0,
    "step": 523
  },
  {
    "loss": 4.5657,
    "grad_norm": 1.831260323524475,
    "learning_rate": 0.0009963883577650309,
    "epoch": 1.0325047801147227,
    "step": 540
  },
  {
    "loss": 4.5047,
    "grad_norm": 1.934414029121399,
    "learning_rate": 0.0009921393669003612,
    "epoch": 1.0707456978967496,
    "step": 560
  },
  {
    "loss": 4.4165,
    "grad_norm": 1.7718721628189087,
    "learning_rate": 0.0009878903760356915,
    "epoch": 1.1089866156787762,
    "step": 580
  },
  {
    "loss": 4.3933,
    "grad_norm": 1.741455078125,
    "learning_rate": 0.0009836413851710218,
    "epoch": 1.147227533460803,
    "step": 600
  },
  {
    "loss": 4.3249,
    "grad_norm": 1.8857481479644775,
    "learning_rate": 0.0009793923943063523,
    "epoch": 1.1854684512428297,
    "step": 620
  },
  {
    "loss": 4.2147,
    "grad_norm": 1.8325748443603516,
    "learning_rate": 0.0009751434034416827,
    "epoch": 1.2237093690248566,
    "step": 640
  },
  {
    "loss": 4.1569,
    "grad_norm": 1.8758591413497925,
    "learning_rate": 0.000970894412577013,
    "epoch": 1.2619502868068833,
    "step": 660
  },
  {
    "loss": 4.131,
    "grad_norm": 1.899542212486267,
    "learning_rate": 0.0009666454217123433,
    "epoch": 1.3001912045889101,
    "step": 680
  },
  {
    "loss": 4.0467,
    "grad_norm": 1.8188538551330566,
    "learning_rate": 0.0009623964308476737,
    "epoch": 1.338432122370937,
    "step": 700
  },
  {
    "loss": 3.9904,
    "grad_norm": 1.7679705619812012,
    "learning_rate": 0.000958147439983004,
    "epoch": 1.3766730401529637,
    "step": 720
  },
  {
    "loss": 3.9464,
    "grad_norm": 1.849482774734497,
    "learning_rate": 0.0009538984491183344,
    "epoch": 1.4149139579349903,
    "step": 740
  },
  {
    "loss": 3.9104,
    "grad_norm": 1.8237632513046265,
    "learning_rate": 0.0009496494582536647,
    "epoch": 1.4531548757170172,
    "step": 760
  },
  {
    "loss": 3.8441,
    "grad_norm": 1.8175936937332153,
    "learning_rate": 0.0009454004673889951,
    "epoch": 1.491395793499044,
    "step": 780
  },
  {
    "loss": 3.7898,
    "grad_norm": 1.7967997789382935,
    "learning_rate": 0.0009411514765243255,
    "epoch": 1.5296367112810707,
    "step": 800
  },
  {
    "loss": 3.6894,
    "grad_norm": 1.7681634426116943,
    "learning_rate": 0.0009369024856596558,
    "epoch": 1.5678776290630974,
    "step": 820
  },
  {
    "loss": 3.6798,
    "grad_norm": 1.8655925989151,
    "learning_rate": 0.0009326534947949862,
    "epoch": 1.6061185468451242,
    "step": 840
  },
  {
    "loss": 3.6297,
    "grad_norm": 1.853769302368164,
    "learning_rate": 0.0009284045039303166,
    "epoch": 1.644359464627151,
    "step": 860
  },
  {
    "loss": 3.5592,
    "grad_norm": 1.8198288679122925,
    "learning_rate": 0.0009241555130656469,
    "epoch": 1.682600382409178,
    "step": 880
  },
  {
    "loss": 3.5056,
    "grad_norm": 1.7744460105895996,
    "learning_rate": 0.0009199065222009773,
    "epoch": 1.7208413001912046,
    "step": 900
  },
  {
    "loss": 3.4635,
    "grad_norm": 1.797914981842041,
    "learning_rate": 0.0009156575313363077,
    "epoch": 1.7590822179732313,
    "step": 920
  },
  {
    "loss": 3.4434,
    "grad_norm": 1.8479169607162476,
    "learning_rate": 0.000911408540471638,
    "epoch": 1.7973231357552581,
    "step": 940
  },
  {
    "loss": 3.441,
    "grad_norm": 1.818405032157898,
    "learning_rate": 0.0009071595496069684,
    "epoch": 1.835564053537285,
    "step": 960
  },
  {
    "loss": 3.3934,
    "grad_norm": 1.7609572410583496,
    "learning_rate": 0.0009029105587422988,
    "epoch": 1.8738049713193117,
    "step": 980
  },
  {
    "loss": 3.2961,
    "grad_norm": 1.7228211164474487,
    "learning_rate": 0.0008986615678776291,
    "epoch": 1.9120458891013383,
    "step": 1000
  },
  {
    "loss": 3.2611,
    "grad_norm": 1.8148291110992432,
    "learning_rate": 0.0008944125770129595,
    "epoch": 1.9502868068833652,
    "step": 1020
  },
  {
    "loss": 3.224,
    "grad_norm": 1.933300495147705,
    "learning_rate": 0.0008901635861482899,
    "epoch": 1.988527724665392,
    "step": 1040
  },
  {
    "eval_loss": 2.258894205093384,
    "eval_accuracy": 0.5140935082408342,
    "eval_runtime": 203.9361,
    "eval_samples_per_second": 72.89,
    "eval_steps_per_second": 72.89,
    "epoch": 2.0,
    "step": 1046
  },
  {
    "loss": 3.1667,
    "grad_norm": 1.819346308708191,
    "learning_rate": 0.0008859145952836202,
    "epoch": 2.026768642447419,
    "step": 1060
  },
  {
    "loss": 3.0232,
    "grad_norm": 1.7024896144866943,
    "learning_rate": 0.0008816656044189504,
    "epoch": 2.0650095602294454,
    "step": 1080
  },
  {
    "loss": 3.0489,
    "grad_norm": 1.7023948431015015,
    "learning_rate": 0.000877416613554281,
    "epoch": 2.1032504780114722,
    "step": 1100
  },
  {
    "loss": 2.9732,
    "grad_norm": 1.804140329360962,
    "learning_rate": 0.0008731676226896112,
    "epoch": 2.141491395793499,
    "step": 1120
  },
  {
    "loss": 2.9562,
    "grad_norm": 1.7260992527008057,
    "learning_rate": 0.0008689186318249415,
    "epoch": 2.179732313575526,
    "step": 1140
  },
  {
    "loss": 2.8875,
    "grad_norm": 1.7970356941223145,
    "learning_rate": 0.000864669640960272,
    "epoch": 2.2179732313575524,
    "step": 1160
  },
  {
    "loss": 2.916,
    "grad_norm": 1.8579261302947998,
    "learning_rate": 0.0008604206500956023,
    "epoch": 2.2562141491395793,
    "step": 1180
  },
  {
    "loss": 2.8963,
    "grad_norm": 1.852342128753662,
    "learning_rate": 0.0008561716592309326,
    "epoch": 2.294455066921606,
    "step": 1200
  },
  {
    "loss": 2.8029,
    "grad_norm": 1.8845752477645874,
    "learning_rate": 0.000851922668366263,
    "epoch": 2.332695984703633,
    "step": 1220
  },
  {
    "loss": 2.8237,
    "grad_norm": 1.883952260017395,
    "learning_rate": 0.0008476736775015934,
    "epoch": 2.3709369024856595,
    "step": 1240
  },
  {
    "loss": 2.8473,
    "grad_norm": 1.8383756875991821,
    "learning_rate": 0.0008434246866369237,
    "epoch": 2.4091778202676863,
    "step": 1260
  },
  {
    "loss": 2.7722,
    "grad_norm": 1.8900470733642578,
    "learning_rate": 0.0008391756957722541,
    "epoch": 2.447418738049713,
    "step": 1280
  },
  {
    "loss": 2.7584,
    "grad_norm": 1.8097845315933228,
    "learning_rate": 0.0008349267049075845,
    "epoch": 2.48565965583174,
    "step": 1300
  },
  {
    "loss": 2.7134,
    "grad_norm": 1.7215895652770996,
    "learning_rate": 0.0008306777140429148,
    "epoch": 2.5239005736137665,
    "step": 1320
  },
  {
    "loss": 2.6531,
    "grad_norm": 1.8249051570892334,
    "learning_rate": 0.0008264287231782451,
    "epoch": 2.5621414913957934,
    "step": 1340
  },
  {
    "loss": 2.6675,
    "grad_norm": 1.8082237243652344,
    "learning_rate": 0.0008221797323135756,
    "epoch": 2.6003824091778203,
    "step": 1360
  },
  {
    "loss": 2.5702,
    "grad_norm": 1.7981261014938354,
    "learning_rate": 0.0008179307414489059,
    "epoch": 2.638623326959847,
    "step": 1380
  },
  {
    "loss": 2.6339,
    "grad_norm": 1.6964036226272583,
    "learning_rate": 0.0008136817505842362,
    "epoch": 2.676864244741874,
    "step": 1400
  },
  {
    "loss": 2.5489,
    "grad_norm": 1.755050778388977,
    "learning_rate": 0.0008094327597195667,
    "epoch": 2.7151051625239004,
    "step": 1420
  },
  {
    "loss": 2.5908,
    "grad_norm": 1.7242581844329834,
    "learning_rate": 0.000805183768854897,
    "epoch": 2.7533460803059273,
    "step": 1440
  },
  {
    "loss": 2.5143,
    "grad_norm": 1.819612741470337,
    "learning_rate": 0.0008009347779902273,
    "epoch": 2.791586998087954,
    "step": 1460
  },
  {
    "loss": 2.4662,
    "grad_norm": 1.7033363580703735,
    "learning_rate": 0.0007966857871255578,
    "epoch": 2.8298279158699806,
    "step": 1480
  },
  {
    "loss": 2.4044,
    "grad_norm": 1.7662159204483032,
    "learning_rate": 0.000792436796260888,
    "epoch": 2.8680688336520075,
    "step": 1500
  },
  {
    "loss": 2.4636,
    "grad_norm": 1.7460269927978516,
    "learning_rate": 0.0007881878053962183,
    "epoch": 2.9063097514340344,
    "step": 1520
  },
  {
    "loss": 2.3955,
    "grad_norm": 1.8268380165100098,
    "learning_rate": 0.0007839388145315488,
    "epoch": 2.9445506692160612,
    "step": 1540
  },
  {
    "loss": 2.3964,
    "grad_norm": 1.796981930732727,
    "learning_rate": 0.0007796898236668791,
    "epoch": 2.982791586998088,
    "step": 1560
  },
  {
    "eval_loss": 1.4662528038024902,
    "eval_accuracy": 0.6835519677093844,
    "eval_runtime": 419.2044,
    "eval_samples_per_second": 35.46,
    "eval_steps_per_second": 35.46,
    "epoch": 3.0,
    "step": 1569
  },
  {
    "loss": 2.3174,
    "grad_norm": 1.7852272987365723,
    "learning_rate": 0.0007754408328022094,
    "epoch": 3.0210325047801145,
    "step": 1580
  },
  {
    "loss": 2.2913,
    "grad_norm": 1.8464534282684326,
    "learning_rate": 0.0007711918419375399,
    "epoch": 3.0592734225621414,
    "step": 1600
  },
  {
    "loss": 2.2856,
    "grad_norm": 1.7783145904541016,
    "learning_rate": 0.0007669428510728702,
    "epoch": 3.0975143403441683,
    "step": 1620
  },
  {
    "loss": 2.2099,
    "grad_norm": 1.744454264640808,
    "learning_rate": 0.0007626938602082005,
    "epoch": 3.135755258126195,
    "step": 1640
  },
  {
    "loss": 2.23,
    "grad_norm": 1.8276797533035278,
    "learning_rate": 0.0007584448693435309,
    "epoch": 3.173996175908222,
    "step": 1660
  },
  {
    "loss": 2.1912,
    "grad_norm": 1.8144315481185913,
    "learning_rate": 0.0007541958784788613,
    "epoch": 3.2122370936902485,
    "step": 1680
  },
  {
    "loss": 2.1818,
    "grad_norm": 1.8499830961227417,
    "learning_rate": 0.0007499468876141916,
    "epoch": 3.2504780114722753,
    "step": 1700
  },
  {
    "loss": 2.1349,
    "grad_norm": 1.7623099088668823,
    "learning_rate": 0.000745697896749522,
    "epoch": 3.288718929254302,
    "step": 1720
  },
  {
    "loss": 2.1055,
    "grad_norm": 1.8180640935897827,
    "learning_rate": 0.0007414489058848524,
    "epoch": 3.3269598470363286,
    "step": 1740
  },
  {
    "loss": 2.1077,
    "grad_norm": 1.8159964084625244,
    "learning_rate": 0.0007371999150201827,
    "epoch": 3.3652007648183555,
    "step": 1760
  },
  {
    "loss": 2.0999,
    "grad_norm": 1.7902129888534546,
    "learning_rate": 0.0007329509241555131,
    "epoch": 3.4034416826003824,
    "step": 1780
  },
  {
    "loss": 2.1188,
    "grad_norm": 1.7685898542404175,
    "learning_rate": 0.0007287019332908435,
    "epoch": 3.4416826003824093,
    "step": 1800
  },
  {
    "loss": 2.0956,
    "grad_norm": 1.758325219154358,
    "learning_rate": 0.0007244529424261738,
    "epoch": 3.479923518164436,
    "step": 1820
  },
  {
    "loss": 2.0488,
    "grad_norm": 1.7802537679672241,
    "learning_rate": 0.0007202039515615042,
    "epoch": 3.5181644359464626,
    "step": 1840
  },
  {
    "loss": 2.0776,
    "grad_norm": 1.8220280408859253,
    "learning_rate": 0.0007159549606968346,
    "epoch": 3.5564053537284894,
    "step": 1860
  },
  {
    "loss": 2.0257,
    "grad_norm": 1.8494378328323364,
    "learning_rate": 0.0007117059698321649,
    "epoch": 3.5946462715105163,
    "step": 1880
  },
  {
    "loss": 2.0332,
    "grad_norm": 1.719109296798706,
    "learning_rate": 0.0007074569789674953,
    "epoch": 3.632887189292543,
    "step": 1900
  },
  {
    "loss": 2.0052,
    "grad_norm": 1.9517509937286377,
    "learning_rate": 0.0007032079881028257,
    "epoch": 3.67112810707457,
    "step": 1920
  },
  {
    "loss": 1.9901,
    "grad_norm": 1.7318940162658691,
    "learning_rate": 0.0006989589972381559,
    "epoch": 3.7093690248565965,
    "step": 1940
  },
  {
    "loss": 1.9411,
    "grad_norm": 1.767015814781189,
    "learning_rate": 0.0006947100063734863,
    "epoch": 3.7476099426386233,
    "step": 1960
  },
  {
    "loss": 2.0048,
    "grad_norm": 1.761806607246399,
    "learning_rate": 0.0006904610155088166,
    "epoch": 3.78585086042065,
    "step": 1980
  },
  {
    "loss": 1.9312,
    "grad_norm": 1.7126002311706543,
    "learning_rate": 0.000686212024644147,
    "epoch": 3.8240917782026767,
    "step": 2000
  },
  {
    "loss": 1.9443,
    "grad_norm": 1.7167437076568604,
    "learning_rate": 0.0006819630337794774,
    "epoch": 3.8623326959847035,
    "step": 2020
  },
  {
    "loss": 1.893,
    "grad_norm": 1.749881386756897,
    "learning_rate": 0.0006777140429148077,
    "epoch": 3.9005736137667304,
    "step": 2040
  },
  {
    "loss": 1.8876,
    "grad_norm": 1.6846592426300049,
    "learning_rate": 0.0006734650520501381,
    "epoch": 3.9388145315487573,
    "step": 2060
  },
  {
    "loss": 1.8474,
    "grad_norm": 1.8149057626724243,
    "learning_rate": 0.0006692160611854685,
    "epoch": 3.977055449330784,
    "step": 2080
  },
  {
    "eval_loss": 0.9547563195228577,
    "eval_accuracy": 0.7926673393878237,
    "eval_runtime": 183.3001,
    "eval_samples_per_second": 81.097,
    "eval_steps_per_second": 81.097,
    "epoch": 4.0,
    "step": 2092
  },
  {
    "loss": 1.813,
    "grad_norm": 1.6455098390579224,
    "learning_rate": 0.0006649670703207988,
    "epoch": 4.015296367112811,
    "step": 2100
  },
  {
    "loss": 1.7354,
    "grad_norm": 1.6958200931549072,
    "learning_rate": 0.0006607180794561292,
    "epoch": 4.053537284894838,
    "step": 2120
  },
  {
    "loss": 1.7479,
    "grad_norm": 1.7456037998199463,
    "learning_rate": 0.0006564690885914596,
    "epoch": 4.091778202676864,
    "step": 2140
  },
  {
    "loss": 1.7138,
    "grad_norm": 1.7887734174728394,
    "learning_rate": 0.0006522200977267899,
    "epoch": 4.130019120458891,
    "step": 2160
  },
  {
    "loss": 1.7023,
    "grad_norm": 1.7080284357070923,
    "learning_rate": 0.0006479711068621203,
    "epoch": 4.168260038240918,
    "step": 2180
  },
  {
    "loss": 1.7526,
    "grad_norm": 1.8061983585357666,
    "learning_rate": 0.0006437221159974506,
    "epoch": 4.2065009560229445,
    "step": 2200
  },
  {
    "loss": 1.7474,
    "grad_norm": 1.7831811904907227,
    "learning_rate": 0.000639473125132781,
    "epoch": 4.244741873804971,
    "step": 2220
  },
  {
    "loss": 1.6688,
    "grad_norm": 1.752357840538025,
    "learning_rate": 0.0006352241342681113,
    "epoch": 4.282982791586998,
    "step": 2240
  },
  {
    "loss": 1.7009,
    "grad_norm": 1.7843034267425537,
    "learning_rate": 0.0006309751434034417,
    "epoch": 4.321223709369025,
    "step": 2260
  },
  {
    "loss": 1.6727,
    "grad_norm": 1.7608367204666138,
    "learning_rate": 0.0006267261525387721,
    "epoch": 4.359464627151052,
    "step": 2280
  },
  {
    "loss": 1.6801,
    "grad_norm": 1.6877254247665405,
    "learning_rate": 0.0006224771616741024,
    "epoch": 4.397705544933078,
    "step": 2300
  },
  {
    "loss": 1.7108,
    "grad_norm": 1.7891350984573364,
    "learning_rate": 0.0006182281708094328,
    "epoch": 4.435946462715105,
    "step": 2320
  },
  {
    "loss": 1.6442,
    "grad_norm": 1.7104123830795288,
    "learning_rate": 0.0006139791799447631,
    "epoch": 4.474187380497132,
    "step": 2340
  },
  {
    "loss": 1.6531,
    "grad_norm": 1.7026969194412231,
    "learning_rate": 0.0006097301890800934,
    "epoch": 4.512428298279159,
    "step": 2360
  },
  {
    "loss": 1.6539,
    "grad_norm": 1.7890552282333374,
    "learning_rate": 0.0006054811982154238,
    "epoch": 4.550669216061186,
    "step": 2380
  },
  {
    "loss": 1.6681,
    "grad_norm": 1.8423861265182495,
    "learning_rate": 0.0006012322073507542,
    "epoch": 4.588910133843212,
    "step": 2400
  },
  {
    "loss": 1.5935,
    "grad_norm": 1.6434499025344849,
    "learning_rate": 0.0005969832164860845,
    "epoch": 4.627151051625239,
    "step": 2420
  },
  {
    "loss": 1.6273,
    "grad_norm": 1.7261130809783936,
    "learning_rate": 0.0005927342256214149,
    "epoch": 4.665391969407266,
    "step": 2440
  },
  {
    "loss": 1.6181,
    "grad_norm": 1.7288273572921753,
    "learning_rate": 0.0005884852347567453,
    "epoch": 4.7036328871892925,
    "step": 2460
  },
  {
    "loss": 1.5719,
    "grad_norm": 1.773258924484253,
    "learning_rate": 0.0005842362438920756,
    "epoch": 4.741873804971319,
    "step": 2480
  },
  {
    "loss": 1.578,
    "grad_norm": 1.7676658630371094,
    "learning_rate": 0.000579987253027406,
    "epoch": 4.780114722753346,
    "step": 2500
  },
  {
    "loss": 1.535,
    "grad_norm": 1.8115794658660889,
    "learning_rate": 0.0005757382621627364,
    "epoch": 4.818355640535373,
    "step": 2520
  },
  {
    "loss": 1.5493,
    "grad_norm": 1.7989414930343628,
    "learning_rate": 0.0005714892712980667,
    "epoch": 4.8565965583174,
    "step": 2540
  },
  {
    "loss": 1.5489,
    "grad_norm": 1.6607849597930908,
    "learning_rate": 0.000567240280433397,
    "epoch": 4.894837476099426,
    "step": 2560
  },
  {
    "loss": 1.5091,
    "grad_norm": 1.630257487297058,
    "learning_rate": 0.0005629912895687275,
    "epoch": 4.933078393881453,
    "step": 2580
  },
  {
    "loss": 1.5275,
    "grad_norm": 1.7995944023132324,
    "learning_rate": 0.0005587422987040578,
    "epoch": 4.97131931166348,
    "step": 2600
  },
  {
    "eval_loss": 0.6697778105735779,
    "eval_accuracy": 0.8571140262361251,
    "eval_runtime": 181.3784,
    "eval_samples_per_second": 81.956,
    "eval_steps_per_second": 81.956,
    "epoch": 5.0,
    "step": 2615
  },
  {
    "loss": 1.4774,
    "grad_norm": 1.7868553400039673,
    "learning_rate": 0.0005544933078393881,
    "epoch": 5.009560229445507,
    "step": 2620
  },
  {
    "loss": 1.3955,
    "grad_norm": 1.6380654573440552,
    "learning_rate": 0.0005502443169747186,
    "epoch": 5.047801147227533,
    "step": 2640
  },
  {
    "loss": 1.4414,
    "grad_norm": 1.7844533920288086,
    "learning_rate": 0.0005459953261100489,
    "epoch": 5.08604206500956,
    "step": 2660
  },
  {
    "loss": 1.3782,
    "grad_norm": 1.779080867767334,
    "learning_rate": 0.0005417463352453792,
    "epoch": 5.124282982791587,
    "step": 2680
  },
  {
    "loss": 1.4152,
    "grad_norm": 1.741326928138733,
    "learning_rate": 0.0005374973443807097,
    "epoch": 5.162523900573614,
    "step": 2700
  },
  {
    "loss": 1.3996,
    "grad_norm": 1.7447401285171509,
    "learning_rate": 0.00053324835351604,
    "epoch": 5.2007648183556405,
    "step": 2720
  },
  {
    "loss": 1.4137,
    "grad_norm": 1.8067736625671387,
    "learning_rate": 0.0005289993626513702,
    "epoch": 5.239005736137667,
    "step": 2740
  },
  {
    "loss": 1.3937,
    "grad_norm": 1.7393046617507935,
    "learning_rate": 0.0005247503717867008,
    "epoch": 5.277246653919694,
    "step": 2760
  },
  {
    "loss": 1.3912,
    "grad_norm": 1.756184458732605,
    "learning_rate": 0.000520501380922031,
    "epoch": 5.315487571701721,
    "step": 2780
  },
  {
    "loss": 1.387,
    "grad_norm": 1.7133733034133911,
    "learning_rate": 0.0005162523900573613,
    "epoch": 5.353728489483748,
    "step": 2800
  },
  {
    "loss": 1.3551,
    "grad_norm": 1.6597713232040405,
    "learning_rate": 0.0005120033991926918,
    "epoch": 5.3919694072657744,
    "step": 2820
  },
  {
    "loss": 1.3557,
    "grad_norm": 1.8462845087051392,
    "learning_rate": 0.0005077544083280221,
    "epoch": 5.430210325047801,
    "step": 2840
  },
  {
    "loss": 1.3495,
    "grad_norm": 1.6737143993377686,
    "learning_rate": 0.0005035054174633524,
    "epoch": 5.468451242829828,
    "step": 2860
  },
  {
    "loss": 1.394,
    "grad_norm": 1.7071157693862915,
    "learning_rate": 0.0004992564265986828,
    "epoch": 5.506692160611855,
    "step": 2880
  },
  {
    "loss": 1.3263,
    "grad_norm": 1.663072943687439,
    "learning_rate": 0.0004950074357340132,
    "epoch": 5.544933078393882,
    "step": 2900
  },
  {
    "loss": 1.3474,
    "grad_norm": 1.640093207359314,
    "learning_rate": 0.0004907584448693436,
    "epoch": 5.583173996175908,
    "step": 2920
  },
  {
    "loss": 1.3375,
    "grad_norm": 1.762568712234497,
    "learning_rate": 0.0004865094540046739,
    "epoch": 5.621414913957935,
    "step": 2940
  },
  {
    "loss": 1.3218,
    "grad_norm": 1.6714434623718262,
    "learning_rate": 0.00048226046314000425,
    "epoch": 5.659655831739962,
    "step": 2960
  },
  {
    "loss": 1.3008,
    "grad_norm": 1.7594107389450073,
    "learning_rate": 0.0004780114722753346,
    "epoch": 5.6978967495219885,
    "step": 2980
  },
  {
    "loss": 1.3331,
    "grad_norm": 1.6483973264694214,
    "learning_rate": 0.000473762481410665,
    "epoch": 5.736137667304015,
    "step": 3000
  },
  {
    "loss": 1.2775,
    "grad_norm": 1.7252651453018188,
    "learning_rate": 0.00046951349054599533,
    "epoch": 5.774378585086042,
    "step": 3020
  },
  {
    "loss": 1.2747,
    "grad_norm": 1.7860745191574097,
    "learning_rate": 0.0004652644996813257,
    "epoch": 5.812619502868069,
    "step": 3040
  },
  {
    "loss": 1.2946,
    "grad_norm": 1.749874234199524,
    "learning_rate": 0.0004610155088166561,
    "epoch": 5.850860420650095,
    "step": 3060
  },
  {
    "loss": 1.2849,
    "grad_norm": 1.7197644710540771,
    "learning_rate": 0.0004567665179519864,
    "epoch": 5.8891013384321225,
    "step": 3080
  },
  {
    "loss": 1.2544,
    "grad_norm": 1.6396132707595825,
    "learning_rate": 0.00045251752708731676,
    "epoch": 5.927342256214149,
    "step": 3100
  },
  {
    "loss": 1.248,
    "grad_norm": 1.720376968383789,
    "learning_rate": 0.0004482685362226471,
    "epoch": 5.965583173996176,
    "step": 3120
  },
  {
    "eval_loss": 0.5270123481750488,
    "eval_accuracy": 0.8899428187016482,
    "eval_runtime": 183.7621,
    "eval_samples_per_second": 80.893,
    "eval_steps_per_second": 80.893,
    "epoch": 6.0,
    "step": 3138
  },
  {
    "loss": 1.2398,
    "grad_norm": 1.5206599235534668,
    "learning_rate": 0.0004440195453579775,
    "epoch": 6.003824091778203,
    "step": 3140
  },
  {
    "loss": 1.2097,
    "grad_norm": 1.7172082662582397,
    "learning_rate": 0.00043977055449330785,
    "epoch": 6.042065009560229,
    "step": 3160
  },
  {
    "loss": 1.1669,
    "grad_norm": 1.5570909976959229,
    "learning_rate": 0.0004355215636286382,
    "epoch": 6.080305927342256,
    "step": 3180
  },
  {
    "loss": 1.1647,
    "grad_norm": 1.7044614553451538,
    "learning_rate": 0.0004312725727639686,
    "epoch": 6.118546845124283,
    "step": 3200
  },
  {
    "loss": 1.1627,
    "grad_norm": 1.5819571018218994,
    "learning_rate": 0.0004270235818992989,
    "epoch": 6.15678776290631,
    "step": 3220
  },
  {
    "loss": 1.1728,
    "grad_norm": 1.7076871395111084,
    "learning_rate": 0.0004227745910346293,
    "epoch": 6.195028680688337,
    "step": 3240
  },
  {
    "loss": 1.1459,
    "grad_norm": 1.7301490306854248,
    "learning_rate": 0.0004185256001699597,
    "epoch": 6.233269598470363,
    "step": 3260
  },
  {
    "loss": 1.1676,
    "grad_norm": 1.7135626077651978,
    "learning_rate": 0.00041427660930528997,
    "epoch": 6.27151051625239,
    "step": 3280
  },
  {
    "loss": 1.1488,
    "grad_norm": 1.602142572402954,
    "learning_rate": 0.00041002761844062037,
    "epoch": 6.309751434034417,
    "step": 3300
  },
  {
    "loss": 1.1395,
    "grad_norm": 1.755293846130371,
    "learning_rate": 0.00040577862757595076,
    "epoch": 6.347992351816444,
    "step": 3320
  },
  {
    "loss": 1.1321,
    "grad_norm": 1.663662314414978,
    "learning_rate": 0.00040152963671128105,
    "epoch": 6.3862332695984705,
    "step": 3340
  },
  {
    "loss": 1.1317,
    "grad_norm": 1.7366993427276611,
    "learning_rate": 0.00039728064584661145,
    "epoch": 6.424474187380497,
    "step": 3360
  },
  {
    "loss": 1.1449,
    "grad_norm": 1.7560149431228638,
    "learning_rate": 0.0003930316549819418,
    "epoch": 6.462715105162524,
    "step": 3380
  },
  {
    "loss": 1.13,
    "grad_norm": 1.7576582431793213,
    "learning_rate": 0.00038878266411727214,
    "epoch": 6.500956022944551,
    "step": 3400
  },
  {
    "loss": 1.1419,
    "grad_norm": 1.7916873693466187,
    "learning_rate": 0.00038453367325260254,
    "epoch": 6.539196940726577,
    "step": 3420
  },
  {
    "loss": 1.1107,
    "grad_norm": 1.5987508296966553,
    "learning_rate": 0.0003802846823879329,
    "epoch": 6.577437858508604,
    "step": 3440
  },
  {
    "loss": 1.1162,
    "grad_norm": 1.8192518949508667,
    "learning_rate": 0.0003760356915232632,
    "epoch": 6.615678776290631,
    "step": 3460
  },
  {
    "loss": 1.1255,
    "grad_norm": 1.7236486673355103,
    "learning_rate": 0.0003717867006585936,
    "epoch": 6.653919694072657,
    "step": 3480
  },
  {
    "loss": 1.0629,
    "grad_norm": 1.8209389448165894,
    "learning_rate": 0.0003675377097939239,
    "epoch": 6.692160611854685,
    "step": 3500
  },
  {
    "loss": 1.0809,
    "grad_norm": 1.652782678604126,
    "learning_rate": 0.0003632887189292543,
    "epoch": 6.730401529636711,
    "step": 3520
  },
  {
    "loss": 1.1286,
    "grad_norm": 1.6148645877838135,
    "learning_rate": 0.00035903972806458466,
    "epoch": 6.768642447418738,
    "step": 3540
  },
  {
    "loss": 1.1069,
    "grad_norm": 1.6869423389434814,
    "learning_rate": 0.000354790737199915,
    "epoch": 6.806883365200765,
    "step": 3560
  },
  {
    "loss": 1.0911,
    "grad_norm": 1.6373172998428345,
    "learning_rate": 0.0003505417463352454,
    "epoch": 6.845124282982791,
    "step": 3580
  },
  {
    "loss": 1.0808,
    "grad_norm": 1.6761549711227417,
    "learning_rate": 0.00034629275547057574,
    "epoch": 6.8833652007648185,
    "step": 3600
  },
  {
    "loss": 1.0809,
    "grad_norm": 1.6510460376739502,
    "learning_rate": 0.0003420437646059061,
    "epoch": 6.921606118546845,
    "step": 3620
  },
  {
    "loss": 1.0912,
    "grad_norm": 1.7351855039596558,
    "learning_rate": 0.0003377947737412365,
    "epoch": 6.959847036328872,
    "step": 3640
  },
  {
    "loss": 1.0991,
    "grad_norm": 1.7165274620056152,
    "learning_rate": 0.00033354578287656683,
    "epoch": 6.998087954110899,
    "step": 3660
  },
  {
    "eval_loss": 0.44995447993278503,
    "eval_accuracy": 0.9037336024217961,
    "eval_runtime": 189.5634,
    "eval_samples_per_second": 78.417,
    "eval_steps_per_second": 78.417,
    "epoch": 7.0,
    "step": 3661
  },
  {
    "loss": 1.0154,
    "grad_norm": 1.6468501091003418,
    "learning_rate": 0.0003292967920118972,
    "epoch": 7.036328871892925,
    "step": 3680
  },
  {
    "loss": 1.0378,
    "grad_norm": 1.79421067237854,
    "learning_rate": 0.0003250478011472275,
    "epoch": 7.074569789674952,
    "step": 3700
  },
  {
    "loss": 1.0145,
    "grad_norm": 1.7234885692596436,
    "learning_rate": 0.0003207988102825579,
    "epoch": 7.112810707456979,
    "step": 3720
  },
  {
    "loss": 1.0012,
    "grad_norm": 1.6947157382965088,
    "learning_rate": 0.00031654981941788826,
    "epoch": 7.151051625239006,
    "step": 3740
  },
  {
    "loss": 1.0191,
    "grad_norm": 1.6818758249282837,
    "learning_rate": 0.0003123008285532186,
    "epoch": 7.189292543021033,
    "step": 3760
  },
  {
    "loss": 1.0437,
    "grad_norm": 1.557080864906311,
    "learning_rate": 0.000308051837688549,
    "epoch": 7.227533460803059,
    "step": 3780
  },
  {
    "loss": 1.0079,
    "grad_norm": 1.6532793045043945,
    "learning_rate": 0.00030380284682387935,
    "epoch": 7.265774378585086,
    "step": 3800
  },
  {
    "loss": 0.9904,
    "grad_norm": 1.646686315536499,
    "learning_rate": 0.0002995538559592097,
    "epoch": 7.304015296367113,
    "step": 3820
  },
  {
    "loss": 1.0002,
    "grad_norm": 1.6772829294204712,
    "learning_rate": 0.0002953048650945401,
    "epoch": 7.342256214149139,
    "step": 3840
  },
  {
    "loss": 0.9674,
    "grad_norm": 1.6452054977416992,
    "learning_rate": 0.0002910558742298704,
    "epoch": 7.3804971319311665,
    "step": 3860
  },
  {
    "loss": 0.9642,
    "grad_norm": 1.592207908630371,
    "learning_rate": 0.0002868068833652008,
    "epoch": 7.418738049713193,
    "step": 3880
  },
  {
    "loss": 0.973,
    "grad_norm": 1.7015941143035889,
    "learning_rate": 0.0002825578925005312,
    "epoch": 7.45697896749522,
    "step": 3900
  },
  {
    "loss": 0.9803,
    "grad_norm": 1.6589232683181763,
    "learning_rate": 0.00027830890163586146,
    "epoch": 7.495219885277247,
    "step": 3920
  },
  {
    "loss": 0.9702,
    "grad_norm": 1.660190463066101,
    "learning_rate": 0.00027405991077119186,
    "epoch": 7.533460803059273,
    "step": 3940
  },
  {
    "loss": 0.9919,
    "grad_norm": 1.7052509784698486,
    "learning_rate": 0.00026981091990652226,
    "epoch": 7.5717017208413,
    "step": 3960
  },
  {
    "loss": 0.938,
    "grad_norm": 1.6874445676803589,
    "learning_rate": 0.00026556192904185255,
    "epoch": 7.609942638623327,
    "step": 3980
  },
  {
    "loss": 0.9564,
    "grad_norm": 1.811640739440918,
    "learning_rate": 0.00026131293817718295,
    "epoch": 7.648183556405353,
    "step": 4000
  },
  {
    "loss": 0.9432,
    "grad_norm": 1.741968035697937,
    "learning_rate": 0.00025706394731251324,
    "epoch": 7.686424474187381,
    "step": 4020
  },
  {
    "loss": 0.9082,
    "grad_norm": 1.6731518507003784,
    "learning_rate": 0.00025281495644784364,
    "epoch": 7.724665391969407,
    "step": 4040
  },
  {
    "loss": 0.92,
    "grad_norm": 1.7399870157241821,
    "learning_rate": 0.00024856596558317403,
    "epoch": 7.762906309751434,
    "step": 4060
  },
  {
    "loss": 0.9433,
    "grad_norm": 1.580674171447754,
    "learning_rate": 0.0002443169747185044,
    "epoch": 7.801147227533461,
    "step": 4080
  },
  {
    "loss": 0.9584,
    "grad_norm": 1.5683550834655762,
    "learning_rate": 0.00024006798385383472,
    "epoch": 7.839388145315487,
    "step": 4100
  },
  {
    "loss": 0.9318,
    "grad_norm": 1.7664682865142822,
    "learning_rate": 0.00023581899298916507,
    "epoch": 7.8776290630975145,
    "step": 4120
  },
  {
    "loss": 0.8852,
    "grad_norm": 1.5522887706756592,
    "learning_rate": 0.00023157000212449544,
    "epoch": 7.915869980879541,
    "step": 4140
  },
  {
    "loss": 0.9121,
    "grad_norm": 1.626836895942688,
    "learning_rate": 0.00022732101125982578,
    "epoch": 7.954110898661568,
    "step": 4160
  },
  {
    "loss": 0.9221,
    "grad_norm": 1.8042898178100586,
    "learning_rate": 0.00022307202039515615,
    "epoch": 7.992351816443595,
    "step": 4180
  },
  {
    "eval_loss": 0.3572401702404022,
    "eval_accuracy": 0.9266733938782374,
    "eval_runtime": 168.0336,
    "eval_samples_per_second": 88.464,
    "eval_steps_per_second": 88.464,
    "epoch": 8.0,
    "step": 4184
  },
  {
    "loss": 0.8979,
    "grad_norm": 1.545024037361145,
    "learning_rate": 0.00021882302953048652,
    "epoch": 8.030592734225621,
    "step": 4200
  },
  {
    "loss": 0.8464,
    "grad_norm": 1.592607855796814,
    "learning_rate": 0.00021457403866581687,
    "epoch": 8.068833652007648,
    "step": 4220
  },
  {
    "loss": 0.8634,
    "grad_norm": 1.5347646474838257,
    "learning_rate": 0.0002103250478011472,
    "epoch": 8.107074569789676,
    "step": 4240
  },
  {
    "loss": 0.8425,
    "grad_norm": 1.621201515197754,
    "learning_rate": 0.0002060760569364776,
    "epoch": 8.145315487571702,
    "step": 4260
  },
  {
    "loss": 0.8776,
    "grad_norm": 1.7381062507629395,
    "learning_rate": 0.00020182706607180795,
    "epoch": 8.183556405353729,
    "step": 4280
  },
  {
    "loss": 0.854,
    "grad_norm": 1.5798373222351074,
    "learning_rate": 0.0001975780752071383,
    "epoch": 8.221797323135755,
    "step": 4300
  },
  {
    "loss": 0.8646,
    "grad_norm": 1.5751338005065918,
    "learning_rate": 0.00019332908434246867,
    "epoch": 8.260038240917781,
    "step": 4320
  },
  {
    "loss": 0.8521,
    "grad_norm": 1.570742130279541,
    "learning_rate": 0.00018908009347779904,
    "epoch": 8.29827915869981,
    "step": 4340
  },
  {
    "loss": 0.8858,
    "grad_norm": 1.7959846258163452,
    "learning_rate": 0.00018483110261312938,
    "epoch": 8.336520076481836,
    "step": 4360
  },
  {
    "loss": 0.8707,
    "grad_norm": 1.7537908554077148,
    "learning_rate": 0.00018058211174845973,
    "epoch": 8.374760994263863,
    "step": 4380
  },
  {
    "loss": 0.8375,
    "grad_norm": 1.635578989982605,
    "learning_rate": 0.0001763331208837901,
    "epoch": 8.413001912045889,
    "step": 4400
  },
  {
    "loss": 0.8631,
    "grad_norm": 1.5729222297668457,
    "learning_rate": 0.00017208413001912047,
    "epoch": 8.451242829827915,
    "step": 4420
  },
  {
    "loss": 0.8551,
    "grad_norm": 1.6586476564407349,
    "learning_rate": 0.00016783513915445082,
    "epoch": 8.489483747609942,
    "step": 4440
  },
  {
    "loss": 0.8637,
    "grad_norm": 1.6118619441986084,
    "learning_rate": 0.00016358614828978119,
    "epoch": 8.52772466539197,
    "step": 4460
  },
  {
    "loss": 0.8484,
    "grad_norm": 1.5538595914840698,
    "learning_rate": 0.00015933715742511153,
    "epoch": 8.565965583173996,
    "step": 4480
  },
  {
    "loss": 0.8433,
    "grad_norm": 1.5646642446517944,
    "learning_rate": 0.0001550881665604419,
    "epoch": 8.604206500956023,
    "step": 4500
  },
  {
    "loss": 0.8592,
    "grad_norm": 1.7190415859222412,
    "learning_rate": 0.00015083917569577227,
    "epoch": 8.64244741873805,
    "step": 4520
  },
  {
    "loss": 0.8236,
    "grad_norm": 1.4950307607650757,
    "learning_rate": 0.00014659018483110262,
    "epoch": 8.680688336520076,
    "step": 4540
  },
  {
    "loss": 0.8421,
    "grad_norm": 1.5117732286453247,
    "learning_rate": 0.00014234119396643296,
    "epoch": 8.718929254302104,
    "step": 4560
  },
  {
    "loss": 0.8287,
    "grad_norm": 1.5558750629425049,
    "learning_rate": 0.00013809220310176336,
    "epoch": 8.75717017208413,
    "step": 4580
  },
  {
    "loss": 0.8492,
    "grad_norm": 1.7955564260482788,
    "learning_rate": 0.0001338432122370937,
    "epoch": 8.795411089866157,
    "step": 4600
  },
  {
    "loss": 0.8419,
    "grad_norm": 1.6532599925994873,
    "learning_rate": 0.00012959422137242405,
    "epoch": 8.833652007648183,
    "step": 4620
  },
  {
    "loss": 0.8125,
    "grad_norm": 1.7040739059448242,
    "learning_rate": 0.0001253452305077544,
    "epoch": 8.87189292543021,
    "step": 4640
  },
  {
    "loss": 0.8187,
    "grad_norm": 1.7040703296661377,
    "learning_rate": 0.00012109623964308478,
    "epoch": 8.910133843212238,
    "step": 4660
  },
  {
    "loss": 0.8155,
    "grad_norm": 1.7090845108032227,
    "learning_rate": 0.00011684724877841513,
    "epoch": 8.948374760994264,
    "step": 4680
  },
  {
    "loss": 0.7997,
    "grad_norm": 1.6070616245269775,
    "learning_rate": 0.00011259825791374549,
    "epoch": 8.98661567877629,
    "step": 4700
  },
  {
    "eval_loss": 0.3138497769832611,
    "eval_accuracy": 0.9352842246888665,
    "eval_runtime": 177.561,
    "eval_samples_per_second": 83.718,
    "eval_steps_per_second": 83.718,
    "epoch": 9.0,
    "step": 4707
  },
  {
    "loss": 0.7906,
    "grad_norm": 1.5590825080871582,
    "learning_rate": 0.00010834926704907585,
    "epoch": 9.024856596558317,
    "step": 4720
  },
  {
    "loss": 0.7494,
    "grad_norm": 1.4745252132415771,
    "learning_rate": 0.0001041002761844062,
    "epoch": 9.063097514340344,
    "step": 4740
  },
  {
    "loss": 0.7854,
    "grad_norm": 1.61099112033844,
    "learning_rate": 9.985128531973658e-05,
    "epoch": 9.101338432122372,
    "step": 4760
  },
  {
    "loss": 0.7636,
    "grad_norm": 1.5839650630950928,
    "learning_rate": 9.560229445506692e-05,
    "epoch": 9.139579349904398,
    "step": 4780
  },
  {
    "loss": 0.776,
    "grad_norm": 1.7259138822555542,
    "learning_rate": 9.135330359039729e-05,
    "epoch": 9.177820267686425,
    "step": 4800
  },
  {
    "loss": 0.7444,
    "grad_norm": 1.5495970249176025,
    "learning_rate": 8.710431272572764e-05,
    "epoch": 9.216061185468451,
    "step": 4820
  },
  {
    "loss": 0.7603,
    "grad_norm": 1.5250838994979858,
    "learning_rate": 8.2855321861058e-05,
    "epoch": 9.254302103250478,
    "step": 4840
  },
  {
    "loss": 0.7561,
    "grad_norm": 1.6244220733642578,
    "learning_rate": 7.860633099638836e-05,
    "epoch": 9.292543021032504,
    "step": 4860
  },
  {
    "loss": 0.7908,
    "grad_norm": 1.6825993061065674,
    "learning_rate": 7.435734013171871e-05,
    "epoch": 9.330783938814532,
    "step": 4880
  },
  {
    "loss": 0.7517,
    "grad_norm": 1.563707947731018,
    "learning_rate": 7.010834926704908e-05,
    "epoch": 9.369024856596559,
    "step": 4900
  },
  {
    "loss": 0.7679,
    "grad_norm": 1.7463629245758057,
    "learning_rate": 6.585935840237942e-05,
    "epoch": 9.407265774378585,
    "step": 4920
  },
  {
    "loss": 0.7426,
    "grad_norm": 1.5689053535461426,
    "learning_rate": 6.16103675377098e-05,
    "epoch": 9.445506692160611,
    "step": 4940
  },
  {
    "loss": 0.7695,
    "grad_norm": 1.6512914896011353,
    "learning_rate": 5.736137667304015e-05,
    "epoch": 9.483747609942638,
    "step": 4960
  },
  {
    "loss": 0.7603,
    "grad_norm": 1.6542084217071533,
    "learning_rate": 5.311238580837052e-05,
    "epoch": 9.521988527724666,
    "step": 4980
  },
  {
    "loss": 0.754,
    "grad_norm": 1.6929945945739746,
    "learning_rate": 4.8863394943700874e-05,
    "epoch": 9.560229445506693,
    "step": 5000
  },
  {
    "loss": 0.7597,
    "grad_norm": 1.4880517721176147,
    "learning_rate": 4.461440407903123e-05,
    "epoch": 9.598470363288719,
    "step": 5020
  },
  {
    "loss": 0.7624,
    "grad_norm": 1.578971266746521,
    "learning_rate": 4.036541321436159e-05,
    "epoch": 9.636711281070745,
    "step": 5040
  },
  {
    "loss": 0.7653,
    "grad_norm": 1.616727352142334,
    "learning_rate": 3.6116422349691954e-05,
    "epoch": 9.674952198852772,
    "step": 5060
  },
  {
    "loss": 0.7352,
    "grad_norm": 1.6762784719467163,
    "learning_rate": 3.186743148502231e-05,
    "epoch": 9.7131931166348,
    "step": 5080
  },
  {
    "loss": 0.732,
    "grad_norm": 1.5666388273239136,
    "learning_rate": 2.7618440620352666e-05,
    "epoch": 9.751434034416826,
    "step": 5100
  },
  {
    "loss": 0.7631,
    "grad_norm": 1.641012191772461,
    "learning_rate": 2.3369449755683023e-05,
    "epoch": 9.789674952198853,
    "step": 5120
  },
  {
    "loss": 0.7153,
    "grad_norm": 1.7024327516555786,
    "learning_rate": 1.9120458891013384e-05,
    "epoch": 9.82791586998088,
    "step": 5140
  },
  {
    "loss": 0.7247,
    "grad_norm": 1.4840829372406006,
    "learning_rate": 1.4871468026343743e-05,
    "epoch": 9.866156787762906,
    "step": 5160
  },
  {
    "loss": 0.7289,
    "grad_norm": 1.627562165260315,
    "learning_rate": 1.0622477161674103e-05,
    "epoch": 9.904397705544934,
    "step": 5180
  },
  {
    "loss": 0.7563,
    "grad_norm": 1.6473079919815063,
    "learning_rate": 6.373486297004461e-06,
    "epoch": 9.94263862332696,
    "step": 5200
  },
  {
    "loss": 0.7603,
    "grad_norm": 1.577776312828064,
    "learning_rate": 2.1244954323348204e-06,
    "epoch": 9.980879541108987,
    "step": 5220
  },
  {
    "eval_loss": 0.29460111260414124,
    "eval_accuracy": 0.9410023545240498,
    "eval_runtime": 187.2975,
    "eval_samples_per_second": 79.366,
    "eval_steps_per_second": 79.366,
    "epoch": 10.0,
    "step": 5230
  },
  {
    "train_runtime": 19361.582,
    "train_samples_per_second": 69.094,
    "train_steps_per_second": 0.27,
    "total_flos": 1.96318398191328e+18,
    "train_loss": 2.1099621225725396,
    "epoch": 10.0,
    "step": 5230
  }
]