{ "best_metric": 0.9405314497140935, "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/xvector/voxceleb1/finetune/ce-len3-bs256-lr1e-3/checkpoint-5230", "epoch": 10.0, "eval_steps": 500, "global_step": 5230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03824091778202677, "grad_norm": 4.702692031860352, "learning_rate": 3.824091778202677e-05, "loss": 7.1455, "step": 20 }, { "epoch": 0.07648183556405354, "grad_norm": 4.504467487335205, "learning_rate": 7.648183556405354e-05, "loss": 7.116, "step": 40 }, { "epoch": 0.1147227533460803, "grad_norm": 3.964728832244873, "learning_rate": 0.0001147227533460803, "loss": 7.0566, "step": 60 }, { "epoch": 0.15296367112810708, "grad_norm": 3.199570417404175, "learning_rate": 0.00015296367112810707, "loss": 6.972, "step": 80 }, { "epoch": 0.19120458891013384, "grad_norm": 2.6367344856262207, "learning_rate": 0.00019120458891013384, "loss": 6.8778, "step": 100 }, { "epoch": 0.2294455066921606, "grad_norm": 2.1930582523345947, "learning_rate": 0.0002294455066921606, "loss": 6.733, "step": 120 }, { "epoch": 0.2676864244741874, "grad_norm": 1.9982482194900513, "learning_rate": 0.0002676864244741874, "loss": 6.5814, "step": 140 }, { "epoch": 0.30592734225621415, "grad_norm": 1.8051823377609253, "learning_rate": 0.00030592734225621415, "loss": 6.4206, "step": 160 }, { "epoch": 0.3441682600382409, "grad_norm": 1.757859706878662, "learning_rate": 0.00034416826003824094, "loss": 6.2424, "step": 180 }, { "epoch": 0.3824091778202677, "grad_norm": 1.7747690677642822, "learning_rate": 0.0003824091778202677, "loss": 6.0657, "step": 200 }, { "epoch": 0.42065009560229444, "grad_norm": 1.7719988822937012, "learning_rate": 0.0004206500956022944, "loss": 5.9722, "step": 220 }, { "epoch": 0.4588910133843212, "grad_norm": 1.934911847114563, "learning_rate": 0.0004588910133843212, "loss": 5.814, "step": 240 }, { "epoch": 0.497131931166348, "grad_norm": 1.793244481086731, "learning_rate": 0.0004971319311663481, "loss": 5.6998, "step": 260 }, { "epoch": 0.5353728489483748, "grad_norm": 1.7790417671203613, "learning_rate": 0.0005353728489483748, "loss": 5.6115, "step": 280 }, { "epoch": 0.5736137667304015, "grad_norm": 1.8355106115341187, "learning_rate": 0.0005736137667304016, "loss": 5.5236, "step": 300 }, { "epoch": 0.6118546845124283, "grad_norm": 1.7114174365997314, "learning_rate": 0.0006118546845124283, "loss": 5.3894, "step": 320 }, { "epoch": 0.6500956022944551, "grad_norm": 1.877690315246582, "learning_rate": 0.000650095602294455, "loss": 5.3336, "step": 340 }, { "epoch": 0.6883365200764818, "grad_norm": 1.8771674633026123, "learning_rate": 0.0006883365200764819, "loss": 5.2577, "step": 360 }, { "epoch": 0.7265774378585086, "grad_norm": 1.9654275178909302, "learning_rate": 0.0007265774378585086, "loss": 5.1333, "step": 380 }, { "epoch": 0.7648183556405354, "grad_norm": 1.9327517747879028, "learning_rate": 0.0007648183556405354, "loss": 5.0882, "step": 400 }, { "epoch": 0.8030592734225621, "grad_norm": 1.8918468952178955, "learning_rate": 0.0008030592734225621, "loss": 5.0404, "step": 420 }, { "epoch": 0.8413001912045889, "grad_norm": 1.8536239862442017, "learning_rate": 0.0008413001912045888, "loss": 4.9524, "step": 440 }, { "epoch": 0.8795411089866156, "grad_norm": 1.8849778175354004, "learning_rate": 0.0008795411089866157, "loss": 4.8673, "step": 460 }, { "epoch": 0.9177820267686424, "grad_norm": 1.8270999193191528, "learning_rate": 0.0009177820267686424, "loss": 4.8456, "step": 480 }, { "epoch": 0.9560229445506692, "grad_norm": 1.8521556854248047, "learning_rate": 0.0009560229445506692, "loss": 4.7748, "step": 500 }, { "epoch": 0.994263862332696, "grad_norm": 1.9331077337265015, "learning_rate": 0.0009942638623326961, "loss": 4.6869, "step": 520 }, { "epoch": 1.0, "eval_accuracy": 0.19596367305751766, "eval_loss": 4.119868278503418, "eval_runtime": 539.7217, "eval_samples_per_second": 27.542, "eval_steps_per_second": 27.542, "step": 523 }, { "epoch": 1.0325047801147227, "grad_norm": 1.7894033193588257, "learning_rate": 0.0009963883577650309, "loss": 4.5805, "step": 540 }, { "epoch": 1.0707456978967496, "grad_norm": 1.9136412143707275, "learning_rate": 0.0009921393669003612, "loss": 4.5223, "step": 560 }, { "epoch": 1.1089866156787762, "grad_norm": 1.8342599868774414, "learning_rate": 0.0009878903760356915, "loss": 4.4213, "step": 580 }, { "epoch": 1.147227533460803, "grad_norm": 1.7360094785690308, "learning_rate": 0.0009836413851710218, "loss": 4.409, "step": 600 }, { "epoch": 1.1854684512428297, "grad_norm": 1.8302013874053955, "learning_rate": 0.0009793923943063523, "loss": 4.3382, "step": 620 }, { "epoch": 1.2237093690248566, "grad_norm": 1.847433090209961, "learning_rate": 0.0009751434034416827, "loss": 4.2381, "step": 640 }, { "epoch": 1.2619502868068833, "grad_norm": 1.866734504699707, "learning_rate": 0.000970894412577013, "loss": 4.1802, "step": 660 }, { "epoch": 1.3001912045889101, "grad_norm": 1.9123674631118774, "learning_rate": 0.0009666454217123433, "loss": 4.1404, "step": 680 }, { "epoch": 1.338432122370937, "grad_norm": 1.8355252742767334, "learning_rate": 0.0009623964308476737, "loss": 4.059, "step": 700 }, { "epoch": 1.3766730401529637, "grad_norm": 1.7890186309814453, "learning_rate": 0.000958147439983004, "loss": 4.0086, "step": 720 }, { "epoch": 1.4149139579349903, "grad_norm": 1.847299337387085, "learning_rate": 0.0009538984491183344, "loss": 3.9641, "step": 740 }, { "epoch": 1.4531548757170172, "grad_norm": 1.8219211101531982, "learning_rate": 0.0009496494582536647, "loss": 3.9291, "step": 760 }, { "epoch": 1.491395793499044, "grad_norm": 1.8026444911956787, "learning_rate": 0.0009454004673889951, "loss": 3.8577, "step": 780 }, { "epoch": 1.5296367112810707, "grad_norm": 1.7959771156311035, "learning_rate": 0.0009411514765243255, "loss": 3.8234, "step": 800 }, { "epoch": 1.5678776290630974, "grad_norm": 1.7638428211212158, "learning_rate": 0.0009369024856596558, "loss": 3.7132, "step": 820 }, { "epoch": 1.6061185468451242, "grad_norm": 1.8295230865478516, "learning_rate": 0.0009326534947949862, "loss": 3.7057, "step": 840 }, { "epoch": 1.644359464627151, "grad_norm": 1.8669532537460327, "learning_rate": 0.0009284045039303166, "loss": 3.6656, "step": 860 }, { "epoch": 1.682600382409178, "grad_norm": 1.8251906633377075, "learning_rate": 0.0009241555130656469, "loss": 3.591, "step": 880 }, { "epoch": 1.7208413001912046, "grad_norm": 1.825548529624939, "learning_rate": 0.0009199065222009773, "loss": 3.5281, "step": 900 }, { "epoch": 1.7590822179732313, "grad_norm": 1.7811344861984253, "learning_rate": 0.0009156575313363077, "loss": 3.4841, "step": 920 }, { "epoch": 1.7973231357552581, "grad_norm": 1.823803186416626, "learning_rate": 0.000911408540471638, "loss": 3.4707, "step": 940 }, { "epoch": 1.835564053537285, "grad_norm": 1.872054100036621, "learning_rate": 0.0009071595496069684, "loss": 3.4532, "step": 960 }, { "epoch": 1.8738049713193117, "grad_norm": 1.7651457786560059, "learning_rate": 0.0009029105587422988, "loss": 3.4224, "step": 980 }, { "epoch": 1.9120458891013383, "grad_norm": 1.7336605787277222, "learning_rate": 0.0008986615678776291, "loss": 3.3199, "step": 1000 }, { "epoch": 1.9502868068833652, "grad_norm": 1.8058604001998901, "learning_rate": 0.0008944125770129595, "loss": 3.2814, "step": 1020 }, { "epoch": 1.988527724665392, "grad_norm": 1.9400501251220703, "learning_rate": 0.0008901635861482899, "loss": 3.2423, "step": 1040 }, { "epoch": 2.0, "eval_accuracy": 0.5047426841574167, "eval_loss": 2.282437801361084, "eval_runtime": 710.9931, "eval_samples_per_second": 20.907, "eval_steps_per_second": 20.907, "step": 1046 }, { "epoch": 2.026768642447419, "grad_norm": 1.7496325969696045, "learning_rate": 0.0008859145952836202, "loss": 3.1858, "step": 1060 }, { "epoch": 2.0650095602294454, "grad_norm": 1.7203223705291748, "learning_rate": 0.0008816656044189504, "loss": 3.0482, "step": 1080 }, { "epoch": 2.1032504780114722, "grad_norm": 1.6859164237976074, "learning_rate": 0.000877416613554281, "loss": 3.0764, "step": 1100 }, { "epoch": 2.141491395793499, "grad_norm": 1.887332558631897, "learning_rate": 0.0008731676226896112, "loss": 2.9918, "step": 1120 }, { "epoch": 2.179732313575526, "grad_norm": 1.7712619304656982, "learning_rate": 0.0008689186318249415, "loss": 2.9791, "step": 1140 }, { "epoch": 2.2179732313575524, "grad_norm": 1.8518322706222534, "learning_rate": 0.000864669640960272, "loss": 2.9064, "step": 1160 }, { "epoch": 2.2562141491395793, "grad_norm": 1.8636976480484009, "learning_rate": 0.0008604206500956023, "loss": 2.9346, "step": 1180 }, { "epoch": 2.294455066921606, "grad_norm": 1.8007034063339233, "learning_rate": 0.0008561716592309326, "loss": 2.9154, "step": 1200 }, { "epoch": 2.332695984703633, "grad_norm": 1.8480207920074463, "learning_rate": 0.000851922668366263, "loss": 2.8311, "step": 1220 }, { "epoch": 2.3709369024856595, "grad_norm": 1.8463302850723267, "learning_rate": 0.0008476736775015934, "loss": 2.843, "step": 1240 }, { "epoch": 2.4091778202676863, "grad_norm": 1.8563566207885742, "learning_rate": 0.0008434246866369237, "loss": 2.8704, "step": 1260 }, { "epoch": 2.447418738049713, "grad_norm": 1.8388174772262573, "learning_rate": 0.0008391756957722541, "loss": 2.7794, "step": 1280 }, { "epoch": 2.48565965583174, "grad_norm": 1.787711262702942, "learning_rate": 0.0008349267049075845, "loss": 2.7776, "step": 1300 }, { "epoch": 2.5239005736137665, "grad_norm": 1.6573237180709839, "learning_rate": 0.0008306777140429148, "loss": 2.7288, "step": 1320 }, { "epoch": 2.5621414913957934, "grad_norm": 1.8304985761642456, "learning_rate": 0.0008264287231782451, "loss": 2.6598, "step": 1340 }, { "epoch": 2.6003824091778203, "grad_norm": 1.769439458847046, "learning_rate": 0.0008221797323135756, "loss": 2.6842, "step": 1360 }, { "epoch": 2.638623326959847, "grad_norm": 1.7404167652130127, "learning_rate": 0.0008179307414489059, "loss": 2.5974, "step": 1380 }, { "epoch": 2.676864244741874, "grad_norm": 1.7064534425735474, "learning_rate": 0.0008136817505842362, "loss": 2.6557, "step": 1400 }, { "epoch": 2.7151051625239004, "grad_norm": 1.784652590751648, "learning_rate": 0.0008094327597195667, "loss": 2.5701, "step": 1420 }, { "epoch": 2.7533460803059273, "grad_norm": 1.730402946472168, "learning_rate": 0.000805183768854897, "loss": 2.5879, "step": 1440 }, { "epoch": 2.791586998087954, "grad_norm": 1.803881049156189, "learning_rate": 0.0008009347779902273, "loss": 2.5413, "step": 1460 }, { "epoch": 2.8298279158699806, "grad_norm": 1.7114533185958862, "learning_rate": 0.0007966857871255578, "loss": 2.4823, "step": 1480 }, { "epoch": 2.8680688336520075, "grad_norm": 1.7487016916275024, "learning_rate": 0.000792436796260888, "loss": 2.4236, "step": 1500 }, { "epoch": 2.9063097514340344, "grad_norm": 1.7806780338287354, "learning_rate": 0.0007881878053962183, "loss": 2.4762, "step": 1520 }, { "epoch": 2.9445506692160612, "grad_norm": 1.851486086845398, "learning_rate": 0.0007839388145315488, "loss": 2.4221, "step": 1540 }, { "epoch": 2.982791586998088, "grad_norm": 1.779451608657837, "learning_rate": 0.0007796898236668791, "loss": 2.4164, "step": 1560 }, { "epoch": 3.0, "eval_accuracy": 0.6816010763538514, "eval_loss": 1.4862462282180786, "eval_runtime": 715.5989, "eval_samples_per_second": 20.773, "eval_steps_per_second": 20.773, "step": 1569 }, { "epoch": 3.0210325047801145, "grad_norm": 1.7617199420928955, "learning_rate": 0.0007754408328022094, "loss": 2.3348, "step": 1580 }, { "epoch": 3.0592734225621414, "grad_norm": 1.7850340604782104, "learning_rate": 0.0007711918419375399, "loss": 2.3196, "step": 1600 }, { "epoch": 3.0975143403441683, "grad_norm": 1.828715205192566, "learning_rate": 0.0007669428510728702, "loss": 2.2991, "step": 1620 }, { "epoch": 3.135755258126195, "grad_norm": 1.7825413942337036, "learning_rate": 0.0007626938602082005, "loss": 2.2354, "step": 1640 }, { "epoch": 3.173996175908222, "grad_norm": 1.8411946296691895, "learning_rate": 0.0007584448693435309, "loss": 2.221, "step": 1660 }, { "epoch": 3.2122370936902485, "grad_norm": 1.8236651420593262, "learning_rate": 0.0007541958784788613, "loss": 2.1939, "step": 1680 }, { "epoch": 3.2504780114722753, "grad_norm": 1.8275988101959229, "learning_rate": 0.0007499468876141916, "loss": 2.211, "step": 1700 }, { "epoch": 3.288718929254302, "grad_norm": 1.7743233442306519, "learning_rate": 0.000745697896749522, "loss": 2.1454, "step": 1720 }, { "epoch": 3.3269598470363286, "grad_norm": 1.7873393297195435, "learning_rate": 0.0007414489058848524, "loss": 2.1258, "step": 1740 }, { "epoch": 3.3652007648183555, "grad_norm": 1.8012022972106934, "learning_rate": 0.0007371999150201827, "loss": 2.1101, "step": 1760 }, { "epoch": 3.4034416826003824, "grad_norm": 1.8000600337982178, "learning_rate": 0.0007329509241555131, "loss": 2.1246, "step": 1780 }, { "epoch": 3.4416826003824093, "grad_norm": 1.7723950147628784, "learning_rate": 0.0007287019332908435, "loss": 2.1244, "step": 1800 }, { "epoch": 3.479923518164436, "grad_norm": 1.8095979690551758, "learning_rate": 0.0007244529424261738, "loss": 2.1099, "step": 1820 }, { "epoch": 3.5181644359464626, "grad_norm": 1.8022161722183228, "learning_rate": 0.0007202039515615042, "loss": 2.0703, "step": 1840 }, { "epoch": 3.5564053537284894, "grad_norm": 1.7775332927703857, "learning_rate": 0.0007159549606968346, "loss": 2.0937, "step": 1860 }, { "epoch": 3.5946462715105163, "grad_norm": 1.829291820526123, "learning_rate": 0.0007117059698321649, "loss": 2.0305, "step": 1880 }, { "epoch": 3.632887189292543, "grad_norm": 1.731218934059143, "learning_rate": 0.0007074569789674953, "loss": 2.0528, "step": 1900 }, { "epoch": 3.67112810707457, "grad_norm": 1.9170475006103516, "learning_rate": 0.0007032079881028257, "loss": 2.0311, "step": 1920 }, { "epoch": 3.7093690248565965, "grad_norm": 1.6934610605239868, "learning_rate": 0.0006989589972381559, "loss": 2.006, "step": 1940 }, { "epoch": 3.7476099426386233, "grad_norm": 1.792523741722107, "learning_rate": 0.0006947100063734863, "loss": 1.9627, "step": 1960 }, { "epoch": 3.78585086042065, "grad_norm": 1.7618036270141602, "learning_rate": 0.0006904610155088166, "loss": 2.0141, "step": 1980 }, { "epoch": 3.8240917782026767, "grad_norm": 1.7026081085205078, "learning_rate": 0.000686212024644147, "loss": 1.9489, "step": 2000 }, { "epoch": 3.8623326959847035, "grad_norm": 1.7117011547088623, "learning_rate": 0.0006819630337794774, "loss": 1.965, "step": 2020 }, { "epoch": 3.9005736137667304, "grad_norm": 1.7798806428909302, "learning_rate": 0.0006777140429148077, "loss": 1.9188, "step": 2040 }, { "epoch": 3.9388145315487573, "grad_norm": 1.7349345684051514, "learning_rate": 0.0006734650520501381, "loss": 1.9044, "step": 2060 }, { "epoch": 3.977055449330784, "grad_norm": 1.8268795013427734, "learning_rate": 0.0006692160611854685, "loss": 1.8625, "step": 2080 }, { "epoch": 4.0, "eval_accuracy": 0.7917255297679112, "eval_loss": 0.9794349670410156, "eval_runtime": 689.4289, "eval_samples_per_second": 21.561, "eval_steps_per_second": 21.561, "step": 2092 }, { "epoch": 4.015296367112811, "grad_norm": 1.6992357969284058, "learning_rate": 0.0006649670703207988, "loss": 1.8294, "step": 2100 }, { "epoch": 4.053537284894838, "grad_norm": 1.6715435981750488, "learning_rate": 0.0006607180794561292, "loss": 1.7572, "step": 2120 }, { "epoch": 4.091778202676864, "grad_norm": 1.7837462425231934, "learning_rate": 0.0006564690885914596, "loss": 1.7502, "step": 2140 }, { "epoch": 4.130019120458891, "grad_norm": 1.7979024648666382, "learning_rate": 0.0006522200977267899, "loss": 1.733, "step": 2160 }, { "epoch": 4.168260038240918, "grad_norm": 1.6781351566314697, "learning_rate": 0.0006479711068621203, "loss": 1.7437, "step": 2180 }, { "epoch": 4.2065009560229445, "grad_norm": 1.7719937562942505, "learning_rate": 0.0006437221159974506, "loss": 1.7702, "step": 2200 }, { "epoch": 4.244741873804971, "grad_norm": 1.7680734395980835, "learning_rate": 0.000639473125132781, "loss": 1.7546, "step": 2220 }, { "epoch": 4.282982791586998, "grad_norm": 1.7028470039367676, "learning_rate": 0.0006352241342681113, "loss": 1.6716, "step": 2240 }, { "epoch": 4.321223709369025, "grad_norm": 1.764496922492981, "learning_rate": 0.0006309751434034417, "loss": 1.7316, "step": 2260 }, { "epoch": 4.359464627151052, "grad_norm": 1.7339156866073608, "learning_rate": 0.0006267261525387721, "loss": 1.6865, "step": 2280 }, { "epoch": 4.397705544933078, "grad_norm": 1.6657025814056396, "learning_rate": 0.0006224771616741024, "loss": 1.6711, "step": 2300 }, { "epoch": 4.435946462715105, "grad_norm": 1.8127187490463257, "learning_rate": 0.0006182281708094328, "loss": 1.7128, "step": 2320 }, { "epoch": 4.474187380497132, "grad_norm": 1.720580816268921, "learning_rate": 0.0006139791799447631, "loss": 1.6676, "step": 2340 }, { "epoch": 4.512428298279159, "grad_norm": 1.7044484615325928, "learning_rate": 0.0006097301890800934, "loss": 1.6651, "step": 2360 }, { "epoch": 4.550669216061186, "grad_norm": 1.824859857559204, "learning_rate": 0.0006054811982154238, "loss": 1.6743, "step": 2380 }, { "epoch": 4.588910133843212, "grad_norm": 1.847652554512024, "learning_rate": 0.0006012322073507542, "loss": 1.6765, "step": 2400 }, { "epoch": 4.627151051625239, "grad_norm": 1.6747491359710693, "learning_rate": 0.0005969832164860845, "loss": 1.6257, "step": 2420 }, { "epoch": 4.665391969407266, "grad_norm": 1.7143758535385132, "learning_rate": 0.0005927342256214149, "loss": 1.6515, "step": 2440 }, { "epoch": 4.7036328871892925, "grad_norm": 1.7966911792755127, "learning_rate": 0.0005884852347567453, "loss": 1.6384, "step": 2460 }, { "epoch": 4.741873804971319, "grad_norm": 1.6942733526229858, "learning_rate": 0.0005842362438920756, "loss": 1.5939, "step": 2480 }, { "epoch": 4.780114722753346, "grad_norm": 1.7530827522277832, "learning_rate": 0.000579987253027406, "loss": 1.5908, "step": 2500 }, { "epoch": 4.818355640535373, "grad_norm": 1.845311164855957, "learning_rate": 0.0005757382621627364, "loss": 1.5586, "step": 2520 }, { "epoch": 4.8565965583174, "grad_norm": 1.7648016214370728, "learning_rate": 0.0005714892712980667, "loss": 1.5561, "step": 2540 }, { "epoch": 4.894837476099426, "grad_norm": 1.6392900943756104, "learning_rate": 0.000567240280433397, "loss": 1.5641, "step": 2560 }, { "epoch": 4.933078393881453, "grad_norm": 1.6668881177902222, "learning_rate": 0.0005629912895687275, "loss": 1.5443, "step": 2580 }, { "epoch": 4.97131931166348, "grad_norm": 1.8424748182296753, "learning_rate": 0.0005587422987040578, "loss": 1.5637, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.8490413723511604, "eval_loss": 0.7047534584999084, "eval_runtime": 7835.8179, "eval_samples_per_second": 1.897, "eval_steps_per_second": 1.897, "step": 2615 }, { "epoch": 5.009560229445507, "grad_norm": 1.8417068719863892, "learning_rate": 0.0005544933078393881, "loss": 1.4799, "step": 2620 }, { "epoch": 5.047801147227533, "grad_norm": 1.6466395854949951, "learning_rate": 0.0005502443169747186, "loss": 1.4184, "step": 2640 }, { "epoch": 5.08604206500956, "grad_norm": 1.7499247789382935, "learning_rate": 0.0005459953261100489, "loss": 1.4507, "step": 2660 }, { "epoch": 5.124282982791587, "grad_norm": 1.7968547344207764, "learning_rate": 0.0005417463352453792, "loss": 1.405, "step": 2680 }, { "epoch": 5.162523900573614, "grad_norm": 1.7950819730758667, "learning_rate": 0.0005374973443807097, "loss": 1.4362, "step": 2700 }, { "epoch": 5.2007648183556405, "grad_norm": 1.745133399963379, "learning_rate": 0.00053324835351604, "loss": 1.4238, "step": 2720 }, { "epoch": 5.239005736137667, "grad_norm": 1.7767413854599, "learning_rate": 0.0005289993626513702, "loss": 1.4025, "step": 2740 }, { "epoch": 5.277246653919694, "grad_norm": 1.7297043800354004, "learning_rate": 0.0005247503717867008, "loss": 1.4144, "step": 2760 }, { "epoch": 5.315487571701721, "grad_norm": 1.8174902200698853, "learning_rate": 0.000520501380922031, "loss": 1.4071, "step": 2780 }, { "epoch": 5.353728489483748, "grad_norm": 1.6889333724975586, "learning_rate": 0.0005162523900573613, "loss": 1.4022, "step": 2800 }, { "epoch": 5.3919694072657744, "grad_norm": 1.6331517696380615, "learning_rate": 0.0005120033991926918, "loss": 1.3479, "step": 2820 }, { "epoch": 5.430210325047801, "grad_norm": 1.8916860818862915, "learning_rate": 0.0005077544083280221, "loss": 1.362, "step": 2840 }, { "epoch": 5.468451242829828, "grad_norm": 1.706222653388977, "learning_rate": 0.0005035054174633524, "loss": 1.3613, "step": 2860 }, { "epoch": 5.506692160611855, "grad_norm": 1.6761025190353394, "learning_rate": 0.0004992564265986828, "loss": 1.3883, "step": 2880 }, { "epoch": 5.544933078393882, "grad_norm": 1.632095217704773, "learning_rate": 0.0004950074357340132, "loss": 1.3432, "step": 2900 }, { "epoch": 5.583173996175908, "grad_norm": 1.6419159173965454, "learning_rate": 0.0004907584448693436, "loss": 1.3417, "step": 2920 }, { "epoch": 5.621414913957935, "grad_norm": 1.8355722427368164, "learning_rate": 0.0004865094540046739, "loss": 1.341, "step": 2940 }, { "epoch": 5.659655831739962, "grad_norm": 1.6611793041229248, "learning_rate": 0.00048226046314000425, "loss": 1.339, "step": 2960 }, { "epoch": 5.6978967495219885, "grad_norm": 1.7696843147277832, "learning_rate": 0.0004780114722753346, "loss": 1.3183, "step": 2980 }, { "epoch": 5.736137667304015, "grad_norm": 1.6689785718917847, "learning_rate": 0.000473762481410665, "loss": 1.3365, "step": 3000 }, { "epoch": 5.774378585086042, "grad_norm": 1.6962292194366455, "learning_rate": 0.00046951349054599533, "loss": 1.2966, "step": 3020 }, { "epoch": 5.812619502868069, "grad_norm": 1.7446441650390625, "learning_rate": 0.0004652644996813257, "loss": 1.274, "step": 3040 }, { "epoch": 5.850860420650095, "grad_norm": 1.7083852291107178, "learning_rate": 0.0004610155088166561, "loss": 1.3145, "step": 3060 }, { "epoch": 5.8891013384321225, "grad_norm": 1.6674286127090454, "learning_rate": 0.0004567665179519864, "loss": 1.2944, "step": 3080 }, { "epoch": 5.927342256214149, "grad_norm": 1.5787798166275024, "learning_rate": 0.00045251752708731676, "loss": 1.2772, "step": 3100 }, { "epoch": 5.965583173996176, "grad_norm": 1.6089515686035156, "learning_rate": 0.0004482685362226471, "loss": 1.265, "step": 3120 }, { "epoch": 6.0, "eval_accuracy": 0.886242852337706, "eval_loss": 0.5389042496681213, "eval_runtime": 383.2829, "eval_samples_per_second": 38.783, "eval_steps_per_second": 38.783, "step": 3138 }, { "epoch": 6.003824091778203, "grad_norm": 1.5683554410934448, "learning_rate": 0.0004440195453579775, "loss": 1.2601, "step": 3140 }, { "epoch": 6.042065009560229, "grad_norm": 1.6791157722473145, "learning_rate": 0.00043977055449330785, "loss": 1.2067, "step": 3160 }, { "epoch": 6.080305927342256, "grad_norm": 1.5930650234222412, "learning_rate": 0.0004355215636286382, "loss": 1.1818, "step": 3180 }, { "epoch": 6.118546845124283, "grad_norm": 1.7136551141738892, "learning_rate": 0.0004312725727639686, "loss": 1.1871, "step": 3200 }, { "epoch": 6.15678776290631, "grad_norm": 1.6400994062423706, "learning_rate": 0.0004270235818992989, "loss": 1.1851, "step": 3220 }, { "epoch": 6.195028680688337, "grad_norm": 1.722548246383667, "learning_rate": 0.0004227745910346293, "loss": 1.1798, "step": 3240 }, { "epoch": 6.233269598470363, "grad_norm": 1.600697636604309, "learning_rate": 0.0004185256001699597, "loss": 1.1516, "step": 3260 }, { "epoch": 6.27151051625239, "grad_norm": 1.6722103357315063, "learning_rate": 0.00041427660930528997, "loss": 1.176, "step": 3280 }, { "epoch": 6.309751434034417, "grad_norm": 1.5297291278839111, "learning_rate": 0.00041002761844062037, "loss": 1.1625, "step": 3300 }, { "epoch": 6.347992351816444, "grad_norm": 1.6687546968460083, "learning_rate": 0.00040577862757595076, "loss": 1.1744, "step": 3320 }, { "epoch": 6.3862332695984705, "grad_norm": 1.6758590936660767, "learning_rate": 0.00040152963671128105, "loss": 1.1408, "step": 3340 }, { "epoch": 6.424474187380497, "grad_norm": 1.7506797313690186, "learning_rate": 0.00039728064584661145, "loss": 1.1458, "step": 3360 }, { "epoch": 6.462715105162524, "grad_norm": 1.7690140008926392, "learning_rate": 0.0003930316549819418, "loss": 1.1579, "step": 3380 }, { "epoch": 6.500956022944551, "grad_norm": 1.7732901573181152, "learning_rate": 0.00038878266411727214, "loss": 1.1444, "step": 3400 }, { "epoch": 6.539196940726577, "grad_norm": 1.7551547288894653, "learning_rate": 0.00038453367325260254, "loss": 1.1431, "step": 3420 }, { "epoch": 6.577437858508604, "grad_norm": 1.6275290250778198, "learning_rate": 0.0003802846823879329, "loss": 1.1293, "step": 3440 }, { "epoch": 6.615678776290631, "grad_norm": 1.769103765487671, "learning_rate": 0.0003760356915232632, "loss": 1.1418, "step": 3460 }, { "epoch": 6.653919694072657, "grad_norm": 1.7487330436706543, "learning_rate": 0.0003717867006585936, "loss": 1.1305, "step": 3480 }, { "epoch": 6.692160611854685, "grad_norm": 1.698512315750122, "learning_rate": 0.0003675377097939239, "loss": 1.0805, "step": 3500 }, { "epoch": 6.730401529636711, "grad_norm": 1.6636496782302856, "learning_rate": 0.0003632887189292543, "loss": 1.0907, "step": 3520 }, { "epoch": 6.768642447418738, "grad_norm": 1.577497959136963, "learning_rate": 0.00035903972806458466, "loss": 1.1399, "step": 3540 }, { "epoch": 6.806883365200765, "grad_norm": 1.7101361751556396, "learning_rate": 0.000354790737199915, "loss": 1.1206, "step": 3560 }, { "epoch": 6.845124282982791, "grad_norm": 1.6473299264907837, "learning_rate": 0.0003505417463352454, "loss": 1.103, "step": 3580 }, { "epoch": 6.8833652007648185, "grad_norm": 1.6744282245635986, "learning_rate": 0.00034629275547057574, "loss": 1.1088, "step": 3600 }, { "epoch": 6.921606118546845, "grad_norm": 1.67130708694458, "learning_rate": 0.0003420437646059061, "loss": 1.0857, "step": 3620 }, { "epoch": 6.959847036328872, "grad_norm": 1.6932523250579834, "learning_rate": 0.0003377947737412365, "loss": 1.0912, "step": 3640 }, { "epoch": 6.998087954110899, "grad_norm": 1.6580239534378052, "learning_rate": 0.00033354578287656683, "loss": 1.0888, "step": 3660 }, { "epoch": 7.0, "eval_accuracy": 0.9101244534140599, "eval_loss": 0.4364229142665863, "eval_runtime": 605.3795, "eval_samples_per_second": 24.555, "eval_steps_per_second": 24.555, "step": 3661 }, { "epoch": 7.036328871892925, "grad_norm": 1.5949829816818237, "learning_rate": 0.0003292967920118972, "loss": 1.0123, "step": 3680 }, { "epoch": 7.074569789674952, "grad_norm": 1.8134639263153076, "learning_rate": 0.0003250478011472275, "loss": 1.0552, "step": 3700 }, { "epoch": 7.112810707456979, "grad_norm": 1.6394524574279785, "learning_rate": 0.0003207988102825579, "loss": 1.0142, "step": 3720 }, { "epoch": 7.151051625239006, "grad_norm": 1.6918762922286987, "learning_rate": 0.00031654981941788826, "loss": 1.0096, "step": 3740 }, { "epoch": 7.189292543021033, "grad_norm": 1.673691987991333, "learning_rate": 0.0003123008285532186, "loss": 1.0203, "step": 3760 }, { "epoch": 7.227533460803059, "grad_norm": 1.5526095628738403, "learning_rate": 0.000308051837688549, "loss": 1.049, "step": 3780 }, { "epoch": 7.265774378585086, "grad_norm": 1.638197660446167, "learning_rate": 0.00030380284682387935, "loss": 1.0247, "step": 3800 }, { "epoch": 7.304015296367113, "grad_norm": 1.6690630912780762, "learning_rate": 0.0002995538559592097, "loss": 0.9841, "step": 3820 }, { "epoch": 7.342256214149139, "grad_norm": 1.645591139793396, "learning_rate": 0.0002953048650945401, "loss": 1.018, "step": 3840 }, { "epoch": 7.3804971319311665, "grad_norm": 1.676079273223877, "learning_rate": 0.0002910558742298704, "loss": 0.9818, "step": 3860 }, { "epoch": 7.418738049713193, "grad_norm": 1.6065680980682373, "learning_rate": 0.0002868068833652008, "loss": 0.9795, "step": 3880 }, { "epoch": 7.45697896749522, "grad_norm": 1.683929443359375, "learning_rate": 0.0002825578925005312, "loss": 0.9588, "step": 3900 }, { "epoch": 7.495219885277247, "grad_norm": 1.6200690269470215, "learning_rate": 0.00027830890163586146, "loss": 1.0081, "step": 3920 }, { "epoch": 7.533460803059273, "grad_norm": 1.7147966623306274, "learning_rate": 0.00027405991077119186, "loss": 0.9822, "step": 3940 }, { "epoch": 7.5717017208413, "grad_norm": 1.7224268913269043, "learning_rate": 0.00026981091990652226, "loss": 0.988, "step": 3960 }, { "epoch": 7.609942638623327, "grad_norm": 1.7145981788635254, "learning_rate": 0.00026556192904185255, "loss": 0.9562, "step": 3980 }, { "epoch": 7.648183556405353, "grad_norm": 1.8020603656768799, "learning_rate": 0.00026131293817718295, "loss": 0.9644, "step": 4000 }, { "epoch": 7.686424474187381, "grad_norm": 1.7413355112075806, "learning_rate": 0.00025706394731251324, "loss": 0.9648, "step": 4020 }, { "epoch": 7.724665391969407, "grad_norm": 1.6813682317733765, "learning_rate": 0.00025281495644784364, "loss": 0.9244, "step": 4040 }, { "epoch": 7.762906309751434, "grad_norm": 1.747910737991333, "learning_rate": 0.00024856596558317403, "loss": 0.9217, "step": 4060 }, { "epoch": 7.801147227533461, "grad_norm": 1.6242161989212036, "learning_rate": 0.0002443169747185044, "loss": 0.9628, "step": 4080 }, { "epoch": 7.839388145315487, "grad_norm": 1.5416340827941895, "learning_rate": 0.00024006798385383472, "loss": 0.979, "step": 4100 }, { "epoch": 7.8776290630975145, "grad_norm": 1.7438323497772217, "learning_rate": 0.00023581899298916507, "loss": 0.9528, "step": 4120 }, { "epoch": 7.915869980879541, "grad_norm": 1.6303768157958984, "learning_rate": 0.00023157000212449544, "loss": 0.91, "step": 4140 }, { "epoch": 7.954110898661568, "grad_norm": 1.586729884147644, "learning_rate": 0.00022732101125982578, "loss": 0.9346, "step": 4160 }, { "epoch": 7.992351816443595, "grad_norm": 1.771173357963562, "learning_rate": 0.00022307202039515615, "loss": 0.9296, "step": 4180 }, { "epoch": 8.0, "eval_accuracy": 0.9264715775311133, "eval_loss": 0.36169418692588806, "eval_runtime": 57.1398, "eval_samples_per_second": 260.152, "eval_steps_per_second": 260.152, "step": 4184 }, { "epoch": 8.030592734225621, "grad_norm": 1.5647226572036743, "learning_rate": 0.00021882302953048652, "loss": 0.9005, "step": 4200 }, { "epoch": 8.068833652007648, "grad_norm": 1.6461886167526245, "learning_rate": 0.00021457403866581687, "loss": 0.8696, "step": 4220 }, { "epoch": 8.107074569789676, "grad_norm": 1.5358296632766724, "learning_rate": 0.0002103250478011472, "loss": 0.8782, "step": 4240 }, { "epoch": 8.145315487571702, "grad_norm": 1.6051462888717651, "learning_rate": 0.0002060760569364776, "loss": 0.8483, "step": 4260 }, { "epoch": 8.183556405353729, "grad_norm": 1.7184685468673706, "learning_rate": 0.00020182706607180795, "loss": 0.8817, "step": 4280 }, { "epoch": 8.221797323135755, "grad_norm": 1.6134257316589355, "learning_rate": 0.0001975780752071383, "loss": 0.8603, "step": 4300 }, { "epoch": 8.260038240917781, "grad_norm": 1.5783709287643433, "learning_rate": 0.00019332908434246867, "loss": 0.8654, "step": 4320 }, { "epoch": 8.29827915869981, "grad_norm": 1.4778318405151367, "learning_rate": 0.00018908009347779904, "loss": 0.8554, "step": 4340 }, { "epoch": 8.336520076481836, "grad_norm": 1.8124628067016602, "learning_rate": 0.00018483110261312938, "loss": 0.8974, "step": 4360 }, { "epoch": 8.374760994263863, "grad_norm": 1.7594116926193237, "learning_rate": 0.00018058211174845973, "loss": 0.8716, "step": 4380 }, { "epoch": 8.413001912045889, "grad_norm": 1.7039234638214111, "learning_rate": 0.0001763331208837901, "loss": 0.8525, "step": 4400 }, { "epoch": 8.451242829827915, "grad_norm": 1.6325209140777588, "learning_rate": 0.00017208413001912047, "loss": 0.8663, "step": 4420 }, { "epoch": 8.489483747609942, "grad_norm": 1.6818372011184692, "learning_rate": 0.00016783513915445082, "loss": 0.8629, "step": 4440 }, { "epoch": 8.52772466539197, "grad_norm": 1.5809085369110107, "learning_rate": 0.00016358614828978119, "loss": 0.8718, "step": 4460 }, { "epoch": 8.565965583173996, "grad_norm": 1.5711621046066284, "learning_rate": 0.00015933715742511153, "loss": 0.8527, "step": 4480 }, { "epoch": 8.604206500956023, "grad_norm": 1.5462543964385986, "learning_rate": 0.0001550881665604419, "loss": 0.8589, "step": 4500 }, { "epoch": 8.64244741873805, "grad_norm": 1.6341811418533325, "learning_rate": 0.00015083917569577227, "loss": 0.8649, "step": 4520 }, { "epoch": 8.680688336520076, "grad_norm": 1.5172038078308105, "learning_rate": 0.00014659018483110262, "loss": 0.8192, "step": 4540 }, { "epoch": 8.718929254302104, "grad_norm": 1.4799879789352417, "learning_rate": 0.00014234119396643296, "loss": 0.8436, "step": 4560 }, { "epoch": 8.75717017208413, "grad_norm": 1.547850251197815, "learning_rate": 0.00013809220310176336, "loss": 0.8292, "step": 4580 }, { "epoch": 8.795411089866157, "grad_norm": 1.8095018863677979, "learning_rate": 0.0001338432122370937, "loss": 0.8527, "step": 4600 }, { "epoch": 8.833652007648183, "grad_norm": 1.5578687191009521, "learning_rate": 0.00012959422137242405, "loss": 0.8358, "step": 4620 }, { "epoch": 8.87189292543021, "grad_norm": 1.7008335590362549, "learning_rate": 0.0001253452305077544, "loss": 0.822, "step": 4640 }, { "epoch": 8.910133843212238, "grad_norm": 1.6717548370361328, "learning_rate": 0.00012109623964308478, "loss": 0.8352, "step": 4660 }, { "epoch": 8.948374760994264, "grad_norm": 1.729179859161377, "learning_rate": 0.00011684724877841513, "loss": 0.826, "step": 4680 }, { "epoch": 8.98661567877629, "grad_norm": 1.6358789205551147, "learning_rate": 0.00011259825791374549, "loss": 0.8066, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.9352842246888665, "eval_loss": 0.3206591010093689, "eval_runtime": 381.3245, "eval_samples_per_second": 38.983, "eval_steps_per_second": 38.983, "step": 4707 }, { "epoch": 9.024856596558317, "grad_norm": 1.5783698558807373, "learning_rate": 0.00010834926704907585, "loss": 0.8019, "step": 4720 }, { "epoch": 9.063097514340344, "grad_norm": 1.5668120384216309, "learning_rate": 0.0001041002761844062, "loss": 0.7702, "step": 4740 }, { "epoch": 9.101338432122372, "grad_norm": 1.624084234237671, "learning_rate": 9.985128531973658e-05, "loss": 0.7934, "step": 4760 }, { "epoch": 9.139579349904398, "grad_norm": 1.5410951375961304, "learning_rate": 9.560229445506692e-05, "loss": 0.7863, "step": 4780 }, { "epoch": 9.177820267686425, "grad_norm": 1.663845419883728, "learning_rate": 9.135330359039729e-05, "loss": 0.7894, "step": 4800 }, { "epoch": 9.216061185468451, "grad_norm": 1.5939579010009766, "learning_rate": 8.710431272572764e-05, "loss": 0.739, "step": 4820 }, { "epoch": 9.254302103250478, "grad_norm": 1.5545909404754639, "learning_rate": 8.2855321861058e-05, "loss": 0.7808, "step": 4840 }, { "epoch": 9.292543021032504, "grad_norm": 1.665999412536621, "learning_rate": 7.860633099638836e-05, "loss": 0.7761, "step": 4860 }, { "epoch": 9.330783938814532, "grad_norm": 1.6480567455291748, "learning_rate": 7.435734013171871e-05, "loss": 0.7876, "step": 4880 }, { "epoch": 9.369024856596559, "grad_norm": 1.5779589414596558, "learning_rate": 7.010834926704908e-05, "loss": 0.767, "step": 4900 }, { "epoch": 9.407265774378585, "grad_norm": 1.6985348463058472, "learning_rate": 6.585935840237942e-05, "loss": 0.783, "step": 4920 }, { "epoch": 9.445506692160611, "grad_norm": 1.5563093423843384, "learning_rate": 6.16103675377098e-05, "loss": 0.756, "step": 4940 }, { "epoch": 9.483747609942638, "grad_norm": 1.6173079013824463, "learning_rate": 5.736137667304015e-05, "loss": 0.7682, "step": 4960 }, { "epoch": 9.521988527724666, "grad_norm": 1.5880271196365356, "learning_rate": 5.311238580837052e-05, "loss": 0.7632, "step": 4980 }, { "epoch": 9.560229445506693, "grad_norm": 1.6329987049102783, "learning_rate": 4.8863394943700874e-05, "loss": 0.7602, "step": 5000 }, { "epoch": 9.598470363288719, "grad_norm": 1.442744255065918, "learning_rate": 4.461440407903123e-05, "loss": 0.7595, "step": 5020 }, { "epoch": 9.636711281070745, "grad_norm": 1.5359572172164917, "learning_rate": 4.036541321436159e-05, "loss": 0.7621, "step": 5040 }, { "epoch": 9.674952198852772, "grad_norm": 1.6465296745300293, "learning_rate": 3.6116422349691954e-05, "loss": 0.7717, "step": 5060 }, { "epoch": 9.7131931166348, "grad_norm": 1.6590745449066162, "learning_rate": 3.186743148502231e-05, "loss": 0.73, "step": 5080 }, { "epoch": 9.751434034416826, "grad_norm": 1.5176348686218262, "learning_rate": 2.7618440620352666e-05, "loss": 0.7486, "step": 5100 }, { "epoch": 9.789674952198853, "grad_norm": 1.7158029079437256, "learning_rate": 2.3369449755683023e-05, "loss": 0.7766, "step": 5120 }, { "epoch": 9.82791586998088, "grad_norm": 1.6565515995025635, "learning_rate": 1.9120458891013384e-05, "loss": 0.7159, "step": 5140 }, { "epoch": 9.866156787762906, "grad_norm": 1.4815343618392944, "learning_rate": 1.4871468026343743e-05, "loss": 0.7304, "step": 5160 }, { "epoch": 9.904397705544934, "grad_norm": 1.6041672229766846, "learning_rate": 1.0622477161674103e-05, "loss": 0.744, "step": 5180 }, { "epoch": 9.94263862332696, "grad_norm": 1.6224092245101929, "learning_rate": 6.373486297004461e-06, "loss": 0.7562, "step": 5200 }, { "epoch": 9.980879541108987, "grad_norm": 1.5392311811447144, "learning_rate": 2.1244954323348204e-06, "loss": 0.7675, "step": 5220 }, { "epoch": 10.0, "eval_accuracy": 0.9405314497140935, "eval_loss": 0.29811325669288635, "eval_runtime": 464.2204, "eval_samples_per_second": 32.021, "eval_steps_per_second": 32.021, "step": 5230 }, { "epoch": 10.0, "step": 5230, "total_flos": 1.96318398191328e+18, "train_loss": 2.122584131525306, "train_runtime": 50832.5393, "train_samples_per_second": 26.317, "train_steps_per_second": 0.103 } ], "logging_steps": 20, "max_steps": 5230, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.96318398191328e+18, "train_batch_size": 256, "trial_name": null, "trial_params": null }