diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4245 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.783378908572108, + "eval_steps": 500, + "global_step": 60000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0029722981809535134, + "grad_norm": 0.6250418424606323, + "learning_rate": 1.485884101040119e-06, + "loss": 0.1876, + "step": 100 + }, + { + "epoch": 0.005944596361907027, + "grad_norm": 0.11807194352149963, + "learning_rate": 2.971768202080238e-06, + "loss": 0.1453, + "step": 200 + }, + { + "epoch": 0.00891689454286054, + "grad_norm": 0.10204608738422394, + "learning_rate": 4.457652303120357e-06, + "loss": 0.116, + "step": 300 + }, + { + "epoch": 0.011889192723814054, + "grad_norm": 0.08203478157520294, + "learning_rate": 5.943536404160476e-06, + "loss": 0.0996, + "step": 400 + }, + { + "epoch": 0.014861490904767566, + "grad_norm": 0.09882761538028717, + "learning_rate": 7.429420505200595e-06, + "loss": 0.0913, + "step": 500 + }, + { + "epoch": 0.01783378908572108, + "grad_norm": 0.09138158708810806, + "learning_rate": 8.915304606240713e-06, + "loss": 0.0905, + "step": 600 + }, + { + "epoch": 0.020806087266674593, + "grad_norm": 0.14378643035888672, + "learning_rate": 1.0401188707280832e-05, + "loss": 0.0861, + "step": 700 + }, + { + "epoch": 0.023778385447628107, + "grad_norm": 0.0, + "learning_rate": 1.1887072808320951e-05, + "loss": 0.0831, + "step": 800 + }, + { + "epoch": 0.026750683628581618, + "grad_norm": 0.10987524688243866, + "learning_rate": 1.337295690936107e-05, + "loss": 0.081, + "step": 900 + }, + { + "epoch": 0.029722981809535132, + "grad_norm": 0.13225562870502472, + "learning_rate": 1.485884101040119e-05, + "loss": 0.0827, + "step": 1000 + }, + { + "epoch": 0.03269527999048864, + "grad_norm": 0.16068494319915771, + "learning_rate": 1.6344725111441307e-05, + "loss": 0.0827, + "step": 1100 + }, + { + "epoch": 0.03566757817144216, + "grad_norm": 0.15021204948425293, + "learning_rate": 1.7830609212481426e-05, + "loss": 0.0811, + "step": 1200 + }, + { + "epoch": 0.03863987635239567, + "grad_norm": 0.14025737345218658, + "learning_rate": 1.9316493313521546e-05, + "loss": 0.0783, + "step": 1300 + }, + { + "epoch": 0.041612174533349186, + "grad_norm": 0.16900166869163513, + "learning_rate": 2.0802377414561665e-05, + "loss": 0.0802, + "step": 1400 + }, + { + "epoch": 0.0445844727143027, + "grad_norm": 0.179672509431839, + "learning_rate": 2.2288261515601784e-05, + "loss": 0.0776, + "step": 1500 + }, + { + "epoch": 0.047556770895256215, + "grad_norm": 0.10613241791725159, + "learning_rate": 2.3774145616641903e-05, + "loss": 0.0771, + "step": 1600 + }, + { + "epoch": 0.05052906907620972, + "grad_norm": 0.21280072629451752, + "learning_rate": 2.5260029717682022e-05, + "loss": 0.079, + "step": 1700 + }, + { + "epoch": 0.053501367257163236, + "grad_norm": 0.15712925791740417, + "learning_rate": 2.674591381872214e-05, + "loss": 0.0776, + "step": 1800 + }, + { + "epoch": 0.05647366543811675, + "grad_norm": 0.18619948625564575, + "learning_rate": 2.823179791976226e-05, + "loss": 0.0769, + "step": 1900 + }, + { + "epoch": 0.059445963619070265, + "grad_norm": 0.1450018435716629, + "learning_rate": 2.971768202080238e-05, + "loss": 0.0777, + "step": 2000 + }, + { + "epoch": 0.06241826180002378, + "grad_norm": 0.15822643041610718, + "learning_rate": 3.12035661218425e-05, + "loss": 0.0744, + "step": 2100 + }, + { + "epoch": 0.06539055998097729, + "grad_norm": 0.15202847123146057, + "learning_rate": 3.2689450222882614e-05, + "loss": 0.0801, + "step": 2200 + }, + { + "epoch": 0.0683628581619308, + "grad_norm": 0.09045654535293579, + "learning_rate": 3.417533432392274e-05, + "loss": 0.0766, + "step": 2300 + }, + { + "epoch": 0.07133515634288431, + "grad_norm": 0.1728498637676239, + "learning_rate": 3.566121842496285e-05, + "loss": 0.0776, + "step": 2400 + }, + { + "epoch": 0.07430745452383783, + "grad_norm": 0.16322079300880432, + "learning_rate": 3.7147102526002975e-05, + "loss": 0.0757, + "step": 2500 + }, + { + "epoch": 0.07727975270479134, + "grad_norm": 0.138300821185112, + "learning_rate": 3.863298662704309e-05, + "loss": 0.0729, + "step": 2600 + }, + { + "epoch": 0.08025205088574486, + "grad_norm": 0.13257281482219696, + "learning_rate": 4.0118870728083214e-05, + "loss": 0.0748, + "step": 2700 + }, + { + "epoch": 0.08322434906669837, + "grad_norm": 0.13874976336956024, + "learning_rate": 4.160475482912333e-05, + "loss": 0.0763, + "step": 2800 + }, + { + "epoch": 0.08619664724765189, + "grad_norm": 0.12492681294679642, + "learning_rate": 4.309063893016345e-05, + "loss": 0.0755, + "step": 2900 + }, + { + "epoch": 0.0891689454286054, + "grad_norm": 0.18948061764240265, + "learning_rate": 4.457652303120357e-05, + "loss": 0.076, + "step": 3000 + }, + { + "epoch": 0.09214124360955891, + "grad_norm": 0.21893437206745148, + "learning_rate": 4.6062407132243683e-05, + "loss": 0.0758, + "step": 3100 + }, + { + "epoch": 0.09511354179051243, + "grad_norm": 0.10585571080446243, + "learning_rate": 4.7548291233283806e-05, + "loss": 0.076, + "step": 3200 + }, + { + "epoch": 0.09808583997146594, + "grad_norm": 0.11917442083358765, + "learning_rate": 4.903417533432392e-05, + "loss": 0.0741, + "step": 3300 + }, + { + "epoch": 0.10105813815241944, + "grad_norm": 0.19210471212863922, + "learning_rate": 4.997262331242276e-05, + "loss": 0.0739, + "step": 3400 + }, + { + "epoch": 0.10403043633337296, + "grad_norm": 0.12663370370864868, + "learning_rate": 4.9894404205059215e-05, + "loss": 0.075, + "step": 3500 + }, + { + "epoch": 0.10700273451432647, + "grad_norm": 0.15841467678546906, + "learning_rate": 4.981618509769567e-05, + "loss": 0.0702, + "step": 3600 + }, + { + "epoch": 0.10997503269527999, + "grad_norm": 0.1830332726240158, + "learning_rate": 4.973796599033212e-05, + "loss": 0.0756, + "step": 3700 + }, + { + "epoch": 0.1129473308762335, + "grad_norm": 0.0, + "learning_rate": 4.965974688296857e-05, + "loss": 0.0722, + "step": 3800 + }, + { + "epoch": 0.11591962905718702, + "grad_norm": 0.15343832969665527, + "learning_rate": 4.9581527775605024e-05, + "loss": 0.0751, + "step": 3900 + }, + { + "epoch": 0.11889192723814053, + "grad_norm": 0.1250162124633789, + "learning_rate": 4.950330866824148e-05, + "loss": 0.0708, + "step": 4000 + }, + { + "epoch": 0.12186422541909404, + "grad_norm": 0.16833215951919556, + "learning_rate": 4.9425089560877935e-05, + "loss": 0.0758, + "step": 4100 + }, + { + "epoch": 0.12483652360004756, + "grad_norm": 0.1292218714952469, + "learning_rate": 4.934687045351439e-05, + "loss": 0.0749, + "step": 4200 + }, + { + "epoch": 0.12780882178100106, + "grad_norm": 0.17948633432388306, + "learning_rate": 4.926865134615084e-05, + "loss": 0.0709, + "step": 4300 + }, + { + "epoch": 0.13078111996195457, + "grad_norm": 0.10589201748371124, + "learning_rate": 4.919043223878729e-05, + "loss": 0.0741, + "step": 4400 + }, + { + "epoch": 0.1337534181429081, + "grad_norm": 0.10382993519306183, + "learning_rate": 4.9112213131423744e-05, + "loss": 0.0747, + "step": 4500 + }, + { + "epoch": 0.1367257163238616, + "grad_norm": 0.08471754938364029, + "learning_rate": 4.9033994024060197e-05, + "loss": 0.0728, + "step": 4600 + }, + { + "epoch": 0.13969801450481512, + "grad_norm": 0.1706983894109726, + "learning_rate": 4.895577491669665e-05, + "loss": 0.0708, + "step": 4700 + }, + { + "epoch": 0.14267031268576863, + "grad_norm": 0.08754973113536835, + "learning_rate": 4.887755580933311e-05, + "loss": 0.0711, + "step": 4800 + }, + { + "epoch": 0.14564261086672214, + "grad_norm": 0.1107291653752327, + "learning_rate": 4.879933670196956e-05, + "loss": 0.0726, + "step": 4900 + }, + { + "epoch": 0.14861490904767566, + "grad_norm": 0.14203386008739471, + "learning_rate": 4.872111759460601e-05, + "loss": 0.0729, + "step": 5000 + }, + { + "epoch": 0.15158720722862917, + "grad_norm": 0.1973818838596344, + "learning_rate": 4.8642898487242465e-05, + "loss": 0.0714, + "step": 5100 + }, + { + "epoch": 0.1545595054095827, + "grad_norm": 0.11510657519102097, + "learning_rate": 4.856467937987892e-05, + "loss": 0.073, + "step": 5200 + }, + { + "epoch": 0.1575318035905362, + "grad_norm": 0.15956437587738037, + "learning_rate": 4.848646027251537e-05, + "loss": 0.0709, + "step": 5300 + }, + { + "epoch": 0.16050410177148972, + "grad_norm": 0.12511183321475983, + "learning_rate": 4.840824116515182e-05, + "loss": 0.0715, + "step": 5400 + }, + { + "epoch": 0.16347639995244323, + "grad_norm": 0.13191531598567963, + "learning_rate": 4.8330022057788274e-05, + "loss": 0.0678, + "step": 5500 + }, + { + "epoch": 0.16644869813339674, + "grad_norm": 0.14790107309818268, + "learning_rate": 4.8251802950424726e-05, + "loss": 0.0709, + "step": 5600 + }, + { + "epoch": 0.16942099631435026, + "grad_norm": 0.096534363925457, + "learning_rate": 4.8173583843061185e-05, + "loss": 0.0702, + "step": 5700 + }, + { + "epoch": 0.17239329449530377, + "grad_norm": 0.17341692745685577, + "learning_rate": 4.809536473569764e-05, + "loss": 0.0704, + "step": 5800 + }, + { + "epoch": 0.1753655926762573, + "grad_norm": 0.1814459264278412, + "learning_rate": 4.801714562833409e-05, + "loss": 0.0714, + "step": 5900 + }, + { + "epoch": 0.1783378908572108, + "grad_norm": 0.12359002977609634, + "learning_rate": 4.793892652097054e-05, + "loss": 0.0701, + "step": 6000 + }, + { + "epoch": 0.18131018903816432, + "grad_norm": 0.11429854482412338, + "learning_rate": 4.7860707413606994e-05, + "loss": 0.0705, + "step": 6100 + }, + { + "epoch": 0.18428248721911783, + "grad_norm": 0.1720765233039856, + "learning_rate": 4.7782488306243453e-05, + "loss": 0.0696, + "step": 6200 + }, + { + "epoch": 0.18725478540007134, + "grad_norm": 0.23681002855300903, + "learning_rate": 4.7704269198879906e-05, + "loss": 0.0713, + "step": 6300 + }, + { + "epoch": 0.19022708358102486, + "grad_norm": 0.12058725953102112, + "learning_rate": 4.762605009151636e-05, + "loss": 0.0725, + "step": 6400 + }, + { + "epoch": 0.19319938176197837, + "grad_norm": 0.1621580868959427, + "learning_rate": 4.754783098415281e-05, + "loss": 0.0702, + "step": 6500 + }, + { + "epoch": 0.1961716799429319, + "grad_norm": 0.15290570259094238, + "learning_rate": 4.746961187678927e-05, + "loss": 0.071, + "step": 6600 + }, + { + "epoch": 0.1991439781238854, + "grad_norm": 0.1742640882730484, + "learning_rate": 4.739139276942572e-05, + "loss": 0.069, + "step": 6700 + }, + { + "epoch": 0.2021162763048389, + "grad_norm": 0.13613374531269073, + "learning_rate": 4.7313173662062174e-05, + "loss": 0.0665, + "step": 6800 + }, + { + "epoch": 0.2050885744857924, + "grad_norm": 0.16826081275939941, + "learning_rate": 4.7234954554698626e-05, + "loss": 0.0706, + "step": 6900 + }, + { + "epoch": 0.20806087266674592, + "grad_norm": 0.1396206170320511, + "learning_rate": 4.715673544733508e-05, + "loss": 0.069, + "step": 7000 + }, + { + "epoch": 0.21103317084769943, + "grad_norm": 0.13312986493110657, + "learning_rate": 4.707851633997153e-05, + "loss": 0.0646, + "step": 7100 + }, + { + "epoch": 0.21400546902865294, + "grad_norm": 0.16151180863380432, + "learning_rate": 4.700029723260798e-05, + "loss": 0.0698, + "step": 7200 + }, + { + "epoch": 0.21697776720960646, + "grad_norm": 0.13654786348342896, + "learning_rate": 4.692207812524444e-05, + "loss": 0.0709, + "step": 7300 + }, + { + "epoch": 0.21995006539055997, + "grad_norm": 0.1378316432237625, + "learning_rate": 4.6843859017880894e-05, + "loss": 0.0715, + "step": 7400 + }, + { + "epoch": 0.2229223635715135, + "grad_norm": 0.16807210445404053, + "learning_rate": 4.676563991051735e-05, + "loss": 0.0681, + "step": 7500 + }, + { + "epoch": 0.225894661752467, + "grad_norm": 0.16466864943504333, + "learning_rate": 4.66874208031538e-05, + "loss": 0.0701, + "step": 7600 + }, + { + "epoch": 0.22886695993342052, + "grad_norm": 0.18744218349456787, + "learning_rate": 4.660920169579025e-05, + "loss": 0.0699, + "step": 7700 + }, + { + "epoch": 0.23183925811437403, + "grad_norm": 0.10295972228050232, + "learning_rate": 4.6530982588426703e-05, + "loss": 0.0729, + "step": 7800 + }, + { + "epoch": 0.23481155629532754, + "grad_norm": 0.1175568625330925, + "learning_rate": 4.6452763481063156e-05, + "loss": 0.071, + "step": 7900 + }, + { + "epoch": 0.23778385447628106, + "grad_norm": 0.08961982280015945, + "learning_rate": 4.637454437369961e-05, + "loss": 0.072, + "step": 8000 + }, + { + "epoch": 0.24075615265723457, + "grad_norm": 0.14698870480060577, + "learning_rate": 4.629632526633606e-05, + "loss": 0.0717, + "step": 8100 + }, + { + "epoch": 0.2437284508381881, + "grad_norm": 0.1095692440867424, + "learning_rate": 4.621810615897252e-05, + "loss": 0.0687, + "step": 8200 + }, + { + "epoch": 0.2467007490191416, + "grad_norm": 0.10859747976064682, + "learning_rate": 4.613988705160897e-05, + "loss": 0.0694, + "step": 8300 + }, + { + "epoch": 0.24967304720009512, + "grad_norm": 0.14616329967975616, + "learning_rate": 4.6061667944245424e-05, + "loss": 0.0681, + "step": 8400 + }, + { + "epoch": 0.2526453453810486, + "grad_norm": 0.19373726844787598, + "learning_rate": 4.5983448836881876e-05, + "loss": 0.0674, + "step": 8500 + }, + { + "epoch": 0.2556176435620021, + "grad_norm": 0.2358144372701645, + "learning_rate": 4.590522972951833e-05, + "loss": 0.0672, + "step": 8600 + }, + { + "epoch": 0.25858994174295563, + "grad_norm": 0.15689809620380402, + "learning_rate": 4.582701062215478e-05, + "loss": 0.0701, + "step": 8700 + }, + { + "epoch": 0.26156223992390915, + "grad_norm": 0.0, + "learning_rate": 4.574879151479123e-05, + "loss": 0.0684, + "step": 8800 + }, + { + "epoch": 0.26453453810486266, + "grad_norm": 0.1861608475446701, + "learning_rate": 4.5670572407427685e-05, + "loss": 0.0691, + "step": 8900 + }, + { + "epoch": 0.2675068362858162, + "grad_norm": 0.12946495413780212, + "learning_rate": 4.5592353300064144e-05, + "loss": 0.0702, + "step": 9000 + }, + { + "epoch": 0.2704791344667697, + "grad_norm": 0.2267402708530426, + "learning_rate": 4.55141341927006e-05, + "loss": 0.0693, + "step": 9100 + }, + { + "epoch": 0.2734514326477232, + "grad_norm": 0.16589206457138062, + "learning_rate": 4.543591508533705e-05, + "loss": 0.0691, + "step": 9200 + }, + { + "epoch": 0.2764237308286767, + "grad_norm": 0.09351163357496262, + "learning_rate": 4.53576959779735e-05, + "loss": 0.0717, + "step": 9300 + }, + { + "epoch": 0.27939602900963023, + "grad_norm": 0.1546529084444046, + "learning_rate": 4.5279476870609954e-05, + "loss": 0.0676, + "step": 9400 + }, + { + "epoch": 0.28236832719058375, + "grad_norm": 0.19429802894592285, + "learning_rate": 4.5201257763246406e-05, + "loss": 0.0683, + "step": 9500 + }, + { + "epoch": 0.28534062537153726, + "grad_norm": 0.17836213111877441, + "learning_rate": 4.512303865588286e-05, + "loss": 0.0692, + "step": 9600 + }, + { + "epoch": 0.2883129235524908, + "grad_norm": 0.19443030655384064, + "learning_rate": 4.504481954851931e-05, + "loss": 0.0679, + "step": 9700 + }, + { + "epoch": 0.2912852217334443, + "grad_norm": 0.1298290640115738, + "learning_rate": 4.496660044115576e-05, + "loss": 0.0696, + "step": 9800 + }, + { + "epoch": 0.2942575199143978, + "grad_norm": 0.0, + "learning_rate": 4.488838133379222e-05, + "loss": 0.0686, + "step": 9900 + }, + { + "epoch": 0.2972298180953513, + "grad_norm": 0.15551084280014038, + "learning_rate": 4.4810162226428674e-05, + "loss": 0.0659, + "step": 10000 + }, + { + "epoch": 0.30020211627630483, + "grad_norm": 0.24490253627300262, + "learning_rate": 4.4731943119065126e-05, + "loss": 0.0673, + "step": 10100 + }, + { + "epoch": 0.30317441445725835, + "grad_norm": 0.4358350336551666, + "learning_rate": 4.465372401170158e-05, + "loss": 0.0676, + "step": 10200 + }, + { + "epoch": 0.30614671263821186, + "grad_norm": 0.11326293647289276, + "learning_rate": 4.457550490433803e-05, + "loss": 0.0655, + "step": 10300 + }, + { + "epoch": 0.3091190108191654, + "grad_norm": 0.1475234180688858, + "learning_rate": 4.449728579697448e-05, + "loss": 0.0688, + "step": 10400 + }, + { + "epoch": 0.3120913090001189, + "grad_norm": 0.12599079310894012, + "learning_rate": 4.4419066689610935e-05, + "loss": 0.0686, + "step": 10500 + }, + { + "epoch": 0.3150636071810724, + "grad_norm": 0.14842267334461212, + "learning_rate": 4.4340847582247395e-05, + "loss": 0.0673, + "step": 10600 + }, + { + "epoch": 0.3180359053620259, + "grad_norm": 0.12599268555641174, + "learning_rate": 4.426262847488385e-05, + "loss": 0.0692, + "step": 10700 + }, + { + "epoch": 0.32100820354297943, + "grad_norm": 0.19562271237373352, + "learning_rate": 4.41844093675203e-05, + "loss": 0.0694, + "step": 10800 + }, + { + "epoch": 0.32398050172393295, + "grad_norm": 0.17160725593566895, + "learning_rate": 4.410619026015675e-05, + "loss": 0.0664, + "step": 10900 + }, + { + "epoch": 0.32695279990488646, + "grad_norm": 0.16906797885894775, + "learning_rate": 4.402797115279321e-05, + "loss": 0.0674, + "step": 11000 + }, + { + "epoch": 0.32992509808584, + "grad_norm": 0.1143309697508812, + "learning_rate": 4.394975204542966e-05, + "loss": 0.0684, + "step": 11100 + }, + { + "epoch": 0.3328973962667935, + "grad_norm": 0.08855737745761871, + "learning_rate": 4.3871532938066115e-05, + "loss": 0.0691, + "step": 11200 + }, + { + "epoch": 0.335869694447747, + "grad_norm": 0.22277314960956573, + "learning_rate": 4.379331383070257e-05, + "loss": 0.0668, + "step": 11300 + }, + { + "epoch": 0.3388419926287005, + "grad_norm": 0.19662527740001678, + "learning_rate": 4.371509472333902e-05, + "loss": 0.0678, + "step": 11400 + }, + { + "epoch": 0.34181429080965403, + "grad_norm": 0.09259670227766037, + "learning_rate": 4.363687561597547e-05, + "loss": 0.0666, + "step": 11500 + }, + { + "epoch": 0.34478658899060755, + "grad_norm": 0.12981513142585754, + "learning_rate": 4.355865650861193e-05, + "loss": 0.0681, + "step": 11600 + }, + { + "epoch": 0.34775888717156106, + "grad_norm": 0.12641040980815887, + "learning_rate": 4.348043740124838e-05, + "loss": 0.069, + "step": 11700 + }, + { + "epoch": 0.3507311853525146, + "grad_norm": 0.14636339247226715, + "learning_rate": 4.3402218293884835e-05, + "loss": 0.0683, + "step": 11800 + }, + { + "epoch": 0.3537034835334681, + "grad_norm": 0.22742003202438354, + "learning_rate": 4.332399918652129e-05, + "loss": 0.0673, + "step": 11900 + }, + { + "epoch": 0.3566757817144216, + "grad_norm": 0.14071860909461975, + "learning_rate": 4.324578007915774e-05, + "loss": 0.0672, + "step": 12000 + }, + { + "epoch": 0.3596480798953751, + "grad_norm": 0.08983983844518661, + "learning_rate": 4.316756097179419e-05, + "loss": 0.0685, + "step": 12100 + }, + { + "epoch": 0.36262037807632863, + "grad_norm": 0.13283419609069824, + "learning_rate": 4.3089341864430645e-05, + "loss": 0.0705, + "step": 12200 + }, + { + "epoch": 0.36559267625728215, + "grad_norm": 0.17909641563892365, + "learning_rate": 4.30111227570671e-05, + "loss": 0.0665, + "step": 12300 + }, + { + "epoch": 0.36856497443823566, + "grad_norm": 0.15871115028858185, + "learning_rate": 4.2932903649703556e-05, + "loss": 0.0665, + "step": 12400 + }, + { + "epoch": 0.3715372726191892, + "grad_norm": 0.15093806385993958, + "learning_rate": 4.285468454234001e-05, + "loss": 0.0683, + "step": 12500 + }, + { + "epoch": 0.3745095708001427, + "grad_norm": 0.11400102823972702, + "learning_rate": 4.277646543497646e-05, + "loss": 0.0689, + "step": 12600 + }, + { + "epoch": 0.3774818689810962, + "grad_norm": 0.15755300223827362, + "learning_rate": 4.269824632761291e-05, + "loss": 0.0685, + "step": 12700 + }, + { + "epoch": 0.3804541671620497, + "grad_norm": 0.14009451866149902, + "learning_rate": 4.2620027220249365e-05, + "loss": 0.0693, + "step": 12800 + }, + { + "epoch": 0.38342646534300323, + "grad_norm": 0.10680609941482544, + "learning_rate": 4.254180811288582e-05, + "loss": 0.0667, + "step": 12900 + }, + { + "epoch": 0.38639876352395675, + "grad_norm": 0.14737030863761902, + "learning_rate": 4.246358900552227e-05, + "loss": 0.0687, + "step": 13000 + }, + { + "epoch": 0.38937106170491026, + "grad_norm": 0.10508720576763153, + "learning_rate": 4.238536989815872e-05, + "loss": 0.0685, + "step": 13100 + }, + { + "epoch": 0.3923433598858638, + "grad_norm": 0.18317556381225586, + "learning_rate": 4.2307150790795174e-05, + "loss": 0.0672, + "step": 13200 + }, + { + "epoch": 0.3953156580668173, + "grad_norm": 0.25876110792160034, + "learning_rate": 4.222893168343163e-05, + "loss": 0.0678, + "step": 13300 + }, + { + "epoch": 0.3982879562477708, + "grad_norm": 0.17435985803604126, + "learning_rate": 4.2150712576068086e-05, + "loss": 0.0683, + "step": 13400 + }, + { + "epoch": 0.4012602544287243, + "grad_norm": 0.16925185918807983, + "learning_rate": 4.207249346870454e-05, + "loss": 0.0675, + "step": 13500 + }, + { + "epoch": 0.4042325526096778, + "grad_norm": 0.11990626901388168, + "learning_rate": 4.199427436134099e-05, + "loss": 0.0655, + "step": 13600 + }, + { + "epoch": 0.4072048507906313, + "grad_norm": 0.1442670226097107, + "learning_rate": 4.191605525397744e-05, + "loss": 0.0675, + "step": 13700 + }, + { + "epoch": 0.4101771489715848, + "grad_norm": 0.1616191864013672, + "learning_rate": 4.1837836146613895e-05, + "loss": 0.067, + "step": 13800 + }, + { + "epoch": 0.4131494471525383, + "grad_norm": 0.10042081773281097, + "learning_rate": 4.175961703925035e-05, + "loss": 0.0684, + "step": 13900 + }, + { + "epoch": 0.41612174533349183, + "grad_norm": 0.15398038923740387, + "learning_rate": 4.16813979318868e-05, + "loss": 0.0689, + "step": 14000 + }, + { + "epoch": 0.41909404351444535, + "grad_norm": 0.14517879486083984, + "learning_rate": 4.160317882452326e-05, + "loss": 0.0669, + "step": 14100 + }, + { + "epoch": 0.42206634169539886, + "grad_norm": 0.1727454513311386, + "learning_rate": 4.152495971715971e-05, + "loss": 0.0651, + "step": 14200 + }, + { + "epoch": 0.4250386398763524, + "grad_norm": 0.15185511112213135, + "learning_rate": 4.144674060979616e-05, + "loss": 0.0651, + "step": 14300 + }, + { + "epoch": 0.4280109380573059, + "grad_norm": 0.13296420872211456, + "learning_rate": 4.1368521502432615e-05, + "loss": 0.0656, + "step": 14400 + }, + { + "epoch": 0.4309832362382594, + "grad_norm": 0.11834818124771118, + "learning_rate": 4.129030239506907e-05, + "loss": 0.0665, + "step": 14500 + }, + { + "epoch": 0.4339555344192129, + "grad_norm": 0.13726277649402618, + "learning_rate": 4.121208328770552e-05, + "loss": 0.0665, + "step": 14600 + }, + { + "epoch": 0.43692783260016643, + "grad_norm": 0.12360060960054398, + "learning_rate": 4.113386418034197e-05, + "loss": 0.0667, + "step": 14700 + }, + { + "epoch": 0.43990013078111995, + "grad_norm": 0.13088300824165344, + "learning_rate": 4.1055645072978424e-05, + "loss": 0.0653, + "step": 14800 + }, + { + "epoch": 0.44287242896207346, + "grad_norm": 0.20376217365264893, + "learning_rate": 4.097742596561488e-05, + "loss": 0.065, + "step": 14900 + }, + { + "epoch": 0.445844727143027, + "grad_norm": 0.1642831265926361, + "learning_rate": 4.0899206858251336e-05, + "loss": 0.0682, + "step": 15000 + }, + { + "epoch": 0.4488170253239805, + "grad_norm": 0.1356818526983261, + "learning_rate": 4.082098775088779e-05, + "loss": 0.0689, + "step": 15100 + }, + { + "epoch": 0.451789323504934, + "grad_norm": 0.1623271256685257, + "learning_rate": 4.074276864352424e-05, + "loss": 0.0688, + "step": 15200 + }, + { + "epoch": 0.4547616216858875, + "grad_norm": 0.1605483591556549, + "learning_rate": 4.066454953616069e-05, + "loss": 0.0646, + "step": 15300 + }, + { + "epoch": 0.45773391986684103, + "grad_norm": 0.18635429441928864, + "learning_rate": 4.0586330428797145e-05, + "loss": 0.0671, + "step": 15400 + }, + { + "epoch": 0.46070621804779455, + "grad_norm": 0.11615981161594391, + "learning_rate": 4.0508111321433604e-05, + "loss": 0.065, + "step": 15500 + }, + { + "epoch": 0.46367851622874806, + "grad_norm": 0.11628147959709167, + "learning_rate": 4.0429892214070056e-05, + "loss": 0.0648, + "step": 15600 + }, + { + "epoch": 0.4666508144097016, + "grad_norm": 0.17421554028987885, + "learning_rate": 4.035167310670651e-05, + "loss": 0.0664, + "step": 15700 + }, + { + "epoch": 0.4696231125906551, + "grad_norm": 0.13611331582069397, + "learning_rate": 4.027345399934296e-05, + "loss": 0.065, + "step": 15800 + }, + { + "epoch": 0.4725954107716086, + "grad_norm": 0.1467682123184204, + "learning_rate": 4.019523489197942e-05, + "loss": 0.0656, + "step": 15900 + }, + { + "epoch": 0.4755677089525621, + "grad_norm": 0.0, + "learning_rate": 4.011701578461587e-05, + "loss": 0.0664, + "step": 16000 + }, + { + "epoch": 0.47854000713351563, + "grad_norm": 0.2576521039009094, + "learning_rate": 4.0038796677252324e-05, + "loss": 0.0671, + "step": 16100 + }, + { + "epoch": 0.48151230531446915, + "grad_norm": 0.09677518159151077, + "learning_rate": 3.9960577569888777e-05, + "loss": 0.0676, + "step": 16200 + }, + { + "epoch": 0.48448460349542266, + "grad_norm": 0.12476232647895813, + "learning_rate": 3.988235846252523e-05, + "loss": 0.0684, + "step": 16300 + }, + { + "epoch": 0.4874569016763762, + "grad_norm": 0.1580890417098999, + "learning_rate": 3.980413935516168e-05, + "loss": 0.0658, + "step": 16400 + }, + { + "epoch": 0.4904291998573297, + "grad_norm": 0.1716105192899704, + "learning_rate": 3.9725920247798133e-05, + "loss": 0.0671, + "step": 16500 + }, + { + "epoch": 0.4934014980382832, + "grad_norm": 0.13039934635162354, + "learning_rate": 3.964770114043459e-05, + "loss": 0.0684, + "step": 16600 + }, + { + "epoch": 0.4963737962192367, + "grad_norm": 0.11183754354715347, + "learning_rate": 3.9569482033071045e-05, + "loss": 0.0662, + "step": 16700 + }, + { + "epoch": 0.49934609440019023, + "grad_norm": 0.16217298805713654, + "learning_rate": 3.94912629257075e-05, + "loss": 0.0646, + "step": 16800 + }, + { + "epoch": 0.5023183925811437, + "grad_norm": 0.14661744236946106, + "learning_rate": 3.941304381834395e-05, + "loss": 0.0683, + "step": 16900 + }, + { + "epoch": 0.5052906907620972, + "grad_norm": 0.11920378357172012, + "learning_rate": 3.93348247109804e-05, + "loss": 0.0651, + "step": 17000 + }, + { + "epoch": 0.5082629889430508, + "grad_norm": 0.1136004626750946, + "learning_rate": 3.9256605603616854e-05, + "loss": 0.0698, + "step": 17100 + }, + { + "epoch": 0.5112352871240042, + "grad_norm": 0.2666053771972656, + "learning_rate": 3.9178386496253306e-05, + "loss": 0.065, + "step": 17200 + }, + { + "epoch": 0.5142075853049578, + "grad_norm": 0.20627038180828094, + "learning_rate": 3.910016738888976e-05, + "loss": 0.0661, + "step": 17300 + }, + { + "epoch": 0.5171798834859113, + "grad_norm": 0.1550694704055786, + "learning_rate": 3.902194828152621e-05, + "loss": 0.0643, + "step": 17400 + }, + { + "epoch": 0.5201521816668648, + "grad_norm": 0.1619909256696701, + "learning_rate": 3.894372917416267e-05, + "loss": 0.0665, + "step": 17500 + }, + { + "epoch": 0.5231244798478183, + "grad_norm": 0.0981653481721878, + "learning_rate": 3.886551006679912e-05, + "loss": 0.0643, + "step": 17600 + }, + { + "epoch": 0.5260967780287719, + "grad_norm": 0.1934923678636551, + "learning_rate": 3.8787290959435574e-05, + "loss": 0.0676, + "step": 17700 + }, + { + "epoch": 0.5290690762097253, + "grad_norm": 0.178536057472229, + "learning_rate": 3.870907185207203e-05, + "loss": 0.068, + "step": 17800 + }, + { + "epoch": 0.5320413743906789, + "grad_norm": 0.16652072966098785, + "learning_rate": 3.863085274470848e-05, + "loss": 0.067, + "step": 17900 + }, + { + "epoch": 0.5350136725716323, + "grad_norm": 0.2468726485967636, + "learning_rate": 3.855263363734493e-05, + "loss": 0.067, + "step": 18000 + }, + { + "epoch": 0.5379859707525859, + "grad_norm": 0.1681283414363861, + "learning_rate": 3.8474414529981384e-05, + "loss": 0.0678, + "step": 18100 + }, + { + "epoch": 0.5409582689335394, + "grad_norm": 0.1613057255744934, + "learning_rate": 3.8396195422617836e-05, + "loss": 0.0668, + "step": 18200 + }, + { + "epoch": 0.543930567114493, + "grad_norm": 0.1295543909072876, + "learning_rate": 3.8317976315254295e-05, + "loss": 0.0658, + "step": 18300 + }, + { + "epoch": 0.5469028652954464, + "grad_norm": 0.12280385196208954, + "learning_rate": 3.823975720789075e-05, + "loss": 0.065, + "step": 18400 + }, + { + "epoch": 0.5498751634764, + "grad_norm": 0.17326301336288452, + "learning_rate": 3.81615381005272e-05, + "loss": 0.0648, + "step": 18500 + }, + { + "epoch": 0.5528474616573534, + "grad_norm": 0.22219550609588623, + "learning_rate": 3.808331899316365e-05, + "loss": 0.0669, + "step": 18600 + }, + { + "epoch": 0.555819759838307, + "grad_norm": 0.14581084251403809, + "learning_rate": 3.8005099885800104e-05, + "loss": 0.0657, + "step": 18700 + }, + { + "epoch": 0.5587920580192605, + "grad_norm": 0.12378513067960739, + "learning_rate": 3.7926880778436556e-05, + "loss": 0.0665, + "step": 18800 + }, + { + "epoch": 0.561764356200214, + "grad_norm": 0.1954706907272339, + "learning_rate": 3.784866167107301e-05, + "loss": 0.0675, + "step": 18900 + }, + { + "epoch": 0.5647366543811675, + "grad_norm": 0.1474733054637909, + "learning_rate": 3.777044256370946e-05, + "loss": 0.0663, + "step": 19000 + }, + { + "epoch": 0.5677089525621211, + "grad_norm": 0.146515354514122, + "learning_rate": 3.769222345634591e-05, + "loss": 0.0663, + "step": 19100 + }, + { + "epoch": 0.5706812507430745, + "grad_norm": 0.16457676887512207, + "learning_rate": 3.761400434898237e-05, + "loss": 0.0683, + "step": 19200 + }, + { + "epoch": 0.5736535489240281, + "grad_norm": 0.13298442959785461, + "learning_rate": 3.7535785241618824e-05, + "loss": 0.0674, + "step": 19300 + }, + { + "epoch": 0.5766258471049815, + "grad_norm": 0.13941340148448944, + "learning_rate": 3.745756613425528e-05, + "loss": 0.0638, + "step": 19400 + }, + { + "epoch": 0.5795981452859351, + "grad_norm": 0.1381063610315323, + "learning_rate": 3.737934702689173e-05, + "loss": 0.0661, + "step": 19500 + }, + { + "epoch": 0.5825704434668886, + "grad_norm": 0.3499239385128021, + "learning_rate": 3.730112791952818e-05, + "loss": 0.0631, + "step": 19600 + }, + { + "epoch": 0.5855427416478421, + "grad_norm": 0.1044827550649643, + "learning_rate": 3.7222908812164634e-05, + "loss": 0.0643, + "step": 19700 + }, + { + "epoch": 0.5885150398287956, + "grad_norm": 0.2002628892660141, + "learning_rate": 3.7144689704801086e-05, + "loss": 0.0655, + "step": 19800 + }, + { + "epoch": 0.5914873380097492, + "grad_norm": 0.13389742374420166, + "learning_rate": 3.706647059743754e-05, + "loss": 0.0676, + "step": 19900 + }, + { + "epoch": 0.5944596361907026, + "grad_norm": 0.11794281750917435, + "learning_rate": 3.6988251490074e-05, + "loss": 0.0666, + "step": 20000 + }, + { + "epoch": 0.5974319343716562, + "grad_norm": 0.21176274120807648, + "learning_rate": 3.691003238271045e-05, + "loss": 0.0664, + "step": 20100 + }, + { + "epoch": 0.6004042325526097, + "grad_norm": 0.1921447515487671, + "learning_rate": 3.68318132753469e-05, + "loss": 0.0651, + "step": 20200 + }, + { + "epoch": 0.6033765307335632, + "grad_norm": 0.18152420222759247, + "learning_rate": 3.6753594167983354e-05, + "loss": 0.0643, + "step": 20300 + }, + { + "epoch": 0.6063488289145167, + "grad_norm": 0.14203208684921265, + "learning_rate": 3.667537506061981e-05, + "loss": 0.067, + "step": 20400 + }, + { + "epoch": 0.6093211270954703, + "grad_norm": 0.15306586027145386, + "learning_rate": 3.6597155953256265e-05, + "loss": 0.0669, + "step": 20500 + }, + { + "epoch": 0.6122934252764237, + "grad_norm": 0.18764156103134155, + "learning_rate": 3.651893684589272e-05, + "loss": 0.0623, + "step": 20600 + }, + { + "epoch": 0.6152657234573773, + "grad_norm": 0.16318605840206146, + "learning_rate": 3.644071773852917e-05, + "loss": 0.0652, + "step": 20700 + }, + { + "epoch": 0.6182380216383307, + "grad_norm": 0.3064686357975006, + "learning_rate": 3.636249863116562e-05, + "loss": 0.0646, + "step": 20800 + }, + { + "epoch": 0.6212103198192843, + "grad_norm": 0.09382881224155426, + "learning_rate": 3.628427952380208e-05, + "loss": 0.0633, + "step": 20900 + }, + { + "epoch": 0.6241826180002378, + "grad_norm": 0.15609365701675415, + "learning_rate": 3.6206060416438534e-05, + "loss": 0.0646, + "step": 21000 + }, + { + "epoch": 0.6271549161811913, + "grad_norm": 0.13193155825138092, + "learning_rate": 3.6127841309074986e-05, + "loss": 0.0644, + "step": 21100 + }, + { + "epoch": 0.6301272143621448, + "grad_norm": 0.10934310406446457, + "learning_rate": 3.604962220171144e-05, + "loss": 0.0671, + "step": 21200 + }, + { + "epoch": 0.6330995125430984, + "grad_norm": 0.16309602558612823, + "learning_rate": 3.597140309434789e-05, + "loss": 0.0643, + "step": 21300 + }, + { + "epoch": 0.6360718107240518, + "grad_norm": 0.15275511145591736, + "learning_rate": 3.589318398698434e-05, + "loss": 0.0647, + "step": 21400 + }, + { + "epoch": 0.6390441089050054, + "grad_norm": 0.14445960521697998, + "learning_rate": 3.5814964879620795e-05, + "loss": 0.0621, + "step": 21500 + }, + { + "epoch": 0.6420164070859589, + "grad_norm": 0.15277177095413208, + "learning_rate": 3.573674577225725e-05, + "loss": 0.064, + "step": 21600 + }, + { + "epoch": 0.6449887052669124, + "grad_norm": 0.161845400929451, + "learning_rate": 3.5658526664893706e-05, + "loss": 0.0655, + "step": 21700 + }, + { + "epoch": 0.6479610034478659, + "grad_norm": 0.10563397407531738, + "learning_rate": 3.558030755753016e-05, + "loss": 0.064, + "step": 21800 + }, + { + "epoch": 0.6509333016288193, + "grad_norm": 0.21491585671901703, + "learning_rate": 3.550208845016661e-05, + "loss": 0.0636, + "step": 21900 + }, + { + "epoch": 0.6539055998097729, + "grad_norm": 0.18121227622032166, + "learning_rate": 3.542386934280306e-05, + "loss": 0.0652, + "step": 22000 + }, + { + "epoch": 0.6568778979907264, + "grad_norm": 0.16236074268817902, + "learning_rate": 3.5345650235439515e-05, + "loss": 0.0635, + "step": 22100 + }, + { + "epoch": 0.65985019617168, + "grad_norm": 0.19213314354419708, + "learning_rate": 3.526743112807597e-05, + "loss": 0.0646, + "step": 22200 + }, + { + "epoch": 0.6628224943526334, + "grad_norm": 0.15315917134284973, + "learning_rate": 3.518921202071242e-05, + "loss": 0.0657, + "step": 22300 + }, + { + "epoch": 0.665794792533587, + "grad_norm": 0.1928938776254654, + "learning_rate": 3.511099291334887e-05, + "loss": 0.0638, + "step": 22400 + }, + { + "epoch": 0.6687670907145404, + "grad_norm": 0.13677607476711273, + "learning_rate": 3.503277380598533e-05, + "loss": 0.0641, + "step": 22500 + }, + { + "epoch": 0.671739388895494, + "grad_norm": 0.11495836824178696, + "learning_rate": 3.4954554698621784e-05, + "loss": 0.0654, + "step": 22600 + }, + { + "epoch": 0.6747116870764475, + "grad_norm": 0.16505193710327148, + "learning_rate": 3.4876335591258236e-05, + "loss": 0.0655, + "step": 22700 + }, + { + "epoch": 0.677683985257401, + "grad_norm": 0.1515558362007141, + "learning_rate": 3.479811648389469e-05, + "loss": 0.0649, + "step": 22800 + }, + { + "epoch": 0.6806562834383545, + "grad_norm": 0.07586062699556351, + "learning_rate": 3.471989737653114e-05, + "loss": 0.0645, + "step": 22900 + }, + { + "epoch": 0.6836285816193081, + "grad_norm": 0.12402459233999252, + "learning_rate": 3.464167826916759e-05, + "loss": 0.0615, + "step": 23000 + }, + { + "epoch": 0.6866008798002615, + "grad_norm": 0.1206587627530098, + "learning_rate": 3.4563459161804045e-05, + "loss": 0.0658, + "step": 23100 + }, + { + "epoch": 0.6895731779812151, + "grad_norm": 0.17542561888694763, + "learning_rate": 3.44852400544405e-05, + "loss": 0.0657, + "step": 23200 + }, + { + "epoch": 0.6925454761621685, + "grad_norm": 0.13489127159118652, + "learning_rate": 3.440702094707695e-05, + "loss": 0.0654, + "step": 23300 + }, + { + "epoch": 0.6955177743431221, + "grad_norm": 0.15735846757888794, + "learning_rate": 3.432880183971341e-05, + "loss": 0.0614, + "step": 23400 + }, + { + "epoch": 0.6984900725240756, + "grad_norm": 0.16880860924720764, + "learning_rate": 3.425058273234986e-05, + "loss": 0.0613, + "step": 23500 + }, + { + "epoch": 0.7014623707050291, + "grad_norm": 0.14936357736587524, + "learning_rate": 3.417236362498631e-05, + "loss": 0.065, + "step": 23600 + }, + { + "epoch": 0.7044346688859826, + "grad_norm": 0.18042545020580292, + "learning_rate": 3.4094144517622766e-05, + "loss": 0.0664, + "step": 23700 + }, + { + "epoch": 0.7074069670669362, + "grad_norm": 0.13980703055858612, + "learning_rate": 3.401592541025922e-05, + "loss": 0.0634, + "step": 23800 + }, + { + "epoch": 0.7103792652478896, + "grad_norm": 0.13482260704040527, + "learning_rate": 3.393770630289567e-05, + "loss": 0.0638, + "step": 23900 + }, + { + "epoch": 0.7133515634288432, + "grad_norm": 0.16407857835292816, + "learning_rate": 3.385948719553212e-05, + "loss": 0.0647, + "step": 24000 + }, + { + "epoch": 0.7163238616097967, + "grad_norm": 0.15565545856952667, + "learning_rate": 3.3781268088168575e-05, + "loss": 0.0636, + "step": 24100 + }, + { + "epoch": 0.7192961597907502, + "grad_norm": 0.1404072791337967, + "learning_rate": 3.3703048980805034e-05, + "loss": 0.0603, + "step": 24200 + }, + { + "epoch": 0.7222684579717037, + "grad_norm": 0.11534532904624939, + "learning_rate": 3.3624829873441486e-05, + "loss": 0.0661, + "step": 24300 + }, + { + "epoch": 0.7252407561526573, + "grad_norm": 0.17310760915279388, + "learning_rate": 3.354661076607794e-05, + "loss": 0.0628, + "step": 24400 + }, + { + "epoch": 0.7282130543336107, + "grad_norm": 0.26752859354019165, + "learning_rate": 3.346839165871439e-05, + "loss": 0.064, + "step": 24500 + }, + { + "epoch": 0.7311853525145643, + "grad_norm": 0.13114948570728302, + "learning_rate": 3.339017255135084e-05, + "loss": 0.0634, + "step": 24600 + }, + { + "epoch": 0.7341576506955177, + "grad_norm": 0.13670499622821808, + "learning_rate": 3.3311953443987295e-05, + "loss": 0.066, + "step": 24700 + }, + { + "epoch": 0.7371299488764713, + "grad_norm": 0.06958445906639099, + "learning_rate": 3.323373433662375e-05, + "loss": 0.0651, + "step": 24800 + }, + { + "epoch": 0.7401022470574248, + "grad_norm": 0.16883139312267303, + "learning_rate": 3.3155515229260207e-05, + "loss": 0.0616, + "step": 24900 + }, + { + "epoch": 0.7430745452383783, + "grad_norm": 0.20578400790691376, + "learning_rate": 3.307729612189666e-05, + "loss": 0.0628, + "step": 25000 + }, + { + "epoch": 0.7460468434193318, + "grad_norm": 0.261901319026947, + "learning_rate": 3.299907701453311e-05, + "loss": 0.0651, + "step": 25100 + }, + { + "epoch": 0.7490191416002854, + "grad_norm": 0.1408625692129135, + "learning_rate": 3.292085790716956e-05, + "loss": 0.0642, + "step": 25200 + }, + { + "epoch": 0.7519914397812388, + "grad_norm": 0.14399020373821259, + "learning_rate": 3.284263879980602e-05, + "loss": 0.0638, + "step": 25300 + }, + { + "epoch": 0.7549637379621924, + "grad_norm": 0.21750278770923615, + "learning_rate": 3.2764419692442475e-05, + "loss": 0.0662, + "step": 25400 + }, + { + "epoch": 0.7579360361431459, + "grad_norm": 0.12629608809947968, + "learning_rate": 3.268620058507893e-05, + "loss": 0.0637, + "step": 25500 + }, + { + "epoch": 0.7609083343240994, + "grad_norm": 0.20059539377689362, + "learning_rate": 3.260798147771538e-05, + "loss": 0.0648, + "step": 25600 + }, + { + "epoch": 0.7638806325050529, + "grad_norm": 0.1864122599363327, + "learning_rate": 3.252976237035183e-05, + "loss": 0.0662, + "step": 25700 + }, + { + "epoch": 0.7668529306860065, + "grad_norm": 0.16060927510261536, + "learning_rate": 3.2451543262988284e-05, + "loss": 0.0615, + "step": 25800 + }, + { + "epoch": 0.7698252288669599, + "grad_norm": 0.1293332874774933, + "learning_rate": 3.237332415562474e-05, + "loss": 0.0642, + "step": 25900 + }, + { + "epoch": 0.7727975270479135, + "grad_norm": 0.0, + "learning_rate": 3.2295105048261195e-05, + "loss": 0.0625, + "step": 26000 + }, + { + "epoch": 0.775769825228867, + "grad_norm": 0.1476771980524063, + "learning_rate": 3.221688594089765e-05, + "loss": 0.0621, + "step": 26100 + }, + { + "epoch": 0.7787421234098205, + "grad_norm": 0.1248912438750267, + "learning_rate": 3.21386668335341e-05, + "loss": 0.062, + "step": 26200 + }, + { + "epoch": 0.781714421590774, + "grad_norm": 0.16431346535682678, + "learning_rate": 3.206044772617055e-05, + "loss": 0.0643, + "step": 26300 + }, + { + "epoch": 0.7846867197717275, + "grad_norm": 0.1116422787308693, + "learning_rate": 3.1982228618807004e-05, + "loss": 0.064, + "step": 26400 + }, + { + "epoch": 0.787659017952681, + "grad_norm": 0.19972670078277588, + "learning_rate": 3.1904009511443457e-05, + "loss": 0.0641, + "step": 26500 + }, + { + "epoch": 0.7906313161336346, + "grad_norm": 0.15693478286266327, + "learning_rate": 3.182579040407991e-05, + "loss": 0.0629, + "step": 26600 + }, + { + "epoch": 0.793603614314588, + "grad_norm": 0.19390398263931274, + "learning_rate": 3.174757129671636e-05, + "loss": 0.0629, + "step": 26700 + }, + { + "epoch": 0.7965759124955416, + "grad_norm": 0.1264067441225052, + "learning_rate": 3.166935218935282e-05, + "loss": 0.0593, + "step": 26800 + }, + { + "epoch": 0.7995482106764951, + "grad_norm": 0.1398812085390091, + "learning_rate": 3.159113308198927e-05, + "loss": 0.0652, + "step": 26900 + }, + { + "epoch": 0.8025205088574486, + "grad_norm": 0.10826534777879715, + "learning_rate": 3.1512913974625725e-05, + "loss": 0.0629, + "step": 27000 + }, + { + "epoch": 0.8054928070384021, + "grad_norm": 0.16641683876514435, + "learning_rate": 3.143469486726218e-05, + "loss": 0.0647, + "step": 27100 + }, + { + "epoch": 0.8084651052193556, + "grad_norm": 0.21238163113594055, + "learning_rate": 3.135647575989863e-05, + "loss": 0.063, + "step": 27200 + }, + { + "epoch": 0.8114374034003091, + "grad_norm": 0.1603163182735443, + "learning_rate": 3.127825665253508e-05, + "loss": 0.0646, + "step": 27300 + }, + { + "epoch": 0.8144097015812626, + "grad_norm": 0.11647824943065643, + "learning_rate": 3.1200037545171534e-05, + "loss": 0.0644, + "step": 27400 + }, + { + "epoch": 0.8173819997622161, + "grad_norm": 0.1281505674123764, + "learning_rate": 3.1121818437807986e-05, + "loss": 0.0644, + "step": 27500 + }, + { + "epoch": 0.8203542979431696, + "grad_norm": 0.1536133885383606, + "learning_rate": 3.1043599330444445e-05, + "loss": 0.0643, + "step": 27600 + }, + { + "epoch": 0.8233265961241232, + "grad_norm": 0.14825439453125, + "learning_rate": 3.09653802230809e-05, + "loss": 0.0637, + "step": 27700 + }, + { + "epoch": 0.8262988943050766, + "grad_norm": 0.0, + "learning_rate": 3.088716111571735e-05, + "loss": 0.063, + "step": 27800 + }, + { + "epoch": 0.8292711924860302, + "grad_norm": 0.18695931136608124, + "learning_rate": 3.08089420083538e-05, + "loss": 0.0634, + "step": 27900 + }, + { + "epoch": 0.8322434906669837, + "grad_norm": 0.16763179004192352, + "learning_rate": 3.0730722900990254e-05, + "loss": 0.0631, + "step": 28000 + }, + { + "epoch": 0.8352157888479372, + "grad_norm": 0.33131468296051025, + "learning_rate": 3.065250379362671e-05, + "loss": 0.0621, + "step": 28100 + }, + { + "epoch": 0.8381880870288907, + "grad_norm": 0.1829434037208557, + "learning_rate": 3.057428468626316e-05, + "loss": 0.0623, + "step": 28200 + }, + { + "epoch": 0.8411603852098443, + "grad_norm": 0.1627769023180008, + "learning_rate": 3.0496065578899615e-05, + "loss": 0.0627, + "step": 28300 + }, + { + "epoch": 0.8441326833907977, + "grad_norm": 0.13899990916252136, + "learning_rate": 3.0417846471536067e-05, + "loss": 0.065, + "step": 28400 + }, + { + "epoch": 0.8471049815717513, + "grad_norm": 0.13856682181358337, + "learning_rate": 3.0339627364172523e-05, + "loss": 0.0627, + "step": 28500 + }, + { + "epoch": 0.8500772797527048, + "grad_norm": 0.12660333514213562, + "learning_rate": 3.0261408256808975e-05, + "loss": 0.0644, + "step": 28600 + }, + { + "epoch": 0.8530495779336583, + "grad_norm": 0.1996048241853714, + "learning_rate": 3.018318914944543e-05, + "loss": 0.0628, + "step": 28700 + }, + { + "epoch": 0.8560218761146118, + "grad_norm": 0.13596709072589874, + "learning_rate": 3.0104970042081883e-05, + "loss": 0.0629, + "step": 28800 + }, + { + "epoch": 0.8589941742955653, + "grad_norm": 0.19533370435237885, + "learning_rate": 3.0026750934718335e-05, + "loss": 0.0603, + "step": 28900 + }, + { + "epoch": 0.8619664724765188, + "grad_norm": 0.17897726595401764, + "learning_rate": 2.9948531827354787e-05, + "loss": 0.0629, + "step": 29000 + }, + { + "epoch": 0.8649387706574724, + "grad_norm": 0.1311454325914383, + "learning_rate": 2.987031271999124e-05, + "loss": 0.0646, + "step": 29100 + }, + { + "epoch": 0.8679110688384258, + "grad_norm": 0.21379490196704865, + "learning_rate": 2.9792093612627692e-05, + "loss": 0.0641, + "step": 29200 + }, + { + "epoch": 0.8708833670193794, + "grad_norm": 0.11453631520271301, + "learning_rate": 2.971387450526415e-05, + "loss": 0.0629, + "step": 29300 + }, + { + "epoch": 0.8738556652003329, + "grad_norm": 0.1669095754623413, + "learning_rate": 2.9635655397900603e-05, + "loss": 0.0638, + "step": 29400 + }, + { + "epoch": 0.8768279633812864, + "grad_norm": 0.18611925840377808, + "learning_rate": 2.9557436290537056e-05, + "loss": 0.0622, + "step": 29500 + }, + { + "epoch": 0.8798002615622399, + "grad_norm": 0.2764523923397064, + "learning_rate": 2.9479217183173508e-05, + "loss": 0.0633, + "step": 29600 + }, + { + "epoch": 0.8827725597431935, + "grad_norm": 0.15690016746520996, + "learning_rate": 2.940099807580996e-05, + "loss": 0.0641, + "step": 29700 + }, + { + "epoch": 0.8857448579241469, + "grad_norm": 0.14751718938350677, + "learning_rate": 2.9322778968446412e-05, + "loss": 0.0635, + "step": 29800 + }, + { + "epoch": 0.8887171561051005, + "grad_norm": 0.1193002238869667, + "learning_rate": 2.9244559861082865e-05, + "loss": 0.0651, + "step": 29900 + }, + { + "epoch": 0.891689454286054, + "grad_norm": 0.17135749757289886, + "learning_rate": 2.9166340753719317e-05, + "loss": 0.0628, + "step": 30000 + }, + { + "epoch": 0.8946617524670075, + "grad_norm": 0.17345505952835083, + "learning_rate": 2.908812164635577e-05, + "loss": 0.0614, + "step": 30100 + }, + { + "epoch": 0.897634050647961, + "grad_norm": 0.23677757382392883, + "learning_rate": 2.900990253899223e-05, + "loss": 0.0616, + "step": 30200 + }, + { + "epoch": 0.9006063488289145, + "grad_norm": 0.1623276323080063, + "learning_rate": 2.893168343162868e-05, + "loss": 0.0632, + "step": 30300 + }, + { + "epoch": 0.903578647009868, + "grad_norm": 0.10355116426944733, + "learning_rate": 2.8853464324265133e-05, + "loss": 0.0633, + "step": 30400 + }, + { + "epoch": 0.9065509451908216, + "grad_norm": 0.10426866263151169, + "learning_rate": 2.8775245216901585e-05, + "loss": 0.0638, + "step": 30500 + }, + { + "epoch": 0.909523243371775, + "grad_norm": 0.17414651811122894, + "learning_rate": 2.8697026109538037e-05, + "loss": 0.0611, + "step": 30600 + }, + { + "epoch": 0.9124955415527286, + "grad_norm": 0.17124363780021667, + "learning_rate": 2.861880700217449e-05, + "loss": 0.0635, + "step": 30700 + }, + { + "epoch": 0.9154678397336821, + "grad_norm": 0.20361237227916718, + "learning_rate": 2.8540587894810945e-05, + "loss": 0.0607, + "step": 30800 + }, + { + "epoch": 0.9184401379146356, + "grad_norm": 0.15385113656520844, + "learning_rate": 2.8462368787447398e-05, + "loss": 0.0645, + "step": 30900 + }, + { + "epoch": 0.9214124360955891, + "grad_norm": 0.18667034804821014, + "learning_rate": 2.8384149680083853e-05, + "loss": 0.0624, + "step": 31000 + }, + { + "epoch": 0.9243847342765427, + "grad_norm": 0.13712774217128754, + "learning_rate": 2.8305930572720306e-05, + "loss": 0.0605, + "step": 31100 + }, + { + "epoch": 0.9273570324574961, + "grad_norm": 0.15793125331401825, + "learning_rate": 2.822771146535676e-05, + "loss": 0.0613, + "step": 31200 + }, + { + "epoch": 0.9303293306384497, + "grad_norm": 0.17206710577011108, + "learning_rate": 2.8149492357993214e-05, + "loss": 0.0611, + "step": 31300 + }, + { + "epoch": 0.9333016288194032, + "grad_norm": 0.1699816882610321, + "learning_rate": 2.8071273250629666e-05, + "loss": 0.0631, + "step": 31400 + }, + { + "epoch": 0.9362739270003567, + "grad_norm": 0.14138641953468323, + "learning_rate": 2.7993054143266118e-05, + "loss": 0.0617, + "step": 31500 + }, + { + "epoch": 0.9392462251813102, + "grad_norm": 0.1501186192035675, + "learning_rate": 2.791483503590257e-05, + "loss": 0.0621, + "step": 31600 + }, + { + "epoch": 0.9422185233622637, + "grad_norm": 0.16840197145938873, + "learning_rate": 2.7836615928539023e-05, + "loss": 0.0614, + "step": 31700 + }, + { + "epoch": 0.9451908215432172, + "grad_norm": 0.16184936463832855, + "learning_rate": 2.7758396821175482e-05, + "loss": 0.0633, + "step": 31800 + }, + { + "epoch": 0.9481631197241708, + "grad_norm": 0.1523510068655014, + "learning_rate": 2.7680177713811934e-05, + "loss": 0.0607, + "step": 31900 + }, + { + "epoch": 0.9511354179051242, + "grad_norm": 0.1361226588487625, + "learning_rate": 2.7601958606448386e-05, + "loss": 0.0624, + "step": 32000 + }, + { + "epoch": 0.9541077160860778, + "grad_norm": 0.17981532216072083, + "learning_rate": 2.752373949908484e-05, + "loss": 0.0612, + "step": 32100 + }, + { + "epoch": 0.9570800142670313, + "grad_norm": 0.12752678990364075, + "learning_rate": 2.744552039172129e-05, + "loss": 0.0614, + "step": 32200 + }, + { + "epoch": 0.9600523124479848, + "grad_norm": 0.09807202219963074, + "learning_rate": 2.7367301284357743e-05, + "loss": 0.0603, + "step": 32300 + }, + { + "epoch": 0.9630246106289383, + "grad_norm": 0.1416402906179428, + "learning_rate": 2.7289082176994195e-05, + "loss": 0.0606, + "step": 32400 + }, + { + "epoch": 0.9659969088098919, + "grad_norm": 0.15595002472400665, + "learning_rate": 2.7210863069630648e-05, + "loss": 0.0615, + "step": 32500 + }, + { + "epoch": 0.9689692069908453, + "grad_norm": 0.15902253985404968, + "learning_rate": 2.71326439622671e-05, + "loss": 0.0632, + "step": 32600 + }, + { + "epoch": 0.9719415051717988, + "grad_norm": 0.14228953421115875, + "learning_rate": 2.705442485490356e-05, + "loss": 0.0627, + "step": 32700 + }, + { + "epoch": 0.9749138033527524, + "grad_norm": 0.20859934389591217, + "learning_rate": 2.697620574754001e-05, + "loss": 0.0607, + "step": 32800 + }, + { + "epoch": 0.9778861015337058, + "grad_norm": 0.12655174732208252, + "learning_rate": 2.6897986640176464e-05, + "loss": 0.0622, + "step": 32900 + }, + { + "epoch": 0.9808583997146594, + "grad_norm": 0.11206143349409103, + "learning_rate": 2.6819767532812916e-05, + "loss": 0.0602, + "step": 33000 + }, + { + "epoch": 0.9838306978956128, + "grad_norm": 0.11327452957630157, + "learning_rate": 2.6741548425449368e-05, + "loss": 0.0606, + "step": 33100 + }, + { + "epoch": 0.9868029960765664, + "grad_norm": 0.14507678151130676, + "learning_rate": 2.6663329318085824e-05, + "loss": 0.0629, + "step": 33200 + }, + { + "epoch": 0.9897752942575199, + "grad_norm": 0.13163773715496063, + "learning_rate": 2.6585110210722276e-05, + "loss": 0.0611, + "step": 33300 + }, + { + "epoch": 0.9927475924384734, + "grad_norm": 0.14853987097740173, + "learning_rate": 2.650689110335873e-05, + "loss": 0.0631, + "step": 33400 + }, + { + "epoch": 0.9957198906194269, + "grad_norm": 0.13821183145046234, + "learning_rate": 2.6428671995995184e-05, + "loss": 0.0621, + "step": 33500 + }, + { + "epoch": 0.9986921888003805, + "grad_norm": 0.22169095277786255, + "learning_rate": 2.635045288863164e-05, + "loss": 0.062, + "step": 33600 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9765524221811527, + "eval_f1": 0.9740709203873195, + "eval_loss": 0.061117351055145264, + "eval_precision": 0.9738283242003958, + "eval_recall": 0.9765524221811527, + "eval_runtime": 860.8276, + "eval_samples_per_second": 138.962, + "eval_steps_per_second": 4.343, + "step": 33644 + }, + { + "epoch": 1.001664486981334, + "grad_norm": 0.12748785316944122, + "learning_rate": 2.6272233781268092e-05, + "loss": 0.0602, + "step": 33700 + }, + { + "epoch": 1.0046367851622875, + "grad_norm": 0.1471647471189499, + "learning_rate": 2.6194014673904544e-05, + "loss": 0.0606, + "step": 33800 + }, + { + "epoch": 1.007609083343241, + "grad_norm": 0.12204054743051529, + "learning_rate": 2.6115795566540997e-05, + "loss": 0.0567, + "step": 33900 + }, + { + "epoch": 1.0105813815241944, + "grad_norm": 0.18438702821731567, + "learning_rate": 2.603757645917745e-05, + "loss": 0.0595, + "step": 34000 + }, + { + "epoch": 1.013553679705148, + "grad_norm": 0.15276484191417694, + "learning_rate": 2.59593573518139e-05, + "loss": 0.0603, + "step": 34100 + }, + { + "epoch": 1.0165259778861016, + "grad_norm": 0.12649641931056976, + "learning_rate": 2.5881138244450354e-05, + "loss": 0.0598, + "step": 34200 + }, + { + "epoch": 1.019498276067055, + "grad_norm": 0.19768394529819489, + "learning_rate": 2.5802919137086806e-05, + "loss": 0.0625, + "step": 34300 + }, + { + "epoch": 1.0224705742480085, + "grad_norm": 0.1956031620502472, + "learning_rate": 2.5724700029723265e-05, + "loss": 0.0583, + "step": 34400 + }, + { + "epoch": 1.0254428724289621, + "grad_norm": 0.16348020732402802, + "learning_rate": 2.5646480922359717e-05, + "loss": 0.0588, + "step": 34500 + }, + { + "epoch": 1.0284151706099156, + "grad_norm": 0.17258135974407196, + "learning_rate": 2.556826181499617e-05, + "loss": 0.0583, + "step": 34600 + }, + { + "epoch": 1.031387468790869, + "grad_norm": 0.14946769177913666, + "learning_rate": 2.5490042707632622e-05, + "loss": 0.0592, + "step": 34700 + }, + { + "epoch": 1.0343597669718225, + "grad_norm": 0.19938164949417114, + "learning_rate": 2.5411823600269074e-05, + "loss": 0.0594, + "step": 34800 + }, + { + "epoch": 1.0373320651527762, + "grad_norm": 0.22682930529117584, + "learning_rate": 2.5333604492905526e-05, + "loss": 0.0599, + "step": 34900 + }, + { + "epoch": 1.0403043633337297, + "grad_norm": 0.2040901482105255, + "learning_rate": 2.525538538554198e-05, + "loss": 0.0592, + "step": 35000 + }, + { + "epoch": 1.0432766615146831, + "grad_norm": 0.1988217681646347, + "learning_rate": 2.517716627817843e-05, + "loss": 0.0578, + "step": 35100 + }, + { + "epoch": 1.0462489596956366, + "grad_norm": 0.15943174064159393, + "learning_rate": 2.509894717081489e-05, + "loss": 0.0595, + "step": 35200 + }, + { + "epoch": 1.0492212578765903, + "grad_norm": 0.10466223210096359, + "learning_rate": 2.5020728063451342e-05, + "loss": 0.0621, + "step": 35300 + }, + { + "epoch": 1.0521935560575437, + "grad_norm": 0.18628579378128052, + "learning_rate": 2.4942508956087794e-05, + "loss": 0.0595, + "step": 35400 + }, + { + "epoch": 1.0551658542384972, + "grad_norm": 0.14460884034633636, + "learning_rate": 2.4864289848724247e-05, + "loss": 0.0613, + "step": 35500 + }, + { + "epoch": 1.0581381524194506, + "grad_norm": 0.16263699531555176, + "learning_rate": 2.47860707413607e-05, + "loss": 0.0594, + "step": 35600 + }, + { + "epoch": 1.0611104506004043, + "grad_norm": 0.3361489474773407, + "learning_rate": 2.4707851633997155e-05, + "loss": 0.0586, + "step": 35700 + }, + { + "epoch": 1.0640827487813578, + "grad_norm": 0.1547601968050003, + "learning_rate": 2.4629632526633607e-05, + "loss": 0.0609, + "step": 35800 + }, + { + "epoch": 1.0670550469623112, + "grad_norm": 0.2251814603805542, + "learning_rate": 2.4551413419270063e-05, + "loss": 0.0608, + "step": 35900 + }, + { + "epoch": 1.0700273451432647, + "grad_norm": 0.12384767085313797, + "learning_rate": 2.4473194311906515e-05, + "loss": 0.0605, + "step": 36000 + }, + { + "epoch": 1.0729996433242184, + "grad_norm": 0.166926771402359, + "learning_rate": 2.4394975204542967e-05, + "loss": 0.0628, + "step": 36100 + }, + { + "epoch": 1.0759719415051718, + "grad_norm": 0.19243651628494263, + "learning_rate": 2.431675609717942e-05, + "loss": 0.0598, + "step": 36200 + }, + { + "epoch": 1.0789442396861253, + "grad_norm": 0.16473597288131714, + "learning_rate": 2.4238536989815875e-05, + "loss": 0.0597, + "step": 36300 + }, + { + "epoch": 1.0819165378670788, + "grad_norm": 0.17486000061035156, + "learning_rate": 2.4160317882452327e-05, + "loss": 0.0611, + "step": 36400 + }, + { + "epoch": 1.0848888360480324, + "grad_norm": 0.1539386510848999, + "learning_rate": 2.408209877508878e-05, + "loss": 0.0581, + "step": 36500 + }, + { + "epoch": 1.087861134228986, + "grad_norm": 0.19575689733028412, + "learning_rate": 2.4003879667725232e-05, + "loss": 0.0573, + "step": 36600 + }, + { + "epoch": 1.0908334324099394, + "grad_norm": 0.11995241791009903, + "learning_rate": 2.3925660560361688e-05, + "loss": 0.0578, + "step": 36700 + }, + { + "epoch": 1.0938057305908928, + "grad_norm": 0.18003156781196594, + "learning_rate": 2.384744145299814e-05, + "loss": 0.0596, + "step": 36800 + }, + { + "epoch": 1.0967780287718465, + "grad_norm": 0.13467411696910858, + "learning_rate": 2.3769222345634592e-05, + "loss": 0.0613, + "step": 36900 + }, + { + "epoch": 1.0997503269528, + "grad_norm": 0.13951335847377777, + "learning_rate": 2.3691003238271045e-05, + "loss": 0.0578, + "step": 37000 + }, + { + "epoch": 1.1027226251337534, + "grad_norm": 0.20946165919303894, + "learning_rate": 2.36127841309075e-05, + "loss": 0.0584, + "step": 37100 + }, + { + "epoch": 1.1056949233147069, + "grad_norm": 0.24194690585136414, + "learning_rate": 2.3534565023543953e-05, + "loss": 0.0615, + "step": 37200 + }, + { + "epoch": 1.1086672214956605, + "grad_norm": 0.17437128722667694, + "learning_rate": 2.3456345916180405e-05, + "loss": 0.0592, + "step": 37300 + }, + { + "epoch": 1.111639519676614, + "grad_norm": 0.1673300862312317, + "learning_rate": 2.3378126808816857e-05, + "loss": 0.0589, + "step": 37400 + }, + { + "epoch": 1.1146118178575675, + "grad_norm": 0.0975237712264061, + "learning_rate": 2.3299907701453313e-05, + "loss": 0.0607, + "step": 37500 + }, + { + "epoch": 1.117584116038521, + "grad_norm": 0.127375528216362, + "learning_rate": 2.3221688594089765e-05, + "loss": 0.0595, + "step": 37600 + }, + { + "epoch": 1.1205564142194744, + "grad_norm": 0.20405639708042145, + "learning_rate": 2.3143469486726217e-05, + "loss": 0.0612, + "step": 37700 + }, + { + "epoch": 1.123528712400428, + "grad_norm": 0.1644122451543808, + "learning_rate": 2.306525037936267e-05, + "loss": 0.0609, + "step": 37800 + }, + { + "epoch": 1.1265010105813815, + "grad_norm": 0.18849913775920868, + "learning_rate": 2.2987031271999125e-05, + "loss": 0.0595, + "step": 37900 + }, + { + "epoch": 1.129473308762335, + "grad_norm": 0.1566327065229416, + "learning_rate": 2.2908812164635578e-05, + "loss": 0.0609, + "step": 38000 + }, + { + "epoch": 1.1324456069432887, + "grad_norm": 0.17153958976268768, + "learning_rate": 2.2830593057272033e-05, + "loss": 0.0584, + "step": 38100 + }, + { + "epoch": 1.1354179051242421, + "grad_norm": 0.17965900897979736, + "learning_rate": 2.2752373949908486e-05, + "loss": 0.0606, + "step": 38200 + }, + { + "epoch": 1.1383902033051956, + "grad_norm": 0.16267681121826172, + "learning_rate": 2.2674154842544938e-05, + "loss": 0.0598, + "step": 38300 + }, + { + "epoch": 1.141362501486149, + "grad_norm": 0.11333649605512619, + "learning_rate": 2.2595935735181393e-05, + "loss": 0.0593, + "step": 38400 + }, + { + "epoch": 1.1443347996671025, + "grad_norm": 0.0, + "learning_rate": 2.2517716627817846e-05, + "loss": 0.0585, + "step": 38500 + }, + { + "epoch": 1.1473070978480562, + "grad_norm": 0.1778153032064438, + "learning_rate": 2.2439497520454298e-05, + "loss": 0.0555, + "step": 38600 + }, + { + "epoch": 1.1502793960290096, + "grad_norm": 0.14332982897758484, + "learning_rate": 2.236127841309075e-05, + "loss": 0.0573, + "step": 38700 + }, + { + "epoch": 1.153251694209963, + "grad_norm": 0.15101206302642822, + "learning_rate": 2.2283059305727206e-05, + "loss": 0.0612, + "step": 38800 + }, + { + "epoch": 1.1562239923909168, + "grad_norm": 0.13762034475803375, + "learning_rate": 2.2204840198363658e-05, + "loss": 0.0596, + "step": 38900 + }, + { + "epoch": 1.1591962905718702, + "grad_norm": 0.16889068484306335, + "learning_rate": 2.212662109100011e-05, + "loss": 0.0606, + "step": 39000 + }, + { + "epoch": 1.1621685887528237, + "grad_norm": 0.11983152478933334, + "learning_rate": 2.2048401983636563e-05, + "loss": 0.0572, + "step": 39100 + }, + { + "epoch": 1.1651408869337772, + "grad_norm": 0.11952123045921326, + "learning_rate": 2.197018287627302e-05, + "loss": 0.0597, + "step": 39200 + }, + { + "epoch": 1.1681131851147306, + "grad_norm": 0.16299547255039215, + "learning_rate": 2.189196376890947e-05, + "loss": 0.0605, + "step": 39300 + }, + { + "epoch": 1.1710854832956843, + "grad_norm": 0.17656153440475464, + "learning_rate": 2.1813744661545923e-05, + "loss": 0.0589, + "step": 39400 + }, + { + "epoch": 1.1740577814766378, + "grad_norm": 0.22217115759849548, + "learning_rate": 2.1735525554182375e-05, + "loss": 0.0615, + "step": 39500 + }, + { + "epoch": 1.1770300796575912, + "grad_norm": 0.11914844065904617, + "learning_rate": 2.1657306446818828e-05, + "loss": 0.0589, + "step": 39600 + }, + { + "epoch": 1.1800023778385447, + "grad_norm": 0.15875238180160522, + "learning_rate": 2.1579087339455283e-05, + "loss": 0.0606, + "step": 39700 + }, + { + "epoch": 1.1829746760194984, + "grad_norm": 0.1375878006219864, + "learning_rate": 2.1500868232091736e-05, + "loss": 0.0598, + "step": 39800 + }, + { + "epoch": 1.1859469742004518, + "grad_norm": 0.12903353571891785, + "learning_rate": 2.1422649124728188e-05, + "loss": 0.0584, + "step": 39900 + }, + { + "epoch": 1.1889192723814053, + "grad_norm": 0.19753535091876984, + "learning_rate": 2.134443001736464e-05, + "loss": 0.058, + "step": 40000 + }, + { + "epoch": 1.1918915705623587, + "grad_norm": 0.19985559582710266, + "learning_rate": 2.1266210910001096e-05, + "loss": 0.0599, + "step": 40100 + }, + { + "epoch": 1.1948638687433124, + "grad_norm": 0.20591992139816284, + "learning_rate": 2.1187991802637548e-05, + "loss": 0.0605, + "step": 40200 + }, + { + "epoch": 1.1978361669242659, + "grad_norm": 0.18269148468971252, + "learning_rate": 2.1109772695274004e-05, + "loss": 0.0615, + "step": 40300 + }, + { + "epoch": 1.2008084651052193, + "grad_norm": 0.3032814562320709, + "learning_rate": 2.1031553587910456e-05, + "loss": 0.0597, + "step": 40400 + }, + { + "epoch": 1.2037807632861728, + "grad_norm": 0.1347813457250595, + "learning_rate": 2.0953334480546912e-05, + "loss": 0.0585, + "step": 40500 + }, + { + "epoch": 1.2067530614671265, + "grad_norm": 0.15395672619342804, + "learning_rate": 2.0875115373183364e-05, + "loss": 0.0584, + "step": 40600 + }, + { + "epoch": 1.20972535964808, + "grad_norm": 0.1638413816690445, + "learning_rate": 2.0796896265819816e-05, + "loss": 0.0599, + "step": 40700 + }, + { + "epoch": 1.2126976578290334, + "grad_norm": 0.20154951512813568, + "learning_rate": 2.071867715845627e-05, + "loss": 0.058, + "step": 40800 + }, + { + "epoch": 1.2156699560099868, + "grad_norm": 0.12632374465465546, + "learning_rate": 2.0640458051092724e-05, + "loss": 0.0615, + "step": 40900 + }, + { + "epoch": 1.2186422541909405, + "grad_norm": 0.187855526804924, + "learning_rate": 2.0562238943729177e-05, + "loss": 0.0617, + "step": 41000 + }, + { + "epoch": 1.221614552371894, + "grad_norm": 0.16328568756580353, + "learning_rate": 2.048401983636563e-05, + "loss": 0.0583, + "step": 41100 + }, + { + "epoch": 1.2245868505528474, + "grad_norm": 0.15591071546077728, + "learning_rate": 2.040580072900208e-05, + "loss": 0.0594, + "step": 41200 + }, + { + "epoch": 1.227559148733801, + "grad_norm": 0.11492554098367691, + "learning_rate": 2.0327581621638537e-05, + "loss": 0.0569, + "step": 41300 + }, + { + "epoch": 1.2305314469147546, + "grad_norm": 0.18282866477966309, + "learning_rate": 2.024936251427499e-05, + "loss": 0.0628, + "step": 41400 + }, + { + "epoch": 1.233503745095708, + "grad_norm": 0.2215399593114853, + "learning_rate": 2.017114340691144e-05, + "loss": 0.0591, + "step": 41500 + }, + { + "epoch": 1.2364760432766615, + "grad_norm": 0.17401248216629028, + "learning_rate": 2.0092924299547894e-05, + "loss": 0.0587, + "step": 41600 + }, + { + "epoch": 1.239448341457615, + "grad_norm": 0.21835525333881378, + "learning_rate": 2.0014705192184346e-05, + "loss": 0.0611, + "step": 41700 + }, + { + "epoch": 1.2424206396385686, + "grad_norm": 0.13052473962306976, + "learning_rate": 1.99364860848208e-05, + "loss": 0.0594, + "step": 41800 + }, + { + "epoch": 1.245392937819522, + "grad_norm": 0.13643550872802734, + "learning_rate": 1.9858266977457254e-05, + "loss": 0.0604, + "step": 41900 + }, + { + "epoch": 1.2483652360004756, + "grad_norm": 0.11867769807577133, + "learning_rate": 1.9780047870093706e-05, + "loss": 0.0592, + "step": 42000 + }, + { + "epoch": 1.251337534181429, + "grad_norm": 0.33036288619041443, + "learning_rate": 1.970182876273016e-05, + "loss": 0.058, + "step": 42100 + }, + { + "epoch": 1.2543098323623827, + "grad_norm": 0.20882196724414825, + "learning_rate": 1.9623609655366614e-05, + "loss": 0.0593, + "step": 42200 + }, + { + "epoch": 1.2572821305433362, + "grad_norm": 0.14684346318244934, + "learning_rate": 1.9545390548003066e-05, + "loss": 0.0596, + "step": 42300 + }, + { + "epoch": 1.2602544287242896, + "grad_norm": 0.14338365197181702, + "learning_rate": 1.946717144063952e-05, + "loss": 0.059, + "step": 42400 + }, + { + "epoch": 1.263226726905243, + "grad_norm": 0.1522979885339737, + "learning_rate": 1.938895233327597e-05, + "loss": 0.0591, + "step": 42500 + }, + { + "epoch": 1.2661990250861965, + "grad_norm": 0.18392986059188843, + "learning_rate": 1.9310733225912427e-05, + "loss": 0.059, + "step": 42600 + }, + { + "epoch": 1.2691713232671502, + "grad_norm": 0.18199172616004944, + "learning_rate": 1.923251411854888e-05, + "loss": 0.0575, + "step": 42700 + }, + { + "epoch": 1.2721436214481037, + "grad_norm": 0.17532601952552795, + "learning_rate": 1.9154295011185335e-05, + "loss": 0.0575, + "step": 42800 + }, + { + "epoch": 1.2751159196290571, + "grad_norm": 0.1952797770500183, + "learning_rate": 1.9076075903821787e-05, + "loss": 0.0594, + "step": 42900 + }, + { + "epoch": 1.2780882178100108, + "grad_norm": 0.2015935480594635, + "learning_rate": 1.8997856796458243e-05, + "loss": 0.0596, + "step": 43000 + }, + { + "epoch": 1.2810605159909643, + "grad_norm": 0.19316500425338745, + "learning_rate": 1.8919637689094695e-05, + "loss": 0.0618, + "step": 43100 + }, + { + "epoch": 1.2840328141719177, + "grad_norm": 0.2155391424894333, + "learning_rate": 1.8841418581731147e-05, + "loss": 0.0591, + "step": 43200 + }, + { + "epoch": 1.2870051123528712, + "grad_norm": 0.1305920034646988, + "learning_rate": 1.87631994743676e-05, + "loss": 0.0585, + "step": 43300 + }, + { + "epoch": 1.2899774105338246, + "grad_norm": 0.17479225993156433, + "learning_rate": 1.868498036700405e-05, + "loss": 0.0577, + "step": 43400 + }, + { + "epoch": 1.2929497087147783, + "grad_norm": 0.16353924572467804, + "learning_rate": 1.8606761259640507e-05, + "loss": 0.0628, + "step": 43500 + }, + { + "epoch": 1.2959220068957318, + "grad_norm": 0.20186668634414673, + "learning_rate": 1.852854215227696e-05, + "loss": 0.0566, + "step": 43600 + }, + { + "epoch": 1.2988943050766852, + "grad_norm": 0.12033119797706604, + "learning_rate": 1.8450323044913412e-05, + "loss": 0.0586, + "step": 43700 + }, + { + "epoch": 1.301866603257639, + "grad_norm": 0.11598894745111465, + "learning_rate": 1.8372103937549864e-05, + "loss": 0.059, + "step": 43800 + }, + { + "epoch": 1.3048389014385924, + "grad_norm": 0.13539230823516846, + "learning_rate": 1.829388483018632e-05, + "loss": 0.0591, + "step": 43900 + }, + { + "epoch": 1.3078111996195458, + "grad_norm": 0.18936382234096527, + "learning_rate": 1.8215665722822772e-05, + "loss": 0.0576, + "step": 44000 + }, + { + "epoch": 1.3107834978004993, + "grad_norm": 0.1952689290046692, + "learning_rate": 1.8137446615459224e-05, + "loss": 0.0574, + "step": 44100 + }, + { + "epoch": 1.3137557959814528, + "grad_norm": 0.10991407930850983, + "learning_rate": 1.8059227508095677e-05, + "loss": 0.0576, + "step": 44200 + }, + { + "epoch": 1.3167280941624064, + "grad_norm": 0.1684737652540207, + "learning_rate": 1.7981008400732132e-05, + "loss": 0.0591, + "step": 44300 + }, + { + "epoch": 1.31970039234336, + "grad_norm": 0.2002449780702591, + "learning_rate": 1.7902789293368585e-05, + "loss": 0.0587, + "step": 44400 + }, + { + "epoch": 1.3226726905243134, + "grad_norm": 0.13676512241363525, + "learning_rate": 1.7824570186005037e-05, + "loss": 0.0586, + "step": 44500 + }, + { + "epoch": 1.325644988705267, + "grad_norm": 0.236215740442276, + "learning_rate": 1.774635107864149e-05, + "loss": 0.0608, + "step": 44600 + }, + { + "epoch": 1.3286172868862205, + "grad_norm": 0.17539894580841064, + "learning_rate": 1.7668131971277945e-05, + "loss": 0.059, + "step": 44700 + }, + { + "epoch": 1.331589585067174, + "grad_norm": 0.16261810064315796, + "learning_rate": 1.7589912863914397e-05, + "loss": 0.0589, + "step": 44800 + }, + { + "epoch": 1.3345618832481274, + "grad_norm": 0.15192443132400513, + "learning_rate": 1.751169375655085e-05, + "loss": 0.0574, + "step": 44900 + }, + { + "epoch": 1.3375341814290809, + "grad_norm": 0.22095446288585663, + "learning_rate": 1.7433474649187305e-05, + "loss": 0.0572, + "step": 45000 + }, + { + "epoch": 1.3405064796100346, + "grad_norm": 0.16923335194587708, + "learning_rate": 1.7355255541823757e-05, + "loss": 0.0587, + "step": 45100 + }, + { + "epoch": 1.343478777790988, + "grad_norm": 0.18771971762180328, + "learning_rate": 1.7277036434460213e-05, + "loss": 0.0595, + "step": 45200 + }, + { + "epoch": 1.3464510759719415, + "grad_norm": 0.0, + "learning_rate": 1.7198817327096665e-05, + "loss": 0.0586, + "step": 45300 + }, + { + "epoch": 1.3494233741528951, + "grad_norm": 0.21618609130382538, + "learning_rate": 1.7120598219733118e-05, + "loss": 0.0568, + "step": 45400 + }, + { + "epoch": 1.3523956723338486, + "grad_norm": 0.13301512598991394, + "learning_rate": 1.704237911236957e-05, + "loss": 0.0589, + "step": 45500 + }, + { + "epoch": 1.355367970514802, + "grad_norm": 0.24915924668312073, + "learning_rate": 1.6964160005006026e-05, + "loss": 0.0596, + "step": 45600 + }, + { + "epoch": 1.3583402686957555, + "grad_norm": 0.1706874519586563, + "learning_rate": 1.6885940897642478e-05, + "loss": 0.0583, + "step": 45700 + }, + { + "epoch": 1.361312566876709, + "grad_norm": 0.19821156561374664, + "learning_rate": 1.680772179027893e-05, + "loss": 0.06, + "step": 45800 + }, + { + "epoch": 1.3642848650576627, + "grad_norm": 0.0, + "learning_rate": 1.6729502682915382e-05, + "loss": 0.0592, + "step": 45900 + }, + { + "epoch": 1.3672571632386161, + "grad_norm": 0.20178964734077454, + "learning_rate": 1.6651283575551838e-05, + "loss": 0.0566, + "step": 46000 + }, + { + "epoch": 1.3702294614195696, + "grad_norm": 0.17375877499580383, + "learning_rate": 1.657306446818829e-05, + "loss": 0.0599, + "step": 46100 + }, + { + "epoch": 1.373201759600523, + "grad_norm": 0.15663665533065796, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.0587, + "step": 46200 + }, + { + "epoch": 1.3761740577814767, + "grad_norm": 0.14343243837356567, + "learning_rate": 1.6416626253461195e-05, + "loss": 0.0592, + "step": 46300 + }, + { + "epoch": 1.3791463559624302, + "grad_norm": 0.0, + "learning_rate": 1.633840714609765e-05, + "loss": 0.0594, + "step": 46400 + }, + { + "epoch": 1.3821186541433836, + "grad_norm": 0.1911957859992981, + "learning_rate": 1.6260188038734103e-05, + "loss": 0.0588, + "step": 46500 + }, + { + "epoch": 1.385090952324337, + "grad_norm": 0.10217937082052231, + "learning_rate": 1.6181968931370555e-05, + "loss": 0.0604, + "step": 46600 + }, + { + "epoch": 1.3880632505052906, + "grad_norm": 0.19517165422439575, + "learning_rate": 1.6103749824007007e-05, + "loss": 0.0609, + "step": 46700 + }, + { + "epoch": 1.3910355486862442, + "grad_norm": 0.1945805549621582, + "learning_rate": 1.6025530716643463e-05, + "loss": 0.0582, + "step": 46800 + }, + { + "epoch": 1.3940078468671977, + "grad_norm": 0.20694541931152344, + "learning_rate": 1.5947311609279915e-05, + "loss": 0.0588, + "step": 46900 + }, + { + "epoch": 1.3969801450481512, + "grad_norm": 0.15820534527301788, + "learning_rate": 1.5869092501916368e-05, + "loss": 0.0593, + "step": 47000 + }, + { + "epoch": 1.3999524432291048, + "grad_norm": 0.1335432231426239, + "learning_rate": 1.579087339455282e-05, + "loss": 0.0599, + "step": 47100 + }, + { + "epoch": 1.4029247414100583, + "grad_norm": 0.2578977942466736, + "learning_rate": 1.5712654287189276e-05, + "loss": 0.0597, + "step": 47200 + }, + { + "epoch": 1.4058970395910118, + "grad_norm": 0.20304128527641296, + "learning_rate": 1.5634435179825728e-05, + "loss": 0.0597, + "step": 47300 + }, + { + "epoch": 1.4088693377719652, + "grad_norm": 0.22874633967876434, + "learning_rate": 1.5556216072462184e-05, + "loss": 0.0583, + "step": 47400 + }, + { + "epoch": 1.4118416359529187, + "grad_norm": 0.20737852156162262, + "learning_rate": 1.5477996965098636e-05, + "loss": 0.0589, + "step": 47500 + }, + { + "epoch": 1.4148139341338724, + "grad_norm": 0.217135488986969, + "learning_rate": 1.5399777857735088e-05, + "loss": 0.0575, + "step": 47600 + }, + { + "epoch": 1.4177862323148258, + "grad_norm": 0.1210002452135086, + "learning_rate": 1.5321558750371544e-05, + "loss": 0.0613, + "step": 47700 + }, + { + "epoch": 1.4207585304957793, + "grad_norm": 0.0, + "learning_rate": 1.5243339643007994e-05, + "loss": 0.0606, + "step": 47800 + }, + { + "epoch": 1.423730828676733, + "grad_norm": 0.15625840425491333, + "learning_rate": 1.5165120535644447e-05, + "loss": 0.0574, + "step": 47900 + }, + { + "epoch": 1.4267031268576864, + "grad_norm": 0.14403069019317627, + "learning_rate": 1.50869014282809e-05, + "loss": 0.0577, + "step": 48000 + }, + { + "epoch": 1.4296754250386399, + "grad_norm": 0.12980596721172333, + "learning_rate": 1.5008682320917355e-05, + "loss": 0.0604, + "step": 48100 + }, + { + "epoch": 1.4326477232195933, + "grad_norm": 0.2431592047214508, + "learning_rate": 1.4930463213553809e-05, + "loss": 0.0584, + "step": 48200 + }, + { + "epoch": 1.4356200214005468, + "grad_norm": 0.09385453909635544, + "learning_rate": 1.4852244106190261e-05, + "loss": 0.0603, + "step": 48300 + }, + { + "epoch": 1.4385923195815005, + "grad_norm": 0.1633906066417694, + "learning_rate": 1.4774024998826713e-05, + "loss": 0.0587, + "step": 48400 + }, + { + "epoch": 1.441564617762454, + "grad_norm": 0.16115331649780273, + "learning_rate": 1.4695805891463169e-05, + "loss": 0.058, + "step": 48500 + }, + { + "epoch": 1.4445369159434074, + "grad_norm": 0.1680040806531906, + "learning_rate": 1.4617586784099621e-05, + "loss": 0.0572, + "step": 48600 + }, + { + "epoch": 1.447509214124361, + "grad_norm": 0.14411240816116333, + "learning_rate": 1.4539367676736073e-05, + "loss": 0.0566, + "step": 48700 + }, + { + "epoch": 1.4504815123053145, + "grad_norm": 0.14987049996852875, + "learning_rate": 1.4461148569372526e-05, + "loss": 0.0559, + "step": 48800 + }, + { + "epoch": 1.453453810486268, + "grad_norm": 0.0, + "learning_rate": 1.4382929462008981e-05, + "loss": 0.0577, + "step": 48900 + }, + { + "epoch": 1.4564261086672214, + "grad_norm": 0.15166877210140228, + "learning_rate": 1.4304710354645434e-05, + "loss": 0.0573, + "step": 49000 + }, + { + "epoch": 1.459398406848175, + "grad_norm": 0.15557777881622314, + "learning_rate": 1.4226491247281886e-05, + "loss": 0.0589, + "step": 49100 + }, + { + "epoch": 1.4623707050291286, + "grad_norm": 0.19683128595352173, + "learning_rate": 1.414827213991834e-05, + "loss": 0.0573, + "step": 49200 + }, + { + "epoch": 1.465343003210082, + "grad_norm": 0.16488976776599884, + "learning_rate": 1.4070053032554792e-05, + "loss": 0.0602, + "step": 49300 + }, + { + "epoch": 1.4683153013910355, + "grad_norm": 0.18177999556064606, + "learning_rate": 1.3991833925191248e-05, + "loss": 0.0577, + "step": 49400 + }, + { + "epoch": 1.4712875995719892, + "grad_norm": 0.22599105536937714, + "learning_rate": 1.39136148178277e-05, + "loss": 0.0574, + "step": 49500 + }, + { + "epoch": 1.4742598977529426, + "grad_norm": 0.1609424203634262, + "learning_rate": 1.3835395710464152e-05, + "loss": 0.0572, + "step": 49600 + }, + { + "epoch": 1.477232195933896, + "grad_norm": 0.16923430562019348, + "learning_rate": 1.3757176603100605e-05, + "loss": 0.0595, + "step": 49700 + }, + { + "epoch": 1.4802044941148496, + "grad_norm": 0.0, + "learning_rate": 1.367895749573706e-05, + "loss": 0.0597, + "step": 49800 + }, + { + "epoch": 1.483176792295803, + "grad_norm": 0.20833246409893036, + "learning_rate": 1.3600738388373513e-05, + "loss": 0.0571, + "step": 49900 + }, + { + "epoch": 1.4861490904767567, + "grad_norm": 0.22977645695209503, + "learning_rate": 1.3522519281009965e-05, + "loss": 0.059, + "step": 50000 + }, + { + "epoch": 1.4891213886577102, + "grad_norm": 0.2032373696565628, + "learning_rate": 1.3444300173646417e-05, + "loss": 0.0592, + "step": 50100 + }, + { + "epoch": 1.4920936868386636, + "grad_norm": 0.10967738181352615, + "learning_rate": 1.3366081066282873e-05, + "loss": 0.0591, + "step": 50200 + }, + { + "epoch": 1.4950659850196173, + "grad_norm": 0.13823756575584412, + "learning_rate": 1.3287861958919325e-05, + "loss": 0.0578, + "step": 50300 + }, + { + "epoch": 1.4980382832005708, + "grad_norm": 0.1448056697845459, + "learning_rate": 1.320964285155578e-05, + "loss": 0.0574, + "step": 50400 + }, + { + "epoch": 1.5010105813815242, + "grad_norm": 0.19842751324176788, + "learning_rate": 1.3131423744192231e-05, + "loss": 0.058, + "step": 50500 + }, + { + "epoch": 1.5039828795624777, + "grad_norm": 0.1823042631149292, + "learning_rate": 1.3053204636828687e-05, + "loss": 0.0585, + "step": 50600 + }, + { + "epoch": 1.5069551777434311, + "grad_norm": 0.16752302646636963, + "learning_rate": 1.297498552946514e-05, + "loss": 0.0581, + "step": 50700 + }, + { + "epoch": 1.5099274759243846, + "grad_norm": 0.1563909947872162, + "learning_rate": 1.2896766422101592e-05, + "loss": 0.0559, + "step": 50800 + }, + { + "epoch": 1.5128997741053383, + "grad_norm": 0.15815581381320953, + "learning_rate": 1.2818547314738044e-05, + "loss": 0.0581, + "step": 50900 + }, + { + "epoch": 1.5158720722862917, + "grad_norm": 0.22975870966911316, + "learning_rate": 1.2740328207374496e-05, + "loss": 0.0581, + "step": 51000 + }, + { + "epoch": 1.5188443704672454, + "grad_norm": 0.12994658946990967, + "learning_rate": 1.2662109100010952e-05, + "loss": 0.059, + "step": 51100 + }, + { + "epoch": 1.5218166686481989, + "grad_norm": 0.06841659545898438, + "learning_rate": 1.2583889992647404e-05, + "loss": 0.0564, + "step": 51200 + }, + { + "epoch": 1.5247889668291523, + "grad_norm": 0.32525792717933655, + "learning_rate": 1.2505670885283857e-05, + "loss": 0.0582, + "step": 51300 + }, + { + "epoch": 1.5277612650101058, + "grad_norm": 0.17364244163036346, + "learning_rate": 1.242745177792031e-05, + "loss": 0.0567, + "step": 51400 + }, + { + "epoch": 1.5307335631910592, + "grad_norm": 0.2119700312614441, + "learning_rate": 1.2349232670556764e-05, + "loss": 0.0586, + "step": 51500 + }, + { + "epoch": 1.5337058613720127, + "grad_norm": 0.18451537191867828, + "learning_rate": 1.2271013563193218e-05, + "loss": 0.0575, + "step": 51600 + }, + { + "epoch": 1.5366781595529664, + "grad_norm": 0.16617034375667572, + "learning_rate": 1.219279445582967e-05, + "loss": 0.06, + "step": 51700 + }, + { + "epoch": 1.5396504577339198, + "grad_norm": 0.19621536135673523, + "learning_rate": 1.2114575348466125e-05, + "loss": 0.0576, + "step": 51800 + }, + { + "epoch": 1.5426227559148735, + "grad_norm": 0.2017040103673935, + "learning_rate": 1.2036356241102577e-05, + "loss": 0.0583, + "step": 51900 + }, + { + "epoch": 1.545595054095827, + "grad_norm": 0.17019014060497284, + "learning_rate": 1.1958137133739031e-05, + "loss": 0.0587, + "step": 52000 + }, + { + "epoch": 1.5485673522767804, + "grad_norm": 0.14348675310611725, + "learning_rate": 1.1879918026375483e-05, + "loss": 0.0576, + "step": 52100 + }, + { + "epoch": 1.551539650457734, + "grad_norm": 0.15633194148540497, + "learning_rate": 1.1801698919011937e-05, + "loss": 0.0568, + "step": 52200 + }, + { + "epoch": 1.5545119486386874, + "grad_norm": 0.19768303632736206, + "learning_rate": 1.172347981164839e-05, + "loss": 0.0591, + "step": 52300 + }, + { + "epoch": 1.5574842468196408, + "grad_norm": 0.21937786042690277, + "learning_rate": 1.1645260704284844e-05, + "loss": 0.0603, + "step": 52400 + }, + { + "epoch": 1.5604565450005945, + "grad_norm": 0.12828095257282257, + "learning_rate": 1.1567041596921296e-05, + "loss": 0.0598, + "step": 52500 + }, + { + "epoch": 1.563428843181548, + "grad_norm": 0.13009525835514069, + "learning_rate": 1.148882248955775e-05, + "loss": 0.0576, + "step": 52600 + }, + { + "epoch": 1.5664011413625016, + "grad_norm": 0.1119093969464302, + "learning_rate": 1.1410603382194204e-05, + "loss": 0.0593, + "step": 52700 + }, + { + "epoch": 1.569373439543455, + "grad_norm": 0.15185515582561493, + "learning_rate": 1.1332384274830656e-05, + "loss": 0.0577, + "step": 52800 + }, + { + "epoch": 1.5723457377244086, + "grad_norm": 0.1988178938627243, + "learning_rate": 1.125416516746711e-05, + "loss": 0.0592, + "step": 52900 + }, + { + "epoch": 1.575318035905362, + "grad_norm": 0.11751049011945724, + "learning_rate": 1.1175946060103562e-05, + "loss": 0.0572, + "step": 53000 + }, + { + "epoch": 1.5782903340863155, + "grad_norm": 0.2535243034362793, + "learning_rate": 1.1097726952740016e-05, + "loss": 0.0599, + "step": 53100 + }, + { + "epoch": 1.581262632267269, + "grad_norm": 0.12450870126485825, + "learning_rate": 1.1019507845376469e-05, + "loss": 0.0587, + "step": 53200 + }, + { + "epoch": 1.5842349304482226, + "grad_norm": 0.13519836962223053, + "learning_rate": 1.0941288738012923e-05, + "loss": 0.0577, + "step": 53300 + }, + { + "epoch": 1.587207228629176, + "grad_norm": 0.212506502866745, + "learning_rate": 1.0863069630649375e-05, + "loss": 0.0576, + "step": 53400 + }, + { + "epoch": 1.5901795268101298, + "grad_norm": 0.15001560747623444, + "learning_rate": 1.0784850523285829e-05, + "loss": 0.0572, + "step": 53500 + }, + { + "epoch": 1.5931518249910832, + "grad_norm": 0.0, + "learning_rate": 1.0706631415922281e-05, + "loss": 0.0574, + "step": 53600 + }, + { + "epoch": 1.5961241231720367, + "grad_norm": 0.14410234987735748, + "learning_rate": 1.0628412308558735e-05, + "loss": 0.0552, + "step": 53700 + }, + { + "epoch": 1.5990964213529901, + "grad_norm": 0.18214349448680878, + "learning_rate": 1.0550193201195187e-05, + "loss": 0.0575, + "step": 53800 + }, + { + "epoch": 1.6020687195339436, + "grad_norm": 0.19318780303001404, + "learning_rate": 1.0471974093831641e-05, + "loss": 0.0581, + "step": 53900 + }, + { + "epoch": 1.605041017714897, + "grad_norm": 0.1810021996498108, + "learning_rate": 1.0393754986468095e-05, + "loss": 0.0588, + "step": 54000 + }, + { + "epoch": 1.6080133158958507, + "grad_norm": 0.14539185166358948, + "learning_rate": 1.031553587910455e-05, + "loss": 0.0583, + "step": 54100 + }, + { + "epoch": 1.6109856140768042, + "grad_norm": 0.15464851260185242, + "learning_rate": 1.0237316771741002e-05, + "loss": 0.0571, + "step": 54200 + }, + { + "epoch": 1.6139579122577576, + "grad_norm": 0.18842099606990814, + "learning_rate": 1.0159097664377456e-05, + "loss": 0.0568, + "step": 54300 + }, + { + "epoch": 1.6169302104387113, + "grad_norm": 0.16852693259716034, + "learning_rate": 1.0080878557013908e-05, + "loss": 0.0581, + "step": 54400 + }, + { + "epoch": 1.6199025086196648, + "grad_norm": 0.24774104356765747, + "learning_rate": 1.0002659449650362e-05, + "loss": 0.0588, + "step": 54500 + }, + { + "epoch": 1.6228748068006182, + "grad_norm": 0.14442922174930573, + "learning_rate": 9.924440342286814e-06, + "loss": 0.0567, + "step": 54600 + }, + { + "epoch": 1.6258471049815717, + "grad_norm": 0.24381335079669952, + "learning_rate": 9.846221234923266e-06, + "loss": 0.0599, + "step": 54700 + }, + { + "epoch": 1.6288194031625252, + "grad_norm": 0.13182982802391052, + "learning_rate": 9.76800212755972e-06, + "loss": 0.0589, + "step": 54800 + }, + { + "epoch": 1.6317917013434786, + "grad_norm": 0.1650897115468979, + "learning_rate": 9.689783020196173e-06, + "loss": 0.0577, + "step": 54900 + }, + { + "epoch": 1.6347639995244323, + "grad_norm": 0.16550056636333466, + "learning_rate": 9.611563912832627e-06, + "loss": 0.0587, + "step": 55000 + }, + { + "epoch": 1.6377362977053858, + "grad_norm": 0.16596747934818268, + "learning_rate": 9.53334480546908e-06, + "loss": 0.056, + "step": 55100 + }, + { + "epoch": 1.6407085958863394, + "grad_norm": 0.1435985118150711, + "learning_rate": 9.455125698105535e-06, + "loss": 0.0583, + "step": 55200 + }, + { + "epoch": 1.643680894067293, + "grad_norm": 0.17438799142837524, + "learning_rate": 9.376906590741987e-06, + "loss": 0.057, + "step": 55300 + }, + { + "epoch": 1.6466531922482464, + "grad_norm": 0.20165254175662994, + "learning_rate": 9.29868748337844e-06, + "loss": 0.0584, + "step": 55400 + }, + { + "epoch": 1.6496254904291998, + "grad_norm": 0.2093760073184967, + "learning_rate": 9.220468376014893e-06, + "loss": 0.0571, + "step": 55500 + }, + { + "epoch": 1.6525977886101533, + "grad_norm": 0.21770691871643066, + "learning_rate": 9.142249268651347e-06, + "loss": 0.0587, + "step": 55600 + }, + { + "epoch": 1.6555700867911067, + "grad_norm": 0.15709801018238068, + "learning_rate": 9.0640301612878e-06, + "loss": 0.0596, + "step": 55700 + }, + { + "epoch": 1.6585423849720604, + "grad_norm": 0.219644233584404, + "learning_rate": 8.985811053924253e-06, + "loss": 0.0592, + "step": 55800 + }, + { + "epoch": 1.6615146831530139, + "grad_norm": 0.12435632944107056, + "learning_rate": 8.907591946560706e-06, + "loss": 0.0581, + "step": 55900 + }, + { + "epoch": 1.6644869813339676, + "grad_norm": 0.15070226788520813, + "learning_rate": 8.82937283919716e-06, + "loss": 0.0591, + "step": 56000 + }, + { + "epoch": 1.667459279514921, + "grad_norm": 0.20532342791557312, + "learning_rate": 8.751153731833612e-06, + "loss": 0.0572, + "step": 56100 + }, + { + "epoch": 1.6704315776958745, + "grad_norm": 0.15827345848083496, + "learning_rate": 8.672934624470066e-06, + "loss": 0.0586, + "step": 56200 + }, + { + "epoch": 1.673403875876828, + "grad_norm": 0.1817328780889511, + "learning_rate": 8.59471551710652e-06, + "loss": 0.0592, + "step": 56300 + }, + { + "epoch": 1.6763761740577814, + "grad_norm": 0.14720812439918518, + "learning_rate": 8.516496409742974e-06, + "loss": 0.0581, + "step": 56400 + }, + { + "epoch": 1.6793484722387348, + "grad_norm": 0.20043276250362396, + "learning_rate": 8.438277302379426e-06, + "loss": 0.0582, + "step": 56500 + }, + { + "epoch": 1.6823207704196885, + "grad_norm": 0.19054606556892395, + "learning_rate": 8.360058195015878e-06, + "loss": 0.0553, + "step": 56600 + }, + { + "epoch": 1.685293068600642, + "grad_norm": 0.1736610233783722, + "learning_rate": 8.281839087652332e-06, + "loss": 0.0592, + "step": 56700 + }, + { + "epoch": 1.6882653667815957, + "grad_norm": 0.3207443952560425, + "learning_rate": 8.203619980288785e-06, + "loss": 0.0584, + "step": 56800 + }, + { + "epoch": 1.6912376649625491, + "grad_norm": 0.14914827048778534, + "learning_rate": 8.125400872925239e-06, + "loss": 0.0586, + "step": 56900 + }, + { + "epoch": 1.6942099631435026, + "grad_norm": 0.18544642627239227, + "learning_rate": 8.047181765561691e-06, + "loss": 0.0583, + "step": 57000 + }, + { + "epoch": 1.697182261324456, + "grad_norm": 0.2703917324542999, + "learning_rate": 7.968962658198145e-06, + "loss": 0.0601, + "step": 57100 + }, + { + "epoch": 1.7001545595054095, + "grad_norm": 0.13196884095668793, + "learning_rate": 7.890743550834597e-06, + "loss": 0.0574, + "step": 57200 + }, + { + "epoch": 1.703126857686363, + "grad_norm": 0.11756039410829544, + "learning_rate": 7.812524443471051e-06, + "loss": 0.0586, + "step": 57300 + }, + { + "epoch": 1.7060991558673166, + "grad_norm": 0.21705962717533112, + "learning_rate": 7.734305336107505e-06, + "loss": 0.0554, + "step": 57400 + }, + { + "epoch": 1.70907145404827, + "grad_norm": 0.17258594930171967, + "learning_rate": 7.656086228743959e-06, + "loss": 0.0584, + "step": 57500 + }, + { + "epoch": 1.7120437522292238, + "grad_norm": 0.2133166640996933, + "learning_rate": 7.5778671213804105e-06, + "loss": 0.0566, + "step": 57600 + }, + { + "epoch": 1.7150160504101772, + "grad_norm": 0.16656166315078735, + "learning_rate": 7.4996480140168645e-06, + "loss": 0.058, + "step": 57700 + }, + { + "epoch": 1.7179883485911307, + "grad_norm": 0.13819120824337006, + "learning_rate": 7.421428906653318e-06, + "loss": 0.0581, + "step": 57800 + }, + { + "epoch": 1.7209606467720842, + "grad_norm": 0.2640224099159241, + "learning_rate": 7.343209799289772e-06, + "loss": 0.0577, + "step": 57900 + }, + { + "epoch": 1.7239329449530376, + "grad_norm": 0.1444404572248459, + "learning_rate": 7.264990691926224e-06, + "loss": 0.0564, + "step": 58000 + }, + { + "epoch": 1.726905243133991, + "grad_norm": 0.2005554735660553, + "learning_rate": 7.186771584562678e-06, + "loss": 0.0588, + "step": 58100 + }, + { + "epoch": 1.7298775413149448, + "grad_norm": 0.1748191863298416, + "learning_rate": 7.10855247719913e-06, + "loss": 0.0579, + "step": 58200 + }, + { + "epoch": 1.7328498394958982, + "grad_norm": 0.18748654425144196, + "learning_rate": 7.030333369835584e-06, + "loss": 0.0585, + "step": 58300 + }, + { + "epoch": 1.735822137676852, + "grad_norm": 0.2286122590303421, + "learning_rate": 6.952114262472037e-06, + "loss": 0.0544, + "step": 58400 + }, + { + "epoch": 1.7387944358578054, + "grad_norm": 0.2468654066324234, + "learning_rate": 6.8738951551084895e-06, + "loss": 0.0561, + "step": 58500 + }, + { + "epoch": 1.7417667340387588, + "grad_norm": 0.15972191095352173, + "learning_rate": 6.7956760477449435e-06, + "loss": 0.0562, + "step": 58600 + }, + { + "epoch": 1.7447390322197123, + "grad_norm": 0.15185460448265076, + "learning_rate": 6.717456940381396e-06, + "loss": 0.0563, + "step": 58700 + }, + { + "epoch": 1.7477113304006657, + "grad_norm": 0.22079932689666748, + "learning_rate": 6.63923783301785e-06, + "loss": 0.0574, + "step": 58800 + }, + { + "epoch": 1.7506836285816192, + "grad_norm": 0.22975362837314606, + "learning_rate": 6.561018725654303e-06, + "loss": 0.0554, + "step": 58900 + }, + { + "epoch": 1.7536559267625729, + "grad_norm": 0.3252791166305542, + "learning_rate": 6.482799618290757e-06, + "loss": 0.0577, + "step": 59000 + }, + { + "epoch": 1.7566282249435263, + "grad_norm": 0.18925951421260834, + "learning_rate": 6.404580510927209e-06, + "loss": 0.0561, + "step": 59100 + }, + { + "epoch": 1.75960052312448, + "grad_norm": 0.18320415914058685, + "learning_rate": 6.326361403563663e-06, + "loss": 0.0585, + "step": 59200 + }, + { + "epoch": 1.7625728213054335, + "grad_norm": 0.17666569352149963, + "learning_rate": 6.248142296200116e-06, + "loss": 0.0544, + "step": 59300 + }, + { + "epoch": 1.765545119486387, + "grad_norm": 0.1489667445421219, + "learning_rate": 6.169923188836569e-06, + "loss": 0.0593, + "step": 59400 + }, + { + "epoch": 1.7685174176673404, + "grad_norm": 0.17205996811389923, + "learning_rate": 6.0917040814730225e-06, + "loss": 0.0553, + "step": 59500 + }, + { + "epoch": 1.7714897158482938, + "grad_norm": 0.18075639009475708, + "learning_rate": 6.013484974109476e-06, + "loss": 0.0577, + "step": 59600 + }, + { + "epoch": 1.7744620140292473, + "grad_norm": 0.17098510265350342, + "learning_rate": 5.935265866745929e-06, + "loss": 0.0574, + "step": 59700 + }, + { + "epoch": 1.7774343122102008, + "grad_norm": 0.1404750496149063, + "learning_rate": 5.857046759382382e-06, + "loss": 0.056, + "step": 59800 + }, + { + "epoch": 1.7804066103911544, + "grad_norm": 0.1939045637845993, + "learning_rate": 5.778827652018835e-06, + "loss": 0.0579, + "step": 59900 + }, + { + "epoch": 1.783378908572108, + "grad_norm": 0.17957621812820435, + "learning_rate": 5.700608544655288e-06, + "loss": 0.0549, + "step": 60000 + } + ], + "logging_steps": 100, + "max_steps": 67288, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.780997500618482e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}