{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9967166979362101,
"eval_steps": 500,
"global_step": 25500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003908692933083177,
"grad_norm": 0.47716134786605835,
"learning_rate": 4.98240400406663e-05,
"loss": 1.3868,
"step": 100
},
{
"epoch": 0.007817385866166354,
"grad_norm": 0.28900113701820374,
"learning_rate": 4.9628528974739973e-05,
"loss": 1.3348,
"step": 200
},
{
"epoch": 0.01172607879924953,
"grad_norm": 0.37652796506881714,
"learning_rate": 4.943301790881364e-05,
"loss": 1.3298,
"step": 300
},
{
"epoch": 0.015634771732332707,
"grad_norm": 0.24992702901363373,
"learning_rate": 4.923750684288731e-05,
"loss": 1.3309,
"step": 400
},
{
"epoch": 0.019543464665415886,
"grad_norm": 0.3237004280090332,
"learning_rate": 4.904199577696098e-05,
"loss": 1.3228,
"step": 500
},
{
"epoch": 0.02345215759849906,
"grad_norm": 0.259011447429657,
"learning_rate": 4.884648471103465e-05,
"loss": 1.3166,
"step": 600
},
{
"epoch": 0.02736085053158224,
"grad_norm": 0.34121841192245483,
"learning_rate": 4.8650973645108314e-05,
"loss": 1.3145,
"step": 700
},
{
"epoch": 0.031269543464665414,
"grad_norm": 0.41398781538009644,
"learning_rate": 4.845546257918198e-05,
"loss": 1.3081,
"step": 800
},
{
"epoch": 0.03517823639774859,
"grad_norm": 0.4289904832839966,
"learning_rate": 4.825995151325565e-05,
"loss": 1.3073,
"step": 900
},
{
"epoch": 0.03908692933083177,
"grad_norm": 0.21226634085178375,
"learning_rate": 4.8064440447329324e-05,
"loss": 1.3057,
"step": 1000
},
{
"epoch": 0.04299562226391495,
"grad_norm": 0.2929254174232483,
"learning_rate": 4.786892938140299e-05,
"loss": 1.3034,
"step": 1100
},
{
"epoch": 0.04690431519699812,
"grad_norm": 0.22025109827518463,
"learning_rate": 4.767341831547666e-05,
"loss": 1.3048,
"step": 1200
},
{
"epoch": 0.0508130081300813,
"grad_norm": 0.36455097794532776,
"learning_rate": 4.747790724955033e-05,
"loss": 1.3028,
"step": 1300
},
{
"epoch": 0.05472170106316448,
"grad_norm": 0.24527208507061005,
"learning_rate": 4.728239618362399e-05,
"loss": 1.2996,
"step": 1400
},
{
"epoch": 0.05863039399624766,
"grad_norm": 0.33187615871429443,
"learning_rate": 4.7086885117697664e-05,
"loss": 1.3001,
"step": 1500
},
{
"epoch": 0.06253908692933083,
"grad_norm": 0.502117931842804,
"learning_rate": 4.689137405177133e-05,
"loss": 1.2954,
"step": 1600
},
{
"epoch": 0.06644777986241401,
"grad_norm": 0.289112389087677,
"learning_rate": 4.6695862985845e-05,
"loss": 1.2969,
"step": 1700
},
{
"epoch": 0.07035647279549719,
"grad_norm": 0.24350133538246155,
"learning_rate": 4.6500351919918674e-05,
"loss": 1.2929,
"step": 1800
},
{
"epoch": 0.07426516572858036,
"grad_norm": 0.2194579690694809,
"learning_rate": 4.630484085399234e-05,
"loss": 1.2913,
"step": 1900
},
{
"epoch": 0.07817385866166354,
"grad_norm": 0.3578532636165619,
"learning_rate": 4.6109329788066005e-05,
"loss": 1.2946,
"step": 2000
},
{
"epoch": 0.08208255159474671,
"grad_norm": 0.22988037765026093,
"learning_rate": 4.591381872213967e-05,
"loss": 1.2906,
"step": 2100
},
{
"epoch": 0.0859912445278299,
"grad_norm": 0.44038262963294983,
"learning_rate": 4.571830765621334e-05,
"loss": 1.2941,
"step": 2200
},
{
"epoch": 0.08989993746091307,
"grad_norm": 0.27333030104637146,
"learning_rate": 4.5522796590287015e-05,
"loss": 1.2903,
"step": 2300
},
{
"epoch": 0.09380863039399624,
"grad_norm": 0.23196421563625336,
"learning_rate": 4.532728552436068e-05,
"loss": 1.2897,
"step": 2400
},
{
"epoch": 0.09771732332707943,
"grad_norm": 0.25054216384887695,
"learning_rate": 4.513177445843435e-05,
"loss": 1.2891,
"step": 2500
},
{
"epoch": 0.1016260162601626,
"grad_norm": 0.17129705846309662,
"learning_rate": 4.493626339250802e-05,
"loss": 1.2874,
"step": 2600
},
{
"epoch": 0.10553470919324578,
"grad_norm": 0.5260087251663208,
"learning_rate": 4.474075232658168e-05,
"loss": 1.289,
"step": 2700
},
{
"epoch": 0.10944340212632896,
"grad_norm": 0.2251499593257904,
"learning_rate": 4.4545241260655355e-05,
"loss": 1.2882,
"step": 2800
},
{
"epoch": 0.11335209505941213,
"grad_norm": 0.24193763732910156,
"learning_rate": 4.434973019472902e-05,
"loss": 1.2886,
"step": 2900
},
{
"epoch": 0.11726078799249531,
"grad_norm": 0.2361423224210739,
"learning_rate": 4.415421912880269e-05,
"loss": 1.2863,
"step": 3000
},
{
"epoch": 0.12116948092557848,
"grad_norm": 0.22438161075115204,
"learning_rate": 4.3958708062876365e-05,
"loss": 1.2861,
"step": 3100
},
{
"epoch": 0.12507817385866166,
"grad_norm": 0.30306315422058105,
"learning_rate": 4.376319699695003e-05,
"loss": 1.2838,
"step": 3200
},
{
"epoch": 0.12898686679174484,
"grad_norm": 0.2526134252548218,
"learning_rate": 4.35676859310237e-05,
"loss": 1.288,
"step": 3300
},
{
"epoch": 0.13289555972482803,
"grad_norm": 0.27540501952171326,
"learning_rate": 4.337217486509736e-05,
"loss": 1.2878,
"step": 3400
},
{
"epoch": 0.13680425265791119,
"grad_norm": 0.2769360840320587,
"learning_rate": 4.3176663799171034e-05,
"loss": 1.2844,
"step": 3500
},
{
"epoch": 0.14071294559099437,
"grad_norm": 0.3868384063243866,
"learning_rate": 4.2981152733244706e-05,
"loss": 1.2856,
"step": 3600
},
{
"epoch": 0.14462163852407756,
"grad_norm": 0.2426549792289734,
"learning_rate": 4.278564166731837e-05,
"loss": 1.2843,
"step": 3700
},
{
"epoch": 0.1485303314571607,
"grad_norm": 0.28486019372940063,
"learning_rate": 4.2590130601392044e-05,
"loss": 1.2812,
"step": 3800
},
{
"epoch": 0.1524390243902439,
"grad_norm": 0.3195390999317169,
"learning_rate": 4.239461953546571e-05,
"loss": 1.2845,
"step": 3900
},
{
"epoch": 0.15634771732332708,
"grad_norm": 0.3183465003967285,
"learning_rate": 4.219910846953938e-05,
"loss": 1.281,
"step": 4000
},
{
"epoch": 0.16025641025641027,
"grad_norm": 0.30105575919151306,
"learning_rate": 4.2003597403613047e-05,
"loss": 1.2817,
"step": 4100
},
{
"epoch": 0.16416510318949343,
"grad_norm": 0.3548736274242401,
"learning_rate": 4.180808633768671e-05,
"loss": 1.2805,
"step": 4200
},
{
"epoch": 0.1680737961225766,
"grad_norm": 0.21018876135349274,
"learning_rate": 4.1612575271760384e-05,
"loss": 1.28,
"step": 4300
},
{
"epoch": 0.1719824890556598,
"grad_norm": 0.2551437318325043,
"learning_rate": 4.141706420583405e-05,
"loss": 1.279,
"step": 4400
},
{
"epoch": 0.17589118198874296,
"grad_norm": 0.21301493048667908,
"learning_rate": 4.122155313990772e-05,
"loss": 1.2802,
"step": 4500
},
{
"epoch": 0.17979987492182614,
"grad_norm": 0.3330891728401184,
"learning_rate": 4.1026042073981394e-05,
"loss": 1.2784,
"step": 4600
},
{
"epoch": 0.18370856785490933,
"grad_norm": 0.20550762116909027,
"learning_rate": 4.083053100805506e-05,
"loss": 1.2802,
"step": 4700
},
{
"epoch": 0.18761726078799248,
"grad_norm": 0.2603992521762848,
"learning_rate": 4.0635019942128725e-05,
"loss": 1.2792,
"step": 4800
},
{
"epoch": 0.19152595372107567,
"grad_norm": 0.17917224764823914,
"learning_rate": 4.04395088762024e-05,
"loss": 1.2803,
"step": 4900
},
{
"epoch": 0.19543464665415886,
"grad_norm": 0.17226773500442505,
"learning_rate": 4.024399781027606e-05,
"loss": 1.2775,
"step": 5000
},
{
"epoch": 0.199343339587242,
"grad_norm": 0.28991296887397766,
"learning_rate": 4.0048486744349735e-05,
"loss": 1.2801,
"step": 5100
},
{
"epoch": 0.2032520325203252,
"grad_norm": 0.2766139805316925,
"learning_rate": 3.98529756784234e-05,
"loss": 1.2773,
"step": 5200
},
{
"epoch": 0.20716072545340838,
"grad_norm": 0.3101244866847992,
"learning_rate": 3.965746461249707e-05,
"loss": 1.2786,
"step": 5300
},
{
"epoch": 0.21106941838649157,
"grad_norm": 0.2537230849266052,
"learning_rate": 3.946195354657074e-05,
"loss": 1.2782,
"step": 5400
},
{
"epoch": 0.21497811131957473,
"grad_norm": 0.388886958360672,
"learning_rate": 3.92664424806444e-05,
"loss": 1.2766,
"step": 5500
},
{
"epoch": 0.2188868042526579,
"grad_norm": 0.3374210596084595,
"learning_rate": 3.9070931414718075e-05,
"loss": 1.273,
"step": 5600
},
{
"epoch": 0.2227954971857411,
"grad_norm": 0.34907710552215576,
"learning_rate": 3.887542034879174e-05,
"loss": 1.2772,
"step": 5700
},
{
"epoch": 0.22670419011882426,
"grad_norm": 0.22610360383987427,
"learning_rate": 3.867990928286541e-05,
"loss": 1.2744,
"step": 5800
},
{
"epoch": 0.23061288305190744,
"grad_norm": 0.27353575825691223,
"learning_rate": 3.8484398216939085e-05,
"loss": 1.2755,
"step": 5900
},
{
"epoch": 0.23452157598499063,
"grad_norm": 0.3231126368045807,
"learning_rate": 3.828888715101275e-05,
"loss": 1.2746,
"step": 6000
},
{
"epoch": 0.23843026891807378,
"grad_norm": 0.3560299575328827,
"learning_rate": 3.8093376085086416e-05,
"loss": 1.2725,
"step": 6100
},
{
"epoch": 0.24233896185115697,
"grad_norm": 0.2950970530509949,
"learning_rate": 3.789786501916008e-05,
"loss": 1.27,
"step": 6200
},
{
"epoch": 0.24624765478424016,
"grad_norm": 0.25703492760658264,
"learning_rate": 3.770235395323375e-05,
"loss": 1.2705,
"step": 6300
},
{
"epoch": 0.2501563477173233,
"grad_norm": 0.4161667227745056,
"learning_rate": 3.7506842887307426e-05,
"loss": 1.2725,
"step": 6400
},
{
"epoch": 0.2540650406504065,
"grad_norm": 0.4292469322681427,
"learning_rate": 3.731133182138109e-05,
"loss": 1.2736,
"step": 6500
},
{
"epoch": 0.2579737335834897,
"grad_norm": 0.28723031282424927,
"learning_rate": 3.711582075545476e-05,
"loss": 1.2696,
"step": 6600
},
{
"epoch": 0.26188242651657284,
"grad_norm": 0.3981460928916931,
"learning_rate": 3.692030968952843e-05,
"loss": 1.2712,
"step": 6700
},
{
"epoch": 0.26579111944965605,
"grad_norm": 0.509950578212738,
"learning_rate": 3.6724798623602094e-05,
"loss": 1.272,
"step": 6800
},
{
"epoch": 0.2696998123827392,
"grad_norm": 0.2820459008216858,
"learning_rate": 3.6529287557675766e-05,
"loss": 1.2704,
"step": 6900
},
{
"epoch": 0.27360850531582237,
"grad_norm": 0.2927381098270416,
"learning_rate": 3.633377649174943e-05,
"loss": 1.268,
"step": 7000
},
{
"epoch": 0.2775171982489056,
"grad_norm": 0.2999459505081177,
"learning_rate": 3.6138265425823104e-05,
"loss": 1.2673,
"step": 7100
},
{
"epoch": 0.28142589118198874,
"grad_norm": 0.3287661373615265,
"learning_rate": 3.5942754359896776e-05,
"loss": 1.2693,
"step": 7200
},
{
"epoch": 0.2853345841150719,
"grad_norm": 0.4145341217517853,
"learning_rate": 3.574724329397044e-05,
"loss": 1.2655,
"step": 7300
},
{
"epoch": 0.2892432770481551,
"grad_norm": 0.41897234320640564,
"learning_rate": 3.555173222804411e-05,
"loss": 1.2633,
"step": 7400
},
{
"epoch": 0.29315196998123827,
"grad_norm": 0.6244714856147766,
"learning_rate": 3.535622116211777e-05,
"loss": 1.2659,
"step": 7500
},
{
"epoch": 0.2970606629143214,
"grad_norm": 0.41341808438301086,
"learning_rate": 3.5160710096191444e-05,
"loss": 1.2635,
"step": 7600
},
{
"epoch": 0.30096935584740464,
"grad_norm": 0.39480525255203247,
"learning_rate": 3.4965199030265117e-05,
"loss": 1.2633,
"step": 7700
},
{
"epoch": 0.3048780487804878,
"grad_norm": 0.5332251787185669,
"learning_rate": 3.476968796433878e-05,
"loss": 1.262,
"step": 7800
},
{
"epoch": 0.30878674171357096,
"grad_norm": 0.46669408679008484,
"learning_rate": 3.4574176898412454e-05,
"loss": 1.2587,
"step": 7900
},
{
"epoch": 0.31269543464665417,
"grad_norm": 0.46353575587272644,
"learning_rate": 3.437866583248612e-05,
"loss": 1.2611,
"step": 8000
},
{
"epoch": 0.3166041275797373,
"grad_norm": 0.5144259333610535,
"learning_rate": 3.418315476655979e-05,
"loss": 1.2603,
"step": 8100
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.501035749912262,
"learning_rate": 3.398764370063346e-05,
"loss": 1.26,
"step": 8200
},
{
"epoch": 0.3244215134459037,
"grad_norm": 0.6831699013710022,
"learning_rate": 3.379213263470712e-05,
"loss": 1.2598,
"step": 8300
},
{
"epoch": 0.32833020637898686,
"grad_norm": 0.5810164213180542,
"learning_rate": 3.3596621568780795e-05,
"loss": 1.2561,
"step": 8400
},
{
"epoch": 0.33223889931207007,
"grad_norm": 0.45409688353538513,
"learning_rate": 3.340111050285447e-05,
"loss": 1.2552,
"step": 8500
},
{
"epoch": 0.3361475922451532,
"grad_norm": 0.6193764805793762,
"learning_rate": 3.320559943692813e-05,
"loss": 1.2538,
"step": 8600
},
{
"epoch": 0.3400562851782364,
"grad_norm": 0.4400370717048645,
"learning_rate": 3.3010088371001805e-05,
"loss": 1.2548,
"step": 8700
},
{
"epoch": 0.3439649781113196,
"grad_norm": 0.7784037590026855,
"learning_rate": 3.281457730507547e-05,
"loss": 1.2537,
"step": 8800
},
{
"epoch": 0.34787367104440275,
"grad_norm": 0.7589293718338013,
"learning_rate": 3.2619066239149135e-05,
"loss": 1.2519,
"step": 8900
},
{
"epoch": 0.3517823639774859,
"grad_norm": 0.4971660375595093,
"learning_rate": 3.242355517322281e-05,
"loss": 1.25,
"step": 9000
},
{
"epoch": 0.3556910569105691,
"grad_norm": 0.5253425240516663,
"learning_rate": 3.222804410729647e-05,
"loss": 1.2497,
"step": 9100
},
{
"epoch": 0.3595997498436523,
"grad_norm": 0.7438779473304749,
"learning_rate": 3.2032533041370145e-05,
"loss": 1.2458,
"step": 9200
},
{
"epoch": 0.36350844277673544,
"grad_norm": 0.9425673484802246,
"learning_rate": 3.183702197544381e-05,
"loss": 1.2422,
"step": 9300
},
{
"epoch": 0.36741713570981865,
"grad_norm": 0.5804303288459778,
"learning_rate": 3.164151090951748e-05,
"loss": 1.2445,
"step": 9400
},
{
"epoch": 0.3713258286429018,
"grad_norm": 0.7079547047615051,
"learning_rate": 3.144599984359115e-05,
"loss": 1.2402,
"step": 9500
},
{
"epoch": 0.37523452157598497,
"grad_norm": 1.088934063911438,
"learning_rate": 3.1250488777664814e-05,
"loss": 1.2453,
"step": 9600
},
{
"epoch": 0.3791432145090682,
"grad_norm": 0.48442408442497253,
"learning_rate": 3.1054977711738486e-05,
"loss": 1.2469,
"step": 9700
},
{
"epoch": 0.38305190744215134,
"grad_norm": 0.803571879863739,
"learning_rate": 3.085946664581215e-05,
"loss": 1.2448,
"step": 9800
},
{
"epoch": 0.3869606003752345,
"grad_norm": 0.7402758598327637,
"learning_rate": 3.0663955579885823e-05,
"loss": 1.239,
"step": 9900
},
{
"epoch": 0.3908692933083177,
"grad_norm": 0.874411404132843,
"learning_rate": 3.0468444513959492e-05,
"loss": 1.2424,
"step": 10000
},
{
"epoch": 0.39477798624140087,
"grad_norm": 0.8369914293289185,
"learning_rate": 3.0272933448033158e-05,
"loss": 1.2406,
"step": 10100
},
{
"epoch": 0.398686679174484,
"grad_norm": 0.9173115491867065,
"learning_rate": 3.007742238210683e-05,
"loss": 1.2338,
"step": 10200
},
{
"epoch": 0.40259537210756724,
"grad_norm": 1.1409565210342407,
"learning_rate": 2.98819113161805e-05,
"loss": 1.2332,
"step": 10300
},
{
"epoch": 0.4065040650406504,
"grad_norm": 1.2753782272338867,
"learning_rate": 2.9686400250254164e-05,
"loss": 1.23,
"step": 10400
},
{
"epoch": 0.4104127579737336,
"grad_norm": 1.3038467168807983,
"learning_rate": 2.9490889184327836e-05,
"loss": 1.229,
"step": 10500
},
{
"epoch": 0.41432145090681677,
"grad_norm": 0.7983734607696533,
"learning_rate": 2.92953781184015e-05,
"loss": 1.2318,
"step": 10600
},
{
"epoch": 0.4182301438398999,
"grad_norm": 0.8295246362686157,
"learning_rate": 2.909986705247517e-05,
"loss": 1.2335,
"step": 10700
},
{
"epoch": 0.42213883677298314,
"grad_norm": 0.8072339296340942,
"learning_rate": 2.8904355986548843e-05,
"loss": 1.2241,
"step": 10800
},
{
"epoch": 0.4260475297060663,
"grad_norm": 0.7493770122528076,
"learning_rate": 2.8708844920622508e-05,
"loss": 1.2269,
"step": 10900
},
{
"epoch": 0.42995622263914945,
"grad_norm": 0.705977737903595,
"learning_rate": 2.8513333854696177e-05,
"loss": 1.225,
"step": 11000
},
{
"epoch": 0.43386491557223267,
"grad_norm": 1.4560986757278442,
"learning_rate": 2.8317822788769842e-05,
"loss": 1.2255,
"step": 11100
},
{
"epoch": 0.4377736085053158,
"grad_norm": 1.0365822315216064,
"learning_rate": 2.8122311722843514e-05,
"loss": 1.2228,
"step": 11200
},
{
"epoch": 0.441682301438399,
"grad_norm": 0.9956429600715637,
"learning_rate": 2.7926800656917183e-05,
"loss": 1.2185,
"step": 11300
},
{
"epoch": 0.4455909943714822,
"grad_norm": 0.9062424302101135,
"learning_rate": 2.773128959099085e-05,
"loss": 1.2229,
"step": 11400
},
{
"epoch": 0.44949968730456535,
"grad_norm": 1.64096999168396,
"learning_rate": 2.753577852506452e-05,
"loss": 1.2186,
"step": 11500
},
{
"epoch": 0.4534083802376485,
"grad_norm": 1.88444983959198,
"learning_rate": 2.7340267459138186e-05,
"loss": 1.2206,
"step": 11600
},
{
"epoch": 0.4573170731707317,
"grad_norm": 1.2047277688980103,
"learning_rate": 2.7144756393211855e-05,
"loss": 1.2166,
"step": 11700
},
{
"epoch": 0.4612257661038149,
"grad_norm": 1.3528029918670654,
"learning_rate": 2.6949245327285527e-05,
"loss": 1.2052,
"step": 11800
},
{
"epoch": 0.46513445903689804,
"grad_norm": 0.8786093592643738,
"learning_rate": 2.6753734261359193e-05,
"loss": 1.2166,
"step": 11900
},
{
"epoch": 0.46904315196998125,
"grad_norm": 1.253136157989502,
"learning_rate": 2.655822319543286e-05,
"loss": 1.2184,
"step": 12000
},
{
"epoch": 0.4729518449030644,
"grad_norm": 1.228351354598999,
"learning_rate": 2.6362712129506534e-05,
"loss": 1.2095,
"step": 12100
},
{
"epoch": 0.47686053783614757,
"grad_norm": 1.020992398262024,
"learning_rate": 2.61672010635802e-05,
"loss": 1.2028,
"step": 12200
},
{
"epoch": 0.4807692307692308,
"grad_norm": 1.2239114046096802,
"learning_rate": 2.597168999765387e-05,
"loss": 1.2082,
"step": 12300
},
{
"epoch": 0.48467792370231394,
"grad_norm": 0.8391302824020386,
"learning_rate": 2.5776178931727533e-05,
"loss": 1.2115,
"step": 12400
},
{
"epoch": 0.4885866166353971,
"grad_norm": 1.241330623626709,
"learning_rate": 2.5580667865801205e-05,
"loss": 1.2026,
"step": 12500
},
{
"epoch": 0.4924953095684803,
"grad_norm": 1.6619151830673218,
"learning_rate": 2.5385156799874878e-05,
"loss": 1.2068,
"step": 12600
},
{
"epoch": 0.49640400250156347,
"grad_norm": 1.0257009267807007,
"learning_rate": 2.5189645733948543e-05,
"loss": 1.2058,
"step": 12700
},
{
"epoch": 0.5003126954346466,
"grad_norm": 1.6809911727905273,
"learning_rate": 2.4994134668022212e-05,
"loss": 1.2003,
"step": 12800
},
{
"epoch": 0.5042213883677298,
"grad_norm": 0.6175216436386108,
"learning_rate": 2.479862360209588e-05,
"loss": 1.2084,
"step": 12900
},
{
"epoch": 0.508130081300813,
"grad_norm": 1.0972894430160522,
"learning_rate": 2.460311253616955e-05,
"loss": 1.1951,
"step": 13000
},
{
"epoch": 0.5120387742338962,
"grad_norm": 1.5631442070007324,
"learning_rate": 2.4407601470243215e-05,
"loss": 1.2004,
"step": 13100
},
{
"epoch": 0.5159474671669794,
"grad_norm": 1.4894869327545166,
"learning_rate": 2.4212090404316887e-05,
"loss": 1.1905,
"step": 13200
},
{
"epoch": 0.5198561601000625,
"grad_norm": 1.101841926574707,
"learning_rate": 2.4016579338390556e-05,
"loss": 1.1967,
"step": 13300
},
{
"epoch": 0.5237648530331457,
"grad_norm": 1.0999876260757446,
"learning_rate": 2.382106827246422e-05,
"loss": 1.1895,
"step": 13400
},
{
"epoch": 0.5276735459662288,
"grad_norm": 1.0898387432098389,
"learning_rate": 2.362555720653789e-05,
"loss": 1.1902,
"step": 13500
},
{
"epoch": 0.5315822388993121,
"grad_norm": 1.2711913585662842,
"learning_rate": 2.343004614061156e-05,
"loss": 1.1957,
"step": 13600
},
{
"epoch": 0.5354909318323953,
"grad_norm": 1.1854056119918823,
"learning_rate": 2.3234535074685228e-05,
"loss": 1.1905,
"step": 13700
},
{
"epoch": 0.5393996247654784,
"grad_norm": 1.2872633934020996,
"learning_rate": 2.3039024008758896e-05,
"loss": 1.1908,
"step": 13800
},
{
"epoch": 0.5433083176985616,
"grad_norm": 1.082897663116455,
"learning_rate": 2.2843512942832565e-05,
"loss": 1.1822,
"step": 13900
},
{
"epoch": 0.5472170106316447,
"grad_norm": 1.7656800746917725,
"learning_rate": 2.2648001876906234e-05,
"loss": 1.1883,
"step": 14000
},
{
"epoch": 0.551125703564728,
"grad_norm": 2.594040632247925,
"learning_rate": 2.2452490810979903e-05,
"loss": 1.1881,
"step": 14100
},
{
"epoch": 0.5550343964978112,
"grad_norm": 1.6944106817245483,
"learning_rate": 2.225697974505357e-05,
"loss": 1.1869,
"step": 14200
},
{
"epoch": 0.5589430894308943,
"grad_norm": 1.4870433807373047,
"learning_rate": 2.206146867912724e-05,
"loss": 1.1862,
"step": 14300
},
{
"epoch": 0.5628517823639775,
"grad_norm": 2.2079107761383057,
"learning_rate": 2.1865957613200906e-05,
"loss": 1.1794,
"step": 14400
},
{
"epoch": 0.5667604752970606,
"grad_norm": 1.419458270072937,
"learning_rate": 2.1670446547274575e-05,
"loss": 1.1798,
"step": 14500
},
{
"epoch": 0.5706691682301438,
"grad_norm": 1.2365930080413818,
"learning_rate": 2.1474935481348247e-05,
"loss": 1.1827,
"step": 14600
},
{
"epoch": 0.5745778611632271,
"grad_norm": 1.2718974351882935,
"learning_rate": 2.1279424415421916e-05,
"loss": 1.1805,
"step": 14700
},
{
"epoch": 0.5784865540963102,
"grad_norm": 1.7296931743621826,
"learning_rate": 2.108391334949558e-05,
"loss": 1.1797,
"step": 14800
},
{
"epoch": 0.5823952470293934,
"grad_norm": 0.8652032613754272,
"learning_rate": 2.088840228356925e-05,
"loss": 1.1719,
"step": 14900
},
{
"epoch": 0.5863039399624765,
"grad_norm": 2.346026659011841,
"learning_rate": 2.0692891217642922e-05,
"loss": 1.1847,
"step": 15000
},
{
"epoch": 0.5902126328955597,
"grad_norm": 1.951084852218628,
"learning_rate": 2.0497380151716587e-05,
"loss": 1.1794,
"step": 15100
},
{
"epoch": 0.5941213258286429,
"grad_norm": 1.3734288215637207,
"learning_rate": 2.0301869085790256e-05,
"loss": 1.1749,
"step": 15200
},
{
"epoch": 0.5980300187617261,
"grad_norm": 2.9074900150299072,
"learning_rate": 2.0106358019863925e-05,
"loss": 1.1729,
"step": 15300
},
{
"epoch": 0.6019387116948093,
"grad_norm": 2.278609275817871,
"learning_rate": 1.9910846953937594e-05,
"loss": 1.1703,
"step": 15400
},
{
"epoch": 0.6058474046278924,
"grad_norm": 1.291671633720398,
"learning_rate": 1.9715335888011263e-05,
"loss": 1.1621,
"step": 15500
},
{
"epoch": 0.6097560975609756,
"grad_norm": 1.9885281324386597,
"learning_rate": 1.951982482208493e-05,
"loss": 1.1636,
"step": 15600
},
{
"epoch": 0.6136647904940588,
"grad_norm": 1.5937877893447876,
"learning_rate": 1.93243137561586e-05,
"loss": 1.1701,
"step": 15700
},
{
"epoch": 0.6175734834271419,
"grad_norm": 2.537714719772339,
"learning_rate": 1.9128802690232266e-05,
"loss": 1.1659,
"step": 15800
},
{
"epoch": 0.6214821763602252,
"grad_norm": 1.8786745071411133,
"learning_rate": 1.8933291624305938e-05,
"loss": 1.163,
"step": 15900
},
{
"epoch": 0.6253908692933083,
"grad_norm": 1.65304696559906,
"learning_rate": 1.8737780558379607e-05,
"loss": 1.1624,
"step": 16000
},
{
"epoch": 0.6292995622263915,
"grad_norm": 1.7467074394226074,
"learning_rate": 1.8542269492453272e-05,
"loss": 1.1672,
"step": 16100
},
{
"epoch": 0.6332082551594747,
"grad_norm": 3.616380214691162,
"learning_rate": 1.834675842652694e-05,
"loss": 1.1672,
"step": 16200
},
{
"epoch": 0.6371169480925578,
"grad_norm": 1.736578106880188,
"learning_rate": 1.815124736060061e-05,
"loss": 1.1556,
"step": 16300
},
{
"epoch": 0.6410256410256411,
"grad_norm": 1.544382929801941,
"learning_rate": 1.7955736294674282e-05,
"loss": 1.1601,
"step": 16400
},
{
"epoch": 0.6449343339587242,
"grad_norm": 1.1988792419433594,
"learning_rate": 1.7760225228747947e-05,
"loss": 1.1568,
"step": 16500
},
{
"epoch": 0.6488430268918074,
"grad_norm": 2.0486536026000977,
"learning_rate": 1.7564714162821616e-05,
"loss": 1.1522,
"step": 16600
},
{
"epoch": 0.6527517198248906,
"grad_norm": 1.95686674118042,
"learning_rate": 1.7369203096895285e-05,
"loss": 1.1577,
"step": 16700
},
{
"epoch": 0.6566604127579737,
"grad_norm": 1.7548372745513916,
"learning_rate": 1.7173692030968954e-05,
"loss": 1.1589,
"step": 16800
},
{
"epoch": 0.6605691056910569,
"grad_norm": 1.1432212591171265,
"learning_rate": 1.6978180965042622e-05,
"loss": 1.1429,
"step": 16900
},
{
"epoch": 0.6644777986241401,
"grad_norm": 1.4253298044204712,
"learning_rate": 1.678266989911629e-05,
"loss": 1.1538,
"step": 17000
},
{
"epoch": 0.6683864915572233,
"grad_norm": 2.0788207054138184,
"learning_rate": 1.658715883318996e-05,
"loss": 1.1441,
"step": 17100
},
{
"epoch": 0.6722951844903065,
"grad_norm": 2.1387529373168945,
"learning_rate": 1.639164776726363e-05,
"loss": 1.1476,
"step": 17200
},
{
"epoch": 0.6762038774233896,
"grad_norm": 1.2654587030410767,
"learning_rate": 1.6196136701337298e-05,
"loss": 1.1447,
"step": 17300
},
{
"epoch": 0.6801125703564728,
"grad_norm": 2.0427193641662598,
"learning_rate": 1.6000625635410967e-05,
"loss": 1.157,
"step": 17400
},
{
"epoch": 0.6840212632895559,
"grad_norm": 1.7374306917190552,
"learning_rate": 1.5805114569484632e-05,
"loss": 1.1565,
"step": 17500
},
{
"epoch": 0.6879299562226392,
"grad_norm": 2.3408329486846924,
"learning_rate": 1.56096035035583e-05,
"loss": 1.1446,
"step": 17600
},
{
"epoch": 0.6918386491557224,
"grad_norm": 1.5187078714370728,
"learning_rate": 1.5414092437631973e-05,
"loss": 1.1558,
"step": 17700
},
{
"epoch": 0.6957473420888055,
"grad_norm": 2.5938124656677246,
"learning_rate": 1.521858137170564e-05,
"loss": 1.1489,
"step": 17800
},
{
"epoch": 0.6996560350218887,
"grad_norm": 2.978803873062134,
"learning_rate": 1.5023070305779307e-05,
"loss": 1.1412,
"step": 17900
},
{
"epoch": 0.7035647279549718,
"grad_norm": 3.2251853942871094,
"learning_rate": 1.4827559239852976e-05,
"loss": 1.1472,
"step": 18000
},
{
"epoch": 0.707473420888055,
"grad_norm": 1.528054118156433,
"learning_rate": 1.4632048173926646e-05,
"loss": 1.1406,
"step": 18100
},
{
"epoch": 0.7113821138211383,
"grad_norm": 1.627551794052124,
"learning_rate": 1.4436537108000314e-05,
"loss": 1.1378,
"step": 18200
},
{
"epoch": 0.7152908067542214,
"grad_norm": 1.2601124048233032,
"learning_rate": 1.4241026042073982e-05,
"loss": 1.1423,
"step": 18300
},
{
"epoch": 0.7191994996873046,
"grad_norm": 1.6382373571395874,
"learning_rate": 1.404551497614765e-05,
"loss": 1.1397,
"step": 18400
},
{
"epoch": 0.7231081926203877,
"grad_norm": 2.1206157207489014,
"learning_rate": 1.3850003910221318e-05,
"loss": 1.1401,
"step": 18500
},
{
"epoch": 0.7270168855534709,
"grad_norm": 1.3527886867523193,
"learning_rate": 1.3654492844294989e-05,
"loss": 1.142,
"step": 18600
},
{
"epoch": 0.7309255784865542,
"grad_norm": 1.8993370532989502,
"learning_rate": 1.3458981778368658e-05,
"loss": 1.1403,
"step": 18700
},
{
"epoch": 0.7348342714196373,
"grad_norm": 1.5903937816619873,
"learning_rate": 1.3263470712442325e-05,
"loss": 1.133,
"step": 18800
},
{
"epoch": 0.7387429643527205,
"grad_norm": 1.9694344997406006,
"learning_rate": 1.3067959646515993e-05,
"loss": 1.1279,
"step": 18900
},
{
"epoch": 0.7426516572858036,
"grad_norm": 2.955965757369995,
"learning_rate": 1.2872448580589664e-05,
"loss": 1.1221,
"step": 19000
},
{
"epoch": 0.7465603502188868,
"grad_norm": 1.9453381299972534,
"learning_rate": 1.2676937514663331e-05,
"loss": 1.1341,
"step": 19100
},
{
"epoch": 0.7504690431519699,
"grad_norm": 3.0147523880004883,
"learning_rate": 1.2481426448737e-05,
"loss": 1.1309,
"step": 19200
},
{
"epoch": 0.7543777360850532,
"grad_norm": 1.702540397644043,
"learning_rate": 1.2285915382810667e-05,
"loss": 1.1348,
"step": 19300
},
{
"epoch": 0.7582864290181364,
"grad_norm": 2.3534483909606934,
"learning_rate": 1.2090404316884336e-05,
"loss": 1.1234,
"step": 19400
},
{
"epoch": 0.7621951219512195,
"grad_norm": 1.785124659538269,
"learning_rate": 1.1894893250958005e-05,
"loss": 1.1344,
"step": 19500
},
{
"epoch": 0.7661038148843027,
"grad_norm": 2.5952718257904053,
"learning_rate": 1.1699382185031673e-05,
"loss": 1.1335,
"step": 19600
},
{
"epoch": 0.7700125078173858,
"grad_norm": 1.8933463096618652,
"learning_rate": 1.1503871119105342e-05,
"loss": 1.1366,
"step": 19700
},
{
"epoch": 0.773921200750469,
"grad_norm": 2.2508208751678467,
"learning_rate": 1.1308360053179011e-05,
"loss": 1.1258,
"step": 19800
},
{
"epoch": 0.7778298936835523,
"grad_norm": 1.6183587312698364,
"learning_rate": 1.111284898725268e-05,
"loss": 1.1341,
"step": 19900
},
{
"epoch": 0.7817385866166354,
"grad_norm": 1.8715559244155884,
"learning_rate": 1.0917337921326349e-05,
"loss": 1.1307,
"step": 20000
},
{
"epoch": 0.7856472795497186,
"grad_norm": 2.4041683673858643,
"learning_rate": 1.0721826855400016e-05,
"loss": 1.1273,
"step": 20100
},
{
"epoch": 0.7895559724828017,
"grad_norm": 1.704397439956665,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.1255,
"step": 20200
},
{
"epoch": 0.7934646654158849,
"grad_norm": 1.8504843711853027,
"learning_rate": 1.0330804723547353e-05,
"loss": 1.1185,
"step": 20300
},
{
"epoch": 0.797373358348968,
"grad_norm": 4.145756244659424,
"learning_rate": 1.0135293657621022e-05,
"loss": 1.1262,
"step": 20400
},
{
"epoch": 0.8012820512820513,
"grad_norm": 2.0327529907226562,
"learning_rate": 9.939782591694691e-06,
"loss": 1.1253,
"step": 20500
},
{
"epoch": 0.8051907442151345,
"grad_norm": 1.2211467027664185,
"learning_rate": 9.744271525768358e-06,
"loss": 1.1238,
"step": 20600
},
{
"epoch": 0.8090994371482176,
"grad_norm": 4.430849075317383,
"learning_rate": 9.548760459842028e-06,
"loss": 1.1177,
"step": 20700
},
{
"epoch": 0.8130081300813008,
"grad_norm": 3.4501967430114746,
"learning_rate": 9.353249393915696e-06,
"loss": 1.1182,
"step": 20800
},
{
"epoch": 0.816916823014384,
"grad_norm": 1.5024946928024292,
"learning_rate": 9.157738327989366e-06,
"loss": 1.1186,
"step": 20900
},
{
"epoch": 0.8208255159474672,
"grad_norm": 1.509515404701233,
"learning_rate": 8.962227262063033e-06,
"loss": 1.1242,
"step": 21000
},
{
"epoch": 0.8247342088805504,
"grad_norm": 5.708670139312744,
"learning_rate": 8.766716196136702e-06,
"loss": 1.1188,
"step": 21100
},
{
"epoch": 0.8286429018136335,
"grad_norm": 2.3833084106445312,
"learning_rate": 8.57120513021037e-06,
"loss": 1.1195,
"step": 21200
},
{
"epoch": 0.8325515947467167,
"grad_norm": 2.416119337081909,
"learning_rate": 8.375694064284038e-06,
"loss": 1.1296,
"step": 21300
},
{
"epoch": 0.8364602876797999,
"grad_norm": 1.5248167514801025,
"learning_rate": 8.180182998357708e-06,
"loss": 1.1209,
"step": 21400
},
{
"epoch": 0.840368980612883,
"grad_norm": 3.158363103866577,
"learning_rate": 7.984671932431375e-06,
"loss": 1.1233,
"step": 21500
},
{
"epoch": 0.8442776735459663,
"grad_norm": 1.9253687858581543,
"learning_rate": 7.789160866505044e-06,
"loss": 1.1243,
"step": 21600
},
{
"epoch": 0.8481863664790494,
"grad_norm": 1.714058518409729,
"learning_rate": 7.593649800578713e-06,
"loss": 1.1117,
"step": 21700
},
{
"epoch": 0.8520950594121326,
"grad_norm": 1.440436601638794,
"learning_rate": 7.398138734652382e-06,
"loss": 1.1159,
"step": 21800
},
{
"epoch": 0.8560037523452158,
"grad_norm": 1.4876196384429932,
"learning_rate": 7.20262766872605e-06,
"loss": 1.1196,
"step": 21900
},
{
"epoch": 0.8599124452782989,
"grad_norm": 2.6332156658172607,
"learning_rate": 7.007116602799718e-06,
"loss": 1.1276,
"step": 22000
},
{
"epoch": 0.8638211382113821,
"grad_norm": 1.7228409051895142,
"learning_rate": 6.8116055368733874e-06,
"loss": 1.1077,
"step": 22100
},
{
"epoch": 0.8677298311444653,
"grad_norm": 2.629837989807129,
"learning_rate": 6.616094470947055e-06,
"loss": 1.1176,
"step": 22200
},
{
"epoch": 0.8716385240775485,
"grad_norm": 2.1659109592437744,
"learning_rate": 6.420583405020725e-06,
"loss": 1.1101,
"step": 22300
},
{
"epoch": 0.8755472170106317,
"grad_norm": 1.792125940322876,
"learning_rate": 6.225072339094393e-06,
"loss": 1.1123,
"step": 22400
},
{
"epoch": 0.8794559099437148,
"grad_norm": 3.5081870555877686,
"learning_rate": 6.029561273168061e-06,
"loss": 1.1074,
"step": 22500
},
{
"epoch": 0.883364602876798,
"grad_norm": 1.7166240215301514,
"learning_rate": 5.83405020724173e-06,
"loss": 1.1204,
"step": 22600
},
{
"epoch": 0.8872732958098811,
"grad_norm": 1.851130485534668,
"learning_rate": 5.6385391413153985e-06,
"loss": 1.1106,
"step": 22700
},
{
"epoch": 0.8911819887429644,
"grad_norm": 1.5522408485412598,
"learning_rate": 5.443028075389067e-06,
"loss": 1.1104,
"step": 22800
},
{
"epoch": 0.8950906816760476,
"grad_norm": 2.766508102416992,
"learning_rate": 5.247517009462736e-06,
"loss": 1.0969,
"step": 22900
},
{
"epoch": 0.8989993746091307,
"grad_norm": 1.1996078491210938,
"learning_rate": 5.052005943536404e-06,
"loss": 1.1136,
"step": 23000
},
{
"epoch": 0.9029080675422139,
"grad_norm": 1.9336436986923218,
"learning_rate": 4.856494877610073e-06,
"loss": 1.1048,
"step": 23100
},
{
"epoch": 0.906816760475297,
"grad_norm": 1.5276035070419312,
"learning_rate": 4.660983811683742e-06,
"loss": 1.1097,
"step": 23200
},
{
"epoch": 0.9107254534083803,
"grad_norm": 1.8674997091293335,
"learning_rate": 4.4654727457574105e-06,
"loss": 1.1123,
"step": 23300
},
{
"epoch": 0.9146341463414634,
"grad_norm": 1.247707724571228,
"learning_rate": 4.2699616798310784e-06,
"loss": 1.1161,
"step": 23400
},
{
"epoch": 0.9185428392745466,
"grad_norm": 3.970885753631592,
"learning_rate": 4.074450613904747e-06,
"loss": 1.1082,
"step": 23500
},
{
"epoch": 0.9224515322076298,
"grad_norm": 2.1680970191955566,
"learning_rate": 3.878939547978415e-06,
"loss": 1.1141,
"step": 23600
},
{
"epoch": 0.9263602251407129,
"grad_norm": 1.1423863172531128,
"learning_rate": 3.6834284820520844e-06,
"loss": 1.1078,
"step": 23700
},
{
"epoch": 0.9302689180737961,
"grad_norm": 1.8042670488357544,
"learning_rate": 3.487917416125753e-06,
"loss": 1.0984,
"step": 23800
},
{
"epoch": 0.9341776110068793,
"grad_norm": 1.9992506504058838,
"learning_rate": 3.2924063501994216e-06,
"loss": 1.1,
"step": 23900
},
{
"epoch": 0.9380863039399625,
"grad_norm": 1.4657387733459473,
"learning_rate": 3.09689528427309e-06,
"loss": 1.1099,
"step": 24000
},
{
"epoch": 0.9419949968730457,
"grad_norm": 2.647845983505249,
"learning_rate": 2.9013842183467584e-06,
"loss": 1.0996,
"step": 24100
},
{
"epoch": 0.9459036898061288,
"grad_norm": 2.4407107830047607,
"learning_rate": 2.705873152420427e-06,
"loss": 1.1028,
"step": 24200
},
{
"epoch": 0.949812382739212,
"grad_norm": 1.7697856426239014,
"learning_rate": 2.5103620864940955e-06,
"loss": 1.0975,
"step": 24300
},
{
"epoch": 0.9537210756722951,
"grad_norm": 1.754012942314148,
"learning_rate": 2.3148510205677643e-06,
"loss": 1.1066,
"step": 24400
},
{
"epoch": 0.9576297686053784,
"grad_norm": 2.4356472492218018,
"learning_rate": 2.119339954641433e-06,
"loss": 1.1036,
"step": 24500
},
{
"epoch": 0.9615384615384616,
"grad_norm": 2.5310487747192383,
"learning_rate": 1.923828888715101e-06,
"loss": 1.1145,
"step": 24600
},
{
"epoch": 0.9654471544715447,
"grad_norm": 1.101137638092041,
"learning_rate": 1.7283178227887699e-06,
"loss": 1.1031,
"step": 24700
},
{
"epoch": 0.9693558474046279,
"grad_norm": 1.346105933189392,
"learning_rate": 1.5328067568624385e-06,
"loss": 1.1139,
"step": 24800
},
{
"epoch": 0.973264540337711,
"grad_norm": 0.9766319394111633,
"learning_rate": 1.337295690936107e-06,
"loss": 1.1068,
"step": 24900
},
{
"epoch": 0.9771732332707942,
"grad_norm": 1.3190845251083374,
"learning_rate": 1.1417846250097757e-06,
"loss": 1.0991,
"step": 25000
},
{
"epoch": 0.9810819262038775,
"grad_norm": 1.3196136951446533,
"learning_rate": 9.462735590834442e-07,
"loss": 1.0986,
"step": 25100
},
{
"epoch": 0.9849906191369606,
"grad_norm": 1.408789873123169,
"learning_rate": 7.507624931571127e-07,
"loss": 1.1009,
"step": 25200
},
{
"epoch": 0.9888993120700438,
"grad_norm": 2.120715379714966,
"learning_rate": 5.552514272307812e-07,
"loss": 1.093,
"step": 25300
},
{
"epoch": 0.9928080050031269,
"grad_norm": 1.3537365198135376,
"learning_rate": 3.5974036130444985e-07,
"loss": 1.1002,
"step": 25400
},
{
"epoch": 0.9967166979362101,
"grad_norm": 1.4996395111083984,
"learning_rate": 1.642292953781184e-07,
"loss": 1.1042,
"step": 25500
}
],
"logging_steps": 100,
"max_steps": 25584,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.130887225344e+16,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}