{ "best_metric": 1.3470451831817627, "best_model_checkpoint": "./output/checkpoint-4200", "epoch": 0.11244377811094453, "eval_steps": 150, "global_step": 4200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002677232812165346, "grad_norm": 12.16030502319336, "learning_rate": 4.4e-06, "loss": 1.5442, "step": 10 }, { "epoch": 0.0005354465624330692, "grad_norm": 12.028721809387207, "learning_rate": 8.8e-06, "loss": 1.5341, "step": 20 }, { "epoch": 0.0008031698436496038, "grad_norm": 9.741438865661621, "learning_rate": 1.3199999999999999e-05, "loss": 1.5258, "step": 30 }, { "epoch": 0.0010708931248661383, "grad_norm": 11.791377067565918, "learning_rate": 1.76e-05, "loss": 1.4766, "step": 40 }, { "epoch": 0.0013386164060826729, "grad_norm": 10.31489372253418, "learning_rate": 2.2e-05, "loss": 1.4898, "step": 50 }, { "epoch": 0.0016063396872992076, "grad_norm": 11.65046501159668, "learning_rate": 2.6399999999999998e-05, "loss": 1.4536, "step": 60 }, { "epoch": 0.0018740629685157421, "grad_norm": 11.001107215881348, "learning_rate": 3.0799999999999996e-05, "loss": 1.4933, "step": 70 }, { "epoch": 0.0021417862497322766, "grad_norm": 10.670427322387695, "learning_rate": 3.52e-05, "loss": 1.4816, "step": 80 }, { "epoch": 0.002409509530948811, "grad_norm": 11.35387134552002, "learning_rate": 3.96e-05, "loss": 1.4636, "step": 90 }, { "epoch": 0.0026772328121653457, "grad_norm": 10.275943756103516, "learning_rate": 4.4e-05, "loss": 1.4996, "step": 100 }, { "epoch": 0.0029449560933818807, "grad_norm": 9.64588451385498, "learning_rate": 4.399954783308405e-05, "loss": 1.5114, "step": 110 }, { "epoch": 0.003212679374598415, "grad_norm": 9.566597938537598, "learning_rate": 4.399819135092302e-05, "loss": 1.542, "step": 120 }, { "epoch": 0.0034804026558149497, "grad_norm": 10.304100036621094, "learning_rate": 4.399593060927658e-05, "loss": 1.4911, "step": 130 }, { "epoch": 0.0037481259370314842, "grad_norm": 8.622906684875488, "learning_rate": 4.3992765701074955e-05, "loss": 1.4655, "step": 140 }, { "epoch": 0.004015849218248019, "grad_norm": 9.855925559997559, "learning_rate": 4.398869675641513e-05, "loss": 1.5424, "step": 150 }, { "epoch": 0.004015849218248019, "eval_loss": 1.5061633586883545, "eval_runtime": 76.7585, "eval_samples_per_second": 6.514, "eval_steps_per_second": 6.514, "step": 150 }, { "epoch": 0.004283572499464553, "grad_norm": 9.257708549499512, "learning_rate": 4.398372394255549e-05, "loss": 1.4863, "step": 160 }, { "epoch": 0.004551295780681088, "grad_norm": 7.029283046722412, "learning_rate": 4.397784746390892e-05, "loss": 1.504, "step": 170 }, { "epoch": 0.004819019061897622, "grad_norm": 8.059552192687988, "learning_rate": 4.3971067562034454e-05, "loss": 1.4734, "step": 180 }, { "epoch": 0.005086742343114157, "grad_norm": 8.756830215454102, "learning_rate": 4.39633845156273e-05, "loss": 1.5308, "step": 190 }, { "epoch": 0.005354465624330691, "grad_norm": 8.60611629486084, "learning_rate": 4.39547986405074e-05, "loss": 1.4985, "step": 200 }, { "epoch": 0.005622188905547227, "grad_norm": 8.613848686218262, "learning_rate": 4.3945310289606455e-05, "loss": 1.493, "step": 210 }, { "epoch": 0.005889912186763761, "grad_norm": 7.788877010345459, "learning_rate": 4.39349198529534e-05, "loss": 1.492, "step": 220 }, { "epoch": 0.006157635467980296, "grad_norm": 7.323453903198242, "learning_rate": 4.39236277576584e-05, "loss": 1.5138, "step": 230 }, { "epoch": 0.00642535874919683, "grad_norm": 7.350474834442139, "learning_rate": 4.391143446789526e-05, "loss": 1.5314, "step": 240 }, { "epoch": 0.006693082030413365, "grad_norm": 7.383894443511963, "learning_rate": 4.389834048488236e-05, "loss": 1.5231, "step": 250 }, { "epoch": 0.006960805311629899, "grad_norm": 7.436572551727295, "learning_rate": 4.388434634686206e-05, "loss": 1.5264, "step": 260 }, { "epoch": 0.007228528592846434, "grad_norm": 7.9078779220581055, "learning_rate": 4.386945262907856e-05, "loss": 1.4744, "step": 270 }, { "epoch": 0.0074962518740629685, "grad_norm": 7.032011985778809, "learning_rate": 4.3853659943754275e-05, "loss": 1.4156, "step": 280 }, { "epoch": 0.007763975155279503, "grad_norm": 7.743986129760742, "learning_rate": 4.383696894006463e-05, "loss": 1.5338, "step": 290 }, { "epoch": 0.008031698436496038, "grad_norm": 7.996994495391846, "learning_rate": 4.381938030411141e-05, "loss": 1.4912, "step": 300 }, { "epoch": 0.008031698436496038, "eval_loss": 1.5058677196502686, "eval_runtime": 76.63, "eval_samples_per_second": 6.525, "eval_steps_per_second": 6.525, "step": 300 }, { "epoch": 0.008299421717712573, "grad_norm": 6.838953971862793, "learning_rate": 4.380089475889457e-05, "loss": 1.4854, "step": 310 }, { "epoch": 0.008567144998929107, "grad_norm": 7.260242462158203, "learning_rate": 4.378151306428244e-05, "loss": 1.5401, "step": 320 }, { "epoch": 0.008834868280145642, "grad_norm": 6.737756729125977, "learning_rate": 4.3761236016980594e-05, "loss": 1.5013, "step": 330 }, { "epoch": 0.009102591561362176, "grad_norm": 6.963688373565674, "learning_rate": 4.3740064450499026e-05, "loss": 1.4989, "step": 340 }, { "epoch": 0.009370314842578711, "grad_norm": 7.6938557624816895, "learning_rate": 4.37179992351179e-05, "loss": 1.5317, "step": 350 }, { "epoch": 0.009638038123795245, "grad_norm": 6.700031757354736, "learning_rate": 4.3695041277851804e-05, "loss": 1.405, "step": 360 }, { "epoch": 0.00990576140501178, "grad_norm": 6.925078868865967, "learning_rate": 4.367119152241245e-05, "loss": 1.4966, "step": 370 }, { "epoch": 0.010173484686228314, "grad_norm": 6.87849235534668, "learning_rate": 4.364645094916985e-05, "loss": 1.4933, "step": 380 }, { "epoch": 0.01044120796744485, "grad_norm": 8.074585914611816, "learning_rate": 4.3620820575112083e-05, "loss": 1.4782, "step": 390 }, { "epoch": 0.010708931248661383, "grad_norm": 7.07144832611084, "learning_rate": 4.359430145380344e-05, "loss": 1.4871, "step": 400 }, { "epoch": 0.010976654529877918, "grad_norm": 7.2088117599487305, "learning_rate": 4.356689467534112e-05, "loss": 1.4855, "step": 410 }, { "epoch": 0.011244377811094454, "grad_norm": 7.868666648864746, "learning_rate": 4.353860136631044e-05, "loss": 1.5246, "step": 420 }, { "epoch": 0.011512101092310987, "grad_norm": 7.853616237640381, "learning_rate": 4.350942268973854e-05, "loss": 1.5302, "step": 430 }, { "epoch": 0.011779824373527523, "grad_norm": 7.353667259216309, "learning_rate": 4.347935984504649e-05, "loss": 1.4305, "step": 440 }, { "epoch": 0.012047547654744056, "grad_norm": 6.460302352905273, "learning_rate": 4.344841406800012e-05, "loss": 1.4506, "step": 450 }, { "epoch": 0.012047547654744056, "eval_loss": 1.4909805059432983, "eval_runtime": 76.606, "eval_samples_per_second": 6.527, "eval_steps_per_second": 6.527, "step": 450 }, { "epoch": 0.012315270935960592, "grad_norm": 7.3225226402282715, "learning_rate": 4.34165866306591e-05, "loss": 1.461, "step": 460 }, { "epoch": 0.012582994217177125, "grad_norm": 6.5725297927856445, "learning_rate": 4.3383878841324734e-05, "loss": 1.4007, "step": 470 }, { "epoch": 0.01285071749839366, "grad_norm": 7.1205315589904785, "learning_rate": 4.3350292044486125e-05, "loss": 1.557, "step": 480 }, { "epoch": 0.013118440779610194, "grad_norm": 6.783862113952637, "learning_rate": 4.331582762076494e-05, "loss": 1.5214, "step": 490 }, { "epoch": 0.01338616406082673, "grad_norm": 6.475296974182129, "learning_rate": 4.328048698685865e-05, "loss": 1.4874, "step": 500 }, { "epoch": 0.013653887342043263, "grad_norm": 7.158287525177002, "learning_rate": 4.32442715954823e-05, "loss": 1.4794, "step": 510 }, { "epoch": 0.013921610623259799, "grad_norm": 5.706000804901123, "learning_rate": 4.320718293530877e-05, "loss": 1.4921, "step": 520 }, { "epoch": 0.014189333904476333, "grad_norm": 7.483499050140381, "learning_rate": 4.3169222530907634e-05, "loss": 1.4899, "step": 530 }, { "epoch": 0.014457057185692868, "grad_norm": 6.9520182609558105, "learning_rate": 4.313039194268243e-05, "loss": 1.4908, "step": 540 }, { "epoch": 0.014724780466909402, "grad_norm": 6.7435784339904785, "learning_rate": 4.309069276680653e-05, "loss": 1.45, "step": 550 }, { "epoch": 0.014992503748125937, "grad_norm": 7.162035942077637, "learning_rate": 4.305012663515759e-05, "loss": 1.4702, "step": 560 }, { "epoch": 0.015260227029342472, "grad_norm": 6.717561721801758, "learning_rate": 4.300869521525039e-05, "loss": 1.5131, "step": 570 }, { "epoch": 0.015527950310559006, "grad_norm": 6.205082893371582, "learning_rate": 4.296640021016832e-05, "loss": 1.4342, "step": 580 }, { "epoch": 0.01579567359177554, "grad_norm": 6.673497676849365, "learning_rate": 4.292324335849338e-05, "loss": 1.4917, "step": 590 }, { "epoch": 0.016063396872992075, "grad_norm": 6.291605472564697, "learning_rate": 4.287922643423471e-05, "loss": 1.5018, "step": 600 }, { "epoch": 0.016063396872992075, "eval_loss": 1.476717472076416, "eval_runtime": 76.7435, "eval_samples_per_second": 6.515, "eval_steps_per_second": 6.515, "step": 600 }, { "epoch": 0.01633112015420861, "grad_norm": 6.723529815673828, "learning_rate": 4.283435124675567e-05, "loss": 1.4652, "step": 610 }, { "epoch": 0.016598843435425146, "grad_norm": 6.2483062744140625, "learning_rate": 4.278861964069944e-05, "loss": 1.5094, "step": 620 }, { "epoch": 0.01686656671664168, "grad_norm": 6.773886203765869, "learning_rate": 4.274203349591324e-05, "loss": 1.4771, "step": 630 }, { "epoch": 0.017134289997858213, "grad_norm": 7.28003454208374, "learning_rate": 4.269459472737102e-05, "loss": 1.436, "step": 640 }, { "epoch": 0.017402013279074747, "grad_norm": 6.813667297363281, "learning_rate": 4.264630528509473e-05, "loss": 1.4094, "step": 650 }, { "epoch": 0.017669736560291284, "grad_norm": 7.017402172088623, "learning_rate": 4.259716715407422e-05, "loss": 1.5255, "step": 660 }, { "epoch": 0.017937459841507818, "grad_norm": 7.313465595245361, "learning_rate": 4.254718235418559e-05, "loss": 1.4647, "step": 670 }, { "epoch": 0.01820518312272435, "grad_norm": 6.323640823364258, "learning_rate": 4.249635294010819e-05, "loss": 1.4799, "step": 680 }, { "epoch": 0.01847290640394089, "grad_norm": 7.1620588302612305, "learning_rate": 4.244468100124014e-05, "loss": 1.4344, "step": 690 }, { "epoch": 0.018740629685157422, "grad_norm": 6.160943508148193, "learning_rate": 4.239216866161248e-05, "loss": 1.516, "step": 700 }, { "epoch": 0.019008352966373956, "grad_norm": 6.571516036987305, "learning_rate": 4.233881807980179e-05, "loss": 1.5133, "step": 710 }, { "epoch": 0.01927607624759049, "grad_norm": 5.696547031402588, "learning_rate": 4.228463144884155e-05, "loss": 1.4318, "step": 720 }, { "epoch": 0.019543799528807027, "grad_norm": 6.653096675872803, "learning_rate": 4.2229610996131915e-05, "loss": 1.461, "step": 730 }, { "epoch": 0.01981152281002356, "grad_norm": 6.497095584869385, "learning_rate": 4.217375898334819e-05, "loss": 1.4359, "step": 740 }, { "epoch": 0.020079246091240094, "grad_norm": 6.566861629486084, "learning_rate": 4.211707770634788e-05, "loss": 1.445, "step": 750 }, { "epoch": 0.020079246091240094, "eval_loss": 1.4696097373962402, "eval_runtime": 76.6203, "eval_samples_per_second": 6.526, "eval_steps_per_second": 6.526, "step": 750 }, { "epoch": 0.020346969372456628, "grad_norm": 6.802274703979492, "learning_rate": 4.205956949507625e-05, "loss": 1.4485, "step": 760 }, { "epoch": 0.020614692653673165, "grad_norm": 6.832027912139893, "learning_rate": 4.200123671347065e-05, "loss": 1.5034, "step": 770 }, { "epoch": 0.0208824159348897, "grad_norm": 7.041072368621826, "learning_rate": 4.1942081759363236e-05, "loss": 1.5225, "step": 780 }, { "epoch": 0.021150139216106232, "grad_norm": 6.773481369018555, "learning_rate": 4.1882107064382496e-05, "loss": 1.4718, "step": 790 }, { "epoch": 0.021417862497322766, "grad_norm": 6.902709007263184, "learning_rate": 4.1821315093853216e-05, "loss": 1.4562, "step": 800 }, { "epoch": 0.021685585778539303, "grad_norm": 5.957550048828125, "learning_rate": 4.1759708346695215e-05, "loss": 1.4798, "step": 810 }, { "epoch": 0.021953309059755836, "grad_norm": 6.139000415802002, "learning_rate": 4.1697289355320565e-05, "loss": 1.5084, "step": 820 }, { "epoch": 0.02222103234097237, "grad_norm": 6.227492332458496, "learning_rate": 4.1634060685529527e-05, "loss": 1.4597, "step": 830 }, { "epoch": 0.022488755622188907, "grad_norm": 6.531513214111328, "learning_rate": 4.157002493640506e-05, "loss": 1.4326, "step": 840 }, { "epoch": 0.02275647890340544, "grad_norm": 6.589105606079102, "learning_rate": 4.1505184740206006e-05, "loss": 1.431, "step": 850 }, { "epoch": 0.023024202184621975, "grad_norm": 6.289806842803955, "learning_rate": 4.143954276225886e-05, "loss": 1.5167, "step": 860 }, { "epoch": 0.023291925465838508, "grad_norm": 6.444394111633301, "learning_rate": 4.1373101700848235e-05, "loss": 1.4948, "step": 870 }, { "epoch": 0.023559648747055045, "grad_norm": 6.666561126708984, "learning_rate": 4.1305864287105946e-05, "loss": 1.4879, "step": 880 }, { "epoch": 0.02382737202827158, "grad_norm": 6.169593811035156, "learning_rate": 4.12378332848987e-05, "loss": 1.5557, "step": 890 }, { "epoch": 0.024095095309488113, "grad_norm": 6.8753743171691895, "learning_rate": 4.116901149071457e-05, "loss": 1.4446, "step": 900 }, { "epoch": 0.024095095309488113, "eval_loss": 1.4628037214279175, "eval_runtime": 76.661, "eval_samples_per_second": 6.522, "eval_steps_per_second": 6.522, "step": 900 }, { "epoch": 0.024362818590704646, "grad_norm": 6.948112964630127, "learning_rate": 4.1099401733547925e-05, "loss": 1.4916, "step": 910 }, { "epoch": 0.024630541871921183, "grad_norm": 6.508751392364502, "learning_rate": 4.102900687478326e-05, "loss": 1.4659, "step": 920 }, { "epoch": 0.024898265153137717, "grad_norm": 6.732906818389893, "learning_rate": 4.095782980807749e-05, "loss": 1.4834, "step": 930 }, { "epoch": 0.02516598843435425, "grad_norm": 6.261349678039551, "learning_rate": 4.088587345924105e-05, "loss": 1.4585, "step": 940 }, { "epoch": 0.025433711715570784, "grad_norm": 5.926994323730469, "learning_rate": 4.081314078611762e-05, "loss": 1.4887, "step": 950 }, { "epoch": 0.02570143499678732, "grad_norm": 6.746396064758301, "learning_rate": 4.073963477846249e-05, "loss": 1.4561, "step": 960 }, { "epoch": 0.025969158278003855, "grad_norm": 6.503916263580322, "learning_rate": 4.066535845781975e-05, "loss": 1.5013, "step": 970 }, { "epoch": 0.02623688155922039, "grad_norm": 6.83921480178833, "learning_rate": 4.059031487739803e-05, "loss": 1.517, "step": 980 }, { "epoch": 0.026504604840436926, "grad_norm": 6.210860252380371, "learning_rate": 4.051450712194497e-05, "loss": 1.4849, "step": 990 }, { "epoch": 0.02677232812165346, "grad_norm": 6.381270885467529, "learning_rate": 4.043793830762049e-05, "loss": 1.4685, "step": 1000 }, { "epoch": 0.027040051402869993, "grad_norm": 6.763075351715088, "learning_rate": 4.036061158186866e-05, "loss": 1.5412, "step": 1010 }, { "epoch": 0.027307774684086527, "grad_norm": 6.492913722991943, "learning_rate": 4.028253012328828e-05, "loss": 1.4398, "step": 1020 }, { "epoch": 0.027575497965303064, "grad_norm": 6.383675575256348, "learning_rate": 4.0203697141502323e-05, "loss": 1.4514, "step": 1030 }, { "epoch": 0.027843221246519598, "grad_norm": 6.685030937194824, "learning_rate": 4.0124115877025874e-05, "loss": 1.4688, "step": 1040 }, { "epoch": 0.02811094452773613, "grad_norm": 6.681553840637207, "learning_rate": 4.004378960113303e-05, "loss": 1.4862, "step": 1050 }, { "epoch": 0.02811094452773613, "eval_loss": 1.4577986001968384, "eval_runtime": 76.6839, "eval_samples_per_second": 6.52, "eval_steps_per_second": 6.52, "step": 1050 }, { "epoch": 0.028378667808952665, "grad_norm": 6.192584037780762, "learning_rate": 3.996272161572237e-05, "loss": 1.4383, "step": 1060 }, { "epoch": 0.028646391090169202, "grad_norm": 6.383575439453125, "learning_rate": 3.988091525318126e-05, "loss": 1.4015, "step": 1070 }, { "epoch": 0.028914114371385736, "grad_norm": 7.042336940765381, "learning_rate": 3.979837387624884e-05, "loss": 1.468, "step": 1080 }, { "epoch": 0.02918183765260227, "grad_norm": 6.60390567779541, "learning_rate": 3.971510087787784e-05, "loss": 1.4932, "step": 1090 }, { "epoch": 0.029449560933818803, "grad_norm": 6.5373029708862305, "learning_rate": 3.9631099681095044e-05, "loss": 1.4381, "step": 1100 }, { "epoch": 0.02971728421503534, "grad_norm": 7.118472099304199, "learning_rate": 3.954637373886066e-05, "loss": 1.4057, "step": 1110 }, { "epoch": 0.029985007496251874, "grad_norm": 7.386786460876465, "learning_rate": 3.9460926533926315e-05, "loss": 1.4978, "step": 1120 }, { "epoch": 0.030252730777468408, "grad_norm": 6.418981552124023, "learning_rate": 3.937476157869193e-05, "loss": 1.4897, "step": 1130 }, { "epoch": 0.030520454058684945, "grad_norm": 5.941694259643555, "learning_rate": 3.9287882415061334e-05, "loss": 1.4381, "step": 1140 }, { "epoch": 0.03078817733990148, "grad_norm": 6.574525833129883, "learning_rate": 3.9200292614296655e-05, "loss": 1.4143, "step": 1150 }, { "epoch": 0.031055900621118012, "grad_norm": 6.349545478820801, "learning_rate": 3.911199577687154e-05, "loss": 1.3937, "step": 1160 }, { "epoch": 0.03132362390233455, "grad_norm": 6.873767375946045, "learning_rate": 3.902299553232315e-05, "loss": 1.4515, "step": 1170 }, { "epoch": 0.03159134718355108, "grad_norm": 6.043056964874268, "learning_rate": 3.893329553910293e-05, "loss": 1.5538, "step": 1180 }, { "epoch": 0.031859070464767617, "grad_norm": 6.418127059936523, "learning_rate": 3.884289948442628e-05, "loss": 1.4745, "step": 1190 }, { "epoch": 0.03212679374598415, "grad_norm": 6.102353096008301, "learning_rate": 3.875181108412096e-05, "loss": 1.4772, "step": 1200 }, { "epoch": 0.03212679374598415, "eval_loss": 1.449093222618103, "eval_runtime": 76.8079, "eval_samples_per_second": 6.51, "eval_steps_per_second": 6.51, "step": 1200 }, { "epoch": 0.032394517027200684, "grad_norm": 6.624678134918213, "learning_rate": 3.8660034082474316e-05, "loss": 1.4526, "step": 1210 }, { "epoch": 0.03266224030841722, "grad_norm": 6.601352214813232, "learning_rate": 3.856757225207944e-05, "loss": 1.5247, "step": 1220 }, { "epoch": 0.03292996358963375, "grad_norm": 6.580589771270752, "learning_rate": 3.847442939368002e-05, "loss": 1.4694, "step": 1230 }, { "epoch": 0.03319768687085029, "grad_norm": 5.504952430725098, "learning_rate": 3.8380609336014156e-05, "loss": 1.411, "step": 1240 }, { "epoch": 0.033465410152066825, "grad_norm": 6.643352508544922, "learning_rate": 3.828611593565694e-05, "loss": 1.4278, "step": 1250 }, { "epoch": 0.03373313343328336, "grad_norm": 6.425754070281982, "learning_rate": 3.819095307686197e-05, "loss": 1.4253, "step": 1260 }, { "epoch": 0.03400085671449989, "grad_norm": 6.0909504890441895, "learning_rate": 3.809512467140163e-05, "loss": 1.4681, "step": 1270 }, { "epoch": 0.034268579995716426, "grad_norm": 5.82462215423584, "learning_rate": 3.799863465840634e-05, "loss": 1.4275, "step": 1280 }, { "epoch": 0.03453630327693296, "grad_norm": 6.24934720993042, "learning_rate": 3.790148700420261e-05, "loss": 1.4313, "step": 1290 }, { "epoch": 0.034804026558149494, "grad_norm": 6.211289882659912, "learning_rate": 3.7803685702150006e-05, "loss": 1.4216, "step": 1300 }, { "epoch": 0.035071749839366034, "grad_norm": 5.66799259185791, "learning_rate": 3.7705234772476984e-05, "loss": 1.4799, "step": 1310 }, { "epoch": 0.03533947312058257, "grad_norm": 6.351191997528076, "learning_rate": 3.760613826211567e-05, "loss": 1.4281, "step": 1320 }, { "epoch": 0.0356071964017991, "grad_norm": 5.707607269287109, "learning_rate": 3.7506400244535455e-05, "loss": 1.463, "step": 1330 }, { "epoch": 0.035874919683015635, "grad_norm": 5.780487537384033, "learning_rate": 3.740602481957561e-05, "loss": 1.4731, "step": 1340 }, { "epoch": 0.03614264296423217, "grad_norm": 5.354376316070557, "learning_rate": 3.7305016113276704e-05, "loss": 1.4492, "step": 1350 }, { "epoch": 0.03614264296423217, "eval_loss": 1.444738745689392, "eval_runtime": 76.8239, "eval_samples_per_second": 6.508, "eval_steps_per_second": 6.508, "step": 1350 }, { "epoch": 0.0364103662454487, "grad_norm": 5.93463134765625, "learning_rate": 3.7203378277711024e-05, "loss": 1.4602, "step": 1360 }, { "epoch": 0.036678089526665236, "grad_norm": 5.922112464904785, "learning_rate": 3.710111549081191e-05, "loss": 1.4412, "step": 1370 }, { "epoch": 0.03694581280788178, "grad_norm": 6.667977809906006, "learning_rate": 3.699823195620199e-05, "loss": 1.4475, "step": 1380 }, { "epoch": 0.03721353608909831, "grad_norm": 6.021790504455566, "learning_rate": 3.689473190302041e-05, "loss": 1.4206, "step": 1390 }, { "epoch": 0.037481259370314844, "grad_norm": 6.152276039123535, "learning_rate": 3.679061958574897e-05, "loss": 1.4288, "step": 1400 }, { "epoch": 0.03774898265153138, "grad_norm": 5.695444583892822, "learning_rate": 3.668589928403726e-05, "loss": 1.4424, "step": 1410 }, { "epoch": 0.03801670593274791, "grad_norm": 6.346884727478027, "learning_rate": 3.6580575302526706e-05, "loss": 1.5001, "step": 1420 }, { "epoch": 0.038284429213964445, "grad_norm": 5.674633979797363, "learning_rate": 3.647465197067368e-05, "loss": 1.4796, "step": 1430 }, { "epoch": 0.03855215249518098, "grad_norm": 6.168262481689453, "learning_rate": 3.6368133642571464e-05, "loss": 1.4428, "step": 1440 }, { "epoch": 0.03881987577639751, "grad_norm": 6.4981369972229, "learning_rate": 3.6261024696771345e-05, "loss": 1.4281, "step": 1450 }, { "epoch": 0.03908759905761405, "grad_norm": 5.703588962554932, "learning_rate": 3.615332953610255e-05, "loss": 1.3934, "step": 1460 }, { "epoch": 0.03935532233883059, "grad_norm": 5.69433069229126, "learning_rate": 3.604505258749132e-05, "loss": 1.4482, "step": 1470 }, { "epoch": 0.03962304562004712, "grad_norm": 5.763268947601318, "learning_rate": 3.5936198301778945e-05, "loss": 1.4629, "step": 1480 }, { "epoch": 0.039890768901263654, "grad_norm": 6.472227096557617, "learning_rate": 3.5826771153538716e-05, "loss": 1.4301, "step": 1490 }, { "epoch": 0.04015849218248019, "grad_norm": 6.155264854431152, "learning_rate": 3.571677564089214e-05, "loss": 1.4703, "step": 1500 }, { "epoch": 0.04015849218248019, "eval_loss": 1.4356995820999146, "eval_runtime": 76.8148, "eval_samples_per_second": 6.509, "eval_steps_per_second": 6.509, "step": 1500 }, { "epoch": 0.04042621546369672, "grad_norm": 6.241977214813232, "learning_rate": 3.560621628532389e-05, "loss": 1.4461, "step": 1510 }, { "epoch": 0.040693938744913255, "grad_norm": 6.195336818695068, "learning_rate": 3.5495097631496066e-05, "loss": 1.3735, "step": 1520 }, { "epoch": 0.040961662026129796, "grad_norm": 5.899549961090088, "learning_rate": 3.5383424247061286e-05, "loss": 1.4787, "step": 1530 }, { "epoch": 0.04122938530734633, "grad_norm": 6.187289714813232, "learning_rate": 3.5271200722475e-05, "loss": 1.4413, "step": 1540 }, { "epoch": 0.04149710858856286, "grad_norm": 5.872448444366455, "learning_rate": 3.515843167080675e-05, "loss": 1.4317, "step": 1550 }, { "epoch": 0.0417648318697794, "grad_norm": 6.877863883972168, "learning_rate": 3.5045121727550566e-05, "loss": 1.4593, "step": 1560 }, { "epoch": 0.04203255515099593, "grad_norm": 6.644635200500488, "learning_rate": 3.493127555043441e-05, "loss": 1.4622, "step": 1570 }, { "epoch": 0.042300278432212464, "grad_norm": 6.314537525177002, "learning_rate": 3.481689781922871e-05, "loss": 1.5365, "step": 1580 }, { "epoch": 0.042568001713429, "grad_norm": 6.645462989807129, "learning_rate": 3.470199323555403e-05, "loss": 1.4534, "step": 1590 }, { "epoch": 0.04283572499464553, "grad_norm": 6.462603569030762, "learning_rate": 3.4586566522687734e-05, "loss": 1.4786, "step": 1600 }, { "epoch": 0.04310344827586207, "grad_norm": 6.516161918640137, "learning_rate": 3.44706224253699e-05, "loss": 1.3987, "step": 1610 }, { "epoch": 0.043371171557078605, "grad_norm": 6.5383076667785645, "learning_rate": 3.435416570960824e-05, "loss": 1.4993, "step": 1620 }, { "epoch": 0.04363889483829514, "grad_norm": 6.37644100189209, "learning_rate": 3.4237201162482225e-05, "loss": 1.4527, "step": 1630 }, { "epoch": 0.04390661811951167, "grad_norm": 6.744267463684082, "learning_rate": 3.411973359194625e-05, "loss": 1.4213, "step": 1640 }, { "epoch": 0.044174341400728206, "grad_norm": 6.487947463989258, "learning_rate": 3.400176782663207e-05, "loss": 1.4266, "step": 1650 }, { "epoch": 0.044174341400728206, "eval_loss": 1.4300212860107422, "eval_runtime": 76.7363, "eval_samples_per_second": 6.516, "eval_steps_per_second": 6.516, "step": 1650 }, { "epoch": 0.04444206468194474, "grad_norm": 5.463531494140625, "learning_rate": 3.3883308715650246e-05, "loss": 1.4868, "step": 1660 }, { "epoch": 0.044709787963161274, "grad_norm": 6.814722537994385, "learning_rate": 3.3764361128390853e-05, "loss": 1.441, "step": 1670 }, { "epoch": 0.044977511244377814, "grad_norm": 5.653580665588379, "learning_rate": 3.3644929954323324e-05, "loss": 1.4674, "step": 1680 }, { "epoch": 0.04524523452559435, "grad_norm": 5.94467306137085, "learning_rate": 3.3525020102795434e-05, "loss": 1.4337, "step": 1690 }, { "epoch": 0.04551295780681088, "grad_norm": 6.066728115081787, "learning_rate": 3.3404636502831555e-05, "loss": 1.4701, "step": 1700 }, { "epoch": 0.045780681088027415, "grad_norm": 6.075364112854004, "learning_rate": 3.328378410292994e-05, "loss": 1.4264, "step": 1710 }, { "epoch": 0.04604840436924395, "grad_norm": 6.536380290985107, "learning_rate": 3.3162467870859404e-05, "loss": 1.4928, "step": 1720 }, { "epoch": 0.04631612765046048, "grad_norm": 6.932302951812744, "learning_rate": 3.3040692793455106e-05, "loss": 1.4472, "step": 1730 }, { "epoch": 0.046583850931677016, "grad_norm": 6.54493522644043, "learning_rate": 3.2918463876413504e-05, "loss": 1.3929, "step": 1740 }, { "epoch": 0.04685157421289355, "grad_norm": 5.732593059539795, "learning_rate": 3.279578614408664e-05, "loss": 1.4182, "step": 1750 }, { "epoch": 0.04711929749411009, "grad_norm": 6.897890090942383, "learning_rate": 3.2672664639275584e-05, "loss": 1.466, "step": 1760 }, { "epoch": 0.047387020775326624, "grad_norm": 6.196009159088135, "learning_rate": 3.254910442302319e-05, "loss": 1.4552, "step": 1770 }, { "epoch": 0.04765474405654316, "grad_norm": 6.375300407409668, "learning_rate": 3.242511057440597e-05, "loss": 1.4139, "step": 1780 }, { "epoch": 0.04792246733775969, "grad_norm": 5.817831993103027, "learning_rate": 3.2300688190325404e-05, "loss": 1.4855, "step": 1790 }, { "epoch": 0.048190190618976225, "grad_norm": 5.730225563049316, "learning_rate": 3.217584238529838e-05, "loss": 1.3845, "step": 1800 }, { "epoch": 0.048190190618976225, "eval_loss": 1.4201979637145996, "eval_runtime": 76.7247, "eval_samples_per_second": 6.517, "eval_steps_per_second": 6.517, "step": 1800 }, { "epoch": 0.04845791390019276, "grad_norm": 7.2009382247924805, "learning_rate": 3.205057829124693e-05, "loss": 1.3661, "step": 1810 }, { "epoch": 0.04872563718140929, "grad_norm": 6.178856372833252, "learning_rate": 3.192490105728736e-05, "loss": 1.4082, "step": 1820 }, { "epoch": 0.04899336046262583, "grad_norm": 5.327315330505371, "learning_rate": 3.17988158495185e-05, "loss": 1.4033, "step": 1830 }, { "epoch": 0.04926108374384237, "grad_norm": 5.719634532928467, "learning_rate": 3.1672327850809405e-05, "loss": 1.4505, "step": 1840 }, { "epoch": 0.0495288070250589, "grad_norm": 6.736356735229492, "learning_rate": 3.154544226058628e-05, "loss": 1.4521, "step": 1850 }, { "epoch": 0.049796530306275434, "grad_norm": 5.911272048950195, "learning_rate": 3.1418164294618766e-05, "loss": 1.452, "step": 1860 }, { "epoch": 0.05006425358749197, "grad_norm": 6.058777332305908, "learning_rate": 3.129049918480552e-05, "loss": 1.4431, "step": 1870 }, { "epoch": 0.0503319768687085, "grad_norm": 5.928140640258789, "learning_rate": 3.116245217895918e-05, "loss": 1.4781, "step": 1880 }, { "epoch": 0.050599700149925035, "grad_norm": 5.614587783813477, "learning_rate": 3.1034028540590635e-05, "loss": 1.3831, "step": 1890 }, { "epoch": 0.05086742343114157, "grad_norm": 6.133342742919922, "learning_rate": 3.090523354869266e-05, "loss": 1.4711, "step": 1900 }, { "epoch": 0.05113514671235811, "grad_norm": 7.014552593231201, "learning_rate": 3.0776072497522916e-05, "loss": 1.4404, "step": 1910 }, { "epoch": 0.05140286999357464, "grad_norm": 7.53794002532959, "learning_rate": 3.064655069638632e-05, "loss": 1.4262, "step": 1920 }, { "epoch": 0.05167059327479118, "grad_norm": 5.97748327255249, "learning_rate": 3.0516673469416818e-05, "loss": 1.3836, "step": 1930 }, { "epoch": 0.05193831655600771, "grad_norm": 5.628602504730225, "learning_rate": 3.0386446155358518e-05, "loss": 1.4083, "step": 1940 }, { "epoch": 0.052206039837224244, "grad_norm": 6.5765380859375, "learning_rate": 3.0255874107346232e-05, "loss": 1.4374, "step": 1950 }, { "epoch": 0.052206039837224244, "eval_loss": 1.4104682207107544, "eval_runtime": 76.7665, "eval_samples_per_second": 6.513, "eval_steps_per_second": 6.513, "step": 1950 }, { "epoch": 0.05247376311844078, "grad_norm": 6.185977458953857, "learning_rate": 3.012496269268544e-05, "loss": 1.4185, "step": 1960 }, { "epoch": 0.05274148639965731, "grad_norm": 6.482154846191406, "learning_rate": 2.9993717292631652e-05, "loss": 1.4446, "step": 1970 }, { "epoch": 0.05300920968087385, "grad_norm": 6.260786056518555, "learning_rate": 2.9862143302169223e-05, "loss": 1.4123, "step": 1980 }, { "epoch": 0.053276932962090386, "grad_norm": 6.361741542816162, "learning_rate": 2.9730246129789542e-05, "loss": 1.4646, "step": 1990 }, { "epoch": 0.05354465624330692, "grad_norm": 6.388660430908203, "learning_rate": 2.9598031197268768e-05, "loss": 1.4232, "step": 2000 }, { "epoch": 0.05381237952452345, "grad_norm": 6.549992084503174, "learning_rate": 2.946550393944493e-05, "loss": 1.398, "step": 2010 }, { "epoch": 0.05408010280573999, "grad_norm": 5.965968132019043, "learning_rate": 2.933266980399452e-05, "loss": 1.3618, "step": 2020 }, { "epoch": 0.05434782608695652, "grad_norm": 6.706049919128418, "learning_rate": 2.9199534251208573e-05, "loss": 1.4274, "step": 2030 }, { "epoch": 0.054615549368173054, "grad_norm": 5.7372355461120605, "learning_rate": 2.9066102753768204e-05, "loss": 1.3954, "step": 2040 }, { "epoch": 0.05488327264938959, "grad_norm": 6.6037468910217285, "learning_rate": 2.893238079651966e-05, "loss": 1.3763, "step": 2050 }, { "epoch": 0.05515099593060613, "grad_norm": 6.51908540725708, "learning_rate": 2.8798373876248843e-05, "loss": 1.3945, "step": 2060 }, { "epoch": 0.05541871921182266, "grad_norm": 6.044327259063721, "learning_rate": 2.8664087501455387e-05, "loss": 1.4487, "step": 2070 }, { "epoch": 0.055686442493039195, "grad_norm": 6.289717674255371, "learning_rate": 2.852952719212619e-05, "loss": 1.4311, "step": 2080 }, { "epoch": 0.05595416577425573, "grad_norm": 5.992395401000977, "learning_rate": 2.8394698479508542e-05, "loss": 1.3859, "step": 2090 }, { "epoch": 0.05622188905547226, "grad_norm": 6.025457382202148, "learning_rate": 2.8259606905882712e-05, "loss": 1.4162, "step": 2100 }, { "epoch": 0.05622188905547226, "eval_loss": 1.4084678888320923, "eval_runtime": 76.754, "eval_samples_per_second": 6.514, "eval_steps_per_second": 6.514, "step": 2100 }, { "epoch": 0.056489612336688796, "grad_norm": 5.75090217590332, "learning_rate": 2.8124258024334192e-05, "loss": 1.4478, "step": 2110 }, { "epoch": 0.05675733561790533, "grad_norm": 6.59517240524292, "learning_rate": 2.7988657398525364e-05, "loss": 1.4742, "step": 2120 }, { "epoch": 0.05702505889912187, "grad_norm": 5.816342830657959, "learning_rate": 2.785281060246685e-05, "loss": 1.4508, "step": 2130 }, { "epoch": 0.057292782180338404, "grad_norm": 5.353818416595459, "learning_rate": 2.7716723220288365e-05, "loss": 1.4593, "step": 2140 }, { "epoch": 0.05756050546155494, "grad_norm": 5.926205635070801, "learning_rate": 2.758040084600916e-05, "loss": 1.4599, "step": 2150 }, { "epoch": 0.05782822874277147, "grad_norm": 6.504787921905518, "learning_rate": 2.7443849083308117e-05, "loss": 1.3973, "step": 2160 }, { "epoch": 0.058095952023988005, "grad_norm": 5.680723190307617, "learning_rate": 2.7307073545293355e-05, "loss": 1.4051, "step": 2170 }, { "epoch": 0.05836367530520454, "grad_norm": 7.509329795837402, "learning_rate": 2.7170079854271533e-05, "loss": 1.3807, "step": 2180 }, { "epoch": 0.05863139858642107, "grad_norm": 6.508755683898926, "learning_rate": 2.703287364151672e-05, "loss": 1.3869, "step": 2190 }, { "epoch": 0.058899121867637606, "grad_norm": 6.544657230377197, "learning_rate": 2.6895460547038913e-05, "loss": 1.3409, "step": 2200 }, { "epoch": 0.05916684514885415, "grad_norm": 6.107619762420654, "learning_rate": 2.6757846219352235e-05, "loss": 1.389, "step": 2210 }, { "epoch": 0.05943456843007068, "grad_norm": 6.255703449249268, "learning_rate": 2.6620036315242682e-05, "loss": 1.4385, "step": 2220 }, { "epoch": 0.059702291711287214, "grad_norm": 5.130046844482422, "learning_rate": 2.6482036499535665e-05, "loss": 1.3614, "step": 2230 }, { "epoch": 0.05997001499250375, "grad_norm": 6.854607105255127, "learning_rate": 2.6343852444863075e-05, "loss": 1.4465, "step": 2240 }, { "epoch": 0.06023773827372028, "grad_norm": 6.745285511016846, "learning_rate": 2.6205489831430192e-05, "loss": 1.4, "step": 2250 }, { "epoch": 0.06023773827372028, "eval_loss": 1.3977503776550293, "eval_runtime": 76.7205, "eval_samples_per_second": 6.517, "eval_steps_per_second": 6.517, "step": 2250 }, { "epoch": 0.060505461554936815, "grad_norm": 5.628711223602295, "learning_rate": 2.6066954346782113e-05, "loss": 1.43, "step": 2260 }, { "epoch": 0.06077318483615335, "grad_norm": 6.356362342834473, "learning_rate": 2.5928251685570005e-05, "loss": 1.4382, "step": 2270 }, { "epoch": 0.06104090811736989, "grad_norm": 5.99081563949585, "learning_rate": 2.5789387549317016e-05, "loss": 1.4363, "step": 2280 }, { "epoch": 0.06130863139858642, "grad_norm": 6.202702522277832, "learning_rate": 2.5650367646183896e-05, "loss": 1.3932, "step": 2290 }, { "epoch": 0.06157635467980296, "grad_norm": 6.321465015411377, "learning_rate": 2.5511197690734344e-05, "loss": 1.4056, "step": 2300 }, { "epoch": 0.06184407796101949, "grad_norm": 5.804145812988281, "learning_rate": 2.5371883403700148e-05, "loss": 1.4132, "step": 2310 }, { "epoch": 0.062111801242236024, "grad_norm": 5.593542098999023, "learning_rate": 2.5232430511745995e-05, "loss": 1.4603, "step": 2320 }, { "epoch": 0.06237952452345256, "grad_norm": 6.308177947998047, "learning_rate": 2.5092844747234063e-05, "loss": 1.361, "step": 2330 }, { "epoch": 0.0626472478046691, "grad_norm": 5.957157611846924, "learning_rate": 2.495313184798842e-05, "loss": 1.435, "step": 2340 }, { "epoch": 0.06291497108588563, "grad_norm": 5.485774517059326, "learning_rate": 2.4813297557059133e-05, "loss": 1.413, "step": 2350 }, { "epoch": 0.06318269436710217, "grad_norm": 7.090158939361572, "learning_rate": 2.467334762248621e-05, "loss": 1.3819, "step": 2360 }, { "epoch": 0.0634504176483187, "grad_norm": 6.819372653961182, "learning_rate": 2.4533287797063308e-05, "loss": 1.4347, "step": 2370 }, { "epoch": 0.06371814092953523, "grad_norm": 5.654256820678711, "learning_rate": 2.439312383810128e-05, "loss": 1.3902, "step": 2380 }, { "epoch": 0.06398586421075177, "grad_norm": 6.394632339477539, "learning_rate": 2.4252861507191487e-05, "loss": 1.4324, "step": 2390 }, { "epoch": 0.0642535874919683, "grad_norm": 6.346138954162598, "learning_rate": 2.4112506569969e-05, "loss": 1.3853, "step": 2400 }, { "epoch": 0.0642535874919683, "eval_loss": 1.389374017715454, "eval_runtime": 76.6782, "eval_samples_per_second": 6.521, "eval_steps_per_second": 6.521, "step": 2400 }, { "epoch": 0.06452131077318483, "grad_norm": 5.798035144805908, "learning_rate": 2.3972064795875537e-05, "loss": 1.3668, "step": 2410 }, { "epoch": 0.06478903405440137, "grad_norm": 6.213179588317871, "learning_rate": 2.3831541957922366e-05, "loss": 1.3913, "step": 2420 }, { "epoch": 0.0650567573356179, "grad_norm": 6.443445682525635, "learning_rate": 2.3690943832452967e-05, "loss": 1.4176, "step": 2430 }, { "epoch": 0.06532448061683443, "grad_norm": 6.543423652648926, "learning_rate": 2.3550276198905584e-05, "loss": 1.5036, "step": 2440 }, { "epoch": 0.06559220389805097, "grad_norm": 5.8855977058410645, "learning_rate": 2.3409544839575687e-05, "loss": 1.3749, "step": 2450 }, { "epoch": 0.0658599271792675, "grad_norm": 6.113175868988037, "learning_rate": 2.3268755539378238e-05, "loss": 1.3555, "step": 2460 }, { "epoch": 0.06612765046048405, "grad_norm": 6.519189357757568, "learning_rate": 2.3127914085609943e-05, "loss": 1.3457, "step": 2470 }, { "epoch": 0.06639537374170058, "grad_norm": 6.135042667388916, "learning_rate": 2.298702626771133e-05, "loss": 1.4143, "step": 2480 }, { "epoch": 0.06666309702291712, "grad_norm": 6.562228679656982, "learning_rate": 2.2846097877028762e-05, "loss": 1.4549, "step": 2490 }, { "epoch": 0.06693082030413365, "grad_norm": 6.2036213874816895, "learning_rate": 2.270513470657642e-05, "loss": 1.3422, "step": 2500 }, { "epoch": 0.06719854358535018, "grad_norm": 6.321053981781006, "learning_rate": 2.25641425507981e-05, "loss": 1.4206, "step": 2510 }, { "epoch": 0.06746626686656672, "grad_norm": 5.922671794891357, "learning_rate": 2.2423127205329117e-05, "loss": 1.4368, "step": 2520 }, { "epoch": 0.06773399014778325, "grad_norm": 6.139718532562256, "learning_rate": 2.2282094466758e-05, "loss": 1.3574, "step": 2530 }, { "epoch": 0.06800171342899979, "grad_norm": 5.755593776702881, "learning_rate": 2.2141050132388245e-05, "loss": 1.4075, "step": 2540 }, { "epoch": 0.06826943671021632, "grad_norm": 5.7373151779174805, "learning_rate": 2.2e-05, "loss": 1.3812, "step": 2550 }, { "epoch": 0.06826943671021632, "eval_loss": 1.3869917392730713, "eval_runtime": 76.6763, "eval_samples_per_second": 6.521, "eval_steps_per_second": 6.521, "step": 2550 }, { "epoch": 0.06853715999143285, "grad_norm": 6.435483932495117, "learning_rate": 2.1858949867611754e-05, "loss": 1.3586, "step": 2560 }, { "epoch": 0.06880488327264939, "grad_norm": 5.814359188079834, "learning_rate": 2.1717905533241997e-05, "loss": 1.3745, "step": 2570 }, { "epoch": 0.06907260655386592, "grad_norm": 6.140771389007568, "learning_rate": 2.157687279467088e-05, "loss": 1.3296, "step": 2580 }, { "epoch": 0.06934032983508245, "grad_norm": 5.861440181732178, "learning_rate": 2.14358574492019e-05, "loss": 1.3911, "step": 2590 }, { "epoch": 0.06960805311629899, "grad_norm": 6.584283351898193, "learning_rate": 2.1294865293423586e-05, "loss": 1.4143, "step": 2600 }, { "epoch": 0.06987577639751552, "grad_norm": 5.859135627746582, "learning_rate": 2.1153902122971233e-05, "loss": 1.3923, "step": 2610 }, { "epoch": 0.07014349967873207, "grad_norm": 6.673269748687744, "learning_rate": 2.101297373228868e-05, "loss": 1.4072, "step": 2620 }, { "epoch": 0.0704112229599486, "grad_norm": 5.8205180168151855, "learning_rate": 2.087208591439006e-05, "loss": 1.3962, "step": 2630 }, { "epoch": 0.07067894624116514, "grad_norm": 5.918448448181152, "learning_rate": 2.0731244460621764e-05, "loss": 1.4121, "step": 2640 }, { "epoch": 0.07094666952238167, "grad_norm": 6.024654865264893, "learning_rate": 2.0590455160424316e-05, "loss": 1.3958, "step": 2650 }, { "epoch": 0.0712143928035982, "grad_norm": 6.21071195602417, "learning_rate": 2.044972380109441e-05, "loss": 1.4155, "step": 2660 }, { "epoch": 0.07148211608481474, "grad_norm": 6.8569207191467285, "learning_rate": 2.030905616754704e-05, "loss": 1.3968, "step": 2670 }, { "epoch": 0.07174983936603127, "grad_norm": 6.207950592041016, "learning_rate": 2.0168458042077636e-05, "loss": 1.3722, "step": 2680 }, { "epoch": 0.0720175626472478, "grad_norm": 5.884634494781494, "learning_rate": 2.0027935204124465e-05, "loss": 1.4165, "step": 2690 }, { "epoch": 0.07228528592846434, "grad_norm": 5.943591117858887, "learning_rate": 1.9887493430031e-05, "loss": 1.4054, "step": 2700 }, { "epoch": 0.07228528592846434, "eval_loss": 1.3810029029846191, "eval_runtime": 76.6801, "eval_samples_per_second": 6.521, "eval_steps_per_second": 6.521, "step": 2700 }, { "epoch": 0.07255300920968087, "grad_norm": 6.2774457931518555, "learning_rate": 1.9747138492808512e-05, "loss": 1.4184, "step": 2710 }, { "epoch": 0.0728207324908974, "grad_norm": 6.596461772918701, "learning_rate": 1.960687616189872e-05, "loss": 1.4314, "step": 2720 }, { "epoch": 0.07308845577211394, "grad_norm": 6.17069149017334, "learning_rate": 1.9466712202936694e-05, "loss": 1.4248, "step": 2730 }, { "epoch": 0.07335617905333047, "grad_norm": 6.085130214691162, "learning_rate": 1.932665237751379e-05, "loss": 1.3966, "step": 2740 }, { "epoch": 0.073623902334547, "grad_norm": 6.430164813995361, "learning_rate": 1.9186702442940866e-05, "loss": 1.3521, "step": 2750 }, { "epoch": 0.07389162561576355, "grad_norm": 5.946996212005615, "learning_rate": 1.9046868152011587e-05, "loss": 1.336, "step": 2760 }, { "epoch": 0.07415934889698009, "grad_norm": 6.169567108154297, "learning_rate": 1.8907155252765942e-05, "loss": 1.4099, "step": 2770 }, { "epoch": 0.07442707217819662, "grad_norm": 5.974761962890625, "learning_rate": 1.8767569488254004e-05, "loss": 1.3588, "step": 2780 }, { "epoch": 0.07469479545941315, "grad_norm": 5.632639408111572, "learning_rate": 1.8628116596299847e-05, "loss": 1.3704, "step": 2790 }, { "epoch": 0.07496251874062969, "grad_norm": 5.559203147888184, "learning_rate": 1.848880230926566e-05, "loss": 1.3878, "step": 2800 }, { "epoch": 0.07523024202184622, "grad_norm": 6.6522674560546875, "learning_rate": 1.8349632353816113e-05, "loss": 1.4324, "step": 2810 }, { "epoch": 0.07549796530306276, "grad_norm": 5.803051471710205, "learning_rate": 1.8210612450682986e-05, "loss": 1.4132, "step": 2820 }, { "epoch": 0.07576568858427929, "grad_norm": 5.304571151733398, "learning_rate": 1.8071748314429994e-05, "loss": 1.3607, "step": 2830 }, { "epoch": 0.07603341186549582, "grad_norm": 5.904123783111572, "learning_rate": 1.7933045653217886e-05, "loss": 1.3963, "step": 2840 }, { "epoch": 0.07630113514671236, "grad_norm": 5.9972686767578125, "learning_rate": 1.7794510168569814e-05, "loss": 1.4353, "step": 2850 }, { "epoch": 0.07630113514671236, "eval_loss": 1.3754030466079712, "eval_runtime": 76.726, "eval_samples_per_second": 6.517, "eval_steps_per_second": 6.517, "step": 2850 }, { "epoch": 0.07656885842792889, "grad_norm": 6.276034832000732, "learning_rate": 1.7656147555136924e-05, "loss": 1.3894, "step": 2860 }, { "epoch": 0.07683658170914542, "grad_norm": 6.091196060180664, "learning_rate": 1.7517963500464338e-05, "loss": 1.3956, "step": 2870 }, { "epoch": 0.07710430499036196, "grad_norm": 6.1393513679504395, "learning_rate": 1.7379963684757313e-05, "loss": 1.4192, "step": 2880 }, { "epoch": 0.07737202827157849, "grad_norm": 6.082838535308838, "learning_rate": 1.7242153780647764e-05, "loss": 1.3598, "step": 2890 }, { "epoch": 0.07763975155279502, "grad_norm": 7.009051322937012, "learning_rate": 1.7104539452961086e-05, "loss": 1.3388, "step": 2900 }, { "epoch": 0.07790747483401157, "grad_norm": 6.073249340057373, "learning_rate": 1.6967126358483283e-05, "loss": 1.4014, "step": 2910 }, { "epoch": 0.0781751981152281, "grad_norm": 5.901647567749023, "learning_rate": 1.6829920145728465e-05, "loss": 1.3795, "step": 2920 }, { "epoch": 0.07844292139644464, "grad_norm": 5.663522243499756, "learning_rate": 1.6692926454706644e-05, "loss": 1.4444, "step": 2930 }, { "epoch": 0.07871064467766117, "grad_norm": 6.163628578186035, "learning_rate": 1.655615091669189e-05, "loss": 1.3579, "step": 2940 }, { "epoch": 0.07897836795887771, "grad_norm": 5.641757965087891, "learning_rate": 1.641959915399084e-05, "loss": 1.3816, "step": 2950 }, { "epoch": 0.07924609124009424, "grad_norm": 5.533544540405273, "learning_rate": 1.6283276779711637e-05, "loss": 1.4021, "step": 2960 }, { "epoch": 0.07951381452131077, "grad_norm": 6.252144813537598, "learning_rate": 1.614718939753315e-05, "loss": 1.3424, "step": 2970 }, { "epoch": 0.07978153780252731, "grad_norm": 6.014984607696533, "learning_rate": 1.6011342601474635e-05, "loss": 1.3733, "step": 2980 }, { "epoch": 0.08004926108374384, "grad_norm": 6.043126106262207, "learning_rate": 1.5875741975665813e-05, "loss": 1.4402, "step": 2990 }, { "epoch": 0.08031698436496038, "grad_norm": 6.2684478759765625, "learning_rate": 1.5740393094117287e-05, "loss": 1.3955, "step": 3000 }, { "epoch": 0.08031698436496038, "eval_loss": 1.3712314367294312, "eval_runtime": 77.0642, "eval_samples_per_second": 6.488, "eval_steps_per_second": 6.488, "step": 3000 }, { "epoch": 0.08058470764617691, "grad_norm": 6.531871795654297, "learning_rate": 1.560530152049146e-05, "loss": 1.3728, "step": 3010 }, { "epoch": 0.08085243092739344, "grad_norm": 6.215692043304443, "learning_rate": 1.5470472807873805e-05, "loss": 1.322, "step": 3020 }, { "epoch": 0.08112015420860998, "grad_norm": 5.670928001403809, "learning_rate": 1.5335912498544615e-05, "loss": 1.3643, "step": 3030 }, { "epoch": 0.08138787748982651, "grad_norm": 5.801737308502197, "learning_rate": 1.5201626123751158e-05, "loss": 1.3653, "step": 3040 }, { "epoch": 0.08165560077104304, "grad_norm": 5.651313781738281, "learning_rate": 1.5067619203480345e-05, "loss": 1.3818, "step": 3050 }, { "epoch": 0.08192332405225959, "grad_norm": 6.425565242767334, "learning_rate": 1.4933897246231798e-05, "loss": 1.3276, "step": 3060 }, { "epoch": 0.08219104733347612, "grad_norm": 6.21942663192749, "learning_rate": 1.4800465748791428e-05, "loss": 1.429, "step": 3070 }, { "epoch": 0.08245877061469266, "grad_norm": 6.354944229125977, "learning_rate": 1.4667330196005485e-05, "loss": 1.4254, "step": 3080 }, { "epoch": 0.08272649389590919, "grad_norm": 6.185739517211914, "learning_rate": 1.4534496060555075e-05, "loss": 1.3998, "step": 3090 }, { "epoch": 0.08299421717712573, "grad_norm": 5.781863212585449, "learning_rate": 1.4401968802731235e-05, "loss": 1.3384, "step": 3100 }, { "epoch": 0.08326194045834226, "grad_norm": 6.628792762756348, "learning_rate": 1.4269753870210459e-05, "loss": 1.4146, "step": 3110 }, { "epoch": 0.0835296637395588, "grad_norm": 6.093694686889648, "learning_rate": 1.4137856697830786e-05, "loss": 1.3662, "step": 3120 }, { "epoch": 0.08379738702077533, "grad_norm": 6.078185558319092, "learning_rate": 1.4006282707368348e-05, "loss": 1.3716, "step": 3130 }, { "epoch": 0.08406511030199186, "grad_norm": 6.203483581542969, "learning_rate": 1.3875037307314563e-05, "loss": 1.3371, "step": 3140 }, { "epoch": 0.0843328335832084, "grad_norm": 5.880634307861328, "learning_rate": 1.374412589265377e-05, "loss": 1.3464, "step": 3150 }, { "epoch": 0.0843328335832084, "eval_loss": 1.3659946918487549, "eval_runtime": 77.0452, "eval_samples_per_second": 6.49, "eval_steps_per_second": 6.49, "step": 3150 }, { "epoch": 0.08460055686442493, "grad_norm": 6.3485426902771, "learning_rate": 1.3613553844641483e-05, "loss": 1.3366, "step": 3160 }, { "epoch": 0.08486828014564146, "grad_norm": 6.721098899841309, "learning_rate": 1.3483326530583184e-05, "loss": 1.3628, "step": 3170 }, { "epoch": 0.085136003426858, "grad_norm": 5.912144660949707, "learning_rate": 1.3353449303613682e-05, "loss": 1.3403, "step": 3180 }, { "epoch": 0.08540372670807453, "grad_norm": 5.860577583312988, "learning_rate": 1.3223927502477084e-05, "loss": 1.3453, "step": 3190 }, { "epoch": 0.08567144998929106, "grad_norm": 6.3982977867126465, "learning_rate": 1.3094766451307336e-05, "loss": 1.3556, "step": 3200 }, { "epoch": 0.08593917327050761, "grad_norm": 6.073590278625488, "learning_rate": 1.2965971459409366e-05, "loss": 1.3984, "step": 3210 }, { "epoch": 0.08620689655172414, "grad_norm": 6.372732162475586, "learning_rate": 1.2837547821040825e-05, "loss": 1.4089, "step": 3220 }, { "epoch": 0.08647461983294068, "grad_norm": 6.449525356292725, "learning_rate": 1.2709500815194487e-05, "loss": 1.3884, "step": 3230 }, { "epoch": 0.08674234311415721, "grad_norm": 5.904713153839111, "learning_rate": 1.2581835705381243e-05, "loss": 1.3976, "step": 3240 }, { "epoch": 0.08701006639537374, "grad_norm": 6.398531913757324, "learning_rate": 1.2454557739413722e-05, "loss": 1.3942, "step": 3250 }, { "epoch": 0.08727778967659028, "grad_norm": 6.1607465744018555, "learning_rate": 1.2327672149190595e-05, "loss": 1.3698, "step": 3260 }, { "epoch": 0.08754551295780681, "grad_norm": 5.903096675872803, "learning_rate": 1.2201184150481497e-05, "loss": 1.4183, "step": 3270 }, { "epoch": 0.08781323623902335, "grad_norm": 6.210367679595947, "learning_rate": 1.2075098942712635e-05, "loss": 1.3717, "step": 3280 }, { "epoch": 0.08808095952023988, "grad_norm": 6.082081317901611, "learning_rate": 1.1949421708753062e-05, "loss": 1.3694, "step": 3290 }, { "epoch": 0.08834868280145641, "grad_norm": 5.826544284820557, "learning_rate": 1.1824157614701629e-05, "loss": 1.4473, "step": 3300 }, { "epoch": 0.08834868280145641, "eval_loss": 1.3619885444641113, "eval_runtime": 77.1061, "eval_samples_per_second": 6.485, "eval_steps_per_second": 6.485, "step": 3300 }, { "epoch": 0.08861640608267295, "grad_norm": 6.470825672149658, "learning_rate": 1.1699311809674596e-05, "loss": 1.357, "step": 3310 }, { "epoch": 0.08888412936388948, "grad_norm": 5.989506244659424, "learning_rate": 1.157488942559403e-05, "loss": 1.322, "step": 3320 }, { "epoch": 0.08915185264510601, "grad_norm": 6.708034992218018, "learning_rate": 1.1450895576976816e-05, "loss": 1.3652, "step": 3330 }, { "epoch": 0.08941957592632255, "grad_norm": 6.264359474182129, "learning_rate": 1.1327335360724412e-05, "loss": 1.3661, "step": 3340 }, { "epoch": 0.08968729920753908, "grad_norm": 6.633790969848633, "learning_rate": 1.1204213855913374e-05, "loss": 1.3522, "step": 3350 }, { "epoch": 0.08995502248875563, "grad_norm": 5.526124477386475, "learning_rate": 1.1081536123586505e-05, "loss": 1.3492, "step": 3360 }, { "epoch": 0.09022274576997216, "grad_norm": 6.267175197601318, "learning_rate": 1.09593072065449e-05, "loss": 1.3805, "step": 3370 }, { "epoch": 0.0904904690511887, "grad_norm": 6.826523780822754, "learning_rate": 1.0837532129140595e-05, "loss": 1.3379, "step": 3380 }, { "epoch": 0.09075819233240523, "grad_norm": 6.352426052093506, "learning_rate": 1.0716215897070067e-05, "loss": 1.378, "step": 3390 }, { "epoch": 0.09102591561362176, "grad_norm": 6.353774547576904, "learning_rate": 1.0595363497168449e-05, "loss": 1.4057, "step": 3400 }, { "epoch": 0.0912936388948383, "grad_norm": 6.023704528808594, "learning_rate": 1.0474979897204557e-05, "loss": 1.419, "step": 3410 }, { "epoch": 0.09156136217605483, "grad_norm": 6.525381565093994, "learning_rate": 1.0355070045676677e-05, "loss": 1.3737, "step": 3420 }, { "epoch": 0.09182908545727136, "grad_norm": 6.321014404296875, "learning_rate": 1.0235638871609145e-05, "loss": 1.3252, "step": 3430 }, { "epoch": 0.0920968087384879, "grad_norm": 5.880143165588379, "learning_rate": 1.011669128434976e-05, "loss": 1.3581, "step": 3440 }, { "epoch": 0.09236453201970443, "grad_norm": 6.851429462432861, "learning_rate": 9.99823217336793e-06, "loss": 1.4074, "step": 3450 }, { "epoch": 0.09236453201970443, "eval_loss": 1.3579777479171753, "eval_runtime": 76.8014, "eval_samples_per_second": 6.51, "eval_steps_per_second": 6.51, "step": 3450 }, { "epoch": 0.09263225530092097, "grad_norm": 6.41058874130249, "learning_rate": 9.880266408053746e-06, "loss": 1.433, "step": 3460 }, { "epoch": 0.0928999785821375, "grad_norm": 5.9317474365234375, "learning_rate": 9.762798837517776e-06, "loss": 1.3759, "step": 3470 }, { "epoch": 0.09316770186335403, "grad_norm": 5.728269100189209, "learning_rate": 9.645834290391754e-06, "loss": 1.4632, "step": 3480 }, { "epoch": 0.09343542514457057, "grad_norm": 5.710354328155518, "learning_rate": 9.529377574630109e-06, "loss": 1.422, "step": 3490 }, { "epoch": 0.0937031484257871, "grad_norm": 6.150035381317139, "learning_rate": 9.413433477312272e-06, "loss": 1.4113, "step": 3500 }, { "epoch": 0.09397087170700365, "grad_norm": 6.171891689300537, "learning_rate": 9.298006764445976e-06, "loss": 1.4115, "step": 3510 }, { "epoch": 0.09423859498822018, "grad_norm": 6.584611415863037, "learning_rate": 9.183102180771285e-06, "loss": 1.3631, "step": 3520 }, { "epoch": 0.09450631826943671, "grad_norm": 6.219729423522949, "learning_rate": 9.068724449565594e-06, "loss": 1.3497, "step": 3530 }, { "epoch": 0.09477404155065325, "grad_norm": 5.961699485778809, "learning_rate": 8.954878272449433e-06, "loss": 1.3476, "step": 3540 }, { "epoch": 0.09504176483186978, "grad_norm": 6.4813385009765625, "learning_rate": 8.841568329193249e-06, "loss": 1.3281, "step": 3550 }, { "epoch": 0.09530948811308632, "grad_norm": 5.715578079223633, "learning_rate": 8.728799277524998e-06, "loss": 1.3114, "step": 3560 }, { "epoch": 0.09557721139430285, "grad_norm": 5.67549467086792, "learning_rate": 8.61657575293871e-06, "loss": 1.3119, "step": 3570 }, { "epoch": 0.09584493467551938, "grad_norm": 6.634474277496338, "learning_rate": 8.50490236850394e-06, "loss": 1.3587, "step": 3580 }, { "epoch": 0.09611265795673592, "grad_norm": 5.7471537590026855, "learning_rate": 8.393783714676107e-06, "loss": 1.3607, "step": 3590 }, { "epoch": 0.09638038123795245, "grad_norm": 5.866701602935791, "learning_rate": 8.283224359107863e-06, "loss": 1.3247, "step": 3600 }, { "epoch": 0.09638038123795245, "eval_loss": 1.3553622961044312, "eval_runtime": 76.9249, "eval_samples_per_second": 6.5, "eval_steps_per_second": 6.5, "step": 3600 }, { "epoch": 0.09664810451916898, "grad_norm": 5.779167652130127, "learning_rate": 8.17322884646128e-06, "loss": 1.375, "step": 3610 }, { "epoch": 0.09691582780038552, "grad_norm": 6.503204345703125, "learning_rate": 8.06380169822107e-06, "loss": 1.3767, "step": 3620 }, { "epoch": 0.09718355108160205, "grad_norm": 5.67221212387085, "learning_rate": 7.95494741250868e-06, "loss": 1.2996, "step": 3630 }, { "epoch": 0.09745127436281859, "grad_norm": 6.475659370422363, "learning_rate": 7.846670463897457e-06, "loss": 1.3827, "step": 3640 }, { "epoch": 0.09771899764403512, "grad_norm": 6.146843910217285, "learning_rate": 7.738975303228659e-06, "loss": 1.3489, "step": 3650 }, { "epoch": 0.09798672092525167, "grad_norm": 6.763230323791504, "learning_rate": 7.631866357428526e-06, "loss": 1.3631, "step": 3660 }, { "epoch": 0.0982544442064682, "grad_norm": 6.853928565979004, "learning_rate": 7.525348029326323e-06, "loss": 1.3683, "step": 3670 }, { "epoch": 0.09852216748768473, "grad_norm": 6.183257102966309, "learning_rate": 7.4194246974732955e-06, "loss": 1.3744, "step": 3680 }, { "epoch": 0.09878989076890127, "grad_norm": 6.155274391174316, "learning_rate": 7.314100715962744e-06, "loss": 1.389, "step": 3690 }, { "epoch": 0.0990576140501178, "grad_norm": 6.754117012023926, "learning_rate": 7.209380414251028e-06, "loss": 1.3267, "step": 3700 }, { "epoch": 0.09932533733133433, "grad_norm": 6.333691596984863, "learning_rate": 7.105268096979596e-06, "loss": 1.3774, "step": 3710 }, { "epoch": 0.09959306061255087, "grad_norm": 6.452340602874756, "learning_rate": 7.001768043798013e-06, "loss": 1.3038, "step": 3720 }, { "epoch": 0.0998607838937674, "grad_norm": 5.832094192504883, "learning_rate": 6.898884509188095e-06, "loss": 1.3978, "step": 3730 }, { "epoch": 0.10012850717498394, "grad_norm": 5.7019476890563965, "learning_rate": 6.796621722288977e-06, "loss": 1.358, "step": 3740 }, { "epoch": 0.10039623045620047, "grad_norm": 5.743053913116455, "learning_rate": 6.6949838867233e-06, "loss": 1.3567, "step": 3750 }, { "epoch": 0.10039623045620047, "eval_loss": 1.3537319898605347, "eval_runtime": 76.8454, "eval_samples_per_second": 6.507, "eval_steps_per_second": 6.507, "step": 3750 }, { "epoch": 0.100663953737417, "grad_norm": 6.490472793579102, "learning_rate": 6.5939751804243974e-06, "loss": 1.361, "step": 3760 }, { "epoch": 0.10093167701863354, "grad_norm": 6.32999324798584, "learning_rate": 6.493599755464546e-06, "loss": 1.2968, "step": 3770 }, { "epoch": 0.10119940029985007, "grad_norm": 6.559702396392822, "learning_rate": 6.3938617378843264e-06, "loss": 1.4176, "step": 3780 }, { "epoch": 0.1014671235810666, "grad_norm": 5.832455158233643, "learning_rate": 6.294765227523008e-06, "loss": 1.3828, "step": 3790 }, { "epoch": 0.10173484686228314, "grad_norm": 6.728024005889893, "learning_rate": 6.196314297849995e-06, "loss": 1.3902, "step": 3800 }, { "epoch": 0.10200257014349969, "grad_norm": 6.092176914215088, "learning_rate": 6.098512995797388e-06, "loss": 1.3587, "step": 3810 }, { "epoch": 0.10227029342471622, "grad_norm": 6.502336025238037, "learning_rate": 6.0013653415936585e-06, "loss": 1.3619, "step": 3820 }, { "epoch": 0.10253801670593275, "grad_norm": 6.602701187133789, "learning_rate": 5.90487532859837e-06, "loss": 1.3325, "step": 3830 }, { "epoch": 0.10280573998714929, "grad_norm": 6.637482166290283, "learning_rate": 5.809046923138031e-06, "loss": 1.3899, "step": 3840 }, { "epoch": 0.10307346326836582, "grad_norm": 5.880363941192627, "learning_rate": 5.713884064343061e-06, "loss": 1.3481, "step": 3850 }, { "epoch": 0.10334118654958235, "grad_norm": 7.036133289337158, "learning_rate": 5.6193906639858486e-06, "loss": 1.3156, "step": 3860 }, { "epoch": 0.10360890983079889, "grad_norm": 5.999964714050293, "learning_rate": 5.52557060631998e-06, "loss": 1.3756, "step": 3870 }, { "epoch": 0.10387663311201542, "grad_norm": 5.966408729553223, "learning_rate": 5.432427747920561e-06, "loss": 1.3588, "step": 3880 }, { "epoch": 0.10414435639323195, "grad_norm": 5.987645626068115, "learning_rate": 5.339965917525687e-06, "loss": 1.427, "step": 3890 }, { "epoch": 0.10441207967444849, "grad_norm": 5.433709621429443, "learning_rate": 5.248188915879043e-06, "loss": 1.3687, "step": 3900 }, { "epoch": 0.10441207967444849, "eval_loss": 1.350784420967102, "eval_runtime": 76.8402, "eval_samples_per_second": 6.507, "eval_steps_per_second": 6.507, "step": 3900 }, { "epoch": 0.10467980295566502, "grad_norm": 6.524111270904541, "learning_rate": 5.157100515573715e-06, "loss": 1.3006, "step": 3910 }, { "epoch": 0.10494752623688156, "grad_norm": 5.474837303161621, "learning_rate": 5.066704460897067e-06, "loss": 1.3463, "step": 3920 }, { "epoch": 0.10521524951809809, "grad_norm": 5.868412494659424, "learning_rate": 4.977004467676848e-06, "loss": 1.2881, "step": 3930 }, { "epoch": 0.10548297279931462, "grad_norm": 5.966287136077881, "learning_rate": 4.888004223128458e-06, "loss": 1.3636, "step": 3940 }, { "epoch": 0.10575069608053116, "grad_norm": 5.976463794708252, "learning_rate": 4.799707385703344e-06, "loss": 1.3411, "step": 3950 }, { "epoch": 0.1060184193617477, "grad_norm": 5.5595197677612305, "learning_rate": 4.712117584938669e-06, "loss": 1.3114, "step": 3960 }, { "epoch": 0.10628614264296424, "grad_norm": 5.7463483810424805, "learning_rate": 4.625238421308069e-06, "loss": 1.3472, "step": 3970 }, { "epoch": 0.10655386592418077, "grad_norm": 6.120302200317383, "learning_rate": 4.5390734660736906e-06, "loss": 1.4384, "step": 3980 }, { "epoch": 0.1068215892053973, "grad_norm": 6.155236721038818, "learning_rate": 4.453626261139344e-06, "loss": 1.3494, "step": 3990 }, { "epoch": 0.10708931248661384, "grad_norm": 6.032073974609375, "learning_rate": 4.368900318904957e-06, "loss": 1.3464, "step": 4000 }, { "epoch": 0.10735703576783037, "grad_norm": 6.827203750610352, "learning_rate": 4.284899122122165e-06, "loss": 1.3534, "step": 4010 }, { "epoch": 0.1076247590490469, "grad_norm": 5.927024841308594, "learning_rate": 4.201626123751159e-06, "loss": 1.333, "step": 4020 }, { "epoch": 0.10789248233026344, "grad_norm": 5.960188865661621, "learning_rate": 4.1190847468187425e-06, "loss": 1.3458, "step": 4030 }, { "epoch": 0.10816020561147997, "grad_norm": 6.299499034881592, "learning_rate": 4.037278384277628e-06, "loss": 1.3516, "step": 4040 }, { "epoch": 0.1084279288926965, "grad_norm": 6.968238353729248, "learning_rate": 3.956210398866969e-06, "loss": 1.369, "step": 4050 }, { "epoch": 0.1084279288926965, "eval_loss": 1.348792552947998, "eval_runtime": 76.8298, "eval_samples_per_second": 6.508, "eval_steps_per_second": 6.508, "step": 4050 }, { "epoch": 0.10869565217391304, "grad_norm": 6.412740707397461, "learning_rate": 3.875884122974123e-06, "loss": 1.3756, "step": 4060 }, { "epoch": 0.10896337545512957, "grad_norm": 6.571822643280029, "learning_rate": 3.7963028584976805e-06, "loss": 1.3773, "step": 4070 }, { "epoch": 0.10923109873634611, "grad_norm": 6.47897481918335, "learning_rate": 3.717469876711713e-06, "loss": 1.3746, "step": 4080 }, { "epoch": 0.10949882201756264, "grad_norm": 6.563449382781982, "learning_rate": 3.6393884181313417e-06, "loss": 1.382, "step": 4090 }, { "epoch": 0.10976654529877918, "grad_norm": 6.455676078796387, "learning_rate": 3.562061692379507e-06, "loss": 1.3519, "step": 4100 }, { "epoch": 0.11003426857999572, "grad_norm": 5.957856178283691, "learning_rate": 3.4854928780550306e-06, "loss": 1.3711, "step": 4110 }, { "epoch": 0.11030199186121226, "grad_norm": 6.082734107971191, "learning_rate": 3.409685122601979e-06, "loss": 1.3038, "step": 4120 }, { "epoch": 0.11056971514242879, "grad_norm": 5.809603691101074, "learning_rate": 3.3346415421802494e-06, "loss": 1.3587, "step": 4130 }, { "epoch": 0.11083743842364532, "grad_norm": 6.081882476806641, "learning_rate": 3.26036522153751e-06, "loss": 1.3672, "step": 4140 }, { "epoch": 0.11110516170486186, "grad_norm": 5.788993835449219, "learning_rate": 3.186859213882386e-06, "loss": 1.3615, "step": 4150 }, { "epoch": 0.11137288498607839, "grad_norm": 5.722326755523682, "learning_rate": 3.114126540758946e-06, "loss": 1.2914, "step": 4160 }, { "epoch": 0.11164060826729492, "grad_norm": 6.233955383300781, "learning_rate": 3.042170191922509e-06, "loss": 1.3286, "step": 4170 }, { "epoch": 0.11190833154851146, "grad_norm": 6.276589393615723, "learning_rate": 2.9709931252167426e-06, "loss": 1.3943, "step": 4180 }, { "epoch": 0.11217605482972799, "grad_norm": 6.818645000457764, "learning_rate": 2.9005982664520734e-06, "loss": 1.3535, "step": 4190 }, { "epoch": 0.11244377811094453, "grad_norm": 6.53585147857666, "learning_rate": 2.830988509285433e-06, "loss": 1.3412, "step": 4200 }, { "epoch": 0.11244377811094453, "eval_loss": 1.3470451831817627, "eval_runtime": 76.7654, "eval_samples_per_second": 6.513, "eval_steps_per_second": 6.513, "step": 4200 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.521351998649088e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }