{
  "best_metric": 1.3470451831817627,
  "best_model_checkpoint": "./output/checkpoint-4200",
  "epoch": 0.11244377811094453,
  "eval_steps": 150,
  "global_step": 4200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002677232812165346,
      "grad_norm": 12.16030502319336,
      "learning_rate": 4.4e-06,
      "loss": 1.5442,
      "step": 10
    },
    {
      "epoch": 0.0005354465624330692,
      "grad_norm": 12.028721809387207,
      "learning_rate": 8.8e-06,
      "loss": 1.5341,
      "step": 20
    },
    {
      "epoch": 0.0008031698436496038,
      "grad_norm": 9.741438865661621,
      "learning_rate": 1.3199999999999999e-05,
      "loss": 1.5258,
      "step": 30
    },
    {
      "epoch": 0.0010708931248661383,
      "grad_norm": 11.791377067565918,
      "learning_rate": 1.76e-05,
      "loss": 1.4766,
      "step": 40
    },
    {
      "epoch": 0.0013386164060826729,
      "grad_norm": 10.31489372253418,
      "learning_rate": 2.2e-05,
      "loss": 1.4898,
      "step": 50
    },
    {
      "epoch": 0.0016063396872992076,
      "grad_norm": 11.65046501159668,
      "learning_rate": 2.6399999999999998e-05,
      "loss": 1.4536,
      "step": 60
    },
    {
      "epoch": 0.0018740629685157421,
      "grad_norm": 11.001107215881348,
      "learning_rate": 3.0799999999999996e-05,
      "loss": 1.4933,
      "step": 70
    },
    {
      "epoch": 0.0021417862497322766,
      "grad_norm": 10.670427322387695,
      "learning_rate": 3.52e-05,
      "loss": 1.4816,
      "step": 80
    },
    {
      "epoch": 0.002409509530948811,
      "grad_norm": 11.35387134552002,
      "learning_rate": 3.96e-05,
      "loss": 1.4636,
      "step": 90
    },
    {
      "epoch": 0.0026772328121653457,
      "grad_norm": 10.275943756103516,
      "learning_rate": 4.4e-05,
      "loss": 1.4996,
      "step": 100
    },
    {
      "epoch": 0.0029449560933818807,
      "grad_norm": 9.64588451385498,
      "learning_rate": 4.399954783308405e-05,
      "loss": 1.5114,
      "step": 110
    },
    {
      "epoch": 0.003212679374598415,
      "grad_norm": 9.566597938537598,
      "learning_rate": 4.399819135092302e-05,
      "loss": 1.542,
      "step": 120
    },
    {
      "epoch": 0.0034804026558149497,
      "grad_norm": 10.304100036621094,
      "learning_rate": 4.399593060927658e-05,
      "loss": 1.4911,
      "step": 130
    },
    {
      "epoch": 0.0037481259370314842,
      "grad_norm": 8.622906684875488,
      "learning_rate": 4.3992765701074955e-05,
      "loss": 1.4655,
      "step": 140
    },
    {
      "epoch": 0.004015849218248019,
      "grad_norm": 9.855925559997559,
      "learning_rate": 4.398869675641513e-05,
      "loss": 1.5424,
      "step": 150
    },
    {
      "epoch": 0.004015849218248019,
      "eval_loss": 1.5061633586883545,
      "eval_runtime": 76.7585,
      "eval_samples_per_second": 6.514,
      "eval_steps_per_second": 6.514,
      "step": 150
    },
    {
      "epoch": 0.004283572499464553,
      "grad_norm": 9.257708549499512,
      "learning_rate": 4.398372394255549e-05,
      "loss": 1.4863,
      "step": 160
    },
    {
      "epoch": 0.004551295780681088,
      "grad_norm": 7.029283046722412,
      "learning_rate": 4.397784746390892e-05,
      "loss": 1.504,
      "step": 170
    },
    {
      "epoch": 0.004819019061897622,
      "grad_norm": 8.059552192687988,
      "learning_rate": 4.3971067562034454e-05,
      "loss": 1.4734,
      "step": 180
    },
    {
      "epoch": 0.005086742343114157,
      "grad_norm": 8.756830215454102,
      "learning_rate": 4.39633845156273e-05,
      "loss": 1.5308,
      "step": 190
    },
    {
      "epoch": 0.005354465624330691,
      "grad_norm": 8.60611629486084,
      "learning_rate": 4.39547986405074e-05,
      "loss": 1.4985,
      "step": 200
    },
    {
      "epoch": 0.005622188905547227,
      "grad_norm": 8.613848686218262,
      "learning_rate": 4.3945310289606455e-05,
      "loss": 1.493,
      "step": 210
    },
    {
      "epoch": 0.005889912186763761,
      "grad_norm": 7.788877010345459,
      "learning_rate": 4.39349198529534e-05,
      "loss": 1.492,
      "step": 220
    },
    {
      "epoch": 0.006157635467980296,
      "grad_norm": 7.323453903198242,
      "learning_rate": 4.39236277576584e-05,
      "loss": 1.5138,
      "step": 230
    },
    {
      "epoch": 0.00642535874919683,
      "grad_norm": 7.350474834442139,
      "learning_rate": 4.391143446789526e-05,
      "loss": 1.5314,
      "step": 240
    },
    {
      "epoch": 0.006693082030413365,
      "grad_norm": 7.383894443511963,
      "learning_rate": 4.389834048488236e-05,
      "loss": 1.5231,
      "step": 250
    },
    {
      "epoch": 0.006960805311629899,
      "grad_norm": 7.436572551727295,
      "learning_rate": 4.388434634686206e-05,
      "loss": 1.5264,
      "step": 260
    },
    {
      "epoch": 0.007228528592846434,
      "grad_norm": 7.9078779220581055,
      "learning_rate": 4.386945262907856e-05,
      "loss": 1.4744,
      "step": 270
    },
    {
      "epoch": 0.0074962518740629685,
      "grad_norm": 7.032011985778809,
      "learning_rate": 4.3853659943754275e-05,
      "loss": 1.4156,
      "step": 280
    },
    {
      "epoch": 0.007763975155279503,
      "grad_norm": 7.743986129760742,
      "learning_rate": 4.383696894006463e-05,
      "loss": 1.5338,
      "step": 290
    },
    {
      "epoch": 0.008031698436496038,
      "grad_norm": 7.996994495391846,
      "learning_rate": 4.381938030411141e-05,
      "loss": 1.4912,
      "step": 300
    },
    {
      "epoch": 0.008031698436496038,
      "eval_loss": 1.5058677196502686,
      "eval_runtime": 76.63,
      "eval_samples_per_second": 6.525,
      "eval_steps_per_second": 6.525,
      "step": 300
    },
    {
      "epoch": 0.008299421717712573,
      "grad_norm": 6.838953971862793,
      "learning_rate": 4.380089475889457e-05,
      "loss": 1.4854,
      "step": 310
    },
    {
      "epoch": 0.008567144998929107,
      "grad_norm": 7.260242462158203,
      "learning_rate": 4.378151306428244e-05,
      "loss": 1.5401,
      "step": 320
    },
    {
      "epoch": 0.008834868280145642,
      "grad_norm": 6.737756729125977,
      "learning_rate": 4.3761236016980594e-05,
      "loss": 1.5013,
      "step": 330
    },
    {
      "epoch": 0.009102591561362176,
      "grad_norm": 6.963688373565674,
      "learning_rate": 4.3740064450499026e-05,
      "loss": 1.4989,
      "step": 340
    },
    {
      "epoch": 0.009370314842578711,
      "grad_norm": 7.6938557624816895,
      "learning_rate": 4.37179992351179e-05,
      "loss": 1.5317,
      "step": 350
    },
    {
      "epoch": 0.009638038123795245,
      "grad_norm": 6.700031757354736,
      "learning_rate": 4.3695041277851804e-05,
      "loss": 1.405,
      "step": 360
    },
    {
      "epoch": 0.00990576140501178,
      "grad_norm": 6.925078868865967,
      "learning_rate": 4.367119152241245e-05,
      "loss": 1.4966,
      "step": 370
    },
    {
      "epoch": 0.010173484686228314,
      "grad_norm": 6.87849235534668,
      "learning_rate": 4.364645094916985e-05,
      "loss": 1.4933,
      "step": 380
    },
    {
      "epoch": 0.01044120796744485,
      "grad_norm": 8.074585914611816,
      "learning_rate": 4.3620820575112083e-05,
      "loss": 1.4782,
      "step": 390
    },
    {
      "epoch": 0.010708931248661383,
      "grad_norm": 7.07144832611084,
      "learning_rate": 4.359430145380344e-05,
      "loss": 1.4871,
      "step": 400
    },
    {
      "epoch": 0.010976654529877918,
      "grad_norm": 7.2088117599487305,
      "learning_rate": 4.356689467534112e-05,
      "loss": 1.4855,
      "step": 410
    },
    {
      "epoch": 0.011244377811094454,
      "grad_norm": 7.868666648864746,
      "learning_rate": 4.353860136631044e-05,
      "loss": 1.5246,
      "step": 420
    },
    {
      "epoch": 0.011512101092310987,
      "grad_norm": 7.853616237640381,
      "learning_rate": 4.350942268973854e-05,
      "loss": 1.5302,
      "step": 430
    },
    {
      "epoch": 0.011779824373527523,
      "grad_norm": 7.353667259216309,
      "learning_rate": 4.347935984504649e-05,
      "loss": 1.4305,
      "step": 440
    },
    {
      "epoch": 0.012047547654744056,
      "grad_norm": 6.460302352905273,
      "learning_rate": 4.344841406800012e-05,
      "loss": 1.4506,
      "step": 450
    },
    {
      "epoch": 0.012047547654744056,
      "eval_loss": 1.4909805059432983,
      "eval_runtime": 76.606,
      "eval_samples_per_second": 6.527,
      "eval_steps_per_second": 6.527,
      "step": 450
    },
    {
      "epoch": 0.012315270935960592,
      "grad_norm": 7.3225226402282715,
      "learning_rate": 4.34165866306591e-05,
      "loss": 1.461,
      "step": 460
    },
    {
      "epoch": 0.012582994217177125,
      "grad_norm": 6.5725297927856445,
      "learning_rate": 4.3383878841324734e-05,
      "loss": 1.4007,
      "step": 470
    },
    {
      "epoch": 0.01285071749839366,
      "grad_norm": 7.1205315589904785,
      "learning_rate": 4.3350292044486125e-05,
      "loss": 1.557,
      "step": 480
    },
    {
      "epoch": 0.013118440779610194,
      "grad_norm": 6.783862113952637,
      "learning_rate": 4.331582762076494e-05,
      "loss": 1.5214,
      "step": 490
    },
    {
      "epoch": 0.01338616406082673,
      "grad_norm": 6.475296974182129,
      "learning_rate": 4.328048698685865e-05,
      "loss": 1.4874,
      "step": 500
    },
    {
      "epoch": 0.013653887342043263,
      "grad_norm": 7.158287525177002,
      "learning_rate": 4.32442715954823e-05,
      "loss": 1.4794,
      "step": 510
    },
    {
      "epoch": 0.013921610623259799,
      "grad_norm": 5.706000804901123,
      "learning_rate": 4.320718293530877e-05,
      "loss": 1.4921,
      "step": 520
    },
    {
      "epoch": 0.014189333904476333,
      "grad_norm": 7.483499050140381,
      "learning_rate": 4.3169222530907634e-05,
      "loss": 1.4899,
      "step": 530
    },
    {
      "epoch": 0.014457057185692868,
      "grad_norm": 6.9520182609558105,
      "learning_rate": 4.313039194268243e-05,
      "loss": 1.4908,
      "step": 540
    },
    {
      "epoch": 0.014724780466909402,
      "grad_norm": 6.7435784339904785,
      "learning_rate": 4.309069276680653e-05,
      "loss": 1.45,
      "step": 550
    },
    {
      "epoch": 0.014992503748125937,
      "grad_norm": 7.162035942077637,
      "learning_rate": 4.305012663515759e-05,
      "loss": 1.4702,
      "step": 560
    },
    {
      "epoch": 0.015260227029342472,
      "grad_norm": 6.717561721801758,
      "learning_rate": 4.300869521525039e-05,
      "loss": 1.5131,
      "step": 570
    },
    {
      "epoch": 0.015527950310559006,
      "grad_norm": 6.205082893371582,
      "learning_rate": 4.296640021016832e-05,
      "loss": 1.4342,
      "step": 580
    },
    {
      "epoch": 0.01579567359177554,
      "grad_norm": 6.673497676849365,
      "learning_rate": 4.292324335849338e-05,
      "loss": 1.4917,
      "step": 590
    },
    {
      "epoch": 0.016063396872992075,
      "grad_norm": 6.291605472564697,
      "learning_rate": 4.287922643423471e-05,
      "loss": 1.5018,
      "step": 600
    },
    {
      "epoch": 0.016063396872992075,
      "eval_loss": 1.476717472076416,
      "eval_runtime": 76.7435,
      "eval_samples_per_second": 6.515,
      "eval_steps_per_second": 6.515,
      "step": 600
    },
    {
      "epoch": 0.01633112015420861,
      "grad_norm": 6.723529815673828,
      "learning_rate": 4.283435124675567e-05,
      "loss": 1.4652,
      "step": 610
    },
    {
      "epoch": 0.016598843435425146,
      "grad_norm": 6.2483062744140625,
      "learning_rate": 4.278861964069944e-05,
      "loss": 1.5094,
      "step": 620
    },
    {
      "epoch": 0.01686656671664168,
      "grad_norm": 6.773886203765869,
      "learning_rate": 4.274203349591324e-05,
      "loss": 1.4771,
      "step": 630
    },
    {
      "epoch": 0.017134289997858213,
      "grad_norm": 7.28003454208374,
      "learning_rate": 4.269459472737102e-05,
      "loss": 1.436,
      "step": 640
    },
    {
      "epoch": 0.017402013279074747,
      "grad_norm": 6.813667297363281,
      "learning_rate": 4.264630528509473e-05,
      "loss": 1.4094,
      "step": 650
    },
    {
      "epoch": 0.017669736560291284,
      "grad_norm": 7.017402172088623,
      "learning_rate": 4.259716715407422e-05,
      "loss": 1.5255,
      "step": 660
    },
    {
      "epoch": 0.017937459841507818,
      "grad_norm": 7.313465595245361,
      "learning_rate": 4.254718235418559e-05,
      "loss": 1.4647,
      "step": 670
    },
    {
      "epoch": 0.01820518312272435,
      "grad_norm": 6.323640823364258,
      "learning_rate": 4.249635294010819e-05,
      "loss": 1.4799,
      "step": 680
    },
    {
      "epoch": 0.01847290640394089,
      "grad_norm": 7.1620588302612305,
      "learning_rate": 4.244468100124014e-05,
      "loss": 1.4344,
      "step": 690
    },
    {
      "epoch": 0.018740629685157422,
      "grad_norm": 6.160943508148193,
      "learning_rate": 4.239216866161248e-05,
      "loss": 1.516,
      "step": 700
    },
    {
      "epoch": 0.019008352966373956,
      "grad_norm": 6.571516036987305,
      "learning_rate": 4.233881807980179e-05,
      "loss": 1.5133,
      "step": 710
    },
    {
      "epoch": 0.01927607624759049,
      "grad_norm": 5.696547031402588,
      "learning_rate": 4.228463144884155e-05,
      "loss": 1.4318,
      "step": 720
    },
    {
      "epoch": 0.019543799528807027,
      "grad_norm": 6.653096675872803,
      "learning_rate": 4.2229610996131915e-05,
      "loss": 1.461,
      "step": 730
    },
    {
      "epoch": 0.01981152281002356,
      "grad_norm": 6.497095584869385,
      "learning_rate": 4.217375898334819e-05,
      "loss": 1.4359,
      "step": 740
    },
    {
      "epoch": 0.020079246091240094,
      "grad_norm": 6.566861629486084,
      "learning_rate": 4.211707770634788e-05,
      "loss": 1.445,
      "step": 750
    },
    {
      "epoch": 0.020079246091240094,
      "eval_loss": 1.4696097373962402,
      "eval_runtime": 76.6203,
      "eval_samples_per_second": 6.526,
      "eval_steps_per_second": 6.526,
      "step": 750
    },
    {
      "epoch": 0.020346969372456628,
      "grad_norm": 6.802274703979492,
      "learning_rate": 4.205956949507625e-05,
      "loss": 1.4485,
      "step": 760
    },
    {
      "epoch": 0.020614692653673165,
      "grad_norm": 6.832027912139893,
      "learning_rate": 4.200123671347065e-05,
      "loss": 1.5034,
      "step": 770
    },
    {
      "epoch": 0.0208824159348897,
      "grad_norm": 7.041072368621826,
      "learning_rate": 4.1942081759363236e-05,
      "loss": 1.5225,
      "step": 780
    },
    {
      "epoch": 0.021150139216106232,
      "grad_norm": 6.773481369018555,
      "learning_rate": 4.1882107064382496e-05,
      "loss": 1.4718,
      "step": 790
    },
    {
      "epoch": 0.021417862497322766,
      "grad_norm": 6.902709007263184,
      "learning_rate": 4.1821315093853216e-05,
      "loss": 1.4562,
      "step": 800
    },
    {
      "epoch": 0.021685585778539303,
      "grad_norm": 5.957550048828125,
      "learning_rate": 4.1759708346695215e-05,
      "loss": 1.4798,
      "step": 810
    },
    {
      "epoch": 0.021953309059755836,
      "grad_norm": 6.139000415802002,
      "learning_rate": 4.1697289355320565e-05,
      "loss": 1.5084,
      "step": 820
    },
    {
      "epoch": 0.02222103234097237,
      "grad_norm": 6.227492332458496,
      "learning_rate": 4.1634060685529527e-05,
      "loss": 1.4597,
      "step": 830
    },
    {
      "epoch": 0.022488755622188907,
      "grad_norm": 6.531513214111328,
      "learning_rate": 4.157002493640506e-05,
      "loss": 1.4326,
      "step": 840
    },
    {
      "epoch": 0.02275647890340544,
      "grad_norm": 6.589105606079102,
      "learning_rate": 4.1505184740206006e-05,
      "loss": 1.431,
      "step": 850
    },
    {
      "epoch": 0.023024202184621975,
      "grad_norm": 6.289806842803955,
      "learning_rate": 4.143954276225886e-05,
      "loss": 1.5167,
      "step": 860
    },
    {
      "epoch": 0.023291925465838508,
      "grad_norm": 6.444394111633301,
      "learning_rate": 4.1373101700848235e-05,
      "loss": 1.4948,
      "step": 870
    },
    {
      "epoch": 0.023559648747055045,
      "grad_norm": 6.666561126708984,
      "learning_rate": 4.1305864287105946e-05,
      "loss": 1.4879,
      "step": 880
    },
    {
      "epoch": 0.02382737202827158,
      "grad_norm": 6.169593811035156,
      "learning_rate": 4.12378332848987e-05,
      "loss": 1.5557,
      "step": 890
    },
    {
      "epoch": 0.024095095309488113,
      "grad_norm": 6.8753743171691895,
      "learning_rate": 4.116901149071457e-05,
      "loss": 1.4446,
      "step": 900
    },
    {
      "epoch": 0.024095095309488113,
      "eval_loss": 1.4628037214279175,
      "eval_runtime": 76.661,
      "eval_samples_per_second": 6.522,
      "eval_steps_per_second": 6.522,
      "step": 900
    },
    {
      "epoch": 0.024362818590704646,
      "grad_norm": 6.948112964630127,
      "learning_rate": 4.1099401733547925e-05,
      "loss": 1.4916,
      "step": 910
    },
    {
      "epoch": 0.024630541871921183,
      "grad_norm": 6.508751392364502,
      "learning_rate": 4.102900687478326e-05,
      "loss": 1.4659,
      "step": 920
    },
    {
      "epoch": 0.024898265153137717,
      "grad_norm": 6.732906818389893,
      "learning_rate": 4.095782980807749e-05,
      "loss": 1.4834,
      "step": 930
    },
    {
      "epoch": 0.02516598843435425,
      "grad_norm": 6.261349678039551,
      "learning_rate": 4.088587345924105e-05,
      "loss": 1.4585,
      "step": 940
    },
    {
      "epoch": 0.025433711715570784,
      "grad_norm": 5.926994323730469,
      "learning_rate": 4.081314078611762e-05,
      "loss": 1.4887,
      "step": 950
    },
    {
      "epoch": 0.02570143499678732,
      "grad_norm": 6.746396064758301,
      "learning_rate": 4.073963477846249e-05,
      "loss": 1.4561,
      "step": 960
    },
    {
      "epoch": 0.025969158278003855,
      "grad_norm": 6.503916263580322,
      "learning_rate": 4.066535845781975e-05,
      "loss": 1.5013,
      "step": 970
    },
    {
      "epoch": 0.02623688155922039,
      "grad_norm": 6.83921480178833,
      "learning_rate": 4.059031487739803e-05,
      "loss": 1.517,
      "step": 980
    },
    {
      "epoch": 0.026504604840436926,
      "grad_norm": 6.210860252380371,
      "learning_rate": 4.051450712194497e-05,
      "loss": 1.4849,
      "step": 990
    },
    {
      "epoch": 0.02677232812165346,
      "grad_norm": 6.381270885467529,
      "learning_rate": 4.043793830762049e-05,
      "loss": 1.4685,
      "step": 1000
    },
    {
      "epoch": 0.027040051402869993,
      "grad_norm": 6.763075351715088,
      "learning_rate": 4.036061158186866e-05,
      "loss": 1.5412,
      "step": 1010
    },
    {
      "epoch": 0.027307774684086527,
      "grad_norm": 6.492913722991943,
      "learning_rate": 4.028253012328828e-05,
      "loss": 1.4398,
      "step": 1020
    },
    {
      "epoch": 0.027575497965303064,
      "grad_norm": 6.383675575256348,
      "learning_rate": 4.0203697141502323e-05,
      "loss": 1.4514,
      "step": 1030
    },
    {
      "epoch": 0.027843221246519598,
      "grad_norm": 6.685030937194824,
      "learning_rate": 4.0124115877025874e-05,
      "loss": 1.4688,
      "step": 1040
    },
    {
      "epoch": 0.02811094452773613,
      "grad_norm": 6.681553840637207,
      "learning_rate": 4.004378960113303e-05,
      "loss": 1.4862,
      "step": 1050
    },
    {
      "epoch": 0.02811094452773613,
      "eval_loss": 1.4577986001968384,
      "eval_runtime": 76.6839,
      "eval_samples_per_second": 6.52,
      "eval_steps_per_second": 6.52,
      "step": 1050
    },
    {
      "epoch": 0.028378667808952665,
      "grad_norm": 6.192584037780762,
      "learning_rate": 3.996272161572237e-05,
      "loss": 1.4383,
      "step": 1060
    },
    {
      "epoch": 0.028646391090169202,
      "grad_norm": 6.383575439453125,
      "learning_rate": 3.988091525318126e-05,
      "loss": 1.4015,
      "step": 1070
    },
    {
      "epoch": 0.028914114371385736,
      "grad_norm": 7.042336940765381,
      "learning_rate": 3.979837387624884e-05,
      "loss": 1.468,
      "step": 1080
    },
    {
      "epoch": 0.02918183765260227,
      "grad_norm": 6.60390567779541,
      "learning_rate": 3.971510087787784e-05,
      "loss": 1.4932,
      "step": 1090
    },
    {
      "epoch": 0.029449560933818803,
      "grad_norm": 6.5373029708862305,
      "learning_rate": 3.9631099681095044e-05,
      "loss": 1.4381,
      "step": 1100
    },
    {
      "epoch": 0.02971728421503534,
      "grad_norm": 7.118472099304199,
      "learning_rate": 3.954637373886066e-05,
      "loss": 1.4057,
      "step": 1110
    },
    {
      "epoch": 0.029985007496251874,
      "grad_norm": 7.386786460876465,
      "learning_rate": 3.9460926533926315e-05,
      "loss": 1.4978,
      "step": 1120
    },
    {
      "epoch": 0.030252730777468408,
      "grad_norm": 6.418981552124023,
      "learning_rate": 3.937476157869193e-05,
      "loss": 1.4897,
      "step": 1130
    },
    {
      "epoch": 0.030520454058684945,
      "grad_norm": 5.941694259643555,
      "learning_rate": 3.9287882415061334e-05,
      "loss": 1.4381,
      "step": 1140
    },
    {
      "epoch": 0.03078817733990148,
      "grad_norm": 6.574525833129883,
      "learning_rate": 3.9200292614296655e-05,
      "loss": 1.4143,
      "step": 1150
    },
    {
      "epoch": 0.031055900621118012,
      "grad_norm": 6.349545478820801,
      "learning_rate": 3.911199577687154e-05,
      "loss": 1.3937,
      "step": 1160
    },
    {
      "epoch": 0.03132362390233455,
      "grad_norm": 6.873767375946045,
      "learning_rate": 3.902299553232315e-05,
      "loss": 1.4515,
      "step": 1170
    },
    {
      "epoch": 0.03159134718355108,
      "grad_norm": 6.043056964874268,
      "learning_rate": 3.893329553910293e-05,
      "loss": 1.5538,
      "step": 1180
    },
    {
      "epoch": 0.031859070464767617,
      "grad_norm": 6.418127059936523,
      "learning_rate": 3.884289948442628e-05,
      "loss": 1.4745,
      "step": 1190
    },
    {
      "epoch": 0.03212679374598415,
      "grad_norm": 6.102353096008301,
      "learning_rate": 3.875181108412096e-05,
      "loss": 1.4772,
      "step": 1200
    },
    {
      "epoch": 0.03212679374598415,
      "eval_loss": 1.449093222618103,
      "eval_runtime": 76.8079,
      "eval_samples_per_second": 6.51,
      "eval_steps_per_second": 6.51,
      "step": 1200
    },
    {
      "epoch": 0.032394517027200684,
      "grad_norm": 6.624678134918213,
      "learning_rate": 3.8660034082474316e-05,
      "loss": 1.4526,
      "step": 1210
    },
    {
      "epoch": 0.03266224030841722,
      "grad_norm": 6.601352214813232,
      "learning_rate": 3.856757225207944e-05,
      "loss": 1.5247,
      "step": 1220
    },
    {
      "epoch": 0.03292996358963375,
      "grad_norm": 6.580589771270752,
      "learning_rate": 3.847442939368002e-05,
      "loss": 1.4694,
      "step": 1230
    },
    {
      "epoch": 0.03319768687085029,
      "grad_norm": 5.504952430725098,
      "learning_rate": 3.8380609336014156e-05,
      "loss": 1.411,
      "step": 1240
    },
    {
      "epoch": 0.033465410152066825,
      "grad_norm": 6.643352508544922,
      "learning_rate": 3.828611593565694e-05,
      "loss": 1.4278,
      "step": 1250
    },
    {
      "epoch": 0.03373313343328336,
      "grad_norm": 6.425754070281982,
      "learning_rate": 3.819095307686197e-05,
      "loss": 1.4253,
      "step": 1260
    },
    {
      "epoch": 0.03400085671449989,
      "grad_norm": 6.0909504890441895,
      "learning_rate": 3.809512467140163e-05,
      "loss": 1.4681,
      "step": 1270
    },
    {
      "epoch": 0.034268579995716426,
      "grad_norm": 5.82462215423584,
      "learning_rate": 3.799863465840634e-05,
      "loss": 1.4275,
      "step": 1280
    },
    {
      "epoch": 0.03453630327693296,
      "grad_norm": 6.24934720993042,
      "learning_rate": 3.790148700420261e-05,
      "loss": 1.4313,
      "step": 1290
    },
    {
      "epoch": 0.034804026558149494,
      "grad_norm": 6.211289882659912,
      "learning_rate": 3.7803685702150006e-05,
      "loss": 1.4216,
      "step": 1300
    },
    {
      "epoch": 0.035071749839366034,
      "grad_norm": 5.66799259185791,
      "learning_rate": 3.7705234772476984e-05,
      "loss": 1.4799,
      "step": 1310
    },
    {
      "epoch": 0.03533947312058257,
      "grad_norm": 6.351191997528076,
      "learning_rate": 3.760613826211567e-05,
      "loss": 1.4281,
      "step": 1320
    },
    {
      "epoch": 0.0356071964017991,
      "grad_norm": 5.707607269287109,
      "learning_rate": 3.7506400244535455e-05,
      "loss": 1.463,
      "step": 1330
    },
    {
      "epoch": 0.035874919683015635,
      "grad_norm": 5.780487537384033,
      "learning_rate": 3.740602481957561e-05,
      "loss": 1.4731,
      "step": 1340
    },
    {
      "epoch": 0.03614264296423217,
      "grad_norm": 5.354376316070557,
      "learning_rate": 3.7305016113276704e-05,
      "loss": 1.4492,
      "step": 1350
    },
    {
      "epoch": 0.03614264296423217,
      "eval_loss": 1.444738745689392,
      "eval_runtime": 76.8239,
      "eval_samples_per_second": 6.508,
      "eval_steps_per_second": 6.508,
      "step": 1350
    },
    {
      "epoch": 0.0364103662454487,
      "grad_norm": 5.93463134765625,
      "learning_rate": 3.7203378277711024e-05,
      "loss": 1.4602,
      "step": 1360
    },
    {
      "epoch": 0.036678089526665236,
      "grad_norm": 5.922112464904785,
      "learning_rate": 3.710111549081191e-05,
      "loss": 1.4412,
      "step": 1370
    },
    {
      "epoch": 0.03694581280788178,
      "grad_norm": 6.667977809906006,
      "learning_rate": 3.699823195620199e-05,
      "loss": 1.4475,
      "step": 1380
    },
    {
      "epoch": 0.03721353608909831,
      "grad_norm": 6.021790504455566,
      "learning_rate": 3.689473190302041e-05,
      "loss": 1.4206,
      "step": 1390
    },
    {
      "epoch": 0.037481259370314844,
      "grad_norm": 6.152276039123535,
      "learning_rate": 3.679061958574897e-05,
      "loss": 1.4288,
      "step": 1400
    },
    {
      "epoch": 0.03774898265153138,
      "grad_norm": 5.695444583892822,
      "learning_rate": 3.668589928403726e-05,
      "loss": 1.4424,
      "step": 1410
    },
    {
      "epoch": 0.03801670593274791,
      "grad_norm": 6.346884727478027,
      "learning_rate": 3.6580575302526706e-05,
      "loss": 1.5001,
      "step": 1420
    },
    {
      "epoch": 0.038284429213964445,
      "grad_norm": 5.674633979797363,
      "learning_rate": 3.647465197067368e-05,
      "loss": 1.4796,
      "step": 1430
    },
    {
      "epoch": 0.03855215249518098,
      "grad_norm": 6.168262481689453,
      "learning_rate": 3.6368133642571464e-05,
      "loss": 1.4428,
      "step": 1440
    },
    {
      "epoch": 0.03881987577639751,
      "grad_norm": 6.4981369972229,
      "learning_rate": 3.6261024696771345e-05,
      "loss": 1.4281,
      "step": 1450
    },
    {
      "epoch": 0.03908759905761405,
      "grad_norm": 5.703588962554932,
      "learning_rate": 3.615332953610255e-05,
      "loss": 1.3934,
      "step": 1460
    },
    {
      "epoch": 0.03935532233883059,
      "grad_norm": 5.69433069229126,
      "learning_rate": 3.604505258749132e-05,
      "loss": 1.4482,
      "step": 1470
    },
    {
      "epoch": 0.03962304562004712,
      "grad_norm": 5.763268947601318,
      "learning_rate": 3.5936198301778945e-05,
      "loss": 1.4629,
      "step": 1480
    },
    {
      "epoch": 0.039890768901263654,
      "grad_norm": 6.472227096557617,
      "learning_rate": 3.5826771153538716e-05,
      "loss": 1.4301,
      "step": 1490
    },
    {
      "epoch": 0.04015849218248019,
      "grad_norm": 6.155264854431152,
      "learning_rate": 3.571677564089214e-05,
      "loss": 1.4703,
      "step": 1500
    },
    {
      "epoch": 0.04015849218248019,
      "eval_loss": 1.4356995820999146,
      "eval_runtime": 76.8148,
      "eval_samples_per_second": 6.509,
      "eval_steps_per_second": 6.509,
      "step": 1500
    },
    {
      "epoch": 0.04042621546369672,
      "grad_norm": 6.241977214813232,
      "learning_rate": 3.560621628532389e-05,
      "loss": 1.4461,
      "step": 1510
    },
    {
      "epoch": 0.040693938744913255,
      "grad_norm": 6.195336818695068,
      "learning_rate": 3.5495097631496066e-05,
      "loss": 1.3735,
      "step": 1520
    },
    {
      "epoch": 0.040961662026129796,
      "grad_norm": 5.899549961090088,
      "learning_rate": 3.5383424247061286e-05,
      "loss": 1.4787,
      "step": 1530
    },
    {
      "epoch": 0.04122938530734633,
      "grad_norm": 6.187289714813232,
      "learning_rate": 3.5271200722475e-05,
      "loss": 1.4413,
      "step": 1540
    },
    {
      "epoch": 0.04149710858856286,
      "grad_norm": 5.872448444366455,
      "learning_rate": 3.515843167080675e-05,
      "loss": 1.4317,
      "step": 1550
    },
    {
      "epoch": 0.0417648318697794,
      "grad_norm": 6.877863883972168,
      "learning_rate": 3.5045121727550566e-05,
      "loss": 1.4593,
      "step": 1560
    },
    {
      "epoch": 0.04203255515099593,
      "grad_norm": 6.644635200500488,
      "learning_rate": 3.493127555043441e-05,
      "loss": 1.4622,
      "step": 1570
    },
    {
      "epoch": 0.042300278432212464,
      "grad_norm": 6.314537525177002,
      "learning_rate": 3.481689781922871e-05,
      "loss": 1.5365,
      "step": 1580
    },
    {
      "epoch": 0.042568001713429,
      "grad_norm": 6.645462989807129,
      "learning_rate": 3.470199323555403e-05,
      "loss": 1.4534,
      "step": 1590
    },
    {
      "epoch": 0.04283572499464553,
      "grad_norm": 6.462603569030762,
      "learning_rate": 3.4586566522687734e-05,
      "loss": 1.4786,
      "step": 1600
    },
    {
      "epoch": 0.04310344827586207,
      "grad_norm": 6.516161918640137,
      "learning_rate": 3.44706224253699e-05,
      "loss": 1.3987,
      "step": 1610
    },
    {
      "epoch": 0.043371171557078605,
      "grad_norm": 6.5383076667785645,
      "learning_rate": 3.435416570960824e-05,
      "loss": 1.4993,
      "step": 1620
    },
    {
      "epoch": 0.04363889483829514,
      "grad_norm": 6.37644100189209,
      "learning_rate": 3.4237201162482225e-05,
      "loss": 1.4527,
      "step": 1630
    },
    {
      "epoch": 0.04390661811951167,
      "grad_norm": 6.744267463684082,
      "learning_rate": 3.411973359194625e-05,
      "loss": 1.4213,
      "step": 1640
    },
    {
      "epoch": 0.044174341400728206,
      "grad_norm": 6.487947463989258,
      "learning_rate": 3.400176782663207e-05,
      "loss": 1.4266,
      "step": 1650
    },
    {
      "epoch": 0.044174341400728206,
      "eval_loss": 1.4300212860107422,
      "eval_runtime": 76.7363,
      "eval_samples_per_second": 6.516,
      "eval_steps_per_second": 6.516,
      "step": 1650
    },
    {
      "epoch": 0.04444206468194474,
      "grad_norm": 5.463531494140625,
      "learning_rate": 3.3883308715650246e-05,
      "loss": 1.4868,
      "step": 1660
    },
    {
      "epoch": 0.044709787963161274,
      "grad_norm": 6.814722537994385,
      "learning_rate": 3.3764361128390853e-05,
      "loss": 1.441,
      "step": 1670
    },
    {
      "epoch": 0.044977511244377814,
      "grad_norm": 5.653580665588379,
      "learning_rate": 3.3644929954323324e-05,
      "loss": 1.4674,
      "step": 1680
    },
    {
      "epoch": 0.04524523452559435,
      "grad_norm": 5.94467306137085,
      "learning_rate": 3.3525020102795434e-05,
      "loss": 1.4337,
      "step": 1690
    },
    {
      "epoch": 0.04551295780681088,
      "grad_norm": 6.066728115081787,
      "learning_rate": 3.3404636502831555e-05,
      "loss": 1.4701,
      "step": 1700
    },
    {
      "epoch": 0.045780681088027415,
      "grad_norm": 6.075364112854004,
      "learning_rate": 3.328378410292994e-05,
      "loss": 1.4264,
      "step": 1710
    },
    {
      "epoch": 0.04604840436924395,
      "grad_norm": 6.536380290985107,
      "learning_rate": 3.3162467870859404e-05,
      "loss": 1.4928,
      "step": 1720
    },
    {
      "epoch": 0.04631612765046048,
      "grad_norm": 6.932302951812744,
      "learning_rate": 3.3040692793455106e-05,
      "loss": 1.4472,
      "step": 1730
    },
    {
      "epoch": 0.046583850931677016,
      "grad_norm": 6.54493522644043,
      "learning_rate": 3.2918463876413504e-05,
      "loss": 1.3929,
      "step": 1740
    },
    {
      "epoch": 0.04685157421289355,
      "grad_norm": 5.732593059539795,
      "learning_rate": 3.279578614408664e-05,
      "loss": 1.4182,
      "step": 1750
    },
    {
      "epoch": 0.04711929749411009,
      "grad_norm": 6.897890090942383,
      "learning_rate": 3.2672664639275584e-05,
      "loss": 1.466,
      "step": 1760
    },
    {
      "epoch": 0.047387020775326624,
      "grad_norm": 6.196009159088135,
      "learning_rate": 3.254910442302319e-05,
      "loss": 1.4552,
      "step": 1770
    },
    {
      "epoch": 0.04765474405654316,
      "grad_norm": 6.375300407409668,
      "learning_rate": 3.242511057440597e-05,
      "loss": 1.4139,
      "step": 1780
    },
    {
      "epoch": 0.04792246733775969,
      "grad_norm": 5.817831993103027,
      "learning_rate": 3.2300688190325404e-05,
      "loss": 1.4855,
      "step": 1790
    },
    {
      "epoch": 0.048190190618976225,
      "grad_norm": 5.730225563049316,
      "learning_rate": 3.217584238529838e-05,
      "loss": 1.3845,
      "step": 1800
    },
    {
      "epoch": 0.048190190618976225,
      "eval_loss": 1.4201979637145996,
      "eval_runtime": 76.7247,
      "eval_samples_per_second": 6.517,
      "eval_steps_per_second": 6.517,
      "step": 1800
    },
    {
      "epoch": 0.04845791390019276,
      "grad_norm": 7.2009382247924805,
      "learning_rate": 3.205057829124693e-05,
      "loss": 1.3661,
      "step": 1810
    },
    {
      "epoch": 0.04872563718140929,
      "grad_norm": 6.178856372833252,
      "learning_rate": 3.192490105728736e-05,
      "loss": 1.4082,
      "step": 1820
    },
    {
      "epoch": 0.04899336046262583,
      "grad_norm": 5.327315330505371,
      "learning_rate": 3.17988158495185e-05,
      "loss": 1.4033,
      "step": 1830
    },
    {
      "epoch": 0.04926108374384237,
      "grad_norm": 5.719634532928467,
      "learning_rate": 3.1672327850809405e-05,
      "loss": 1.4505,
      "step": 1840
    },
    {
      "epoch": 0.0495288070250589,
      "grad_norm": 6.736356735229492,
      "learning_rate": 3.154544226058628e-05,
      "loss": 1.4521,
      "step": 1850
    },
    {
      "epoch": 0.049796530306275434,
      "grad_norm": 5.911272048950195,
      "learning_rate": 3.1418164294618766e-05,
      "loss": 1.452,
      "step": 1860
    },
    {
      "epoch": 0.05006425358749197,
      "grad_norm": 6.058777332305908,
      "learning_rate": 3.129049918480552e-05,
      "loss": 1.4431,
      "step": 1870
    },
    {
      "epoch": 0.0503319768687085,
      "grad_norm": 5.928140640258789,
      "learning_rate": 3.116245217895918e-05,
      "loss": 1.4781,
      "step": 1880
    },
    {
      "epoch": 0.050599700149925035,
      "grad_norm": 5.614587783813477,
      "learning_rate": 3.1034028540590635e-05,
      "loss": 1.3831,
      "step": 1890
    },
    {
      "epoch": 0.05086742343114157,
      "grad_norm": 6.133342742919922,
      "learning_rate": 3.090523354869266e-05,
      "loss": 1.4711,
      "step": 1900
    },
    {
      "epoch": 0.05113514671235811,
      "grad_norm": 7.014552593231201,
      "learning_rate": 3.0776072497522916e-05,
      "loss": 1.4404,
      "step": 1910
    },
    {
      "epoch": 0.05140286999357464,
      "grad_norm": 7.53794002532959,
      "learning_rate": 3.064655069638632e-05,
      "loss": 1.4262,
      "step": 1920
    },
    {
      "epoch": 0.05167059327479118,
      "grad_norm": 5.97748327255249,
      "learning_rate": 3.0516673469416818e-05,
      "loss": 1.3836,
      "step": 1930
    },
    {
      "epoch": 0.05193831655600771,
      "grad_norm": 5.628602504730225,
      "learning_rate": 3.0386446155358518e-05,
      "loss": 1.4083,
      "step": 1940
    },
    {
      "epoch": 0.052206039837224244,
      "grad_norm": 6.5765380859375,
      "learning_rate": 3.0255874107346232e-05,
      "loss": 1.4374,
      "step": 1950
    },
    {
      "epoch": 0.052206039837224244,
      "eval_loss": 1.4104682207107544,
      "eval_runtime": 76.7665,
      "eval_samples_per_second": 6.513,
      "eval_steps_per_second": 6.513,
      "step": 1950
    },
    {
      "epoch": 0.05247376311844078,
      "grad_norm": 6.185977458953857,
      "learning_rate": 3.012496269268544e-05,
      "loss": 1.4185,
      "step": 1960
    },
    {
      "epoch": 0.05274148639965731,
      "grad_norm": 6.482154846191406,
      "learning_rate": 2.9993717292631652e-05,
      "loss": 1.4446,
      "step": 1970
    },
    {
      "epoch": 0.05300920968087385,
      "grad_norm": 6.260786056518555,
      "learning_rate": 2.9862143302169223e-05,
      "loss": 1.4123,
      "step": 1980
    },
    {
      "epoch": 0.053276932962090386,
      "grad_norm": 6.361741542816162,
      "learning_rate": 2.9730246129789542e-05,
      "loss": 1.4646,
      "step": 1990
    },
    {
      "epoch": 0.05354465624330692,
      "grad_norm": 6.388660430908203,
      "learning_rate": 2.9598031197268768e-05,
      "loss": 1.4232,
      "step": 2000
    },
    {
      "epoch": 0.05381237952452345,
      "grad_norm": 6.549992084503174,
      "learning_rate": 2.946550393944493e-05,
      "loss": 1.398,
      "step": 2010
    },
    {
      "epoch": 0.05408010280573999,
      "grad_norm": 5.965968132019043,
      "learning_rate": 2.933266980399452e-05,
      "loss": 1.3618,
      "step": 2020
    },
    {
      "epoch": 0.05434782608695652,
      "grad_norm": 6.706049919128418,
      "learning_rate": 2.9199534251208573e-05,
      "loss": 1.4274,
      "step": 2030
    },
    {
      "epoch": 0.054615549368173054,
      "grad_norm": 5.7372355461120605,
      "learning_rate": 2.9066102753768204e-05,
      "loss": 1.3954,
      "step": 2040
    },
    {
      "epoch": 0.05488327264938959,
      "grad_norm": 6.6037468910217285,
      "learning_rate": 2.893238079651966e-05,
      "loss": 1.3763,
      "step": 2050
    },
    {
      "epoch": 0.05515099593060613,
      "grad_norm": 6.51908540725708,
      "learning_rate": 2.8798373876248843e-05,
      "loss": 1.3945,
      "step": 2060
    },
    {
      "epoch": 0.05541871921182266,
      "grad_norm": 6.044327259063721,
      "learning_rate": 2.8664087501455387e-05,
      "loss": 1.4487,
      "step": 2070
    },
    {
      "epoch": 0.055686442493039195,
      "grad_norm": 6.289717674255371,
      "learning_rate": 2.852952719212619e-05,
      "loss": 1.4311,
      "step": 2080
    },
    {
      "epoch": 0.05595416577425573,
      "grad_norm": 5.992395401000977,
      "learning_rate": 2.8394698479508542e-05,
      "loss": 1.3859,
      "step": 2090
    },
    {
      "epoch": 0.05622188905547226,
      "grad_norm": 6.025457382202148,
      "learning_rate": 2.8259606905882712e-05,
      "loss": 1.4162,
      "step": 2100
    },
    {
      "epoch": 0.05622188905547226,
      "eval_loss": 1.4084678888320923,
      "eval_runtime": 76.754,
      "eval_samples_per_second": 6.514,
      "eval_steps_per_second": 6.514,
      "step": 2100
    },
    {
      "epoch": 0.056489612336688796,
      "grad_norm": 5.75090217590332,
      "learning_rate": 2.8124258024334192e-05,
      "loss": 1.4478,
      "step": 2110
    },
    {
      "epoch": 0.05675733561790533,
      "grad_norm": 6.59517240524292,
      "learning_rate": 2.7988657398525364e-05,
      "loss": 1.4742,
      "step": 2120
    },
    {
      "epoch": 0.05702505889912187,
      "grad_norm": 5.816342830657959,
      "learning_rate": 2.785281060246685e-05,
      "loss": 1.4508,
      "step": 2130
    },
    {
      "epoch": 0.057292782180338404,
      "grad_norm": 5.353818416595459,
      "learning_rate": 2.7716723220288365e-05,
      "loss": 1.4593,
      "step": 2140
    },
    {
      "epoch": 0.05756050546155494,
      "grad_norm": 5.926205635070801,
      "learning_rate": 2.758040084600916e-05,
      "loss": 1.4599,
      "step": 2150
    },
    {
      "epoch": 0.05782822874277147,
      "grad_norm": 6.504787921905518,
      "learning_rate": 2.7443849083308117e-05,
      "loss": 1.3973,
      "step": 2160
    },
    {
      "epoch": 0.058095952023988005,
      "grad_norm": 5.680723190307617,
      "learning_rate": 2.7307073545293355e-05,
      "loss": 1.4051,
      "step": 2170
    },
    {
      "epoch": 0.05836367530520454,
      "grad_norm": 7.509329795837402,
      "learning_rate": 2.7170079854271533e-05,
      "loss": 1.3807,
      "step": 2180
    },
    {
      "epoch": 0.05863139858642107,
      "grad_norm": 6.508755683898926,
      "learning_rate": 2.703287364151672e-05,
      "loss": 1.3869,
      "step": 2190
    },
    {
      "epoch": 0.058899121867637606,
      "grad_norm": 6.544657230377197,
      "learning_rate": 2.6895460547038913e-05,
      "loss": 1.3409,
      "step": 2200
    },
    {
      "epoch": 0.05916684514885415,
      "grad_norm": 6.107619762420654,
      "learning_rate": 2.6757846219352235e-05,
      "loss": 1.389,
      "step": 2210
    },
    {
      "epoch": 0.05943456843007068,
      "grad_norm": 6.255703449249268,
      "learning_rate": 2.6620036315242682e-05,
      "loss": 1.4385,
      "step": 2220
    },
    {
      "epoch": 0.059702291711287214,
      "grad_norm": 5.130046844482422,
      "learning_rate": 2.6482036499535665e-05,
      "loss": 1.3614,
      "step": 2230
    },
    {
      "epoch": 0.05997001499250375,
      "grad_norm": 6.854607105255127,
      "learning_rate": 2.6343852444863075e-05,
      "loss": 1.4465,
      "step": 2240
    },
    {
      "epoch": 0.06023773827372028,
      "grad_norm": 6.745285511016846,
      "learning_rate": 2.6205489831430192e-05,
      "loss": 1.4,
      "step": 2250
    },
    {
      "epoch": 0.06023773827372028,
      "eval_loss": 1.3977503776550293,
      "eval_runtime": 76.7205,
      "eval_samples_per_second": 6.517,
      "eval_steps_per_second": 6.517,
      "step": 2250
    },
    {
      "epoch": 0.060505461554936815,
      "grad_norm": 5.628711223602295,
      "learning_rate": 2.6066954346782113e-05,
      "loss": 1.43,
      "step": 2260
    },
    {
      "epoch": 0.06077318483615335,
      "grad_norm": 6.356362342834473,
      "learning_rate": 2.5928251685570005e-05,
      "loss": 1.4382,
      "step": 2270
    },
    {
      "epoch": 0.06104090811736989,
      "grad_norm": 5.99081563949585,
      "learning_rate": 2.5789387549317016e-05,
      "loss": 1.4363,
      "step": 2280
    },
    {
      "epoch": 0.06130863139858642,
      "grad_norm": 6.202702522277832,
      "learning_rate": 2.5650367646183896e-05,
      "loss": 1.3932,
      "step": 2290
    },
    {
      "epoch": 0.06157635467980296,
      "grad_norm": 6.321465015411377,
      "learning_rate": 2.5511197690734344e-05,
      "loss": 1.4056,
      "step": 2300
    },
    {
      "epoch": 0.06184407796101949,
      "grad_norm": 5.804145812988281,
      "learning_rate": 2.5371883403700148e-05,
      "loss": 1.4132,
      "step": 2310
    },
    {
      "epoch": 0.062111801242236024,
      "grad_norm": 5.593542098999023,
      "learning_rate": 2.5232430511745995e-05,
      "loss": 1.4603,
      "step": 2320
    },
    {
      "epoch": 0.06237952452345256,
      "grad_norm": 6.308177947998047,
      "learning_rate": 2.5092844747234063e-05,
      "loss": 1.361,
      "step": 2330
    },
    {
      "epoch": 0.0626472478046691,
      "grad_norm": 5.957157611846924,
      "learning_rate": 2.495313184798842e-05,
      "loss": 1.435,
      "step": 2340
    },
    {
      "epoch": 0.06291497108588563,
      "grad_norm": 5.485774517059326,
      "learning_rate": 2.4813297557059133e-05,
      "loss": 1.413,
      "step": 2350
    },
    {
      "epoch": 0.06318269436710217,
      "grad_norm": 7.090158939361572,
      "learning_rate": 2.467334762248621e-05,
      "loss": 1.3819,
      "step": 2360
    },
    {
      "epoch": 0.0634504176483187,
      "grad_norm": 6.819372653961182,
      "learning_rate": 2.4533287797063308e-05,
      "loss": 1.4347,
      "step": 2370
    },
    {
      "epoch": 0.06371814092953523,
      "grad_norm": 5.654256820678711,
      "learning_rate": 2.439312383810128e-05,
      "loss": 1.3902,
      "step": 2380
    },
    {
      "epoch": 0.06398586421075177,
      "grad_norm": 6.394632339477539,
      "learning_rate": 2.4252861507191487e-05,
      "loss": 1.4324,
      "step": 2390
    },
    {
      "epoch": 0.0642535874919683,
      "grad_norm": 6.346138954162598,
      "learning_rate": 2.4112506569969e-05,
      "loss": 1.3853,
      "step": 2400
    },
    {
      "epoch": 0.0642535874919683,
      "eval_loss": 1.389374017715454,
      "eval_runtime": 76.6782,
      "eval_samples_per_second": 6.521,
      "eval_steps_per_second": 6.521,
      "step": 2400
    },
    {
      "epoch": 0.06452131077318483,
      "grad_norm": 5.798035144805908,
      "learning_rate": 2.3972064795875537e-05,
      "loss": 1.3668,
      "step": 2410
    },
    {
      "epoch": 0.06478903405440137,
      "grad_norm": 6.213179588317871,
      "learning_rate": 2.3831541957922366e-05,
      "loss": 1.3913,
      "step": 2420
    },
    {
      "epoch": 0.0650567573356179,
      "grad_norm": 6.443445682525635,
      "learning_rate": 2.3690943832452967e-05,
      "loss": 1.4176,
      "step": 2430
    },
    {
      "epoch": 0.06532448061683443,
      "grad_norm": 6.543423652648926,
      "learning_rate": 2.3550276198905584e-05,
      "loss": 1.5036,
      "step": 2440
    },
    {
      "epoch": 0.06559220389805097,
      "grad_norm": 5.8855977058410645,
      "learning_rate": 2.3409544839575687e-05,
      "loss": 1.3749,
      "step": 2450
    },
    {
      "epoch": 0.0658599271792675,
      "grad_norm": 6.113175868988037,
      "learning_rate": 2.3268755539378238e-05,
      "loss": 1.3555,
      "step": 2460
    },
    {
      "epoch": 0.06612765046048405,
      "grad_norm": 6.519189357757568,
      "learning_rate": 2.3127914085609943e-05,
      "loss": 1.3457,
      "step": 2470
    },
    {
      "epoch": 0.06639537374170058,
      "grad_norm": 6.135042667388916,
      "learning_rate": 2.298702626771133e-05,
      "loss": 1.4143,
      "step": 2480
    },
    {
      "epoch": 0.06666309702291712,
      "grad_norm": 6.562228679656982,
      "learning_rate": 2.2846097877028762e-05,
      "loss": 1.4549,
      "step": 2490
    },
    {
      "epoch": 0.06693082030413365,
      "grad_norm": 6.2036213874816895,
      "learning_rate": 2.270513470657642e-05,
      "loss": 1.3422,
      "step": 2500
    },
    {
      "epoch": 0.06719854358535018,
      "grad_norm": 6.321053981781006,
      "learning_rate": 2.25641425507981e-05,
      "loss": 1.4206,
      "step": 2510
    },
    {
      "epoch": 0.06746626686656672,
      "grad_norm": 5.922671794891357,
      "learning_rate": 2.2423127205329117e-05,
      "loss": 1.4368,
      "step": 2520
    },
    {
      "epoch": 0.06773399014778325,
      "grad_norm": 6.139718532562256,
      "learning_rate": 2.2282094466758e-05,
      "loss": 1.3574,
      "step": 2530
    },
    {
      "epoch": 0.06800171342899979,
      "grad_norm": 5.755593776702881,
      "learning_rate": 2.2141050132388245e-05,
      "loss": 1.4075,
      "step": 2540
    },
    {
      "epoch": 0.06826943671021632,
      "grad_norm": 5.7373151779174805,
      "learning_rate": 2.2e-05,
      "loss": 1.3812,
      "step": 2550
    },
    {
      "epoch": 0.06826943671021632,
      "eval_loss": 1.3869917392730713,
      "eval_runtime": 76.6763,
      "eval_samples_per_second": 6.521,
      "eval_steps_per_second": 6.521,
      "step": 2550
    },
    {
      "epoch": 0.06853715999143285,
      "grad_norm": 6.435483932495117,
      "learning_rate": 2.1858949867611754e-05,
      "loss": 1.3586,
      "step": 2560
    },
    {
      "epoch": 0.06880488327264939,
      "grad_norm": 5.814359188079834,
      "learning_rate": 2.1717905533241997e-05,
      "loss": 1.3745,
      "step": 2570
    },
    {
      "epoch": 0.06907260655386592,
      "grad_norm": 6.140771389007568,
      "learning_rate": 2.157687279467088e-05,
      "loss": 1.3296,
      "step": 2580
    },
    {
      "epoch": 0.06934032983508245,
      "grad_norm": 5.861440181732178,
      "learning_rate": 2.14358574492019e-05,
      "loss": 1.3911,
      "step": 2590
    },
    {
      "epoch": 0.06960805311629899,
      "grad_norm": 6.584283351898193,
      "learning_rate": 2.1294865293423586e-05,
      "loss": 1.4143,
      "step": 2600
    },
    {
      "epoch": 0.06987577639751552,
      "grad_norm": 5.859135627746582,
      "learning_rate": 2.1153902122971233e-05,
      "loss": 1.3923,
      "step": 2610
    },
    {
      "epoch": 0.07014349967873207,
      "grad_norm": 6.673269748687744,
      "learning_rate": 2.101297373228868e-05,
      "loss": 1.4072,
      "step": 2620
    },
    {
      "epoch": 0.0704112229599486,
      "grad_norm": 5.8205180168151855,
      "learning_rate": 2.087208591439006e-05,
      "loss": 1.3962,
      "step": 2630
    },
    {
      "epoch": 0.07067894624116514,
      "grad_norm": 5.918448448181152,
      "learning_rate": 2.0731244460621764e-05,
      "loss": 1.4121,
      "step": 2640
    },
    {
      "epoch": 0.07094666952238167,
      "grad_norm": 6.024654865264893,
      "learning_rate": 2.0590455160424316e-05,
      "loss": 1.3958,
      "step": 2650
    },
    {
      "epoch": 0.0712143928035982,
      "grad_norm": 6.21071195602417,
      "learning_rate": 2.044972380109441e-05,
      "loss": 1.4155,
      "step": 2660
    },
    {
      "epoch": 0.07148211608481474,
      "grad_norm": 6.8569207191467285,
      "learning_rate": 2.030905616754704e-05,
      "loss": 1.3968,
      "step": 2670
    },
    {
      "epoch": 0.07174983936603127,
      "grad_norm": 6.207950592041016,
      "learning_rate": 2.0168458042077636e-05,
      "loss": 1.3722,
      "step": 2680
    },
    {
      "epoch": 0.0720175626472478,
      "grad_norm": 5.884634494781494,
      "learning_rate": 2.0027935204124465e-05,
      "loss": 1.4165,
      "step": 2690
    },
    {
      "epoch": 0.07228528592846434,
      "grad_norm": 5.943591117858887,
      "learning_rate": 1.9887493430031e-05,
      "loss": 1.4054,
      "step": 2700
    },
    {
      "epoch": 0.07228528592846434,
      "eval_loss": 1.3810029029846191,
      "eval_runtime": 76.6801,
      "eval_samples_per_second": 6.521,
      "eval_steps_per_second": 6.521,
      "step": 2700
    },
    {
      "epoch": 0.07255300920968087,
      "grad_norm": 6.2774457931518555,
      "learning_rate": 1.9747138492808512e-05,
      "loss": 1.4184,
      "step": 2710
    },
    {
      "epoch": 0.0728207324908974,
      "grad_norm": 6.596461772918701,
      "learning_rate": 1.960687616189872e-05,
      "loss": 1.4314,
      "step": 2720
    },
    {
      "epoch": 0.07308845577211394,
      "grad_norm": 6.17069149017334,
      "learning_rate": 1.9466712202936694e-05,
      "loss": 1.4248,
      "step": 2730
    },
    {
      "epoch": 0.07335617905333047,
      "grad_norm": 6.085130214691162,
      "learning_rate": 1.932665237751379e-05,
      "loss": 1.3966,
      "step": 2740
    },
    {
      "epoch": 0.073623902334547,
      "grad_norm": 6.430164813995361,
      "learning_rate": 1.9186702442940866e-05,
      "loss": 1.3521,
      "step": 2750
    },
    {
      "epoch": 0.07389162561576355,
      "grad_norm": 5.946996212005615,
      "learning_rate": 1.9046868152011587e-05,
      "loss": 1.336,
      "step": 2760
    },
    {
      "epoch": 0.07415934889698009,
      "grad_norm": 6.169567108154297,
      "learning_rate": 1.8907155252765942e-05,
      "loss": 1.4099,
      "step": 2770
    },
    {
      "epoch": 0.07442707217819662,
      "grad_norm": 5.974761962890625,
      "learning_rate": 1.8767569488254004e-05,
      "loss": 1.3588,
      "step": 2780
    },
    {
      "epoch": 0.07469479545941315,
      "grad_norm": 5.632639408111572,
      "learning_rate": 1.8628116596299847e-05,
      "loss": 1.3704,
      "step": 2790
    },
    {
      "epoch": 0.07496251874062969,
      "grad_norm": 5.559203147888184,
      "learning_rate": 1.848880230926566e-05,
      "loss": 1.3878,
      "step": 2800
    },
    {
      "epoch": 0.07523024202184622,
      "grad_norm": 6.6522674560546875,
      "learning_rate": 1.8349632353816113e-05,
      "loss": 1.4324,
      "step": 2810
    },
    {
      "epoch": 0.07549796530306276,
      "grad_norm": 5.803051471710205,
      "learning_rate": 1.8210612450682986e-05,
      "loss": 1.4132,
      "step": 2820
    },
    {
      "epoch": 0.07576568858427929,
      "grad_norm": 5.304571151733398,
      "learning_rate": 1.8071748314429994e-05,
      "loss": 1.3607,
      "step": 2830
    },
    {
      "epoch": 0.07603341186549582,
      "grad_norm": 5.904123783111572,
      "learning_rate": 1.7933045653217886e-05,
      "loss": 1.3963,
      "step": 2840
    },
    {
      "epoch": 0.07630113514671236,
      "grad_norm": 5.9972686767578125,
      "learning_rate": 1.7794510168569814e-05,
      "loss": 1.4353,
      "step": 2850
    },
    {
      "epoch": 0.07630113514671236,
      "eval_loss": 1.3754030466079712,
      "eval_runtime": 76.726,
      "eval_samples_per_second": 6.517,
      "eval_steps_per_second": 6.517,
      "step": 2850
    },
    {
      "epoch": 0.07656885842792889,
      "grad_norm": 6.276034832000732,
      "learning_rate": 1.7656147555136924e-05,
      "loss": 1.3894,
      "step": 2860
    },
    {
      "epoch": 0.07683658170914542,
      "grad_norm": 6.091196060180664,
      "learning_rate": 1.7517963500464338e-05,
      "loss": 1.3956,
      "step": 2870
    },
    {
      "epoch": 0.07710430499036196,
      "grad_norm": 6.1393513679504395,
      "learning_rate": 1.7379963684757313e-05,
      "loss": 1.4192,
      "step": 2880
    },
    {
      "epoch": 0.07737202827157849,
      "grad_norm": 6.082838535308838,
      "learning_rate": 1.7242153780647764e-05,
      "loss": 1.3598,
      "step": 2890
    },
    {
      "epoch": 0.07763975155279502,
      "grad_norm": 7.009051322937012,
      "learning_rate": 1.7104539452961086e-05,
      "loss": 1.3388,
      "step": 2900
    },
    {
      "epoch": 0.07790747483401157,
      "grad_norm": 6.073249340057373,
      "learning_rate": 1.6967126358483283e-05,
      "loss": 1.4014,
      "step": 2910
    },
    {
      "epoch": 0.0781751981152281,
      "grad_norm": 5.901647567749023,
      "learning_rate": 1.6829920145728465e-05,
      "loss": 1.3795,
      "step": 2920
    },
    {
      "epoch": 0.07844292139644464,
      "grad_norm": 5.663522243499756,
      "learning_rate": 1.6692926454706644e-05,
      "loss": 1.4444,
      "step": 2930
    },
    {
      "epoch": 0.07871064467766117,
      "grad_norm": 6.163628578186035,
      "learning_rate": 1.655615091669189e-05,
      "loss": 1.3579,
      "step": 2940
    },
    {
      "epoch": 0.07897836795887771,
      "grad_norm": 5.641757965087891,
      "learning_rate": 1.641959915399084e-05,
      "loss": 1.3816,
      "step": 2950
    },
    {
      "epoch": 0.07924609124009424,
      "grad_norm": 5.533544540405273,
      "learning_rate": 1.6283276779711637e-05,
      "loss": 1.4021,
      "step": 2960
    },
    {
      "epoch": 0.07951381452131077,
      "grad_norm": 6.252144813537598,
      "learning_rate": 1.614718939753315e-05,
      "loss": 1.3424,
      "step": 2970
    },
    {
      "epoch": 0.07978153780252731,
      "grad_norm": 6.014984607696533,
      "learning_rate": 1.6011342601474635e-05,
      "loss": 1.3733,
      "step": 2980
    },
    {
      "epoch": 0.08004926108374384,
      "grad_norm": 6.043126106262207,
      "learning_rate": 1.5875741975665813e-05,
      "loss": 1.4402,
      "step": 2990
    },
    {
      "epoch": 0.08031698436496038,
      "grad_norm": 6.2684478759765625,
      "learning_rate": 1.5740393094117287e-05,
      "loss": 1.3955,
      "step": 3000
    },
    {
      "epoch": 0.08031698436496038,
      "eval_loss": 1.3712314367294312,
      "eval_runtime": 77.0642,
      "eval_samples_per_second": 6.488,
      "eval_steps_per_second": 6.488,
      "step": 3000
    },
    {
      "epoch": 0.08058470764617691,
      "grad_norm": 6.531871795654297,
      "learning_rate": 1.560530152049146e-05,
      "loss": 1.3728,
      "step": 3010
    },
    {
      "epoch": 0.08085243092739344,
      "grad_norm": 6.215692043304443,
      "learning_rate": 1.5470472807873805e-05,
      "loss": 1.322,
      "step": 3020
    },
    {
      "epoch": 0.08112015420860998,
      "grad_norm": 5.670928001403809,
      "learning_rate": 1.5335912498544615e-05,
      "loss": 1.3643,
      "step": 3030
    },
    {
      "epoch": 0.08138787748982651,
      "grad_norm": 5.801737308502197,
      "learning_rate": 1.5201626123751158e-05,
      "loss": 1.3653,
      "step": 3040
    },
    {
      "epoch": 0.08165560077104304,
      "grad_norm": 5.651313781738281,
      "learning_rate": 1.5067619203480345e-05,
      "loss": 1.3818,
      "step": 3050
    },
    {
      "epoch": 0.08192332405225959,
      "grad_norm": 6.425565242767334,
      "learning_rate": 1.4933897246231798e-05,
      "loss": 1.3276,
      "step": 3060
    },
    {
      "epoch": 0.08219104733347612,
      "grad_norm": 6.21942663192749,
      "learning_rate": 1.4800465748791428e-05,
      "loss": 1.429,
      "step": 3070
    },
    {
      "epoch": 0.08245877061469266,
      "grad_norm": 6.354944229125977,
      "learning_rate": 1.4667330196005485e-05,
      "loss": 1.4254,
      "step": 3080
    },
    {
      "epoch": 0.08272649389590919,
      "grad_norm": 6.185739517211914,
      "learning_rate": 1.4534496060555075e-05,
      "loss": 1.3998,
      "step": 3090
    },
    {
      "epoch": 0.08299421717712573,
      "grad_norm": 5.781863212585449,
      "learning_rate": 1.4401968802731235e-05,
      "loss": 1.3384,
      "step": 3100
    },
    {
      "epoch": 0.08326194045834226,
      "grad_norm": 6.628792762756348,
      "learning_rate": 1.4269753870210459e-05,
      "loss": 1.4146,
      "step": 3110
    },
    {
      "epoch": 0.0835296637395588,
      "grad_norm": 6.093694686889648,
      "learning_rate": 1.4137856697830786e-05,
      "loss": 1.3662,
      "step": 3120
    },
    {
      "epoch": 0.08379738702077533,
      "grad_norm": 6.078185558319092,
      "learning_rate": 1.4006282707368348e-05,
      "loss": 1.3716,
      "step": 3130
    },
    {
      "epoch": 0.08406511030199186,
      "grad_norm": 6.203483581542969,
      "learning_rate": 1.3875037307314563e-05,
      "loss": 1.3371,
      "step": 3140
    },
    {
      "epoch": 0.0843328335832084,
      "grad_norm": 5.880634307861328,
      "learning_rate": 1.374412589265377e-05,
      "loss": 1.3464,
      "step": 3150
    },
    {
      "epoch": 0.0843328335832084,
      "eval_loss": 1.3659946918487549,
      "eval_runtime": 77.0452,
      "eval_samples_per_second": 6.49,
      "eval_steps_per_second": 6.49,
      "step": 3150
    },
    {
      "epoch": 0.08460055686442493,
      "grad_norm": 6.3485426902771,
      "learning_rate": 1.3613553844641483e-05,
      "loss": 1.3366,
      "step": 3160
    },
    {
      "epoch": 0.08486828014564146,
      "grad_norm": 6.721098899841309,
      "learning_rate": 1.3483326530583184e-05,
      "loss": 1.3628,
      "step": 3170
    },
    {
      "epoch": 0.085136003426858,
      "grad_norm": 5.912144660949707,
      "learning_rate": 1.3353449303613682e-05,
      "loss": 1.3403,
      "step": 3180
    },
    {
      "epoch": 0.08540372670807453,
      "grad_norm": 5.860577583312988,
      "learning_rate": 1.3223927502477084e-05,
      "loss": 1.3453,
      "step": 3190
    },
    {
      "epoch": 0.08567144998929106,
      "grad_norm": 6.3982977867126465,
      "learning_rate": 1.3094766451307336e-05,
      "loss": 1.3556,
      "step": 3200
    },
    {
      "epoch": 0.08593917327050761,
      "grad_norm": 6.073590278625488,
      "learning_rate": 1.2965971459409366e-05,
      "loss": 1.3984,
      "step": 3210
    },
    {
      "epoch": 0.08620689655172414,
      "grad_norm": 6.372732162475586,
      "learning_rate": 1.2837547821040825e-05,
      "loss": 1.4089,
      "step": 3220
    },
    {
      "epoch": 0.08647461983294068,
      "grad_norm": 6.449525356292725,
      "learning_rate": 1.2709500815194487e-05,
      "loss": 1.3884,
      "step": 3230
    },
    {
      "epoch": 0.08674234311415721,
      "grad_norm": 5.904713153839111,
      "learning_rate": 1.2581835705381243e-05,
      "loss": 1.3976,
      "step": 3240
    },
    {
      "epoch": 0.08701006639537374,
      "grad_norm": 6.398531913757324,
      "learning_rate": 1.2454557739413722e-05,
      "loss": 1.3942,
      "step": 3250
    },
    {
      "epoch": 0.08727778967659028,
      "grad_norm": 6.1607465744018555,
      "learning_rate": 1.2327672149190595e-05,
      "loss": 1.3698,
      "step": 3260
    },
    {
      "epoch": 0.08754551295780681,
      "grad_norm": 5.903096675872803,
      "learning_rate": 1.2201184150481497e-05,
      "loss": 1.4183,
      "step": 3270
    },
    {
      "epoch": 0.08781323623902335,
      "grad_norm": 6.210367679595947,
      "learning_rate": 1.2075098942712635e-05,
      "loss": 1.3717,
      "step": 3280
    },
    {
      "epoch": 0.08808095952023988,
      "grad_norm": 6.082081317901611,
      "learning_rate": 1.1949421708753062e-05,
      "loss": 1.3694,
      "step": 3290
    },
    {
      "epoch": 0.08834868280145641,
      "grad_norm": 5.826544284820557,
      "learning_rate": 1.1824157614701629e-05,
      "loss": 1.4473,
      "step": 3300
    },
    {
      "epoch": 0.08834868280145641,
      "eval_loss": 1.3619885444641113,
      "eval_runtime": 77.1061,
      "eval_samples_per_second": 6.485,
      "eval_steps_per_second": 6.485,
      "step": 3300
    },
    {
      "epoch": 0.08861640608267295,
      "grad_norm": 6.470825672149658,
      "learning_rate": 1.1699311809674596e-05,
      "loss": 1.357,
      "step": 3310
    },
    {
      "epoch": 0.08888412936388948,
      "grad_norm": 5.989506244659424,
      "learning_rate": 1.157488942559403e-05,
      "loss": 1.322,
      "step": 3320
    },
    {
      "epoch": 0.08915185264510601,
      "grad_norm": 6.708034992218018,
      "learning_rate": 1.1450895576976816e-05,
      "loss": 1.3652,
      "step": 3330
    },
    {
      "epoch": 0.08941957592632255,
      "grad_norm": 6.264359474182129,
      "learning_rate": 1.1327335360724412e-05,
      "loss": 1.3661,
      "step": 3340
    },
    {
      "epoch": 0.08968729920753908,
      "grad_norm": 6.633790969848633,
      "learning_rate": 1.1204213855913374e-05,
      "loss": 1.3522,
      "step": 3350
    },
    {
      "epoch": 0.08995502248875563,
      "grad_norm": 5.526124477386475,
      "learning_rate": 1.1081536123586505e-05,
      "loss": 1.3492,
      "step": 3360
    },
    {
      "epoch": 0.09022274576997216,
      "grad_norm": 6.267175197601318,
      "learning_rate": 1.09593072065449e-05,
      "loss": 1.3805,
      "step": 3370
    },
    {
      "epoch": 0.0904904690511887,
      "grad_norm": 6.826523780822754,
      "learning_rate": 1.0837532129140595e-05,
      "loss": 1.3379,
      "step": 3380
    },
    {
      "epoch": 0.09075819233240523,
      "grad_norm": 6.352426052093506,
      "learning_rate": 1.0716215897070067e-05,
      "loss": 1.378,
      "step": 3390
    },
    {
      "epoch": 0.09102591561362176,
      "grad_norm": 6.353774547576904,
|
"learning_rate": 1.0595363497168449e-05, |
|
"loss": 1.4057, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.0912936388948383, |
|
"grad_norm": 6.023704528808594, |
|
"learning_rate": 1.0474979897204557e-05, |
|
"loss": 1.419, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.09156136217605483, |
|
"grad_norm": 6.525381565093994, |
|
"learning_rate": 1.0355070045676677e-05, |
|
"loss": 1.3737, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.09182908545727136, |
|
"grad_norm": 6.321014404296875, |
|
"learning_rate": 1.0235638871609145e-05, |
|
"loss": 1.3252, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.0920968087384879, |
|
"grad_norm": 5.880143165588379, |
|
"learning_rate": 1.011669128434976e-05, |
|
"loss": 1.3581, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.09236453201970443, |
|
"grad_norm": 6.851429462432861, |
|
"learning_rate": 9.99823217336793e-06, |
|
"loss": 1.4074, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.09236453201970443, |
|
"eval_loss": 1.3579777479171753, |
|
"eval_runtime": 76.8014, |
|
"eval_samples_per_second": 6.51, |
|
"eval_steps_per_second": 6.51, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.09263225530092097, |
|
"grad_norm": 6.41058874130249, |
|
"learning_rate": 9.880266408053746e-06, |
|
"loss": 1.433, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.0928999785821375, |
|
"grad_norm": 5.9317474365234375, |
|
"learning_rate": 9.762798837517776e-06, |
|
"loss": 1.3759, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.09316770186335403, |
|
"grad_norm": 5.728269100189209, |
|
"learning_rate": 9.645834290391754e-06, |
|
"loss": 1.4632, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.09343542514457057, |
|
"grad_norm": 5.710354328155518, |
|
"learning_rate": 9.529377574630109e-06, |
|
"loss": 1.422, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.0937031484257871, |
|
"grad_norm": 6.150035381317139, |
|
"learning_rate": 9.413433477312272e-06, |
|
"loss": 1.4113, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09397087170700365, |
|
"grad_norm": 6.171891689300537, |
|
"learning_rate": 9.298006764445976e-06, |
|
"loss": 1.4115, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.09423859498822018, |
|
"grad_norm": 6.584611415863037, |
|
"learning_rate": 9.183102180771285e-06, |
|
"loss": 1.3631, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.09450631826943671, |
|
"grad_norm": 6.219729423522949, |
|
"learning_rate": 9.068724449565594e-06, |
|
"loss": 1.3497, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.09477404155065325, |
|
"grad_norm": 5.961699485778809, |
|
"learning_rate": 8.954878272449433e-06, |
|
"loss": 1.3476, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.09504176483186978, |
|
"grad_norm": 6.4813385009765625, |
|
"learning_rate": 8.841568329193249e-06, |
|
"loss": 1.3281, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.09530948811308632, |
|
"grad_norm": 5.715578079223633, |
|
"learning_rate": 8.728799277524998e-06, |
|
"loss": 1.3114, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.09557721139430285, |
|
"grad_norm": 5.67549467086792, |
|
"learning_rate": 8.61657575293871e-06, |
|
"loss": 1.3119, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.09584493467551938, |
|
"grad_norm": 6.634474277496338, |
|
"learning_rate": 8.50490236850394e-06, |
|
"loss": 1.3587, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.09611265795673592, |
|
"grad_norm": 5.7471537590026855, |
|
"learning_rate": 8.393783714676107e-06, |
|
"loss": 1.3607, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.09638038123795245, |
|
"grad_norm": 5.866701602935791, |
|
"learning_rate": 8.283224359107863e-06, |
|
"loss": 1.3247, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.09638038123795245, |
|
"eval_loss": 1.3553622961044312, |
|
"eval_runtime": 76.9249, |
|
"eval_samples_per_second": 6.5, |
|
"eval_steps_per_second": 6.5, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.09664810451916898, |
|
"grad_norm": 5.779167652130127, |
|
"learning_rate": 8.17322884646128e-06, |
|
"loss": 1.375, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.09691582780038552, |
|
"grad_norm": 6.503204345703125, |
|
"learning_rate": 8.06380169822107e-06, |
|
"loss": 1.3767, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.09718355108160205, |
|
"grad_norm": 5.67221212387085, |
|
"learning_rate": 7.95494741250868e-06, |
|
"loss": 1.2996, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.09745127436281859, |
|
"grad_norm": 6.475659370422363, |
|
"learning_rate": 7.846670463897457e-06, |
|
"loss": 1.3827, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.09771899764403512, |
|
"grad_norm": 6.146843910217285, |
|
"learning_rate": 7.738975303228659e-06, |
|
"loss": 1.3489, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.09798672092525167, |
|
"grad_norm": 6.763230323791504, |
|
"learning_rate": 7.631866357428526e-06, |
|
"loss": 1.3631, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.0982544442064682, |
|
"grad_norm": 6.853928565979004, |
|
"learning_rate": 7.525348029326323e-06, |
|
"loss": 1.3683, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.09852216748768473, |
|
"grad_norm": 6.183257102966309, |
|
"learning_rate": 7.4194246974732955e-06, |
|
"loss": 1.3744, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.09878989076890127, |
|
"grad_norm": 6.155274391174316, |
|
"learning_rate": 7.314100715962744e-06, |
|
"loss": 1.389, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.0990576140501178, |
|
"grad_norm": 6.754117012023926, |
|
"learning_rate": 7.209380414251028e-06, |
|
"loss": 1.3267, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.09932533733133433, |
|
"grad_norm": 6.333691596984863, |
|
"learning_rate": 7.105268096979596e-06, |
|
"loss": 1.3774, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.09959306061255087, |
|
"grad_norm": 6.452340602874756, |
|
"learning_rate": 7.001768043798013e-06, |
|
"loss": 1.3038, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.0998607838937674, |
|
"grad_norm": 5.832094192504883, |
|
"learning_rate": 6.898884509188095e-06, |
|
"loss": 1.3978, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.10012850717498394, |
|
"grad_norm": 5.7019476890563965, |
|
"learning_rate": 6.796621722288977e-06, |
|
"loss": 1.358, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.10039623045620047, |
|
"grad_norm": 5.743053913116455, |
|
"learning_rate": 6.6949838867233e-06, |
|
"loss": 1.3567, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.10039623045620047, |
|
"eval_loss": 1.3537319898605347, |
|
"eval_runtime": 76.8454, |
|
"eval_samples_per_second": 6.507, |
|
"eval_steps_per_second": 6.507, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.100663953737417, |
|
"grad_norm": 6.490472793579102, |
|
"learning_rate": 6.5939751804243974e-06, |
|
"loss": 1.361, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.10093167701863354, |
|
"grad_norm": 6.32999324798584, |
|
"learning_rate": 6.493599755464546e-06, |
|
"loss": 1.2968, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.10119940029985007, |
|
"grad_norm": 6.559702396392822, |
|
"learning_rate": 6.3938617378843264e-06, |
|
"loss": 1.4176, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.1014671235810666, |
|
"grad_norm": 5.832455158233643, |
|
"learning_rate": 6.294765227523008e-06, |
|
"loss": 1.3828, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.10173484686228314, |
|
"grad_norm": 6.728024005889893, |
|
"learning_rate": 6.196314297849995e-06, |
|
"loss": 1.3902, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.10200257014349969, |
|
"grad_norm": 6.092176914215088, |
|
"learning_rate": 6.098512995797388e-06, |
|
"loss": 1.3587, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.10227029342471622, |
|
"grad_norm": 6.502336025238037, |
|
"learning_rate": 6.0013653415936585e-06, |
|
"loss": 1.3619, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.10253801670593275, |
|
"grad_norm": 6.602701187133789, |
|
"learning_rate": 5.90487532859837e-06, |
|
"loss": 1.3325, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.10280573998714929, |
|
"grad_norm": 6.637482166290283, |
|
"learning_rate": 5.809046923138031e-06, |
|
"loss": 1.3899, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.10307346326836582, |
|
"grad_norm": 5.880363941192627, |
|
"learning_rate": 5.713884064343061e-06, |
|
"loss": 1.3481, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.10334118654958235, |
|
"grad_norm": 7.036133289337158, |
|
"learning_rate": 5.6193906639858486e-06, |
|
"loss": 1.3156, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.10360890983079889, |
|
"grad_norm": 5.999964714050293, |
|
"learning_rate": 5.52557060631998e-06, |
|
"loss": 1.3756, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.10387663311201542, |
|
"grad_norm": 5.966408729553223, |
|
"learning_rate": 5.432427747920561e-06, |
|
"loss": 1.3588, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.10414435639323195, |
|
"grad_norm": 5.987645626068115, |
|
"learning_rate": 5.339965917525687e-06, |
|
"loss": 1.427, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.10441207967444849, |
|
"grad_norm": 5.433709621429443, |
|
"learning_rate": 5.248188915879043e-06, |
|
"loss": 1.3687, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.10441207967444849, |
|
"eval_loss": 1.350784420967102, |
|
"eval_runtime": 76.8402, |
|
"eval_samples_per_second": 6.507, |
|
"eval_steps_per_second": 6.507, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.10467980295566502, |
|
"grad_norm": 6.524111270904541, |
|
"learning_rate": 5.157100515573715e-06, |
|
"loss": 1.3006, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.10494752623688156, |
|
"grad_norm": 5.474837303161621, |
|
"learning_rate": 5.066704460897067e-06, |
|
"loss": 1.3463, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.10521524951809809, |
|
"grad_norm": 5.868412494659424, |
|
"learning_rate": 4.977004467676848e-06, |
|
"loss": 1.2881, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.10548297279931462, |
|
"grad_norm": 5.966287136077881, |
|
"learning_rate": 4.888004223128458e-06, |
|
"loss": 1.3636, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.10575069608053116, |
|
"grad_norm": 5.976463794708252, |
|
"learning_rate": 4.799707385703344e-06, |
|
"loss": 1.3411, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.1060184193617477, |
|
"grad_norm": 5.5595197677612305, |
|
"learning_rate": 4.712117584938669e-06, |
|
"loss": 1.3114, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.10628614264296424, |
|
"grad_norm": 5.7463483810424805, |
|
"learning_rate": 4.625238421308069e-06, |
|
"loss": 1.3472, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.10655386592418077, |
|
"grad_norm": 6.120302200317383, |
|
"learning_rate": 4.5390734660736906e-06, |
|
"loss": 1.4384, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.1068215892053973, |
|
"grad_norm": 6.155236721038818, |
|
"learning_rate": 4.453626261139344e-06, |
|
"loss": 1.3494, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.10708931248661384, |
|
"grad_norm": 6.032073974609375, |
|
"learning_rate": 4.368900318904957e-06, |
|
"loss": 1.3464, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.10735703576783037, |
|
"grad_norm": 6.827203750610352, |
|
"learning_rate": 4.284899122122165e-06, |
|
"loss": 1.3534, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.1076247590490469, |
|
"grad_norm": 5.927024841308594, |
|
"learning_rate": 4.201626123751159e-06, |
|
"loss": 1.333, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.10789248233026344, |
|
"grad_norm": 5.960188865661621, |
|
"learning_rate": 4.1190847468187425e-06, |
|
"loss": 1.3458, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.10816020561147997, |
|
"grad_norm": 6.299499034881592, |
|
"learning_rate": 4.037278384277628e-06, |
|
"loss": 1.3516, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.1084279288926965, |
|
"grad_norm": 6.968238353729248, |
|
"learning_rate": 3.956210398866969e-06, |
|
"loss": 1.369, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.1084279288926965, |
|
"eval_loss": 1.348792552947998, |
|
"eval_runtime": 76.8298, |
|
"eval_samples_per_second": 6.508, |
|
"eval_steps_per_second": 6.508, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.10869565217391304, |
|
"grad_norm": 6.412740707397461, |
|
"learning_rate": 3.875884122974123e-06, |
|
"loss": 1.3756, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.10896337545512957, |
|
"grad_norm": 6.571822643280029, |
|
"learning_rate": 3.7963028584976805e-06, |
|
"loss": 1.3773, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.10923109873634611, |
|
"grad_norm": 6.47897481918335, |
|
"learning_rate": 3.717469876711713e-06, |
|
"loss": 1.3746, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.10949882201756264, |
|
"grad_norm": 6.563449382781982, |
|
"learning_rate": 3.6393884181313417e-06, |
|
"loss": 1.382, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.10976654529877918, |
|
"grad_norm": 6.455676078796387, |
|
"learning_rate": 3.562061692379507e-06, |
|
"loss": 1.3519, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.11003426857999572, |
|
"grad_norm": 5.957856178283691, |
|
"learning_rate": 3.4854928780550306e-06, |
|
"loss": 1.3711, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.11030199186121226, |
|
"grad_norm": 6.082734107971191, |
|
"learning_rate": 3.409685122601979e-06, |
|
"loss": 1.3038, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.11056971514242879, |
|
"grad_norm": 5.809603691101074, |
|
"learning_rate": 3.3346415421802494e-06, |
|
"loss": 1.3587, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.11083743842364532, |
|
"grad_norm": 6.081882476806641, |
|
"learning_rate": 3.26036522153751e-06, |
|
"loss": 1.3672, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.11110516170486186, |
|
"grad_norm": 5.788993835449219, |
|
"learning_rate": 3.186859213882386e-06, |
|
"loss": 1.3615, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.11137288498607839, |
|
"grad_norm": 5.722326755523682, |
|
"learning_rate": 3.114126540758946e-06, |
|
"loss": 1.2914, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.11164060826729492, |
|
"grad_norm": 6.233955383300781, |
|
"learning_rate": 3.042170191922509e-06, |
|
"loss": 1.3286, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.11190833154851146, |
|
"grad_norm": 6.276589393615723, |
|
"learning_rate": 2.9709931252167426e-06, |
|
"loss": 1.3943, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.11217605482972799, |
|
"grad_norm": 6.818645000457764, |
|
"learning_rate": 2.9005982664520734e-06, |
|
"loss": 1.3535, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.11244377811094453, |
|
"grad_norm": 6.53585147857666, |
|
"learning_rate": 2.830988509285433e-06, |
|
"loss": 1.3412, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.11244377811094453, |
|
"eval_loss": 1.3470451831817627, |
|
"eval_runtime": 76.7654, |
|
"eval_samples_per_second": 6.513, |
|
"eval_steps_per_second": 6.513, |
|
"step": 4200 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.521351998649088e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|
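The object above is the trainer_state.json that Hugging Face's Trainer writes next to each checkpoint. As a minimal sketch of how to inspect it, the Python below loads the file with only the standard library and separates the "loss" (training) records from the "eval_loss" (evaluation) records in "log_history"; the STATE_PATH value is a hypothetical placeholder, not something the state file itself specifies.

import json

STATE_PATH = "trainer_state.json"  # hypothetical path; point at your checkpoint's copy

with open(STATE_PATH) as f:
    state = json.load(f)

# Training-step records carry "loss"; evaluation records carry "eval_loss".
train_curve = [(rec["step"], rec["loss"])
               for rec in state["log_history"] if "loss" in rec]
eval_curve = [(rec["step"], rec["eval_loss"])
              for rec in state["log_history"] if "eval_loss" in rec]

print(f"train points: {len(train_curve)}, eval points: {len(eval_curve)}")
print(f"best eval_loss: {min(loss for _, loss in eval_curve):.4f}")

Run against this state, the last line should report the same value as the file's "best_metric" field, since the lowest eval_loss in the log is what the Trainer tracked as best.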