{
  "best_metric": 1.22141695022583,
  "best_model_checkpoint": "miner_id_24/checkpoint-500",
  "epoch": 0.06356521393667315,
  "eval_steps": 100,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0001271304278733463,
      "grad_norm": 0.5326213836669922,
      "learning_rate": 2e-05,
      "loss": 1.7856,
      "step": 1
    },
    {
      "epoch": 0.0001271304278733463,
      "eval_loss": 1.7346340417861938,
      "eval_runtime": 1249.3361,
      "eval_samples_per_second": 4.002,
      "eval_steps_per_second": 1.001,
      "step": 1
    },
    {
      "epoch": 0.0002542608557466926,
      "grad_norm": 0.5624520778656006,
      "learning_rate": 4e-05,
      "loss": 1.7626,
      "step": 2
    },
    {
      "epoch": 0.00038139128362003893,
      "grad_norm": 0.5890633463859558,
      "learning_rate": 6e-05,
      "loss": 1.739,
      "step": 3
    },
    {
      "epoch": 0.0005085217114933852,
      "grad_norm": 0.5437400937080383,
      "learning_rate": 8e-05,
      "loss": 1.6671,
      "step": 4
    },
    {
      "epoch": 0.0006356521393667316,
      "grad_norm": 0.6639446020126343,
      "learning_rate": 0.0001,
      "loss": 1.7684,
      "step": 5
    },
    {
      "epoch": 0.0007627825672400779,
      "grad_norm": 0.7031175494194031,
      "learning_rate": 0.00012,
      "loss": 1.7247,
      "step": 6
    },
    {
      "epoch": 0.0008899129951134241,
      "grad_norm": 0.5311002731323242,
      "learning_rate": 0.00014,
      "loss": 1.6195,
      "step": 7
    },
    {
      "epoch": 0.0010170434229867704,
      "grad_norm": 0.25101518630981445,
      "learning_rate": 0.00016,
      "loss": 1.5182,
      "step": 8
    },
    {
      "epoch": 0.0011441738508601168,
      "grad_norm": 0.8389205932617188,
      "learning_rate": 0.00018,
      "loss": 1.6646,
      "step": 9
    },
    {
      "epoch": 0.0012713042787334632,
      "grad_norm": 0.9317983388900757,
      "learning_rate": 0.0002,
      "loss": 1.6395,
      "step": 10
    },
    {
      "epoch": 0.0013984347066068094,
      "grad_norm": 0.48066481947898865,
      "learning_rate": 0.00019999832015210023,
      "loss": 1.5921,
      "step": 11
    },
    {
      "epoch": 0.0015255651344801557,
      "grad_norm": 0.2073744535446167,
      "learning_rate": 0.00019999328066483865,
      "loss": 1.4335,
      "step": 12
    },
    {
      "epoch": 0.0016526955623535021,
      "grad_norm": 0.22661754488945007,
      "learning_rate": 0.0001999848817075267,
      "loss": 1.477,
      "step": 13
    },
    {
      "epoch": 0.0017798259902268483,
      "grad_norm": 0.3149760663509369,
      "learning_rate": 0.00019997312356234386,
      "loss": 1.5713,
      "step": 14
    },
    {
      "epoch": 0.0019069564181001947,
      "grad_norm": 0.31392771005630493,
      "learning_rate": 0.00019995800662432798,
      "loss": 1.5414,
      "step": 15
    },
    {
      "epoch": 0.002034086845973541,
      "grad_norm": 0.2748984396457672,
      "learning_rate": 0.0001999395314013622,
      "loss": 1.5452,
      "step": 16
    },
    {
      "epoch": 0.0021612172738468874,
      "grad_norm": 0.19064395129680634,
      "learning_rate": 0.00019991769851415781,
      "loss": 1.5742,
      "step": 17
    },
    {
      "epoch": 0.0022883477017202336,
      "grad_norm": 0.1578415036201477,
      "learning_rate": 0.00019989250869623343,
      "loss": 1.5214,
      "step": 18
    },
    {
      "epoch": 0.0024154781295935798,
      "grad_norm": 0.20229928195476532,
      "learning_rate": 0.0001998639627938903,
      "loss": 1.3921,
      "step": 19
    },
    {
      "epoch": 0.0025426085574669264,
      "grad_norm": 0.2705669403076172,
      "learning_rate": 0.00019983206176618388,
      "loss": 1.4712,
      "step": 20
    },
    {
      "epoch": 0.0026697389853402725,
      "grad_norm": 0.27215054631233215,
      "learning_rate": 0.00019979680668489165,
      "loss": 1.4969,
      "step": 21
    },
    {
      "epoch": 0.0027968694132136187,
      "grad_norm": 0.20431619882583618,
      "learning_rate": 0.00019975819873447717,
      "loss": 1.431,
      "step": 22
    },
    {
      "epoch": 0.0029239998410869653,
      "grad_norm": 0.14068549871444702,
      "learning_rate": 0.00019971623921205005,
      "loss": 1.4543,
      "step": 23
    },
    {
      "epoch": 0.0030511302689603115,
      "grad_norm": 0.1594925820827484,
      "learning_rate": 0.00019967092952732264,
      "loss": 1.364,
      "step": 24
    },
    {
      "epoch": 0.0031782606968336576,
      "grad_norm": 0.17738410830497742,
      "learning_rate": 0.00019962227120256252,
      "loss": 1.4377,
      "step": 25
    },
    {
      "epoch": 0.0033053911247070042,
      "grad_norm": 0.1752498745918274,
      "learning_rate": 0.00019957026587254134,
      "loss": 1.3827,
      "step": 26
    },
    {
      "epoch": 0.0034325215525803504,
      "grad_norm": 0.18004052340984344,
      "learning_rate": 0.00019951491528448004,
      "loss": 1.3867,
      "step": 27
    },
    {
      "epoch": 0.0035596519804536966,
      "grad_norm": 0.1525774598121643,
      "learning_rate": 0.00019945622129799,
      "loss": 1.4164,
      "step": 28
    },
    {
      "epoch": 0.003686782408327043,
      "grad_norm": 0.1710219830274582,
      "learning_rate": 0.00019939418588501057,
      "loss": 1.4155,
      "step": 29
    },
    {
      "epoch": 0.0038139128362003893,
      "grad_norm": 0.20903100073337555,
      "learning_rate": 0.000199328811129743,
      "loss": 1.5239,
      "step": 30
    },
    {
      "epoch": 0.003941043264073736,
      "grad_norm": 0.18399456143379211,
      "learning_rate": 0.00019926009922858006,
      "loss": 1.3889,
      "step": 31
    },
    {
      "epoch": 0.004068173691947082,
      "grad_norm": 0.13330113887786865,
      "learning_rate": 0.0001991880524900327,
      "loss": 1.3587,
      "step": 32
    },
    {
      "epoch": 0.004195304119820428,
      "grad_norm": 0.13807597756385803,
      "learning_rate": 0.00019911267333465218,
      "loss": 1.4211,
      "step": 33
    },
    {
      "epoch": 0.004322434547693775,
      "grad_norm": 0.1550034135580063,
      "learning_rate": 0.0001990339642949488,
      "loss": 1.4317,
      "step": 34
    },
    {
      "epoch": 0.004449564975567121,
      "grad_norm": 0.1827971190214157,
      "learning_rate": 0.00019895192801530685,
      "loss": 1.4176,
      "step": 35
    },
    {
      "epoch": 0.004576695403440467,
      "grad_norm": 0.16028065979480743,
      "learning_rate": 0.00019886656725189575,
      "loss": 1.4122,
      "step": 36
    },
    {
      "epoch": 0.004703825831313814,
      "grad_norm": 0.14322146773338318,
      "learning_rate": 0.00019877788487257753,
      "loss": 1.423,
      "step": 37
    },
    {
      "epoch": 0.0048309562591871595,
      "grad_norm": 0.1685505211353302,
      "learning_rate": 0.00019868588385681032,
      "loss": 1.3702,
      "step": 38
    },
    {
      "epoch": 0.004958086687060506,
      "grad_norm": 0.1632552444934845,
      "learning_rate": 0.00019859056729554844,
      "loss": 1.3164,
      "step": 39
    },
    {
      "epoch": 0.005085217114933853,
      "grad_norm": 0.17149530351161957,
      "learning_rate": 0.00019849193839113833,
      "loss": 1.2799,
      "step": 40
    },
    {
      "epoch": 0.0052123475428071985,
      "grad_norm": 0.14993086457252502,
      "learning_rate": 0.00019839000045721118,
      "loss": 1.3412,
      "step": 41
    },
    {
      "epoch": 0.005339477970680545,
      "grad_norm": 0.15981823205947876,
      "learning_rate": 0.00019828475691857145,
      "loss": 1.3698,
      "step": 42
    },
    {
      "epoch": 0.005466608398553892,
      "grad_norm": 0.15311439335346222,
      "learning_rate": 0.00019817621131108196,
      "loss": 1.3792,
      "step": 43
    },
    {
      "epoch": 0.005593738826427237,
      "grad_norm": 0.19757720828056335,
      "learning_rate": 0.00019806436728154485,
      "loss": 1.4082,
      "step": 44
    },
    {
      "epoch": 0.005720869254300584,
      "grad_norm": 0.15765132009983063,
      "learning_rate": 0.00019794922858757928,
      "loss": 1.282,
      "step": 45
    },
    {
      "epoch": 0.005847999682173931,
      "grad_norm": 0.15940701961517334,
      "learning_rate": 0.00019783079909749515,
      "loss": 1.4016,
      "step": 46
    },
    {
      "epoch": 0.005975130110047276,
      "grad_norm": 0.1622331291437149,
      "learning_rate": 0.00019770908279016309,
      "loss": 1.3624,
      "step": 47
    },
    {
      "epoch": 0.006102260537920623,
      "grad_norm": 0.14866457879543304,
      "learning_rate": 0.00019758408375488071,
      "loss": 1.2807,
      "step": 48
    },
    {
      "epoch": 0.0062293909657939696,
      "grad_norm": 0.16648173332214355,
      "learning_rate": 0.00019745580619123535,
      "loss": 1.3617,
      "step": 49
    },
    {
      "epoch": 0.006356521393667315,
      "grad_norm": 0.16083629429340363,
      "learning_rate": 0.00019732425440896297,
      "loss": 1.3903,
      "step": 50
    },
    {
      "epoch": 0.006483651821540662,
      "grad_norm": 0.18012581765651703,
      "learning_rate": 0.00019718943282780323,
      "loss": 1.3472,
      "step": 51
    },
    {
      "epoch": 0.0066107822494140085,
      "grad_norm": 0.16914351284503937,
      "learning_rate": 0.00019705134597735113,
      "loss": 1.3765,
      "step": 52
    },
    {
      "epoch": 0.006737912677287354,
      "grad_norm": 0.15967325866222382,
      "learning_rate": 0.00019690999849690484,
      "loss": 1.3312,
      "step": 53
    },
    {
      "epoch": 0.006865043105160701,
      "grad_norm": 0.15996071696281433,
      "learning_rate": 0.00019676539513530968,
      "loss": 1.4227,
      "step": 54
    },
    {
      "epoch": 0.006992173533034047,
      "grad_norm": 0.1715613752603531,
      "learning_rate": 0.0001966175407507987,
      "loss": 1.3634,
      "step": 55
    },
    {
      "epoch": 0.007119303960907393,
      "grad_norm": 0.16633041203022003,
      "learning_rate": 0.00019646644031082948,
      "loss": 1.3279,
      "step": 56
    },
    {
      "epoch": 0.00724643438878074,
      "grad_norm": 0.16247525811195374,
      "learning_rate": 0.00019631209889191712,
      "loss": 1.3721,
      "step": 57
    },
    {
      "epoch": 0.007373564816654086,
      "grad_norm": 0.1603326052427292,
      "learning_rate": 0.00019615452167946385,
      "loss": 1.3212,
      "step": 58
    },
    {
      "epoch": 0.007500695244527432,
      "grad_norm": 0.16569744050502777,
      "learning_rate": 0.00019599371396758456,
      "loss": 1.3224,
      "step": 59
    },
    {
      "epoch": 0.007627825672400779,
      "grad_norm": 0.16010916233062744,
      "learning_rate": 0.0001958296811589293,
      "loss": 1.3022,
      "step": 60
    },
    {
      "epoch": 0.007754956100274125,
      "grad_norm": 0.1680569052696228,
      "learning_rate": 0.00019566242876450137,
      "loss": 1.3197,
      "step": 61
    },
    {
      "epoch": 0.007882086528147472,
      "grad_norm": 0.16432645916938782,
      "learning_rate": 0.00019549196240347248,
      "loss": 1.3167,
      "step": 62
    },
    {
      "epoch": 0.008009216956020818,
      "grad_norm": 0.17412547767162323,
      "learning_rate": 0.00019531828780299383,
      "loss": 1.3196,
      "step": 63
    },
    {
      "epoch": 0.008136347383894163,
      "grad_norm": 0.1736118346452713,
      "learning_rate": 0.0001951414107980036,
      "loss": 1.2966,
      "step": 64
    },
    {
      "epoch": 0.00826347781176751,
      "grad_norm": 0.16254910826683044,
      "learning_rate": 0.00019496133733103112,
      "loss": 1.3416,
      "step": 65
    },
    {
      "epoch": 0.008390608239640857,
      "grad_norm": 0.16831637918949127,
      "learning_rate": 0.00019477807345199714,
      "loss": 1.3396,
      "step": 66
    },
    {
      "epoch": 0.008517738667514202,
      "grad_norm": 0.1759282648563385,
      "learning_rate": 0.00019459162531801046,
      "loss": 1.3101,
      "step": 67
    },
    {
      "epoch": 0.00864486909538755,
      "grad_norm": 0.17314572632312775,
      "learning_rate": 0.00019440199919316123,
      "loss": 1.4026,
      "step": 68
    },
    {
      "epoch": 0.008771999523260895,
      "grad_norm": 0.17074303328990936,
      "learning_rate": 0.00019420920144831044,
      "loss": 1.3088,
      "step": 69
    },
    {
      "epoch": 0.008899129951134241,
      "grad_norm": 0.17773644626140594,
      "learning_rate": 0.0001940132385608757,
      "loss": 1.32,
      "step": 70
    },
    {
      "epoch": 0.009026260379007589,
      "grad_norm": 0.1736891269683838,
      "learning_rate": 0.0001938141171146141,
      "loss": 1.2865,
      "step": 71
    },
    {
      "epoch": 0.009153390806880934,
      "grad_norm": 0.17593072354793549,
      "learning_rate": 0.0001936118437994003,
      "loss": 1.3276,
      "step": 72
    },
    {
      "epoch": 0.00928052123475428,
      "grad_norm": 0.16799978911876678,
      "learning_rate": 0.00019340642541100248,
      "loss": 1.2585,
      "step": 73
    },
    {
      "epoch": 0.009407651662627628,
      "grad_norm": 0.17271657288074493,
      "learning_rate": 0.00019319786885085364,
      "loss": 1.3838,
      "step": 74
    },
    {
      "epoch": 0.009534782090500973,
      "grad_norm": 0.18318656086921692,
      "learning_rate": 0.0001929861811258197,
      "loss": 1.3857,
      "step": 75
    },
    {
      "epoch": 0.009661912518374319,
      "grad_norm": 0.1850346177816391,
      "learning_rate": 0.0001927713693479643,
      "loss": 1.3884,
      "step": 76
    },
    {
      "epoch": 0.009789042946247667,
      "grad_norm": 0.1707659363746643,
      "learning_rate": 0.0001925534407343097,
      "loss": 1.2674,
      "step": 77
    },
    {
      "epoch": 0.009916173374121012,
      "grad_norm": 0.1803124099969864,
      "learning_rate": 0.0001923324026065944,
      "loss": 1.2899,
      "step": 78
    },
    {
      "epoch": 0.010043303801994358,
      "grad_norm": 0.18143856525421143,
      "learning_rate": 0.0001921082623910271,
      "loss": 1.2967,
      "step": 79
    },
    {
      "epoch": 0.010170434229867705,
      "grad_norm": 0.17903882265090942,
      "learning_rate": 0.00019188102761803717,
      "loss": 1.2913,
      "step": 80
    },
    {
      "epoch": 0.010297564657741051,
      "grad_norm": 0.181906595826149,
      "learning_rate": 0.00019165070592202173,
      "loss": 1.2568,
      "step": 81
    },
    {
      "epoch": 0.010424695085614397,
      "grad_norm": 0.1655733585357666,
      "learning_rate": 0.00019141730504108922,
      "loss": 1.2758,
      "step": 82
    },
    {
      "epoch": 0.010551825513487744,
      "grad_norm": 0.17457562685012817,
      "learning_rate": 0.00019118083281679913,
      "loss": 1.2506,
      "step": 83
    },
    {
      "epoch": 0.01067895594136109,
      "grad_norm": 0.18457446992397308,
      "learning_rate": 0.00019094129719389886,
      "loss": 1.3701,
      "step": 84
    },
    {
      "epoch": 0.010806086369234436,
      "grad_norm": 0.17702506482601166,
      "learning_rate": 0.0001906987062200567,
      "loss": 1.3071,
      "step": 85
    },
    {
      "epoch": 0.010933216797107783,
      "grad_norm": 0.1763213723897934,
      "learning_rate": 0.0001904530680455914,
      "loss": 1.2996,
      "step": 86
    },
    {
      "epoch": 0.011060347224981129,
      "grad_norm": 0.17649979889392853,
      "learning_rate": 0.0001902043909231984,
      "loss": 1.314,
      "step": 87
    },
    {
      "epoch": 0.011187477652854475,
      "grad_norm": 0.1817278414964676,
      "learning_rate": 0.00018995268320767252,
      "loss": 1.315,
      "step": 88
    },
    {
      "epoch": 0.011314608080727822,
      "grad_norm": 0.17668454349040985,
      "learning_rate": 0.0001896979533556273,
      "loss": 1.2914,
      "step": 89
    },
    {
      "epoch": 0.011441738508601168,
      "grad_norm": 0.17107272148132324,
      "learning_rate": 0.0001894402099252109,
      "loss": 1.2884,
      "step": 90
    },
    {
      "epoch": 0.011568868936474514,
      "grad_norm": 0.18352022767066956,
      "learning_rate": 0.0001891794615758185,
      "loss": 1.3404,
      "step": 91
    },
    {
      "epoch": 0.011695999364347861,
      "grad_norm": 0.17999856173992157,
      "learning_rate": 0.00018891571706780146,
      "loss": 1.3001,
      "step": 92
    },
    {
      "epoch": 0.011823129792221207,
      "grad_norm": 0.17374040186405182,
      "learning_rate": 0.00018864898526217293,
      "loss": 1.266,
      "step": 93
    },
    {
      "epoch": 0.011950260220094553,
      "grad_norm": 0.18675245344638824,
      "learning_rate": 0.0001883792751203102,
      "loss": 1.3347,
      "step": 94
    },
    {
      "epoch": 0.0120773906479679,
      "grad_norm": 0.18314674496650696,
      "learning_rate": 0.0001881065957036536,
      "loss": 1.3224,
      "step": 95
    },
    {
      "epoch": 0.012204521075841246,
      "grad_norm": 0.17483913898468018,
      "learning_rate": 0.00018783095617340193,
      "loss": 1.2926,
      "step": 96
    },
    {
      "epoch": 0.012331651503714592,
      "grad_norm": 0.19491428136825562,
      "learning_rate": 0.00018755236579020502,
      "loss": 1.2636,
      "step": 97
    },
    {
      "epoch": 0.012458781931587939,
      "grad_norm": 0.17128659784793854,
      "learning_rate": 0.0001872708339138522,
      "loss": 1.2653,
      "step": 98
    },
    {
      "epoch": 0.012585912359461285,
      "grad_norm": 0.172018900513649,
      "learning_rate": 0.00018698637000295816,
      "loss": 1.2686,
      "step": 99
    },
    {
      "epoch": 0.01271304278733463,
      "grad_norm": 0.18891726434230804,
      "learning_rate": 0.0001866989836146449,
      "loss": 1.4058,
      "step": 100
    },
    {
      "epoch": 0.01271304278733463,
      "eval_loss": 1.2906723022460938,
      "eval_runtime": 1258.4463,
      "eval_samples_per_second": 3.973,
      "eval_steps_per_second": 0.993,
      "step": 100
    },
    {
      "epoch": 0.012840173215207978,
      "grad_norm": 0.1781390905380249,
      "learning_rate": 0.0001864086844042209,
      "loss": 1.3021,
      "step": 101
    },
    {
      "epoch": 0.012967303643081324,
      "grad_norm": 0.17100100219249725,
      "learning_rate": 0.00018611548212485647,
      "loss": 1.2574,
      "step": 102
    },
    {
      "epoch": 0.01309443407095467,
      "grad_norm": 0.18398095667362213,
      "learning_rate": 0.00018581938662725632,
      "loss": 1.2839,
      "step": 103
    },
    {
      "epoch": 0.013221564498828017,
      "grad_norm": 0.18981115520000458,
      "learning_rate": 0.00018552040785932845,
      "loss": 1.3149,
      "step": 104
    },
    {
      "epoch": 0.013348694926701363,
      "grad_norm": 0.18872378766536713,
      "learning_rate": 0.00018521855586584995,
      "loss": 1.279,
      "step": 105
    },
    {
      "epoch": 0.013475825354574708,
      "grad_norm": 0.1824631690979004,
      "learning_rate": 0.00018491384078812959,
      "loss": 1.2743,
      "step": 106
    },
    {
      "epoch": 0.013602955782448056,
      "grad_norm": 0.1971443146467209,
      "learning_rate": 0.000184606272863667,
      "loss": 1.3365,
      "step": 107
    },
    {
      "epoch": 0.013730086210321402,
      "grad_norm": 0.19964328408241272,
      "learning_rate": 0.00018429586242580884,
      "loss": 1.3184,
      "step": 108
    },
    {
      "epoch": 0.013857216638194747,
      "grad_norm": 0.17624543607234955,
      "learning_rate": 0.00018398261990340152,
      "loss": 1.2755,
      "step": 109
    },
    {
      "epoch": 0.013984347066068095,
      "grad_norm": 0.18599238991737366,
      "learning_rate": 0.00018366655582044094,
      "loss": 1.3025,
      "step": 110
    },
    {
      "epoch": 0.01411147749394144,
      "grad_norm": 0.19051305949687958,
      "learning_rate": 0.00018334768079571884,
      "loss": 1.351,
      "step": 111
    },
    {
      "epoch": 0.014238607921814786,
      "grad_norm": 0.1858106255531311,
      "learning_rate": 0.00018302600554246601,
      "loss": 1.2386,
      "step": 112
    },
    {
      "epoch": 0.014365738349688134,
      "grad_norm": 0.17598244547843933,
      "learning_rate": 0.00018270154086799239,
      "loss": 1.2687,
      "step": 113
    },
    {
      "epoch": 0.01449286877756148,
      "grad_norm": 0.18105947971343994,
      "learning_rate": 0.00018237429767332405,
      "loss": 1.2843,
      "step": 114
    },
    {
      "epoch": 0.014619999205434825,
      "grad_norm": 0.18796177208423615,
      "learning_rate": 0.00018204428695283687,
      "loss": 1.2999,
      "step": 115
    },
    {
      "epoch": 0.014747129633308173,
      "grad_norm": 0.18702763319015503,
      "learning_rate": 0.00018171151979388714,
      "loss": 1.2391,
      "step": 116
    },
    {
      "epoch": 0.014874260061181518,
      "grad_norm": 0.17469799518585205,
      "learning_rate": 0.00018137600737643913,
      "loss": 1.2915,
      "step": 117
    },
    {
      "epoch": 0.015001390489054864,
      "grad_norm": 0.1871766746044159,
      "learning_rate": 0.00018103776097268942,
      "loss": 1.2429,
      "step": 118
    },
    {
      "epoch": 0.015128520916928212,
      "grad_norm": 0.18426093459129333,
      "learning_rate": 0.00018069679194668826,
      "loss": 1.2678,
      "step": 119
    },
    {
      "epoch": 0.015255651344801557,
      "grad_norm": 0.1830713450908661,
      "learning_rate": 0.0001803531117539577,
      "loss": 1.3231,
      "step": 120
    },
    {
      "epoch": 0.015382781772674903,
      "grad_norm": 0.19156108796596527,
      "learning_rate": 0.00018000673194110668,
      "loss": 1.3426,
      "step": 121
    },
    {
      "epoch": 0.01550991220054825,
      "grad_norm": 0.18232569098472595,
      "learning_rate": 0.00017965766414544326,
      "loss": 1.2227,
      "step": 122
    },
    {
      "epoch": 0.015637042628421596,
      "grad_norm": 0.18696987628936768,
      "learning_rate": 0.00017930592009458352,
      "loss": 1.2933,
      "step": 123
    },
    {
      "epoch": 0.015764173056294944,
      "grad_norm": 0.18148070573806763,
      "learning_rate": 0.00017895151160605757,
      "loss": 1.3598,
      "step": 124
    },
    {
      "epoch": 0.015891303484168288,
      "grad_norm": 0.1859319657087326,
      "learning_rate": 0.00017859445058691247,
      "loss": 1.2688,
      "step": 125
    },
    {
      "epoch": 0.016018433912041635,
      "grad_norm": 0.18133966624736786,
      "learning_rate": 0.00017823474903331233,
      "loss": 1.2912,
      "step": 126
    },
    {
      "epoch": 0.016145564339914983,
      "grad_norm": 0.16695751249790192,
      "learning_rate": 0.0001778724190301351,
      "loss": 1.2772,
      "step": 127
    },
    {
      "epoch": 0.016272694767788327,
      "grad_norm": 0.17694084346294403,
      "learning_rate": 0.0001775074727505667,
      "loss": 1.2998,
      "step": 128
    },
    {
      "epoch": 0.016399825195661674,
      "grad_norm": 0.18545518815517426,
      "learning_rate": 0.0001771399224556919,
      "loss": 1.2996,
      "step": 129
    },
    {
      "epoch": 0.01652695562353502,
      "grad_norm": 0.1763446033000946,
      "learning_rate": 0.00017676978049408263,
      "loss": 1.2942,
      "step": 130
    },
    {
      "epoch": 0.016654086051408366,
      "grad_norm": 0.1751178801059723,
      "learning_rate": 0.00017639705930138272,
      "loss": 1.2491,
      "step": 131
    },
    {
      "epoch": 0.016781216479281713,
      "grad_norm": 0.17463481426239014,
      "learning_rate": 0.00017602177139989044,
      "loss": 1.3015,
      "step": 132
    },
    {
      "epoch": 0.01690834690715506,
      "grad_norm": 0.1884208619594574,
      "learning_rate": 0.0001756439293981377,
      "loss": 1.2555,
      "step": 133
    },
    {
      "epoch": 0.017035477335028405,
      "grad_norm": 0.1824871301651001,
      "learning_rate": 0.00017526354599046635,
      "loss": 1.3321,
      "step": 134
    },
    {
      "epoch": 0.017162607762901752,
      "grad_norm": 0.17852945625782013,
      "learning_rate": 0.00017488063395660177,
      "loss": 1.2134,
      "step": 135
    },
    {
      "epoch": 0.0172897381907751,
      "grad_norm": 0.17903351783752441,
      "learning_rate": 0.00017449520616122344,
      "loss": 1.202,
      "step": 136
    },
    {
      "epoch": 0.017416868618648444,
      "grad_norm": 0.19624289870262146,
      "learning_rate": 0.00017410727555353282,
      "loss": 1.2983,
      "step": 137
    },
    {
      "epoch": 0.01754399904652179,
      "grad_norm": 0.20271572470664978,
      "learning_rate": 0.00017371685516681825,
      "loss": 1.331,
      "step": 138
    },
    {
      "epoch": 0.01767112947439514,
      "grad_norm": 0.19160455465316772,
      "learning_rate": 0.00017332395811801707,
      "loss": 1.2325,
      "step": 139
    },
    {
      "epoch": 0.017798259902268482,
      "grad_norm": 0.19286282360553741,
      "learning_rate": 0.00017292859760727493,
      "loss": 1.3632,
      "step": 140
    },
    {
      "epoch": 0.01792539033014183,
      "grad_norm": 0.18525561690330505,
      "learning_rate": 0.00017253078691750227,
      "loss": 1.302,
      "step": 141
    },
    {
      "epoch": 0.018052520758015177,
      "grad_norm": 0.17999610304832458,
      "learning_rate": 0.00017213053941392818,
      "loss": 1.2617,
      "step": 142
    },
    {
      "epoch": 0.01817965118588852,
      "grad_norm": 0.1817435920238495,
      "learning_rate": 0.00017172786854365116,
      "loss": 1.285,
      "step": 143
    },
    {
      "epoch": 0.01830678161376187,
      "grad_norm": 0.18393941223621368,
      "learning_rate": 0.00017132278783518756,
      "loss": 1.2033,
      "step": 144
    },
    {
      "epoch": 0.018433912041635216,
      "grad_norm": 0.18280182778835297,
      "learning_rate": 0.00017091531089801694,
      "loss": 1.2454,
      "step": 145
    },
    {
      "epoch": 0.01856104246950856,
      "grad_norm": 0.17269238829612732,
      "learning_rate": 0.00017050545142212483,
      "loss": 1.2137,
      "step": 146
    },
    {
      "epoch": 0.018688172897381908,
      "grad_norm": 0.18515561521053314,
      "learning_rate": 0.00017009322317754278,
      "loss": 1.2876,
      "step": 147
    },
    {
      "epoch": 0.018815303325255255,
      "grad_norm": 0.18649280071258545,
      "learning_rate": 0.0001696786400138859,
      "loss": 1.3279,
      "step": 148
    },
    {
      "epoch": 0.0189424337531286,
      "grad_norm": 0.18008284270763397,
      "learning_rate": 0.00016926171585988727,
      "loss": 1.1943,
      "step": 149
    },
    {
      "epoch": 0.019069564181001947,
      "grad_norm": 0.18855896592140198,
      "learning_rate": 0.00016884246472293016,
      "loss": 1.3458,
      "step": 150
    },
    {
      "epoch": 0.019196694608875294,
      "grad_norm": 0.18721222877502441,
      "learning_rate": 0.00016842090068857742,
      "loss": 1.205,
      "step": 151
    },
    {
      "epoch": 0.019323825036748638,
      "grad_norm": 0.18609726428985596,
      "learning_rate": 0.00016799703792009827,
      "loss": 1.3147,
      "step": 152
    },
    {
      "epoch": 0.019450955464621986,
      "grad_norm": 0.18827542662620544,
      "learning_rate": 0.00016757089065799226,
      "loss": 1.2053,
      "step": 153
    },
    {
      "epoch": 0.019578085892495333,
      "grad_norm": 0.19211921095848083,
      "learning_rate": 0.00016714247321951106,
      "loss": 1.2881,
      "step": 154
    },
    {
      "epoch": 0.019705216320368677,
      "grad_norm": 0.1911146342754364,
      "learning_rate": 0.0001667117999981774,
      "loss": 1.2841,
      "step": 155
    },
    {
      "epoch": 0.019832346748242025,
      "grad_norm": 0.1876746416091919,
      "learning_rate": 0.00016627888546330138,
      "loss": 1.2795,
      "step": 156
    },
    {
      "epoch": 0.019959477176115372,
      "grad_norm": 0.18275220692157745,
      "learning_rate": 0.00016584374415949443,
      "loss": 1.2646,
      "step": 157
    },
    {
      "epoch": 0.020086607603988716,
      "grad_norm": 0.19240595400333405,
      "learning_rate": 0.0001654063907061807,
      "loss": 1.2286,
      "step": 158
    },
    {
      "epoch": 0.020213738031862064,
      "grad_norm": 0.17621144652366638,
      "learning_rate": 0.00016496683979710575,
      "loss": 1.2623,
      "step": 159
    },
    {
      "epoch": 0.02034086845973541,
      "grad_norm": 0.18566247820854187,
      "learning_rate": 0.000164525106199843,
      "loss": 1.2915,
      "step": 160
    },
    {
      "epoch": 0.020467998887608755,
      "grad_norm": 0.19843867421150208,
      "learning_rate": 0.00016408120475529763,
      "loss": 1.1703,
      "step": 161
    },
    {
      "epoch": 0.020595129315482102,
      "grad_norm": 0.20230089128017426,
      "learning_rate": 0.00016363515037720773,
      "loss": 1.274,
      "step": 162
    },
    {
      "epoch": 0.02072225974335545,
      "grad_norm": 0.1874382644891739,
      "learning_rate": 0.00016318695805164359,
      "loss": 1.267,
      "step": 163
    },
    {
      "epoch": 0.020849390171228794,
      "grad_norm": 0.19301468133926392,
      "learning_rate": 0.0001627366428365039,
      "loss": 1.3385,
      "step": 164
    },
    {
      "epoch": 0.02097652059910214,
      "grad_norm": 0.1960678994655609,
      "learning_rate": 0.00016228421986101005,
      "loss": 1.2469,
      "step": 165
    },
    {
      "epoch": 0.02110365102697549,
      "grad_norm": 0.2149035483598709,
      "learning_rate": 0.00016182970432519772,
      "loss": 1.2695,
      "step": 166
    },
    {
      "epoch": 0.021230781454848833,
      "grad_norm": 0.1928316354751587,
      "learning_rate": 0.00016137311149940633,
      "loss": 1.2581,
      "step": 167
    },
    {
      "epoch": 0.02135791188272218,
      "grad_norm": 0.18403369188308716,
      "learning_rate": 0.0001609144567237658,
      "loss": 1.2872,
      "step": 168
    },
    {
      "epoch": 0.021485042310595528,
      "grad_norm": 0.18688054382801056,
      "learning_rate": 0.00016045375540768136,
      "loss": 1.2762,
      "step": 169
    },
    {
      "epoch": 0.021612172738468872,
      "grad_norm": 0.19875864684581757,
      "learning_rate": 0.00015999102302931585,
      "loss": 1.2773,
      "step": 170
    },
    {
      "epoch": 0.02173930316634222,
      "grad_norm": 0.19474861025810242,
      "learning_rate": 0.0001595262751350695,
      "loss": 1.2329,
      "step": 171
    },
    {
      "epoch": 0.021866433594215567,
      "grad_norm": 0.1946505606174469,
      "learning_rate": 0.00015905952733905775,
      "loss": 1.1726,
      "step": 172
    },
    {
      "epoch": 0.02199356402208891,
      "grad_norm": 0.18479324877262115,
      "learning_rate": 0.00015859079532258677,
      "loss": 1.3177,
      "step": 173
    },
    {
      "epoch": 0.022120694449962258,
      "grad_norm": 0.19268646836280823,
      "learning_rate": 0.00015812009483362642,
      "loss": 1.2721,
      "step": 174
    },
    {
      "epoch": 0.022247824877835606,
      "grad_norm": 0.18371957540512085,
      "learning_rate": 0.0001576474416862812,
      "loss": 1.3083,
      "step": 175
    },
    {
      "epoch": 0.02237495530570895,
      "grad_norm": 0.1987624615430832,
      "learning_rate": 0.00015717285176025913,
      "loss": 1.2582,
      "step": 176
    },
    {
      "epoch": 0.022502085733582297,
      "grad_norm": 0.19360652565956116,
      "learning_rate": 0.00015669634100033797,
      "loss": 1.2597,
      "step": 177
    },
    {
      "epoch": 0.022629216161455645,
      "grad_norm": 0.1875244826078415,
      "learning_rate": 0.00015621792541582966,
      "loss": 1.2637,
      "step": 178
    },
    {
      "epoch": 0.02275634658932899,
      "grad_norm": 0.19594229757785797,
      "learning_rate": 0.00015573762108004262,
      "loss": 1.2907,
      "step": 179
    },
    {
      "epoch": 0.022883477017202336,
      "grad_norm": 0.1935066133737564,
      "learning_rate": 0.00015525544412974132,
      "loss": 1.2446,
      "step": 180
    },
    {
      "epoch": 0.023010607445075684,
      "grad_norm": 0.19178606569766998,
      "learning_rate": 0.0001547714107646046,
      "loss": 1.2644,
      "step": 181
    },
    {
      "epoch": 0.023137737872949028,
      "grad_norm": 0.18824580311775208,
      "learning_rate": 0.00015428553724668103,
      "loss": 1.2592,
      "step": 182
    },
    {
      "epoch": 0.023264868300822375,
      "grad_norm": 0.1857818067073822,
      "learning_rate": 0.00015379783989984277,
      "loss": 1.2547,
      "step": 183
    },
    {
      "epoch": 0.023391998728695722,
      "grad_norm": 0.18491147458553314,
      "learning_rate": 0.00015330833510923718,
      "loss": 1.3073,
      "step": 184
    },
    {
      "epoch": 0.023519129156569066,
      "grad_norm": 0.19134363532066345,
      "learning_rate": 0.00015281703932073612,
      "loss": 1.2456,
      "step": 185
    },
    {
      "epoch": 0.023646259584442414,
      "grad_norm": 0.18579505383968353,
      "learning_rate": 0.0001523239690403835,
      "loss": 1.2626,
      "step": 186
    },
    {
      "epoch": 0.02377339001231576,
      "grad_norm": 0.18687140941619873,
      "learning_rate": 0.0001518291408338409,
      "loss": 1.2795,
      "step": 187
    },
    {
      "epoch": 0.023900520440189105,
      "grad_norm": 0.1869836449623108,
      "learning_rate": 0.00015133257132583073,
      "loss": 1.2111,
      "step": 188
    },
    {
      "epoch": 0.024027650868062453,
      "grad_norm": 0.18433886766433716,
      "learning_rate": 0.00015083427719957793,
      "loss": 1.1969,
      "step": 189
    },
    {
      "epoch": 0.0241547812959358,
      "grad_norm": 0.19012001156806946,
      "learning_rate": 0.0001503342751962493,
      "loss": 1.2973,
      "step": 190
    },
    {
      "epoch": 0.024281911723809144,
      "grad_norm": 0.18975861370563507,
      "learning_rate": 0.00014983258211439117,
      "loss": 1.2964,
      "step": 191
    },
    {
      "epoch": 0.024409042151682492,
      "grad_norm": 0.17685554921627045,
      "learning_rate": 0.0001493292148093649,
      "loss": 1.2763,
      "step": 192
    },
    {
      "epoch": 0.02453617257955584,
      "grad_norm": 0.19333194196224213,
      "learning_rate": 0.00014882419019278075,
      "loss": 1.3203,
      "step": 193
    },
    {
      "epoch": 0.024663303007429183,
      "grad_norm": 0.19778768718242645,
      "learning_rate": 0.00014831752523192948,
      "loss": 1.3204,
      "step": 194
    },
    {
      "epoch": 0.02479043343530253,
      "grad_norm": 0.1869363635778427,
      "learning_rate": 0.00014780923694921255,
      "loss": 1.2258,
      "step": 195
    },
    {
      "epoch": 0.024917563863175878,
      "grad_norm": 0.17671674489974976,
      "learning_rate": 0.00014729934242157004,
      "loss": 1.1667,
      "step": 196
    },
    {
      "epoch": 0.025044694291049222,
      "grad_norm": 0.1893490105867386,
      "learning_rate": 0.00014678785877990697,
      "loss": 1.3572,
      "step": 197
    },
    {
      "epoch": 0.02517182471892257,
      "grad_norm": 0.19606593251228333,
      "learning_rate": 0.00014627480320851774,
      "loss": 1.2507,
      "step": 198
    },
    {
      "epoch": 0.025298955146795917,
      "grad_norm": 0.20087891817092896,
      "learning_rate": 0.00014576019294450888,
      "loss": 1.3149,
      "step": 199
    },
    {
      "epoch": 0.02542608557466926,
      "grad_norm": 0.1857730895280838,
      "learning_rate": 0.00014524404527721977,
      "loss": 1.2893,
      "step": 200
    },
    {
      "epoch": 0.02542608557466926,
      "eval_loss": 1.2551084756851196,
      "eval_runtime": 1258.1994,
      "eval_samples_per_second": 3.974,
      "eval_steps_per_second": 0.993,
      "step": 200
    },
    {
      "epoch": 0.02555321600254261,
      "grad_norm": 0.18368631601333618,
      "learning_rate": 0.00014472637754764196,
      "loss": 1.2125,
      "step": 201
    },
    {
      "epoch": 0.025680346430415956,
      "grad_norm": 0.18972043693065643,
      "learning_rate": 0.00014420720714783636,
      "loss": 1.2131,
      "step": 202
    },
    {
      "epoch": 0.0258074768582893,
      "grad_norm": 0.18747109174728394,
      "learning_rate": 0.00014368655152034908,
      "loss": 1.2224,
      "step": 203
    },
    {
      "epoch": 0.025934607286162648,
      "grad_norm": 0.18962696194648743,
      "learning_rate": 0.00014316442815762544,
      "loss": 1.2613,
      "step": 204
    },
    {
      "epoch": 0.026061737714035995,
      "grad_norm": 0.18641987442970276,
      "learning_rate": 0.00014264085460142202,
      "loss": 1.2525,
      "step": 205
    },
    {
      "epoch": 0.02618886814190934,
      "grad_norm": 0.19106072187423706,
      "learning_rate": 0.0001421158484422177,
      "loss": 1.2549,
      "step": 206
    },
    {
      "epoch": 0.026315998569782686,
      "grad_norm": 0.19771872460842133,
      "learning_rate": 0.0001415894273186223,
      "loss": 1.2612,
      "step": 207
    },
    {
      "epoch": 0.026443128997656034,
      "grad_norm": 0.18108506500720978,
      "learning_rate": 0.0001410616089167842,
      "loss": 1.2114,
      "step": 208
    },
    {
      "epoch": 0.026570259425529378,
      "grad_norm": 0.17011211812496185,
      "learning_rate": 0.0001405324109697961,
      "loss": 1.2695,
      "step": 209
    },
    {
      "epoch": 0.026697389853402725,
      "grad_norm": 0.1930396556854248,
      "learning_rate": 0.00014000185125709918,
      "loss": 1.211,
      "step": 210
    },
    {
      "epoch": 0.026824520281276073,
      "grad_norm": 0.19416122138500214,
      "learning_rate": 0.00013946994760388582,
      "loss": 1.1772,
      "step": 211
    },
    {
      "epoch": 0.026951650709149417,
      "grad_norm": 0.18353648483753204,
      "learning_rate": 0.00013893671788050074,
      "loss": 1.2672,
      "step": 212
    },
    {
      "epoch": 0.027078781137022764,
      "grad_norm": 0.18951141834259033,
      "learning_rate": 0.00013840218000184053,
      "loss": 1.3209,
      "step": 213
    },
    {
      "epoch": 0.027205911564896112,
      "grad_norm": 0.19500471651554108,
      "learning_rate": 0.00013786635192675184,
      "loss": 1.2519,
      "step": 214
    },
    {
      "epoch": 0.027333041992769456,
      "grad_norm": 0.1958056539297104,
      "learning_rate": 0.00013732925165742805,
      "loss": 1.208,
      "step": 215
    },
    {
      "epoch": 0.027460172420642803,
      "grad_norm": 0.1859259307384491,
      "learning_rate": 0.00013679089723880427,
      "loss": 1.2715,
      "step": 216
    },
    {
      "epoch": 0.02758730284851615,
      "grad_norm": 0.18673139810562134,
      "learning_rate": 0.00013625130675795134,
      "loss": 1.292,
      "step": 217
    },
    {
      "epoch": 0.027714433276389495,
      "grad_norm": 0.17959700524806976,
      "learning_rate": 0.00013571049834346799,
      "loss": 1.1896,
      "step": 218
    },
    {
      "epoch": 0.027841563704262842,
      "grad_norm": 0.1903347671031952,
      "learning_rate": 0.0001351684901648718,
      "loss": 1.3381,
      "step": 219
    },
    {
      "epoch": 0.02796869413213619,
      "grad_norm": 0.18993370234966278,
      "learning_rate": 0.00013462530043198873,
      "loss": 1.2739,
      "step": 220
    },
    {
      "epoch": 0.028095824560009534,
      "grad_norm": 0.18846477568149567,
      "learning_rate": 0.0001340809473943415,
      "loss": 1.2399,
      "step": 221
    },
    {
      "epoch": 0.02822295498788288,
      "grad_norm": 0.18699532747268677,
      "learning_rate": 0.00013353544934053616,
      "loss": 1.2061,
      "step": 222
    },
    {
      "epoch": 0.02835008541575623,
      "grad_norm": 0.19469809532165527,
      "learning_rate": 0.00013298882459764798,
      "loss": 1.2455,
      "step": 223
    },
    {
      "epoch": 0.028477215843629573,
      "grad_norm": 0.19830243289470673,
      "learning_rate": 0.00013244109153060548,
      "loss": 1.2542,
      "step": 224
    },
    {
      "epoch": 0.02860434627150292,
      "grad_norm": 0.20483078062534332,
      "learning_rate": 0.0001318922685415735,
      "loss": 1.2287,
      "step": 225
    },
    {
      "epoch": 0.028731476699376268,
      "grad_norm": 0.190695121884346,
      "learning_rate": 0.00013134237406933492,
      "loss": 1.2165,
      "step": 226
    },
    {
      "epoch": 0.02885860712724961,
      "grad_norm": 0.19430223107337952,
      "learning_rate": 0.00013079142658867124,
      "loss": 1.2922,
      "step": 227
    },
    {
      "epoch": 0.02898573755512296,
      "grad_norm": 0.1994917094707489,
      "learning_rate": 0.00013023944460974183,
      "loss": 1.2402,
      "step": 228
    },
    {
      "epoch": 0.029112867982996306,
      "grad_norm": 0.20195803046226501,
      "learning_rate": 0.00012968644667746206,
      "loss": 1.2594,
      "step": 229
    },
    {
      "epoch": 0.02923999841086965,
      "grad_norm": 0.19695453345775604,
      "learning_rate": 0.00012913245137088024,
      "loss": 1.2762,
      "step": 230
    },
    {
      "epoch": 0.029367128838742998,
      "grad_norm": 0.19726547598838806,
      "learning_rate": 0.00012857747730255338,
      "loss": 1.2494,
      "step": 231
    },
    {
      "epoch": 0.029494259266616345,
      "grad_norm": 0.19146564602851868,
      "learning_rate": 0.00012802154311792197,
      "loss": 1.2312,
      "step": 232
    },
    {
      "epoch": 0.02962138969448969,
      "grad_norm": 0.19849611818790436,
      "learning_rate": 0.00012746466749468345,
      "loss": 1.2186,
      "step": 233
    },
    {
      "epoch": 0.029748520122363037,
      "grad_norm": 0.18684804439544678,
      "learning_rate": 0.00012690686914216474,
      "loss": 1.1775,
      "step": 234
    },
    {
      "epoch": 0.029875650550236384,
      "grad_norm": 0.19955115020275116,
      "learning_rate": 0.0001263481668006937,
      "loss": 1.2476,
      "step": 235
    },
    {
      "epoch": 0.03000278097810973,
      "grad_norm": 0.20034034550189972,
      "learning_rate": 0.00012578857924096934,
      "loss": 1.2307,
      "step": 236
    },
    {
      "epoch": 0.030129911405983076,
      "grad_norm": 0.1980581283569336,
      "learning_rate": 0.00012522812526343148,
      "loss": 1.2332,
      "step": 237
    },
    {
      "epoch": 0.030257041833856423,
      "grad_norm": 0.1966305524110794,
      "learning_rate": 0.00012466682369762882,
      "loss": 1.2219,
      "step": 238
    },
    {
      "epoch": 0.030384172261729767,
      "grad_norm": 0.19543439149856567,
      "learning_rate": 0.00012410469340158655,
      "loss": 1.2998,
      "step": 239
    },
    {
      "epoch": 0.030511302689603115,
      "grad_norm": 0.19517168402671814,
      "learning_rate": 0.00012354175326117253,
      "loss": 1.2451,
      "step": 240
    },
    {
      "epoch": 0.030638433117476462,
      "grad_norm": 0.18800078332424164,
      "learning_rate": 0.00012297802218946306,
      "loss": 1.2349,
      "step": 241
    },
    {
      "epoch": 0.030765563545349806,
      "grad_norm": 0.20408718287944794,
      "learning_rate": 0.00012241351912610726,
      "loss": 1.3123,
      "step": 242
    },
    {
      "epoch": 0.030892693973223154,
      "grad_norm": 0.19570188224315643,
      "learning_rate": 0.00012184826303669083,
      "loss": 1.2181,
      "step": 243
    },
    {
      "epoch": 0.0310198244010965,
      "grad_norm": 0.1854136884212494,
      "learning_rate": 0.00012128227291209891,
      "loss": 1.2298,
      "step": 244
    },
    {
      "epoch": 0.031146954828969845,
      "grad_norm": 0.19532455503940582,
      "learning_rate": 0.00012071556776787786,
      "loss": 1.3124,
      "step": 245
    },
    {
      "epoch": 0.03127408525684319,
      "grad_norm": 0.18832701444625854,
      "learning_rate": 0.00012014816664359671,
      "loss": 1.1565,
      "step": 246
    },
    {
      "epoch": 0.03140121568471654,
      "grad_norm": 0.19303160905838013,
      "learning_rate": 0.0001195800886022071,
      "loss": 1.2329,
      "step": 247
    },
    {
      "epoch": 0.03152834611258989,
      "grad_norm": 0.1881171315908432,
      "learning_rate": 0.0001190113527294032,
      "loss": 1.2456,
      "step": 248
    },
    {
      "epoch": 0.03165547654046323,
      "grad_norm": 0.19656208157539368,
      "learning_rate": 0.00011844197813298017,
      "loss": 1.2959,
      "step": 249
    },
    {
      "epoch": 0.031782606968336576,
      "grad_norm": 0.19458794593811035,
      "learning_rate": 0.0001178719839421925,
      "loss": 1.2968,
      "step": 250
    },
    {
      "epoch": 0.031909737396209926,
      "grad_norm": 0.1953679323196411,
      "learning_rate": 0.00011730138930711101,
      "loss": 1.3225,
      "step": 251
    },
    {
      "epoch": 0.03203686782408327,
      "grad_norm": 0.18624471127986908,
      "learning_rate": 0.00011673021339797967,
      "loss": 1.2895,
      "step": 252
    },
    {
      "epoch": 0.032163998251956614,
      "grad_norm": 0.1975700408220291,
      "learning_rate": 0.00011615847540457157,
      "loss": 1.2272,
      "step": 253
    },
    {
      "epoch": 0.032291128679829965,
      "grad_norm": 0.18464815616607666,
      "learning_rate": 0.000115586194535544,
      "loss": 1.1589,
      "step": 254
    },
    {
      "epoch": 0.03241825910770331,
      "grad_norm": 0.19085770845413208,
      "learning_rate": 0.00011501339001779332,
      "loss": 1.2129,
      "step": 255
    },
    {
      "epoch": 0.03254538953557665,
      "grad_norm": 0.19415773451328278,
      "learning_rate": 0.00011444008109580884,
      "loss": 1.2209,
      "step": 256
    },
    {
      "epoch": 0.032672519963450004,
      "grad_norm": 0.20239484310150146,
      "learning_rate": 0.00011386628703102633,
      "loss": 1.2872,
      "step": 257
    },
    {
      "epoch": 0.03279965039132335,
      "grad_norm": 0.18557807803153992,
      "learning_rate": 0.00011329202710118088,
      "loss": 1.2661,
      "step": 258
    },
    {
      "epoch": 0.03292678081919669,
      "grad_norm": 0.19118450582027435,
      "learning_rate": 0.00011271732059965925,
      "loss": 1.2781,
      "step": 259
    },
    {
      "epoch": 0.03305391124707004,
      "grad_norm": 0.1958242654800415,
      "learning_rate": 0.00011214218683485158,
      "loss": 1.2579,
      "step": 260
    },
    {
      "epoch": 0.03318104167494339,
      "grad_norm": 0.1829763948917389,
      "learning_rate": 0.00011156664512950287,
      "loss": 1.2359,
      "step": 261
    },
    {
      "epoch": 0.03330817210281673,
      "grad_norm": 0.18603093922138214,
      "learning_rate": 0.00011099071482006361,
      "loss": 1.2487,
      "step": 262
    },
    {
      "epoch": 0.03343530253069008,
      "grad_norm": 0.18507151305675507,
      "learning_rate": 0.00011041441525604014,
      "loss": 1.2339,
      "step": 263
    },
    {
      "epoch": 0.033562432958563426,
      "grad_norm": 0.198081374168396,
      "learning_rate": 0.00010983776579934482,
      "loss": 1.1937,
      "step": 264
    },
    {
      "epoch": 0.03368956338643677,
      "grad_norm": 0.19567249715328217,
      "learning_rate": 0.00010926078582364514,
      "loss": 1.2447,
      "step": 265
    },
    {
      "epoch": 0.03381669381431012,
      "grad_norm": 0.1892256885766983,
      "learning_rate": 0.00010868349471371315,
      "loss": 1.2011,
      "step": 266
    },
    {
      "epoch": 0.033943824242183465,
      "grad_norm": 0.19196678698062897,
      "learning_rate": 0.000108105911864774,
      "loss": 1.2341,
      "step": 267
    },
    {
      "epoch": 0.03407095467005681,
      "grad_norm": 0.20349054038524628,
      "learning_rate": 0.00010752805668185442,
      "loss": 1.2582,
      "step": 268
    },
    {
      "epoch": 0.03419808509793016,
      "grad_norm": 0.19051875174045563,
      "learning_rate": 0.0001069499485791307,
      "loss": 1.2593,
      "step": 269
    },
    {
      "epoch": 0.034325215525803504,
      "grad_norm": 0.18554829061031342,
      "learning_rate": 0.00010637160697927651,
      "loss": 1.1395,
      "step": 270
    },
    {
      "epoch": 0.03445234595367685,
      "grad_norm": 0.18687109649181366,
      "learning_rate": 0.00010579305131281025,
      "loss": 1.2079,
      "step": 271
    },
    {
      "epoch": 0.0345794763815502,
      "grad_norm": 0.19098562002182007,
      "learning_rate": 0.00010521430101744239,
      "loss": 1.2147,
      "step": 272
    },
    {
      "epoch": 0.03470660680942354,
      "grad_norm": 0.19619682431221008,
      "learning_rate": 0.00010463537553742225,
      "loss": 1.1458,
      "step": 273
    },
    {
      "epoch": 0.03483373723729689,
      "grad_norm": 0.1930547058582306,
      "learning_rate": 0.00010405629432288488,
      "loss": 1.2704,
      "step": 274
    },
    {
      "epoch": 0.03496086766517024,
      "grad_norm": 0.19838818907737732,
      "learning_rate": 0.00010347707682919754,
      "loss": 1.2228,
      "step": 275
    },
    {
      "epoch": 0.03508799809304358,
      "grad_norm": 0.1933777928352356,
      "learning_rate": 0.00010289774251630602,
      "loss": 1.189,
      "step": 276
    },
    {
      "epoch": 0.035215128520916926,
      "grad_norm": 0.20109447836875916,
      "learning_rate": 0.0001023183108480809,
      "loss": 1.2199,
      "step": 277
    },
    {
      "epoch": 0.03534225894879028,
      "grad_norm": 0.18909600377082825,
      "learning_rate": 0.00010173880129166358,
      "loss": 1.1529,
      "step": 278
    },
    {
      "epoch": 0.03546938937666362,
      "grad_norm": 0.18809406459331512,
      "learning_rate": 0.00010115923331681232,
      "loss": 1.2183,
      "step": 279
    },
    {
      "epoch": 0.035596519804536965,
      "grad_norm": 0.191794291138649,
      "learning_rate": 0.00010057962639524798,
      "loss": 1.2621,
      "step": 280
    },
    {
      "epoch": 0.035723650232410316,
      "grad_norm": 0.19512364268302917,
      "learning_rate": 0.0001,
      "loss": 1.1575,
      "step": 281
    },
    {
      "epoch": 0.03585078066028366,
      "grad_norm": 0.18720309436321259,
      "learning_rate": 9.942037360475205e-05,
      "loss": 1.2435,
      "step": 282
    },
    {
      "epoch": 0.035977911088157004,
      "grad_norm": 0.20254142582416534,
      "learning_rate": 9.884076668318773e-05,
      "loss": 1.3042,
      "step": 283
    },
    {
      "epoch": 0.036105041516030355,
      "grad_norm": 0.1859116405248642,
      "learning_rate": 9.826119870833643e-05,
      "loss": 1.2607,
      "step": 284
    },
    {
      "epoch": 0.0362321719439037,
      "grad_norm": 0.20392954349517822,
      "learning_rate": 9.768168915191913e-05,
      "loss": 1.2478,
      "step": 285
    },
    {
      "epoch": 0.03635930237177704,
      "grad_norm": 0.20078176259994507,
      "learning_rate": 9.710225748369401e-05,
      "loss": 1.2359,
      "step": 286
    },
    {
      "epoch": 0.036486432799650394,
      "grad_norm": 0.1939856857061386,
      "learning_rate": 9.65229231708025e-05,
      "loss": 1.2484,
      "step": 287
    },
    {
      "epoch": 0.03661356322752374,
      "grad_norm": 0.19322216510772705,
      "learning_rate": 9.594370567711513e-05,
      "loss": 1.3249,
      "step": 288
    },
    {
      "epoch": 0.03674069365539708,
      "grad_norm": 0.1958608329296112,
      "learning_rate": 9.536462446257776e-05,
      "loss": 1.2212,
      "step": 289
    },
    {
      "epoch": 0.03686782408327043,
      "grad_norm": 0.2111774981021881,
      "learning_rate": 9.478569898255765e-05,
      "loss": 1.3251,
      "step": 290
    },
    {
      "epoch": 0.03699495451114378,
      "grad_norm": 0.20403634011745453,
      "learning_rate": 9.420694868718977e-05,
      "loss": 1.1831,
      "step": 291
    },
    {
      "epoch": 0.03712208493901712,
      "grad_norm": 0.20421482622623444,
      "learning_rate": 9.362839302072354e-05,
      "loss": 1.2513,
      "step": 292
    },
    {
      "epoch": 0.03724921536689047,
      "grad_norm": 0.1911601573228836,
      "learning_rate": 9.305005142086932e-05,
      "loss": 1.1875,
      "step": 293
    },
    {
      "epoch": 0.037376345794763816,
      "grad_norm": 0.20054025948047638,
      "learning_rate": 9.247194331814562e-05,
      "loss": 1.214,
      "step": 294
    },
    {
      "epoch": 0.03750347622263716,
      "grad_norm": 0.198801651597023,
      "learning_rate": 9.1894088135226e-05,
      "loss": 1.1993,
      "step": 295
    },
    {
      "epoch": 0.03763060665051051,
      "grad_norm": 0.1995508074760437,
      "learning_rate": 9.131650528628687e-05,
      "loss": 1.1986,
      "step": 296
    },
    {
      "epoch": 0.037757737078383855,
      "grad_norm": 0.20325009524822235,
      "learning_rate": 9.073921417635486e-05,
      "loss": 1.2292,
      "step": 297
    },
    {
      "epoch": 0.0378848675062572,
      "grad_norm": 0.193388894200325,
      "learning_rate": 9.016223420065519e-05,
      "loss": 1.2304,
      "step": 298
    },
    {
      "epoch": 0.03801199793413055,
      "grad_norm": 0.1861080676317215,
      "learning_rate": 8.958558474395987e-05,
      "loss": 1.2843,
      "step": 299
    },
    {
      "epoch": 0.03813912836200389,
      "grad_norm": 0.19273944199085236,
      "learning_rate": 8.900928517993644e-05,
      "loss": 1.111,
      "step": 300
    },
    {
      "epoch": 0.03813912836200389,
      "eval_loss": 1.2362135648727417,
      "eval_runtime": 1257.3039,
      "eval_samples_per_second": 3.977,
      "eval_steps_per_second": 0.994,
      "step": 300
    },
    {
      "epoch": 0.03826625878987724,
      "grad_norm": 0.19571375846862793,
      "learning_rate": 8.843335487049712e-05,
      "loss": 1.2259,
      "step": 301
    },
    {
      "epoch": 0.03839338921775059,
      "grad_norm": 0.19907177984714508,
      "learning_rate": 8.785781316514841e-05,
      "loss": 1.1946,
      "step": 302
    },
    {
      "epoch": 0.03852051964562393,
      "grad_norm": 0.19650404155254364,
      "learning_rate": 8.728267940034078e-05,
      "loss": 1.1977,
      "step": 303
    },
    {
      "epoch": 0.038647650073497276,
      "grad_norm": 0.1937037855386734,
      "learning_rate": 8.670797289881915e-05,
      "loss": 1.1719,
      "step": 304
    },
    {
      "epoch": 0.03877478050137063,
      "grad_norm": 0.2028638869524002,
      "learning_rate": 8.61337129689737e-05,
      "loss": 1.224,
      "step": 305
    },
    {
      "epoch": 0.03890191092924397,
      "grad_norm": 0.19181466102600098,
      "learning_rate": 8.555991890419117e-05,
      "loss": 1.2375,
      "step": 306
    },
    {
      "epoch": 0.039029041357117315,
      "grad_norm": 0.1887066513299942,
      "learning_rate": 8.498660998220669e-05,
      "loss": 1.1786,
      "step": 307
    },
    {
      "epoch": 0.039156171784990666,
      "grad_norm": 0.18820159137248993,
      "learning_rate": 8.441380546445603e-05,
      "loss": 1.2536,
      "step": 308
    },
    {
      "epoch": 0.03928330221286401,
      "grad_norm": 0.18667809665203094,
      "learning_rate": 8.384152459542848e-05,
      "loss": 1.2834,
      "step": 309
    },
    {
      "epoch": 0.039410432640737354,
      "grad_norm": 0.19369632005691528,
      "learning_rate": 8.326978660202034e-05,
      "loss": 1.2989,
      "step": 310
    },
    {
      "epoch": 0.039537563068610705,
      "grad_norm": 0.18864746391773224,
      "learning_rate": 8.269861069288903e-05,
      "loss": 1.292,
      "step": 311
    },
    {
      "epoch": 0.03966469349648405,
      "grad_norm": 0.19906029105186462,
      "learning_rate": 8.212801605780753e-05,
      "loss": 1.2855,
      "step": 312
    },
    {
      "epoch": 0.03979182392435739,
      "grad_norm": 0.19700987637043,
      "learning_rate": 8.155802186701984e-05,
      "loss": 1.1771,
      "step": 313
    },
    {
      "epoch": 0.039918954352230744,
      "grad_norm": 0.20134317874908447,
      "learning_rate": 8.098864727059685e-05,
      "loss": 1.1995,
      "step": 314
    },
    {
      "epoch": 0.04004608478010409,
      "grad_norm": 0.1883343607187271,
      "learning_rate": 8.04199113977929e-05,
      "loss": 1.2433,
      "step": 315
    },
    {
      "epoch": 0.04017321520797743,
      "grad_norm": 0.19041708111763,
      "learning_rate": 7.985183335640331e-05,
      "loss": 1.2538,
      "step": 316
    },
    {
      "epoch": 0.04030034563585078,
      "grad_norm": 0.1838679164648056,
      "learning_rate": 7.928443223212215e-05,
      "loss": 1.2025,
      "step": 317
    },
    {
      "epoch": 0.04042747606372413,
      "grad_norm": 0.19493237137794495,
      "learning_rate": 7.871772708790114e-05,
      "loss": 1.2553,
      "step": 318
    },
    {
      "epoch": 0.04055460649159747,
      "grad_norm": 0.197859525680542,
      "learning_rate": 7.815173696330919e-05,
      "loss": 1.2661,
      "step": 319
    },
    {
      "epoch": 0.04068173691947082,
      "grad_norm": 0.19427183270454407,
      "learning_rate": 7.758648087389277e-05,
      "loss": 1.2121,
      "step": 320
    },
    {
      "epoch": 0.040808867347344166,
      "grad_norm": 0.19236573576927185,
      "learning_rate": 7.702197781053696e-05,
      "loss": 1.2375,
      "step": 321
    },
    {
      "epoch": 0.04093599777521751,
      "grad_norm": 0.19856838881969452,
      "learning_rate": 7.645824673882748e-05,
      "loss": 1.2648,
      "step": 322
    },
    {
      "epoch": 0.04106312820309086,
      "grad_norm": 0.20721471309661865,
      "learning_rate": 7.589530659841349e-05,
      "loss": 1.2503,
      "step": 323
    },
    {
      "epoch": 0.041190258630964205,
      "grad_norm": 0.19413287937641144,
      "learning_rate": 7.533317630237117e-05,
      "loss": 1.265,
      "step": 324
    },
    {
      "epoch": 0.04131738905883755,
      "grad_norm": 0.1948065459728241,
      "learning_rate": 7.477187473656853e-05,
      "loss": 1.2581,
      "step": 325
    },
    {
      "epoch": 0.0414445194867109,
      "grad_norm": 0.19630448520183563,
      "learning_rate": 7.421142075903067e-05,
      "loss": 1.2013,
      "step": 326
    },
    {
      "epoch": 0.041571649914584244,
      "grad_norm": 0.18867121636867523,
      "learning_rate": 7.365183319930635e-05,
      "loss": 1.1628,
      "step": 327
    },
    {
      "epoch": 0.04169878034245759,
      "grad_norm": 0.2017098367214203,
      "learning_rate": 7.309313085783524e-05,
      "loss": 1.1882,
      "step": 328
    },
    {
      "epoch": 0.04182591077033094,
      "grad_norm": 0.19574840366840363,
      "learning_rate": 7.253533250531656e-05,
      "loss": 1.1917,
      "step": 329
    },
    {
      "epoch": 0.04195304119820428,
      "grad_norm": 0.2003111094236374,
      "learning_rate": 7.197845688207805e-05,
      "loss": 1.3069,
      "step": 330
    },
    {
      "epoch": 0.04208017162607763,
      "grad_norm": 0.19444699585437775,
      "learning_rate": 7.142252269744665e-05,
      "loss": 1.1623,
      "step": 331
    },
    {
      "epoch": 0.04220730205395098,
      "grad_norm": 0.19306592643260956,
      "learning_rate": 7.086754862911982e-05,
      "loss": 1.2512,
      "step": 332
    },
    {
      "epoch": 0.04233443248182432,
      "grad_norm": 0.1940678060054779,
      "learning_rate": 7.031355332253795e-05,
      "loss": 1.2404,
      "step": 333
    },
    {
      "epoch": 0.042461562909697666,
      "grad_norm": 0.20104342699050903,
      "learning_rate": 6.976055539025818e-05,
      "loss": 1.1826,
      "step": 334
    },
    {
      "epoch": 0.04258869333757102,
      "grad_norm": 0.20509012043476105,
      "learning_rate": 6.92085734113288e-05,
      "loss": 1.2247,
      "step": 335
    },
    {
      "epoch": 0.04271582376544436,
      "grad_norm": 0.2038545161485672,
      "learning_rate": 6.865762593066513e-05,
      "loss": 1.25,
      "step": 336
    },
    {
      "epoch": 0.042842954193317705,
      "grad_norm": 0.20035366714000702,
      "learning_rate": 6.810773145842653e-05,
      "loss": 1.2243,
      "step": 337
    },
    {
      "epoch": 0.042970084621191056,
      "grad_norm": 0.20092110335826874,
      "learning_rate": 6.755890846939454e-05,
      "loss": 1.2279,
      "step": 338
    },
    {
      "epoch": 0.0430972150490644,
      "grad_norm": 0.20261281728744507,
      "learning_rate": 6.701117540235204e-05,
      "loss": 1.2418,
      "step": 339
    },
    {
      "epoch": 0.043224345476937744,
      "grad_norm": 0.20916980504989624,
      "learning_rate": 6.646455065946386e-05,
      "loss": 1.2205,
      "step": 340
    },
    {
      "epoch": 0.043351475904811095,
      "grad_norm": 0.1868792027235031,
      "learning_rate": 6.591905260565852e-05,
      "loss": 1.2149,
      "step": 341
    },
    {
      "epoch": 0.04347860633268444,
      "grad_norm": 0.19856908917427063,
      "learning_rate": 6.537469956801128e-05,
      "loss": 1.2518,
      "step": 342
    },
    {
      "epoch": 0.04360573676055778,
      "grad_norm": 0.19585344195365906,
      "learning_rate": 6.483150983512823e-05,
      "loss": 1.2202,
      "step": 343
    },
    {
      "epoch": 0.04373286718843113,
      "grad_norm": 0.19705970585346222,
      "learning_rate": 6.428950165653204e-05,
      "loss": 1.2701,
      "step": 344
    },
    {
      "epoch": 0.04385999761630448,
      "grad_norm": 0.19830965995788574,
      "learning_rate": 6.374869324204869e-05,
      "loss": 1.2132,
      "step": 345
    },
    {
      "epoch": 0.04398712804417782,
      "grad_norm": 0.20360921323299408,
      "learning_rate": 6.320910276119576e-05,
      "loss": 1.1979,
      "step": 346
    },
    {
      "epoch": 0.04411425847205117,
      "grad_norm": 0.20261693000793457,
      "learning_rate": 6.267074834257199e-05,
      "loss": 1.2231,
      "step": 347
    },
    {
      "epoch": 0.044241388899924516,
      "grad_norm": 0.19419489800930023,
      "learning_rate": 6.213364807324818e-05,
      "loss": 1.1575,
      "step": 348
    },
    {
      "epoch": 0.04436851932779786,
      "grad_norm": 0.2025313675403595,
      "learning_rate": 6.15978199981595e-05,
      "loss": 1.2566,
      "step": 349
    },
    {
      "epoch": 0.04449564975567121,
      "grad_norm": 0.19754880666732788,
      "learning_rate": 6.106328211949928e-05,
      "loss": 1.2821,
      "step": 350
    },
    {
      "epoch": 0.044622780183544555,
      "grad_norm": 0.20343464612960815,
      "learning_rate": 6.053005239611418e-05,
      "loss": 1.2204,
      "step": 351
    },
    {
      "epoch": 0.0447499106114179,
      "grad_norm": 0.20527192950248718,
      "learning_rate": 5.999814874290084e-05,
      "loss": 1.2513,
      "step": 352
    },
    {
      "epoch": 0.04487704103929125,
      "grad_norm": 0.19321362674236298,
      "learning_rate": 5.946758903020393e-05,
      "loss": 1.2466,
      "step": 353
    },
    {
      "epoch": 0.045004171467164594,
      "grad_norm": 0.20470896363258362,
      "learning_rate": 5.893839108321584e-05,
      "loss": 1.2846,
      "step": 354
    },
    {
      "epoch": 0.04513130189503794,
      "grad_norm": 0.19084323942661285,
      "learning_rate": 5.841057268137771e-05,
      "loss": 1.2126,
      "step": 355
    },
    {
      "epoch": 0.04525843232291129,
      "grad_norm": 0.19587008655071259,
      "learning_rate": 5.7884151557782305e-05,
      "loss": 1.1983,
      "step": 356
    },
    {
      "epoch": 0.04538556275078463,
      "grad_norm": 0.20390859246253967,
      "learning_rate": 5.735914539857798e-05,
      "loss": 1.1981,
      "step": 357
    },
    {
      "epoch": 0.04551269317865798,
      "grad_norm": 0.19584935903549194,
      "learning_rate": 5.68355718423746e-05,
      "loss": 1.2039,
      "step": 358
    },
    {
      "epoch": 0.04563982360653133,
      "grad_norm": 0.19530071318149567,
      "learning_rate": 5.6313448479650946e-05,
      "loss": 1.236,
      "step": 359
    },
    {
      "epoch": 0.04576695403440467,
      "grad_norm": 0.19659969210624695,
|
"learning_rate": 5.579279285216369e-05, |
|
"loss": 1.1936, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.045894084462278016, |
|
"grad_norm": 0.1933298110961914, |
|
"learning_rate": 5.527362245235805e-05, |
|
"loss": 1.227, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.04602121489015137, |
|
"grad_norm": 0.20280398428440094, |
|
"learning_rate": 5.475595472278024e-05, |
|
"loss": 1.2644, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.04614834531802471, |
|
"grad_norm": 0.1918189376592636, |
|
"learning_rate": 5.4239807055491135e-05, |
|
"loss": 1.1495, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.046275475745898055, |
|
"grad_norm": 0.2044762223958969, |
|
"learning_rate": 5.372519679148227e-05, |
|
"loss": 1.241, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.046402606173771406, |
|
"grad_norm": 0.1972542256116867, |
|
"learning_rate": 5.321214122009306e-05, |
|
"loss": 1.1419, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.04652973660164475, |
|
"grad_norm": 0.20039339363574982, |
|
"learning_rate": 5.270065757843e-05, |
|
"loss": 1.2718, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.046656867029518094, |
|
"grad_norm": 0.1938110589981079, |
|
"learning_rate": 5.219076305078749e-05, |
|
"loss": 1.1947, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.046783997457391445, |
|
"grad_norm": 0.20640990138053894, |
|
"learning_rate": 5.168247476807053e-05, |
|
"loss": 1.1526, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.04691112788526479, |
|
"grad_norm": 0.198054239153862, |
|
"learning_rate": 5.11758098072193e-05, |
|
"loss": 1.1965, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.04703825831313813, |
|
"grad_norm": 0.19604484736919403, |
|
"learning_rate": 5.067078519063514e-05, |
|
"loss": 1.2568, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.047165388741011484, |
|
"grad_norm": 0.2095029056072235, |
|
"learning_rate": 5.016741788560889e-05, |
|
"loss": 1.2822, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.04729251916888483, |
|
"grad_norm": 0.20356985926628113, |
|
"learning_rate": 4.9665724803750756e-05, |
|
"loss": 1.1434, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.04741964959675817, |
|
"grad_norm": 0.19989654421806335, |
|
"learning_rate": 4.9165722800422096e-05, |
|
"loss": 1.2767, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.04754678002463152, |
|
"grad_norm": 0.19582509994506836, |
|
"learning_rate": 4.86674286741693e-05, |
|
"loss": 1.2693, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.04767391045250487, |
|
"grad_norm": 0.1962389498949051, |
|
"learning_rate": 4.8170859166159144e-05, |
|
"loss": 1.3266, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.04780104088037821, |
|
"grad_norm": 0.2056453377008438, |
|
"learning_rate": 4.7676030959616526e-05, |
|
"loss": 1.3004, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.04792817130825156, |
|
"grad_norm": 0.19587452709674835, |
|
"learning_rate": 4.71829606792639e-05, |
|
"loss": 1.2154, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.048055301736124906, |
|
"grad_norm": 0.19662117958068848, |
|
"learning_rate": 4.669166489076283e-05, |
|
"loss": 1.2434, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.04818243216399825, |
|
"grad_norm": 0.19508899748325348, |
|
"learning_rate": 4.620216010015724e-05, |
|
"loss": 1.2319, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.0483095625918716, |
|
"grad_norm": 0.19653861224651337, |
|
"learning_rate": 4.571446275331903e-05, |
|
"loss": 1.2006, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.048436693019744945, |
|
"grad_norm": 0.1971856951713562, |
|
"learning_rate": 4.5228589235395436e-05, |
|
"loss": 1.2937, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.04856382344761829, |
|
"grad_norm": 0.2165059596300125, |
|
"learning_rate": 4.4744555870258694e-05, |
|
"loss": 1.2722, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.04869095387549164, |
|
"grad_norm": 0.20020557940006256, |
|
"learning_rate": 4.4262378919957413e-05, |
|
"loss": 1.1947, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.048818084303364984, |
|
"grad_norm": 0.19455446302890778, |
|
"learning_rate": 4.378207458417035e-05, |
|
"loss": 1.1956, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.04894521473123833, |
|
"grad_norm": 0.202660471200943, |
|
"learning_rate": 4.3303658999662086e-05, |
|
"loss": 1.2553, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.04907234515911168, |
|
"grad_norm": 0.19681531190872192, |
|
"learning_rate": 4.282714823974088e-05, |
|
"loss": 1.2031, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.04919947558698502, |
|
"grad_norm": 0.20613734424114227, |
|
"learning_rate": 4.2352558313718795e-05, |
|
"loss": 1.2384, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.04932660601485837, |
|
"grad_norm": 0.1990024596452713, |
|
"learning_rate": 4.1879905166373614e-05, |
|
"loss": 1.2184, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.04945373644273172, |
|
"grad_norm": 0.21309691667556763, |
|
"learning_rate": 4.140920467741325e-05, |
|
"loss": 1.1853, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.04958086687060506, |
|
"grad_norm": 0.19488035142421722, |
|
"learning_rate": 4.094047266094225e-05, |
|
"loss": 1.1804, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.049707997298478405, |
|
"grad_norm": 0.19738849997520447, |
|
"learning_rate": 4.047372486493054e-05, |
|
"loss": 1.2534, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.049835127726351756, |
|
"grad_norm": 0.20008018612861633, |
|
"learning_rate": 4.0008976970684176e-05, |
|
"loss": 1.2723, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.0499622581542251, |
|
"grad_norm": 0.19521461427211761, |
|
"learning_rate": 3.954624459231866e-05, |
|
"loss": 1.1705, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.050089388582098444, |
|
"grad_norm": 0.20466111600399017, |
|
"learning_rate": 3.908554327623425e-05, |
|
"loss": 1.154, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.050216519009971795, |
|
"grad_norm": 0.2047969251871109, |
|
"learning_rate": 3.8626888500593695e-05, |
|
"loss": 1.2139, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.05034364943784514, |
|
"grad_norm": 0.1980600655078888, |
|
"learning_rate": 3.817029567480228e-05, |
|
"loss": 1.279, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.05047077986571848, |
|
"grad_norm": 0.20217813551425934, |
|
"learning_rate": 3.771578013898996e-05, |
|
"loss": 1.2561, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.050597910293591834, |
|
"grad_norm": 0.1985122561454773, |
|
"learning_rate": 3.726335716349612e-05, |
|
"loss": 1.2778, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.05072504072146518, |
|
"grad_norm": 0.19889195263385773, |
|
"learning_rate": 3.681304194835641e-05, |
|
"loss": 1.3225, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.05085217114933852, |
|
"grad_norm": 0.19213935732841492, |
|
"learning_rate": 3.6364849622792266e-05, |
|
"loss": 1.2308, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05085217114933852, |
|
"eval_loss": 1.2253398895263672, |
|
"eval_runtime": 1257.7786, |
|
"eval_samples_per_second": 3.975, |
|
"eval_steps_per_second": 0.994, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05097930157721187, |
|
"grad_norm": 0.19073964655399323, |
|
"learning_rate": 3.5918795244702396e-05, |
|
"loss": 1.1798, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.05110643200508522, |
|
"grad_norm": 0.19348806142807007, |
|
"learning_rate": 3.547489380015701e-05, |
|
"loss": 1.2429, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.05123356243295856, |
|
"grad_norm": 0.201893150806427, |
|
"learning_rate": 3.503316020289429e-05, |
|
"loss": 1.2302, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.05136069286083191, |
|
"grad_norm": 0.20248207449913025, |
|
"learning_rate": 3.459360929381931e-05, |
|
"loss": 1.2295, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.051487823288705256, |
|
"grad_norm": 0.20291946828365326, |
|
"learning_rate": 3.415625584050557e-05, |
|
"loss": 1.2925, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.0516149537165786, |
|
"grad_norm": 0.19170844554901123, |
|
"learning_rate": 3.372111453669864e-05, |
|
"loss": 1.1825, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.05174208414445195, |
|
"grad_norm": 0.1890149712562561, |
|
"learning_rate": 3.328820000182262e-05, |
|
"loss": 1.149, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.051869214572325295, |
|
"grad_norm": 0.20486074686050415, |
|
"learning_rate": 3.285752678048892e-05, |
|
"loss": 1.1458, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.05199634500019864, |
|
"grad_norm": 0.20006342232227325, |
|
"learning_rate": 3.242910934200775e-05, |
|
"loss": 1.2031, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.05212347542807199, |
|
"grad_norm": 0.20113137364387512, |
|
"learning_rate": 3.2002962079901744e-05, |
|
"loss": 1.2474, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.052250605855945334, |
|
"grad_norm": 0.1928662657737732, |
|
"learning_rate": 3.157909931142257e-05, |
|
"loss": 1.2189, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.05237773628381868, |
|
"grad_norm": 0.19416049122810364, |
|
"learning_rate": 3.115753527706986e-05, |
|
"loss": 1.2506, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.05250486671169203, |
|
"grad_norm": 0.19863539934158325, |
|
"learning_rate": 3.073828414011274e-05, |
|
"loss": 1.2019, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.05263199713956537, |
|
"grad_norm": 0.19736243784427643, |
|
"learning_rate": 3.0321359986114096e-05, |
|
"loss": 1.2718, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.05275912756743872, |
|
"grad_norm": 0.200786292552948, |
|
"learning_rate": 2.9906776822457205e-05, |
|
"loss": 1.2523, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.05288625799531207, |
|
"grad_norm": 0.19629159569740295, |
|
"learning_rate": 2.9494548577875192e-05, |
|
"loss": 1.2156, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.05301338842318541, |
|
"grad_norm": 0.19293853640556335, |
|
"learning_rate": 2.9084689101983075e-05, |
|
"loss": 1.2422, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.053140518851058756, |
|
"grad_norm": 0.19834856688976288, |
|
"learning_rate": 2.8677212164812462e-05, |
|
"loss": 1.1569, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.05326764927893211, |
|
"grad_norm": 0.19939422607421875, |
|
"learning_rate": 2.827213145634887e-05, |
|
"loss": 1.18, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.05339477970680545, |
|
"grad_norm": 0.20803573727607727, |
|
"learning_rate": 2.7869460586071873e-05, |
|
"loss": 1.2536, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.053521910134678795, |
|
"grad_norm": 0.20102088153362274, |
|
"learning_rate": 2.7469213082497736e-05, |
|
"loss": 1.2664, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.053649040562552146, |
|
"grad_norm": 0.20080140233039856, |
|
"learning_rate": 2.7071402392725097e-05, |
|
"loss": 1.2931, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.05377617099042549, |
|
"grad_norm": 0.19506552815437317, |
|
"learning_rate": 2.6676041881982962e-05, |
|
"loss": 1.2461, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.053903301418298834, |
|
"grad_norm": 0.19534236192703247, |
|
"learning_rate": 2.6283144833181783e-05, |
|
"loss": 1.1971, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.054030431846172185, |
|
"grad_norm": 0.20050783455371857, |
|
"learning_rate": 2.589272444646723e-05, |
|
"loss": 1.169, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.05415756227404553, |
|
"grad_norm": 0.201995849609375, |
|
"learning_rate": 2.5504793838776586e-05, |
|
"loss": 1.2758, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.05428469270191887, |
|
"grad_norm": 0.20075969398021698, |
|
"learning_rate": 2.5119366043398264e-05, |
|
"loss": 1.2455, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.054411823129792224, |
|
"grad_norm": 0.19450737535953522, |
|
"learning_rate": 2.473645400953366e-05, |
|
"loss": 1.2319, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.05453895355766557, |
|
"grad_norm": 0.19657433032989502, |
|
"learning_rate": 2.4356070601862324e-05, |
|
"loss": 1.2031, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.05466608398553891, |
|
"grad_norm": 0.1956198513507843, |
|
"learning_rate": 2.3978228600109565e-05, |
|
"loss": 1.2345, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05479321441341226, |
|
"grad_norm": 0.19880710542201996, |
|
"learning_rate": 2.3602940698617325e-05, |
|
"loss": 1.3102, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.05492034484128561, |
|
"grad_norm": 0.19965411722660065, |
|
"learning_rate": 2.3230219505917426e-05, |
|
"loss": 1.2873, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.05504747526915895, |
|
"grad_norm": 0.19630952179431915, |
|
"learning_rate": 2.2860077544308124e-05, |
|
"loss": 1.272, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.0551746056970323, |
|
"grad_norm": 0.19560639560222626, |
|
"learning_rate": 2.249252724943336e-05, |
|
"loss": 1.2593, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.055301736124905645, |
|
"grad_norm": 0.20019298791885376, |
|
"learning_rate": 2.2127580969864925e-05, |
|
"loss": 1.2723, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.05542886655277899, |
|
"grad_norm": 0.20072917640209198, |
|
"learning_rate": 2.176525096668769e-05, |
|
"loss": 1.2069, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.05555599698065234, |
|
"grad_norm": 0.20204520225524902, |
|
"learning_rate": 2.1405549413087544e-05, |
|
"loss": 1.2361, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.055683127408525684, |
|
"grad_norm": 0.20441173017024994, |
|
"learning_rate": 2.1048488393942454e-05, |
|
"loss": 1.1882, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.05581025783639903, |
|
"grad_norm": 0.1971142739057541, |
|
"learning_rate": 2.0694079905416473e-05, |
|
"loss": 1.2168, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.05593738826427238, |
|
"grad_norm": 0.20565344393253326, |
|
"learning_rate": 2.0342335854556737e-05, |
|
"loss": 1.2486, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.05606451869214572, |
|
"grad_norm": 0.19731251895427704, |
|
"learning_rate": 1.9993268058893343e-05, |
|
"loss": 1.116, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.05619164912001907, |
|
"grad_norm": 0.19447311758995056, |
|
"learning_rate": 1.964688824604234e-05, |
|
"loss": 1.2497, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.05631877954789242, |
|
"grad_norm": 0.2041405290365219, |
|
"learning_rate": 1.930320805331176e-05, |
|
"loss": 1.2843, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.05644590997576576, |
|
"grad_norm": 0.20058204233646393, |
|
"learning_rate": 1.896223902731058e-05, |
|
"loss": 1.2685, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.056573040403639106, |
|
"grad_norm": 0.20194143056869507, |
|
"learning_rate": 1.8623992623560893e-05, |
|
"loss": 1.2751, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.05670017083151246, |
|
"grad_norm": 0.20336030423641205, |
|
"learning_rate": 1.828848020611288e-05, |
|
"loss": 1.2362, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.0568273012593858, |
|
"grad_norm": 0.2054579108953476, |
|
"learning_rate": 1.7955713047163157e-05, |
|
"loss": 1.2254, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.056954431687259145, |
|
"grad_norm": 0.20549102127552032, |
|
"learning_rate": 1.762570232667595e-05, |
|
"loss": 1.2574, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.057081562115132496, |
|
"grad_norm": 0.2029975950717926, |
|
"learning_rate": 1.7298459132007627e-05, |
|
"loss": 1.2066, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.05720869254300584, |
|
"grad_norm": 0.20551042258739471, |
|
"learning_rate": 1.6973994457534026e-05, |
|
"loss": 1.2384, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.057335822970879184, |
|
"grad_norm": 0.20290708541870117, |
|
"learning_rate": 1.6652319204281187e-05, |
|
"loss": 1.2257, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.057462953398752535, |
|
"grad_norm": 0.202660471200943, |
|
"learning_rate": 1.6333444179559078e-05, |
|
"loss": 1.1865, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.05759008382662588, |
|
"grad_norm": 0.19935418665409088, |
|
"learning_rate": 1.601738009659849e-05, |
|
"loss": 1.2445, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.05771721425449922, |
|
"grad_norm": 0.19397136569023132, |
|
"learning_rate": 1.5704137574191203e-05, |
|
"loss": 1.2246, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.057844344682372574, |
|
"grad_norm": 0.20033963024616241, |
|
"learning_rate": 1.5393727136333035e-05, |
|
"loss": 1.2452, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.05797147511024592, |
|
"grad_norm": 0.19725894927978516, |
|
"learning_rate": 1.5086159211870442e-05, |
|
"loss": 1.1902, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.05809860553811926, |
|
"grad_norm": 0.21174763143062592, |
|
"learning_rate": 1.4781444134150047e-05, |
|
"loss": 1.1956, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.05822573596599261, |
|
"grad_norm": 0.20643867552280426, |
|
"learning_rate": 1.447959214067155e-05, |
|
"loss": 1.2708, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.05835286639386596, |
|
"grad_norm": 0.203439399600029, |
|
"learning_rate": 1.4180613372743679e-05, |
|
"loss": 1.1935, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.0584799968217393, |
|
"grad_norm": 0.2026398777961731, |
|
"learning_rate": 1.3884517875143544e-05, |
|
"loss": 1.2331, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.05860712724961265, |
|
"grad_norm": 0.20543427765369415, |
|
"learning_rate": 1.3591315595779108e-05, |
|
"loss": 1.2039, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.058734257677485996, |
|
"grad_norm": 0.2049439400434494, |
|
"learning_rate": 1.3301016385355092e-05, |
|
"loss": 1.1847, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.05886138810535934, |
|
"grad_norm": 0.20000450313091278, |
|
"learning_rate": 1.3013629997041853e-05, |
|
"loss": 1.2432, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.05898851853323269, |
|
"grad_norm": 0.20123903453350067, |
|
"learning_rate": 1.2729166086147803e-05, |
|
"loss": 1.2698, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.059115648961106035, |
|
"grad_norm": 0.20424886047840118, |
|
"learning_rate": 1.2447634209795e-05, |
|
"loss": 1.2635, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.05924277938897938, |
|
"grad_norm": 0.19165809452533722, |
|
"learning_rate": 1.2169043826598058e-05, |
|
"loss": 1.1772, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.05936990981685273, |
|
"grad_norm": 0.19973435997962952, |
|
"learning_rate": 1.1893404296346423e-05, |
|
"loss": 1.2326, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.059497040244726074, |
|
"grad_norm": 0.20723304152488708, |
|
"learning_rate": 1.1620724879689792e-05, |
|
"loss": 1.2451, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.05962417067259942, |
|
"grad_norm": 0.1965390294790268, |
|
"learning_rate": 1.135101473782706e-05, |
|
"loss": 1.2154, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.05975130110047277, |
|
"grad_norm": 0.19724096357822418, |
|
"learning_rate": 1.1084282932198541e-05, |
|
"loss": 1.2092, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.05987843152834611, |
|
"grad_norm": 0.19244475662708282, |
|
"learning_rate": 1.0820538424181515e-05, |
|
"loss": 1.12, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.06000556195621946, |
|
"grad_norm": 0.20514468848705292, |
|
"learning_rate": 1.0559790074789133e-05, |
|
"loss": 1.1499, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.06013269238409281, |
|
"grad_norm": 0.19560429453849792, |
|
"learning_rate": 1.030204664437271e-05, |
|
"loss": 1.2266, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.06025982281196615, |
|
"grad_norm": 0.20317231118679047, |
|
"learning_rate": 1.0047316792327499e-05, |
|
"loss": 1.1913, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.060386953239839496, |
|
"grad_norm": 0.19489817321300507, |
|
"learning_rate": 9.795609076801625e-06, |
|
"loss": 1.2461, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.06051408366771285, |
|
"grad_norm": 0.1974310576915741, |
|
"learning_rate": 9.546931954408622e-06, |
|
"loss": 1.1443, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.06064121409558619, |
|
"grad_norm": 0.19223681092262268, |
|
"learning_rate": 9.301293779943321e-06, |
|
"loss": 1.1864, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.060768344523459535, |
|
"grad_norm": 0.2029593586921692, |
|
"learning_rate": 9.058702806101172e-06, |
|
"loss": 1.2081, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.060895474951332886, |
|
"grad_norm": 0.20288948714733124, |
|
"learning_rate": 8.819167183200905e-06, |
|
"loss": 1.2794, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.06102260537920623, |
|
"grad_norm": 0.20905464887619019, |
|
"learning_rate": 8.58269495891081e-06, |
|
"loss": 1.2125, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.061149735807079574, |
|
"grad_norm": 0.2005111575126648, |
|
"learning_rate": 8.349294077978265e-06, |
|
"loss": 1.2183, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.061276866234952924, |
|
"grad_norm": 0.20153559744358063, |
|
"learning_rate": 8.118972381962853e-06, |
|
"loss": 1.2213, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.06140399666282627, |
|
"grad_norm": 0.20067910850048065, |
|
"learning_rate": 7.891737608972927e-06, |
|
"loss": 1.1919, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.06153112709069961, |
|
"grad_norm": 0.2007380872964859, |
|
"learning_rate": 7.6675973934056e-06, |
|
"loss": 1.2182, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.06165825751857296, |
|
"grad_norm": 0.19200630486011505, |
|
"learning_rate": 7.4465592656903114e-06, |
|
"loss": 1.2089, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.06178538794644631, |
|
"grad_norm": 0.19756072759628296, |
|
"learning_rate": 7.228630652035717e-06, |
|
"loss": 1.2236, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.06191251837431965, |
|
"grad_norm": 0.20330612361431122, |
|
"learning_rate": 7.0138188741803225e-06, |
|
"loss": 1.1603, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.062039648802193, |
|
"grad_norm": 0.2018778920173645, |
|
"learning_rate": 6.802131149146373e-06, |
|
"loss": 1.2256, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.062166779230066346, |
|
"grad_norm": 0.19810867309570312, |
|
"learning_rate": 6.59357458899752e-06, |
|
"loss": 1.2123, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.06229390965793969, |
|
"grad_norm": 0.19678053259849548, |
|
"learning_rate": 6.388156200599726e-06, |
|
"loss": 1.2266, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06242104008581304, |
|
"grad_norm": 0.20404191315174103, |
|
"learning_rate": 6.185882885385952e-06, |
|
"loss": 1.2376, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.06254817051368639, |
|
"grad_norm": 0.20588378608226776, |
|
"learning_rate": 5.986761439124289e-06, |
|
"loss": 1.2604, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.06267530094155974, |
|
"grad_norm": 0.19841307401657104, |
|
"learning_rate": 5.790798551689592e-06, |
|
"loss": 1.2849, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.06280243136943307, |
|
"grad_norm": 0.20967555046081543, |
|
"learning_rate": 5.598000806838766e-06, |
|
"loss": 1.2321, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.06292956179730642, |
|
"grad_norm": 0.19931279122829437, |
|
"learning_rate": 5.408374681989548e-06, |
|
"loss": 1.278, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.06305669222517978, |
|
"grad_norm": 0.19611623883247375, |
|
"learning_rate": 5.221926548002876e-06, |
|
"loss": 1.1999, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.06318382265305311, |
|
"grad_norm": 0.19863781332969666, |
|
"learning_rate": 5.038662668968886e-06, |
|
"loss": 1.2295, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.06331095308092646, |
|
"grad_norm": 0.19629532098770142, |
|
"learning_rate": 4.858589201996433e-06, |
|
"loss": 1.2194, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.06343808350879981, |
|
"grad_norm": 0.21103760600090027, |
|
"learning_rate": 4.681712197006205e-06, |
|
"loss": 1.2861, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.06356521393667315, |
|
"grad_norm": 0.20111264288425446, |
|
"learning_rate": 4.508037596527526e-06, |
|
"loss": 1.2123, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06356521393667315, |
|
"eval_loss": 1.22141695022583, |
|
"eval_runtime": 1257.3017, |
|
"eval_samples_per_second": 3.977, |
|
"eval_steps_per_second": 0.994, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 552, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 2, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.182000113647616e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|