llama-8b-lora-b512-lr5e-6-inverse / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7250168395527414,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008621851003637344,
"grad_norm": 1.4002951383590698,
"learning_rate": 1.4367816091954023e-07,
"loss": 1.6797,
"step": 10
},
{
"epoch": 0.017243702007274687,
"grad_norm": 1.374281644821167,
"learning_rate": 2.8735632183908047e-07,
"loss": 1.6722,
"step": 20
},
{
"epoch": 0.02586555301091203,
"grad_norm": 1.2334840297698975,
"learning_rate": 4.3103448275862073e-07,
"loss": 1.6699,
"step": 30
},
{
"epoch": 0.034487404014549375,
"grad_norm": 1.3353266716003418,
"learning_rate": 5.747126436781609e-07,
"loss": 1.6697,
"step": 40
},
{
"epoch": 0.043109255018186715,
"grad_norm": 1.3006664514541626,
"learning_rate": 7.183908045977011e-07,
"loss": 1.6547,
"step": 50
},
{
"epoch": 0.05173110602182406,
"grad_norm": 1.2290219068527222,
"learning_rate": 8.620689655172415e-07,
"loss": 1.6121,
"step": 60
},
{
"epoch": 0.0603529570254614,
"grad_norm": 0.8605530261993408,
"learning_rate": 1.0057471264367817e-06,
"loss": 1.5626,
"step": 70
},
{
"epoch": 0.06897480802909875,
"grad_norm": 0.5922505259513855,
"learning_rate": 1.1494252873563219e-06,
"loss": 1.5188,
"step": 80
},
{
"epoch": 0.07759665903273609,
"grad_norm": 0.5861937999725342,
"learning_rate": 1.2931034482758623e-06,
"loss": 1.5159,
"step": 90
},
{
"epoch": 0.08621851003637343,
"grad_norm": 0.5518978238105774,
"learning_rate": 1.4367816091954023e-06,
"loss": 1.4833,
"step": 100
},
{
"epoch": 0.09484036104001077,
"grad_norm": 0.5048322677612305,
"learning_rate": 1.5804597701149427e-06,
"loss": 1.4932,
"step": 110
},
{
"epoch": 0.10346221204364812,
"grad_norm": 0.6119816899299622,
"learning_rate": 1.724137931034483e-06,
"loss": 1.4988,
"step": 120
},
{
"epoch": 0.11208406304728546,
"grad_norm": 0.5448249578475952,
"learning_rate": 1.8678160919540231e-06,
"loss": 1.4513,
"step": 130
},
{
"epoch": 0.1207059140509228,
"grad_norm": 0.5504677295684814,
"learning_rate": 2.0114942528735633e-06,
"loss": 1.4618,
"step": 140
},
{
"epoch": 0.12932776505456015,
"grad_norm": 0.5333659052848816,
"learning_rate": 2.1551724137931035e-06,
"loss": 1.4717,
"step": 150
},
{
"epoch": 0.1379496160581975,
"grad_norm": 0.7873896360397339,
"learning_rate": 2.2988505747126437e-06,
"loss": 1.4464,
"step": 160
},
{
"epoch": 0.14657146706183483,
"grad_norm": 0.9079521298408508,
"learning_rate": 2.4425287356321844e-06,
"loss": 1.4313,
"step": 170
},
{
"epoch": 0.15519331806547218,
"grad_norm": 0.7075573801994324,
"learning_rate": 2.5862068965517246e-06,
"loss": 1.4527,
"step": 180
},
{
"epoch": 0.16381516906910953,
"grad_norm": 0.5483418107032776,
"learning_rate": 2.729885057471265e-06,
"loss": 1.4109,
"step": 190
},
{
"epoch": 0.17243702007274686,
"grad_norm": 0.5097762942314148,
"learning_rate": 2.8735632183908046e-06,
"loss": 1.3786,
"step": 200
},
{
"epoch": 0.18105887107638421,
"grad_norm": 0.6269800066947937,
"learning_rate": 3.017241379310345e-06,
"loss": 1.4585,
"step": 210
},
{
"epoch": 0.18968072208002154,
"grad_norm": 0.5950655937194824,
"learning_rate": 3.1609195402298854e-06,
"loss": 1.4093,
"step": 220
},
{
"epoch": 0.1983025730836589,
"grad_norm": 0.597637414932251,
"learning_rate": 3.3045977011494256e-06,
"loss": 1.407,
"step": 230
},
{
"epoch": 0.20692442408729625,
"grad_norm": 0.5727440118789673,
"learning_rate": 3.448275862068966e-06,
"loss": 1.3743,
"step": 240
},
{
"epoch": 0.21554627509093358,
"grad_norm": 0.5026169419288635,
"learning_rate": 3.5919540229885056e-06,
"loss": 1.3868,
"step": 250
},
{
"epoch": 0.22416812609457093,
"grad_norm": 0.5612446069717407,
"learning_rate": 3.7356321839080462e-06,
"loss": 1.3997,
"step": 260
},
{
"epoch": 0.23278997709820828,
"grad_norm": 0.5654894709587097,
"learning_rate": 3.8793103448275865e-06,
"loss": 1.3925,
"step": 270
},
{
"epoch": 0.2414118281018456,
"grad_norm": 0.5213720798492432,
"learning_rate": 4.022988505747127e-06,
"loss": 1.3982,
"step": 280
},
{
"epoch": 0.25003367910548296,
"grad_norm": 0.6513163447380066,
"learning_rate": 4.166666666666667e-06,
"loss": 1.4087,
"step": 290
},
{
"epoch": 0.2586555301091203,
"grad_norm": 0.5747817158699036,
"learning_rate": 4.310344827586207e-06,
"loss": 1.3629,
"step": 300
},
{
"epoch": 0.2672773811127577,
"grad_norm": 0.5259600877761841,
"learning_rate": 4.454022988505747e-06,
"loss": 1.3932,
"step": 310
},
{
"epoch": 0.275899232116395,
"grad_norm": 0.5602086782455444,
"learning_rate": 4.5977011494252875e-06,
"loss": 1.3537,
"step": 320
},
{
"epoch": 0.2845210831200323,
"grad_norm": 0.6604083180427551,
"learning_rate": 4.741379310344828e-06,
"loss": 1.3651,
"step": 330
},
{
"epoch": 0.29314293412366965,
"grad_norm": 0.5549290180206299,
"learning_rate": 4.885057471264369e-06,
"loss": 1.3452,
"step": 340
},
{
"epoch": 0.30176478512730703,
"grad_norm": 0.548821210861206,
"learning_rate": 4.999994959675734e-06,
"loss": 1.3647,
"step": 350
},
{
"epoch": 0.31038663613094436,
"grad_norm": 0.5312780141830444,
"learning_rate": 4.9998185504603824e-06,
"loss": 1.3404,
"step": 360
},
{
"epoch": 0.3190084871345817,
"grad_norm": 0.5477195382118225,
"learning_rate": 4.999390145355199e-06,
"loss": 1.3841,
"step": 370
},
{
"epoch": 0.32763033813821907,
"grad_norm": 0.5766547322273254,
"learning_rate": 4.998709787545849e-06,
"loss": 1.3594,
"step": 380
},
{
"epoch": 0.3362521891418564,
"grad_norm": 0.5785459280014038,
"learning_rate": 4.997777545616258e-06,
"loss": 1.3402,
"step": 390
},
{
"epoch": 0.3448740401454937,
"grad_norm": 0.5505363941192627,
"learning_rate": 4.996593513541701e-06,
"loss": 1.3355,
"step": 400
},
{
"epoch": 0.3534958911491311,
"grad_norm": 0.5421465039253235,
"learning_rate": 4.995157810679327e-06,
"loss": 1.359,
"step": 410
},
{
"epoch": 0.36211774215276843,
"grad_norm": 0.5955513119697571,
"learning_rate": 4.993470581756129e-06,
"loss": 1.3743,
"step": 420
},
{
"epoch": 0.37073959315640576,
"grad_norm": 0.6144652366638184,
"learning_rate": 4.991531996854352e-06,
"loss": 1.3447,
"step": 430
},
{
"epoch": 0.3793614441600431,
"grad_norm": 0.5953258872032166,
"learning_rate": 4.989342251394352e-06,
"loss": 1.3208,
"step": 440
},
{
"epoch": 0.38798329516368046,
"grad_norm": 0.5650104880332947,
"learning_rate": 4.986901566114891e-06,
"loss": 1.3562,
"step": 450
},
{
"epoch": 0.3966051461673178,
"grad_norm": 0.8173232078552246,
"learning_rate": 4.984210187050891e-06,
"loss": 1.3151,
"step": 460
},
{
"epoch": 0.4052269971709551,
"grad_norm": 0.5520560145378113,
"learning_rate": 4.981268385508627e-06,
"loss": 1.3591,
"step": 470
},
{
"epoch": 0.4138488481745925,
"grad_norm": 0.5939339399337769,
"learning_rate": 4.978076458038382e-06,
"loss": 1.3306,
"step": 480
},
{
"epoch": 0.4224706991782298,
"grad_norm": 0.5531189441680908,
"learning_rate": 4.974634726404551e-06,
"loss": 1.3338,
"step": 490
},
{
"epoch": 0.43109255018186715,
"grad_norm": 0.7108302116394043,
"learning_rate": 4.9709435375532065e-06,
"loss": 1.3248,
"step": 500
},
{
"epoch": 0.43109255018186715,
"eval_loss": 1.210019826889038,
"eval_runtime": 4375.303,
"eval_samples_per_second": 15.08,
"eval_steps_per_second": 7.54,
"step": 500
},
{
"epoch": 0.43971440118550453,
"grad_norm": 0.47356271743774414,
"learning_rate": 4.9670032635771205e-06,
"loss": 1.3342,
"step": 510
},
{
"epoch": 0.44833625218914186,
"grad_norm": 0.4977116286754608,
"learning_rate": 4.962814301678262e-06,
"loss": 1.3412,
"step": 520
},
{
"epoch": 0.4569581031927792,
"grad_norm": 0.534755289554596,
"learning_rate": 4.958377074127751e-06,
"loss": 1.32,
"step": 530
},
{
"epoch": 0.46557995419641657,
"grad_norm": 0.5627906918525696,
"learning_rate": 4.953692028223295e-06,
"loss": 1.3275,
"step": 540
},
{
"epoch": 0.4742018052000539,
"grad_norm": 0.5472209453582764,
"learning_rate": 4.948759636244096e-06,
"loss": 1.3352,
"step": 550
},
{
"epoch": 0.4828236562036912,
"grad_norm": 0.5113406777381897,
"learning_rate": 4.943580395403244e-06,
"loss": 1.31,
"step": 560
},
{
"epoch": 0.49144550720732855,
"grad_norm": 0.6487182974815369,
"learning_rate": 4.938154827797595e-06,
"loss": 1.2995,
"step": 570
},
{
"epoch": 0.5000673582109659,
"grad_norm": 0.6053293347358704,
"learning_rate": 4.932483480355139e-06,
"loss": 1.3377,
"step": 580
},
{
"epoch": 0.5086892092146033,
"grad_norm": 0.5979019999504089,
"learning_rate": 4.926566924779869e-06,
"loss": 1.3169,
"step": 590
},
{
"epoch": 0.5173110602182406,
"grad_norm": 0.6338688135147095,
"learning_rate": 4.920405757494147e-06,
"loss": 1.2965,
"step": 600
},
{
"epoch": 0.5259329112218779,
"grad_norm": 0.5050321221351624,
"learning_rate": 4.914000599578585e-06,
"loss": 1.3246,
"step": 610
},
{
"epoch": 0.5345547622255153,
"grad_norm": 0.5875179767608643,
"learning_rate": 4.907352096709432e-06,
"loss": 1.337,
"step": 620
},
{
"epoch": 0.5431766132291527,
"grad_norm": 0.6425178647041321,
"learning_rate": 4.900460919093492e-06,
"loss": 1.2946,
"step": 630
},
{
"epoch": 0.55179846423279,
"grad_norm": 0.541878342628479,
"learning_rate": 4.893327761400557e-06,
"loss": 1.2993,
"step": 640
},
{
"epoch": 0.5604203152364273,
"grad_norm": 0.586501955986023,
"learning_rate": 4.885953342693384e-06,
"loss": 1.3011,
"step": 650
},
{
"epoch": 0.5690421662400647,
"grad_norm": 0.5775993466377258,
"learning_rate": 4.878338406355211e-06,
"loss": 1.3213,
"step": 660
},
{
"epoch": 0.577664017243702,
"grad_norm": 0.5908535718917847,
"learning_rate": 4.870483720014814e-06,
"loss": 1.2963,
"step": 670
},
{
"epoch": 0.5862858682473393,
"grad_norm": 0.5903546810150146,
"learning_rate": 4.862390075469132e-06,
"loss": 1.2818,
"step": 680
},
{
"epoch": 0.5949077192509767,
"grad_norm": 0.6688754558563232,
"learning_rate": 4.854058288603445e-06,
"loss": 1.3254,
"step": 690
},
{
"epoch": 0.6035295702546141,
"grad_norm": 0.5674655437469482,
"learning_rate": 4.8454891993091305e-06,
"loss": 1.2957,
"step": 700
},
{
"epoch": 0.6121514212582514,
"grad_norm": 0.6107905507087708,
"learning_rate": 4.836683671398995e-06,
"loss": 1.2824,
"step": 710
},
{
"epoch": 0.6207732722618887,
"grad_norm": 0.5999839305877686,
"learning_rate": 4.827642592520203e-06,
"loss": 1.2977,
"step": 720
},
{
"epoch": 0.629395123265526,
"grad_norm": 0.5449870824813843,
"learning_rate": 4.818366874064789e-06,
"loss": 1.2949,
"step": 730
},
{
"epoch": 0.6380169742691634,
"grad_norm": 0.5735543966293335,
"learning_rate": 4.808857451077788e-06,
"loss": 1.3084,
"step": 740
},
{
"epoch": 0.6466388252728007,
"grad_norm": 0.5688530802726746,
"learning_rate": 4.799115282162979e-06,
"loss": 1.2974,
"step": 750
},
{
"epoch": 0.6552606762764381,
"grad_norm": 0.5878692269325256,
"learning_rate": 4.789141349386249e-06,
"loss": 1.3138,
"step": 760
},
{
"epoch": 0.6638825272800755,
"grad_norm": 0.642494261264801,
"learning_rate": 4.7789366581765995e-06,
"loss": 1.285,
"step": 770
},
{
"epoch": 0.6725043782837128,
"grad_norm": 0.6337887644767761,
"learning_rate": 4.768502237224788e-06,
"loss": 1.295,
"step": 780
},
{
"epoch": 0.6811262292873501,
"grad_norm": 0.6511521935462952,
"learning_rate": 4.757839138379635e-06,
"loss": 1.3059,
"step": 790
},
{
"epoch": 0.6897480802909874,
"grad_norm": 0.6140688061714172,
"learning_rate": 4.74694843654199e-06,
"loss": 1.2781,
"step": 800
},
{
"epoch": 0.6983699312946248,
"grad_norm": 0.5881298780441284,
"learning_rate": 4.735831229556374e-06,
"loss": 1.2944,
"step": 810
},
{
"epoch": 0.7069917822982622,
"grad_norm": 0.6124337315559387,
"learning_rate": 4.7244886381003115e-06,
"loss": 1.287,
"step": 820
},
{
"epoch": 0.7156136333018995,
"grad_norm": 0.5487476587295532,
"learning_rate": 4.712921805571362e-06,
"loss": 1.2885,
"step": 830
},
{
"epoch": 0.7242354843055369,
"grad_norm": 0.6456742286682129,
"learning_rate": 4.7011318979718565e-06,
"loss": 1.2899,
"step": 840
},
{
"epoch": 0.7328573353091742,
"grad_norm": 0.5877824425697327,
"learning_rate": 4.689120103791356e-06,
"loss": 1.3066,
"step": 850
},
{
"epoch": 0.7414791863128115,
"grad_norm": 0.628680408000946,
"learning_rate": 4.676887633886851e-06,
"loss": 1.3101,
"step": 860
},
{
"epoch": 0.7501010373164488,
"grad_norm": 0.6239911317825317,
"learning_rate": 4.664435721360695e-06,
"loss": 1.2782,
"step": 870
},
{
"epoch": 0.7587228883200862,
"grad_norm": 0.5513969659805298,
"learning_rate": 4.651765621436303e-06,
"loss": 1.2836,
"step": 880
},
{
"epoch": 0.7673447393237236,
"grad_norm": 0.5616466403007507,
"learning_rate": 4.638878611331615e-06,
"loss": 1.2967,
"step": 890
},
{
"epoch": 0.7759665903273609,
"grad_norm": 1.2961684465408325,
"learning_rate": 4.6257759901303535e-06,
"loss": 1.3094,
"step": 900
},
{
"epoch": 0.7845884413309983,
"grad_norm": 0.6225080490112305,
"learning_rate": 4.612459078651055e-06,
"loss": 1.3083,
"step": 910
},
{
"epoch": 0.7932102923346356,
"grad_norm": 0.6216508150100708,
"learning_rate": 4.598929219313938e-06,
"loss": 1.3286,
"step": 920
},
{
"epoch": 0.8018321433382729,
"grad_norm": 0.5944140553474426,
"learning_rate": 4.585187776005569e-06,
"loss": 1.263,
"step": 930
},
{
"epoch": 0.8104539943419102,
"grad_norm": 0.5992977023124695,
"learning_rate": 4.571236133941381e-06,
"loss": 1.2745,
"step": 940
},
{
"epoch": 0.8190758453455477,
"grad_norm": 0.5519088506698608,
"learning_rate": 4.557075699526032e-06,
"loss": 1.2772,
"step": 950
},
{
"epoch": 0.827697696349185,
"grad_norm": 0.5918429493904114,
"learning_rate": 4.542707900211636e-06,
"loss": 1.2915,
"step": 960
},
{
"epoch": 0.8363195473528223,
"grad_norm": 0.6135639548301697,
"learning_rate": 4.528134184353863e-06,
"loss": 1.2918,
"step": 970
},
{
"epoch": 0.8449413983564596,
"grad_norm": 0.6600371599197388,
"learning_rate": 4.5133560210659384e-06,
"loss": 1.2844,
"step": 980
},
{
"epoch": 0.853563249360097,
"grad_norm": 0.6321092844009399,
"learning_rate": 4.498374900070551e-06,
"loss": 1.282,
"step": 990
},
{
"epoch": 0.8621851003637343,
"grad_norm": 0.5802695155143738,
"learning_rate": 4.483192331549675e-06,
"loss": 1.2723,
"step": 1000
},
{
"epoch": 0.8621851003637343,
"eval_loss": 1.1568914651870728,
"eval_runtime": 4375.5203,
"eval_samples_per_second": 15.08,
"eval_steps_per_second": 7.54,
"step": 1000
},
{
"epoch": 0.8708069513673716,
"grad_norm": 0.5625444650650024,
"learning_rate": 4.467809845992338e-06,
"loss": 1.2788,
"step": 1010
},
{
"epoch": 0.8794288023710091,
"grad_norm": 0.575935959815979,
"learning_rate": 4.452228994040341e-06,
"loss": 1.302,
"step": 1020
},
{
"epoch": 0.8880506533746464,
"grad_norm": 0.5979976058006287,
"learning_rate": 4.4364513463319405e-06,
"loss": 1.271,
"step": 1030
},
{
"epoch": 0.8966725043782837,
"grad_norm": 0.6508215069770813,
"learning_rate": 4.420478493343523e-06,
"loss": 1.2838,
"step": 1040
},
{
"epoch": 0.905294355381921,
"grad_norm": 0.6415181756019592,
"learning_rate": 4.404312045229273e-06,
"loss": 1.2855,
"step": 1050
},
{
"epoch": 0.9139162063855584,
"grad_norm": 0.59377521276474,
"learning_rate": 4.387953631658863e-06,
"loss": 1.2745,
"step": 1060
},
{
"epoch": 0.9225380573891957,
"grad_norm": 0.6269784569740295,
"learning_rate": 4.371404901653174e-06,
"loss": 1.2667,
"step": 1070
},
{
"epoch": 0.9311599083928331,
"grad_norm": 0.6030882000923157,
"learning_rate": 4.35466752341806e-06,
"loss": 1.2433,
"step": 1080
},
{
"epoch": 0.9397817593964705,
"grad_norm": 0.6197340488433838,
"learning_rate": 4.337743184176188e-06,
"loss": 1.2791,
"step": 1090
},
{
"epoch": 0.9484036104001078,
"grad_norm": 0.607699453830719,
"learning_rate": 4.320633589996956e-06,
"loss": 1.278,
"step": 1100
},
{
"epoch": 0.9570254614037451,
"grad_norm": 0.6275235414505005,
"learning_rate": 4.303340465624507e-06,
"loss": 1.2587,
"step": 1110
},
{
"epoch": 0.9656473124073824,
"grad_norm": 0.6535059213638306,
"learning_rate": 4.285865554303874e-06,
"loss": 1.2895,
"step": 1120
},
{
"epoch": 0.9742691634110198,
"grad_norm": 0.6479883790016174,
"learning_rate": 4.2682106176052405e-06,
"loss": 1.2651,
"step": 1130
},
{
"epoch": 0.9828910144146571,
"grad_norm": 0.7725274562835693,
"learning_rate": 4.2503774352463735e-06,
"loss": 1.2384,
"step": 1140
},
{
"epoch": 0.9915128654182945,
"grad_norm": 0.6182934641838074,
"learning_rate": 4.23236780491321e-06,
"loss": 1.2723,
"step": 1150
},
{
"epoch": 1.0007813552472047,
"grad_norm": 1.8191434144973755,
"learning_rate": 4.214183542078646e-06,
"loss": 1.3882,
"step": 1160
},
{
"epoch": 1.009403206250842,
"grad_norm": 0.7100806832313538,
"learning_rate": 4.195826479819523e-06,
"loss": 1.2857,
"step": 1170
},
{
"epoch": 1.0180250572544793,
"grad_norm": 0.5903263688087463,
"learning_rate": 4.177298468631844e-06,
"loss": 1.2888,
"step": 1180
},
{
"epoch": 1.0266469082581167,
"grad_norm": 0.6088208556175232,
"learning_rate": 4.158601376244237e-06,
"loss": 1.2355,
"step": 1190
},
{
"epoch": 1.035268759261754,
"grad_norm": 0.6548230648040771,
"learning_rate": 4.139737087429672e-06,
"loss": 1.2435,
"step": 1200
},
{
"epoch": 1.0438906102653913,
"grad_norm": 0.6475362777709961,
"learning_rate": 4.120707503815464e-06,
"loss": 1.2462,
"step": 1210
},
{
"epoch": 1.0525124612690286,
"grad_norm": 0.7016700506210327,
"learning_rate": 4.101514543691588e-06,
"loss": 1.2479,
"step": 1220
},
{
"epoch": 1.061134312272666,
"grad_norm": 0.6940033435821533,
"learning_rate": 4.0821601418172926e-06,
"loss": 1.2659,
"step": 1230
},
{
"epoch": 1.0697561632763033,
"grad_norm": 0.6648741960525513,
"learning_rate": 4.0626462492260725e-06,
"loss": 1.2441,
"step": 1240
},
{
"epoch": 1.0783780142799406,
"grad_norm": 0.665122389793396,
"learning_rate": 4.042974833028992e-06,
"loss": 1.2792,
"step": 1250
},
{
"epoch": 1.0869998652835782,
"grad_norm": 0.6138463020324707,
"learning_rate": 4.0231478762163865e-06,
"loss": 1.2462,
"step": 1260
},
{
"epoch": 1.0956217162872155,
"grad_norm": 0.61916184425354,
"learning_rate": 4.003167377457972e-06,
"loss": 1.2858,
"step": 1270
},
{
"epoch": 1.1042435672908528,
"grad_norm": 0.6411153674125671,
"learning_rate": 3.983035350901356e-06,
"loss": 1.2519,
"step": 1280
},
{
"epoch": 1.1128654182944901,
"grad_norm": 0.6579316854476929,
"learning_rate": 3.962753825969016e-06,
"loss": 1.2661,
"step": 1290
},
{
"epoch": 1.1214872692981275,
"grad_norm": 0.6916026473045349,
"learning_rate": 3.942324847153706e-06,
"loss": 1.2812,
"step": 1300
},
{
"epoch": 1.1301091203017648,
"grad_norm": 0.6541363596916199,
"learning_rate": 3.921750473812377e-06,
"loss": 1.2454,
"step": 1310
},
{
"epoch": 1.1387309713054021,
"grad_norm": 0.6301002502441406,
"learning_rate": 3.901032779958563e-06,
"loss": 1.2452,
"step": 1320
},
{
"epoch": 1.1473528223090395,
"grad_norm": 0.6470747590065002,
"learning_rate": 3.880173854053325e-06,
"loss": 1.242,
"step": 1330
},
{
"epoch": 1.1559746733126768,
"grad_norm": 0.62432861328125,
"learning_rate": 3.859175798794715e-06,
"loss": 1.2578,
"step": 1340
},
{
"epoch": 1.164596524316314,
"grad_norm": 0.735650897026062,
"learning_rate": 3.838040730905811e-06,
"loss": 1.2323,
"step": 1350
},
{
"epoch": 1.1732183753199514,
"grad_norm": 0.6072832345962524,
"learning_rate": 3.816770780921343e-06,
"loss": 1.2417,
"step": 1360
},
{
"epoch": 1.1818402263235888,
"grad_norm": 0.6269782185554504,
"learning_rate": 3.7953680929729215e-06,
"loss": 1.2579,
"step": 1370
},
{
"epoch": 1.190462077327226,
"grad_norm": 0.6426697373390198,
"learning_rate": 3.7738348245728953e-06,
"loss": 1.2711,
"step": 1380
},
{
"epoch": 1.1990839283308636,
"grad_norm": 0.6683219075202942,
"learning_rate": 3.7521731463968638e-06,
"loss": 1.2375,
"step": 1390
},
{
"epoch": 1.207705779334501,
"grad_norm": 0.7327633500099182,
"learning_rate": 3.730385242064861e-06,
"loss": 1.2509,
"step": 1400
},
{
"epoch": 1.2163276303381383,
"grad_norm": 0.6698377728462219,
"learning_rate": 3.708473307921234e-06,
"loss": 1.2748,
"step": 1410
},
{
"epoch": 1.2249494813417756,
"grad_norm": 0.6427878737449646,
"learning_rate": 3.686439552813236e-06,
"loss": 1.2753,
"step": 1420
},
{
"epoch": 1.233571332345413,
"grad_norm": 0.7282299399375916,
"learning_rate": 3.6642861978683676e-06,
"loss": 1.2218,
"step": 1430
},
{
"epoch": 1.2421931833490503,
"grad_norm": 0.6039260029792786,
"learning_rate": 3.6420154762704685e-06,
"loss": 1.243,
"step": 1440
},
{
"epoch": 1.2508150343526876,
"grad_norm": 0.6218879222869873,
"learning_rate": 3.619629633034604e-06,
"loss": 1.2225,
"step": 1450
},
{
"epoch": 1.259436885356325,
"grad_norm": 0.660929799079895,
"learning_rate": 3.597130924780754e-06,
"loss": 1.2641,
"step": 1460
},
{
"epoch": 1.2680587363599622,
"grad_norm": 0.6086330413818359,
"learning_rate": 3.574521619506332e-06,
"loss": 1.2288,
"step": 1470
},
{
"epoch": 1.2766805873635996,
"grad_norm": 0.6594045162200928,
"learning_rate": 3.5518039963575577e-06,
"loss": 1.2558,
"step": 1480
},
{
"epoch": 1.285302438367237,
"grad_norm": 0.6506398320198059,
"learning_rate": 3.5289803453997087e-06,
"loss": 1.2361,
"step": 1490
},
{
"epoch": 1.2939242893708744,
"grad_norm": 0.6286528706550598,
"learning_rate": 3.506052967386265e-06,
"loss": 1.2344,
"step": 1500
},
{
"epoch": 1.2939242893708744,
"eval_loss": 1.1364344358444214,
"eval_runtime": 4371.3293,
"eval_samples_per_second": 15.094,
"eval_steps_per_second": 7.547,
"step": 1500
},
{
"epoch": 1.3025461403745116,
"grad_norm": 0.6357390880584717,
"learning_rate": 3.4830241735269852e-06,
"loss": 1.2597,
"step": 1510
},
{
"epoch": 1.311167991378149,
"grad_norm": 0.5889900326728821,
"learning_rate": 3.459896285254917e-06,
"loss": 1.2535,
"step": 1520
},
{
"epoch": 1.3197898423817862,
"grad_norm": 0.7132574319839478,
"learning_rate": 3.436671633992389e-06,
"loss": 1.2496,
"step": 1530
},
{
"epoch": 1.3284116933854238,
"grad_norm": 0.604434072971344,
"learning_rate": 3.4133525609159883e-06,
"loss": 1.2578,
"step": 1540
},
{
"epoch": 1.337033544389061,
"grad_norm": 0.6603388786315918,
"learning_rate": 3.3899414167205547e-06,
"loss": 1.2462,
"step": 1550
},
{
"epoch": 1.3456553953926984,
"grad_norm": 0.5738435983657837,
"learning_rate": 3.3664405613822216e-06,
"loss": 1.2309,
"step": 1560
},
{
"epoch": 1.3542772463963357,
"grad_norm": 0.6693400740623474,
"learning_rate": 3.3428523639205125e-06,
"loss": 1.2656,
"step": 1570
},
{
"epoch": 1.362899097399973,
"grad_norm": 0.6772233843803406,
"learning_rate": 3.319179202159532e-06,
"loss": 1.2326,
"step": 1580
},
{
"epoch": 1.3715209484036104,
"grad_norm": 0.6765257716178894,
"learning_rate": 3.295423462488271e-06,
"loss": 1.2666,
"step": 1590
},
{
"epoch": 1.3801427994072477,
"grad_norm": 0.61844402551651,
"learning_rate": 3.271587539620039e-06,
"loss": 1.2188,
"step": 1600
},
{
"epoch": 1.388764650410885,
"grad_norm": 0.6714752912521362,
"learning_rate": 3.247673836351068e-06,
"loss": 1.2276,
"step": 1610
},
{
"epoch": 1.3973865014145224,
"grad_norm": 0.5900276899337769,
"learning_rate": 3.2236847633182955e-06,
"loss": 1.2452,
"step": 1620
},
{
"epoch": 1.40600835241816,
"grad_norm": 0.6843028664588928,
"learning_rate": 3.199622738756357e-06,
"loss": 1.2317,
"step": 1630
},
{
"epoch": 1.414630203421797,
"grad_norm": 0.7222546935081482,
"learning_rate": 3.17549018825382e-06,
"loss": 1.2445,
"step": 1640
},
{
"epoch": 1.4232520544254346,
"grad_norm": 0.6822832226753235,
"learning_rate": 3.151289544508664e-06,
"loss": 1.2442,
"step": 1650
},
{
"epoch": 1.4318739054290717,
"grad_norm": 0.7010654211044312,
"learning_rate": 3.1270232470830525e-06,
"loss": 1.2517,
"step": 1660
},
{
"epoch": 1.4404957564327092,
"grad_norm": 0.6761536598205566,
"learning_rate": 3.102693742157415e-06,
"loss": 1.2424,
"step": 1670
},
{
"epoch": 1.4491176074363465,
"grad_norm": 0.730097234249115,
"learning_rate": 3.078303482283854e-06,
"loss": 1.2167,
"step": 1680
},
{
"epoch": 1.4577394584399839,
"grad_norm": 0.7009713053703308,
"learning_rate": 3.0538549261389154e-06,
"loss": 1.2492,
"step": 1690
},
{
"epoch": 1.4663613094436212,
"grad_norm": 0.5926857590675354,
"learning_rate": 3.029350538275742e-06,
"loss": 1.1965,
"step": 1700
},
{
"epoch": 1.4749831604472585,
"grad_norm": 0.6391776204109192,
"learning_rate": 3.0047927888756268e-06,
"loss": 1.2326,
"step": 1710
},
{
"epoch": 1.4836050114508958,
"grad_norm": 0.7003401517868042,
"learning_rate": 2.9801841534990115e-06,
"loss": 1.2248,
"step": 1720
},
{
"epoch": 1.4922268624545332,
"grad_norm": 0.682777464389801,
"learning_rate": 2.9555271128359326e-06,
"loss": 1.2305,
"step": 1730
},
{
"epoch": 1.5008487134581705,
"grad_norm": 0.5897073745727539,
"learning_rate": 2.9308241524559522e-06,
"loss": 1.2269,
"step": 1740
},
{
"epoch": 1.5094705644618078,
"grad_norm": 0.7111027240753174,
"learning_rate": 2.9060777625576014e-06,
"loss": 1.2338,
"step": 1750
},
{
"epoch": 1.5180924154654454,
"grad_norm": 0.6545217037200928,
"learning_rate": 2.8812904377173532e-06,
"loss": 1.2222,
"step": 1760
},
{
"epoch": 1.5267142664690825,
"grad_norm": 0.6440667510032654,
"learning_rate": 2.856464676638156e-06,
"loss": 1.2033,
"step": 1770
},
{
"epoch": 1.53533611747272,
"grad_norm": 0.7168214321136475,
"learning_rate": 2.831602981897546e-06,
"loss": 1.2479,
"step": 1780
},
{
"epoch": 1.5439579684763571,
"grad_norm": 0.6428610682487488,
"learning_rate": 2.8067078596953793e-06,
"loss": 1.2302,
"step": 1790
},
{
"epoch": 1.5525798194799947,
"grad_norm": 0.6651865839958191,
"learning_rate": 2.7817818196011897e-06,
"loss": 1.263,
"step": 1800
},
{
"epoch": 1.561201670483632,
"grad_norm": 0.6888891458511353,
"learning_rate": 2.756827374301207e-06,
"loss": 1.2001,
"step": 1810
},
{
"epoch": 1.5698235214872693,
"grad_norm": 0.6644035577774048,
"learning_rate": 2.73184703934507e-06,
"loss": 1.216,
"step": 1820
},
{
"epoch": 1.5784453724909067,
"grad_norm": 0.6795063614845276,
"learning_rate": 2.7068433328922405e-06,
"loss": 1.245,
"step": 1830
},
{
"epoch": 1.587067223494544,
"grad_norm": 0.7901127338409424,
"learning_rate": 2.68181877545816e-06,
"loss": 1.2168,
"step": 1840
},
{
"epoch": 1.5956890744981813,
"grad_norm": 0.6792474389076233,
"learning_rate": 2.6567758896601654e-06,
"loss": 1.2406,
"step": 1850
},
{
"epoch": 1.6043109255018186,
"grad_norm": 0.638313353061676,
"learning_rate": 2.6317171999631992e-06,
"loss": 1.253,
"step": 1860
},
{
"epoch": 1.612932776505456,
"grad_norm": 0.7407149076461792,
"learning_rate": 2.6066452324253257e-06,
"loss": 1.2279,
"step": 1870
},
{
"epoch": 1.6215546275090933,
"grad_norm": 0.6624804139137268,
"learning_rate": 2.58156251444309e-06,
"loss": 1.2433,
"step": 1880
},
{
"epoch": 1.6301764785127308,
"grad_norm": 0.6785764694213867,
"learning_rate": 2.5564715744967446e-06,
"loss": 1.2267,
"step": 1890
},
{
"epoch": 1.638798329516368,
"grad_norm": 0.7038357853889465,
"learning_rate": 2.531374941895361e-06,
"loss": 1.2371,
"step": 1900
},
{
"epoch": 1.6474201805200055,
"grad_norm": 0.7683678269386292,
"learning_rate": 2.506275146521863e-06,
"loss": 1.2039,
"step": 1910
},
{
"epoch": 1.6560420315236426,
"grad_norm": 0.6339368224143982,
"learning_rate": 2.4811747185780005e-06,
"loss": 1.201,
"step": 1920
},
{
"epoch": 1.6646638825272801,
"grad_norm": 0.8253235220909119,
"learning_rate": 2.45607618832929e-06,
"loss": 1.2585,
"step": 1930
},
{
"epoch": 1.6732857335309175,
"grad_norm": 0.7511754631996155,
"learning_rate": 2.4309820858499487e-06,
"loss": 1.2043,
"step": 1940
},
{
"epoch": 1.6819075845345548,
"grad_norm": 0.709459662437439,
"learning_rate": 2.405894940767851e-06,
"loss": 1.2493,
"step": 1950
},
{
"epoch": 1.6905294355381921,
"grad_norm": 0.6520094871520996,
"learning_rate": 2.380817282009523e-06,
"loss": 1.2514,
"step": 1960
},
{
"epoch": 1.6991512865418295,
"grad_norm": 0.6714244484901428,
"learning_rate": 2.35575163754522e-06,
"loss": 1.2204,
"step": 1970
},
{
"epoch": 1.7077731375454668,
"grad_norm": 0.6813965439796448,
"learning_rate": 2.330700534134086e-06,
"loss": 1.2042,
"step": 1980
},
{
"epoch": 1.716394988549104,
"grad_norm": 0.6882847547531128,
"learning_rate": 2.3056664970694433e-06,
"loss": 1.2139,
"step": 1990
},
{
"epoch": 1.7250168395527414,
"grad_norm": 0.7284813523292542,
"learning_rate": 2.280652049924232e-06,
"loss": 1.2124,
"step": 2000
},
{
"epoch": 1.7250168395527414,
"eval_loss": 1.1232779026031494,
"eval_runtime": 4379.8083,
"eval_samples_per_second": 15.065,
"eval_steps_per_second": 7.533,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 3477,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.949667944935509e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
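
A minimal sketch for inspecting this log, not part of the checkpoint itself: it assumes the file above is a standard Hugging Face Trainer state dump saved as trainer_state.json alongside the checkpoint, and that Python's standard library is available (the filename/path is an assumption). It splits log_history into training-loss entries (written every logging_steps = 10) and evaluation entries (written every eval_steps = 500) and prints the eval curve.

```python
# Sketch: read a Trainer state dump and list its eval-loss checkpoints.
# Assumes the JSON above is saved as "trainer_state.json" in the working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged train points: {len(train_log)}, eval points: {len(eval_log)}")
for e in eval_log:
    print(f"step {e['step']:>5}  epoch {e['epoch']:.3f}  eval_loss {e['eval_loss']:.4f}")
```

On this file the loop would print the three recorded eval points (steps 500, 1000, 2000), showing eval_loss falling from roughly 1.210 to 1.123 over the first ~1.7 epochs.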