KareemElzeky's picture
Upload folder using huggingface_hub
904717f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.995306228883699,
"eval_steps": 500,
"global_step": 1032,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029024285163601733,
"grad_norm": 88.48429870605469,
"learning_rate": 9.677419354838708e-05,
"loss": 0.7958,
"step": 10
},
{
"epoch": 0.058048570327203466,
"grad_norm": 78.84452056884766,
"learning_rate": 0.00019354838709677416,
"loss": 0.7504,
"step": 20
},
{
"epoch": 0.08707285549080519,
"grad_norm": 213.83749389648438,
"learning_rate": 0.00029032258064516127,
"loss": 0.7391,
"step": 30
},
{
"epoch": 0.11609714065440693,
"grad_norm": 64.08855438232422,
"learning_rate": 0.00029994016586766087,
"loss": 0.7567,
"step": 40
},
{
"epoch": 0.14512142581800866,
"grad_norm": 71.85431671142578,
"learning_rate": 0.00029973339311370587,
"loss": 0.7117,
"step": 50
},
{
"epoch": 0.17414571098161039,
"grad_norm": 66.32382202148438,
"learning_rate": 0.00029937914664890375,
"loss": 0.6959,
"step": 60
},
{
"epoch": 0.20316999614521214,
"grad_norm": 52.99678039550781,
"learning_rate": 0.00029887777537365414,
"loss": 0.6835,
"step": 70
},
{
"epoch": 0.23219428130881387,
"grad_norm": 53.15193557739258,
"learning_rate": 0.0002982297730928522,
"loss": 0.6855,
"step": 80
},
{
"epoch": 0.2612185664724156,
"grad_norm": 62.969337463378906,
"learning_rate": 0.00029743577802953563,
"loss": 0.6758,
"step": 90
},
{
"epoch": 0.2902428516360173,
"grad_norm": 47.597293853759766,
"learning_rate": 0.00029649657219629316,
"loss": 0.665,
"step": 100
},
{
"epoch": 0.31926713679961904,
"grad_norm": 50.93095397949219,
"learning_rate": 0.00029541308062505385,
"loss": 0.6689,
"step": 110
},
{
"epoch": 0.34829142196322077,
"grad_norm": 44.195335388183594,
"learning_rate": 0.00029418637045601514,
"loss": 0.6553,
"step": 120
},
{
"epoch": 0.3773157071268225,
"grad_norm": 46.52369689941406,
"learning_rate": 0.00029281764988660705,
"loss": 0.6584,
"step": 130
},
{
"epoch": 0.4063399922904243,
"grad_norm": 51.798343658447266,
"learning_rate": 0.0002913082669815285,
"loss": 0.6514,
"step": 140
},
{
"epoch": 0.435364277454026,
"grad_norm": 53.8443489074707,
"learning_rate": 0.0002896597083450262,
"loss": 0.6276,
"step": 150
},
{
"epoch": 0.46438856261762773,
"grad_norm": 44.94770812988281,
"learning_rate": 0.0002878735976567259,
"loss": 0.6428,
"step": 160
},
{
"epoch": 0.49341284778122946,
"grad_norm": 38.52789306640625,
"learning_rate": 0.0002859516940724558,
"loss": 0.6415,
"step": 170
},
{
"epoch": 0.5224371329448312,
"grad_norm": 52.5710563659668,
"learning_rate": 0.0002838958904916392,
"loss": 0.6302,
"step": 180
},
{
"epoch": 0.551461418108433,
"grad_norm": 46.27107238769531,
"learning_rate": 0.00028170821169296126,
"loss": 0.6246,
"step": 190
},
{
"epoch": 0.5804857032720346,
"grad_norm": 42.310123443603516,
"learning_rate": 0.00027939081234014705,
"loss": 0.627,
"step": 200
},
{
"epoch": 0.6095099884356364,
"grad_norm": 48.09523391723633,
"learning_rate": 0.0002769459748598149,
"loss": 0.623,
"step": 210
},
{
"epoch": 0.6385342735992381,
"grad_norm": 62.250152587890625,
"learning_rate": 0.0002743761071934942,
"loss": 0.6312,
"step": 220
},
{
"epoch": 0.6675585587628399,
"grad_norm": 42.713130950927734,
"learning_rate": 0.00027168374042602366,
"loss": 0.6101,
"step": 230
},
{
"epoch": 0.6965828439264415,
"grad_norm": 49.83562469482422,
"learning_rate": 0.00026887152629266354,
"loss": 0.6,
"step": 240
},
{
"epoch": 0.7256071290900433,
"grad_norm": 39.01671600341797,
"learning_rate": 0.0002659422345673789,
"loss": 0.6038,
"step": 250
},
{
"epoch": 0.754631414253645,
"grad_norm": 35.13432693481445,
"learning_rate": 0.0002628987503348651,
"loss": 0.5956,
"step": 260
},
{
"epoch": 0.7836556994172468,
"grad_norm": 41.503684997558594,
"learning_rate": 0.00025974407114900353,
"loss": 0.6134,
"step": 270
},
{
"epoch": 0.8126799845808486,
"grad_norm": 39.328548431396484,
"learning_rate": 0.0002564813040805443,
"loss": 0.59,
"step": 280
},
{
"epoch": 0.8417042697444502,
"grad_norm": 34.63987731933594,
"learning_rate": 0.0002531136626569259,
"loss": 0.5834,
"step": 290
},
{
"epoch": 0.870728554908052,
"grad_norm": 37.82402801513672,
"learning_rate": 0.0002496444636972439,
"loss": 0.6023,
"step": 300
},
{
"epoch": 0.8997528400716537,
"grad_norm": 38.01532745361328,
"learning_rate": 0.0002460771240454877,
"loss": 0.5866,
"step": 310
},
{
"epoch": 0.9287771252352555,
"grad_norm": 37.758487701416016,
"learning_rate": 0.00024241515720526083,
"loss": 0.6001,
"step": 320
},
{
"epoch": 0.9578014103988571,
"grad_norm": 34.032989501953125,
"learning_rate": 0.0002386621698793015,
"loss": 0.5833,
"step": 330
},
{
"epoch": 0.9868256955624589,
"grad_norm": 41.784881591796875,
"learning_rate": 0.0002348218584172095,
"loss": 0.5876,
"step": 340
},
{
"epoch": 1.0158499807260606,
"grad_norm": 35.09678268432617,
"learning_rate": 0.00023089800517487986,
"loss": 0.5319,
"step": 350
},
{
"epoch": 1.0448742658896624,
"grad_norm": 32.305877685546875,
"learning_rate": 0.00022689447478922784,
"loss": 0.4666,
"step": 360
},
{
"epoch": 1.0738985510532641,
"grad_norm": 35.80933380126953,
"learning_rate": 0.0002228152103718745,
"loss": 0.4619,
"step": 370
},
{
"epoch": 1.102922836216866,
"grad_norm": 32.89548873901367,
"learning_rate": 0.00021866422962554238,
"loss": 0.4739,
"step": 380
},
{
"epoch": 1.1319471213804675,
"grad_norm": 36.34146499633789,
"learning_rate": 0.0002144456208869851,
"loss": 0.4676,
"step": 390
},
{
"epoch": 1.1609714065440693,
"grad_norm": 42.522438049316406,
"learning_rate": 0.00021016353910034938,
"loss": 0.4765,
"step": 400
},
{
"epoch": 1.189995691707671,
"grad_norm": 34.677650451660156,
"learning_rate": 0.00020582220172493467,
"loss": 0.4715,
"step": 410
},
{
"epoch": 1.2190199768712728,
"grad_norm": 33.74694061279297,
"learning_rate": 0.0002014258845813811,
"loss": 0.4655,
"step": 420
},
{
"epoch": 1.2480442620348744,
"grad_norm": 30.60100555419922,
"learning_rate": 0.00019697891764037685,
"loss": 0.461,
"step": 430
},
{
"epoch": 1.2770685471984762,
"grad_norm": 38.6037483215332,
"learning_rate": 0.00019248568075803257,
"loss": 0.4719,
"step": 440
},
{
"epoch": 1.306092832362078,
"grad_norm": 32.19020080566406,
"learning_rate": 0.00018795059936212348,
"loss": 0.4586,
"step": 450
},
{
"epoch": 1.3351171175256797,
"grad_norm": 32.962276458740234,
"learning_rate": 0.00018337814009344714,
"loss": 0.4697,
"step": 460
},
{
"epoch": 1.3641414026892815,
"grad_norm": 29.69386863708496,
"learning_rate": 0.00017877280640659068,
"loss": 0.4639,
"step": 470
},
{
"epoch": 1.393165687852883,
"grad_norm": 31.52634620666504,
"learning_rate": 0.00017413913413443915,
"loss": 0.4579,
"step": 480
},
{
"epoch": 1.4221899730164849,
"grad_norm": 35.30017852783203,
"learning_rate": 0.0001694816870207949,
"loss": 0.4684,
"step": 490
},
{
"epoch": 1.4512142581800866,
"grad_norm": 33.88492202758789,
"learning_rate": 0.00016480505222550682,
"loss": 0.4534,
"step": 500
},
{
"epoch": 1.4802385433436884,
"grad_norm": 30.00653076171875,
"learning_rate": 0.00016011383580653697,
"loss": 0.464,
"step": 510
},
{
"epoch": 1.50926282850729,
"grad_norm": 33.75349807739258,
"learning_rate": 0.00015541265818341433,
"loss": 0.4497,
"step": 520
},
{
"epoch": 1.5382871136708918,
"grad_norm": 31.689538955688477,
"learning_rate": 0.00015070614958654393,
"loss": 0.4412,
"step": 530
},
{
"epoch": 1.5673113988344936,
"grad_norm": 28.848291397094727,
"learning_rate": 0.00014599894549685273,
"loss": 0.4467,
"step": 540
},
{
"epoch": 1.5963356839980953,
"grad_norm": 27.079084396362305,
"learning_rate": 0.0001412956820802647,
"loss": 0.4428,
"step": 550
},
{
"epoch": 1.6253599691616971,
"grad_norm": 29.99922752380371,
"learning_rate": 0.0001366009916215007,
"loss": 0.4374,
"step": 560
},
{
"epoch": 1.654384254325299,
"grad_norm": 28.763559341430664,
"learning_rate": 0.00013191949796170156,
"loss": 0.4419,
"step": 570
},
{
"epoch": 1.6834085394889005,
"grad_norm": 30.430801391601562,
"learning_rate": 0.00012725581194436694,
"loss": 0.445,
"step": 580
},
{
"epoch": 1.7124328246525022,
"grad_norm": 28.43861198425293,
"learning_rate": 0.00012261452687409576,
"loss": 0.4452,
"step": 590
},
{
"epoch": 1.7414571098161038,
"grad_norm": 33.317378997802734,
"learning_rate": 0.00011800021399260094,
"loss": 0.4378,
"step": 600
},
{
"epoch": 1.7704813949797056,
"grad_norm": 27.84680938720703,
"learning_rate": 0.00011341741797645384,
"loss": 0.4375,
"step": 610
},
{
"epoch": 1.7995056801433074,
"grad_norm": 32.20744705200195,
"learning_rate": 0.0001088706524609933,
"loss": 0.4281,
"step": 620
},
{
"epoch": 1.8285299653069091,
"grad_norm": 29.68756675720215,
"learning_rate": 0.00010436439559480705,
"loss": 0.4338,
"step": 630
},
{
"epoch": 1.857554250470511,
"grad_norm": 31.973575592041016,
"learning_rate": 9.990308562916479e-05,
"loss": 0.4265,
"step": 640
},
{
"epoch": 1.8865785356341127,
"grad_norm": 26.948545455932617,
"learning_rate": 9.549111654674586e-05,
"loss": 0.4165,
"step": 650
},
{
"epoch": 1.9156028207977145,
"grad_norm": 27.91978645324707,
"learning_rate": 9.11328337339681e-05,
"loss": 0.416,
"step": 660
},
{
"epoch": 1.944627105961316,
"grad_norm": 34.58734130859375,
"learning_rate": 8.68325297011791e-05,
"loss": 0.4196,
"step": 670
},
{
"epoch": 1.9736513911249178,
"grad_norm": 24.959909439086914,
"learning_rate": 8.259443985492576e-05,
"loss": 0.4305,
"step": 680
},
{
"epoch": 2.0026756762885194,
"grad_norm": 39.029258728027344,
"learning_rate": 7.842273832646591e-05,
"loss": 0.4122,
"step": 690
},
{
"epoch": 2.031699961452121,
"grad_norm": 27.386505126953125,
"learning_rate": 7.432153386063034e-05,
"loss": 0.2751,
"step": 700
},
{
"epoch": 2.060724246615723,
"grad_norm": 30.209821701049805,
"learning_rate": 7.029486576908444e-05,
"loss": 0.2654,
"step": 710
},
{
"epoch": 2.0897485317793247,
"grad_norm": 31.79279327392578,
"learning_rate": 6.63466999519756e-05,
"loss": 0.2648,
"step": 720
},
{
"epoch": 2.1187728169429265,
"grad_norm": 31.363250732421875,
"learning_rate": 6.248092499188372e-05,
"loss": 0.2587,
"step": 730
},
{
"epoch": 2.1477971021065283,
"grad_norm": 33.62345886230469,
"learning_rate": 5.870134832392269e-05,
"loss": 0.2564,
"step": 740
},
{
"epoch": 2.17682138727013,
"grad_norm": 31.332040786743164,
"learning_rate": 5.5011692485764734e-05,
"loss": 0.253,
"step": 750
},
{
"epoch": 2.205845672433732,
"grad_norm": 30.034757614135742,
"learning_rate": 5.141559145128093e-05,
"loss": 0.26,
"step": 760
},
{
"epoch": 2.234869957597333,
"grad_norm": 30.40983772277832,
"learning_rate": 4.791658705140897e-05,
"loss": 0.2507,
"step": 770
},
{
"epoch": 2.263894242760935,
"grad_norm": 27.134634017944336,
"learning_rate": 4.451812548577333e-05,
"loss": 0.2518,
"step": 780
},
{
"epoch": 2.2929185279245368,
"grad_norm": 27.9604434967041,
"learning_rate": 4.1223553928493564e-05,
"loss": 0.2494,
"step": 790
},
{
"epoch": 2.3219428130881385,
"grad_norm": 33.73405838012695,
"learning_rate": 3.803611723152345e-05,
"loss": 0.2441,
"step": 800
},
{
"epoch": 2.3509670982517403,
"grad_norm": 31.413331985473633,
"learning_rate": 3.495895472876854e-05,
"loss": 0.2479,
"step": 810
},
{
"epoch": 2.379991383415342,
"grad_norm": 28.82455062866211,
"learning_rate": 3.199509714412901e-05,
"loss": 0.2529,
"step": 820
},
{
"epoch": 2.409015668578944,
"grad_norm": 31.402931213378906,
"learning_rate": 2.9147463606513528e-05,
"loss": 0.2499,
"step": 830
},
{
"epoch": 2.4380399537425457,
"grad_norm": 25.637739181518555,
"learning_rate": 2.6418858774763992e-05,
"loss": 0.236,
"step": 840
},
{
"epoch": 2.467064238906147,
"grad_norm": 27.47572898864746,
"learning_rate": 2.38119700753228e-05,
"loss": 0.2432,
"step": 850
},
{
"epoch": 2.496088524069749,
"grad_norm": 28.527973175048828,
"learning_rate": 2.1329365055363595e-05,
"loss": 0.2428,
"step": 860
},
{
"epoch": 2.5251128092333506,
"grad_norm": 28.3017578125,
"learning_rate": 1.89734888539916e-05,
"loss": 0.2457,
"step": 870
},
{
"epoch": 2.5541370943969524,
"grad_norm": 27.692001342773438,
"learning_rate": 1.674666179400504e-05,
"loss": 0.2409,
"step": 880
},
{
"epoch": 2.583161379560554,
"grad_norm": 30.592241287231445,
"learning_rate": 1.4651077096589486e-05,
"loss": 0.2371,
"step": 890
},
{
"epoch": 2.612185664724156,
"grad_norm": 26.051584243774414,
"learning_rate": 1.2688798721195053e-05,
"loss": 0.2389,
"step": 900
},
{
"epoch": 2.6412099498877577,
"grad_norm": 28.38836097717285,
"learning_rate": 1.086175933272514e-05,
"loss": 0.2407,
"step": 910
},
{
"epoch": 2.6702342350513595,
"grad_norm": 27.81374740600586,
"learning_rate": 9.171758398038015e-06,
"loss": 0.2389,
"step": 920
},
{
"epoch": 2.6992585202149613,
"grad_norm": 27.540956497192383,
"learning_rate": 7.620460413636342e-06,
"loss": 0.2453,
"step": 930
},
{
"epoch": 2.728282805378563,
"grad_norm": 27.374300003051758,
"learning_rate": 6.209393266290291e-06,
"loss": 0.234,
"step": 940
},
{
"epoch": 2.757307090542165,
"grad_norm": 29.071474075317383,
"learning_rate": 4.939946728208627e-06,
"loss": 0.2406,
"step": 950
},
{
"epoch": 2.786331375705766,
"grad_norm": 25.93909454345703,
"learning_rate": 3.813371088240086e-06,
"loss": 0.231,
"step": 960
},
{
"epoch": 2.815355660869368,
"grad_norm": 28.83918571472168,
"learning_rate": 2.830775920453093e-06,
"loss": 0.2303,
"step": 970
},
{
"epoch": 2.8443799460329697,
"grad_norm": 28.06920623779297,
"learning_rate": 1.9931289913066694e-06,
"loss": 0.2339,
"step": 980
},
{
"epoch": 2.8734042311965715,
"grad_norm": 28.357439041137695,
"learning_rate": 1.3012553064889631e-06,
"loss": 0.2325,
"step": 990
},
{
"epoch": 2.9024285163601733,
"grad_norm": 25.29115104675293,
"learning_rate": 7.558362983619448e-07,
"loss": 0.2374,
"step": 1000
},
{
"epoch": 2.931452801523775,
"grad_norm": 27.02465057373047,
"learning_rate": 3.57409154812871e-07,
"loss": 0.2307,
"step": 1010
},
{
"epoch": 2.960477086687377,
"grad_norm": 26.2918701171875,
"learning_rate": 1.0636629017320431e-07,
"loss": 0.232,
"step": 1020
},
{
"epoch": 2.989501371850978,
"grad_norm": 28.43804359436035,
"learning_rate": 2.9549587264754428e-09,
"loss": 0.2287,
"step": 1030
},
{
"epoch": 2.995306228883699,
"step": 1032,
"total_flos": 1.0711204212442399e+18,
"train_loss": 0.44727156865735385,
"train_runtime": 21178.1386,
"train_samples_per_second": 6.247,
"train_steps_per_second": 0.049
}
],
"logging_steps": 10,
"max_steps": 1032,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0711204212442399e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}