|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 4825, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002072538860103627, |
|
"grad_norm": 0.5270228385925293, |
|
"learning_rate": 4.140786749482402e-07, |
|
"loss": 0.7597, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02072538860103627, |
|
"grad_norm": 0.3120118975639343, |
|
"learning_rate": 4.140786749482402e-05, |
|
"loss": 0.8058, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04145077720207254, |
|
"grad_norm": 0.3838038146495819, |
|
"learning_rate": 8.281573498964804e-05, |
|
"loss": 0.8085, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06217616580310881, |
|
"grad_norm": 0.33677420020103455, |
|
"learning_rate": 0.00012422360248447205, |
|
"loss": 0.8452, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08290155440414508, |
|
"grad_norm": 0.2833797335624695, |
|
"learning_rate": 0.00016563146997929608, |
|
"loss": 0.7836, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10362694300518134, |
|
"grad_norm": 0.3029078245162964, |
|
"learning_rate": 0.0001999924354607825, |
|
"loss": 0.8032, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12435233160621761, |
|
"grad_norm": 0.30878347158432007, |
|
"learning_rate": 0.00019964190153093613, |
|
"loss": 0.8034, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.14507772020725387, |
|
"grad_norm": 0.1962185502052307, |
|
"learning_rate": 0.00019878125322990773, |
|
"loss": 0.8069, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.16580310880829016, |
|
"grad_norm": 0.2113143652677536, |
|
"learning_rate": 0.00019739761494902327, |
|
"loss": 0.8239, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.18652849740932642, |
|
"grad_norm": 0.36496707797050476, |
|
"learning_rate": 0.00019550431853565577, |
|
"loss": 0.7952, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.20725388601036268, |
|
"grad_norm": 0.4117984175682068, |
|
"learning_rate": 0.00019311127115144138, |
|
"loss": 0.8159, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22797927461139897, |
|
"grad_norm": 0.25097981095314026, |
|
"learning_rate": 0.00019023099503382319, |
|
"loss": 0.8295, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.24870466321243523, |
|
"grad_norm": 0.2996521592140198, |
|
"learning_rate": 0.00018687856197021518, |
|
"loss": 0.8071, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2694300518134715, |
|
"grad_norm": 0.2897460162639618, |
|
"learning_rate": 0.0001830715144309886, |
|
"loss": 0.8338, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.29015544041450775, |
|
"grad_norm": 0.5588517785072327, |
|
"learning_rate": 0.0001788297737739727, |
|
"loss": 0.7572, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.31088082901554404, |
|
"grad_norm": 0.47002148628234863, |
|
"learning_rate": 0.00017417553600081358, |
|
"loss": 0.809, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3316062176165803, |
|
"grad_norm": 0.3224999010562897, |
|
"learning_rate": 0.00016913315561067264, |
|
"loss": 0.7942, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.35233160621761656, |
|
"grad_norm": 0.5584061145782471, |
|
"learning_rate": 0.0001637290181590304, |
|
"loss": 0.8329, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.37305699481865284, |
|
"grad_norm": 0.3566969633102417, |
|
"learning_rate": 0.00015799140218846435, |
|
"loss": 0.7901, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.39378238341968913, |
|
"grad_norm": 0.27286580204963684, |
|
"learning_rate": 0.00015195033125388395, |
|
"loss": 0.8138, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.41450777202072536, |
|
"grad_norm": 0.3148711621761322, |
|
"learning_rate": 0.00014563741681653824, |
|
"loss": 0.7867, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.43523316062176165, |
|
"grad_norm": 0.3646707832813263, |
|
"learning_rate": 0.0001390856928288946, |
|
"loss": 0.7681, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.45595854922279794, |
|
"grad_norm": 0.32302677631378174, |
|
"learning_rate": 0.00013232944287596522, |
|
"loss": 0.8134, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.47668393782383417, |
|
"grad_norm": 0.7468949556350708, |
|
"learning_rate": 0.000125404020777609, |
|
"loss": 0.7862, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.49740932642487046, |
|
"grad_norm": 0.36616334319114685, |
|
"learning_rate": 0.00011834566559055394, |
|
"loss": 0.7675, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"grad_norm": 0.3042179346084595, |
|
"learning_rate": 0.00011119131197818904, |
|
"loss": 0.7824, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.538860103626943, |
|
"grad_norm": 0.6282315850257874, |
|
"learning_rate": 0.0001039783969404153, |
|
"loss": 0.8008, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5595854922279793, |
|
"grad_norm": 0.309589147567749, |
|
"learning_rate": 9.674466391489112e-05, |
|
"loss": 0.7918, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5803108808290155, |
|
"grad_norm": 0.6195072531700134, |
|
"learning_rate": 8.952796527476341e-05, |
|
"loss": 0.7714, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6010362694300518, |
|
"grad_norm": 0.5037879943847656, |
|
"learning_rate": 8.236606425636553e-05, |
|
"loss": 0.8098, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.6217616580310881, |
|
"grad_norm": 0.3202759325504303, |
|
"learning_rate": 7.529643735334646e-05, |
|
"loss": 0.7303, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6424870466321243, |
|
"grad_norm": 0.4311729669570923, |
|
"learning_rate": 6.835607821125519e-05, |
|
"loss": 0.7717, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6632124352331606, |
|
"grad_norm": 0.28787970542907715, |
|
"learning_rate": 6.158130404875231e-05, |
|
"loss": 0.7748, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6839378238341969, |
|
"grad_norm": 0.28564584255218506, |
|
"learning_rate": 5.5007565618399506e-05, |
|
"loss": 0.8161, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.7046632124352331, |
|
"grad_norm": 0.5357826352119446, |
|
"learning_rate": 4.873136932795313e-05, |
|
"loss": 0.7594, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.7253886010362695, |
|
"grad_norm": 0.292579710483551, |
|
"learning_rate": 4.265882101960175e-05, |
|
"loss": 0.7572, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7461139896373057, |
|
"grad_norm": 0.29901111125946045, |
|
"learning_rate": 3.688632521467985e-05, |
|
"loss": 0.7514, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7668393782383419, |
|
"grad_norm": 0.3171032667160034, |
|
"learning_rate": 3.144408798582336e-05, |
|
"loss": 0.7862, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7875647668393783, |
|
"grad_norm": 0.43368250131607056, |
|
"learning_rate": 2.636058724251739e-05, |
|
"loss": 0.7994, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.8082901554404145, |
|
"grad_norm": 0.4301137924194336, |
|
"learning_rate": 2.1662423713092516e-05, |
|
"loss": 0.7646, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.8290155440414507, |
|
"grad_norm": 0.20049989223480225, |
|
"learning_rate": 1.737418174955542e-05, |
|
"loss": 0.7742, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.8497409326424871, |
|
"grad_norm": 0.5148625373840332, |
|
"learning_rate": 1.3518300683627982e-05, |
|
"loss": 0.7357, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8704663212435233, |
|
"grad_norm": 0.38820308446884155, |
|
"learning_rate": 1.011495740715882e-05, |
|
"loss": 0.7536, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8911917098445595, |
|
"grad_norm": 0.6004698276519775, |
|
"learning_rate": 7.1819607913342745e-06, |
|
"loss": 0.7238, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.9119170984455959, |
|
"grad_norm": 0.30640721321105957, |
|
"learning_rate": 4.734658497168276e-06, |
|
"loss": 0.7669, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.9326424870466321, |
|
"grad_norm": 0.23962052166461945, |
|
"learning_rate": 2.7858566649088814e-06, |
|
"loss": 0.7329, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.9533678756476683, |
|
"grad_norm": 0.2379899024963379, |
|
"learning_rate": 1.3457529026076777e-06, |
|
"loss": 0.7845, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9740932642487047, |
|
"grad_norm": 0.291648268699646, |
|
"learning_rate": 4.218829245063227e-07, |
|
"loss": 0.773, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.9948186528497409, |
|
"grad_norm": 0.7486833930015564, |
|
"learning_rate": 1.9081118468888824e-08, |
|
"loss": 0.8057, |
|
"step": 4800 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 4825, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3151301866366976e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|