|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.007455453664355476, |
|
"eval_steps": 9, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 7.455453664355477e-05, |
|
"grad_norm": 10.49715518951416, |
|
"learning_rate": 1e-05, |
|
"loss": 10.3675, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 7.455453664355477e-05, |
|
"eval_loss": 7.581628322601318, |
|
"eval_runtime": 1831.8815, |
|
"eval_samples_per_second": 6.166, |
|
"eval_steps_per_second": 0.771, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00014910907328710953, |
|
"grad_norm": 6.722843647003174, |
|
"learning_rate": 2e-05, |
|
"loss": 6.9309, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00022366360993066427, |
|
"grad_norm": 6.796828746795654, |
|
"learning_rate": 3e-05, |
|
"loss": 6.1734, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00029821814657421907, |
|
"grad_norm": 7.420350074768066, |
|
"learning_rate": 4e-05, |
|
"loss": 6.8689, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0003727726832177738, |
|
"grad_norm": 10.972343444824219, |
|
"learning_rate": 5e-05, |
|
"loss": 8.4649, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00044732721986132855, |
|
"grad_norm": 8.64516544342041, |
|
"learning_rate": 6e-05, |
|
"loss": 6.9404, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0005218817565048833, |
|
"grad_norm": 12.11950969696045, |
|
"learning_rate": 7e-05, |
|
"loss": 8.3633, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0005964362931484381, |
|
"grad_norm": 9.8136568069458, |
|
"learning_rate": 8e-05, |
|
"loss": 4.2432, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0006709908297919928, |
|
"grad_norm": 13.001618385314941, |
|
"learning_rate": 9e-05, |
|
"loss": 5.1604, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0006709908297919928, |
|
"eval_loss": 3.2318060398101807, |
|
"eval_runtime": 1836.1454, |
|
"eval_samples_per_second": 6.151, |
|
"eval_steps_per_second": 0.769, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0007455453664355476, |
|
"grad_norm": 8.031773567199707, |
|
"learning_rate": 0.0001, |
|
"loss": 3.0669, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0008200999030791024, |
|
"grad_norm": 9.736612319946289, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 2.9862, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0008946544397226571, |
|
"grad_norm": 8.880220413208008, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 1.9853, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0009692089763662119, |
|
"grad_norm": 8.896377563476562, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 2.2486, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0010437635130097667, |
|
"grad_norm": 8.374354362487793, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 2.2035, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0011183180496533215, |
|
"grad_norm": 6.076426029205322, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 1.6982, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0011928725862968763, |
|
"grad_norm": 7.052265167236328, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 1.2892, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0012674271229404308, |
|
"grad_norm": 8.613420486450195, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 0.8283, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0013419816595839856, |
|
"grad_norm": 6.096333026885986, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 1.2756, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0013419816595839856, |
|
"eval_loss": 1.3903357982635498, |
|
"eval_runtime": 1835.2174, |
|
"eval_samples_per_second": 6.155, |
|
"eval_steps_per_second": 0.769, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0014165361962275404, |
|
"grad_norm": 3.982419967651367, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 1.1631, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0014910907328710952, |
|
"grad_norm": 4.764151573181152, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 1.77, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00156564526951465, |
|
"grad_norm": 4.972811222076416, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 1.3896, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0016401998061582048, |
|
"grad_norm": 4.898617267608643, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.9579, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0017147543428017594, |
|
"grad_norm": 5.04691219329834, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 0.995, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0017893088794453142, |
|
"grad_norm": 3.136345386505127, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.9698, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.001863863416088869, |
|
"grad_norm": 4.723503589630127, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 1.3509, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0019384179527324238, |
|
"grad_norm": 2.264803886413574, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 0.6557, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0020129724893759786, |
|
"grad_norm": 3.9745659828186035, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 0.8906, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0020129724893759786, |
|
"eval_loss": 1.2160564661026, |
|
"eval_runtime": 1835.8241, |
|
"eval_samples_per_second": 6.153, |
|
"eval_steps_per_second": 0.769, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0020875270260195334, |
|
"grad_norm": 5.033815860748291, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 1.0061, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.002162081562663088, |
|
"grad_norm": 5.012985706329346, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 1.169, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.002236636099306643, |
|
"grad_norm": 5.339277267456055, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 1.3307, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0023111906359501977, |
|
"grad_norm": 7.8841352462768555, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 1.495, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0023857451725937525, |
|
"grad_norm": 4.286492824554443, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 1.2115, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.002460299709237307, |
|
"grad_norm": 6.6110358238220215, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 1.8073, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0025348542458808617, |
|
"grad_norm": 6.618931293487549, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 1.6379, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0026094087825244165, |
|
"grad_norm": 3.5826964378356934, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 1.1726, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0026839633191679713, |
|
"grad_norm": 3.5384361743927, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.8394, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0026839633191679713, |
|
"eval_loss": 1.1397225856781006, |
|
"eval_runtime": 1835.5605, |
|
"eval_samples_per_second": 6.153, |
|
"eval_steps_per_second": 0.769, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.002758517855811526, |
|
"grad_norm": 4.037872791290283, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 1.1838, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.002833072392455081, |
|
"grad_norm": 3.277031183242798, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 0.8776, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0029076269290986357, |
|
"grad_norm": 3.395357608795166, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.8324, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0029821814657421905, |
|
"grad_norm": 3.6542036533355713, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.0202, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0030567360023857452, |
|
"grad_norm": 4.9897356033325195, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 1.4792, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0031312905390293, |
|
"grad_norm": 5.673566818237305, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 1.5843, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.003205845075672855, |
|
"grad_norm": 3.129956007003784, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 0.5429, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0032803996123164096, |
|
"grad_norm": 4.75492000579834, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 1.1789, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0033549541489599644, |
|
"grad_norm": 3.616227626800537, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 1.1784, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0033549541489599644, |
|
"eval_loss": 1.0879795551300049, |
|
"eval_runtime": 1835.5507, |
|
"eval_samples_per_second": 6.153, |
|
"eval_steps_per_second": 0.769, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.003429508685603519, |
|
"grad_norm": 3.7293524742126465, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 1.2445, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0035040632222470736, |
|
"grad_norm": 3.8073537349700928, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 1.0901, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0035786177588906284, |
|
"grad_norm": 2.3273308277130127, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.5605, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.003653172295534183, |
|
"grad_norm": 5.4879584312438965, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 1.2716, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.003727726832177738, |
|
"grad_norm": 3.2840213775634766, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.4598, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0038022813688212928, |
|
"grad_norm": 6.318856716156006, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 1.8285, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0038768359054648475, |
|
"grad_norm": 3.896543502807617, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.8851, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.003951390442108402, |
|
"grad_norm": 3.0004584789276123, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 0.7748, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.004025944978751957, |
|
"grad_norm": 3.7571120262145996, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 1.0713, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.004025944978751957, |
|
"eval_loss": 1.0387237071990967, |
|
"eval_runtime": 1834.8348, |
|
"eval_samples_per_second": 6.156, |
|
"eval_steps_per_second": 0.77, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0041004995153955115, |
|
"grad_norm": 3.714383363723755, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8249, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.004175054052039067, |
|
"grad_norm": 3.444979190826416, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.8833, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.004249608588682621, |
|
"grad_norm": 3.4714789390563965, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 1.0161, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.004324163125326176, |
|
"grad_norm": 3.413393974304199, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.7506, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.004398717661969731, |
|
"grad_norm": 4.324283599853516, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 0.689, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.004473272198613286, |
|
"grad_norm": 5.534282684326172, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.8864, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00454782673525684, |
|
"grad_norm": 3.4391114711761475, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 0.8536, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0046223812719003955, |
|
"grad_norm": 4.6441450119018555, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.9308, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.00469693580854395, |
|
"grad_norm": 4.772485256195068, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.8085, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.00469693580854395, |
|
"eval_loss": 1.0256173610687256, |
|
"eval_runtime": 1835.156, |
|
"eval_samples_per_second": 6.155, |
|
"eval_steps_per_second": 0.769, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.004771490345187505, |
|
"grad_norm": 2.1971137523651123, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.4036, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0048460448818310594, |
|
"grad_norm": 2.6656343936920166, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.6526, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.004920599418474614, |
|
"grad_norm": 3.9616692066192627, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.813, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.004995153955118169, |
|
"grad_norm": 6.481931209564209, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 1.6251, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.005069708491761723, |
|
"grad_norm": 4.02551794052124, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 1.1398, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.005144263028405279, |
|
"grad_norm": 2.932755470275879, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.7208, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.005218817565048833, |
|
"grad_norm": 4.461419105529785, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 1.227, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.005293372101692388, |
|
"grad_norm": 3.890793561935425, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 1.1223, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0053679266383359426, |
|
"grad_norm": 2.5252115726470947, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.8403, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0053679266383359426, |
|
"eval_loss": 1.0127910375595093, |
|
"eval_runtime": 1835.2572, |
|
"eval_samples_per_second": 6.154, |
|
"eval_steps_per_second": 0.769, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.005442481174979498, |
|
"grad_norm": 3.003502607345581, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.9357, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.005517035711623052, |
|
"grad_norm": 3.790604829788208, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 1.0105, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.005591590248266607, |
|
"grad_norm": 2.7565271854400635, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.6162, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.005666144784910162, |
|
"grad_norm": 3.3704497814178467, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 0.9727, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.005740699321553717, |
|
"grad_norm": 2.8851423263549805, |
|
"learning_rate": 1.526708147705013e-05, |
|
"loss": 0.8882, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.005815253858197271, |
|
"grad_norm": 3.439664602279663, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 1.2121, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.005889808394840826, |
|
"grad_norm": 3.186978816986084, |
|
"learning_rate": 1.2842758726130283e-05, |
|
"loss": 0.789, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.005964362931484381, |
|
"grad_norm": 3.047581911087036, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.6949, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.006038917468127935, |
|
"grad_norm": 4.091953277587891, |
|
"learning_rate": 1.0599462319663905e-05, |
|
"loss": 1.3274, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.006038917468127935, |
|
"eval_loss": 0.9921898245811462, |
|
"eval_runtime": 1834.9348, |
|
"eval_samples_per_second": 6.156, |
|
"eval_steps_per_second": 0.77, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0061134720047714905, |
|
"grad_norm": 2.8362438678741455, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.6543, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.006188026541415045, |
|
"grad_norm": 3.722740650177002, |
|
"learning_rate": 8.548121372247918e-06, |
|
"loss": 1.0046, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0062625810780586, |
|
"grad_norm": 2.977998971939087, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 0.7849, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.0063371356147021544, |
|
"grad_norm": 3.894702196121216, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 1.0554, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.00641169015134571, |
|
"grad_norm": 3.460470199584961, |
|
"learning_rate": 5.852620357053651e-06, |
|
"loss": 0.7595, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.006486244687989264, |
|
"grad_norm": 4.5167155265808105, |
|
"learning_rate": 5.060297685041659e-06, |
|
"loss": 1.3252, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.006560799224632819, |
|
"grad_norm": 2.837597131729126, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 0.9637, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.006635353761276374, |
|
"grad_norm": 3.9109745025634766, |
|
"learning_rate": 3.6408072716606346e-06, |
|
"loss": 1.3869, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.006709908297919929, |
|
"grad_norm": 3.083695888519287, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 1.1615, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.006709908297919929, |
|
"eval_loss": 0.9879239797592163, |
|
"eval_runtime": 1835.3859, |
|
"eval_samples_per_second": 6.154, |
|
"eval_steps_per_second": 0.769, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.006784462834563483, |
|
"grad_norm": 3.445347547531128, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 1.2251, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.006859017371207038, |
|
"grad_norm": 4.91117525100708, |
|
"learning_rate": 1.9369152030840556e-06, |
|
"loss": 1.2883, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.006933571907850593, |
|
"grad_norm": 4.425319671630859, |
|
"learning_rate": 1.4852136862001764e-06, |
|
"loss": 1.7649, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.007008126444494147, |
|
"grad_norm": 3.255776882171631, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 0.8542, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.007082680981137702, |
|
"grad_norm": 6.230947017669678, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 1.0066, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.007157235517781257, |
|
"grad_norm": 2.7322230339050293, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 0.669, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.007231790054424812, |
|
"grad_norm": 2.8557915687561035, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 1.054, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.007306344591068366, |
|
"grad_norm": 3.587625741958618, |
|
"learning_rate": 1.2179748700879012e-07, |
|
"loss": 1.292, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.0073808991277119216, |
|
"grad_norm": 3.0693485736846924, |
|
"learning_rate": 3.04586490452119e-08, |
|
"loss": 1.1408, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.0073808991277119216, |
|
"eval_loss": 0.9858196377754211, |
|
"eval_runtime": 1834.8841, |
|
"eval_samples_per_second": 6.156, |
|
"eval_steps_per_second": 0.77, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.007455453664355476, |
|
"grad_norm": 2.8233108520507812, |
|
"learning_rate": 0.0, |
|
"loss": 0.8195, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0757365611626496e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|