|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9990416866315286, |
|
"eval_steps": 200, |
|
"global_step": 3336, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011978917105893628, |
|
"grad_norm": 476.6510925292969, |
|
"learning_rate": 8e-07, |
|
"loss": 11.6475, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.023957834211787255, |
|
"grad_norm": 74.57940673828125, |
|
"learning_rate": 1.9999928625229307e-06, |
|
"loss": 2.3869, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.035936751317680884, |
|
"grad_norm": 125.54178619384766, |
|
"learning_rate": 1.999912567076008e-06, |
|
"loss": 7.1899, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04791566842357451, |
|
"grad_norm": 14.804201126098633, |
|
"learning_rate": 1.999743061523497e-06, |
|
"loss": 5.0722, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.059894585529468136, |
|
"grad_norm": 9.310958862304688, |
|
"learning_rate": 1.999484360988329e-06, |
|
"loss": 2.9189, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07187350263536177, |
|
"grad_norm": 306.2783508300781, |
|
"learning_rate": 1.999136488551224e-06, |
|
"loss": 2.8403, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08385241974125539, |
|
"grad_norm": 134.76495361328125, |
|
"learning_rate": 1.9986994752486316e-06, |
|
"loss": 4.2047, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09583133684714902, |
|
"grad_norm": 78.88103485107422, |
|
"learning_rate": 1.998173360069964e-06, |
|
"loss": 5.1269, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10781025395304264, |
|
"grad_norm": 60.88028335571289, |
|
"learning_rate": 1.997558189954117e-06, |
|
"loss": 4.8787, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11978917105893627, |
|
"grad_norm": 48.85028076171875, |
|
"learning_rate": 1.9968540197852784e-06, |
|
"loss": 2.6971, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1317680881648299, |
|
"grad_norm": 82.52726745605469, |
|
"learning_rate": 1.9960609123880376e-06, |
|
"loss": 6.6349, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14374700527072354, |
|
"grad_norm": 18.934968948364258, |
|
"learning_rate": 1.9951789385217753e-06, |
|
"loss": 3.6926, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15572592237661714, |
|
"grad_norm": 95.94075012207031, |
|
"learning_rate": 1.9942081768743535e-06, |
|
"loss": 5.221, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16770483948251078, |
|
"grad_norm": 69.58992767333984, |
|
"learning_rate": 1.9931487140550935e-06, |
|
"loss": 5.8621, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1796837565884044, |
|
"grad_norm": 18.388065338134766, |
|
"learning_rate": 1.9920006445870497e-06, |
|
"loss": 5.2103, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.19166267369429804, |
|
"grad_norm": 117.37078857421875, |
|
"learning_rate": 1.9907640708985766e-06, |
|
"loss": 5.8106, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20364159080019167, |
|
"grad_norm": 118.88228607177734, |
|
"learning_rate": 1.9894391033141887e-06, |
|
"loss": 4.0891, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21562050790608528, |
|
"grad_norm": 17.98489761352539, |
|
"learning_rate": 1.9880258600447204e-06, |
|
"loss": 5.7061, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2275994250119789, |
|
"grad_norm": 170.75030517578125, |
|
"learning_rate": 1.986524467176777e-06, |
|
"loss": 4.2787, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23957834211787254, |
|
"grad_norm": 74.85968017578125, |
|
"learning_rate": 1.9849350586614863e-06, |
|
"loss": 7.8201, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.23957834211787254, |
|
"eval_loss": 1.2861307859420776, |
|
"eval_runtime": 238.5238, |
|
"eval_samples_per_second": 6.247, |
|
"eval_steps_per_second": 3.123, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.25155725922376615, |
|
"grad_norm": 254.58087158203125, |
|
"learning_rate": 1.983257776302548e-06, |
|
"loss": 5.8449, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2635361763296598, |
|
"grad_norm": 12.087563514709473, |
|
"learning_rate": 1.9814927697435826e-06, |
|
"loss": 5.7451, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2755150934355534, |
|
"grad_norm": 11.869245529174805, |
|
"learning_rate": 1.9796401964547794e-06, |
|
"loss": 6.4206, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2874940105414471, |
|
"grad_norm": 18.04955291748047, |
|
"learning_rate": 1.977700221718848e-06, |
|
"loss": 3.3466, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2994729276473407, |
|
"grad_norm": 40.88437271118164, |
|
"learning_rate": 1.975673018616273e-06, |
|
"loss": 4.5986, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3114518447532343, |
|
"grad_norm": 17.213830947875977, |
|
"learning_rate": 1.97355876800987e-06, |
|
"loss": 3.6809, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.32343076185912795, |
|
"grad_norm": 31.727243423461914, |
|
"learning_rate": 1.9713576585286513e-06, |
|
"loss": 4.692, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.33540967896502155, |
|
"grad_norm": 95.36638641357422, |
|
"learning_rate": 1.9690698865509964e-06, |
|
"loss": 6.1814, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3473885960709152, |
|
"grad_norm": 25.670534133911133, |
|
"learning_rate": 1.966695656187131e-06, |
|
"loss": 2.8556, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3593675131768088, |
|
"grad_norm": 137.57431030273438, |
|
"learning_rate": 1.9642351792609162e-06, |
|
"loss": 3.3607, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3713464302827024, |
|
"grad_norm": 46.084354400634766, |
|
"learning_rate": 1.9616886752909523e-06, |
|
"loss": 6.1352, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3833253473885961, |
|
"grad_norm": 110.49552154541016, |
|
"learning_rate": 1.9590563714709916e-06, |
|
"loss": 5.8323, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3953042644944897, |
|
"grad_norm": 77.74449157714844, |
|
"learning_rate": 1.9563385026496687e-06, |
|
"loss": 5.7407, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.40728318160038335, |
|
"grad_norm": 26.705305099487305, |
|
"learning_rate": 1.9535353113095493e-06, |
|
"loss": 5.3508, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41926209870627695, |
|
"grad_norm": 146.0390167236328, |
|
"learning_rate": 1.9506470475454957e-06, |
|
"loss": 2.9407, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.43124101581217056, |
|
"grad_norm": 25.178991317749023, |
|
"learning_rate": 1.947673969042353e-06, |
|
"loss": 3.0089, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4432199329180642, |
|
"grad_norm": 8.458102226257324, |
|
"learning_rate": 1.9446163410519603e-06, |
|
"loss": 2.885, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4551988500239578, |
|
"grad_norm": 94.5208740234375, |
|
"learning_rate": 1.9414744363694842e-06, |
|
"loss": 3.8878, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4671777671298515, |
|
"grad_norm": 138.86561584472656, |
|
"learning_rate": 1.938248535309083e-06, |
|
"loss": 5.5948, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4791566842357451, |
|
"grad_norm": 10.55215072631836, |
|
"learning_rate": 1.9349389256788943e-06, |
|
"loss": 2.9242, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4791566842357451, |
|
"eval_loss": 1.1180063486099243, |
|
"eval_runtime": 237.218, |
|
"eval_samples_per_second": 6.281, |
|
"eval_steps_per_second": 3.141, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4911356013416387, |
|
"grad_norm": 11.329914093017578, |
|
"learning_rate": 1.931545902755359e-06, |
|
"loss": 5.7209, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5031145184475323, |
|
"grad_norm": 121.2057113647461, |
|
"learning_rate": 1.928069769256879e-06, |
|
"loss": 4.2294, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.515093435553426, |
|
"grad_norm": 70.16046905517578, |
|
"learning_rate": 1.9245108353168055e-06, |
|
"loss": 5.1172, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5270723526593196, |
|
"grad_norm": 55.029964447021484, |
|
"learning_rate": 1.9208694184557735e-06, |
|
"loss": 3.8455, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5390512697652132, |
|
"grad_norm": 16.75533103942871, |
|
"learning_rate": 1.9171458435533706e-06, |
|
"loss": 2.1762, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5510301868711068, |
|
"grad_norm": 37.192169189453125, |
|
"learning_rate": 1.913340442819153e-06, |
|
"loss": 4.6994, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5630091039770004, |
|
"grad_norm": 190.2852020263672, |
|
"learning_rate": 1.9094535557630067e-06, |
|
"loss": 8.188, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5749880210828942, |
|
"grad_norm": 14.840240478515625, |
|
"learning_rate": 1.905485529164856e-06, |
|
"loss": 2.4346, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5869669381887878, |
|
"grad_norm": 17.85882568359375, |
|
"learning_rate": 1.9014367170437255e-06, |
|
"loss": 5.1088, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5989458552946814, |
|
"grad_norm": 98.2167739868164, |
|
"learning_rate": 1.8973074806261558e-06, |
|
"loss": 4.4192, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.610924772400575, |
|
"grad_norm": 72.538330078125, |
|
"learning_rate": 1.8930981883139734e-06, |
|
"loss": 4.2753, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6229036895064686, |
|
"grad_norm": 121.5967788696289, |
|
"learning_rate": 1.8888092156514252e-06, |
|
"loss": 5.0462, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6348826066123623, |
|
"grad_norm": 161.8177947998047, |
|
"learning_rate": 1.8844409452916719e-06, |
|
"loss": 3.2489, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6468615237182559, |
|
"grad_norm": 149.68197631835938, |
|
"learning_rate": 1.8799937669626481e-06, |
|
"loss": 4.8399, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6588404408241495, |
|
"grad_norm": 87.29440307617188, |
|
"learning_rate": 1.8754680774322934e-06, |
|
"loss": 5.3579, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6708193579300431, |
|
"grad_norm": 70.05744171142578, |
|
"learning_rate": 1.8708642804731513e-06, |
|
"loss": 1.967, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6827982750359367, |
|
"grad_norm": 49.84896469116211, |
|
"learning_rate": 1.866182786826347e-06, |
|
"loss": 4.1978, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6947771921418304, |
|
"grad_norm": 29.61354637145996, |
|
"learning_rate": 1.861424014164941e-06, |
|
"loss": 4.025, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.706756109247724, |
|
"grad_norm": 99.13072204589844, |
|
"learning_rate": 1.8565883870566666e-06, |
|
"loss": 4.1162, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7187350263536176, |
|
"grad_norm": 130.23606872558594, |
|
"learning_rate": 1.8516763369260492e-06, |
|
"loss": 3.0065, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7187350263536176, |
|
"eval_loss": 1.104053258895874, |
|
"eval_runtime": 238.4518, |
|
"eval_samples_per_second": 6.249, |
|
"eval_steps_per_second": 3.124, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7307139434595112, |
|
"grad_norm": 50.47702407836914, |
|
"learning_rate": 1.8466883020159161e-06, |
|
"loss": 4.3503, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7426928605654048, |
|
"grad_norm": 17.21928596496582, |
|
"learning_rate": 1.8416247273482988e-06, |
|
"loss": 4.4346, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7546717776712986, |
|
"grad_norm": 49.47705841064453, |
|
"learning_rate": 1.8364860646847262e-06, |
|
"loss": 3.9906, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7666506947771922, |
|
"grad_norm": 14.143331527709961, |
|
"learning_rate": 1.831272772485922e-06, |
|
"loss": 3.3026, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7786296118830858, |
|
"grad_norm": 17.33100128173828, |
|
"learning_rate": 1.8259853158708997e-06, |
|
"loss": 6.0244, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7906085289889794, |
|
"grad_norm": 10.411093711853027, |
|
"learning_rate": 1.8206241665754687e-06, |
|
"loss": 2.8721, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.802587446094873, |
|
"grad_norm": 181.8240966796875, |
|
"learning_rate": 1.815189802910143e-06, |
|
"loss": 5.1721, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8145663632007667, |
|
"grad_norm": 39.83287048339844, |
|
"learning_rate": 1.80968270971747e-06, |
|
"loss": 4.9115, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8265452803066603, |
|
"grad_norm": 39.86928176879883, |
|
"learning_rate": 1.8041033783287737e-06, |
|
"loss": 3.8957, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8385241974125539, |
|
"grad_norm": 57.7371711730957, |
|
"learning_rate": 1.7984523065203188e-06, |
|
"loss": 3.1863, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8505031145184475, |
|
"grad_norm": 14.148628234863281, |
|
"learning_rate": 1.792729998468899e-06, |
|
"loss": 4.26, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8624820316243411, |
|
"grad_norm": 71.03279113769531, |
|
"learning_rate": 1.7869369647068577e-06, |
|
"loss": 4.9559, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8744609487302348, |
|
"grad_norm": 17.670730590820312, |
|
"learning_rate": 1.7810737220765372e-06, |
|
"loss": 3.9867, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8864398658361284, |
|
"grad_norm": 14.698404312133789, |
|
"learning_rate": 1.7751407936841684e-06, |
|
"loss": 2.7134, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.898418782942022, |
|
"grad_norm": 66.42393493652344, |
|
"learning_rate": 1.7691387088532001e-06, |
|
"loss": 3.2121, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9103977000479156, |
|
"grad_norm": 76.34748077392578, |
|
"learning_rate": 1.7630680030770732e-06, |
|
"loss": 4.7613, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9223766171538093, |
|
"grad_norm": 46.27962112426758, |
|
"learning_rate": 1.7569292179714465e-06, |
|
"loss": 3.2976, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.934355534259703, |
|
"grad_norm": 14.20971965789795, |
|
"learning_rate": 1.750722901225873e-06, |
|
"loss": 1.9176, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9463344513655966, |
|
"grad_norm": 56.962379455566406, |
|
"learning_rate": 1.7444496065549384e-06, |
|
"loss": 1.9859, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9583133684714902, |
|
"grad_norm": 32.20167541503906, |
|
"learning_rate": 1.7381098936488574e-06, |
|
"loss": 6.9549, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9583133684714902, |
|
"eval_loss": 1.0978227853775024, |
|
"eval_runtime": 238.5511, |
|
"eval_samples_per_second": 6.246, |
|
"eval_steps_per_second": 3.123, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9702922855773838, |
|
"grad_norm": 82.94165802001953, |
|
"learning_rate": 1.7317043281235418e-06, |
|
"loss": 4.1317, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9822712026832774, |
|
"grad_norm": 110.44422912597656, |
|
"learning_rate": 1.725233481470135e-06, |
|
"loss": 3.2924, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9942501197891711, |
|
"grad_norm": 88.77394104003906, |
|
"learning_rate": 1.7186979310040268e-06, |
|
"loss": 5.5422, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.0071873502635362, |
|
"grad_norm": 167.41412353515625, |
|
"learning_rate": 1.7120982598133456e-06, |
|
"loss": 3.5133, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.0191662673694297, |
|
"grad_norm": 16.926607131958008, |
|
"learning_rate": 1.7054350567069364e-06, |
|
"loss": 4.2376, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0311451844753234, |
|
"grad_norm": 96.12760925292969, |
|
"learning_rate": 1.698708916161829e-06, |
|
"loss": 3.6823, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.0431241015812172, |
|
"grad_norm": 103.28370666503906, |
|
"learning_rate": 1.6919204382701987e-06, |
|
"loss": 2.5705, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.0551030186871106, |
|
"grad_norm": 166.0828857421875, |
|
"learning_rate": 1.6850702286858298e-06, |
|
"loss": 2.9061, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.0670819357930044, |
|
"grad_norm": 136.93392944335938, |
|
"learning_rate": 1.678158898570078e-06, |
|
"loss": 2.8635, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.0790608528988979, |
|
"grad_norm": 14.542271614074707, |
|
"learning_rate": 1.6711870645373449e-06, |
|
"loss": 4.2555, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.0910397700047916, |
|
"grad_norm": 53.567359924316406, |
|
"learning_rate": 1.6641553486000651e-06, |
|
"loss": 3.1885, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.1030186871106853, |
|
"grad_norm": 100.29656982421875, |
|
"learning_rate": 1.6570643781132118e-06, |
|
"loss": 4.953, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.1149976042165788, |
|
"grad_norm": 89.06425476074219, |
|
"learning_rate": 1.649914785718324e-06, |
|
"loss": 4.9896, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.1269765213224725, |
|
"grad_norm": 17.858898162841797, |
|
"learning_rate": 1.6427072092870651e-06, |
|
"loss": 1.5295, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.138955438428366, |
|
"grad_norm": 15.720714569091797, |
|
"learning_rate": 1.6354422918643133e-06, |
|
"loss": 3.0117, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.1509343555342597, |
|
"grad_norm": 13.404827117919922, |
|
"learning_rate": 1.628120681610789e-06, |
|
"loss": 2.1361, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.1629132726401532, |
|
"grad_norm": 130.946044921875, |
|
"learning_rate": 1.6207430317452297e-06, |
|
"loss": 3.941, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.174892189746047, |
|
"grad_norm": 14.730375289916992, |
|
"learning_rate": 1.613310000486108e-06, |
|
"loss": 3.2318, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.1868711068519406, |
|
"grad_norm": 82.65552520751953, |
|
"learning_rate": 1.6058222509929096e-06, |
|
"loss": 3.9045, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.1988500239578341, |
|
"grad_norm": null,
|
"learning_rate": 1.5982804513069664e-06, |
|
"loss": 5.5404, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.1988500239578341, |
|
"eval_loss": 1.1037527322769165, |
|
"eval_runtime": 238.5531, |
|
"eval_samples_per_second": 6.246, |
|
"eval_steps_per_second": 3.123, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2108289410637278, |
|
"grad_norm": 67.75409698486328, |
|
"learning_rate": 1.5914471746978935e-06, |
|
"loss": 2.6392, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.2228078581696216, |
|
"grad_norm": 49.21822738647461, |
|
"learning_rate": 1.5838045373221053e-06, |
|
"loss": 4.0259, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.234786775275515, |
|
"grad_norm": 228.187744140625, |
|
"learning_rate": 1.5761098141278849e-06, |
|
"loss": 5.8343, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.2467656923814088, |
|
"grad_norm": 33.68708419799805, |
|
"learning_rate": 1.5683636916223236e-06, |
|
"loss": 3.9807, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.2587446094873023, |
|
"grad_norm": 140.39187622070312, |
|
"learning_rate": 1.5605668608982526e-06, |
|
"loss": 3.9716, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.270723526593196, |
|
"grad_norm": 11.902155876159668, |
|
"learning_rate": 1.5527200175725842e-06, |
|
"loss": 3.2315, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.2827024436990895, |
|
"grad_norm": 144.3499298095703, |
|
"learning_rate": 1.5448238617242488e-06, |
|
"loss": 2.6336, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.2946813608049832, |
|
"grad_norm": 59.747928619384766, |
|
"learning_rate": 1.5368790978317395e-06, |
|
"loss": 3.206, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.306660277910877, |
|
"grad_norm": 53.534950256347656, |
|
"learning_rate": 1.5288864347102545e-06, |
|
"loss": 4.3036, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.3186391950167704, |
|
"grad_norm": 48.62434387207031, |
|
"learning_rate": 1.520846585448463e-06, |
|
"loss": 2.4486, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3306181121226641, |
|
"grad_norm": 18.836971282958984, |
|
"learning_rate": 1.512760267344882e-06, |
|
"loss": 4.0121, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.3425970292285578, |
|
"grad_norm": 224.24917602539062, |
|
"learning_rate": 1.5046282018438814e-06, |
|
"loss": 2.8545, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.3545759463344513, |
|
"grad_norm": 18.120187759399414, |
|
"learning_rate": 1.4964511144713174e-06, |
|
"loss": 3.1619, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.366554863440345, |
|
"grad_norm": 39.76359939575195, |
|
"learning_rate": 1.4882297347698048e-06, |
|
"loss": 3.0413, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.3785337805462385, |
|
"grad_norm": 64.61255645751953, |
|
"learning_rate": 1.4799647962336255e-06, |
|
"loss": 3.8001, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.3905126976521323, |
|
"grad_norm": 12.653026580810547, |
|
"learning_rate": 1.471657036243291e-06, |
|
"loss": 5.532, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.4024916147580258, |
|
"grad_norm": 53.37699508666992, |
|
"learning_rate": 1.4633071959997525e-06, |
|
"loss": 3.4156, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.4144705318639195, |
|
"grad_norm": 37.08938217163086, |
|
"learning_rate": 1.4549160204582731e-06, |
|
"loss": 2.5073, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.4264494489698132, |
|
"grad_norm": 139.479736328125, |
|
"learning_rate": 1.4464842582619652e-06, |
|
"loss": 3.36, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.4384283660757067, |
|
"grad_norm": 88.59599304199219, |
|
"learning_rate": 1.4380126616749975e-06, |
|
"loss": 5.2213, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4384283660757067, |
|
"eval_loss": 1.1036500930786133, |
|
"eval_runtime": 238.313, |
|
"eval_samples_per_second": 6.252, |
|
"eval_steps_per_second": 3.126, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4504072831816004, |
|
"grad_norm": 129.61988830566406, |
|
"learning_rate": 1.4295019865154785e-06, |
|
"loss": 7.1682, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.462386200287494, |
|
"grad_norm": 19.823545455932617, |
|
"learning_rate": 1.4209529920880272e-06, |
|
"loss": 4.6843, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.4743651173933876, |
|
"grad_norm": 107.53630065917969, |
|
"learning_rate": 1.4123664411160252e-06, |
|
"loss": 2.4525, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.4863440344992813, |
|
"grad_norm": 54.906280517578125, |
|
"learning_rate": 1.4037430996735722e-06, |
|
"loss": 5.9388, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.4983229516051748, |
|
"grad_norm": 77.29488372802734, |
|
"learning_rate": 1.3950837371171355e-06, |
|
"loss": 5.3705, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.5103018687110685, |
|
"grad_norm": 115.9428482055664, |
|
"learning_rate": 1.3863891260169114e-06, |
|
"loss": 4.0317, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.522280785816962, |
|
"grad_norm": 19.207189559936523, |
|
"learning_rate": 1.3776600420878973e-06, |
|
"loss": 3.8767, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.5342597029228557, |
|
"grad_norm": 83.13814544677734, |
|
"learning_rate": 1.3688972641206837e-06, |
|
"loss": 4.5835, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.5462386200287495, |
|
"grad_norm": 222.7005157470703, |
|
"learning_rate": 1.3601015739119733e-06, |
|
"loss": 3.3379, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.558217537134643, |
|
"grad_norm": 51.51054382324219, |
|
"learning_rate": 1.35127375619483e-06, |
|
"loss": 5.4397, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5701964542405367, |
|
"grad_norm": 109.09092712402344, |
|
"learning_rate": 1.3424145985686662e-06, |
|
"loss": 3.1896, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.5821753713464304, |
|
"grad_norm": 11.890337944030762, |
|
"learning_rate": 1.333524891428976e-06, |
|
"loss": 4.4828, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.5941542884523239, |
|
"grad_norm": 18.87173843383789, |
|
"learning_rate": 1.324605427896817e-06, |
|
"loss": 2.4719, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.6061332055582176, |
|
"grad_norm": 110.26778411865234, |
|
"learning_rate": 1.3156570037480497e-06, |
|
"loss": 3.4721, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.6181121226641113, |
|
"grad_norm": 19.194913864135742, |
|
"learning_rate": 1.3066804173423397e-06, |
|
"loss": 4.3532, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.6300910397700048, |
|
"grad_norm": 89.65567016601562, |
|
"learning_rate": 1.297676469551931e-06, |
|
"loss": 4.1742, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.6420699568758983, |
|
"grad_norm": 15.179718017578125, |
|
"learning_rate": 1.2886459636901927e-06, |
|
"loss": 4.2612, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.654048873981792, |
|
"grad_norm": 14.557687759399414, |
|
"learning_rate": 1.2795897054399498e-06, |
|
"loss": 5.2594, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.6660277910876857, |
|
"grad_norm": 20.148624420166016, |
|
"learning_rate": 1.2705085027816008e-06, |
|
"loss": 3.3919, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.6780067081935792, |
|
"grad_norm": 9.0452241897583, |
|
"learning_rate": 1.261403165921032e-06, |
|
"loss": 4.3208, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.6780067081935792, |
|
"eval_loss": 1.0974289178848267, |
|
"eval_runtime": 238.4129, |
|
"eval_samples_per_second": 6.25, |
|
"eval_steps_per_second": 3.125, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.689985625299473, |
|
"grad_norm": 18.7681941986084, |
|
"learning_rate": 1.2522745072173336e-06, |
|
"loss": 2.5784, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.7019645424053667, |
|
"grad_norm": 112.4189224243164, |
|
"learning_rate": 1.243123341110321e-06, |
|
"loss": 4.0173, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.7139434595112601, |
|
"grad_norm": 13.051095008850098, |
|
"learning_rate": 1.2339504840478738e-06, |
|
"loss": 3.1098, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.7259223766171539, |
|
"grad_norm": 16.918392181396484, |
|
"learning_rate": 1.224756754413092e-06, |
|
"loss": 3.1983, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.7379012937230476, |
|
"grad_norm": 167.8545379638672, |
|
"learning_rate": 1.2155429724512838e-06, |
|
"loss": 4.8368, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.749880210828941, |
|
"grad_norm": 19.237834930419922, |
|
"learning_rate": 1.206309960196784e-06, |
|
"loss": 3.1809, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.7618591279348346, |
|
"grad_norm": 95.0063247680664, |
|
"learning_rate": 1.1970585413996132e-06, |
|
"loss": 3.9006, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.7738380450407283, |
|
"grad_norm": 55.70530319213867, |
|
"learning_rate": 1.1877895414519858e-06, |
|
"loss": 3.3394, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.785816962146622, |
|
"grad_norm": 23.173871994018555, |
|
"learning_rate": 1.1785037873146697e-06, |
|
"loss": 2.4079, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.7977958792525155, |
|
"grad_norm": 120.7456283569336, |
|
"learning_rate": 1.1692021074432054e-06, |
|
"loss": 4.2111, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.8097747963584092, |
|
"grad_norm": 89.36892700195312, |
|
"learning_rate": 1.1598853317139958e-06, |
|
"loss": 1.8205, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.821753713464303, |
|
"grad_norm": 66.96819305419922, |
|
"learning_rate": 1.150554291350263e-06, |
|
"loss": 4.6707, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.8337326305701964, |
|
"grad_norm": 52.6048583984375, |
|
"learning_rate": 1.1412098188478914e-06, |
|
"loss": 2.2611, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.8457115476760901, |
|
"grad_norm": 100.39757537841797, |
|
"learning_rate": 1.1318527479011513e-06, |
|
"loss": 3.3554, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.8576904647819839, |
|
"grad_norm": 12.364606857299805, |
|
"learning_rate": 1.1224839133283208e-06, |
|
"loss": 2.7868, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.8696693818878773, |
|
"grad_norm": 93.31403350830078, |
|
"learning_rate": 1.1131041509972032e-06, |
|
"loss": 3.7607, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.8816482989937708, |
|
"grad_norm": 77.30158233642578, |
|
"learning_rate": 1.1037142977505548e-06, |
|
"loss": 3.28, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.8936272160996646, |
|
"grad_norm": 13.82420825958252, |
|
"learning_rate": 1.0943151913314211e-06, |
|
"loss": 3.3544, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.9056061332055583, |
|
"grad_norm": 16.398128509521484, |
|
"learning_rate": 1.084907670308397e-06, |
|
"loss": 2.7871, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.9175850503114518, |
|
"grad_norm": 19.750925064086914, |
|
"learning_rate": 1.0754925740008098e-06, |
|
"loss": 4.1985, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9175850503114518, |
|
"eval_loss": 1.1042989492416382, |
|
"eval_runtime": 238.4482, |
|
"eval_samples_per_second": 6.249, |
|
"eval_steps_per_second": 3.124, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9295639674173455, |
|
"grad_norm": 16.436159133911133, |
|
"learning_rate": 1.066070742403839e-06, |
|
"loss": 3.9566, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.9415428845232392, |
|
"grad_norm": 67.2239761352539, |
|
"learning_rate": 1.056643016113572e-06, |
|
"loss": 4.0604, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.9535218016291327, |
|
"grad_norm": 52.419456481933594, |
|
"learning_rate": 1.047210236252008e-06, |
|
"loss": 4.4566, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.9655007187350264, |
|
"grad_norm": 102.50648498535156, |
|
"learning_rate": 1.0377732443920155e-06, |
|
"loss": 2.5929, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.9774796358409201, |
|
"grad_norm": 85.2761459350586, |
|
"learning_rate": 1.0283328824822498e-06, |
|
"loss": 3.278, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.9894585529468136, |
|
"grad_norm": 111.09942626953125, |
|
"learning_rate": 1.0188899927720324e-06, |
|
"loss": 2.1727, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.0023957834211785, |
|
"grad_norm": 88.2173080444336, |
|
"learning_rate": 1.009445417736213e-06, |
|
"loss": 4.7098, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.0143747005270725, |
|
"grad_norm": 15.37248420715332, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3523, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.026353617632966, |
|
"grad_norm": 428.4352111816406, |
|
"learning_rate": 9.905545822637871e-07, |
|
"loss": 4.4776, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.0383325347388594, |
|
"grad_norm": 16.143062591552734, |
|
"learning_rate": 9.811100072279673e-07, |
|
"loss": 3.2249, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.0503114518447534, |
|
"grad_norm": 142.73899841308594, |
|
"learning_rate": 9.716671175177506e-07, |
|
"loss": 3.6488, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.062290368950647, |
|
"grad_norm": 86.692626953125, |
|
"learning_rate": 9.622267556079844e-07, |
|
"loss": 2.3491, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.0742692860565404, |
|
"grad_norm": 127.98802947998047, |
|
"learning_rate": 9.527897637479921e-07, |
|
"loss": 4.828, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.0862482031624343, |
|
"grad_norm": 14.027091026306152, |
|
"learning_rate": 9.433569838864282e-07, |
|
"loss": 4.176, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.098227120268328, |
|
"grad_norm": 16.68279266357422, |
|
"learning_rate": 9.33929257596161e-07, |
|
"loss": 2.8037, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.1102060373742213, |
|
"grad_norm": 68.17903900146484, |
|
"learning_rate": 9.245074259991904e-07, |
|
"loss": 3.448, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.122184954480115, |
|
"grad_norm": 90.32327270507812, |
|
"learning_rate": 9.150923296916032e-07, |
|
"loss": 3.3154, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.1341638715860087, |
|
"grad_norm": 218.0877227783203, |
|
"learning_rate": 9.056848086685789e-07, |
|
"loss": 3.8818, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.1461427886919022, |
|
"grad_norm": 117.9521484375, |
|
"learning_rate": 8.96285702249445e-07, |
|
"loss": 3.7345, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.1581217057977957, |
|
"grad_norm": 14.993733406066895, |
|
"learning_rate": 8.868958490027966e-07, |
|
"loss": 3.3709, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.1581217057977957, |
|
"eval_loss": 1.1172066926956177, |
|
"eval_runtime": 238.6675, |
|
"eval_samples_per_second": 6.243, |
|
"eval_steps_per_second": 3.121, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.1701006229036897, |
|
"grad_norm": 273.74365234375, |
|
"learning_rate": 8.775160866716791e-07, |
|
"loss": 4.06, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.182079540009583, |
|
"grad_norm": 11.733575820922852, |
|
"learning_rate": 8.681472520988488e-07, |
|
"loss": 2.4028, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.1940584571154766, |
|
"grad_norm": 270.543212890625, |
|
"learning_rate": 8.587901811521087e-07, |
|
"loss": 5.1853, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.2060373742213706, |
|
"grad_norm": 18.38782501220703, |
|
"learning_rate": 8.494457086497368e-07, |
|
"loss": 1.9458, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.218016291327264, |
|
"grad_norm": 127.68826293945312, |
|
"learning_rate": 8.401146682860041e-07, |
|
"loss": 3.2338, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.2299952084331576, |
|
"grad_norm": 186.4503173828125, |
|
"learning_rate": 8.307978925567945e-07, |
|
"loss": 2.5094, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.241974125539051, |
|
"grad_norm": 15.038443565368652, |
|
"learning_rate": 8.214962126853307e-07, |
|
"loss": 2.6388, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.253953042644945, |
|
"grad_norm": 74.26438903808594, |
|
"learning_rate": 8.122104585480143e-07, |
|
"loss": 2.24, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.2659319597508385, |
|
"grad_norm": 57.38971710205078, |
|
"learning_rate": 8.029414586003866e-07, |
|
"loss": 4.3915, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.277910876856732, |
|
"grad_norm": 221.86866760253906, |
|
"learning_rate": 7.93690039803216e-07, |
|
"loss": 3.0979, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.289889793962626, |
|
"grad_norm": 66.43515014648438, |
|
"learning_rate": 7.844570275487159e-07, |
|
"loss": 2.0459, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.3018687110685194, |
|
"grad_norm": 174.6913604736328, |
|
"learning_rate": 7.752432455869081e-07, |
|
"loss": 4.3113, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.313847628174413, |
|
"grad_norm": 12.269379615783691, |
|
"learning_rate": 7.660495159521264e-07, |
|
"loss": 2.6802, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.3258265452803064, |
|
"grad_norm": 14.982316970825195, |
|
"learning_rate": 7.56876658889679e-07, |
|
"loss": 3.2325, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.3378054623862004, |
|
"grad_norm": 75.86939239501953, |
|
"learning_rate": 7.477254927826664e-07, |
|
"loss": 1.0064, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.349784379492094, |
|
"grad_norm": 12.59216594696045, |
|
"learning_rate": 7.38596834078968e-07, |
|
"loss": 2.3464, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.361763296597988, |
|
"grad_norm": 44.80192184448242, |
|
"learning_rate": 7.294914972183992e-07, |
|
"loss": 3.9336, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.3737422137038813, |
|
"grad_norm": 227.7502899169922, |
|
"learning_rate": 7.204102945600502e-07, |
|
"loss": 3.3652, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.3857211308097748, |
|
"grad_norm": 202.1083526611328, |
|
"learning_rate": 7.113540363098072e-07, |
|
"loss": 3.0293, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.3977000479156683, |
|
"grad_norm": 148.12112426757812, |
|
"learning_rate": 7.02323530448069e-07, |
|
"loss": 3.7548, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.3977000479156683, |
|
"eval_loss": 1.1212018728256226, |
|
"eval_runtime": 237.7587, |
|
"eval_samples_per_second": 6.267, |
|
"eval_steps_per_second": 3.133, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.409678965021562, |
|
"grad_norm": 107.33174133300781, |
|
"learning_rate": 6.933195826576603e-07, |
|
"loss": 3.9499, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.4216578821274557, |
|
"grad_norm": 15.232194900512695, |
|
"learning_rate": 6.843429962519504e-07, |
|
"loss": 4.3203, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.433636799233349, |
|
"grad_norm": 14.64511489868164, |
|
"learning_rate": 6.75394572103183e-07, |
|
"loss": 4.5243, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.445615716339243, |
|
"grad_norm": 16.1613712310791, |
|
"learning_rate": 6.664751085710239e-07, |
|
"loss": 3.5644, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.4575946334451366, |
|
"grad_norm": 61.8858642578125, |
|
"learning_rate": 6.575854014313338e-07, |
|
"loss": 3.7972, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.46957355055103, |
|
"grad_norm": 52.300514221191406, |
|
"learning_rate": 6.487262438051701e-07, |
|
"loss": 3.6956, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.4815524676569236, |
|
"grad_norm": 208.3380126953125, |
|
"learning_rate": 6.398984260880266e-07, |
|
"loss": 3.6895, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.4935313847628175, |
|
"grad_norm": 18.773216247558594, |
|
"learning_rate": 6.311027358793166e-07, |
|
"loss": 3.0383, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.505510301868711, |
|
"grad_norm": 19.703941345214844, |
|
"learning_rate": 6.223399579121029e-07, |
|
"loss": 2.5712, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.5174892189746045, |
|
"grad_norm": 63.63347625732422, |
|
"learning_rate": 6.136108739830886e-07, |
|
"loss": 2.2939, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.5294681360804985, |
|
"grad_norm": 23.118276596069336, |
|
"learning_rate": 6.049162628828644e-07, |
|
"loss": 3.329, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.541447053186392, |
|
"grad_norm": 16.502140045166016, |
|
"learning_rate": 5.962569003264276e-07, |
|
"loss": 4.0458, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.5534259702922855, |
|
"grad_norm": 11.94331169128418, |
|
"learning_rate": 5.876335588839746e-07, |
|
"loss": 3.7107, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.565404887398179, |
|
"grad_norm": 20.692808151245117, |
|
"learning_rate": 5.79047007911973e-07, |
|
"loss": 2.3799, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.577383804504073, |
|
"grad_norm": 170.5503692626953, |
|
"learning_rate": 5.704980134845213e-07, |
|
"loss": 2.808, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.5893627216099664, |
|
"grad_norm": 17.668119430541992, |
|
"learning_rate": 5.619873383250029e-07, |
|
"loss": 2.4657, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.6013416387158603, |
|
"grad_norm": 12.804652214050293, |
|
"learning_rate": 5.535157417380346e-07, |
|
"loss": 4.2857, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.613320555821754, |
|
"grad_norm": 231.0024871826172, |
|
"learning_rate": 5.450839795417266e-07, |
|
"loss": 5.443, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.6252994729276473, |
|
"grad_norm": 156.8734893798828, |
|
"learning_rate": 5.366928040002476e-07, |
|
"loss": 4.17, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.637278390033541, |
|
"grad_norm": 49.93342590332031, |
|
"learning_rate": 5.283429637567091e-07, |
|
"loss": 3.2694, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.637278390033541, |
|
"eval_loss": 1.1175537109375, |
|
"eval_runtime": 238.0084, |
|
"eval_samples_per_second": 6.26, |
|
"eval_steps_per_second": 3.13, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.6492573071394347, |
|
"grad_norm": 11.659183502197266, |
|
"learning_rate": 5.200352037663745e-07, |
|
"loss": 1.2186, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.6612362242453282, |
|
"grad_norm": 16.97669219970703, |
|
"learning_rate": 5.117702652301952e-07, |
|
"loss": 3.9984, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.6732151413512217, |
|
"grad_norm": 49.165199279785156, |
|
"learning_rate": 5.035488855286823e-07, |
|
"loss": 2.916, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.6851940584571157, |
|
"grad_norm": 14.636496543884277, |
|
"learning_rate": 4.953717981561186e-07, |
|
"loss": 3.1995, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.697172975563009, |
|
"grad_norm": 13.993462562561035, |
|
"learning_rate": 4.872397326551179e-07, |
|
"loss": 3.1531, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.7091518926689027, |
|
"grad_norm": 14.866923332214355, |
|
"learning_rate": 4.791534145515368e-07, |
|
"loss": 2.7951, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.721130809774796, |
|
"grad_norm": 12.874122619628906, |
|
"learning_rate": 4.711135652897452e-07, |
|
"loss": 4.0197, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.73310972688069, |
|
"grad_norm": 166.74929809570312, |
|
"learning_rate": 4.6312090216826074e-07, |
|
"loss": 4.324, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.7450886439865836, |
|
"grad_norm": 169.29539489746094, |
|
"learning_rate": 4.551761382757513e-07, |
|
"loss": 4.1737, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.757067561092477, |
|
"grad_norm": 12.698741912841797, |
|
"learning_rate": 4.4727998242741627e-07, |
|
"loss": 3.2982, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.769046478198371, |
|
"grad_norm": 97.41565704345703, |
|
"learning_rate": 4.394331391017474e-07, |
|
"loss": 3.0522, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.7810253953042645, |
|
"grad_norm": 13.397387504577637, |
|
"learning_rate": 4.316363083776766e-07, |
|
"loss": 5.0599, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.793004312410158, |
|
"grad_norm": 14.772340774536133, |
|
"learning_rate": 4.2389018587211524e-07, |
|
"loss": 2.43, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.8049832295160515, |
|
"grad_norm": 218.20303344726562, |
|
"learning_rate": 4.1619546267789453e-07, |
|
"loss": 5.5137, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.8169621466219454, |
|
"grad_norm": 312.639892578125, |
|
"learning_rate": 4.0855282530210676e-07, |
|
"loss": 4.4751, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.828941063727839, |
|
"grad_norm": 293.28387451171875, |
|
"learning_rate": 4.0096295560485547e-07, |
|
"loss": 4.1398, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.840919980833733, |
|
"grad_norm": 84.95050811767578, |
|
"learning_rate": 3.934265307384239e-07, |
|
"loss": 3.7418, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.8528988979396264, |
|
"grad_norm": 108.16950988769531, |
|
"learning_rate": 3.8594422308685793e-07, |
|
"loss": 3.392, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.86487781504552, |
|
"grad_norm": 258.593505859375, |
|
"learning_rate": 3.785167002059799e-07, |
|
"loss": 4.247, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.8768567321514134, |
|
"grad_norm": 247.61813354492188, |
|
"learning_rate": 3.7114462476382966e-07, |
|
"loss": 3.9058, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.8768567321514134, |
|
"eval_loss": 1.125927209854126, |
|
"eval_runtime": 237.8939, |
|
"eval_samples_per_second": 6.263, |
|
"eval_steps_per_second": 3.132, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.8888356492573073, |
|
"grad_norm": 82.83909606933594, |
|
"learning_rate": 3.6382865448154187e-07, |
|
"loss": 3.9744, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.900814566363201, |
|
"grad_norm": 289.5834655761719, |
|
"learning_rate": 3.5656944207466633e-07, |
|
"loss": 4.8423, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.9127934834690943, |
|
"grad_norm": 11.12870979309082, |
|
"learning_rate": 3.4936763519493495e-07, |
|
"loss": 1.8868, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.924772400574988, |
|
"grad_norm": 259.54205322265625, |
|
"learning_rate": 3.4222387637247806e-07, |
|
"loss": 5.073, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.9367513176808817, |
|
"grad_norm": 260.9941711425781, |
|
"learning_rate": 3.351388029585007e-07, |
|
"loss": 3.9769, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.948730234786775, |
|
"grad_norm": 138.75767517089844, |
|
"learning_rate": 3.281130470684166e-07, |
|
"loss": 1.8905, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.9607091518926687, |
|
"grad_norm": 212.35589599609375, |
|
"learning_rate": 3.2114723552545606e-07, |
|
"loss": 5.6336, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.9726880689985626, |
|
"grad_norm": 77.375, |
|
"learning_rate": 3.142419898047399e-07, |
|
"loss": 2.5325, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.984666986104456, |
|
"grad_norm": 301.5721740722656, |
|
"learning_rate": 3.073979259778332e-07, |
|
"loss": 2.8451, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.9966459032103496, |
|
"grad_norm": 23.281728744506836, |
|
"learning_rate": 3.006156546577796e-07, |
|
"loss": 4.271, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.009583133684715, |
|
"grad_norm": 60.59037399291992, |
|
"learning_rate": 2.9389578094462607e-07, |
|
"loss": 4.768, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.0215620507906085, |
|
"grad_norm": 12.599560737609863, |
|
"learning_rate": 2.872389043714343e-07, |
|
"loss": 3.4648, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.033540967896502, |
|
"grad_norm": 60.7338981628418, |
|
"learning_rate": 2.806456188507943e-07, |
|
"loss": 4.1664, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.045519885002396, |
|
"grad_norm": 124.4323959350586, |
|
"learning_rate": 2.7411651262183465e-07, |
|
"loss": 3.2584, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.0574988021082894, |
|
"grad_norm": 10.696157455444336, |
|
"learning_rate": 2.676521681977425e-07, |
|
"loss": 1.6209, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.069477719214183, |
|
"grad_norm": 78.30728149414062, |
|
"learning_rate": 2.612531623137922e-07, |
|
"loss": 2.9967, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.081456636320077, |
|
"grad_norm": 12.110499382019043, |
|
"learning_rate": 2.5492006587589033e-07, |
|
"loss": 1.8501, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.0934355534259703, |
|
"grad_norm": 142.67459106445312, |
|
"learning_rate": 2.4865344390964016e-07, |
|
"loss": 6.7426, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.105414470531864, |
|
"grad_norm": 84.46195220947266, |
|
"learning_rate": 2.424538555099326e-07, |
|
"loss": 3.011, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.1173933876377578, |
|
"grad_norm": 287.4181213378906, |
|
"learning_rate": 2.3632185379106383e-07, |
|
"loss": 4.2906, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.1173933876377578, |
|
"eval_loss": 1.1396493911743164, |
|
"eval_runtime": 240.643, |
|
"eval_samples_per_second": 6.192, |
|
"eval_steps_per_second": 3.096, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.1293723047436512, |
|
"grad_norm": 17.018083572387695, |
|
"learning_rate": 2.302579858373881e-07, |
|
"loss": 2.1286, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.1413512218495447, |
|
"grad_norm": 13.265679359436035, |
|
"learning_rate": 2.2426279265450708e-07, |
|
"loss": 2.9965, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.1533301389554382, |
|
"grad_norm": 156.8545684814453, |
|
"learning_rate": 2.183368091210037e-07, |
|
"loss": 3.7899, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.165309056061332, |
|
"grad_norm": 17.818618774414062, |
|
"learning_rate": 2.1248056394072078e-07, |
|
"loss": 4.1165, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.1772879731672257, |
|
"grad_norm": 15.270909309387207, |
|
"learning_rate": 2.0669457959559177e-07, |
|
"loss": 3.1192, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.189266890273119, |
|
"grad_norm": 78.96019744873047, |
|
"learning_rate": 2.0097937229902485e-07, |
|
"loss": 5.3403, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.201245807379013, |
|
"grad_norm": 376.2461242675781, |
|
"learning_rate": 1.9533545194984791e-07, |
|
"loss": 3.9551, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.2132247244849066, |
|
"grad_norm": 16.849010467529297, |
|
"learning_rate": 1.8976332208681744e-07, |
|
"loss": 5.6715, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.2252036415908, |
|
"grad_norm": 171.6584930419922, |
|
"learning_rate": 1.8426347984369273e-07, |
|
"loss": 5.2323, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.237182558696694, |
|
"grad_norm": 120.8387680053711, |
|
"learning_rate": 1.788364159048833e-07, |
|
"loss": 4.2853, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.2491614758025875, |
|
"grad_norm": 13.419800758361816, |
|
"learning_rate": 1.734826144616698e-07, |
|
"loss": 2.9811, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 3.261140392908481, |
|
"grad_norm": 196.2430877685547, |
|
"learning_rate": 1.6820255316900756e-07, |
|
"loss": 4.3565, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 3.2731193100143745, |
|
"grad_norm": 15.63242244720459, |
|
"learning_rate": 1.6299670310290915e-07, |
|
"loss": 2.4933, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.2850982271202684, |
|
"grad_norm": 60.21677017211914, |
|
"learning_rate": 1.5786552871841774e-07, |
|
"loss": 2.6513, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 3.297077144226162, |
|
"grad_norm": 223.22097778320312, |
|
"learning_rate": 1.528094878081677e-07, |
|
"loss": 3.3477, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.3090560613320554, |
|
"grad_norm": 95.59901428222656, |
|
"learning_rate": 1.478290314615427e-07, |
|
"loss": 3.0678, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 3.3210349784379494, |
|
"grad_norm": 36.64326095581055, |
|
"learning_rate": 1.4292460402442995e-07, |
|
"loss": 3.4483, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 3.333013895543843, |
|
"grad_norm": 233.20370483398438, |
|
"learning_rate": 1.3809664305957625e-07, |
|
"loss": 2.9447, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 3.3449928126497364, |
|
"grad_norm": 14.824277877807617, |
|
"learning_rate": 1.3334557930754963e-07, |
|
"loss": 2.547, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.35697172975563, |
|
"grad_norm": 208.10154724121094, |
|
"learning_rate": 1.2867183664831038e-07, |
|
"loss": 6.0572, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.35697172975563, |
|
"eval_loss": 1.147083044052124, |
|
"eval_runtime": 239.357, |
|
"eval_samples_per_second": 6.225, |
|
"eval_steps_per_second": 3.113, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.368950646861524, |
|
"grad_norm": 167.9786834716797, |
|
"learning_rate": 1.2407583206339256e-07, |
|
"loss": 2.6004, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.3809295639674173, |
|
"grad_norm": 236.07484436035156, |
|
"learning_rate": 1.195579755987024e-07, |
|
"loss": 2.3534, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 3.3929084810733112, |
|
"grad_norm": 38.18111801147461, |
|
"learning_rate": 1.1511867032793321e-07, |
|
"loss": 2.5498, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 3.4048873981792047, |
|
"grad_norm": 136.2451934814453, |
|
"learning_rate": 1.107583123166066e-07, |
|
"loss": 5.3208, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 3.416866315285098, |
|
"grad_norm": 316.2095947265625, |
|
"learning_rate": 1.0647729058673427e-07, |
|
"loss": 4.3772, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.4288452323909917, |
|
"grad_norm": 14.228015899658203, |
|
"learning_rate": 1.0227598708211172e-07, |
|
"loss": 2.2948, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 3.4408241494968856, |
|
"grad_norm": 52.19302749633789, |
|
"learning_rate": 9.81547766342401e-08, |
|
"loss": 2.223, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 3.452803066602779, |
|
"grad_norm": 18.04366683959961, |
|
"learning_rate": 9.411402692888715e-08, |
|
"loss": 3.6671, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 3.4647819837086726, |
|
"grad_norm": 12.00094985961914, |
|
"learning_rate": 9.015409847328037e-08, |
|
"loss": 2.3488, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 3.4767609008145666, |
|
"grad_norm": 88.41423797607422, |
|
"learning_rate": 8.62753445639457e-08, |
|
"loss": 3.2758, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.48873981792046, |
|
"grad_norm": 56.944828033447266, |
|
"learning_rate": 8.247811125518489e-08, |
|
"loss": 1.576, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 3.5007187350263536, |
|
"grad_norm": 323.6489562988281, |
|
"learning_rate": 7.876273732820327e-08, |
|
"loss": 4.9528, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 3.512697652132247, |
|
"grad_norm": 10.932809829711914, |
|
"learning_rate": 7.51295542608834e-08, |
|
"loss": 3.6918, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 3.524676569238141, |
|
"grad_norm": 12.945392608642578, |
|
"learning_rate": 7.157888619821106e-08, |
|
"loss": 1.7161, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 3.5366554863440345, |
|
"grad_norm": 358.42498779296875, |
|
"learning_rate": 6.811104992335648e-08, |
|
"loss": 4.4565, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.548634403449928, |
|
"grad_norm": 231.78411865234375, |
|
"learning_rate": 6.47263548294108e-08, |
|
"loss": 3.5586, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 3.560613320555822, |
|
"grad_norm": 231.84341430664062, |
|
"learning_rate": 6.142510289178337e-08, |
|
"loss": 3.4724, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 3.5725922376617154, |
|
"grad_norm": 198.78810119628906, |
|
"learning_rate": 5.8207588641260185e-08, |
|
"loss": 2.5415, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 3.584571154767609, |
|
"grad_norm": 76.49454498291016, |
|
"learning_rate": 5.507409913772543e-08, |
|
"loss": 3.7494, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 3.5965500718735024, |
|
"grad_norm": 262.359130859375, |
|
"learning_rate": 5.202491394455155e-08, |
|
"loss": 4.0544, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.5965500718735024, |
|
"eval_loss": 1.1526151895523071, |
|
"eval_runtime": 238.4822, |
|
"eval_samples_per_second": 6.248, |
|
"eval_steps_per_second": 3.124, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.6085289889793963, |
|
"grad_norm": 14.773038864135742, |
|
"learning_rate": 4.9060305103657e-08, |
|
"loss": 3.8126, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 3.62050790608529, |
|
"grad_norm": 288.3175354003906, |
|
"learning_rate": 4.61805371112356e-08, |
|
"loss": 2.2371, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 3.6324868231911838, |
|
"grad_norm": 15.71839427947998, |
|
"learning_rate": 4.3661497350331423e-08, |
|
"loss": 2.8255, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 3.6444657402970773, |
|
"grad_norm": 47.34138870239258, |
|
"learning_rate": 4.094362852900846e-08, |
|
"loss": 2.4564, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 3.6564446574029708, |
|
"grad_norm": 133.03443908691406, |
|
"learning_rate": 3.8311324709047524e-08, |
|
"loss": 3.9076, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.6684235745088642, |
|
"grad_norm": 85.79124450683594, |
|
"learning_rate": 3.57648207390836e-08, |
|
"loss": 1.6264, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 3.680402491614758, |
|
"grad_norm": 11.73284912109375, |
|
"learning_rate": 3.3304343812869175e-08, |
|
"loss": 2.2377, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 3.6923814087206517, |
|
"grad_norm": 103.52379608154297, |
|
"learning_rate": 3.0930113449003536e-08, |
|
"loss": 2.5226, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 3.704360325826545, |
|
"grad_norm": 121.46673583984375, |
|
"learning_rate": 2.8642341471348585e-08, |
|
"loss": 5.8129, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 3.716339242932439, |
|
"grad_norm": 39.363887786865234, |
|
"learning_rate": 2.644123199013004e-08, |
|
"loss": 1.8684, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.7283181600383326, |
|
"grad_norm": 22.17119598388672, |
|
"learning_rate": 2.432698138372713e-08, |
|
"loss": 4.5753, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 3.740297077144226, |
|
"grad_norm": 123.43873596191406, |
|
"learning_rate": 2.2299778281151927e-08, |
|
"loss": 3.4706, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 3.7522759942501196, |
|
"grad_norm": 62.01347351074219, |
|
"learning_rate": 2.03598035452206e-08, |
|
"loss": 1.0962, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 3.7642549113560135, |
|
"grad_norm": 106.05194091796875, |
|
"learning_rate": 1.8507230256417316e-08, |
|
"loss": 4.0847, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 3.776233828461907, |
|
"grad_norm": 14.505044937133789, |
|
"learning_rate": 1.674222369745182e-08, |
|
"loss": 2.7005, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 3.7882127455678005, |
|
"grad_norm": 74.14771270751953, |
|
"learning_rate": 1.5064941338513548e-08, |
|
"loss": 4.5833, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 3.8001916626736945, |
|
"grad_norm": 109.61576080322266, |
|
"learning_rate": 1.3475532823222779e-08, |
|
"loss": 3.3511, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 3.812170579779588, |
|
"grad_norm": 142.658447265625, |
|
"learning_rate": 1.1974139955279294e-08, |
|
"loss": 4.0569, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 3.8241494968854814, |
|
"grad_norm": 374.00701904296875, |
|
"learning_rate": 1.0560896685811061e-08, |
|
"loss": 3.657, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 3.836128413991375, |
|
"grad_norm": 191.76698303222656, |
|
"learning_rate": 9.235929101423457e-09, |
|
"loss": 3.2204, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.836128413991375, |
|
"eval_loss": 1.1512279510498047, |
|
"eval_runtime": 239.5597, |
|
"eval_samples_per_second": 6.22, |
|
"eval_steps_per_second": 3.11, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.848107331097269, |
|
"grad_norm": 89.6385726928711, |
|
"learning_rate": 7.99935541295016e-09, |
|
"loss": 4.2172, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 3.8600862482031624, |
|
"grad_norm": 52.579227447509766, |
|
"learning_rate": 6.8512859449064705e-09, |
|
"loss": 3.1792, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 3.8720651653090563, |
|
"grad_norm": 159.96087646484375, |
|
"learning_rate": 5.791823125646522e-09, |
|
"loss": 4.6078, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 3.88404408241495, |
|
"grad_norm": 248.06581115722656, |
|
"learning_rate": 4.8210614782245866e-09, |
|
"loss": 4.6227, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 3.8960229995208433, |
|
"grad_norm": 10.945176124572754, |
|
"learning_rate": 3.939087611962377e-09, |
|
"loss": 1.8884, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 3.908001916626737, |
|
"grad_norm": 279.2950439453125, |
|
"learning_rate": 3.1459802147214554e-09, |
|
"loss": 3.5358, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 3.9199808337326307, |
|
"grad_norm": 278.5760192871094, |
|
"learning_rate": 2.441810045883175e-09, |
|
"loss": 4.0377, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 3.9319597508385242, |
|
"grad_norm": 20.70584487915039, |
|
"learning_rate": 1.8266399300355118e-09, |
|
"loss": 4.9587, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 3.9439386679444177, |
|
"grad_norm": 238.9534912109375, |
|
"learning_rate": 1.300524751368326e-09, |
|
"loss": 3.8602, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 3.9559175850503117, |
|
"grad_norm": 16.01015281677246, |
|
"learning_rate": 8.635114487760553e-10, |
|
"loss": 1.4575, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.967896502156205, |
|
"grad_norm": 14.110264778137207, |
|
"learning_rate": 5.156390116707321e-10, |
|
"loss": 1.1673, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 3.9798754192620986, |
|
"grad_norm": 43.93592834472656, |
|
"learning_rate": 2.56938476502655e-10, |
|
"loss": 3.8769, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 3.991854336367992, |
|
"grad_norm": 9.19588851928711, |
|
"learning_rate": 8.743292399204793e-11, |
|
"loss": 2.4475, |
|
"step": 3330 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3336, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.982695446856335e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|