tttx
/

ttt-problem10-32b-021025-sl25000

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 90,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.022222222222222223,
+      "grad_norm": 0.04145513522429468,
+      "learning_rate": 5.555555555555556e-06,
+      "loss": 0.163,
+      "step": 1
+    },
+    {
+      "epoch": 0.044444444444444446,
+      "grad_norm": 0.03806177997938909,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.1836,
+      "step": 2
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.042251235266237294,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.1856,
+      "step": 3
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.03855616390033849,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.1723,
+      "step": 4
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 0.03746547163614947,
+      "learning_rate": 2.777777777777778e-05,
+      "loss": 0.1807,
+      "step": 5
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.04899948662680787,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.1749,
+      "step": 6
+    },
+    {
+      "epoch": 0.15555555555555556,
+      "grad_norm": 0.043322070961254594,
+      "learning_rate": 3.888888888888889e-05,
+      "loss": 0.1793,
+      "step": 7
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.05063967435578541,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.1584,
+      "step": 8
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.04466829068234921,
+      "learning_rate": 5e-05,
+      "loss": 0.15,
+      "step": 9
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 0.040258968404847786,
+      "learning_rate": 4.998119881260576e-05,
+      "loss": 0.1535,
+      "step": 10
+    },
+    {
+      "epoch": 0.24444444444444444,
+      "grad_norm": 0.02894419013448755,
+      "learning_rate": 4.99248235291948e-05,
+      "loss": 0.1449,
+      "step": 11
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.032138571629973876,
+      "learning_rate": 4.983095894354858e-05,
+      "loss": 0.1354,
+      "step": 12
+    },
+    {
+      "epoch": 0.28888888888888886,
+      "grad_norm": 0.04078602334001387,
+      "learning_rate": 4.969974623692023e-05,
+      "loss": 0.1541,
+      "step": 13
+    },
+    {
+      "epoch": 0.3111111111111111,
+      "grad_norm": 0.05032291478536776,
+      "learning_rate": 4.953138276568462e-05,
+      "loss": 0.1392,
+      "step": 14
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.050090620555320695,
+      "learning_rate": 4.9326121764495596e-05,
+      "loss": 0.1444,
+      "step": 15
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.035480633684579106,
+      "learning_rate": 4.9084271965397014e-05,
+      "loss": 0.1304,
+      "step": 16
+    },
+    {
+      "epoch": 0.37777777777777777,
+      "grad_norm": 0.032726655928985764,
+      "learning_rate": 4.880619713346039e-05,
+      "loss": 0.1311,
+      "step": 17
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.029131540724124428,
+      "learning_rate": 4.849231551964771e-05,
+      "loss": 0.1223,
+      "step": 18
+    },
+    {
+      "epoch": 0.4222222222222222,
+      "grad_norm": 0.02081489783305179,
+      "learning_rate": 4.814309923172227e-05,
+      "loss": 0.118,
+      "step": 19
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.0210654978446037,
+      "learning_rate": 4.775907352415367e-05,
+      "loss": 0.1149,
+      "step": 20
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.02408549187311914,
+      "learning_rate": 4.734081600808531e-05,
+      "loss": 0.1091,
+      "step": 21
+    },
+    {
+      "epoch": 0.4888888888888889,
+      "grad_norm": 0.02376905383589948,
+      "learning_rate": 4.6888955782552274e-05,
+      "loss": 0.1028,
+      "step": 22
+    },
+    {
+      "epoch": 0.5111111111111111,
+      "grad_norm": 0.02173656728014757,
+      "learning_rate": 4.640417248825667e-05,
+      "loss": 0.1119,
+      "step": 23
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.02438990044693573,
+      "learning_rate": 4.588719528532342e-05,
+      "loss": 0.0992,
+      "step": 24
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 0.022324375822038472,
+      "learning_rate": 4.533880175657419e-05,
+      "loss": 0.0978,
+      "step": 25
+    },
+    {
+      "epoch": 0.5777777777777777,
+      "grad_norm": 0.021524779142548363,
+      "learning_rate": 4.475981673796899e-05,
+      "loss": 0.0941,
+      "step": 26
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.019110289084753145,
+      "learning_rate": 4.415111107797445e-05,
+      "loss": 0.0987,
+      "step": 27
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.023272052760941467,
+      "learning_rate": 4.351360032772512e-05,
+      "loss": 0.0816,
+      "step": 28
+    },
+    {
+      "epoch": 0.6444444444444445,
+      "grad_norm": 0.018818332739407733,
+      "learning_rate": 4.2848243363947484e-05,
+      "loss": 0.0877,
+      "step": 29
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.022738471553081317,
+      "learning_rate": 4.215604094671835e-05,
+      "loss": 0.0729,
+      "step": 30
+    },
+    {
+      "epoch": 0.6888888888888889,
+      "grad_norm": 0.016829351187921095,
+      "learning_rate": 4.14380342142266e-05,
+      "loss": 0.0793,
+      "step": 31
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.021682647661793453,
+      "learning_rate": 4.069530311680247e-05,
+      "loss": 0.0926,
+      "step": 32
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.020906839786747528,
+      "learning_rate": 3.9928964792569655e-05,
+      "loss": 0.0723,
+      "step": 33
+    },
+    {
+      "epoch": 0.7555555555555555,
+      "grad_norm": 0.024600157953580535,
+      "learning_rate": 3.914017188716347e-05,
+      "loss": 0.0814,
+      "step": 34
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 0.022668027885789463,
+      "learning_rate": 3.8330110820042285e-05,
+      "loss": 0.0866,
+      "step": 35
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.022565588311726368,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 0.086,
+      "step": 36
+    },
+    {
+      "epoch": 0.8222222222222222,
+      "grad_norm": 0.01637375633073296,
+      "learning_rate": 3.665108799256348e-05,
+      "loss": 0.0745,
+      "step": 37
+    },
+    {
+      "epoch": 0.8444444444444444,
+      "grad_norm": 0.016288962930045323,
+      "learning_rate": 3.578465164203134e-05,
+      "loss": 0.083,
+      "step": 38
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.014072318472000743,
+      "learning_rate": 3.490199415097892e-05,
+      "loss": 0.0741,
+      "step": 39
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.019458751327784993,
+      "learning_rate": 3.400444312011776e-05,
+      "loss": 0.071,
+      "step": 40
+    },
+    {
+      "epoch": 0.9111111111111111,
+      "grad_norm": 0.016883319096903937,
+      "learning_rate": 3.309334855145803e-05,
+      "loss": 0.0751,
+      "step": 41
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.01656670372950449,
+      "learning_rate": 3.217008081777726e-05,
+      "loss": 0.0736,
+      "step": 42
+    },
+    {
+      "epoch": 0.9555555555555556,
+      "grad_norm": 0.01611572370876753,
+      "learning_rate": 3.1236028601449534e-05,
+      "loss": 0.0744,
+      "step": 43
+    },
+    {
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.025130810955317162,
+      "learning_rate": 3.0292596805735274e-05,
+      "loss": 0.056,
+      "step": 44
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.018164615800081683,
+      "learning_rate": 2.9341204441673266e-05,
+      "loss": 0.0657,
+      "step": 45
+    },
+    {
+      "epoch": 1.0222222222222221,
+      "grad_norm": 0.019962124083379607,
+      "learning_rate": 2.8383282493753283e-05,
+      "loss": 0.0567,
+      "step": 46
+    },
+    {
+      "epoch": 1.0444444444444445,
+      "grad_norm": 0.016694411321077254,
+      "learning_rate": 2.742027176757948e-05,
+      "loss": 0.0656,
+      "step": 47
+    },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.0159764946766061,
+      "learning_rate": 2.6453620722761896e-05,
+      "loss": 0.0617,
+      "step": 48
+    },
+    {
+      "epoch": 1.0888888888888888,
+      "grad_norm": 0.02018135193726075,
+      "learning_rate": 2.548478329429561e-05,
+      "loss": 0.0618,
+      "step": 49
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 0.01398667566746435,
+      "learning_rate": 2.4515216705704395e-05,
+      "loss": 0.0659,
+      "step": 50
+    },
+    {
+      "epoch": 1.1333333333333333,
+      "grad_norm": 0.015420358393515337,
+      "learning_rate": 2.3546379277238107e-05,
+      "loss": 0.0688,
+      "step": 51
+    },
+    {
+      "epoch": 1.1555555555555554,
+      "grad_norm": 0.018423384035212806,
+      "learning_rate": 2.2579728232420525e-05,
+      "loss": 0.0683,
+      "step": 52
+    },
+    {
+      "epoch": 1.1777777777777778,
+      "grad_norm": 0.014000938359142262,
+      "learning_rate": 2.161671750624673e-05,
+      "loss": 0.0575,
+      "step": 53
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.013842003203664193,
+      "learning_rate": 2.0658795558326743e-05,
+      "loss": 0.0617,
+      "step": 54
+    },
+    {
+      "epoch": 1.2222222222222223,
+      "grad_norm": 0.02311016112104676,
+      "learning_rate": 1.970740319426474e-05,
+      "loss": 0.0711,
+      "step": 55
+    },
+    {
+      "epoch": 1.2444444444444445,
+      "grad_norm": 0.01562457950516873,
+      "learning_rate": 1.876397139855047e-05,
+      "loss": 0.0615,
+      "step": 56
+    },
+    {
+      "epoch": 1.2666666666666666,
+      "grad_norm": 0.02097714519013948,
+      "learning_rate": 1.7829919182222752e-05,
+      "loss": 0.0813,
+      "step": 57
+    },
+    {
+      "epoch": 1.2888888888888888,
+      "grad_norm": 0.013954092588025305,
+      "learning_rate": 1.690665144854198e-05,
+      "loss": 0.0655,
+      "step": 58
+    },
+    {
+      "epoch": 1.3111111111111111,
+      "grad_norm": 0.012050592569823772,
+      "learning_rate": 1.5995556879882246e-05,
+      "loss": 0.0687,
+      "step": 59
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.012934161041963854,
+      "learning_rate": 1.509800584902108e-05,
+      "loss": 0.0604,
+      "step": 60
+    },
+    {
+      "epoch": 1.3555555555555556,
+      "grad_norm": 0.01335553684986876,
+      "learning_rate": 1.4215348357968669e-05,
+      "loss": 0.0697,
+      "step": 61
+    },
+    {
+      "epoch": 1.3777777777777778,
+      "grad_norm": 0.0128301661906966,
+      "learning_rate": 1.3348912007436537e-05,
+      "loss": 0.0661,
+      "step": 62
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.013663032199972917,
+      "learning_rate": 1.2500000000000006e-05,
+      "loss": 0.06,
+      "step": 63
+    },
+    {
+      "epoch": 1.4222222222222223,
+      "grad_norm": 0.012134554930746227,
+      "learning_rate": 1.1669889179957725e-05,
+      "loss": 0.0582,
+      "step": 64
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.014696339652119621,
+      "learning_rate": 1.085982811283654e-05,
+      "loss": 0.0536,
+      "step": 65
+    },
+    {
+      "epoch": 1.4666666666666668,
+      "grad_norm": 0.01208908491764342,
+      "learning_rate": 1.0071035207430352e-05,
+      "loss": 0.0622,
+      "step": 66
+    },
+    {
+      "epoch": 1.488888888888889,
+      "grad_norm": 0.013155964113476967,
+      "learning_rate": 9.304696883197542e-06,
+      "loss": 0.053,
+      "step": 67
+    },
+    {
+      "epoch": 1.511111111111111,
+      "grad_norm": 0.010941706376727206,
+      "learning_rate": 8.561965785773413e-06,
+      "loss": 0.0611,
+      "step": 68
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.011333878019293763,
+      "learning_rate": 7.843959053281663e-06,
+      "loss": 0.0708,
+      "step": 69
+    },
+    {
+      "epoch": 1.5555555555555556,
+      "grad_norm": 0.012008866964762043,
+      "learning_rate": 7.1517566360525284e-06,
+      "loss": 0.0623,
+      "step": 70
+    },
+    {
+      "epoch": 1.5777777777777777,
+      "grad_norm": 0.010450627576345212,
+      "learning_rate": 6.48639967227489e-06,
+      "loss": 0.0527,
+      "step": 71
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.01209040932177887,
+      "learning_rate": 5.848888922025553e-06,
+      "loss": 0.0731,
+      "step": 72
+    },
+    {
+      "epoch": 1.6222222222222222,
+      "grad_norm": 0.016761186864218524,
+      "learning_rate": 5.240183262031021e-06,
+      "loss": 0.061,
+      "step": 73
+    },
+    {
+      "epoch": 1.6444444444444444,
+      "grad_norm": 0.013223702181080193,
+      "learning_rate": 4.661198243425813e-06,
+      "loss": 0.0536,
+      "step": 74
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.013217534982198317,
+      "learning_rate": 4.112804714676594e-06,
+      "loss": 0.0804,
+      "step": 75
+    },
+    {
+      "epoch": 1.6888888888888889,
+      "grad_norm": 0.010298693910537537,
+      "learning_rate": 3.595827511743341e-06,
+      "loss": 0.0611,
+      "step": 76
+    },
+    {
+      "epoch": 1.7111111111111112,
+      "grad_norm": 0.012020933219454272,
+      "learning_rate": 3.111044217447731e-06,
+      "loss": 0.0655,
+      "step": 77
+    },
+    {
+      "epoch": 1.7333333333333334,
+      "grad_norm": 0.010506006734785647,
+      "learning_rate": 2.659183991914696e-06,
+      "loss": 0.0637,
+      "step": 78
+    },
+    {
+      "epoch": 1.7555555555555555,
+      "grad_norm": 0.013713942520363884,
+      "learning_rate": 2.2409264758463363e-06,
+      "loss": 0.0692,
+      "step": 79
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 0.010764341565662566,
+      "learning_rate": 1.8569007682777417e-06,
+      "loss": 0.0488,
+      "step": 80
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.01112279858087453,
+      "learning_rate": 1.5076844803522922e-06,
+      "loss": 0.053,
+      "step": 81
+    },
+    {
+      "epoch": 1.8222222222222222,
+      "grad_norm": 0.011362400077819504,
+      "learning_rate": 1.1938028665396173e-06,
+      "loss": 0.0598,
+      "step": 82
+    },
+    {
+      "epoch": 1.8444444444444446,
+      "grad_norm": 0.010591446954163518,
+      "learning_rate": 9.157280346029918e-07,
+      "loss": 0.0621,
+      "step": 83
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.011204877929597866,
+      "learning_rate": 6.738782355044049e-07,
+      "loss": 0.0584,
+      "step": 84
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.01257113405363327,
+      "learning_rate": 4.6861723431538276e-07,
+      "loss": 0.063,
+      "step": 85
+    },
+    {
+      "epoch": 1.911111111111111,
+      "grad_norm": 0.01179877033535891,
+      "learning_rate": 3.002537630797747e-07,
+      "loss": 0.0561,
+      "step": 86
+    },
+    {
+      "epoch": 1.9333333333333333,
+      "grad_norm": 0.011129626210418207,
+      "learning_rate": 1.6904105645142444e-07,
+      "loss": 0.0608,
+      "step": 87
+    },
+    {
+      "epoch": 1.9555555555555557,
+      "grad_norm": 0.017223256380832427,
+      "learning_rate": 7.51764708051994e-08,
+      "loss": 0.0628,
+      "step": 88
+    },
+    {
+      "epoch": 1.9777777777777779,
+      "grad_norm": 0.013357392038310047,
+      "learning_rate": 1.8801187394248965e-08,
+      "loss": 0.056,
+      "step": 89
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.014111952540712841,
+      "learning_rate": 0.0,
+      "loss": 0.0581,
+      "step": 90
+    },
+    {
+      "epoch": 2.0,
+      "step": 90,
+      "total_flos": 1704342039035904.0,
+      "train_loss": 0.08834147482282585,
+      "train_runtime": 2715.1649,
+      "train_samples_per_second": 0.527,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 90,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1704342039035904.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}