mlfoundations-dev
/

seed_math_multiple_samples_scale_up_scaredy_cat_test

@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: Qwen/Qwen2.5-7B-Instruct
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: seed_math_multiple_samples_scale_up_scaredy_cat_test
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # seed_math_multiple_samples_scale_up_scaredy_cat_test
-This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
 ## Model description

 base_model: Qwen/Qwen2.5-7B-Instruct
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: seed_math_multiple_samples_scale_up_scaredy_cat_test
 # seed_math_multiple_samples_scale_up_scaredy_cat_test
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/seed_math_multiple_samples_scale_up_scaredy_cat_test dataset.
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.986666666666667,
+    "total_flos": 139896582144000.0,
+    "train_loss": 0.7518468662386849,
+    "train_runtime": 7945.17,
+    "train_samples_per_second": 2.039,
+    "train_steps_per_second": 0.021
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.986666666666667,
+    "total_flos": 139896582144000.0,
+    "train_loss": 0.7518468662386849,
+    "train_runtime": 7945.17,
+    "train_samples_per_second": 2.039,
+    "train_steps_per_second": 0.021
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1218 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.986666666666667,
+  "eval_steps": 500,
+  "global_step": 168,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.017777777777777778,
+      "grad_norm": 6.802438735961914,
+      "learning_rate": 5.882352941176471e-07,
+      "loss": 1.1138,
+      "step": 1
+    },
+    {
+      "epoch": 0.035555555555555556,
+      "grad_norm": 6.787415981292725,
+      "learning_rate": 1.1764705882352942e-06,
+      "loss": 1.0806,
+      "step": 2
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 6.674739360809326,
+      "learning_rate": 1.7647058823529414e-06,
+      "loss": 1.0469,
+      "step": 3
+    },
+    {
+      "epoch": 0.07111111111111111,
+      "grad_norm": 6.719709873199463,
+      "learning_rate": 2.3529411764705885e-06,
+      "loss": 1.0525,
+      "step": 4
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 6.4609293937683105,
+      "learning_rate": 2.9411764705882355e-06,
+      "loss": 1.1458,
+      "step": 5
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 5.147739887237549,
+      "learning_rate": 3.529411764705883e-06,
+      "loss": 1.0836,
+      "step": 6
+    },
+    {
+      "epoch": 0.12444444444444444,
+      "grad_norm": 3.1252267360687256,
+      "learning_rate": 4.11764705882353e-06,
+      "loss": 0.9522,
+      "step": 7
+    },
+    {
+      "epoch": 0.14222222222222222,
+      "grad_norm": 2.836021661758423,
+      "learning_rate": 4.705882352941177e-06,
+      "loss": 0.9805,
+      "step": 8
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 4.235110759735107,
+      "learning_rate": 5.294117647058824e-06,
+      "loss": 0.9539,
+      "step": 9
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 4.741976261138916,
+      "learning_rate": 5.882352941176471e-06,
+      "loss": 0.9584,
+      "step": 10
+    },
+    {
+      "epoch": 0.19555555555555557,
+      "grad_norm": 4.572456359863281,
+      "learning_rate": 6.470588235294119e-06,
+      "loss": 0.9436,
+      "step": 11
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 3.5882203578948975,
+      "learning_rate": 7.058823529411766e-06,
+      "loss": 0.9193,
+      "step": 12
+    },
+    {
+      "epoch": 0.2311111111111111,
+      "grad_norm": 3.677826404571533,
+      "learning_rate": 7.647058823529411e-06,
+      "loss": 0.9732,
+      "step": 13
+    },
+    {
+      "epoch": 0.24888888888888888,
+      "grad_norm": 2.6755166053771973,
+      "learning_rate": 8.23529411764706e-06,
+      "loss": 0.8779,
+      "step": 14
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 2.1610584259033203,
+      "learning_rate": 8.823529411764707e-06,
+      "loss": 0.8581,
+      "step": 15
+    },
+    {
+      "epoch": 0.28444444444444444,
+      "grad_norm": 1.9563807249069214,
+      "learning_rate": 9.411764705882354e-06,
+      "loss": 0.8518,
+      "step": 16
+    },
+    {
+      "epoch": 0.3022222222222222,
+      "grad_norm": 2.030823230743408,
+      "learning_rate": 1e-05,
+      "loss": 0.7811,
+      "step": 17
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.7580914497375488,
+      "learning_rate": 9.998917893031615e-06,
+      "loss": 0.8713,
+      "step": 18
+    },
+    {
+      "epoch": 0.3377777777777778,
+      "grad_norm": 1.7509064674377441,
+      "learning_rate": 9.995672040508656e-06,
+      "loss": 0.85,
+      "step": 19
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 1.547536015510559,
+      "learning_rate": 9.990263847374976e-06,
+      "loss": 0.8085,
+      "step": 20
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 1.1073737144470215,
+      "learning_rate": 9.982695654527966e-06,
+      "loss": 0.7902,
+      "step": 21
+    },
+    {
+      "epoch": 0.39111111111111113,
+      "grad_norm": 0.9871188998222351,
+      "learning_rate": 9.972970737805312e-06,
+      "loss": 0.8036,
+      "step": 22
+    },
+    {
+      "epoch": 0.4088888888888889,
+      "grad_norm": 1.1322752237319946,
+      "learning_rate": 9.961093306567076e-06,
+      "loss": 0.8417,
+      "step": 23
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.8192468881607056,
+      "learning_rate": 9.947068501873702e-06,
+      "loss": 0.794,
+      "step": 24
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.8386516571044922,
+      "learning_rate": 9.930902394260746e-06,
+      "loss": 0.767,
+      "step": 25
+    },
+    {
+      "epoch": 0.4622222222222222,
+      "grad_norm": 0.959214448928833,
+      "learning_rate": 9.912601981111287e-06,
+      "loss": 0.8015,
+      "step": 26
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7772365808486938,
+      "learning_rate": 9.892175183627161e-06,
+      "loss": 0.764,
+      "step": 27
+    },
+    {
+      "epoch": 0.49777777777777776,
+      "grad_norm": 0.831992506980896,
+      "learning_rate": 9.869630843400331e-06,
+      "loss": 0.8278,
+      "step": 28
+    },
+    {
+      "epoch": 0.5155555555555555,
+      "grad_norm": 0.7600440979003906,
+      "learning_rate": 9.844978718585855e-06,
+      "loss": 0.7788,
+      "step": 29
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.7171022891998291,
+      "learning_rate": 9.81822947967816e-06,
+      "loss": 0.7713,
+      "step": 30
+    },
+    {
+      "epoch": 0.5511111111111111,
+      "grad_norm": 0.8401284217834473,
+      "learning_rate": 9.789394704892364e-06,
+      "loss": 0.8337,
+      "step": 31
+    },
+    {
+      "epoch": 0.5688888888888889,
+      "grad_norm": 0.6657397150993347,
+      "learning_rate": 9.758486875152766e-06,
+      "loss": 0.8159,
+      "step": 32
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.7162612676620483,
+      "learning_rate": 9.725519368690539e-06,
+      "loss": 0.8484,
+      "step": 33
+    },
+    {
+      "epoch": 0.6044444444444445,
+      "grad_norm": 0.6413375735282898,
+      "learning_rate": 9.690506455253073e-06,
+      "loss": 0.7798,
+      "step": 34
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.641582190990448,
+      "learning_rate": 9.65346328992741e-06,
+      "loss": 0.8282,
+      "step": 35
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6429569721221924,
+      "learning_rate": 9.614405906580486e-06,
+      "loss": 0.7737,
+      "step": 36
+    },
+    {
+      "epoch": 0.6577777777777778,
+      "grad_norm": 0.5829933285713196,
+      "learning_rate": 9.573351210918976e-06,
+      "loss": 0.8027,
+      "step": 37
+    },
+    {
+      "epoch": 0.6755555555555556,
+      "grad_norm": 0.5490368008613586,
+      "learning_rate": 9.53031697317178e-06,
+      "loss": 0.7706,
+      "step": 38
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.5687077045440674,
+      "learning_rate": 9.485321820398321e-06,
+      "loss": 0.8054,
+      "step": 39
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.5813000798225403,
+      "learning_rate": 9.43838522842594e-06,
+      "loss": 0.7565,
+      "step": 40
+    },
+    {
+      "epoch": 0.7288888888888889,
+      "grad_norm": 0.558438777923584,
+      "learning_rate": 9.389527513419935e-06,
+      "loss": 0.7622,
+      "step": 41
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.6196303367614746,
+      "learning_rate": 9.338769823089853e-06,
+      "loss": 0.7578,
+      "step": 42
+    },
+    {
+      "epoch": 0.7644444444444445,
+      "grad_norm": 0.658560037612915,
+      "learning_rate": 9.286134127535859e-06,
+      "loss": 0.7589,
+      "step": 43
+    },
+    {
+      "epoch": 0.7822222222222223,
+      "grad_norm": 0.6177524328231812,
+      "learning_rate": 9.231643209739128e-06,
+      "loss": 0.7243,
+      "step": 44
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5707345604896545,
+      "learning_rate": 9.175320655700407e-06,
+      "loss": 0.7538,
+      "step": 45
+    },
+    {
+      "epoch": 0.8177777777777778,
+      "grad_norm": 0.5330144762992859,
+      "learning_rate": 9.117190844230971e-06,
+      "loss": 0.7676,
+      "step": 46
+    },
+    {
+      "epoch": 0.8355555555555556,
+      "grad_norm": 0.6449193358421326,
+      "learning_rate": 9.057278936400453e-06,
+      "loss": 0.7789,
+      "step": 47
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.5758365988731384,
+      "learning_rate": 8.99561086464603e-06,
+      "loss": 0.7404,
+      "step": 48
+    },
+    {
+      "epoch": 0.8711111111111111,
+      "grad_norm": 0.5622390508651733,
+      "learning_rate": 8.932213321547769e-06,
+      "loss": 0.7493,
+      "step": 49
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.6213387846946716,
+      "learning_rate": 8.86711374827494e-06,
+      "loss": 0.7431,
+      "step": 50
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.5265310406684875,
+      "learning_rate": 8.800340322708291e-06,
+      "loss": 0.7748,
+      "step": 51
+    },
+    {
+      "epoch": 0.9244444444444444,
+      "grad_norm": 0.6248610019683838,
+      "learning_rate": 8.73192194724347e-06,
+      "loss": 0.756,
+      "step": 52
+    },
+    {
+      "epoch": 0.9422222222222222,
+      "grad_norm": 0.5746708512306213,
+      "learning_rate": 8.661888236280813e-06,
+      "loss": 0.7826,
+      "step": 53
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.6057078242301941,
+      "learning_rate": 8.590269503406986e-06,
+      "loss": 0.7477,
+      "step": 54
+    },
+    {
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.6079862117767334,
+      "learning_rate": 8.517096748273951e-06,
+      "loss": 0.7411,
+      "step": 55
+    },
+    {
+      "epoch": 0.9955555555555555,
+      "grad_norm": 0.6125847697257996,
+      "learning_rate": 8.442401643181e-06,
+      "loss": 0.7707,
+      "step": 56
+    },
+    {
+      "epoch": 1.0133333333333334,
+      "grad_norm": 1.0918083190917969,
+      "learning_rate": 8.366216519365623e-06,
+      "loss": 1.2307,
+      "step": 57
+    },
+    {
+      "epoch": 1.031111111111111,
+      "grad_norm": 0.6814342141151428,
+      "learning_rate": 8.288574353009164e-06,
+      "loss": 0.6984,
+      "step": 58
+    },
+    {
+      "epoch": 1.048888888888889,
+      "grad_norm": 0.7980371713638306,
+      "learning_rate": 8.20950875096333e-06,
+      "loss": 0.8081,
+      "step": 59
+    },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.7088991403579712,
+      "learning_rate": 8.129053936203688e-06,
+      "loss": 0.7388,
+      "step": 60
+    },
+    {
+      "epoch": 1.0844444444444445,
+      "grad_norm": 0.6209843754768372,
+      "learning_rate": 8.04724473301652e-06,
+      "loss": 0.8232,
+      "step": 61
+    },
+    {
+      "epoch": 1.1022222222222222,
+      "grad_norm": 0.5992409586906433,
+      "learning_rate": 7.964116551925365e-06,
+      "loss": 0.6424,
+      "step": 62
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.8191760778427124,
+      "learning_rate": 7.879705374363831e-06,
+      "loss": 0.743,
+      "step": 63
+    },
+    {
+      "epoch": 1.1377777777777778,
+      "grad_norm": 0.5577734112739563,
+      "learning_rate": 7.794047737101298e-06,
+      "loss": 0.6329,
+      "step": 64
+    },
+    {
+      "epoch": 1.1555555555555554,
+      "grad_norm": 0.590572714805603,
+      "learning_rate": 7.707180716428237e-06,
+      "loss": 0.7648,
+      "step": 65
+    },
+    {
+      "epoch": 1.1733333333333333,
+      "grad_norm": 0.7350046634674072,
+      "learning_rate": 7.619141912108008e-06,
+      "loss": 0.9037,
+      "step": 66
+    },
+    {
+      "epoch": 1.1911111111111112,
+      "grad_norm": 0.5404512286186218,
+      "learning_rate": 7.529969431102063e-06,
+      "loss": 0.7325,
+      "step": 67
+    },
+    {
+      "epoch": 1.208888888888889,
+      "grad_norm": 0.6300895810127258,
+      "learning_rate": 7.4397018710756415e-06,
+      "loss": 0.6517,
+      "step": 68
+    },
+    {
+      "epoch": 1.2266666666666666,
+      "grad_norm": 0.5601981282234192,
+      "learning_rate": 7.34837830369103e-06,
+      "loss": 0.6863,
+      "step": 69
+    },
+    {
+      "epoch": 1.2444444444444445,
+      "grad_norm": 0.6357449293136597,
+      "learning_rate": 7.2560382576956875e-06,
+      "loss": 0.7334,
+      "step": 70
+    },
+    {
+      "epoch": 1.2622222222222224,
+      "grad_norm": 0.5976100564002991,
+      "learning_rate": 7.162721701812506e-06,
+      "loss": 0.6825,
+      "step": 71
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.5376277565956116,
+      "learning_rate": 7.068469027439642e-06,
+      "loss": 0.7138,
+      "step": 72
+    },
+    {
+      "epoch": 1.2977777777777777,
+      "grad_norm": 0.5158504843711853,
+      "learning_rate": 6.9733210311673826e-06,
+      "loss": 0.7004,
+      "step": 73
+    },
+    {
+      "epoch": 1.3155555555555556,
+      "grad_norm": 0.6006451845169067,
+      "learning_rate": 6.8773188971196515e-06,
+      "loss": 0.6926,
+      "step": 74
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.5877394080162048,
+      "learning_rate": 6.780504179127735e-06,
+      "loss": 0.7251,
+      "step": 75
+    },
+    {
+      "epoch": 1.3511111111111112,
+      "grad_norm": 0.47379350662231445,
+      "learning_rate": 6.682918782744033e-06,
+      "loss": 0.6315,
+      "step": 76
+    },
+    {
+      "epoch": 1.3688888888888888,
+      "grad_norm": 0.6031640768051147,
+      "learning_rate": 6.584604947103515e-06,
+      "loss": 0.8246,
+      "step": 77
+    },
+    {
+      "epoch": 1.3866666666666667,
+      "grad_norm": 0.5239250659942627,
+      "learning_rate": 6.4856052266408375e-06,
+      "loss": 0.6918,
+      "step": 78
+    },
+    {
+      "epoch": 1.4044444444444444,
+      "grad_norm": 0.5310869216918945,
+      "learning_rate": 6.385962472670953e-06,
+      "loss": 0.7386,
+      "step": 79
+    },
+    {
+      "epoch": 1.4222222222222223,
+      "grad_norm": 0.49420294165611267,
+      "learning_rate": 6.28571981484123e-06,
+      "loss": 0.6715,
+      "step": 80
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.5260173082351685,
+      "learning_rate": 6.184920642463095e-06,
+      "loss": 0.662,
+      "step": 81
+    },
+    {
+      "epoch": 1.4577777777777778,
+      "grad_norm": 0.5475410223007202,
+      "learning_rate": 6.083608585731283e-06,
+      "loss": 0.7322,
+      "step": 82
+    },
+    {
+      "epoch": 1.4755555555555555,
+      "grad_norm": 0.5325695276260376,
+      "learning_rate": 5.9818274968388225e-06,
+      "loss": 0.6793,
+      "step": 83
+    },
+    {
+      "epoch": 1.4933333333333334,
+      "grad_norm": 0.49327635765075684,
+      "learning_rate": 5.879621430995927e-06,
+      "loss": 0.6983,
+      "step": 84
+    },
+    {
+      "epoch": 1.511111111111111,
+      "grad_norm": 0.5579730272293091,
+      "learning_rate": 5.777034627361025e-06,
+      "loss": 0.7993,
+      "step": 85
+    },
+    {
+      "epoch": 1.528888888888889,
+      "grad_norm": 0.5502652525901794,
+      "learning_rate": 5.674111489892144e-06,
+      "loss": 0.6955,
+      "step": 86
+    },
+    {
+      "epoch": 1.5466666666666666,
+      "grad_norm": 0.5440042018890381,
+      "learning_rate": 5.570896568126994e-06,
+      "loss": 0.694,
+      "step": 87
+    },
+    {
+      "epoch": 1.5644444444444443,
+      "grad_norm": 0.473174124956131,
+      "learning_rate": 5.4674345379e-06,
+      "loss": 0.6934,
+      "step": 88
+    },
+    {
+      "epoch": 1.5822222222222222,
+      "grad_norm": 0.492448627948761,
+      "learning_rate": 5.36377018200472e-06,
+      "loss": 0.7066,
+      "step": 89
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.5301864147186279,
+      "learning_rate": 5.259948370809902e-06,
+      "loss": 0.608,
+      "step": 90
+    },
+    {
+      "epoch": 1.6177777777777778,
+      "grad_norm": 0.5350763201713562,
+      "learning_rate": 5.156014042837696e-06,
+      "loss": 0.762,
+      "step": 91
+    },
+    {
+      "epoch": 1.6355555555555554,
+      "grad_norm": 0.43970951437950134,
+      "learning_rate": 5.052012185312322e-06,
+      "loss": 0.6945,
+      "step": 92
+    },
+    {
+      "epoch": 1.6533333333333333,
+      "grad_norm": 0.5044556856155396,
+      "learning_rate": 4.94798781468768e-06,
+      "loss": 0.6902,
+      "step": 93
+    },
+    {
+      "epoch": 1.6711111111111112,
+      "grad_norm": 0.5784631967544556,
+      "learning_rate": 4.843985957162304e-06,
+      "loss": 0.8185,
+      "step": 94
+    },
+    {
+      "epoch": 1.6888888888888889,
+      "grad_norm": 0.470632404088974,
+      "learning_rate": 4.740051629190099e-06,
+      "loss": 0.6987,
+      "step": 95
+    },
+    {
+      "epoch": 1.7066666666666666,
+      "grad_norm": 0.43977099657058716,
+      "learning_rate": 4.636229817995281e-06,
+      "loss": 0.759,
+      "step": 96
+    },
+    {
+      "epoch": 1.7244444444444444,
+      "grad_norm": 0.4898114502429962,
+      "learning_rate": 4.532565462099999e-06,
+      "loss": 0.7005,
+      "step": 97
+    },
+    {
+      "epoch": 1.7422222222222223,
+      "grad_norm": 0.45880624651908875,
+      "learning_rate": 4.429103431873009e-06,
+      "loss": 0.6753,
+      "step": 98
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.44095566868782043,
+      "learning_rate": 4.3258885101078565e-06,
+      "loss": 0.6962,
+      "step": 99
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 0.4869973063468933,
+      "learning_rate": 4.2229653726389765e-06,
+      "loss": 0.7709,
+      "step": 100
+    },
+    {
+      "epoch": 1.7955555555555556,
+      "grad_norm": 0.3903906047344208,
+      "learning_rate": 4.120378569004074e-06,
+      "loss": 0.6639,
+      "step": 101
+    },
+    {
+      "epoch": 1.8133333333333335,
+      "grad_norm": 0.42414912581443787,
+      "learning_rate": 4.018172503161179e-06,
+      "loss": 0.679,
+      "step": 102
+    },
+    {
+      "epoch": 1.8311111111111111,
+      "grad_norm": 0.5198123455047607,
+      "learning_rate": 3.9163914142687185e-06,
+      "loss": 0.7619,
+      "step": 103
+    },
+    {
+      "epoch": 1.8488888888888888,
+      "grad_norm": 0.4870392084121704,
+      "learning_rate": 3.815079357536907e-06,
+      "loss": 0.5876,
+      "step": 104
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.48693349957466125,
+      "learning_rate": 3.714280185158771e-06,
+      "loss": 0.7726,
+      "step": 105
+    },
+    {
+      "epoch": 1.8844444444444446,
+      "grad_norm": 0.4303606450557709,
+      "learning_rate": 3.614037527329048e-06,
+      "loss": 0.7155,
+      "step": 106
+    },
+    {
+      "epoch": 1.9022222222222223,
+      "grad_norm": 0.5268415808677673,
+      "learning_rate": 3.5143947733591633e-06,
+      "loss": 0.7175,
+      "step": 107
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.4765200614929199,
+      "learning_rate": 3.4153950528964867e-06,
+      "loss": 0.6943,
+      "step": 108
+    },
+    {
+      "epoch": 1.9377777777777778,
+      "grad_norm": 0.42151421308517456,
+      "learning_rate": 3.3170812172559695e-06,
+      "loss": 0.7555,
+      "step": 109
+    },
+    {
+      "epoch": 1.9555555555555557,
+      "grad_norm": 0.426076740026474,
+      "learning_rate": 3.2194958208722656e-06,
+      "loss": 0.6852,
+      "step": 110
+    },
+    {
+      "epoch": 1.9733333333333334,
+      "grad_norm": 0.5378457307815552,
+      "learning_rate": 3.1226811028803514e-06,
+      "loss": 0.7171,
+      "step": 111
+    },
+    {
+      "epoch": 1.991111111111111,
+      "grad_norm": 0.39721420407295227,
+      "learning_rate": 3.0266789688326187e-06,
+      "loss": 0.6122,
+      "step": 112
+    },
+    {
+      "epoch": 2.008888888888889,
+      "grad_norm": 0.9009656310081482,
+      "learning_rate": 2.9315309725603596e-06,
+      "loss": 1.3572,
+      "step": 113
+    },
+    {
+      "epoch": 2.026666666666667,
+      "grad_norm": 0.4720599353313446,
+      "learning_rate": 2.8372782981874964e-06,
+      "loss": 0.7745,
+      "step": 114
+    },
+    {
+      "epoch": 2.0444444444444443,
+      "grad_norm": 0.3901655972003937,
+      "learning_rate": 2.7439617423043146e-06,
+      "loss": 0.6196,
+      "step": 115
+    },
+    {
+      "epoch": 2.062222222222222,
+      "grad_norm": 0.4585684537887573,
+      "learning_rate": 2.6516216963089698e-06,
+      "loss": 0.5975,
+      "step": 116
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.44775474071502686,
+      "learning_rate": 2.560298128924358e-06,
+      "loss": 0.7238,
+      "step": 117
+    },
+    {
+      "epoch": 2.097777777777778,
+      "grad_norm": 0.45278117060661316,
+      "learning_rate": 2.470030568897938e-06,
+      "loss": 0.7086,
+      "step": 118
+    },
+    {
+      "epoch": 2.1155555555555554,
+      "grad_norm": 0.48289376497268677,
+      "learning_rate": 2.3808580878919948e-06,
+      "loss": 0.7369,
+      "step": 119
+    },
+    {
+      "epoch": 2.1333333333333333,
+      "grad_norm": 0.4182297885417938,
+      "learning_rate": 2.2928192835717642e-06,
+      "loss": 0.6709,
+      "step": 120
+    },
+    {
+      "epoch": 2.151111111111111,
+      "grad_norm": 0.41355010867118835,
+      "learning_rate": 2.205952262898704e-06,
+      "loss": 0.6351,
+      "step": 121
+    },
+    {
+      "epoch": 2.168888888888889,
+      "grad_norm": 0.38360127806663513,
+      "learning_rate": 2.120294625636171e-06,
+      "loss": 0.6603,
+      "step": 122
+    },
+    {
+      "epoch": 2.1866666666666665,
+      "grad_norm": 0.4717971980571747,
+      "learning_rate": 2.0358834480746363e-06,
+      "loss": 0.7047,
+      "step": 123
+    },
+    {
+      "epoch": 2.2044444444444444,
+      "grad_norm": 0.44043272733688354,
+      "learning_rate": 1.9527552669834797e-06,
+      "loss": 0.6574,
+      "step": 124
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 0.439424604177475,
+      "learning_rate": 1.8709460637963123e-06,
+      "loss": 0.7172,
+      "step": 125
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.4320985674858093,
+      "learning_rate": 1.7904912490366723e-06,
+      "loss": 0.6212,
+      "step": 126
+    },
+    {
+      "epoch": 2.2577777777777777,
+      "grad_norm": 0.4921020567417145,
+      "learning_rate": 1.711425646990838e-06,
+      "loss": 0.7009,
+      "step": 127
+    },
+    {
+      "epoch": 2.2755555555555556,
+      "grad_norm": 0.43402716517448425,
+      "learning_rate": 1.6337834806343783e-06,
+      "loss": 0.6447,
+      "step": 128
+    },
+    {
+      "epoch": 2.2933333333333334,
+      "grad_norm": 0.417825847864151,
+      "learning_rate": 1.557598356819e-06,
+      "loss": 0.6013,
+      "step": 129
+    },
+    {
+      "epoch": 2.311111111111111,
+      "grad_norm": 0.4234044849872589,
+      "learning_rate": 1.482903251726049e-06,
+      "loss": 0.6542,
+      "step": 130
+    },
+    {
+      "epoch": 2.328888888888889,
+      "grad_norm": 0.41442492604255676,
+      "learning_rate": 1.409730496593016e-06,
+      "loss": 0.6156,
+      "step": 131
+    },
+    {
+      "epoch": 2.3466666666666667,
+      "grad_norm": 0.44905272126197815,
+      "learning_rate": 1.3381117637191887e-06,
+      "loss": 0.8226,
+      "step": 132
+    },
+    {
+      "epoch": 2.3644444444444446,
+      "grad_norm": 0.4065934717655182,
+      "learning_rate": 1.2680780527565313e-06,
+      "loss": 0.6674,
+      "step": 133
+    },
+    {
+      "epoch": 2.3822222222222225,
+      "grad_norm": 0.44520464539527893,
+      "learning_rate": 1.1996596772917091e-06,
+      "loss": 0.7292,
+      "step": 134
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.3922524154186249,
+      "learning_rate": 1.132886251725061e-06,
+      "loss": 0.6922,
+      "step": 135
+    },
+    {
+      "epoch": 2.417777777777778,
+      "grad_norm": 0.40516623854637146,
+      "learning_rate": 1.0677866784522317e-06,
+      "loss": 0.7064,
+      "step": 136
+    },
+    {
+      "epoch": 2.4355555555555557,
+      "grad_norm": 0.3747960031032562,
+      "learning_rate": 1.004389135353972e-06,
+      "loss": 0.6197,
+      "step": 137
+    },
+    {
+      "epoch": 2.453333333333333,
+      "grad_norm": 0.4241096079349518,
+      "learning_rate": 9.427210635995482e-07,
+      "loss": 0.6535,
+      "step": 138
+    },
+    {
+      "epoch": 2.471111111111111,
+      "grad_norm": 0.38984397053718567,
+      "learning_rate": 8.828091557690288e-07,
+      "loss": 0.7117,
+      "step": 139
+    },
+    {
+      "epoch": 2.488888888888889,
+      "grad_norm": 0.3895849585533142,
+      "learning_rate": 8.246793442995954e-07,
+      "loss": 0.7144,
+      "step": 140
+    },
+    {
+      "epoch": 2.506666666666667,
+      "grad_norm": 0.3922812342643738,
+      "learning_rate": 7.68356790260873e-07,
+      "loss": 0.6957,
+      "step": 141
+    },
+    {
+      "epoch": 2.5244444444444447,
+      "grad_norm": 0.4137726128101349,
+      "learning_rate": 7.138658724641417e-07,
+      "loss": 0.7647,
+      "step": 142
+    },
+    {
+      "epoch": 2.542222222222222,
+      "grad_norm": 0.41041967272758484,
+      "learning_rate": 6.612301769101464e-07,
+      "loss": 0.7211,
+      "step": 143
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.3760773837566376,
+      "learning_rate": 6.104724865800665e-07,
+      "loss": 0.635,
+      "step": 144
+    },
+    {
+      "epoch": 2.5777777777777775,
+      "grad_norm": 0.38913705945014954,
+      "learning_rate": 5.616147715740611e-07,
+      "loss": 0.5809,
+      "step": 145
+    },
+    {
+      "epoch": 2.5955555555555554,
+      "grad_norm": 0.47968509793281555,
+      "learning_rate": 5.146781796016798e-07,
+      "loss": 0.8847,
+      "step": 146
+    },
+    {
+      "epoch": 2.6133333333333333,
+      "grad_norm": 0.36607035994529724,
+      "learning_rate": 4.696830268282204e-07,
+      "loss": 0.6341,
+      "step": 147
+    },
+    {
+      "epoch": 2.631111111111111,
+      "grad_norm": 0.37374281883239746,
+      "learning_rate": 4.2664878908102556e-07,
+      "loss": 0.7013,
+      "step": 148
+    },
+    {
+      "epoch": 2.648888888888889,
+      "grad_norm": 0.3905697464942932,
+      "learning_rate": 3.855940934195146e-07,
+      "loss": 0.7076,
+      "step": 149
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.38409143686294556,
+      "learning_rate": 3.4653671007259084e-07,
+      "loss": 0.7147,
+      "step": 150
+    },
+    {
+      "epoch": 2.6844444444444444,
+      "grad_norm": 0.36509791016578674,
+      "learning_rate": 3.0949354474692937e-07,
+      "loss": 0.6029,
+      "step": 151
+    },
+    {
+      "epoch": 2.7022222222222223,
+      "grad_norm": 0.4057907164096832,
+      "learning_rate": 2.7448063130946224e-07,
+      "loss": 0.7018,
+      "step": 152
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.4523860216140747,
+      "learning_rate": 2.4151312484723465e-07,
+      "loss": 0.7996,
+      "step": 153
+    },
+    {
+      "epoch": 2.7377777777777776,
+      "grad_norm": 0.37582913041114807,
+      "learning_rate": 2.106052951076365e-07,
+      "loss": 0.5584,
+      "step": 154
+    },
+    {
+      "epoch": 2.7555555555555555,
+      "grad_norm": 0.4033922255039215,
+      "learning_rate": 1.8177052032184285e-07,
+      "loss": 0.626,
+      "step": 155
+    },
+    {
+      "epoch": 2.7733333333333334,
+      "grad_norm": 0.39120516180992126,
+      "learning_rate": 1.5502128141414496e-07,
+      "loss": 0.6039,
+      "step": 156
+    },
+    {
+      "epoch": 2.7911111111111113,
+      "grad_norm": 0.3827860951423645,
+      "learning_rate": 1.303691565996712e-07,
+      "loss": 0.7568,
+      "step": 157
+    },
+    {
+      "epoch": 2.8088888888888888,
+      "grad_norm": 0.41667839884757996,
+      "learning_rate": 1.0782481637284014e-07,
+      "loss": 0.7259,
+      "step": 158
+    },
+    {
+      "epoch": 2.8266666666666667,
+      "grad_norm": 0.3329305052757263,
+      "learning_rate": 8.739801888871468e-08,
+      "loss": 0.5574,
+      "step": 159
+    },
+    {
+      "epoch": 2.8444444444444446,
+      "grad_norm": 0.39114266633987427,
+      "learning_rate": 6.909760573925561e-08,
+      "loss": 0.7079,
+      "step": 160
+    },
+    {
+      "epoch": 2.862222222222222,
+      "grad_norm": 0.39278408885002136,
+      "learning_rate": 5.2931498126298495e-08,
+      "loss": 0.6241,
+      "step": 161
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.3883131146430969,
+      "learning_rate": 3.890669343292464e-08,
+      "loss": 0.7255,
+      "step": 162
+    },
+    {
+      "epoch": 2.897777777777778,
+      "grad_norm": 0.3658883273601532,
+      "learning_rate": 2.702926219468882e-08,
+      "loss": 0.6244,
+      "step": 163
+    },
+    {
+      "epoch": 2.9155555555555557,
+      "grad_norm": 0.3795766532421112,
+      "learning_rate": 1.7304345472035634e-08,
+      "loss": 0.7233,
+      "step": 164
+    },
+    {
+      "epoch": 2.9333333333333336,
+      "grad_norm": 0.3794161081314087,
+      "learning_rate": 9.73615262502503e-09,
+      "loss": 0.5895,
+      "step": 165
+    },
+    {
+      "epoch": 2.951111111111111,
+      "grad_norm": 0.41713038086891174,
+      "learning_rate": 4.327959491344791e-09,
+      "loss": 0.7434,
+      "step": 166
+    },
+    {
+      "epoch": 2.968888888888889,
+      "grad_norm": 0.36857718229293823,
+      "learning_rate": 1.082106968385288e-09,
+      "loss": 0.6239,
+      "step": 167
+    },
+    {
+      "epoch": 2.986666666666667,
+      "grad_norm": 0.38884976506233215,
+      "learning_rate": 0.0,
+      "loss": 0.6308,
+      "step": 168
+    },
+    {
+      "epoch": 2.986666666666667,
+      "step": 168,
+      "total_flos": 139896582144000.0,
+      "train_loss": 0.7518468662386849,
+      "train_runtime": 7945.17,
+      "train_samples_per_second": 2.039,
+      "train_steps_per_second": 0.021
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 168,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 139896582144000.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_loss.png ADDED Viewed