Model save

Browse files

Files changed (7) hide show

README.md +65 -0
all_results.json +9 -0
generation_config.json +7 -0
model.safetensors +1 -1
runs/Sep05_22-49-26_nova.cs.ucla.edu/events.out.tfevents.1725601953.nova.cs.ucla.edu.1919554.0 +2 -2
train_results.json +9 -0
trainer_state.json +801 -0

README.md ADDED Viewed

	@@ -0,0 +1,65 @@

+---
+license: apache-2.0
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+tags:
+- trl
+- sft
+- generated_from_trainer
+model-index:
+- name: tinyllama-sft-wizard-processed-indicator-0.6-full
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# tinyllama-sft-wizard-processed-indicator-0.6-full
+This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on an unspecified dataset (the dataset name was not recorded in the metadata the Trainer used to generate this card).
+It achieves the following results on the evaluation set:
+- Loss: 0.7583
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 2
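+- total_train_batch_size: 128 (train_batch_size 16 × num_devices 4 × gradient_accumulation_steps 2)
+- total_eval_batch_size: 32 (eval_batch_size 8 × num_devices 4)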
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.7509        | 1.0   | 543  | 0.7583          |
+### Framework versions
+- Transformers 4.40.1
+- Pytorch 2.4.0+cu121
+- Datasets 2.20.0
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 49305877217280.0,
+    "train_loss": 0.7919497733616697,
+    "train_runtime": 3542.7729,
+    "train_samples": 69499,
+    "train_samples_per_second": 19.617,
+    "train_steps_per_second": 0.153
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "max_length": 2048,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.1"
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe1c5a2300d9fd815f25914a35e90bb99407d01cc6a625a2eda98a1b6480e9ee
 size 2200136248

 version https://git-lfs.github.com/spec/v1
+oid sha256:1bba5b08bec63a03fb2e79d01e1eb293bf7cb1950a7fcee2c9e0e4552d69d7f8
 size 2200136248

runs/Sep05_22-49-26_nova.cs.ucla.edu/events.out.tfevents.1725601953.nova.cs.ucla.edu.1919554.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c0960cd482dd640b2afe57518b8253dd142f38dd702410c99138c2252cdebbbf
-size 26189

 version https://git-lfs.github.com/spec/v1
+oid sha256:aed645010c3be4ab738468c3ae65e8ed00dd8bfa6f88c9ede09e332d0666a318
+size 28502

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 49305877217280.0,
+    "train_loss": 0.7919497733616697,
+    "train_runtime": 3542.7729,
+    "train_samples": 69499,
+    "train_samples_per_second": 19.617,
+    "train_steps_per_second": 0.153
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,801 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 543,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.001841620626151013,
+      "grad_norm": 9.486898645787276,
+      "learning_rate": 3.6363636363636366e-07,
+      "loss": 1.3331,
+      "step": 1
+    },
+    {
+      "epoch": 0.009208103130755065,
+      "grad_norm": 9.648113107304198,
+      "learning_rate": 1.8181818181818183e-06,
+      "loss": 1.3395,
+      "step": 5
+    },
+    {
+      "epoch": 0.01841620626151013,
+      "grad_norm": 6.990867421428758,
+      "learning_rate": 3.6363636363636366e-06,
+      "loss": 1.2962,
+      "step": 10
+    },
+    {
+      "epoch": 0.027624309392265192,
+      "grad_norm": 3.458068927612039,
+      "learning_rate": 5.4545454545454545e-06,
+      "loss": 1.0739,
+      "step": 15
+    },
+    {
+      "epoch": 0.03683241252302026,
+      "grad_norm": 1.2260366851944846,
+      "learning_rate": 7.272727272727273e-06,
+      "loss": 0.9505,
+      "step": 20
+    },
+    {
+      "epoch": 0.04604051565377532,
+      "grad_norm": 1.0221449554865838,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 0.9144,
+      "step": 25
+    },
+    {
+      "epoch": 0.055248618784530384,
+      "grad_norm": 0.8620729339974224,
+      "learning_rate": 1.0909090909090909e-05,
+      "loss": 0.8733,
+      "step": 30
+    },
+    {
+      "epoch": 0.06445672191528545,
+      "grad_norm": 0.851167156872127,
+      "learning_rate": 1.2727272727272728e-05,
+      "loss": 0.8785,
+      "step": 35
+    },
+    {
+      "epoch": 0.07366482504604052,
+      "grad_norm": 0.9108061969642942,
+      "learning_rate": 1.4545454545454546e-05,
+      "loss": 0.8671,
+      "step": 40
+    },
+    {
+      "epoch": 0.08287292817679558,
+      "grad_norm": 0.9428707752739769,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 0.8432,
+      "step": 45
+    },
+    {
+      "epoch": 0.09208103130755065,
+      "grad_norm": 0.799992233407414,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 0.8391,
+      "step": 50
+    },
+    {
+      "epoch": 0.10128913443830571,
+      "grad_norm": 0.92988123755397,
+      "learning_rate": 2e-05,
+      "loss": 0.8069,
+      "step": 55
+    },
+    {
+      "epoch": 0.11049723756906077,
+      "grad_norm": 0.8806369480862966,
+      "learning_rate": 1.9994819965926346e-05,
+      "loss": 0.8461,
+      "step": 60
+    },
+    {
+      "epoch": 0.11970534069981584,
+      "grad_norm": 0.8429565875911434,
+      "learning_rate": 1.997928523025598e-05,
+      "loss": 0.8114,
+      "step": 65
+    },
+    {
+      "epoch": 0.1289134438305709,
+      "grad_norm": 2.6074141932570747,
+      "learning_rate": 1.9953411887080917e-05,
+      "loss": 0.8269,
+      "step": 70
+    },
+    {
+      "epoch": 0.13812154696132597,
+      "grad_norm": 0.81068282261157,
+      "learning_rate": 1.9917226741361014e-05,
+      "loss": 0.8313,
+      "step": 75
+    },
+    {
+      "epoch": 0.14732965009208104,
+      "grad_norm": 0.8146705010362959,
+      "learning_rate": 1.987076728115383e-05,
+      "loss": 0.8233,
+      "step": 80
+    },
+    {
+      "epoch": 0.15653775322283608,
+      "grad_norm": 0.8212502515723833,
+      "learning_rate": 1.9814081638776743e-05,
+      "loss": 0.8348,
+      "step": 85
+    },
+    {
+      "epoch": 0.16574585635359115,
+      "grad_norm": 0.9178279389667914,
+      "learning_rate": 1.9747228540941555e-05,
+      "loss": 0.8146,
+      "step": 90
+    },
+    {
+      "epoch": 0.17495395948434622,
+      "grad_norm": 0.7885672960810044,
+      "learning_rate": 1.9670277247913205e-05,
+      "loss": 0.8296,
+      "step": 95
+    },
+    {
+      "epoch": 0.1841620626151013,
+      "grad_norm": 0.848764605997187,
+      "learning_rate": 1.958330748175568e-05,
+      "loss": 0.7949,
+      "step": 100
+    },
+    {
+      "epoch": 0.19337016574585636,
+      "grad_norm": 0.7700219317674329,
+      "learning_rate": 1.948640934373939e-05,
+      "loss": 0.7866,
+      "step": 105
+    },
+    {
+      "epoch": 0.20257826887661143,
+      "grad_norm": 0.8315949857083919,
+      "learning_rate": 1.9379683220995657e-05,
+      "loss": 0.7962,
+      "step": 110
+    },
+    {
+      "epoch": 0.21178637200736647,
+      "grad_norm": 0.7752955445936617,
+      "learning_rate": 1.9263239682514953e-05,
+      "loss": 0.792,
+      "step": 115
+    },
+    {
+      "epoch": 0.22099447513812154,
+      "grad_norm": 0.8483319625591622,
+      "learning_rate": 1.9137199364596673e-05,
+      "loss": 0.7884,
+      "step": 120
+    },
+    {
+      "epoch": 0.2302025782688766,
+      "grad_norm": 0.8183776324168913,
+      "learning_rate": 1.9001692845869113e-05,
+      "loss": 0.801,
+      "step": 125
+    },
+    {
+      "epoch": 0.23941068139963168,
+      "grad_norm": 0.811447671431964,
+      "learning_rate": 1.8856860512009115e-05,
+      "loss": 0.7786,
+      "step": 130
+    },
+    {
+      "epoch": 0.24861878453038674,
+      "grad_norm": 0.8277281887592228,
+      "learning_rate": 1.8702852410301556e-05,
+      "loss": 0.8104,
+      "step": 135
+    },
+    {
+      "epoch": 0.2578268876611418,
+      "grad_norm": 0.8077180581855568,
+      "learning_rate": 1.853982809418932e-05,
+      "loss": 0.7828,
+      "step": 140
+    },
+    {
+      "epoch": 0.26703499079189685,
+      "grad_norm": 0.8329105731452281,
+      "learning_rate": 1.8367956457974872e-05,
+      "loss": 0.7894,
+      "step": 145
+    },
+    {
+      "epoch": 0.27624309392265195,
+      "grad_norm": 0.797486657559649,
+      "learning_rate": 1.8187415561844586e-05,
+      "loss": 0.7987,
+      "step": 150
+    },
+    {
+      "epoch": 0.285451197053407,
+      "grad_norm": 0.8537594990493205,
+      "learning_rate": 1.7998392447397197e-05,
+      "loss": 0.8036,
+      "step": 155
+    },
+    {
+      "epoch": 0.2946593001841621,
+      "grad_norm": 0.8180506794487967,
+      "learning_rate": 1.7801082943867406e-05,
+      "loss": 0.7932,
+      "step": 160
+    },
+    {
+      "epoch": 0.30386740331491713,
+      "grad_norm": 0.8084360565762896,
+      "learning_rate": 1.7595691465245484e-05,
+      "loss": 0.7835,
+      "step": 165
+    },
+    {
+      "epoch": 0.31307550644567217,
+      "grad_norm": 0.8626100032968109,
+      "learning_rate": 1.7382430798502977e-05,
+      "loss": 0.7946,
+      "step": 170
+    },
+    {
+      "epoch": 0.32228360957642727,
+      "grad_norm": 0.8294498359295781,
+      "learning_rate": 1.7161521883143936e-05,
+      "loss": 0.7854,
+      "step": 175
+    },
+    {
+      "epoch": 0.3314917127071823,
+      "grad_norm": 0.7709771325792745,
+      "learning_rate": 1.693319358231011e-05,
+      "loss": 0.8142,
+      "step": 180
+    },
+    {
+      "epoch": 0.3406998158379374,
+      "grad_norm": 0.7818326227818909,
+      "learning_rate": 1.6697682445677158e-05,
+      "loss": 0.7954,
+      "step": 185
+    },
+    {
+      "epoch": 0.34990791896869244,
+      "grad_norm": 0.8179375717027894,
+      "learning_rate": 1.6455232464387587e-05,
+      "loss": 0.7884,
+      "step": 190
+    },
+    {
+      "epoch": 0.35911602209944754,
+      "grad_norm": 0.7920896471000997,
+      "learning_rate": 1.6206094818274228e-05,
+      "loss": 0.7718,
+      "step": 195
+    },
+    {
+      "epoch": 0.3683241252302026,
+      "grad_norm": 0.7893170590987618,
+      "learning_rate": 1.595052761563627e-05,
+      "loss": 0.7689,
+      "step": 200
+    },
+    {
+      "epoch": 0.3775322283609576,
+      "grad_norm": 0.7657795923746319,
+      "learning_rate": 1.5688795625837274e-05,
+      "loss": 0.7881,
+      "step": 205
+    },
+    {
+      "epoch": 0.3867403314917127,
+      "grad_norm": 0.8393984134836868,
+      "learning_rate": 1.542117000500229e-05,
+      "loss": 0.7788,
+      "step": 210
+    },
+    {
+      "epoch": 0.39594843462246776,
+      "grad_norm": 0.8118764962105299,
+      "learning_rate": 1.5147928015098309e-05,
+      "loss": 0.7505,
+      "step": 215
+    },
+    {
+      "epoch": 0.40515653775322286,
+      "grad_norm": 0.7991535490667581,
+      "learning_rate": 1.4869352736688938e-05,
+      "loss": 0.7957,
+      "step": 220
+    },
+    {
+      "epoch": 0.4143646408839779,
+      "grad_norm": 0.8314136999499325,
+      "learning_rate": 1.458573277566103e-05,
+      "loss": 0.7912,
+      "step": 225
+    },
+    {
+      "epoch": 0.42357274401473294,
+      "grad_norm": 0.8256559921181463,
+      "learning_rate": 1.4297361964227004e-05,
+      "loss": 0.7764,
+      "step": 230
+    },
+    {
+      "epoch": 0.43278084714548803,
+      "grad_norm": 0.8766553756889243,
+      "learning_rate": 1.4004539056512667e-05,
+      "loss": 0.7755,
+      "step": 235
+    },
+    {
+      "epoch": 0.4419889502762431,
+      "grad_norm": 0.833181681270549,
+      "learning_rate": 1.3707567419045926e-05,
+      "loss": 0.7767,
+      "step": 240
+    },
+    {
+      "epoch": 0.45119705340699817,
+      "grad_norm": 0.7499469252603318,
+      "learning_rate": 1.3406754716466978e-05,
+      "loss": 0.7632,
+      "step": 245
+    },
+    {
+      "epoch": 0.4604051565377532,
+      "grad_norm": 0.79015324314155,
+      "learning_rate": 1.3102412592785654e-05,
+      "loss": 0.7621,
+      "step": 250
+    },
+    {
+      "epoch": 0.4696132596685083,
+      "grad_norm": 0.7575412875758781,
+      "learning_rate": 1.2794856348516095e-05,
+      "loss": 0.7623,
+      "step": 255
+    },
+    {
+      "epoch": 0.47882136279926335,
+      "grad_norm": 0.8681572551776058,
+      "learning_rate": 1.248440461402328e-05,
+      "loss": 0.782,
+      "step": 260
+    },
+    {
+      "epoch": 0.4880294659300184,
+      "grad_norm": 0.8073679206498725,
+      "learning_rate": 1.2171379019419786e-05,
+      "loss": 0.7705,
+      "step": 265
+    },
+    {
+      "epoch": 0.4972375690607735,
+      "grad_norm": 0.7841235830752601,
+      "learning_rate": 1.1856103861354809e-05,
+      "loss": 0.7583,
+      "step": 270
+    },
+    {
+      "epoch": 0.5064456721915286,
+      "grad_norm": 0.8108587176276811,
+      "learning_rate": 1.153890576704062e-05,
+      "loss": 0.765,
+      "step": 275
+    },
+    {
+      "epoch": 0.5156537753222836,
+      "grad_norm": 0.8126438543382127,
+      "learning_rate": 1.1220113355864549e-05,
+      "loss": 0.7525,
+      "step": 280
+    },
+    {
+      "epoch": 0.5248618784530387,
+      "grad_norm": 0.7777072836945255,
+      "learning_rate": 1.0900056898937055e-05,
+      "loss": 0.7544,
+      "step": 285
+    },
+    {
+      "epoch": 0.5340699815837937,
+      "grad_norm": 0.7965361718611623,
+      "learning_rate": 1.0579067976928614e-05,
+      "loss": 0.773,
+      "step": 290
+    },
+    {
+      "epoch": 0.5432780847145487,
+      "grad_norm": 0.8193189022177652,
+      "learning_rate": 1.0257479136549889e-05,
+      "loss": 0.7675,
+      "step": 295
+    },
+    {
+      "epoch": 0.5524861878453039,
+      "grad_norm": 0.8067397329296685,
+      "learning_rate": 9.935623546031043e-06,
+      "loss": 0.7762,
+      "step": 300
+    },
+    {
+      "epoch": 0.5616942909760589,
+      "grad_norm": 0.7907814478229841,
+      "learning_rate": 9.613834649957216e-06,
+      "loss": 0.7638,
+      "step": 305
+    },
+    {
+      "epoch": 0.570902394106814,
+      "grad_norm": 0.7491658704683205,
+      "learning_rate": 9.292445823817647e-06,
+      "loss": 0.7573,
+      "step": 310
+    },
+    {
+      "epoch": 0.580110497237569,
+      "grad_norm": 0.7616414768278772,
+      "learning_rate": 8.971790028626395e-06,
+      "loss": 0.7536,
+      "step": 315
+    },
+    {
+      "epoch": 0.5893186003683242,
+      "grad_norm": 0.8064255556517879,
+      "learning_rate": 8.652199465972462e-06,
+      "loss": 0.7772,
+      "step": 320
+    },
+    {
+      "epoch": 0.5985267034990792,
+      "grad_norm": 0.8002742479547474,
+      "learning_rate": 8.334005233856681e-06,
+      "loss": 0.797,
+      "step": 325
+    },
+    {
+      "epoch": 0.6077348066298343,
+      "grad_norm": 0.7711512481084595,
+      "learning_rate": 8.017536983671929e-06,
+      "loss": 0.7492,
+      "step": 330
+    },
+    {
+      "epoch": 0.6169429097605893,
+      "grad_norm": 0.877795536912771,
+      "learning_rate": 7.703122578682047e-06,
+      "loss": 0.7664,
+      "step": 335
+    },
+    {
+      "epoch": 0.6261510128913443,
+      "grad_norm": 0.7877424655087469,
+      "learning_rate": 7.391087754353252e-06,
+      "loss": 0.7521,
+      "step": 340
+    },
+    {
+      "epoch": 0.6353591160220995,
+      "grad_norm": 0.7447106165793183,
+      "learning_rate": 7.081755780889978e-06,
+      "loss": 0.7576,
+      "step": 345
+    },
+    {
+      "epoch": 0.6445672191528545,
+      "grad_norm": 0.755847533013407,
+      "learning_rate": 6.7754471283247594e-06,
+      "loss": 0.7575,
+      "step": 350
+    },
+    {
+      "epoch": 0.6537753222836096,
+      "grad_norm": 0.8049779605578884,
+      "learning_rate": 6.472479134509052e-06,
+      "loss": 0.7527,
+      "step": 355
+    },
+    {
+      "epoch": 0.6629834254143646,
+      "grad_norm": 0.7303729256503527,
+      "learning_rate": 6.173165676349103e-06,
+      "loss": 0.751,
+      "step": 360
+    },
+    {
+      "epoch": 0.6721915285451197,
+      "grad_norm": 0.7685648396444108,
+      "learning_rate": 5.8778168446273045e-06,
+      "loss": 0.7578,
+      "step": 365
+    },
+    {
+      "epoch": 0.6813996316758748,
+      "grad_norm": 0.8106456096464736,
+      "learning_rate": 5.586738622746042e-06,
+      "loss": 0.7552,
+      "step": 370
+    },
+    {
+      "epoch": 0.6906077348066298,
+      "grad_norm": 0.820301021151535,
+      "learning_rate": 5.300232569726805e-06,
+      "loss": 0.7532,
+      "step": 375
+    },
+    {
+      "epoch": 0.6998158379373849,
+      "grad_norm": 0.7482967036736521,
+      "learning_rate": 5.0185955077929774e-06,
+      "loss": 0.74,
+      "step": 380
+    },
+    {
+      "epoch": 0.7090239410681399,
+      "grad_norm": 0.7662545231902685,
+      "learning_rate": 4.742119214860009e-06,
+      "loss": 0.7506,
+      "step": 385
+    },
+    {
+      "epoch": 0.7182320441988951,
+      "grad_norm": 0.8228369078010023,
+      "learning_rate": 4.471090122251496e-06,
+      "loss": 0.7683,
+      "step": 390
+    },
+    {
+      "epoch": 0.7274401473296501,
+      "grad_norm": 0.815178887743864,
+      "learning_rate": 4.205789017954364e-06,
+      "loss": 0.7593,
+      "step": 395
+    },
+    {
+      "epoch": 0.7366482504604052,
+      "grad_norm": 0.784624202462136,
+      "learning_rate": 3.946490755720621e-06,
+      "loss": 0.737,
+      "step": 400
+    },
+    {
+      "epoch": 0.7458563535911602,
+      "grad_norm": 0.746907024064999,
+      "learning_rate": 3.6934639703169905e-06,
+      "loss": 0.7688,
+      "step": 405
+    },
+    {
+      "epoch": 0.7550644567219152,
+      "grad_norm": 0.7543025440325943,
+      "learning_rate": 3.4469707992174607e-06,
+      "loss": 0.7549,
+      "step": 410
+    },
+    {
+      "epoch": 0.7642725598526704,
+      "grad_norm": 0.7752230336855982,
+      "learning_rate": 3.207266611027069e-06,
+      "loss": 0.753,
+      "step": 415
+    },
+    {
+      "epoch": 0.7734806629834254,
+      "grad_norm": 0.7518906525746277,
+      "learning_rate": 2.97459974091831e-06,
+      "loss": 0.7282,
+      "step": 420
+    },
+    {
+      "epoch": 0.7826887661141805,
+      "grad_norm": 0.7680193007274547,
+      "learning_rate": 2.7492112333541744e-06,
+      "loss": 0.7627,
+      "step": 425
+    },
+    {
+      "epoch": 0.7918968692449355,
+      "grad_norm": 0.7566643318267062,
+      "learning_rate": 2.531334592364457e-06,
+      "loss": 0.751,
+      "step": 430
+    },
+    {
+      "epoch": 0.8011049723756906,
+      "grad_norm": 0.7844855003556812,
+      "learning_rate": 2.3211955396340003e-06,
+      "loss": 0.7418,
+      "step": 435
+    },
+    {
+      "epoch": 0.8103130755064457,
+      "grad_norm": 0.803025429826923,
+      "learning_rate": 2.1190117806534714e-06,
+      "loss": 0.7592,
+      "step": 440
+    },
+    {
+      "epoch": 0.8195211786372008,
+      "grad_norm": 0.7378091665334658,
+      "learning_rate": 1.924992779174999e-06,
+      "loss": 0.751,
+      "step": 445
+    },
+    {
+      "epoch": 0.8287292817679558,
+      "grad_norm": 0.7457306258807127,
+      "learning_rate": 1.7393395402063085e-06,
+      "loss": 0.7453,
+      "step": 450
+    },
+    {
+      "epoch": 0.8379373848987108,
+      "grad_norm": 0.8022714604664063,
+      "learning_rate": 1.5622444017681438e-06,
+      "loss": 0.7754,
+      "step": 455
+    },
+    {
+      "epoch": 0.8471454880294659,
+      "grad_norm": 0.776511111974054,
+      "learning_rate": 1.3938908356307846e-06,
+      "loss": 0.7521,
+      "step": 460
+    },
+    {
+      "epoch": 0.856353591160221,
+      "grad_norm": 0.813230099626258,
+      "learning_rate": 1.2344532572360325e-06,
+      "loss": 0.7613,
+      "step": 465
+    },
+    {
+      "epoch": 0.8655616942909761,
+      "grad_norm": 0.7447985437221388,
+      "learning_rate": 1.0840968450016276e-06,
+      "loss": 0.7536,
+      "step": 470
+    },
+    {
+      "epoch": 0.8747697974217311,
+      "grad_norm": 0.7650288847018589,
+      "learning_rate": 9.42977369195286e-07,
+      "loss": 0.7384,
+      "step": 475
+    },
+    {
+      "epoch": 0.8839779005524862,
+      "grad_norm": 0.7256984697699458,
+      "learning_rate": 8.112410305556307e-07,
+      "loss": 0.7667,
+      "step": 480
+    },
+    {
+      "epoch": 0.8931860036832413,
+      "grad_norm": 0.7357960253082361,
+      "learning_rate": 6.890243088272453e-07,
+      "loss": 0.7622,
+      "step": 485
+    },
+    {
+      "epoch": 0.9023941068139963,
+      "grad_norm": 0.7811263789047292,
+      "learning_rate": 5.764538213667103e-07,
+      "loss": 0.7452,
+      "step": 490
+    },
+    {
+      "epoch": 0.9116022099447514,
+      "grad_norm": 0.739936412399576,
+      "learning_rate": 4.73646191966175e-07,
+      "loss": 0.7562,
+      "step": 495
+    },
+    {
+      "epoch": 0.9208103130755064,
+      "grad_norm": 0.723301898239526,
+      "learning_rate": 3.8070793003030296e-07,
+      "loss": 0.7391,
+      "step": 500
+    },
+    {
+      "epoch": 0.9300184162062615,
+      "grad_norm": 0.7573677757358993,
+      "learning_rate": 2.9773532023180897e-07,
+      "loss": 0.7652,
+      "step": 505
+    },
+    {
+      "epoch": 0.9392265193370166,
+      "grad_norm": 0.8052929969157859,
+      "learning_rate": 2.248143227598809e-07,
+      "loss": 0.7637,
+      "step": 510
+    },
+    {
+      "epoch": 0.9484346224677717,
+      "grad_norm": 0.764474777249725,
+      "learning_rate": 1.6202048426483652e-07,
+      "loss": 0.7541,
+      "step": 515
+    },
+    {
+      "epoch": 0.9576427255985267,
+      "grad_norm": 0.7342491064890043,
+      "learning_rate": 1.094188595912804e-07,
+      "loss": 0.7498,
+      "step": 520
+    },
+    {
+      "epoch": 0.9668508287292817,
+      "grad_norm": 0.7649422354591825,
+      "learning_rate": 6.706394438083962e-08,
+      "loss": 0.7451,
+      "step": 525
+    },
+    {
+      "epoch": 0.9760589318600368,
+      "grad_norm": 0.7902374858951695,
+      "learning_rate": 3.4999618614309784e-08,
+      "loss": 0.7636,
+      "step": 530
+    },
+    {
+      "epoch": 0.9852670349907919,
+      "grad_norm": 0.790427562884146,
+      "learning_rate": 1.325910115169471e-08,
+      "loss": 0.7452,
+      "step": 535
+    },
+    {
+      "epoch": 0.994475138121547,
+      "grad_norm": 0.7278103543057389,
+      "learning_rate": 1.8649153172423106e-09,
+      "loss": 0.7509,
+      "step": 540
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.7582516074180603,
+      "eval_runtime": 7.3708,
+      "eval_samples_per_second": 67.835,
+      "eval_steps_per_second": 2.171,
+      "step": 543
+    },
+    {
+      "epoch": 1.0,
+      "step": 543,
+      "total_flos": 49305877217280.0,
+      "train_loss": 0.7919497733616697,
+      "train_runtime": 3542.7729,
+      "train_samples_per_second": 19.617,
+      "train_steps_per_second": 0.153
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 543,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 49305877217280.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}