Model save

Browse files

Files changed (6) hide show

README.md +67 -0
adapter_model.safetensors +1 -1
all_results.json +9 -0
runs/Jul30_14-25-22_illidan.cs.ucla.edu/events.out.tfevents.1722375545.illidan.cs.ucla.edu.4110553.0 +2 -2
train_results.json +9 -0
trainer_state.json +2089 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+license: llama2
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Llama-2-7b-hf
+model-index:
+- name: llama-2-qlora-vicuna-processed-indicator-0.6
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# llama-2-qlora-vicuna-processed-indicator-0.6
+This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). The training dataset was not recorded when this card was generated automatically, so it is not named here.
+It achieves the following results on the evaluation set:
+- Loss: 0.7582
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.7995        | 0.9998 | 1463 | 0.7582          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.40.1
+- Pytorch 2.3.0+cu121
+- Datasets 2.19.0
+- Tokenizers 0.19.1

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9c798a44ad3e4c38ff0bcead3843b1a8dac67fca3f66754b110454c1b34bb2a
 size 604336032

 version https://git-lfs.github.com/spec/v1
+oid sha256:1ac24f0b335e497e29061b264a9f226e0b723ef174c9da5b53de6920808c6eca
 size 604336032

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9998291474457544,
+    "total_flos": 2.720070830863155e+16,
+    "train_loss": 0.8341796821873235,
+    "train_runtime": 76059.1967,
+    "train_samples": 93645,
+    "train_samples_per_second": 1.231,
+    "train_steps_per_second": 0.019
+}

runs/Jul30_14-25-22_illidan.cs.ucla.edu/events.out.tfevents.1722375545.illidan.cs.ucla.edu.4110553.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86f28091f0f5ac396ae52b91af18555371ada0304a5d624b88f8fdeda2678983
-size 64182

 version https://git-lfs.github.com/spec/v1
+oid sha256:9f7861d6ed79ec1d462f274d70011b04a310d0bec8be149b723fd98274265be0
+size 67339

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9998291474457544,
+    "total_flos": 2.720070830863155e+16,
+    "train_loss": 0.8341796821873235,
+    "train_runtime": 76059.1967,
+    "train_samples": 93645,
+    "train_samples_per_second": 1.231,
+    "train_steps_per_second": 0.019
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2089 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9998291474457544,
+  "eval_steps": 500,
+  "global_step": 1463,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0006834102169827439,
+      "grad_norm": 1.2020306596069663,
+      "learning_rate": 1.360544217687075e-06,
+      "loss": 2.3228,
+      "step": 1
+    },
+    {
+      "epoch": 0.0034170510849137197,
+      "grad_norm": 1.3355474092979494,
+      "learning_rate": 6.802721088435375e-06,
+      "loss": 2.0724,
+      "step": 5
+    },
+    {
+      "epoch": 0.006834102169827439,
+      "grad_norm": 0.9273189715373761,
+      "learning_rate": 1.360544217687075e-05,
+      "loss": 1.9699,
+      "step": 10
+    },
+    {
+      "epoch": 0.010251153254741158,
+      "grad_norm": 1.3295103616049666,
+      "learning_rate": 2.0408163265306123e-05,
+      "loss": 1.8654,
+      "step": 15
+    },
+    {
+      "epoch": 0.013668204339654879,
+      "grad_norm": 1.2009437077024674,
+      "learning_rate": 2.72108843537415e-05,
+      "loss": 1.9025,
+      "step": 20
+    },
+    {
+      "epoch": 0.017085255424568596,
+      "grad_norm": 1.3726855590601976,
+      "learning_rate": 3.401360544217687e-05,
+      "loss": 1.7878,
+      "step": 25
+    },
+    {
+      "epoch": 0.020502306509482315,
+      "grad_norm": 2.657045849796256,
+      "learning_rate": 4.0816326530612245e-05,
+      "loss": 1.6113,
+      "step": 30
+    },
+    {
+      "epoch": 0.023919357594396035,
+      "grad_norm": 0.9478658789778001,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 1.4247,
+      "step": 35
+    },
+    {
+      "epoch": 0.027336408679309757,
+      "grad_norm": 0.6332212749744585,
+      "learning_rate": 5.4421768707483e-05,
+      "loss": 1.2716,
+      "step": 40
+    },
+    {
+      "epoch": 0.030753459764223477,
+      "grad_norm": 0.7179118855118389,
+      "learning_rate": 6.122448979591838e-05,
+      "loss": 1.1143,
+      "step": 45
+    },
+    {
+      "epoch": 0.03417051084913719,
+      "grad_norm": 0.2701049312098903,
+      "learning_rate": 6.802721088435374e-05,
+      "loss": 1.0379,
+      "step": 50
+    },
+    {
+      "epoch": 0.03758756193405091,
+      "grad_norm": 0.3332688283691693,
+      "learning_rate": 7.482993197278913e-05,
+      "loss": 0.9675,
+      "step": 55
+    },
+    {
+      "epoch": 0.04100461301896463,
+      "grad_norm": 0.18106807415672466,
+      "learning_rate": 8.163265306122449e-05,
+      "loss": 0.9228,
+      "step": 60
+    },
+    {
+      "epoch": 0.04442166410387835,
+      "grad_norm": 0.10994265965320522,
+      "learning_rate": 8.843537414965987e-05,
+      "loss": 0.9031,
+      "step": 65
+    },
+    {
+      "epoch": 0.04783871518879207,
+      "grad_norm": 0.091102677734405,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.8894,
+      "step": 70
+    },
+    {
+      "epoch": 0.051255766273705795,
+      "grad_norm": 0.08026180351235926,
+      "learning_rate": 0.00010204081632653062,
+      "loss": 0.9169,
+      "step": 75
+    },
+    {
+      "epoch": 0.054672817358619515,
+      "grad_norm": 0.11831157299648261,
+      "learning_rate": 0.000108843537414966,
+      "loss": 0.8814,
+      "step": 80
+    },
+    {
+      "epoch": 0.058089868443533234,
+      "grad_norm": 0.06914745377313133,
+      "learning_rate": 0.00011564625850340137,
+      "loss": 0.8715,
+      "step": 85
+    },
+    {
+      "epoch": 0.06150691952844695,
+      "grad_norm": 0.13920764271687064,
+      "learning_rate": 0.00012244897959183676,
+      "loss": 0.8216,
+      "step": 90
+    },
+    {
+      "epoch": 0.06492397061336067,
+      "grad_norm": 0.11535706946797097,
+      "learning_rate": 0.00012925170068027212,
+      "loss": 0.8672,
+      "step": 95
+    },
+    {
+      "epoch": 0.06834102169827438,
+      "grad_norm": 0.06263173224021892,
+      "learning_rate": 0.00013605442176870748,
+      "loss": 0.8932,
+      "step": 100
+    },
+    {
+      "epoch": 0.0717580727831881,
+      "grad_norm": 0.0635597195613304,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.8785,
+      "step": 105
+    },
+    {
+      "epoch": 0.07517512386810182,
+      "grad_norm": 0.0668356688357063,
+      "learning_rate": 0.00014965986394557826,
+      "loss": 0.8699,
+      "step": 110
+    },
+    {
+      "epoch": 0.07859217495301554,
+      "grad_norm": 0.21135761738053174,
+      "learning_rate": 0.00015646258503401362,
+      "loss": 0.854,
+      "step": 115
+    },
+    {
+      "epoch": 0.08200922603792926,
+      "grad_norm": 0.06434087208368008,
+      "learning_rate": 0.00016326530612244898,
+      "loss": 0.8176,
+      "step": 120
+    },
+    {
+      "epoch": 0.08542627712284298,
+      "grad_norm": 0.07781059944854175,
+      "learning_rate": 0.00017006802721088434,
+      "loss": 0.8484,
+      "step": 125
+    },
+    {
+      "epoch": 0.0888433282077567,
+      "grad_norm": 0.061240585932445024,
+      "learning_rate": 0.00017687074829931973,
+      "loss": 0.8268,
+      "step": 130
+    },
+    {
+      "epoch": 0.09226037929267042,
+      "grad_norm": 0.06103221480342776,
+      "learning_rate": 0.00018367346938775512,
+      "loss": 0.862,
+      "step": 135
+    },
+    {
+      "epoch": 0.09567743037758414,
+      "grad_norm": 0.06475817648590411,
+      "learning_rate": 0.00019047619047619048,
+      "loss": 0.8348,
+      "step": 140
+    },
+    {
+      "epoch": 0.09909448146249786,
+      "grad_norm": 0.05121427902237011,
+      "learning_rate": 0.00019727891156462587,
+      "loss": 0.8489,
+      "step": 145
+    },
+    {
+      "epoch": 0.10251153254741159,
+      "grad_norm": 0.04622680230570971,
+      "learning_rate": 0.00019999743552408648,
+      "loss": 0.8033,
+      "step": 150
+    },
+    {
+      "epoch": 0.10592858363232531,
+      "grad_norm": 0.04756612785734163,
+      "learning_rate": 0.00019998176420316002,
+      "loss": 0.8156,
+      "step": 155
+    },
+    {
+      "epoch": 0.10934563471723903,
+      "grad_norm": 0.053273317057135,
+      "learning_rate": 0.00019995184850013148,
+      "loss": 0.8312,
+      "step": 160
+    },
+    {
+      "epoch": 0.11276268580215275,
+      "grad_norm": 0.05329549631085044,
+      "learning_rate": 0.00019990769267708516,
+      "loss": 0.8369,
+      "step": 165
+    },
+    {
+      "epoch": 0.11617973688706647,
+      "grad_norm": 0.04820617549260813,
+      "learning_rate": 0.0001998493030248924,
+      "loss": 0.7983,
+      "step": 170
+    },
+    {
+      "epoch": 0.11959678797198019,
+      "grad_norm": 0.057369974131407014,
+      "learning_rate": 0.00019977668786231534,
+      "loss": 0.8437,
+      "step": 175
+    },
+    {
+      "epoch": 0.1230138390568939,
+      "grad_norm": 0.04809057102621255,
+      "learning_rate": 0.0001996898575348218,
+      "loss": 0.7988,
+      "step": 180
+    },
+    {
+      "epoch": 0.1264308901418076,
+      "grad_norm": 0.05206514813436257,
+      "learning_rate": 0.00019958882441311126,
+      "loss": 0.8653,
+      "step": 185
+    },
+    {
+      "epoch": 0.12984794122672133,
+      "grad_norm": 0.05340020667776524,
+      "learning_rate": 0.0001994736028913526,
+      "loss": 0.8169,
+      "step": 190
+    },
+    {
+      "epoch": 0.13326499231163505,
+      "grad_norm": 0.055601924920779144,
+      "learning_rate": 0.00019934420938513313,
+      "loss": 0.8532,
+      "step": 195
+    },
+    {
+      "epoch": 0.13668204339654877,
+      "grad_norm": 0.049961377636725916,
+      "learning_rate": 0.00019920066232911992,
+      "loss": 0.8551,
+      "step": 200
+    },
+    {
+      "epoch": 0.1400990944814625,
+      "grad_norm": 0.06440486804586033,
+      "learning_rate": 0.00019904298217443366,
+      "loss": 0.7731,
+      "step": 205
+    },
+    {
+      "epoch": 0.1435161455663762,
+      "grad_norm": 0.04520767268171852,
+      "learning_rate": 0.0001988711913857346,
+      "loss": 0.8517,
+      "step": 210
+    },
+    {
+      "epoch": 0.14693319665128993,
+      "grad_norm": 0.0593580261290197,
+      "learning_rate": 0.0001986853144380224,
+      "loss": 0.8656,
+      "step": 215
+    },
+    {
+      "epoch": 0.15035024773620365,
+      "grad_norm": 0.042745224707827656,
+      "learning_rate": 0.00019848537781314883,
+      "loss": 0.8116,
+      "step": 220
+    },
+    {
+      "epoch": 0.15376729882111737,
+      "grad_norm": 0.0460299850700426,
+      "learning_rate": 0.0001982714099960452,
+      "loss": 0.853,
+      "step": 225
+    },
+    {
+      "epoch": 0.15718434990603108,
+      "grad_norm": 0.052905877151890414,
+      "learning_rate": 0.00019804344147066397,
+      "loss": 0.8467,
+      "step": 230
+    },
+    {
+      "epoch": 0.1606014009909448,
+      "grad_norm": 0.05229076173383141,
+      "learning_rate": 0.00019780150471563558,
+      "loss": 0.8689,
+      "step": 235
+    },
+    {
+      "epoch": 0.16401845207585852,
+      "grad_norm": 0.05176820659932698,
+      "learning_rate": 0.00019754563419964165,
+      "loss": 0.8173,
+      "step": 240
+    },
+    {
+      "epoch": 0.16743550316077224,
+      "grad_norm": 0.04367228208575328,
+      "learning_rate": 0.00019727586637650373,
+      "loss": 0.8102,
+      "step": 245
+    },
+    {
+      "epoch": 0.17085255424568596,
+      "grad_norm": 0.05321184874347303,
+      "learning_rate": 0.00019699223967999005,
+      "loss": 0.8349,
+      "step": 250
+    },
+    {
+      "epoch": 0.17426960533059968,
+      "grad_norm": 0.04834762129215642,
+      "learning_rate": 0.00019669479451833976,
+      "loss": 0.7996,
+      "step": 255
+    },
+    {
+      "epoch": 0.1776866564155134,
+      "grad_norm": 0.05613807628331564,
+      "learning_rate": 0.00019638357326850602,
+      "loss": 0.8214,
+      "step": 260
+    },
+    {
+      "epoch": 0.18110370750042712,
+      "grad_norm": 0.04883314227497837,
+      "learning_rate": 0.00019605862027011856,
+      "loss": 0.8212,
+      "step": 265
+    },
+    {
+      "epoch": 0.18452075858534084,
+      "grad_norm": 0.050484613376210824,
+      "learning_rate": 0.0001957199818191665,
+      "loss": 0.8183,
+      "step": 270
+    },
+    {
+      "epoch": 0.18793780967025456,
+      "grad_norm": 0.05357806995093009,
+      "learning_rate": 0.00019536770616140276,
+      "loss": 0.7989,
+      "step": 275
+    },
+    {
+      "epoch": 0.19135486075516828,
+      "grad_norm": 0.04129143018621023,
+      "learning_rate": 0.00019500184348547042,
+      "loss": 0.7925,
+      "step": 280
+    },
+    {
+      "epoch": 0.194771911840082,
+      "grad_norm": 0.04085892932038951,
+      "learning_rate": 0.00019462244591575222,
+      "loss": 0.8475,
+      "step": 285
+    },
+    {
+      "epoch": 0.19818896292499572,
+      "grad_norm": 0.06675683827394115,
+      "learning_rate": 0.00019422956750494473,
+      "loss": 0.7807,
+      "step": 290
+    },
+    {
+      "epoch": 0.20160601400990943,
+      "grad_norm": 0.05536342161943553,
+      "learning_rate": 0.00019382326422635705,
+      "loss": 0.8386,
+      "step": 295
+    },
+    {
+      "epoch": 0.20502306509482318,
+      "grad_norm": 0.04899666779274121,
+      "learning_rate": 0.00019340359396593672,
+      "loss": 0.804,
+      "step": 300
+    },
+    {
+      "epoch": 0.2084401161797369,
+      "grad_norm": 0.05443653155294473,
+      "learning_rate": 0.00019297061651402236,
+      "loss": 0.8228,
+      "step": 305
+    },
+    {
+      "epoch": 0.21185716726465062,
+      "grad_norm": 0.04672292134290588,
+      "learning_rate": 0.00019252439355682573,
+      "loss": 0.7986,
+      "step": 310
+    },
+    {
+      "epoch": 0.21527421834956434,
+      "grad_norm": 0.05171464516573939,
+      "learning_rate": 0.00019206498866764288,
+      "loss": 0.8158,
+      "step": 315
+    },
+    {
+      "epoch": 0.21869126943447806,
+      "grad_norm": 0.04153138083699566,
+      "learning_rate": 0.00019159246729779725,
+      "loss": 0.7744,
+      "step": 320
+    },
+    {
+      "epoch": 0.22210832051939178,
+      "grad_norm": 0.0555673656216969,
+      "learning_rate": 0.00019110689676731454,
+      "loss": 0.8087,
+      "step": 325
+    },
+    {
+      "epoch": 0.2255253716043055,
+      "grad_norm": 0.044931623696595056,
+      "learning_rate": 0.00019060834625533196,
+      "loss": 0.856,
+      "step": 330
+    },
+    {
+      "epoch": 0.22894242268921922,
+      "grad_norm": 0.049318573438951276,
+      "learning_rate": 0.0001900968867902419,
+      "loss": 0.8035,
+      "step": 335
+    },
+    {
+      "epoch": 0.23235947377413294,
+      "grad_norm": 0.061664060206440885,
+      "learning_rate": 0.00018957259123957295,
+      "loss": 0.8527,
+      "step": 340
+    },
+    {
+      "epoch": 0.23577652485904665,
+      "grad_norm": 0.05596265390941409,
+      "learning_rate": 0.00018903553429960802,
+      "loss": 0.8479,
+      "step": 345
+    },
+    {
+      "epoch": 0.23919357594396037,
+      "grad_norm": 0.04165765621122209,
+      "learning_rate": 0.00018848579248474288,
+      "loss": 0.8174,
+      "step": 350
+    },
+    {
+      "epoch": 0.2426106270288741,
+      "grad_norm": 0.044612085840488855,
+      "learning_rate": 0.00018792344411658468,
+      "loss": 0.7853,
+      "step": 355
+    },
+    {
+      "epoch": 0.2460276781137878,
+      "grad_norm": 0.05075354589883105,
+      "learning_rate": 0.0001873485693127939,
+      "loss": 0.8029,
+      "step": 360
+    },
+    {
+      "epoch": 0.24944472919870153,
+      "grad_norm": 0.04667928330485948,
+      "learning_rate": 0.0001867612499756697,
+      "loss": 0.8143,
+      "step": 365
+    },
+    {
+      "epoch": 0.2528617802836152,
+      "grad_norm": 0.047551720416092784,
+      "learning_rate": 0.00018616156978048146,
+      "loss": 0.8186,
+      "step": 370
+    },
+    {
+      "epoch": 0.25627883136852897,
+      "grad_norm": 0.05128991713009436,
+      "learning_rate": 0.0001855496141635476,
+      "loss": 0.8193,
+      "step": 375
+    },
+    {
+      "epoch": 0.25969588245344266,
+      "grad_norm": 0.05752820484101754,
+      "learning_rate": 0.00018492547031006336,
+      "loss": 0.863,
+      "step": 380
+    },
+    {
+      "epoch": 0.2631129335383564,
+      "grad_norm": 0.04668740520192892,
+      "learning_rate": 0.0001842892271416797,
+      "loss": 0.7979,
+      "step": 385
+    },
+    {
+      "epoch": 0.2665299846232701,
+      "grad_norm": 0.050727440707260214,
+      "learning_rate": 0.00018364097530383457,
+      "loss": 0.8447,
+      "step": 390
+    },
+    {
+      "epoch": 0.26994703570818385,
+      "grad_norm": 0.048252979993068,
+      "learning_rate": 0.00018298080715283858,
+      "loss": 0.7944,
+      "step": 395
+    },
+    {
+      "epoch": 0.27336408679309754,
+      "grad_norm": 0.0476226694515863,
+      "learning_rate": 0.00018230881674271722,
+      "loss": 0.7988,
+      "step": 400
+    },
+    {
+      "epoch": 0.2767811378780113,
+      "grad_norm": 0.04795689299493362,
+      "learning_rate": 0.00018162509981181084,
+      "loss": 0.8044,
+      "step": 405
+    },
+    {
+      "epoch": 0.280198188962925,
+      "grad_norm": 0.05352148295316883,
+      "learning_rate": 0.0001809297537691349,
+      "loss": 0.8272,
+      "step": 410
+    },
+    {
+      "epoch": 0.2836152400478387,
+      "grad_norm": 0.045370780069930124,
+      "learning_rate": 0.00018022287768050202,
+      "loss": 0.8111,
+      "step": 415
+    },
+    {
+      "epoch": 0.2870322911327524,
+      "grad_norm": 0.0444718008442191,
+      "learning_rate": 0.0001795045722544083,
+      "loss": 0.8097,
+      "step": 420
+    },
+    {
+      "epoch": 0.29044934221766616,
+      "grad_norm": 0.04429513957599892,
+      "learning_rate": 0.00017877493982768527,
+      "loss": 0.7924,
+      "step": 425
+    },
+    {
+      "epoch": 0.29386639330257985,
+      "grad_norm": 0.047432068029914336,
+      "learning_rate": 0.0001780340843509199,
+      "loss": 0.7866,
+      "step": 430
+    },
+    {
+      "epoch": 0.2972834443874936,
+      "grad_norm": 0.049740183816650754,
+      "learning_rate": 0.00017728211137364489,
+      "loss": 0.7852,
+      "step": 435
+    },
+    {
+      "epoch": 0.3007004954724073,
+      "grad_norm": 0.04924473396769746,
+      "learning_rate": 0.00017651912802930112,
+      "loss": 0.8293,
+      "step": 440
+    },
+    {
+      "epoch": 0.30411754655732104,
+      "grad_norm": 0.057228556129132115,
+      "learning_rate": 0.00017574524301997423,
+      "loss": 0.8132,
+      "step": 445
+    },
+    {
+      "epoch": 0.30753459764223473,
+      "grad_norm": 0.053526590918154204,
+      "learning_rate": 0.0001749605666009079,
+      "loss": 0.7762,
+      "step": 450
+    },
+    {
+      "epoch": 0.3109516487271485,
+      "grad_norm": 0.050268696392289904,
+      "learning_rate": 0.00017416521056479577,
+      "loss": 0.8249,
+      "step": 455
+    },
+    {
+      "epoch": 0.31436869981206217,
+      "grad_norm": 0.04766777047847375,
+      "learning_rate": 0.00017335928822585447,
+      "loss": 0.799,
+      "step": 460
+    },
+    {
+      "epoch": 0.3177857508969759,
+      "grad_norm": 0.044354607610623285,
+      "learning_rate": 0.00017254291440367968,
+      "loss": 0.838,
+      "step": 465
+    },
+    {
+      "epoch": 0.3212028019818896,
+      "grad_norm": 0.046906502149708616,
+      "learning_rate": 0.00017171620540688782,
+      "loss": 0.8276,
+      "step": 470
+    },
+    {
+      "epoch": 0.32461985306680335,
+      "grad_norm": 0.05146499212308855,
+      "learning_rate": 0.00017087927901654557,
+      "loss": 0.8168,
+      "step": 475
+    },
+    {
+      "epoch": 0.32803690415171705,
+      "grad_norm": 0.047105565138273846,
+      "learning_rate": 0.00017003225446938965,
+      "loss": 0.815,
+      "step": 480
+    },
+    {
+      "epoch": 0.3314539552366308,
+      "grad_norm": 0.05499606447911948,
+      "learning_rate": 0.00016917525244083918,
+      "loss": 0.79,
+      "step": 485
+    },
+    {
+      "epoch": 0.3348710063215445,
+      "grad_norm": 0.0419744220807428,
+      "learning_rate": 0.0001683083950278031,
+      "loss": 0.8005,
+      "step": 490
+    },
+    {
+      "epoch": 0.33828805740645823,
+      "grad_norm": 0.049938428892942886,
+      "learning_rate": 0.00016743180573128495,
+      "loss": 0.7954,
+      "step": 495
+    },
+    {
+      "epoch": 0.3417051084913719,
+      "grad_norm": 0.046951024104632456,
+      "learning_rate": 0.00016654560943878783,
+      "loss": 0.8447,
+      "step": 500
+    },
+    {
+      "epoch": 0.34512215957628567,
+      "grad_norm": 0.05271620074783484,
+      "learning_rate": 0.0001656499324065217,
+      "loss": 0.8019,
+      "step": 505
+    },
+    {
+      "epoch": 0.34853921066119936,
+      "grad_norm": 0.0471240782044328,
+      "learning_rate": 0.0001647449022414155,
+      "loss": 0.8244,
+      "step": 510
+    },
+    {
+      "epoch": 0.3519562617461131,
+      "grad_norm": 0.05807042014433393,
+      "learning_rate": 0.0001638306478829373,
+      "loss": 0.8177,
+      "step": 515
+    },
+    {
+      "epoch": 0.3553733128310268,
+      "grad_norm": 0.05008851116233917,
+      "learning_rate": 0.000162907299584724,
+      "loss": 0.8101,
+      "step": 520
+    },
+    {
+      "epoch": 0.35879036391594055,
+      "grad_norm": 0.05138344221543279,
+      "learning_rate": 0.00016197498889602448,
+      "loss": 0.7888,
+      "step": 525
+    },
+    {
+      "epoch": 0.36220741500085424,
+      "grad_norm": 0.06306896659470204,
+      "learning_rate": 0.0001610338486429575,
+      "loss": 0.8229,
+      "step": 530
+    },
+    {
+      "epoch": 0.365624466085768,
+      "grad_norm": 0.04978065788218456,
+      "learning_rate": 0.00016008401290958807,
+      "loss": 0.766,
+      "step": 535
+    },
+    {
+      "epoch": 0.3690415171706817,
+      "grad_norm": 0.053225124642096215,
+      "learning_rate": 0.00015912561701882463,
+      "loss": 0.8059,
+      "step": 540
+    },
+    {
+      "epoch": 0.3724585682555954,
+      "grad_norm": 0.06456086867109719,
+      "learning_rate": 0.00015815879751313955,
+      "loss": 0.7995,
+      "step": 545
+    },
+    {
+      "epoch": 0.3758756193405091,
+      "grad_norm": 0.05173010381007458,
+      "learning_rate": 0.00015718369213511585,
+      "loss": 0.8098,
+      "step": 550
+    },
+    {
+      "epoch": 0.37929267042542286,
+      "grad_norm": 0.04825429265824465,
+      "learning_rate": 0.00015620043980782327,
+      "loss": 0.8022,
+      "step": 555
+    },
+    {
+      "epoch": 0.38270972151033655,
+      "grad_norm": 0.04794004820515472,
+      "learning_rate": 0.00015520918061502569,
+      "loss": 0.8245,
+      "step": 560
+    },
+    {
+      "epoch": 0.3861267725952503,
+      "grad_norm": 0.045386997453962415,
+      "learning_rate": 0.00015421005578122356,
+      "loss": 0.7673,
+      "step": 565
+    },
+    {
+      "epoch": 0.389543823680164,
+      "grad_norm": 0.04809541737862576,
+      "learning_rate": 0.00015320320765153367,
+      "loss": 0.8501,
+      "step": 570
+    },
+    {
+      "epoch": 0.39296087476507774,
+      "grad_norm": 0.05241051596332237,
+      "learning_rate": 0.0001521887796714092,
+      "loss": 0.7942,
+      "step": 575
+    },
+    {
+      "epoch": 0.39637792584999143,
+      "grad_norm": 0.0435869638715258,
+      "learning_rate": 0.00015116691636620318,
+      "loss": 0.8019,
+      "step": 580
+    },
+    {
+      "epoch": 0.3997949769349052,
+      "grad_norm": 0.04978017776276555,
+      "learning_rate": 0.00015013776332057786,
+      "loss": 0.8177,
+      "step": 585
+    },
+    {
+      "epoch": 0.40321202801981887,
+      "grad_norm": 0.046469710298847465,
+      "learning_rate": 0.00014910146715776338,
+      "loss": 0.8312,
+      "step": 590
+    },
+    {
+      "epoch": 0.4066290791047326,
+      "grad_norm": 0.05396734895489411,
+      "learning_rate": 0.00014805817551866838,
+      "loss": 0.8062,
+      "step": 595
+    },
+    {
+      "epoch": 0.41004613018964636,
+      "grad_norm": 0.04459842219250921,
+      "learning_rate": 0.00014700803704084564,
+      "loss": 0.7739,
+      "step": 600
+    },
+    {
+      "epoch": 0.41346318127456005,
+      "grad_norm": 0.044911492362826454,
+      "learning_rate": 0.00014595120133731565,
+      "loss": 0.8243,
+      "step": 605
+    },
+    {
+      "epoch": 0.4168802323594738,
+      "grad_norm": 0.062259732471486105,
+      "learning_rate": 0.00014488781897525131,
+      "loss": 0.8035,
+      "step": 610
+    },
+    {
+      "epoch": 0.4202972834443875,
+      "grad_norm": 0.049059941472946,
+      "learning_rate": 0.0001438180414545267,
+      "loss": 0.7758,
+      "step": 615
+    },
+    {
+      "epoch": 0.42371433452930124,
+      "grad_norm": 0.05098968718552912,
+      "learning_rate": 0.00014274202118613294,
+      "loss": 0.8207,
+      "step": 620
+    },
+    {
+      "epoch": 0.42713138561421493,
+      "grad_norm": 0.0446401173743016,
+      "learning_rate": 0.00014165991147046403,
+      "loss": 0.7974,
+      "step": 625
+    },
+    {
+      "epoch": 0.4305484366991287,
+      "grad_norm": 0.05004490347786539,
+      "learning_rate": 0.0001405718664754764,
+      "loss": 0.7936,
+      "step": 630
+    },
+    {
+      "epoch": 0.43396548778404237,
+      "grad_norm": 0.04852978946452643,
+      "learning_rate": 0.0001394780412147245,
+      "loss": 0.801,
+      "step": 635
+    },
+    {
+      "epoch": 0.4373825388689561,
+      "grad_norm": 0.04806463749182831,
+      "learning_rate": 0.00013837859152527623,
+      "loss": 0.7788,
+      "step": 640
+    },
+    {
+      "epoch": 0.4407995899538698,
+      "grad_norm": 0.04862497331803918,
+      "learning_rate": 0.00013727367404551055,
+      "loss": 0.7825,
+      "step": 645
+    },
+    {
+      "epoch": 0.44421664103878356,
+      "grad_norm": 0.049862937300937236,
+      "learning_rate": 0.00013616344619280156,
+      "loss": 0.7849,
+      "step": 650
+    },
+    {
+      "epoch": 0.44763369212369725,
+      "grad_norm": 0.045510232942769194,
+      "learning_rate": 0.00013504806614109098,
+      "loss": 0.7961,
+      "step": 655
+    },
+    {
+      "epoch": 0.451050743208611,
+      "grad_norm": 0.05369028256273588,
+      "learning_rate": 0.00013392769279835354,
+      "loss": 0.7573,
+      "step": 660
+    },
+    {
+      "epoch": 0.4544677942935247,
+      "grad_norm": 0.05026656335694924,
+      "learning_rate": 0.0001328024857839569,
+      "loss": 0.7828,
+      "step": 665
+    },
+    {
+      "epoch": 0.45788484537843843,
+      "grad_norm": 0.04857535606631572,
+      "learning_rate": 0.00013167260540592114,
+      "loss": 0.7593,
+      "step": 670
+    },
+    {
+      "epoch": 0.4613018964633521,
+      "grad_norm": 0.048520728276942086,
+      "learning_rate": 0.00013053821263807946,
+      "loss": 0.7675,
+      "step": 675
+    },
+    {
+      "epoch": 0.46471894754826587,
+      "grad_norm": 0.04667501054011369,
+      "learning_rate": 0.00012939946909714433,
+      "loss": 0.8009,
+      "step": 680
+    },
+    {
+      "epoch": 0.46813599863317956,
+      "grad_norm": 0.057240522592767185,
+      "learning_rate": 0.000128256537019682,
+      "loss": 0.8031,
+      "step": 685
+    },
+    {
+      "epoch": 0.4715530497180933,
+      "grad_norm": 0.052631048423238694,
+      "learning_rate": 0.0001271095792389987,
+      "loss": 0.7996,
+      "step": 690
+    },
+    {
+      "epoch": 0.474970100803007,
+      "grad_norm": 0.04746921614132779,
+      "learning_rate": 0.00012595875916194188,
+      "loss": 0.7814,
+      "step": 695
+    },
+    {
+      "epoch": 0.47838715188792075,
+      "grad_norm": 0.05533339114072001,
+      "learning_rate": 0.00012480424074561933,
+      "loss": 0.7658,
+      "step": 700
+    },
+    {
+      "epoch": 0.48180420297283444,
+      "grad_norm": 0.04827072377712223,
+      "learning_rate": 0.0001236461884740409,
+      "loss": 0.7643,
+      "step": 705
+    },
+    {
+      "epoch": 0.4852212540577482,
+      "grad_norm": 0.04645041541707932,
+      "learning_rate": 0.00012248476733468368,
+      "loss": 0.8182,
+      "step": 710
+    },
+    {
+      "epoch": 0.4886383051426619,
+      "grad_norm": 0.05525909772402828,
+      "learning_rate": 0.00012132014279498703,
+      "loss": 0.8095,
+      "step": 715
+    },
+    {
+      "epoch": 0.4920553562275756,
+      "grad_norm": 0.055659491763221325,
+      "learning_rate": 0.0001201524807787779,
+      "loss": 0.786,
+      "step": 720
+    },
+    {
+      "epoch": 0.4954724073124893,
+      "grad_norm": 0.05673553372313532,
+      "learning_rate": 0.00011898194764263197,
+      "loss": 0.8332,
+      "step": 725
+    },
+    {
+      "epoch": 0.49888945839740306,
+      "grad_norm": 0.05515832759906232,
+      "learning_rate": 0.0001178087101521729,
+      "loss": 0.7668,
+      "step": 730
+    },
+    {
+      "epoch": 0.5023065094823168,
+      "grad_norm": 0.05901869759826189,
+      "learning_rate": 0.00011663293545831302,
+      "loss": 0.7922,
+      "step": 735
+    },
+    {
+      "epoch": 0.5057235605672304,
+      "grad_norm": 0.057677078146364885,
+      "learning_rate": 0.00011545479107343963,
+      "loss": 0.7849,
+      "step": 740
+    },
+    {
+      "epoch": 0.5091406116521442,
+      "grad_norm": 0.05124349837241442,
+      "learning_rate": 0.0001142744448475494,
+      "loss": 0.8001,
+      "step": 745
+    },
+    {
+      "epoch": 0.5125576627370579,
+      "grad_norm": 0.05373639047755876,
+      "learning_rate": 0.00011309206494433488,
+      "loss": 0.8011,
+      "step": 750
+    },
+    {
+      "epoch": 0.5159747138219717,
+      "grad_norm": 0.052106174258665414,
+      "learning_rate": 0.00011190781981722623,
+      "loss": 0.8019,
+      "step": 755
+    },
+    {
+      "epoch": 0.5193917649068853,
+      "grad_norm": 0.05720659004653996,
+      "learning_rate": 0.00011072187818539159,
+      "loss": 0.7834,
+      "step": 760
+    },
+    {
+      "epoch": 0.5228088159917991,
+      "grad_norm": 0.04968260034464468,
+      "learning_rate": 0.00010953440900969994,
+      "loss": 0.763,
+      "step": 765
+    },
+    {
+      "epoch": 0.5262258670767128,
+      "grad_norm": 0.051174734080506955,
+      "learning_rate": 0.000108345581468649,
+      "loss": 0.7542,
+      "step": 770
+    },
+    {
+      "epoch": 0.5296429181616266,
+      "grad_norm": 0.05202154641165268,
+      "learning_rate": 0.00010715556493426262,
+      "loss": 0.8402,
+      "step": 775
+    },
+    {
+      "epoch": 0.5330599692465402,
+      "grad_norm": 0.05276394171114056,
+      "learning_rate": 0.00010596452894796028,
+      "loss": 0.7978,
+      "step": 780
+    },
+    {
+      "epoch": 0.536477020331454,
+      "grad_norm": 0.05377226234936245,
+      "learning_rate": 0.00010477264319640252,
+      "loss": 0.7923,
+      "step": 785
+    },
+    {
+      "epoch": 0.5398940714163677,
+      "grad_norm": 0.05764668909046145,
+      "learning_rate": 0.00010358007748731582,
+      "loss": 0.7842,
+      "step": 790
+    },
+    {
+      "epoch": 0.5433111225012814,
+      "grad_norm": 0.05020673174055731,
+      "learning_rate": 0.00010238700172530009,
+      "loss": 0.8392,
+      "step": 795
+    },
+    {
+      "epoch": 0.5467281735861951,
+      "grad_norm": 0.062063862040811144,
+      "learning_rate": 0.00010119358588762232,
+      "loss": 0.8354,
+      "step": 800
+    },
+    {
+      "epoch": 0.5501452246711088,
+      "grad_norm": 0.052012383832107364,
+      "learning_rate": 0.0001,
+      "loss": 0.7635,
+      "step": 805
+    },
+    {
+      "epoch": 0.5535622757560226,
+      "grad_norm": 0.053730950289275915,
+      "learning_rate": 9.880641411237772e-05,
+      "loss": 0.7928,
+      "step": 810
+    },
+    {
+      "epoch": 0.5569793268409363,
+      "grad_norm": 0.052091960918167694,
+      "learning_rate": 9.761299827469992e-05,
+      "loss": 0.8637,
+      "step": 815
+    },
+    {
+      "epoch": 0.56039637792585,
+      "grad_norm": 0.06417204337115318,
+      "learning_rate": 9.641992251268419e-05,
+      "loss": 0.8059,
+      "step": 820
+    },
+    {
+      "epoch": 0.5638134290107637,
+      "grad_norm": 0.05147939412647928,
+      "learning_rate": 9.52273568035975e-05,
+      "loss": 0.7883,
+      "step": 825
+    },
+    {
+      "epoch": 0.5672304800956774,
+      "grad_norm": 0.0496321920196591,
+      "learning_rate": 9.403547105203974e-05,
+      "loss": 0.814,
+      "step": 830
+    },
+    {
+      "epoch": 0.5706475311805912,
+      "grad_norm": 0.05031872933033937,
+      "learning_rate": 9.28444350657374e-05,
+      "loss": 0.7816,
+      "step": 835
+    },
+    {
+      "epoch": 0.5740645822655048,
+      "grad_norm": 0.07016242369513874,
+      "learning_rate": 9.165441853135104e-05,
+      "loss": 0.8329,
+      "step": 840
+    },
+    {
+      "epoch": 0.5774816333504186,
+      "grad_norm": 0.04971424475300946,
+      "learning_rate": 9.046559099030012e-05,
+      "loss": 0.7799,
+      "step": 845
+    },
+    {
+      "epoch": 0.5808986844353323,
+      "grad_norm": 0.0522755462077452,
+      "learning_rate": 8.927812181460843e-05,
+      "loss": 0.7938,
+      "step": 850
+    },
+    {
+      "epoch": 0.5843157355202461,
+      "grad_norm": 0.0512047310154528,
+      "learning_rate": 8.809218018277378e-05,
+      "loss": 0.7986,
+      "step": 855
+    },
+    {
+      "epoch": 0.5877327866051597,
+      "grad_norm": 0.06380254072567644,
+      "learning_rate": 8.690793505566511e-05,
+      "loss": 0.7792,
+      "step": 860
+    },
+    {
+      "epoch": 0.5911498376900735,
+      "grad_norm": 0.051049981907943395,
+      "learning_rate": 8.57255551524506e-05,
+      "loss": 0.8093,
+      "step": 865
+    },
+    {
+      "epoch": 0.5945668887749872,
+      "grad_norm": 0.05340993003962982,
+      "learning_rate": 8.454520892656038e-05,
+      "loss": 0.7856,
+      "step": 870
+    },
+    {
+      "epoch": 0.597983939859901,
+      "grad_norm": 0.058549604896924366,
+      "learning_rate": 8.336706454168701e-05,
+      "loss": 0.7921,
+      "step": 875
+    },
+    {
+      "epoch": 0.6014009909448146,
+      "grad_norm": 0.05465665880133671,
+      "learning_rate": 8.219128984782712e-05,
+      "loss": 0.775,
+      "step": 880
+    },
+    {
+      "epoch": 0.6048180420297283,
+      "grad_norm": 0.055556269694792465,
+      "learning_rate": 8.101805235736804e-05,
+      "loss": 0.8248,
+      "step": 885
+    },
+    {
+      "epoch": 0.6082350931146421,
+      "grad_norm": 0.05456060311041398,
+      "learning_rate": 7.984751922122214e-05,
+      "loss": 0.7868,
+      "step": 890
+    },
+    {
+      "epoch": 0.6116521441995558,
+      "grad_norm": 0.045746939739603906,
+      "learning_rate": 7.867985720501301e-05,
+      "loss": 0.7543,
+      "step": 895
+    },
+    {
+      "epoch": 0.6150691952844695,
+      "grad_norm": 0.07588505104354382,
+      "learning_rate": 7.751523266531634e-05,
+      "loss": 0.8306,
+      "step": 900
+    },
+    {
+      "epoch": 0.6184862463693832,
+      "grad_norm": 0.07330950373801204,
+      "learning_rate": 7.635381152595915e-05,
+      "loss": 0.8365,
+      "step": 905
+    },
+    {
+      "epoch": 0.621903297454297,
+      "grad_norm": 0.050877734105124285,
+      "learning_rate": 7.519575925438067e-05,
+      "loss": 0.7798,
+      "step": 910
+    },
+    {
+      "epoch": 0.6253203485392107,
+      "grad_norm": 0.060652606963752305,
+      "learning_rate": 7.404124083805819e-05,
+      "loss": 0.7894,
+      "step": 915
+    },
+    {
+      "epoch": 0.6287373996241243,
+      "grad_norm": 0.051776470909645156,
+      "learning_rate": 7.289042076100132e-05,
+      "loss": 0.7573,
+      "step": 920
+    },
+    {
+      "epoch": 0.6321544507090381,
+      "grad_norm": 0.05310980558006848,
+      "learning_rate": 7.174346298031804e-05,
+      "loss": 0.7859,
+      "step": 925
+    },
+    {
+      "epoch": 0.6355715017939518,
+      "grad_norm": 0.05227069362568692,
+      "learning_rate": 7.060053090285572e-05,
+      "loss": 0.8129,
+      "step": 930
+    },
+    {
+      "epoch": 0.6389885528788656,
+      "grad_norm": 0.05951427084679483,
+      "learning_rate": 6.946178736192053e-05,
+      "loss": 0.7709,
+      "step": 935
+    },
+    {
+      "epoch": 0.6424056039637792,
+      "grad_norm": 0.051385803262068645,
+      "learning_rate": 6.832739459407885e-05,
+      "loss": 0.7958,
+      "step": 940
+    },
+    {
+      "epoch": 0.645822655048693,
+      "grad_norm": 0.0699197425530053,
+      "learning_rate": 6.719751421604309e-05,
+      "loss": 0.7725,
+      "step": 945
+    },
+    {
+      "epoch": 0.6492397061336067,
+      "grad_norm": 0.05971638114237985,
+      "learning_rate": 6.607230720164647e-05,
+      "loss": 0.8066,
+      "step": 950
+    },
+    {
+      "epoch": 0.6526567572185205,
+      "grad_norm": 0.061174773260503496,
+      "learning_rate": 6.495193385890901e-05,
+      "loss": 0.7676,
+      "step": 955
+    },
+    {
+      "epoch": 0.6560738083034341,
+      "grad_norm": 0.05624437053551885,
+      "learning_rate": 6.383655380719848e-05,
+      "loss": 0.7995,
+      "step": 960
+    },
+    {
+      "epoch": 0.6594908593883478,
+      "grad_norm": 0.0506865636845252,
+      "learning_rate": 6.272632595448947e-05,
+      "loss": 0.8109,
+      "step": 965
+    },
+    {
+      "epoch": 0.6629079104732616,
+      "grad_norm": 0.052397675907368615,
+      "learning_rate": 6.162140847472381e-05,
+      "loss": 0.805,
+      "step": 970
+    },
+    {
+      "epoch": 0.6663249615581753,
+      "grad_norm": 0.05888057649474354,
+      "learning_rate": 6.05219587852755e-05,
+      "loss": 0.7887,
+      "step": 975
+    },
+    {
+      "epoch": 0.669742012643089,
+      "grad_norm": 0.05311115757434341,
+      "learning_rate": 5.9428133524523646e-05,
+      "loss": 0.7787,
+      "step": 980
+    },
+    {
+      "epoch": 0.6731590637280027,
+      "grad_norm": 0.052096208789878605,
+      "learning_rate": 5.834008852953603e-05,
+      "loss": 0.794,
+      "step": 985
+    },
+    {
+      "epoch": 0.6765761148129165,
+      "grad_norm": 0.06038508504122752,
+      "learning_rate": 5.7257978813867094e-05,
+      "loss": 0.7965,
+      "step": 990
+    },
+    {
+      "epoch": 0.6799931658978302,
+      "grad_norm": 0.06296714469265219,
+      "learning_rate": 5.6181958545473325e-05,
+      "loss": 0.7931,
+      "step": 995
+    },
+    {
+      "epoch": 0.6834102169827438,
+      "grad_norm": 0.04915194607567942,
+      "learning_rate": 5.511218102474872e-05,
+      "loss": 0.8123,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6868272680676576,
+      "grad_norm": 0.06403272365242713,
+      "learning_rate": 5.4048798662684376e-05,
+      "loss": 0.7897,
+      "step": 1005
+    },
+    {
+      "epoch": 0.6902443191525713,
+      "grad_norm": 0.05295988577722599,
+      "learning_rate": 5.299196295915441e-05,
+      "loss": 0.8112,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6936613702374851,
+      "grad_norm": 0.052501801842771235,
+      "learning_rate": 5.1941824481331626e-05,
+      "loss": 0.7997,
+      "step": 1015
+    },
+    {
+      "epoch": 0.6970784213223987,
+      "grad_norm": 0.05826194574882407,
+      "learning_rate": 5.089853284223667e-05,
+      "loss": 0.8301,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7004954724073125,
+      "grad_norm": 0.06250337842792936,
+      "learning_rate": 4.986223667942214e-05,
+      "loss": 0.8135,
+      "step": 1025
+    },
+    {
+      "epoch": 0.7039125234922262,
+      "grad_norm": 0.05760873048674179,
+      "learning_rate": 4.8833083633796826e-05,
+      "loss": 0.8062,
+      "step": 1030
+    },
+    {
+      "epoch": 0.70732957457714,
+      "grad_norm": 0.05324285237345129,
+      "learning_rate": 4.781122032859079e-05,
+      "loss": 0.7931,
+      "step": 1035
+    },
+    {
+      "epoch": 0.7107466256620536,
+      "grad_norm": 0.0599011246936521,
+      "learning_rate": 4.6796792348466356e-05,
+      "loss": 0.78,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7141636767469673,
+      "grad_norm": 0.04893354818829853,
+      "learning_rate": 4.578994421877645e-05,
+      "loss": 0.8154,
+      "step": 1045
+    },
+    {
+      "epoch": 0.7175807278318811,
+      "grad_norm": 0.05182450287923114,
+      "learning_rate": 4.479081938497435e-05,
+      "loss": 0.7852,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7209977789167948,
+      "grad_norm": 0.05550201225603795,
+      "learning_rate": 4.379956019217675e-05,
+      "loss": 0.7835,
+      "step": 1055
+    },
+    {
+      "epoch": 0.7244148300017085,
+      "grad_norm": 0.05173446489569767,
+      "learning_rate": 4.281630786488418e-05,
+      "loss": 0.8116,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7278318810866222,
+      "grad_norm": 0.056948504215535725,
+      "learning_rate": 4.184120248686048e-05,
+      "loss": 0.8377,
+      "step": 1065
+    },
+    {
+      "epoch": 0.731248932171536,
+      "grad_norm": 0.061134339997307,
+      "learning_rate": 4.087438298117536e-05,
+      "loss": 0.8125,
+      "step": 1070
+    },
+    {
+      "epoch": 0.7346659832564497,
+      "grad_norm": 0.04956030468932823,
+      "learning_rate": 3.991598709041196e-05,
+      "loss": 0.7923,
+      "step": 1075
+    },
+    {
+      "epoch": 0.7380830343413634,
+      "grad_norm": 0.04927896659942923,
+      "learning_rate": 3.896615135704251e-05,
+      "loss": 0.8063,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7415000854262771,
+      "grad_norm": 0.049440200527473416,
+      "learning_rate": 3.802501110397553e-05,
+      "loss": 0.8469,
+      "step": 1085
+    },
+    {
+      "epoch": 0.7449171365111908,
+      "grad_norm": 0.04986683012842273,
+      "learning_rate": 3.709270041527599e-05,
+      "loss": 0.7906,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7483341875961046,
+      "grad_norm": 0.05602482087134134,
+      "learning_rate": 3.616935211706275e-05,
+      "loss": 0.7721,
+      "step": 1095
+    },
+    {
+      "epoch": 0.7517512386810182,
+      "grad_norm": 0.04687315536980261,
+      "learning_rate": 3.525509775858451e-05,
+      "loss": 0.7597,
+      "step": 1100
+    },
+    {
+      "epoch": 0.755168289765932,
+      "grad_norm": 0.056244634522858154,
+      "learning_rate": 3.4350067593478356e-05,
+      "loss": 0.8166,
+      "step": 1105
+    },
+    {
+      "epoch": 0.7585853408508457,
+      "grad_norm": 0.06127414968737818,
+      "learning_rate": 3.345439056121216e-05,
+      "loss": 0.7837,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7620023919357595,
+      "grad_norm": 0.05399366743123272,
+      "learning_rate": 3.256819426871507e-05,
+      "loss": 0.778,
+      "step": 1115
+    },
+    {
+      "epoch": 0.7654194430206731,
+      "grad_norm": 0.06458330411214924,
+      "learning_rate": 3.169160497219692e-05,
+      "loss": 0.8348,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7688364941055869,
+      "grad_norm": 0.05215467522818499,
+      "learning_rate": 3.0824747559160836e-05,
+      "loss": 0.7972,
+      "step": 1125
+    },
+    {
+      "epoch": 0.7722535451905006,
+      "grad_norm": 0.05781545861178535,
+      "learning_rate": 2.9967745530610357e-05,
+      "loss": 0.8017,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7756705962754143,
+      "grad_norm": 0.05543397909633525,
+      "learning_rate": 2.9120720983454463e-05,
+      "loss": 0.803,
+      "step": 1135
+    },
+    {
+      "epoch": 0.779087647360328,
+      "grad_norm": 0.04870099285910927,
+      "learning_rate": 2.828379459311219e-05,
+      "loss": 0.7606,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7825046984452417,
+      "grad_norm": 0.0667187917753979,
+      "learning_rate": 2.745708559632032e-05,
+      "loss": 0.8097,
+      "step": 1145
+    },
+    {
+      "epoch": 0.7859217495301555,
+      "grad_norm": 0.05222086951191216,
+      "learning_rate": 2.6640711774145543e-05,
+      "loss": 0.7795,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7893388006150692,
+      "grad_norm": 0.04711247830692807,
+      "learning_rate": 2.5834789435204243e-05,
+      "loss": 0.7978,
+      "step": 1155
+    },
+    {
+      "epoch": 0.7927558516999829,
+      "grad_norm": 0.04903142437310661,
+      "learning_rate": 2.503943339909214e-05,
+      "loss": 0.8113,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7961729027848966,
+      "grad_norm": 0.05187782322058754,
+      "learning_rate": 2.4254756980025773e-05,
+      "loss": 0.8056,
+      "step": 1165
+    },
+    {
+      "epoch": 0.7995899538698104,
+      "grad_norm": 0.07430744833255934,
+      "learning_rate": 2.348087197069889e-05,
+      "loss": 0.7875,
+      "step": 1170
+    },
+    {
+      "epoch": 0.8030070049547241,
+      "grad_norm": 0.05224096237020666,
+      "learning_rate": 2.2717888626355134e-05,
+      "loss": 0.7539,
+      "step": 1175
+    },
+    {
+      "epoch": 0.8064240560396377,
+      "grad_norm": 0.06939065704978066,
+      "learning_rate": 2.196591564908016e-05,
+      "loss": 0.805,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8098411071245515,
+      "grad_norm": 0.05628523497923318,
+      "learning_rate": 2.122506017231477e-05,
+      "loss": 0.7774,
+      "step": 1185
+    },
+    {
+      "epoch": 0.8132581582094652,
+      "grad_norm": 0.05874291639598103,
+      "learning_rate": 2.04954277455917e-05,
+      "loss": 0.8078,
+      "step": 1190
+    },
+    {
+      "epoch": 0.816675209294379,
+      "grad_norm": 0.060226053962080625,
+      "learning_rate": 1.9777122319497986e-05,
+      "loss": 0.7899,
+      "step": 1195
+    },
+    {
+      "epoch": 0.8200922603792927,
+      "grad_norm": 0.05099402535993439,
+      "learning_rate": 1.907024623086515e-05,
+      "loss": 0.7816,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8235093114642064,
+      "grad_norm": 0.057975740535861416,
+      "learning_rate": 1.837490018818917e-05,
+      "loss": 0.7726,
+      "step": 1205
+    },
+    {
+      "epoch": 0.8269263625491201,
+      "grad_norm": 0.059484775752965784,
+      "learning_rate": 1.7691183257282772e-05,
+      "loss": 0.7563,
+      "step": 1210
+    },
+    {
+      "epoch": 0.8303434136340339,
+      "grad_norm": 0.05370824040165683,
+      "learning_rate": 1.7019192847161425e-05,
+      "loss": 0.7812,
+      "step": 1215
+    },
+    {
+      "epoch": 0.8337604647189476,
+      "grad_norm": 0.06107368550618316,
+      "learning_rate": 1.635902469616544e-05,
+      "loss": 0.8212,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8371775158038612,
+      "grad_norm": 0.06800386851846588,
+      "learning_rate": 1.57107728583203e-05,
+      "loss": 0.8013,
+      "step": 1225
+    },
+    {
+      "epoch": 0.840594566888775,
+      "grad_norm": 0.07161023273562961,
+      "learning_rate": 1.5074529689936645e-05,
+      "loss": 0.7903,
+      "step": 1230
+    },
+    {
+      "epoch": 0.8440116179736887,
+      "grad_norm": 0.06460741141183791,
+      "learning_rate": 1.4450385836452429e-05,
+      "loss": 0.8216,
+      "step": 1235
+    },
+    {
+      "epoch": 0.8474286690586025,
+      "grad_norm": 0.0610751932799301,
+      "learning_rate": 1.383843021951855e-05,
+      "loss": 0.814,
+      "step": 1240
+    },
+    {
+      "epoch": 0.8508457201435161,
+      "grad_norm": 0.05080799197596389,
+      "learning_rate": 1.3238750024330338e-05,
+      "loss": 0.808,
+      "step": 1245
+    },
+    {
+      "epoch": 0.8542627712284299,
+      "grad_norm": 0.051114360900508575,
+      "learning_rate": 1.2651430687206112e-05,
+      "loss": 0.7891,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8576798223133436,
+      "grad_norm": 0.06232632387186641,
+      "learning_rate": 1.207655588341534e-05,
+      "loss": 0.8154,
+      "step": 1255
+    },
+    {
+      "epoch": 0.8610968733982574,
+      "grad_norm": 0.050991722404101165,
+      "learning_rate": 1.1514207515257147e-05,
+      "loss": 0.8033,
+      "step": 1260
+    },
+    {
+      "epoch": 0.864513924483171,
+      "grad_norm": 0.05489592548763728,
+      "learning_rate": 1.096446570039198e-05,
+      "loss": 0.7717,
+      "step": 1265
+    },
+    {
+      "epoch": 0.8679309755680847,
+      "grad_norm": 0.055894258580256755,
+      "learning_rate": 1.0427408760427093e-05,
+      "loss": 0.7856,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8713480266529985,
+      "grad_norm": 0.05522200907166403,
+      "learning_rate": 9.903113209758096e-06,
+      "loss": 0.7621,
+      "step": 1275
+    },
+    {
+      "epoch": 0.8747650777379122,
+      "grad_norm": 0.05047477471682088,
+      "learning_rate": 9.391653744668072e-06,
+      "loss": 0.786,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8781821288228259,
+      "grad_norm": 0.05005408324574757,
+      "learning_rate": 8.89310323268544e-06,
+      "loss": 0.7687,
+      "step": 1285
+    },
+    {
+      "epoch": 0.8815991799077396,
+      "grad_norm": 0.061644507314616744,
+      "learning_rate": 8.40753270220277e-06,
+      "loss": 0.8259,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8850162309926534,
+      "grad_norm": 0.05765541923821856,
+      "learning_rate": 7.935011332357112e-06,
+      "loss": 0.7769,
+      "step": 1295
+    },
+    {
+      "epoch": 0.8884332820775671,
+      "grad_norm": 0.05364219973356691,
+      "learning_rate": 7.475606443174288e-06,
+      "loss": 0.7701,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8918503331624807,
+      "grad_norm": 0.07263092996256254,
+      "learning_rate": 7.029383485977625e-06,
+      "loss": 0.7792,
+      "step": 1305
+    },
+    {
+      "epoch": 0.8952673842473945,
+      "grad_norm": 0.060017401840350565,
+      "learning_rate": 6.596406034063318e-06,
+      "loss": 0.7845,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8986844353323082,
+      "grad_norm": 0.05492038881398355,
+      "learning_rate": 6.176735773642961e-06,
+      "loss": 0.7764,
+      "step": 1315
+    },
+    {
+      "epoch": 0.902101486417222,
+      "grad_norm": 0.06526851328526875,
+      "learning_rate": 5.770432495055311e-06,
+      "loss": 0.8039,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9055185375021356,
+      "grad_norm": 0.050050687943016064,
+      "learning_rate": 5.377554084247771e-06,
+      "loss": 0.7889,
+      "step": 1325
+    },
+    {
+      "epoch": 0.9089355885870494,
+      "grad_norm": 0.05623811926805879,
+      "learning_rate": 4.998156514529595e-06,
+      "loss": 0.7867,
+      "step": 1330
+    },
+    {
+      "epoch": 0.9123526396719631,
+      "grad_norm": 0.051017981131755645,
+      "learning_rate": 4.632293838597246e-06,
+      "loss": 0.8321,
+      "step": 1335
+    },
+    {
+      "epoch": 0.9157696907568769,
+      "grad_norm": 0.05412601343623542,
+      "learning_rate": 4.280018180833501e-06,
+      "loss": 0.804,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9191867418417905,
+      "grad_norm": 0.0595456214327722,
+      "learning_rate": 3.941379729881456e-06,
+      "loss": 0.7845,
+      "step": 1345
+    },
+    {
+      "epoch": 0.9226037929267042,
+      "grad_norm": 0.05195812070434534,
+      "learning_rate": 3.6164267314939713e-06,
+      "loss": 0.8123,
+      "step": 1350
+    },
+    {
+      "epoch": 0.926020844011618,
+      "grad_norm": 0.07421106031821466,
+      "learning_rate": 3.3052054816602452e-06,
+      "loss": 0.7904,
+      "step": 1355
+    },
+    {
+      "epoch": 0.9294378950965317,
+      "grad_norm": 0.05963573412011266,
+      "learning_rate": 3.007760320009967e-06,
+      "loss": 0.7794,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9328549461814454,
+      "grad_norm": 0.059269509717240765,
+      "learning_rate": 2.7241336234962944e-06,
+      "loss": 0.8089,
+      "step": 1365
+    },
+    {
+      "epoch": 0.9362719972663591,
+      "grad_norm": 0.059259649761895836,
+      "learning_rate": 2.4543658003583604e-06,
+      "loss": 0.7626,
+      "step": 1370
+    },
+    {
+      "epoch": 0.9396890483512729,
+      "grad_norm": 0.06762123294868294,
+      "learning_rate": 2.19849528436441e-06,
+      "loss": 0.802,
+      "step": 1375
+    },
+    {
+      "epoch": 0.9431060994361866,
+      "grad_norm": 0.060831926872792076,
+      "learning_rate": 1.956558529336061e-06,
+      "loss": 0.7518,
+      "step": 1380
+    },
+    {
+      "epoch": 0.9465231505211003,
+      "grad_norm": 0.051796739233864277,
+      "learning_rate": 1.7285900039547998e-06,
+      "loss": 0.7966,
+      "step": 1385
+    },
+    {
+      "epoch": 0.949940201606014,
+      "grad_norm": 0.055065582393637,
+      "learning_rate": 1.5146221868511668e-06,
+      "loss": 0.799,
+      "step": 1390
+    },
+    {
+      "epoch": 0.9533572526909277,
+      "grad_norm": 0.05414380180341436,
+      "learning_rate": 1.3146855619776134e-06,
+      "loss": 0.8069,
+      "step": 1395
+    },
+    {
+      "epoch": 0.9567743037758415,
+      "grad_norm": 0.07040840761575913,
+      "learning_rate": 1.1288086142653864e-06,
+      "loss": 0.8167,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9601913548607551,
+      "grad_norm": 0.05744269684792452,
+      "learning_rate": 9.570178255663532e-07,
+      "loss": 0.7983,
+      "step": 1405
+    },
+    {
+      "epoch": 0.9636084059456689,
+      "grad_norm": 0.06440333782704491,
+      "learning_rate": 7.993376708800848e-07,
+      "loss": 0.7664,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9670254570305826,
+      "grad_norm": 0.061502951799294485,
+      "learning_rate": 6.557906148669024e-07,
+      "loss": 0.8054,
+      "step": 1415
+    },
+    {
+      "epoch": 0.9704425081154964,
+      "grad_norm": 0.05555024265947659,
+      "learning_rate": 5.26397108647414e-07,
+      "loss": 0.8238,
+      "step": 1420
+    },
+    {
+      "epoch": 0.97385955920041,
+      "grad_norm": 0.053575566054838014,
+      "learning_rate": 4.111755868887346e-07,
+      "loss": 0.7603,
+      "step": 1425
+    },
+    {
+      "epoch": 0.9772766102853238,
+      "grad_norm": 0.05884974269525592,
+      "learning_rate": 3.1014246517823145e-07,
+      "loss": 0.7584,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9806936613702375,
+      "grad_norm": 0.069718532394202,
+      "learning_rate": 2.2331213768468363e-07,
+      "loss": 0.7936,
+      "step": 1435
+    },
+    {
+      "epoch": 0.9841107124551512,
+      "grad_norm": 0.05791806090918031,
+      "learning_rate": 1.506969751076226e-07,
+      "loss": 0.8523,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9875277635400649,
+      "grad_norm": 0.05509884785667895,
+      "learning_rate": 9.230732291485301e-08,
+      "loss": 0.8025,
+      "step": 1445
+    },
+    {
+      "epoch": 0.9909448146249786,
+      "grad_norm": 0.05696235653534006,
+      "learning_rate": 4.8151499868520634e-08,
+      "loss": 0.7785,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9943618657098924,
+      "grad_norm": 0.04959938380200943,
+      "learning_rate": 1.8235796839982665e-08,
+      "loss": 0.8036,
+      "step": 1455
+    },
+    {
+      "epoch": 0.9977789167948061,
+      "grad_norm": 0.05072942561204063,
+      "learning_rate": 2.5644759135134976e-09,
+      "loss": 0.7995,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9998291474457544,
+      "eval_loss": 0.7582454681396484,
+      "eval_runtime": 90.7189,
+      "eval_samples_per_second": 5.512,
+      "eval_steps_per_second": 0.353,
+      "step": 1463
+    },
+    {
+      "epoch": 0.9998291474457544,
+      "step": 1463,
+      "total_flos": 2.720070830863155e+16,
+      "train_loss": 0.8341796821873235,
+      "train_runtime": 76059.1967,
+      "train_samples_per_second": 1.231,
+      "train_steps_per_second": 0.019
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1463,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "total_flos": 2.720070830863155e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}