diff --git a/README.md b/README.md
index ab1026189d4daa3c13229a3ffce8d7b35a755eb3..213dc0957d1ce56857a23da29983402f3e5ee424 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,143 @@
----
-license: unknown
----
+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model-index:
+- name: outputs/qlora-out
+ results: []
+---
+
+
+
+[Built with Axolotl
+](https://github.com/OpenAccess-AI-Collective/axolotl)
+See axolotl config
+
+axolotl version: `0.4.1`
+```yaml
+adapter: qlora
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+bf16: auto
+dataset_prepared_path: null
+datasets:
+- path: Taiel26/plm_2500_uniref
+ type: alpaca
+debug: null
+deepspeed: null
+early_stopping_patience: null
+eval_sample_packing: false
+evals_per_epoch: 4
+flash_attention: true
+fp16: null
+fsdp: null
+fsdp_config: null
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+group_by_length: false
+learning_rate: 0.0002
+load_in_4bit: true
+load_in_8bit: false
+local_rank: null
+logging_steps: 1
+lora_alpha: 16
+lora_dropout: 0.05
+lora_fan_in_fan_out: null
+lora_model_dir: null
+lora_r: 32
+lora_target_linear: true
+lora_target_modules: null
+lr_scheduler: cosine
+micro_batch_size: 2
+model_type: LlamaForCausalLM
+num_epochs: 4
+optimizer: paged_adamw_32bit
+output_dir: ./outputs/qlora-out
+pad_to_sequence_len: true
+resume_from_checkpoint: null
+sample_packing: true
+saves_per_epoch: 1
+sequence_len: 4096
+special_tokens: null
+strict: false
+tf32: false
+tokenizer_type: LlamaTokenizer
+train_on_inputs: false
+val_set_size: 0.05
+wandb_entity: null
+wandb_log_model: null
+wandb_name: null
+wandb_project: null
+wandb_watch: null
+warmup_steps: 10
+weight_decay: 0.0
+xformers_attention: null
+
+```
+
+
+
+# outputs/qlora-out
+
+This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the Taiel26/plm_2500_uniref dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.8586
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 4
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 2.0919 | 0.0198 | 1 | 2.0800 |
+| 1.5479 | 0.2574 | 13 | 1.5341 |
+| 1.2083 | 0.5149 | 26 | 1.2245 |
+| 1.0851 | 0.7723 | 39 | 1.0607 |
+| 0.9432 | 1.0297 | 52 | 0.9755 |
+| 0.9007 | 1.2178 | 65 | 0.9334 |
+| 0.8765 | 1.4752 | 78 | 0.9084 |
+| 0.8789 | 1.7327 | 91 | 0.8891 |
+| 0.8304 | 1.9901 | 104 | 0.8779 |
+| 0.8194 | 2.1782 | 117 | 0.8714 |
+| 0.848 | 2.4356 | 130 | 0.8665 |
+| 0.8354 | 2.6931 | 143 | 0.8627 |
+| 0.8476 | 2.9505 | 156 | 0.8605 |
+| 0.811 | 3.1386 | 169 | 0.8590 |
+| 0.8178 | 3.3960 | 182 | 0.8588 |
+| 0.8073 | 3.6535 | 195 | 0.8586 |
+
+
+### Framework versions
+
+- PEFT 0.11.1
+- Transformers 4.41.1
+- Pytorch 2.1.2+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "o_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.bin b/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c380520592770a23b238e94c8e56b4b79c0847c0
--- /dev/null
+++ b/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffc2779c22b7eae997dc6203abde8f60a5e25d728ff0372f233c318ba1fdff97
+size 50573978
diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978
--- /dev/null
+++ b/checkpoint-100/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891
--- /dev/null
+++ b/checkpoint-100/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "o_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2bc8c349472a911b43a61191dca50545e47c7731
--- /dev/null
+++ b/checkpoint-100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b85a4fdc0abdb0ae863b99d8dbbc0f4de78e0d9fbd7bcb1ddcd7575e55dd73e
+size 50503848
diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bdd1fa491d76cadd05c4b29aa7b82b6375fb6268
--- /dev/null
+++ b/checkpoint-100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b8f5c81e295185d82b95402d9e8aa5ba7f3db7c0d3626b29a8ce3a7f38899ae
+size 202035450
diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..22b8bd3417b5dcc9c846deab82f71389c7adcb09
--- /dev/null
+++ b/checkpoint-100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b71df2f60f93f95a69126d2a7bc1e1cccfa69f1b8fa8d99a58b0ccfa00747f6f
+size 14244
diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fd2ee4e3177198ef9bb677ca214baa3bb506f0b2
--- /dev/null
+++ b/checkpoint-100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc7800513a1b4dd006c457152c700dd768bb49ee4ed8e4d9665a4e42095b054
+size 1064
diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-100/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-100/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-100/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a34ffc16754db68bc7066ce5eaa863821b0391b4
--- /dev/null
+++ b/checkpoint-100/trainer_state.json
@@ -0,0 +1,797 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.9108910891089108,
+ "eval_steps": 13,
+ "global_step": 100,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.019801980198019802,
+ "grad_norm": 1.15625,
+ "learning_rate": 2e-05,
+ "loss": 2.0919,
+ "step": 1
+ },
+ {
+ "epoch": 0.019801980198019802,
+ "eval_loss": 2.079954147338867,
+ "eval_runtime": 13.8908,
+ "eval_samples_per_second": 8.999,
+ "eval_steps_per_second": 4.535,
+ "step": 1
+ },
+ {
+ "epoch": 0.039603960396039604,
+ "grad_norm": 1.203125,
+ "learning_rate": 4e-05,
+ "loss": 2.0814,
+ "step": 2
+ },
+ {
+ "epoch": 0.0594059405940594,
+ "grad_norm": 1.1953125,
+ "learning_rate": 6e-05,
+ "loss": 2.0499,
+ "step": 3
+ },
+ {
+ "epoch": 0.07920792079207921,
+ "grad_norm": 1.0859375,
+ "learning_rate": 8e-05,
+ "loss": 2.0153,
+ "step": 4
+ },
+ {
+ "epoch": 0.09900990099009901,
+ "grad_norm": 1.0390625,
+ "learning_rate": 0.0001,
+ "loss": 1.9548,
+ "step": 5
+ },
+ {
+ "epoch": 0.1188118811881188,
+ "grad_norm": 0.89453125,
+ "learning_rate": 0.00012,
+ "loss": 1.8982,
+ "step": 6
+ },
+ {
+ "epoch": 0.13861386138613863,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00014,
+ "loss": 1.8226,
+ "step": 7
+ },
+ {
+ "epoch": 0.15841584158415842,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.00016,
+ "loss": 1.7572,
+ "step": 8
+ },
+ {
+ "epoch": 0.1782178217821782,
+ "grad_norm": 0.78515625,
+ "learning_rate": 0.00018,
+ "loss": 1.7074,
+ "step": 9
+ },
+ {
+ "epoch": 0.19801980198019803,
+ "grad_norm": 0.73828125,
+ "learning_rate": 0.0002,
+ "loss": 1.6317,
+ "step": 10
+ },
+ {
+ "epoch": 0.21782178217821782,
+ "grad_norm": 0.484375,
+ "learning_rate": 0.0001999863304992469,
+ "loss": 1.5801,
+ "step": 11
+ },
+ {
+ "epoch": 0.2376237623762376,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019994532573409262,
+ "loss": 1.5721,
+ "step": 12
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "grad_norm": 0.6953125,
+ "learning_rate": 0.00019987699691483048,
+ "loss": 1.5479,
+ "step": 13
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "eval_loss": 1.5341482162475586,
+ "eval_runtime": 13.8795,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 13
+ },
+ {
+ "epoch": 0.27722772277227725,
+ "grad_norm": 0.65234375,
+ "learning_rate": 0.00019978136272187747,
+ "loss": 1.534,
+ "step": 14
+ },
+ {
+ "epoch": 0.297029702970297,
+ "grad_norm": 0.515625,
+ "learning_rate": 0.000199658449300667,
+ "loss": 1.4804,
+ "step": 15
+ },
+ {
+ "epoch": 0.31683168316831684,
+ "grad_norm": 0.439453125,
+ "learning_rate": 0.00019950829025450114,
+ "loss": 1.4805,
+ "step": 16
+ },
+ {
+ "epoch": 0.33663366336633666,
+ "grad_norm": 0.361328125,
+ "learning_rate": 0.00019933092663536382,
+ "loss": 1.3809,
+ "step": 17
+ },
+ {
+ "epoch": 0.3564356435643564,
+ "grad_norm": 0.3125,
+ "learning_rate": 0.00019912640693269752,
+ "loss": 1.3837,
+ "step": 18
+ },
+ {
+ "epoch": 0.37623762376237624,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00019889478706014687,
+ "loss": 1.3673,
+ "step": 19
+ },
+ {
+ "epoch": 0.39603960396039606,
+ "grad_norm": 0.298828125,
+ "learning_rate": 0.00019863613034027224,
+ "loss": 1.366,
+ "step": 20
+ },
+ {
+ "epoch": 0.4158415841584158,
+ "grad_norm": 0.34375,
+ "learning_rate": 0.00019835050748723824,
+ "loss": 1.3318,
+ "step": 21
+ },
+ {
+ "epoch": 0.43564356435643564,
+ "grad_norm": 0.341796875,
+ "learning_rate": 0.00019803799658748094,
+ "loss": 1.2741,
+ "step": 22
+ },
+ {
+ "epoch": 0.45544554455445546,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019769868307835994,
+ "loss": 1.2978,
+ "step": 23
+ },
+ {
+ "epoch": 0.4752475247524752,
+ "grad_norm": 0.291015625,
+ "learning_rate": 0.0001973326597248006,
+ "loss": 1.2733,
+ "step": 24
+ },
+ {
+ "epoch": 0.49504950495049505,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.00019694002659393305,
+ "loss": 1.2302,
+ "step": 25
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "grad_norm": 0.318359375,
+ "learning_rate": 0.00019652089102773488,
+ "loss": 1.2083,
+ "step": 26
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "eval_loss": 1.224540114402771,
+ "eval_runtime": 13.8695,
+ "eval_samples_per_second": 9.013,
+ "eval_steps_per_second": 4.542,
+ "step": 26
+ },
+ {
+ "epoch": 0.5346534653465347,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.00019607536761368484,
+ "loss": 1.1761,
+ "step": 27
+ },
+ {
+ "epoch": 0.5544554455445545,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.00019560357815343577,
+ "loss": 1.1751,
+ "step": 28
+ },
+ {
+ "epoch": 0.5742574257425742,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 1.2002,
+ "step": 29
+ },
+ {
+ "epoch": 0.594059405940594,
+ "grad_norm": 0.287109375,
+ "learning_rate": 0.00019458172417006347,
+ "loss": 1.1544,
+ "step": 30
+ },
+ {
+ "epoch": 0.6138613861386139,
+ "grad_norm": 0.365234375,
+ "learning_rate": 0.00019403193901161613,
+ "loss": 1.1384,
+ "step": 31
+ },
+ {
+ "epoch": 0.6336633663366337,
+ "grad_norm": 0.236328125,
+ "learning_rate": 0.0001934564464599461,
+ "loss": 1.0999,
+ "step": 32
+ },
+ {
+ "epoch": 0.6534653465346535,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019285540384897073,
+ "loss": 1.1576,
+ "step": 33
+ },
+ {
+ "epoch": 0.6732673267326733,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019222897549773848,
+ "loss": 1.091,
+ "step": 34
+ },
+ {
+ "epoch": 0.693069306930693,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00019157733266550575,
+ "loss": 1.056,
+ "step": 35
+ },
+ {
+ "epoch": 0.7128712871287128,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.00019090065350491626,
+ "loss": 1.1068,
+ "step": 36
+ },
+ {
+ "epoch": 0.7326732673267327,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00019019912301329592,
+ "loss": 1.0583,
+ "step": 37
+ },
+ {
+ "epoch": 0.7524752475247525,
+ "grad_norm": 0.2734375,
+ "learning_rate": 0.00018947293298207635,
+ "loss": 1.0671,
+ "step": 38
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001887222819443612,
+ "loss": 1.0851,
+ "step": 39
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "eval_loss": 1.060703158378601,
+ "eval_runtime": 13.878,
+ "eval_samples_per_second": 9.007,
+ "eval_steps_per_second": 4.54,
+ "step": 39
+ },
+ {
+ "epoch": 0.7920792079207921,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001879473751206489,
+ "loss": 1.0343,
+ "step": 40
+ },
+ {
+ "epoch": 0.8118811881188119,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.00018714842436272773,
+ "loss": 0.9789,
+ "step": 41
+ },
+ {
+ "epoch": 0.8316831683168316,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00018632564809575742,
+ "loss": 1.0174,
+ "step": 42
+ },
+ {
+ "epoch": 0.8514851485148515,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0001854792712585539,
+ "loss": 1.0004,
+ "step": 43
+ },
+ {
+ "epoch": 0.8712871287128713,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00018460952524209355,
+ "loss": 1.0281,
+ "step": 44
+ },
+ {
+ "epoch": 0.8910891089108911,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.00018371664782625287,
+ "loss": 0.9992,
+ "step": 45
+ },
+ {
+ "epoch": 0.9108910891089109,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00018280088311480201,
+ "loss": 0.9635,
+ "step": 46
+ },
+ {
+ "epoch": 0.9306930693069307,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00018186248146866927,
+ "loss": 1.006,
+ "step": 47
+ },
+ {
+ "epoch": 0.9504950495049505,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.9891,
+ "step": 48
+ },
+ {
+ "epoch": 0.9702970297029703,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.0001799187996894925,
+ "loss": 0.9809,
+ "step": 49
+ },
+ {
+ "epoch": 0.9900990099009901,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.00017891405093963938,
+ "loss": 0.9646,
+ "step": 50
+ },
+ {
+ "epoch": 1.00990099009901,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00017788772787621126,
+ "loss": 0.9553,
+ "step": 51
+ },
+ {
+ "epoch": 1.0297029702970297,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00017684011108568592,
+ "loss": 0.9432,
+ "step": 52
+ },
+ {
+ "epoch": 1.0297029702970297,
+ "eval_loss": 0.9755253195762634,
+ "eval_runtime": 13.879,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 52
+ },
+ {
+ "epoch": 1.0495049504950495,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0001757714869760335,
+ "loss": 0.9631,
+ "step": 53
+ },
+ {
+ "epoch": 1.0693069306930694,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.0001746821476984154,
+ "loss": 0.9539,
+ "step": 54
+ },
+ {
+ "epoch": 1.0198019801980198,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.00017357239106731317,
+ "loss": 0.9559,
+ "step": 55
+ },
+ {
+ "epoch": 1.0396039603960396,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00017244252047910892,
+ "loss": 0.9111,
+ "step": 56
+ },
+ {
+ "epoch": 1.0594059405940595,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.00017129284482913972,
+ "loss": 0.9503,
+ "step": 57
+ },
+ {
+ "epoch": 1.0792079207920793,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.00017012367842724887,
+ "loss": 0.911,
+ "step": 58
+ },
+ {
+ "epoch": 1.099009900990099,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.0001689353409118566,
+ "loss": 0.9041,
+ "step": 59
+ },
+ {
+ "epoch": 1.118811881188119,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.00016772815716257412,
+ "loss": 0.9117,
+ "step": 60
+ },
+ {
+ "epoch": 1.1386138613861387,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0001665024572113848,
+ "loss": 0.9351,
+ "step": 61
+ },
+ {
+ "epoch": 1.1584158415841583,
+ "grad_norm": 0.251953125,
+ "learning_rate": 0.00016525857615241687,
+ "loss": 0.9438,
+ "step": 62
+ },
+ {
+ "epoch": 1.1782178217821782,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00016399685405033167,
+ "loss": 0.9075,
+ "step": 63
+ },
+ {
+ "epoch": 1.198019801980198,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001627176358473537,
+ "loss": 0.8983,
+ "step": 64
+ },
+ {
+ "epoch": 1.2178217821782178,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0001614212712689668,
+ "loss": 0.9007,
+ "step": 65
+ },
+ {
+ "epoch": 1.2178217821782178,
+ "eval_loss": 0.9333999156951904,
+ "eval_runtime": 13.8668,
+ "eval_samples_per_second": 9.014,
+ "eval_steps_per_second": 4.543,
+ "step": 65
+ },
+ {
+ "epoch": 1.2376237623762376,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00016010811472830252,
+ "loss": 0.9108,
+ "step": 66
+ },
+ {
+ "epoch": 1.2574257425742574,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.9177,
+ "step": 67
+ },
+ {
+ "epoch": 1.2772277227722773,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.00015743286626829437,
+ "loss": 0.9,
+ "step": 68
+ },
+ {
+ "epoch": 1.297029702970297,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.0001560715057351673,
+ "loss": 0.9096,
+ "step": 69
+ },
+ {
+ "epoch": 1.316831683168317,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.00015469481581224272,
+ "loss": 0.8946,
+ "step": 70
+ },
+ {
+ "epoch": 1.3366336633663367,
+ "grad_norm": 0.31640625,
+ "learning_rate": 0.0001533031728727994,
+ "loss": 0.8995,
+ "step": 71
+ },
+ {
+ "epoch": 1.3564356435643563,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.00015189695737812152,
+ "loss": 0.922,
+ "step": 72
+ },
+ {
+ "epoch": 1.3762376237623761,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001504765537734844,
+ "loss": 0.885,
+ "step": 73
+ },
+ {
+ "epoch": 1.396039603960396,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00014904235038305083,
+ "loss": 0.895,
+ "step": 74
+ },
+ {
+ "epoch": 1.4158415841584158,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00014759473930370736,
+ "loss": 0.892,
+ "step": 75
+ },
+ {
+ "epoch": 1.4356435643564356,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0001461341162978688,
+ "loss": 0.8277,
+ "step": 76
+ },
+ {
+ "epoch": 1.4554455445544554,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.00014466088068528068,
+ "loss": 0.8687,
+ "step": 77
+ },
+ {
+ "epoch": 1.4752475247524752,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00014317543523384928,
+ "loss": 0.8765,
+ "step": 78
+ },
+ {
+ "epoch": 1.4752475247524752,
+ "eval_loss": 0.9083698391914368,
+ "eval_runtime": 13.8834,
+ "eval_samples_per_second": 9.004,
+ "eval_steps_per_second": 4.538,
+ "step": 78
+ },
+ {
+ "epoch": 1.495049504950495,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00014167818604952906,
+ "loss": 0.8797,
+ "step": 79
+ },
+ {
+ "epoch": 1.5148514851485149,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 0.00014016954246529696,
+ "loss": 0.905,
+ "step": 80
+ },
+ {
+ "epoch": 1.5346534653465347,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.00013864991692924523,
+ "loss": 0.8575,
+ "step": 81
+ },
+ {
+ "epoch": 1.5544554455445545,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00013711972489182208,
+ "loss": 0.8957,
+ "step": 82
+ },
+ {
+ "epoch": 1.5742574257425743,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.00013557938469225167,
+ "loss": 0.8792,
+ "step": 83
+ },
+ {
+ "epoch": 1.5940594059405941,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.00013402931744416433,
+ "loss": 0.889,
+ "step": 84
+ },
+ {
+ "epoch": 1.613861386138614,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00013246994692046836,
+ "loss": 0.8657,
+ "step": 85
+ },
+ {
+ "epoch": 1.6336633663366338,
+ "grad_norm": 0.20703125,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.8784,
+ "step": 86
+ },
+ {
+ "epoch": 1.6534653465346536,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.0001293250037384465,
+ "loss": 0.8822,
+ "step": 87
+ },
+ {
+ "epoch": 1.6732673267326734,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.00012774029087618446,
+ "loss": 0.9092,
+ "step": 88
+ },
+ {
+ "epoch": 1.693069306930693,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.00012614799409538198,
+ "loss": 0.8813,
+ "step": 89
+ },
+ {
+ "epoch": 1.7128712871287128,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00012454854871407994,
+ "loss": 0.8975,
+ "step": 90
+ },
+ {
+ "epoch": 1.7326732673267327,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.00012294239200467516,
+ "loss": 0.8789,
+ "step": 91
+ },
+ {
+ "epoch": 1.7326732673267327,
+ "eval_loss": 0.8891416788101196,
+ "eval_runtime": 13.872,
+ "eval_samples_per_second": 9.011,
+ "eval_steps_per_second": 4.542,
+ "step": 91
+ },
+ {
+ "epoch": 1.7524752475247525,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.0001213299630743747,
+ "loss": 0.9184,
+ "step": 92
+ },
+ {
+ "epoch": 1.7722772277227723,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00011971170274514802,
+ "loss": 0.8854,
+ "step": 93
+ },
+ {
+ "epoch": 1.7920792079207921,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000118088053433211,
+ "loss": 0.8688,
+ "step": 94
+ },
+ {
+ "epoch": 1.811881188118812,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.00011645945902807341,
+ "loss": 0.8281,
+ "step": 95
+ },
+ {
+ "epoch": 1.8316831683168315,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0001148263647711842,
+ "loss": 0.8488,
+ "step": 96
+ },
+ {
+ "epoch": 1.8514851485148514,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00011318921713420691,
+ "loss": 0.8742,
+ "step": 97
+ },
+ {
+ "epoch": 1.8712871287128712,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00011154846369695863,
+ "loss": 0.8586,
+ "step": 98
+ },
+ {
+ "epoch": 1.891089108910891,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.0001099045530250463,
+ "loss": 0.8776,
+ "step": 99
+ },
+ {
+ "epoch": 1.9108910891089108,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.00010825793454723325,
+ "loss": 0.8563,
+ "step": 100
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 200,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 50,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.08354098020352e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30
--- /dev/null
+++ b/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944
diff --git a/checkpoint-150/README.md b/checkpoint-150/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978
--- /dev/null
+++ b/checkpoint-150/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/checkpoint-150/adapter_config.json b/checkpoint-150/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891
--- /dev/null
+++ b/checkpoint-150/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "o_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-150/adapter_model.safetensors b/checkpoint-150/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..799a842d6fee603753511555ac2bc5993ecebb3b
--- /dev/null
+++ b/checkpoint-150/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97264f01540b1ad5acd25f27b627a7352dbda77c960c2b3c7b157d05035d6ac6
+size 50503848
diff --git a/checkpoint-150/optimizer.pt b/checkpoint-150/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2ffc9c31e4e29a21f02c95e78b174839a460cb94
--- /dev/null
+++ b/checkpoint-150/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24e5de270e966edc3891231b22ee3b34b5d5573183750ce1a8ecca10a2b62423
+size 202035450
diff --git a/checkpoint-150/rng_state.pth b/checkpoint-150/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8d31ebd7d51189a81569a9786ce90149798f188f
--- /dev/null
+++ b/checkpoint-150/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3129c63169712c776c1e0e28d8711e276143acd2c2f061fb6eb052c04856ba72
+size 14244
diff --git a/checkpoint-150/scheduler.pt b/checkpoint-150/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5d540db88f12afedb3a1b7ff4c08ac14c3431f65
--- /dev/null
+++ b/checkpoint-150/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd5d42bb0afda20ec4c83d38c6af1131541c335ecab229c74e7f418894f3c13b
+size 1064
diff --git a/checkpoint-150/special_tokens_map.json b/checkpoint-150/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-150/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-150/tokenizer.model b/checkpoint-150/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-150/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-150/tokenizer_config.json b/checkpoint-150/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-150/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-150/trainer_state.json b/checkpoint-150/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b6a9be3ad59ba2c1d0d8712719a791499ffb1ec
--- /dev/null
+++ b/checkpoint-150/trainer_state.json
@@ -0,0 +1,1179 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.8316831683168315,
+ "eval_steps": 13,
+ "global_step": 150,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.019801980198019802,
+ "grad_norm": 1.15625,
+ "learning_rate": 2e-05,
+ "loss": 2.0919,
+ "step": 1
+ },
+ {
+ "epoch": 0.019801980198019802,
+ "eval_loss": 2.079954147338867,
+ "eval_runtime": 13.8908,
+ "eval_samples_per_second": 8.999,
+ "eval_steps_per_second": 4.535,
+ "step": 1
+ },
+ {
+ "epoch": 0.039603960396039604,
+ "grad_norm": 1.203125,
+ "learning_rate": 4e-05,
+ "loss": 2.0814,
+ "step": 2
+ },
+ {
+ "epoch": 0.0594059405940594,
+ "grad_norm": 1.1953125,
+ "learning_rate": 6e-05,
+ "loss": 2.0499,
+ "step": 3
+ },
+ {
+ "epoch": 0.07920792079207921,
+ "grad_norm": 1.0859375,
+ "learning_rate": 8e-05,
+ "loss": 2.0153,
+ "step": 4
+ },
+ {
+ "epoch": 0.09900990099009901,
+ "grad_norm": 1.0390625,
+ "learning_rate": 0.0001,
+ "loss": 1.9548,
+ "step": 5
+ },
+ {
+ "epoch": 0.1188118811881188,
+ "grad_norm": 0.89453125,
+ "learning_rate": 0.00012,
+ "loss": 1.8982,
+ "step": 6
+ },
+ {
+ "epoch": 0.13861386138613863,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00014,
+ "loss": 1.8226,
+ "step": 7
+ },
+ {
+ "epoch": 0.15841584158415842,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.00016,
+ "loss": 1.7572,
+ "step": 8
+ },
+ {
+ "epoch": 0.1782178217821782,
+ "grad_norm": 0.78515625,
+ "learning_rate": 0.00018,
+ "loss": 1.7074,
+ "step": 9
+ },
+ {
+ "epoch": 0.19801980198019803,
+ "grad_norm": 0.73828125,
+ "learning_rate": 0.0002,
+ "loss": 1.6317,
+ "step": 10
+ },
+ {
+ "epoch": 0.21782178217821782,
+ "grad_norm": 0.484375,
+ "learning_rate": 0.0001999863304992469,
+ "loss": 1.5801,
+ "step": 11
+ },
+ {
+ "epoch": 0.2376237623762376,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019994532573409262,
+ "loss": 1.5721,
+ "step": 12
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "grad_norm": 0.6953125,
+ "learning_rate": 0.00019987699691483048,
+ "loss": 1.5479,
+ "step": 13
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "eval_loss": 1.5341482162475586,
+ "eval_runtime": 13.8795,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 13
+ },
+ {
+ "epoch": 0.27722772277227725,
+ "grad_norm": 0.65234375,
+ "learning_rate": 0.00019978136272187747,
+ "loss": 1.534,
+ "step": 14
+ },
+ {
+ "epoch": 0.297029702970297,
+ "grad_norm": 0.515625,
+ "learning_rate": 0.000199658449300667,
+ "loss": 1.4804,
+ "step": 15
+ },
+ {
+ "epoch": 0.31683168316831684,
+ "grad_norm": 0.439453125,
+ "learning_rate": 0.00019950829025450114,
+ "loss": 1.4805,
+ "step": 16
+ },
+ {
+ "epoch": 0.33663366336633666,
+ "grad_norm": 0.361328125,
+ "learning_rate": 0.00019933092663536382,
+ "loss": 1.3809,
+ "step": 17
+ },
+ {
+ "epoch": 0.3564356435643564,
+ "grad_norm": 0.3125,
+ "learning_rate": 0.00019912640693269752,
+ "loss": 1.3837,
+ "step": 18
+ },
+ {
+ "epoch": 0.37623762376237624,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00019889478706014687,
+ "loss": 1.3673,
+ "step": 19
+ },
+ {
+ "epoch": 0.39603960396039606,
+ "grad_norm": 0.298828125,
+ "learning_rate": 0.00019863613034027224,
+ "loss": 1.366,
+ "step": 20
+ },
+ {
+ "epoch": 0.4158415841584158,
+ "grad_norm": 0.34375,
+ "learning_rate": 0.00019835050748723824,
+ "loss": 1.3318,
+ "step": 21
+ },
+ {
+ "epoch": 0.43564356435643564,
+ "grad_norm": 0.341796875,
+ "learning_rate": 0.00019803799658748094,
+ "loss": 1.2741,
+ "step": 22
+ },
+ {
+ "epoch": 0.45544554455445546,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019769868307835994,
+ "loss": 1.2978,
+ "step": 23
+ },
+ {
+ "epoch": 0.4752475247524752,
+ "grad_norm": 0.291015625,
+ "learning_rate": 0.0001973326597248006,
+ "loss": 1.2733,
+ "step": 24
+ },
+ {
+ "epoch": 0.49504950495049505,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.00019694002659393305,
+ "loss": 1.2302,
+ "step": 25
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "grad_norm": 0.318359375,
+ "learning_rate": 0.00019652089102773488,
+ "loss": 1.2083,
+ "step": 26
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "eval_loss": 1.224540114402771,
+ "eval_runtime": 13.8695,
+ "eval_samples_per_second": 9.013,
+ "eval_steps_per_second": 4.542,
+ "step": 26
+ },
+ {
+ "epoch": 0.5346534653465347,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.00019607536761368484,
+ "loss": 1.1761,
+ "step": 27
+ },
+ {
+ "epoch": 0.5544554455445545,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.00019560357815343577,
+ "loss": 1.1751,
+ "step": 28
+ },
+ {
+ "epoch": 0.5742574257425742,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 1.2002,
+ "step": 29
+ },
+ {
+ "epoch": 0.594059405940594,
+ "grad_norm": 0.287109375,
+ "learning_rate": 0.00019458172417006347,
+ "loss": 1.1544,
+ "step": 30
+ },
+ {
+ "epoch": 0.6138613861386139,
+ "grad_norm": 0.365234375,
+ "learning_rate": 0.00019403193901161613,
+ "loss": 1.1384,
+ "step": 31
+ },
+ {
+ "epoch": 0.6336633663366337,
+ "grad_norm": 0.236328125,
+ "learning_rate": 0.0001934564464599461,
+ "loss": 1.0999,
+ "step": 32
+ },
+ {
+ "epoch": 0.6534653465346535,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019285540384897073,
+ "loss": 1.1576,
+ "step": 33
+ },
+ {
+ "epoch": 0.6732673267326733,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019222897549773848,
+ "loss": 1.091,
+ "step": 34
+ },
+ {
+ "epoch": 0.693069306930693,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00019157733266550575,
+ "loss": 1.056,
+ "step": 35
+ },
+ {
+ "epoch": 0.7128712871287128,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.00019090065350491626,
+ "loss": 1.1068,
+ "step": 36
+ },
+ {
+ "epoch": 0.7326732673267327,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00019019912301329592,
+ "loss": 1.0583,
+ "step": 37
+ },
+ {
+ "epoch": 0.7524752475247525,
+ "grad_norm": 0.2734375,
+ "learning_rate": 0.00018947293298207635,
+ "loss": 1.0671,
+ "step": 38
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001887222819443612,
+ "loss": 1.0851,
+ "step": 39
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "eval_loss": 1.060703158378601,
+ "eval_runtime": 13.878,
+ "eval_samples_per_second": 9.007,
+ "eval_steps_per_second": 4.54,
+ "step": 39
+ },
+ {
+ "epoch": 0.7920792079207921,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001879473751206489,
+ "loss": 1.0343,
+ "step": 40
+ },
+ {
+ "epoch": 0.8118811881188119,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.00018714842436272773,
+ "loss": 0.9789,
+ "step": 41
+ },
+ {
+ "epoch": 0.8316831683168316,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00018632564809575742,
+ "loss": 1.0174,
+ "step": 42
+ },
+ {
+ "epoch": 0.8514851485148515,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0001854792712585539,
+ "loss": 1.0004,
+ "step": 43
+ },
+ {
+ "epoch": 0.8712871287128713,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00018460952524209355,
+ "loss": 1.0281,
+ "step": 44
+ },
+ {
+ "epoch": 0.8910891089108911,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.00018371664782625287,
+ "loss": 0.9992,
+ "step": 45
+ },
+ {
+ "epoch": 0.9108910891089109,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00018280088311480201,
+ "loss": 0.9635,
+ "step": 46
+ },
+ {
+ "epoch": 0.9306930693069307,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00018186248146866927,
+ "loss": 1.006,
+ "step": 47
+ },
+ {
+ "epoch": 0.9504950495049505,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.9891,
+ "step": 48
+ },
+ {
+ "epoch": 0.9702970297029703,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.0001799187996894925,
+ "loss": 0.9809,
+ "step": 49
+ },
+ {
+ "epoch": 0.9900990099009901,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.00017891405093963938,
+ "loss": 0.9646,
+ "step": 50
+ },
+ {
+ "epoch": 1.00990099009901,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00017788772787621126,
+ "loss": 0.9553,
+ "step": 51
+ },
+ {
+ "epoch": 1.0297029702970297,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00017684011108568592,
+ "loss": 0.9432,
+ "step": 52
+ },
+ {
+ "epoch": 1.0297029702970297,
+ "eval_loss": 0.9755253195762634,
+ "eval_runtime": 13.879,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 52
+ },
+ {
+ "epoch": 1.0495049504950495,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0001757714869760335,
+ "loss": 0.9631,
+ "step": 53
+ },
+ {
+ "epoch": 1.0693069306930694,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.0001746821476984154,
+ "loss": 0.9539,
+ "step": 54
+ },
+ {
+ "epoch": 1.0198019801980198,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.00017357239106731317,
+ "loss": 0.9559,
+ "step": 55
+ },
+ {
+ "epoch": 1.0396039603960396,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00017244252047910892,
+ "loss": 0.9111,
+ "step": 56
+ },
+ {
+ "epoch": 1.0594059405940595,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.00017129284482913972,
+ "loss": 0.9503,
+ "step": 57
+ },
+ {
+ "epoch": 1.0792079207920793,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.00017012367842724887,
+ "loss": 0.911,
+ "step": 58
+ },
+ {
+ "epoch": 1.099009900990099,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.0001689353409118566,
+ "loss": 0.9041,
+ "step": 59
+ },
+ {
+ "epoch": 1.118811881188119,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.00016772815716257412,
+ "loss": 0.9117,
+ "step": 60
+ },
+ {
+ "epoch": 1.1386138613861387,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0001665024572113848,
+ "loss": 0.9351,
+ "step": 61
+ },
+ {
+ "epoch": 1.1584158415841583,
+ "grad_norm": 0.251953125,
+ "learning_rate": 0.00016525857615241687,
+ "loss": 0.9438,
+ "step": 62
+ },
+ {
+ "epoch": 1.1782178217821782,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00016399685405033167,
+ "loss": 0.9075,
+ "step": 63
+ },
+ {
+ "epoch": 1.198019801980198,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001627176358473537,
+ "loss": 0.8983,
+ "step": 64
+ },
+ {
+ "epoch": 1.2178217821782178,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0001614212712689668,
+ "loss": 0.9007,
+ "step": 65
+ },
+ {
+ "epoch": 1.2178217821782178,
+ "eval_loss": 0.9333999156951904,
+ "eval_runtime": 13.8668,
+ "eval_samples_per_second": 9.014,
+ "eval_steps_per_second": 4.543,
+ "step": 65
+ },
+ {
+ "epoch": 1.2376237623762376,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00016010811472830252,
+ "loss": 0.9108,
+ "step": 66
+ },
+ {
+ "epoch": 1.2574257425742574,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.9177,
+ "step": 67
+ },
+ {
+ "epoch": 1.2772277227722773,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.00015743286626829437,
+ "loss": 0.9,
+ "step": 68
+ },
+ {
+ "epoch": 1.297029702970297,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.0001560715057351673,
+ "loss": 0.9096,
+ "step": 69
+ },
+ {
+ "epoch": 1.316831683168317,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.00015469481581224272,
+ "loss": 0.8946,
+ "step": 70
+ },
+ {
+ "epoch": 1.3366336633663367,
+ "grad_norm": 0.31640625,
+ "learning_rate": 0.0001533031728727994,
+ "loss": 0.8995,
+ "step": 71
+ },
+ {
+ "epoch": 1.3564356435643563,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.00015189695737812152,
+ "loss": 0.922,
+ "step": 72
+ },
+ {
+ "epoch": 1.3762376237623761,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001504765537734844,
+ "loss": 0.885,
+ "step": 73
+ },
+ {
+ "epoch": 1.396039603960396,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00014904235038305083,
+ "loss": 0.895,
+ "step": 74
+ },
+ {
+ "epoch": 1.4158415841584158,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00014759473930370736,
+ "loss": 0.892,
+ "step": 75
+ },
+ {
+ "epoch": 1.4356435643564356,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0001461341162978688,
+ "loss": 0.8277,
+ "step": 76
+ },
+ {
+ "epoch": 1.4554455445544554,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.00014466088068528068,
+ "loss": 0.8687,
+ "step": 77
+ },
+ {
+ "epoch": 1.4752475247524752,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00014317543523384928,
+ "loss": 0.8765,
+ "step": 78
+ },
+ {
+ "epoch": 1.4752475247524752,
+ "eval_loss": 0.9083698391914368,
+ "eval_runtime": 13.8834,
+ "eval_samples_per_second": 9.004,
+ "eval_steps_per_second": 4.538,
+ "step": 78
+ },
+ {
+ "epoch": 1.495049504950495,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00014167818604952906,
+ "loss": 0.8797,
+ "step": 79
+ },
+ {
+ "epoch": 1.5148514851485149,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 0.00014016954246529696,
+ "loss": 0.905,
+ "step": 80
+ },
+ {
+ "epoch": 1.5346534653465347,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.00013864991692924523,
+ "loss": 0.8575,
+ "step": 81
+ },
+ {
+ "epoch": 1.5544554455445545,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00013711972489182208,
+ "loss": 0.8957,
+ "step": 82
+ },
+ {
+ "epoch": 1.5742574257425743,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.00013557938469225167,
+ "loss": 0.8792,
+ "step": 83
+ },
+ {
+ "epoch": 1.5940594059405941,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.00013402931744416433,
+ "loss": 0.889,
+ "step": 84
+ },
+ {
+ "epoch": 1.613861386138614,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00013246994692046836,
+ "loss": 0.8657,
+ "step": 85
+ },
+ {
+ "epoch": 1.6336633663366338,
+ "grad_norm": 0.20703125,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.8784,
+ "step": 86
+ },
+ {
+ "epoch": 1.6534653465346536,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.0001293250037384465,
+ "loss": 0.8822,
+ "step": 87
+ },
+ {
+ "epoch": 1.6732673267326734,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.00012774029087618446,
+ "loss": 0.9092,
+ "step": 88
+ },
+ {
+ "epoch": 1.693069306930693,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.00012614799409538198,
+ "loss": 0.8813,
+ "step": 89
+ },
+ {
+ "epoch": 1.7128712871287128,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00012454854871407994,
+ "loss": 0.8975,
+ "step": 90
+ },
+ {
+ "epoch": 1.7326732673267327,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.00012294239200467516,
+ "loss": 0.8789,
+ "step": 91
+ },
+ {
+ "epoch": 1.7326732673267327,
+ "eval_loss": 0.8891416788101196,
+ "eval_runtime": 13.872,
+ "eval_samples_per_second": 9.011,
+ "eval_steps_per_second": 4.542,
+ "step": 91
+ },
+ {
+ "epoch": 1.7524752475247525,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.0001213299630743747,
+ "loss": 0.9184,
+ "step": 92
+ },
+ {
+ "epoch": 1.7722772277227723,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00011971170274514802,
+ "loss": 0.8854,
+ "step": 93
+ },
+ {
+ "epoch": 1.7920792079207921,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000118088053433211,
+ "loss": 0.8688,
+ "step": 94
+ },
+ {
+ "epoch": 1.811881188118812,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.00011645945902807341,
+ "loss": 0.8281,
+ "step": 95
+ },
+ {
+ "epoch": 1.8316831683168315,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0001148263647711842,
+ "loss": 0.8488,
+ "step": 96
+ },
+ {
+ "epoch": 1.8514851485148514,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00011318921713420691,
+ "loss": 0.8742,
+ "step": 97
+ },
+ {
+ "epoch": 1.8712871287128712,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00011154846369695863,
+ "loss": 0.8586,
+ "step": 98
+ },
+ {
+ "epoch": 1.891089108910891,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.0001099045530250463,
+ "loss": 0.8776,
+ "step": 99
+ },
+ {
+ "epoch": 1.9108910891089108,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.00010825793454723325,
+ "loss": 0.8563,
+ "step": 100
+ },
+ {
+ "epoch": 1.9306930693069306,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00010660905843256994,
+ "loss": 0.8381,
+ "step": 101
+ },
+ {
+ "epoch": 1.9504950495049505,
+ "grad_norm": 0.201171875,
+ "learning_rate": 0.00010495837546732224,
+ "loss": 0.847,
+ "step": 102
+ },
+ {
+ "epoch": 1.9702970297029703,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.00010330633693173082,
+ "loss": 0.8512,
+ "step": 103
+ },
+ {
+ "epoch": 1.99009900990099,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00010165339447663587,
+ "loss": 0.8304,
+ "step": 104
+ },
+ {
+ "epoch": 1.99009900990099,
+ "eval_loss": 0.8779018521308899,
+ "eval_runtime": 13.8827,
+ "eval_samples_per_second": 9.004,
+ "eval_steps_per_second": 4.538,
+ "step": 104
+ },
+ {
+ "epoch": 2.00990099009901,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0001,
+ "loss": 0.8523,
+ "step": 105
+ },
+ {
+ "epoch": 2.0297029702970297,
+ "grad_norm": 0.2392578125,
+ "learning_rate": 9.834660552336415e-05,
+ "loss": 0.8109,
+ "step": 106
+ },
+ {
+ "epoch": 2.0495049504950495,
+ "grad_norm": 0.224609375,
+ "learning_rate": 9.669366306826919e-05,
+ "loss": 0.8394,
+ "step": 107
+ },
+ {
+ "epoch": 2.0693069306930694,
+ "grad_norm": 0.283203125,
+ "learning_rate": 9.504162453267777e-05,
+ "loss": 0.8524,
+ "step": 108
+ },
+ {
+ "epoch": 2.01980198019802,
+ "grad_norm": 0.22265625,
+ "learning_rate": 9.339094156743007e-05,
+ "loss": 0.8391,
+ "step": 109
+ },
+ {
+ "epoch": 2.0396039603960396,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 9.174206545276677e-05,
+ "loss": 0.8317,
+ "step": 110
+ },
+ {
+ "epoch": 2.0594059405940595,
+ "grad_norm": 0.22265625,
+ "learning_rate": 9.009544697495374e-05,
+ "loss": 0.833,
+ "step": 111
+ },
+ {
+ "epoch": 2.0792079207920793,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 8.845153630304139e-05,
+ "loss": 0.8408,
+ "step": 112
+ },
+ {
+ "epoch": 2.099009900990099,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 8.681078286579311e-05,
+ "loss": 0.8459,
+ "step": 113
+ },
+ {
+ "epoch": 2.118811881188119,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 8.517363522881579e-05,
+ "loss": 0.8177,
+ "step": 114
+ },
+ {
+ "epoch": 2.1386138613861387,
+ "grad_norm": 0.2265625,
+ "learning_rate": 8.35405409719266e-05,
+ "loss": 0.8451,
+ "step": 115
+ },
+ {
+ "epoch": 2.1584158415841586,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 8.191194656678904e-05,
+ "loss": 0.8543,
+ "step": 116
+ },
+ {
+ "epoch": 2.1782178217821784,
+ "grad_norm": 0.22265625,
+ "learning_rate": 8.028829725485199e-05,
+ "loss": 0.8194,
+ "step": 117
+ },
+ {
+ "epoch": 2.1782178217821784,
+ "eval_loss": 0.8713971972465515,
+ "eval_runtime": 13.8976,
+ "eval_samples_per_second": 8.994,
+ "eval_steps_per_second": 4.533,
+ "step": 117
+ },
+ {
+ "epoch": 2.198019801980198,
+ "grad_norm": 0.2333984375,
+ "learning_rate": 7.867003692562534e-05,
+ "loss": 0.808,
+ "step": 118
+ },
+ {
+ "epoch": 2.217821782178218,
+ "grad_norm": 0.2470703125,
+ "learning_rate": 7.705760799532485e-05,
+ "loss": 0.8073,
+ "step": 119
+ },
+ {
+ "epoch": 2.237623762376238,
+ "grad_norm": 0.201171875,
+ "learning_rate": 7.54514512859201e-05,
+ "loss": 0.8392,
+ "step": 120
+ },
+ {
+ "epoch": 2.2574257425742577,
+ "grad_norm": 0.25,
+ "learning_rate": 7.385200590461803e-05,
+ "loss": 0.8574,
+ "step": 121
+ },
+ {
+ "epoch": 2.2772277227722775,
+ "grad_norm": 0.271484375,
+ "learning_rate": 7.225970912381556e-05,
+ "loss": 0.8338,
+ "step": 122
+ },
+ {
+ "epoch": 2.297029702970297,
+ "grad_norm": 0.294921875,
+ "learning_rate": 7.067499626155354e-05,
+ "loss": 0.8788,
+ "step": 123
+ },
+ {
+ "epoch": 2.3168316831683167,
+ "grad_norm": 0.2265625,
+ "learning_rate": 6.909830056250527e-05,
+ "loss": 0.8297,
+ "step": 124
+ },
+ {
+ "epoch": 2.3366336633663365,
+ "grad_norm": 0.267578125,
+ "learning_rate": 6.753005307953167e-05,
+ "loss": 0.8125,
+ "step": 125
+ },
+ {
+ "epoch": 2.3564356435643563,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 6.59706825558357e-05,
+ "loss": 0.814,
+ "step": 126
+ },
+ {
+ "epoch": 2.376237623762376,
+ "grad_norm": 0.27734375,
+ "learning_rate": 6.442061530774834e-05,
+ "loss": 0.8335,
+ "step": 127
+ },
+ {
+ "epoch": 2.396039603960396,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 6.28802751081779e-05,
+ "loss": 0.8512,
+ "step": 128
+ },
+ {
+ "epoch": 2.4158415841584158,
+ "grad_norm": 0.224609375,
+ "learning_rate": 6.135008307075481e-05,
+ "loss": 0.8297,
+ "step": 129
+ },
+ {
+ "epoch": 2.4356435643564356,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 5.983045753470308e-05,
+ "loss": 0.848,
+ "step": 130
+ },
+ {
+ "epoch": 2.4356435643564356,
+ "eval_loss": 0.8665071129798889,
+ "eval_runtime": 13.8735,
+ "eval_samples_per_second": 9.01,
+ "eval_steps_per_second": 4.541,
+ "step": 130
+ },
+ {
+ "epoch": 2.4554455445544554,
+ "grad_norm": 0.2265625,
+ "learning_rate": 5.832181395047098e-05,
+ "loss": 0.8203,
+ "step": 131
+ },
+ {
+ "epoch": 2.4752475247524752,
+ "grad_norm": 0.287109375,
+ "learning_rate": 5.6824564766150726e-05,
+ "loss": 0.8519,
+ "step": 132
+ },
+ {
+ "epoch": 2.495049504950495,
+ "grad_norm": 0.21484375,
+ "learning_rate": 5.533911931471936e-05,
+ "loss": 0.83,
+ "step": 133
+ },
+ {
+ "epoch": 2.514851485148515,
+ "grad_norm": 0.2109375,
+ "learning_rate": 5.386588370213124e-05,
+ "loss": 0.842,
+ "step": 134
+ },
+ {
+ "epoch": 2.5346534653465347,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 5.240526069629265e-05,
+ "loss": 0.8419,
+ "step": 135
+ },
+ {
+ "epoch": 2.5544554455445545,
+ "grad_norm": 0.267578125,
+ "learning_rate": 5.095764961694922e-05,
+ "loss": 0.8458,
+ "step": 136
+ },
+ {
+ "epoch": 2.5742574257425743,
+ "grad_norm": 0.203125,
+ "learning_rate": 4.952344622651566e-05,
+ "loss": 0.8133,
+ "step": 137
+ },
+ {
+ "epoch": 2.594059405940594,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 4.810304262187852e-05,
+ "loss": 0.8103,
+ "step": 138
+ },
+ {
+ "epoch": 2.613861386138614,
+ "grad_norm": 0.20703125,
+ "learning_rate": 4.669682712720065e-05,
+ "loss": 0.8105,
+ "step": 139
+ },
+ {
+ "epoch": 2.633663366336634,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 4.530518418775733e-05,
+ "loss": 0.8305,
+ "step": 140
+ },
+ {
+ "epoch": 2.6534653465346536,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 4.392849426483274e-05,
+ "loss": 0.7881,
+ "step": 141
+ },
+ {
+ "epoch": 2.6732673267326734,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 4.256713373170564e-05,
+ "loss": 0.8204,
+ "step": 142
+ },
+ {
+ "epoch": 2.693069306930693,
+ "grad_norm": 0.263671875,
+ "learning_rate": 4.12214747707527e-05,
+ "loss": 0.8354,
+ "step": 143
+ },
+ {
+ "epoch": 2.693069306930693,
+ "eval_loss": 0.8626759648323059,
+ "eval_runtime": 13.8585,
+ "eval_samples_per_second": 9.02,
+ "eval_steps_per_second": 4.546,
+ "step": 143
+ },
+ {
+ "epoch": 2.7128712871287126,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 3.9891885271697496e-05,
+ "loss": 0.8441,
+ "step": 144
+ },
+ {
+ "epoch": 2.7326732673267324,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 3.857872873103322e-05,
+ "loss": 0.8084,
+ "step": 145
+ },
+ {
+ "epoch": 2.7524752475247523,
+ "grad_norm": 0.18359375,
+ "learning_rate": 3.7282364152646297e-05,
+ "loss": 0.8184,
+ "step": 146
+ },
+ {
+ "epoch": 2.772277227722772,
+ "grad_norm": 0.1904296875,
+ "learning_rate": 3.600314594966834e-05,
+ "loss": 0.8302,
+ "step": 147
+ },
+ {
+ "epoch": 2.792079207920792,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 3.4741423847583134e-05,
+ "loss": 0.8503,
+ "step": 148
+ },
+ {
+ "epoch": 2.8118811881188117,
+ "grad_norm": 0.2265625,
+ "learning_rate": 3.349754278861517e-05,
+ "loss": 0.8273,
+ "step": 149
+ },
+ {
+ "epoch": 2.8316831683168315,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 3.227184283742591e-05,
+ "loss": 0.8332,
+ "step": 150
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 200,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 50,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.1227070440800256e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-150/training_args.bin b/checkpoint-150/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30
--- /dev/null
+++ b/checkpoint-150/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944
diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978
--- /dev/null
+++ b/checkpoint-200/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891
--- /dev/null
+++ b/checkpoint-200/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "o_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4181a1133aef7fd8af624e49e7f253a4b6ae062e
--- /dev/null
+++ b/checkpoint-200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:800103a9f27876d14f8e9f0fb64fb81af3a478d54bbaea5587ecbd0592ad4142
+size 50503848
diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c554fca617de0749a94a47996d59055a173c0d58
--- /dev/null
+++ b/checkpoint-200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dafd7ff9d5c3c564b22c4a0593f1078a408837f37261ad73caf0c7e062c6a39
+size 202035450
diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..436017979432f2d4cfade120bb74af668a9f1bfc
--- /dev/null
+++ b/checkpoint-200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ca197c3706eaaadf2931079a5ebf26b215b3f60f60a6755cc111301c7ac7f6
+size 14244
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..53f4138ea76495c65f762a13321851ab341abfff
--- /dev/null
+++ b/checkpoint-200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca9a25c72339c898b564e0c464a3f6fc75bbeec408008928b7ed05533156b98c
+size 1064
diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-200/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-200/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-200/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2731e3b879a62aa4da83438163cd54e6b6e7b12e
--- /dev/null
+++ b/checkpoint-200/trainer_state.json
@@ -0,0 +1,1561 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.7524752475247523,
+ "eval_steps": 13,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.019801980198019802,
+ "grad_norm": 1.15625,
+ "learning_rate": 2e-05,
+ "loss": 2.0919,
+ "step": 1
+ },
+ {
+ "epoch": 0.019801980198019802,
+ "eval_loss": 2.079954147338867,
+ "eval_runtime": 13.8908,
+ "eval_samples_per_second": 8.999,
+ "eval_steps_per_second": 4.535,
+ "step": 1
+ },
+ {
+ "epoch": 0.039603960396039604,
+ "grad_norm": 1.203125,
+ "learning_rate": 4e-05,
+ "loss": 2.0814,
+ "step": 2
+ },
+ {
+ "epoch": 0.0594059405940594,
+ "grad_norm": 1.1953125,
+ "learning_rate": 6e-05,
+ "loss": 2.0499,
+ "step": 3
+ },
+ {
+ "epoch": 0.07920792079207921,
+ "grad_norm": 1.0859375,
+ "learning_rate": 8e-05,
+ "loss": 2.0153,
+ "step": 4
+ },
+ {
+ "epoch": 0.09900990099009901,
+ "grad_norm": 1.0390625,
+ "learning_rate": 0.0001,
+ "loss": 1.9548,
+ "step": 5
+ },
+ {
+ "epoch": 0.1188118811881188,
+ "grad_norm": 0.89453125,
+ "learning_rate": 0.00012,
+ "loss": 1.8982,
+ "step": 6
+ },
+ {
+ "epoch": 0.13861386138613863,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00014,
+ "loss": 1.8226,
+ "step": 7
+ },
+ {
+ "epoch": 0.15841584158415842,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.00016,
+ "loss": 1.7572,
+ "step": 8
+ },
+ {
+ "epoch": 0.1782178217821782,
+ "grad_norm": 0.78515625,
+ "learning_rate": 0.00018,
+ "loss": 1.7074,
+ "step": 9
+ },
+ {
+ "epoch": 0.19801980198019803,
+ "grad_norm": 0.73828125,
+ "learning_rate": 0.0002,
+ "loss": 1.6317,
+ "step": 10
+ },
+ {
+ "epoch": 0.21782178217821782,
+ "grad_norm": 0.484375,
+ "learning_rate": 0.0001999863304992469,
+ "loss": 1.5801,
+ "step": 11
+ },
+ {
+ "epoch": 0.2376237623762376,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019994532573409262,
+ "loss": 1.5721,
+ "step": 12
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "grad_norm": 0.6953125,
+ "learning_rate": 0.00019987699691483048,
+ "loss": 1.5479,
+ "step": 13
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "eval_loss": 1.5341482162475586,
+ "eval_runtime": 13.8795,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 13
+ },
+ {
+ "epoch": 0.27722772277227725,
+ "grad_norm": 0.65234375,
+ "learning_rate": 0.00019978136272187747,
+ "loss": 1.534,
+ "step": 14
+ },
+ {
+ "epoch": 0.297029702970297,
+ "grad_norm": 0.515625,
+ "learning_rate": 0.000199658449300667,
+ "loss": 1.4804,
+ "step": 15
+ },
+ {
+ "epoch": 0.31683168316831684,
+ "grad_norm": 0.439453125,
+ "learning_rate": 0.00019950829025450114,
+ "loss": 1.4805,
+ "step": 16
+ },
+ {
+ "epoch": 0.33663366336633666,
+ "grad_norm": 0.361328125,
+ "learning_rate": 0.00019933092663536382,
+ "loss": 1.3809,
+ "step": 17
+ },
+ {
+ "epoch": 0.3564356435643564,
+ "grad_norm": 0.3125,
+ "learning_rate": 0.00019912640693269752,
+ "loss": 1.3837,
+ "step": 18
+ },
+ {
+ "epoch": 0.37623762376237624,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00019889478706014687,
+ "loss": 1.3673,
+ "step": 19
+ },
+ {
+ "epoch": 0.39603960396039606,
+ "grad_norm": 0.298828125,
+ "learning_rate": 0.00019863613034027224,
+ "loss": 1.366,
+ "step": 20
+ },
+ {
+ "epoch": 0.4158415841584158,
+ "grad_norm": 0.34375,
+ "learning_rate": 0.00019835050748723824,
+ "loss": 1.3318,
+ "step": 21
+ },
+ {
+ "epoch": 0.43564356435643564,
+ "grad_norm": 0.341796875,
+ "learning_rate": 0.00019803799658748094,
+ "loss": 1.2741,
+ "step": 22
+ },
+ {
+ "epoch": 0.45544554455445546,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019769868307835994,
+ "loss": 1.2978,
+ "step": 23
+ },
+ {
+ "epoch": 0.4752475247524752,
+ "grad_norm": 0.291015625,
+ "learning_rate": 0.0001973326597248006,
+ "loss": 1.2733,
+ "step": 24
+ },
+ {
+ "epoch": 0.49504950495049505,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.00019694002659393305,
+ "loss": 1.2302,
+ "step": 25
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "grad_norm": 0.318359375,
+ "learning_rate": 0.00019652089102773488,
+ "loss": 1.2083,
+ "step": 26
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "eval_loss": 1.224540114402771,
+ "eval_runtime": 13.8695,
+ "eval_samples_per_second": 9.013,
+ "eval_steps_per_second": 4.542,
+ "step": 26
+ },
+ {
+ "epoch": 0.5346534653465347,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.00019607536761368484,
+ "loss": 1.1761,
+ "step": 27
+ },
+ {
+ "epoch": 0.5544554455445545,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.00019560357815343577,
+ "loss": 1.1751,
+ "step": 28
+ },
+ {
+ "epoch": 0.5742574257425742,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 1.2002,
+ "step": 29
+ },
+ {
+ "epoch": 0.594059405940594,
+ "grad_norm": 0.287109375,
+ "learning_rate": 0.00019458172417006347,
+ "loss": 1.1544,
+ "step": 30
+ },
+ {
+ "epoch": 0.6138613861386139,
+ "grad_norm": 0.365234375,
+ "learning_rate": 0.00019403193901161613,
+ "loss": 1.1384,
+ "step": 31
+ },
+ {
+ "epoch": 0.6336633663366337,
+ "grad_norm": 0.236328125,
+ "learning_rate": 0.0001934564464599461,
+ "loss": 1.0999,
+ "step": 32
+ },
+ {
+ "epoch": 0.6534653465346535,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019285540384897073,
+ "loss": 1.1576,
+ "step": 33
+ },
+ {
+ "epoch": 0.6732673267326733,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019222897549773848,
+ "loss": 1.091,
+ "step": 34
+ },
+ {
+ "epoch": 0.693069306930693,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00019157733266550575,
+ "loss": 1.056,
+ "step": 35
+ },
+ {
+ "epoch": 0.7128712871287128,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.00019090065350491626,
+ "loss": 1.1068,
+ "step": 36
+ },
+ {
+ "epoch": 0.7326732673267327,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00019019912301329592,
+ "loss": 1.0583,
+ "step": 37
+ },
+ {
+ "epoch": 0.7524752475247525,
+ "grad_norm": 0.2734375,
+ "learning_rate": 0.00018947293298207635,
+ "loss": 1.0671,
+ "step": 38
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001887222819443612,
+ "loss": 1.0851,
+ "step": 39
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "eval_loss": 1.060703158378601,
+ "eval_runtime": 13.878,
+ "eval_samples_per_second": 9.007,
+ "eval_steps_per_second": 4.54,
+ "step": 39
+ },
+ {
+ "epoch": 0.7920792079207921,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001879473751206489,
+ "loss": 1.0343,
+ "step": 40
+ },
+ {
+ "epoch": 0.8118811881188119,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.00018714842436272773,
+ "loss": 0.9789,
+ "step": 41
+ },
+ {
+ "epoch": 0.8316831683168316,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00018632564809575742,
+ "loss": 1.0174,
+ "step": 42
+ },
+ {
+ "epoch": 0.8514851485148515,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0001854792712585539,
+ "loss": 1.0004,
+ "step": 43
+ },
+ {
+ "epoch": 0.8712871287128713,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00018460952524209355,
+ "loss": 1.0281,
+ "step": 44
+ },
+ {
+ "epoch": 0.8910891089108911,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.00018371664782625287,
+ "loss": 0.9992,
+ "step": 45
+ },
+ {
+ "epoch": 0.9108910891089109,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00018280088311480201,
+ "loss": 0.9635,
+ "step": 46
+ },
+ {
+ "epoch": 0.9306930693069307,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00018186248146866927,
+ "loss": 1.006,
+ "step": 47
+ },
+ {
+ "epoch": 0.9504950495049505,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.9891,
+ "step": 48
+ },
+ {
+ "epoch": 0.9702970297029703,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.0001799187996894925,
+ "loss": 0.9809,
+ "step": 49
+ },
+ {
+ "epoch": 0.9900990099009901,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.00017891405093963938,
+ "loss": 0.9646,
+ "step": 50
+ },
+ {
+ "epoch": 1.00990099009901,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00017788772787621126,
+ "loss": 0.9553,
+ "step": 51
+ },
+ {
+ "epoch": 1.0297029702970297,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00017684011108568592,
+ "loss": 0.9432,
+ "step": 52
+ },
+ {
+ "epoch": 1.0297029702970297,
+ "eval_loss": 0.9755253195762634,
+ "eval_runtime": 13.879,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 52
+ },
+ {
+ "epoch": 1.0495049504950495,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0001757714869760335,
+ "loss": 0.9631,
+ "step": 53
+ },
+ {
+ "epoch": 1.0693069306930694,
+ "grad_norm": 0.3046875,
+ "learning_rate": 0.0001746821476984154,
+ "loss": 0.9539,
+ "step": 54
+ },
+ {
+ "epoch": 1.0198019801980198,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.00017357239106731317,
+ "loss": 0.9559,
+ "step": 55
+ },
+ {
+ "epoch": 1.0396039603960396,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00017244252047910892,
+ "loss": 0.9111,
+ "step": 56
+ },
+ {
+ "epoch": 1.0594059405940595,
+ "grad_norm": 0.30859375,
+ "learning_rate": 0.00017129284482913972,
+ "loss": 0.9503,
+ "step": 57
+ },
+ {
+ "epoch": 1.0792079207920793,
+ "grad_norm": 0.2265625,
+ "learning_rate": 0.00017012367842724887,
+ "loss": 0.911,
+ "step": 58
+ },
+ {
+ "epoch": 1.099009900990099,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.0001689353409118566,
+ "loss": 0.9041,
+ "step": 59
+ },
+ {
+ "epoch": 1.118811881188119,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.00016772815716257412,
+ "loss": 0.9117,
+ "step": 60
+ },
+ {
+ "epoch": 1.1386138613861387,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.0001665024572113848,
+ "loss": 0.9351,
+ "step": 61
+ },
+ {
+ "epoch": 1.1584158415841583,
+ "grad_norm": 0.251953125,
+ "learning_rate": 0.00016525857615241687,
+ "loss": 0.9438,
+ "step": 62
+ },
+ {
+ "epoch": 1.1782178217821782,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00016399685405033167,
+ "loss": 0.9075,
+ "step": 63
+ },
+ {
+ "epoch": 1.198019801980198,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001627176358473537,
+ "loss": 0.8983,
+ "step": 64
+ },
+ {
+ "epoch": 1.2178217821782178,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 0.0001614212712689668,
+ "loss": 0.9007,
+ "step": 65
+ },
+ {
+ "epoch": 1.2178217821782178,
+ "eval_loss": 0.9333999156951904,
+ "eval_runtime": 13.8668,
+ "eval_samples_per_second": 9.014,
+ "eval_steps_per_second": 4.543,
+ "step": 65
+ },
+ {
+ "epoch": 1.2376237623762376,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00016010811472830252,
+ "loss": 0.9108,
+ "step": 66
+ },
+ {
+ "epoch": 1.2574257425742574,
+ "grad_norm": 0.232421875,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.9177,
+ "step": 67
+ },
+ {
+ "epoch": 1.2772277227722773,
+ "grad_norm": 0.271484375,
+ "learning_rate": 0.00015743286626829437,
+ "loss": 0.9,
+ "step": 68
+ },
+ {
+ "epoch": 1.297029702970297,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.0001560715057351673,
+ "loss": 0.9096,
+ "step": 69
+ },
+ {
+ "epoch": 1.316831683168317,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.00015469481581224272,
+ "loss": 0.8946,
+ "step": 70
+ },
+ {
+ "epoch": 1.3366336633663367,
+ "grad_norm": 0.31640625,
+ "learning_rate": 0.0001533031728727994,
+ "loss": 0.8995,
+ "step": 71
+ },
+ {
+ "epoch": 1.3564356435643563,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.00015189695737812152,
+ "loss": 0.922,
+ "step": 72
+ },
+ {
+ "epoch": 1.3762376237623761,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001504765537734844,
+ "loss": 0.885,
+ "step": 73
+ },
+ {
+ "epoch": 1.396039603960396,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00014904235038305083,
+ "loss": 0.895,
+ "step": 74
+ },
+ {
+ "epoch": 1.4158415841584158,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 0.00014759473930370736,
+ "loss": 0.892,
+ "step": 75
+ },
+ {
+ "epoch": 1.4356435643564356,
+ "grad_norm": 0.216796875,
+ "learning_rate": 0.0001461341162978688,
+ "loss": 0.8277,
+ "step": 76
+ },
+ {
+ "epoch": 1.4554455445544554,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.00014466088068528068,
+ "loss": 0.8687,
+ "step": 77
+ },
+ {
+ "epoch": 1.4752475247524752,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00014317543523384928,
+ "loss": 0.8765,
+ "step": 78
+ },
+ {
+ "epoch": 1.4752475247524752,
+ "eval_loss": 0.9083698391914368,
+ "eval_runtime": 13.8834,
+ "eval_samples_per_second": 9.004,
+ "eval_steps_per_second": 4.538,
+ "step": 78
+ },
+ {
+ "epoch": 1.495049504950495,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00014167818604952906,
+ "loss": 0.8797,
+ "step": 79
+ },
+ {
+ "epoch": 1.5148514851485149,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 0.00014016954246529696,
+ "loss": 0.905,
+ "step": 80
+ },
+ {
+ "epoch": 1.5346534653465347,
+ "grad_norm": 0.25390625,
+ "learning_rate": 0.00013864991692924523,
+ "loss": 0.8575,
+ "step": 81
+ },
+ {
+ "epoch": 1.5544554455445545,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00013711972489182208,
+ "loss": 0.8957,
+ "step": 82
+ },
+ {
+ "epoch": 1.5742574257425743,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 0.00013557938469225167,
+ "loss": 0.8792,
+ "step": 83
+ },
+ {
+ "epoch": 1.5940594059405941,
+ "grad_norm": 0.21484375,
+ "learning_rate": 0.00013402931744416433,
+ "loss": 0.889,
+ "step": 84
+ },
+ {
+ "epoch": 1.613861386138614,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00013246994692046836,
+ "loss": 0.8657,
+ "step": 85
+ },
+ {
+ "epoch": 1.6336633663366338,
+ "grad_norm": 0.20703125,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.8784,
+ "step": 86
+ },
+ {
+ "epoch": 1.6534653465346536,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.0001293250037384465,
+ "loss": 0.8822,
+ "step": 87
+ },
+ {
+ "epoch": 1.6732673267326734,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 0.00012774029087618446,
+ "loss": 0.9092,
+ "step": 88
+ },
+ {
+ "epoch": 1.693069306930693,
+ "grad_norm": 0.234375,
+ "learning_rate": 0.00012614799409538198,
+ "loss": 0.8813,
+ "step": 89
+ },
+ {
+ "epoch": 1.7128712871287128,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.00012454854871407994,
+ "loss": 0.8975,
+ "step": 90
+ },
+ {
+ "epoch": 1.7326732673267327,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.00012294239200467516,
+ "loss": 0.8789,
+ "step": 91
+ },
+ {
+ "epoch": 1.7326732673267327,
+ "eval_loss": 0.8891416788101196,
+ "eval_runtime": 13.872,
+ "eval_samples_per_second": 9.011,
+ "eval_steps_per_second": 4.542,
+ "step": 91
+ },
+ {
+ "epoch": 1.7524752475247525,
+ "grad_norm": 0.26171875,
+ "learning_rate": 0.0001213299630743747,
+ "loss": 0.9184,
+ "step": 92
+ },
+ {
+ "epoch": 1.7722772277227723,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00011971170274514802,
+ "loss": 0.8854,
+ "step": 93
+ },
+ {
+ "epoch": 1.7920792079207921,
+ "grad_norm": 0.2890625,
+ "learning_rate": 0.000118088053433211,
+ "loss": 0.8688,
+ "step": 94
+ },
+ {
+ "epoch": 1.811881188118812,
+ "grad_norm": 0.3515625,
+ "learning_rate": 0.00011645945902807341,
+ "loss": 0.8281,
+ "step": 95
+ },
+ {
+ "epoch": 1.8316831683168315,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.0001148263647711842,
+ "loss": 0.8488,
+ "step": 96
+ },
+ {
+ "epoch": 1.8514851485148514,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00011318921713420691,
+ "loss": 0.8742,
+ "step": 97
+ },
+ {
+ "epoch": 1.8712871287128712,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00011154846369695863,
+ "loss": 0.8586,
+ "step": 98
+ },
+ {
+ "epoch": 1.891089108910891,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.0001099045530250463,
+ "loss": 0.8776,
+ "step": 99
+ },
+ {
+ "epoch": 1.9108910891089108,
+ "grad_norm": 0.259765625,
+ "learning_rate": 0.00010825793454723325,
+ "loss": 0.8563,
+ "step": 100
+ },
+ {
+ "epoch": 1.9306930693069306,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00010660905843256994,
+ "loss": 0.8381,
+ "step": 101
+ },
+ {
+ "epoch": 1.9504950495049505,
+ "grad_norm": 0.201171875,
+ "learning_rate": 0.00010495837546732224,
+ "loss": 0.847,
+ "step": 102
+ },
+ {
+ "epoch": 1.9702970297029703,
+ "grad_norm": 0.23828125,
+ "learning_rate": 0.00010330633693173082,
+ "loss": 0.8512,
+ "step": 103
+ },
+ {
+ "epoch": 1.99009900990099,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.00010165339447663587,
+ "loss": 0.8304,
+ "step": 104
+ },
+ {
+ "epoch": 1.99009900990099,
+ "eval_loss": 0.8779018521308899,
+ "eval_runtime": 13.8827,
+ "eval_samples_per_second": 9.004,
+ "eval_steps_per_second": 4.538,
+ "step": 104
+ },
+ {
+ "epoch": 2.00990099009901,
+ "grad_norm": 0.283203125,
+ "learning_rate": 0.0001,
+ "loss": 0.8523,
+ "step": 105
+ },
+ {
+ "epoch": 2.0297029702970297,
+ "grad_norm": 0.2392578125,
+ "learning_rate": 9.834660552336415e-05,
+ "loss": 0.8109,
+ "step": 106
+ },
+ {
+ "epoch": 2.0495049504950495,
+ "grad_norm": 0.224609375,
+ "learning_rate": 9.669366306826919e-05,
+ "loss": 0.8394,
+ "step": 107
+ },
+ {
+ "epoch": 2.0693069306930694,
+ "grad_norm": 0.283203125,
+ "learning_rate": 9.504162453267777e-05,
+ "loss": 0.8524,
+ "step": 108
+ },
+ {
+ "epoch": 2.01980198019802,
+ "grad_norm": 0.22265625,
+ "learning_rate": 9.339094156743007e-05,
+ "loss": 0.8391,
+ "step": 109
+ },
+ {
+ "epoch": 2.0396039603960396,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 9.174206545276677e-05,
+ "loss": 0.8317,
+ "step": 110
+ },
+ {
+ "epoch": 2.0594059405940595,
+ "grad_norm": 0.22265625,
+ "learning_rate": 9.009544697495374e-05,
+ "loss": 0.833,
+ "step": 111
+ },
+ {
+ "epoch": 2.0792079207920793,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 8.845153630304139e-05,
+ "loss": 0.8408,
+ "step": 112
+ },
+ {
+ "epoch": 2.099009900990099,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 8.681078286579311e-05,
+ "loss": 0.8459,
+ "step": 113
+ },
+ {
+ "epoch": 2.118811881188119,
+ "grad_norm": 0.2021484375,
+ "learning_rate": 8.517363522881579e-05,
+ "loss": 0.8177,
+ "step": 114
+ },
+ {
+ "epoch": 2.1386138613861387,
+ "grad_norm": 0.2265625,
+ "learning_rate": 8.35405409719266e-05,
+ "loss": 0.8451,
+ "step": 115
+ },
+ {
+ "epoch": 2.1584158415841586,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 8.191194656678904e-05,
+ "loss": 0.8543,
+ "step": 116
+ },
+ {
+ "epoch": 2.1782178217821784,
+ "grad_norm": 0.22265625,
+ "learning_rate": 8.028829725485199e-05,
+ "loss": 0.8194,
+ "step": 117
+ },
+ {
+ "epoch": 2.1782178217821784,
+ "eval_loss": 0.8713971972465515,
+ "eval_runtime": 13.8976,
+ "eval_samples_per_second": 8.994,
+ "eval_steps_per_second": 4.533,
+ "step": 117
+ },
+ {
+ "epoch": 2.198019801980198,
+ "grad_norm": 0.2333984375,
+ "learning_rate": 7.867003692562534e-05,
+ "loss": 0.808,
+ "step": 118
+ },
+ {
+ "epoch": 2.217821782178218,
+ "grad_norm": 0.2470703125,
+ "learning_rate": 7.705760799532485e-05,
+ "loss": 0.8073,
+ "step": 119
+ },
+ {
+ "epoch": 2.237623762376238,
+ "grad_norm": 0.201171875,
+ "learning_rate": 7.54514512859201e-05,
+ "loss": 0.8392,
+ "step": 120
+ },
+ {
+ "epoch": 2.2574257425742577,
+ "grad_norm": 0.25,
+ "learning_rate": 7.385200590461803e-05,
+ "loss": 0.8574,
+ "step": 121
+ },
+ {
+ "epoch": 2.2772277227722775,
+ "grad_norm": 0.271484375,
+ "learning_rate": 7.225970912381556e-05,
+ "loss": 0.8338,
+ "step": 122
+ },
+ {
+ "epoch": 2.297029702970297,
+ "grad_norm": 0.294921875,
+ "learning_rate": 7.067499626155354e-05,
+ "loss": 0.8788,
+ "step": 123
+ },
+ {
+ "epoch": 2.3168316831683167,
+ "grad_norm": 0.2265625,
+ "learning_rate": 6.909830056250527e-05,
+ "loss": 0.8297,
+ "step": 124
+ },
+ {
+ "epoch": 2.3366336633663365,
+ "grad_norm": 0.267578125,
+ "learning_rate": 6.753005307953167e-05,
+ "loss": 0.8125,
+ "step": 125
+ },
+ {
+ "epoch": 2.3564356435643563,
+ "grad_norm": 0.2431640625,
+ "learning_rate": 6.59706825558357e-05,
+ "loss": 0.814,
+ "step": 126
+ },
+ {
+ "epoch": 2.376237623762376,
+ "grad_norm": 0.27734375,
+ "learning_rate": 6.442061530774834e-05,
+ "loss": 0.8335,
+ "step": 127
+ },
+ {
+ "epoch": 2.396039603960396,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 6.28802751081779e-05,
+ "loss": 0.8512,
+ "step": 128
+ },
+ {
+ "epoch": 2.4158415841584158,
+ "grad_norm": 0.224609375,
+ "learning_rate": 6.135008307075481e-05,
+ "loss": 0.8297,
+ "step": 129
+ },
+ {
+ "epoch": 2.4356435643564356,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 5.983045753470308e-05,
+ "loss": 0.848,
+ "step": 130
+ },
+ {
+ "epoch": 2.4356435643564356,
+ "eval_loss": 0.8665071129798889,
+ "eval_runtime": 13.8735,
+ "eval_samples_per_second": 9.01,
+ "eval_steps_per_second": 4.541,
+ "step": 130
+ },
+ {
+ "epoch": 2.4554455445544554,
+ "grad_norm": 0.2265625,
+ "learning_rate": 5.832181395047098e-05,
+ "loss": 0.8203,
+ "step": 131
+ },
+ {
+ "epoch": 2.4752475247524752,
+ "grad_norm": 0.287109375,
+ "learning_rate": 5.6824564766150726e-05,
+ "loss": 0.8519,
+ "step": 132
+ },
+ {
+ "epoch": 2.495049504950495,
+ "grad_norm": 0.21484375,
+ "learning_rate": 5.533911931471936e-05,
+ "loss": 0.83,
+ "step": 133
+ },
+ {
+ "epoch": 2.514851485148515,
+ "grad_norm": 0.2109375,
+ "learning_rate": 5.386588370213124e-05,
+ "loss": 0.842,
+ "step": 134
+ },
+ {
+ "epoch": 2.5346534653465347,
+ "grad_norm": 0.2412109375,
+ "learning_rate": 5.240526069629265e-05,
+ "loss": 0.8419,
+ "step": 135
+ },
+ {
+ "epoch": 2.5544554455445545,
+ "grad_norm": 0.267578125,
+ "learning_rate": 5.095764961694922e-05,
+ "loss": 0.8458,
+ "step": 136
+ },
+ {
+ "epoch": 2.5742574257425743,
+ "grad_norm": 0.203125,
+ "learning_rate": 4.952344622651566e-05,
+ "loss": 0.8133,
+ "step": 137
+ },
+ {
+ "epoch": 2.594059405940594,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 4.810304262187852e-05,
+ "loss": 0.8103,
+ "step": 138
+ },
+ {
+ "epoch": 2.613861386138614,
+ "grad_norm": 0.20703125,
+ "learning_rate": 4.669682712720065e-05,
+ "loss": 0.8105,
+ "step": 139
+ },
+ {
+ "epoch": 2.633663366336634,
+ "grad_norm": 0.2060546875,
+ "learning_rate": 4.530518418775733e-05,
+ "loss": 0.8305,
+ "step": 140
+ },
+ {
+ "epoch": 2.6534653465346536,
+ "grad_norm": 0.2080078125,
+ "learning_rate": 4.392849426483274e-05,
+ "loss": 0.7881,
+ "step": 141
+ },
+ {
+ "epoch": 2.6732673267326734,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 4.256713373170564e-05,
+ "loss": 0.8204,
+ "step": 142
+ },
+ {
+ "epoch": 2.693069306930693,
+ "grad_norm": 0.263671875,
+ "learning_rate": 4.12214747707527e-05,
+ "loss": 0.8354,
+ "step": 143
+ },
+ {
+ "epoch": 2.693069306930693,
+ "eval_loss": 0.8626759648323059,
+ "eval_runtime": 13.8585,
+ "eval_samples_per_second": 9.02,
+ "eval_steps_per_second": 4.546,
+ "step": 143
+ },
+ {
+ "epoch": 2.7128712871287126,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 3.9891885271697496e-05,
+ "loss": 0.8441,
+ "step": 144
+ },
+ {
+ "epoch": 2.7326732673267324,
+ "grad_norm": 0.2197265625,
+ "learning_rate": 3.857872873103322e-05,
+ "loss": 0.8084,
+ "step": 145
+ },
+ {
+ "epoch": 2.7524752475247523,
+ "grad_norm": 0.18359375,
+ "learning_rate": 3.7282364152646297e-05,
+ "loss": 0.8184,
+ "step": 146
+ },
+ {
+ "epoch": 2.772277227722772,
+ "grad_norm": 0.1904296875,
+ "learning_rate": 3.600314594966834e-05,
+ "loss": 0.8302,
+ "step": 147
+ },
+ {
+ "epoch": 2.792079207920792,
+ "grad_norm": 0.2041015625,
+ "learning_rate": 3.4741423847583134e-05,
+ "loss": 0.8503,
+ "step": 148
+ },
+ {
+ "epoch": 2.8118811881188117,
+ "grad_norm": 0.2265625,
+ "learning_rate": 3.349754278861517e-05,
+ "loss": 0.8273,
+ "step": 149
+ },
+ {
+ "epoch": 2.8316831683168315,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 3.227184283742591e-05,
+ "loss": 0.8332,
+ "step": 150
+ },
+ {
+ "epoch": 2.8514851485148514,
+ "grad_norm": 0.185546875,
+ "learning_rate": 3.106465908814342e-05,
+ "loss": 0.8391,
+ "step": 151
+ },
+ {
+ "epoch": 2.871287128712871,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 2.9876321572751144e-05,
+ "loss": 0.8029,
+ "step": 152
+ },
+ {
+ "epoch": 2.891089108910891,
+ "grad_norm": 0.224609375,
+ "learning_rate": 2.87071551708603e-05,
+ "loss": 0.8561,
+ "step": 153
+ },
+ {
+ "epoch": 2.910891089108911,
+ "grad_norm": 0.2275390625,
+ "learning_rate": 2.7557479520891104e-05,
+ "loss": 0.8055,
+ "step": 154
+ },
+ {
+ "epoch": 2.9306930693069306,
+ "grad_norm": 0.16796875,
+ "learning_rate": 2.6427608932686843e-05,
+ "loss": 0.8301,
+ "step": 155
+ },
+ {
+ "epoch": 2.9504950495049505,
+ "grad_norm": 0.1943359375,
+ "learning_rate": 2.5317852301584643e-05,
+ "loss": 0.8476,
+ "step": 156
+ },
+ {
+ "epoch": 2.9504950495049505,
+ "eval_loss": 0.8605256080627441,
+ "eval_runtime": 13.8794,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 156
+ },
+ {
+ "epoch": 2.9702970297029703,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 2.422851302396655e-05,
+ "loss": 0.8483,
+ "step": 157
+ },
+ {
+ "epoch": 2.99009900990099,
+ "grad_norm": 0.2216796875,
+ "learning_rate": 2.315988891431412e-05,
+ "loss": 0.8379,
+ "step": 158
+ },
+ {
+ "epoch": 3.00990099009901,
+ "grad_norm": 0.2373046875,
+ "learning_rate": 2.2112272123788768e-05,
+ "loss": 0.8042,
+ "step": 159
+ },
+ {
+ "epoch": 3.0297029702970297,
+ "grad_norm": 0.416015625,
+ "learning_rate": 2.1085949060360654e-05,
+ "loss": 0.8597,
+ "step": 160
+ },
+ {
+ "epoch": 3.0495049504950495,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 2.008120031050753e-05,
+ "loss": 0.8327,
+ "step": 161
+ },
+ {
+ "epoch": 3.0693069306930694,
+ "grad_norm": 0.212890625,
+ "learning_rate": 1.9098300562505266e-05,
+ "loss": 0.7991,
+ "step": 162
+ },
+ {
+ "epoch": 3.01980198019802,
+ "grad_norm": 0.1875,
+ "learning_rate": 1.8137518531330767e-05,
+ "loss": 0.8083,
+ "step": 163
+ },
+ {
+ "epoch": 3.0396039603960396,
+ "grad_norm": 0.1982421875,
+ "learning_rate": 1.7199116885197995e-05,
+ "loss": 0.8321,
+ "step": 164
+ },
+ {
+ "epoch": 3.0594059405940595,
+ "grad_norm": 0.193359375,
+ "learning_rate": 1.6283352173747145e-05,
+ "loss": 0.8596,
+ "step": 165
+ },
+ {
+ "epoch": 3.0792079207920793,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 1.5390474757906446e-05,
+ "loss": 0.82,
+ "step": 166
+ },
+ {
+ "epoch": 3.099009900990099,
+ "grad_norm": 0.1806640625,
+ "learning_rate": 1.4520728741446089e-05,
+ "loss": 0.8245,
+ "step": 167
+ },
+ {
+ "epoch": 3.118811881188119,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 1.3674351904242611e-05,
+ "loss": 0.8174,
+ "step": 168
+ },
+ {
+ "epoch": 3.1386138613861387,
+ "grad_norm": 0.1787109375,
+ "learning_rate": 1.2851575637272262e-05,
+ "loss": 0.811,
+ "step": 169
+ },
+ {
+ "epoch": 3.1386138613861387,
+ "eval_loss": 0.8589804768562317,
+ "eval_runtime": 13.8605,
+ "eval_samples_per_second": 9.018,
+ "eval_steps_per_second": 4.545,
+ "step": 169
+ },
+ {
+ "epoch": 3.1584158415841586,
+ "grad_norm": 0.1865234375,
+ "learning_rate": 1.2052624879351104e-05,
+ "loss": 0.8043,
+ "step": 170
+ },
+ {
+ "epoch": 3.1782178217821784,
+ "grad_norm": 0.181640625,
+ "learning_rate": 1.1277718055638819e-05,
+ "loss": 0.8117,
+ "step": 171
+ },
+ {
+ "epoch": 3.198019801980198,
+ "grad_norm": 0.205078125,
+ "learning_rate": 1.0527067017923654e-05,
+ "loss": 0.8176,
+ "step": 172
+ },
+ {
+ "epoch": 3.217821782178218,
+ "grad_norm": 0.1904296875,
+ "learning_rate": 9.80087698670411e-06,
+ "loss": 0.7919,
+ "step": 173
+ },
+ {
+ "epoch": 3.237623762376238,
+ "grad_norm": 0.177734375,
+ "learning_rate": 9.09934649508375e-06,
+ "loss": 0.8099,
+ "step": 174
+ },
+ {
+ "epoch": 3.2574257425742577,
+ "grad_norm": 0.203125,
+ "learning_rate": 8.422667334494249e-06,
+ "loss": 0.8161,
+ "step": 175
+ },
+ {
+ "epoch": 3.2772277227722775,
+ "grad_norm": 0.208984375,
+ "learning_rate": 7.771024502261526e-06,
+ "loss": 0.8199,
+ "step": 176
+ },
+ {
+ "epoch": 3.297029702970297,
+ "grad_norm": 0.1923828125,
+ "learning_rate": 7.144596151029303e-06,
+ "loss": 0.8077,
+ "step": 177
+ },
+ {
+ "epoch": 3.3168316831683167,
+ "grad_norm": 0.298828125,
+ "learning_rate": 6.543553540053926e-06,
+ "loss": 0.8532,
+ "step": 178
+ },
+ {
+ "epoch": 3.3366336633663365,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 5.968060988383883e-06,
+ "loss": 0.8062,
+ "step": 179
+ },
+ {
+ "epoch": 3.3564356435643563,
+ "grad_norm": 0.1689453125,
+ "learning_rate": 5.418275829936537e-06,
+ "loss": 0.802,
+ "step": 180
+ },
+ {
+ "epoch": 3.376237623762376,
+ "grad_norm": 0.2001953125,
+ "learning_rate": 4.8943483704846475e-06,
+ "loss": 0.8189,
+ "step": 181
+ },
+ {
+ "epoch": 3.396039603960396,
+ "grad_norm": 0.169921875,
+ "learning_rate": 4.3964218465642355e-06,
+ "loss": 0.8178,
+ "step": 182
+ },
+ {
+ "epoch": 3.396039603960396,
+ "eval_loss": 0.858788788318634,
+ "eval_runtime": 13.8817,
+ "eval_samples_per_second": 9.005,
+ "eval_steps_per_second": 4.538,
+ "step": 182
+ },
+ {
+ "epoch": 3.4158415841584158,
+ "grad_norm": 0.16796875,
+ "learning_rate": 3.924632386315186e-06,
+ "loss": 0.8307,
+ "step": 183
+ },
+ {
+ "epoch": 3.4356435643564356,
+ "grad_norm": 0.181640625,
+ "learning_rate": 3.4791089722651436e-06,
+ "loss": 0.8255,
+ "step": 184
+ },
+ {
+ "epoch": 3.4554455445544554,
+ "grad_norm": 0.185546875,
+ "learning_rate": 3.059973406066963e-06,
+ "loss": 0.8222,
+ "step": 185
+ },
+ {
+ "epoch": 3.4752475247524752,
+ "grad_norm": 0.19140625,
+ "learning_rate": 2.667340275199426e-06,
+ "loss": 0.8054,
+ "step": 186
+ },
+ {
+ "epoch": 3.495049504950495,
+ "grad_norm": 0.1826171875,
+ "learning_rate": 2.3013169216400733e-06,
+ "loss": 0.8628,
+ "step": 187
+ },
+ {
+ "epoch": 3.514851485148515,
+ "grad_norm": 0.1796875,
+ "learning_rate": 1.9620034125190644e-06,
+ "loss": 0.8338,
+ "step": 188
+ },
+ {
+ "epoch": 3.5346534653465347,
+ "grad_norm": 0.1728515625,
+ "learning_rate": 1.6494925127617634e-06,
+ "loss": 0.809,
+ "step": 189
+ },
+ {
+ "epoch": 3.5544554455445545,
+ "grad_norm": 0.1904296875,
+ "learning_rate": 1.3638696597277679e-06,
+ "loss": 0.8328,
+ "step": 190
+ },
+ {
+ "epoch": 3.5742574257425743,
+ "grad_norm": 0.17578125,
+ "learning_rate": 1.1052129398531507e-06,
+ "loss": 0.8062,
+ "step": 191
+ },
+ {
+ "epoch": 3.594059405940594,
+ "grad_norm": 0.1884765625,
+ "learning_rate": 8.735930673024806e-07,
+ "loss": 0.832,
+ "step": 192
+ },
+ {
+ "epoch": 3.613861386138614,
+ "grad_norm": 0.17578125,
+ "learning_rate": 6.690733646361857e-07,
+ "loss": 0.8107,
+ "step": 193
+ },
+ {
+ "epoch": 3.633663366336634,
+ "grad_norm": 0.1875,
+ "learning_rate": 4.917097454988584e-07,
+ "loss": 0.8315,
+ "step": 194
+ },
+ {
+ "epoch": 3.6534653465346536,
+ "grad_norm": 0.1845703125,
+ "learning_rate": 3.415506993330153e-07,
+ "loss": 0.8073,
+ "step": 195
+ },
+ {
+ "epoch": 3.6534653465346536,
+ "eval_loss": 0.858626127243042,
+ "eval_runtime": 13.8621,
+ "eval_samples_per_second": 9.017,
+ "eval_steps_per_second": 4.545,
+ "step": 195
+ },
+ {
+ "epoch": 3.6732673267326734,
+ "grad_norm": 0.197265625,
+ "learning_rate": 2.1863727812254653e-07,
+ "loss": 0.8403,
+ "step": 196
+ },
+ {
+ "epoch": 3.693069306930693,
+ "grad_norm": 0.189453125,
+ "learning_rate": 1.230030851695263e-07,
+ "loss": 0.8116,
+ "step": 197
+ },
+ {
+ "epoch": 3.7128712871287126,
+ "grad_norm": 0.173828125,
+ "learning_rate": 5.467426590739511e-08,
+ "loss": 0.8115,
+ "step": 198
+ },
+ {
+ "epoch": 3.7326732673267324,
+ "grad_norm": 0.177734375,
+ "learning_rate": 1.3669500753099585e-08,
+ "loss": 0.7962,
+ "step": 199
+ },
+ {
+ "epoch": 3.7524752475247523,
+ "grad_norm": 0.2099609375,
+ "learning_rate": 0.0,
+ "loss": 0.8031,
+ "step": 200
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 200,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 50,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.164477534181786e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30
--- /dev/null
+++ b/checkpoint-200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944
diff --git a/checkpoint-50/README.md b/checkpoint-50/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978
--- /dev/null
+++ b/checkpoint-50/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.11.1
\ No newline at end of file
diff --git a/checkpoint-50/adapter_config.json b/checkpoint-50/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891
--- /dev/null
+++ b/checkpoint-50/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "q_proj",
+ "o_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-50/adapter_model.safetensors b/checkpoint-50/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e224cb43e2b5c839e08901e2bd1920ba49998d55
--- /dev/null
+++ b/checkpoint-50/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a02032e4ced1f76caa201d55031ab5925f6d0fb66b5d8f3092b8c5d785219b37
+size 50503848
diff --git a/checkpoint-50/optimizer.pt b/checkpoint-50/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bf0bc8aae14cf3dc15a5895ddfefef58623a2ecf
--- /dev/null
+++ b/checkpoint-50/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ff58348f44e2bde44ab7f9193c61e20dd0f8d95e056c7a292421ffd95a8c7d3
+size 202035450
diff --git a/checkpoint-50/rng_state.pth b/checkpoint-50/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..246cffea93db339a81ba0ae32aaa33c8c0ad92df
--- /dev/null
+++ b/checkpoint-50/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b35231a2c551e6ed40111614cd789a64fe47b38c49d5b21bea0aa24df8b78d2
+size 14244
diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..30029e06200a4e0722b03b67c9de799aa70b54ed
--- /dev/null
+++ b/checkpoint-50/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9e02dc10b7239989ab9b4418ee704e53fad611ad6b77ad633028bb8eb5238dd
+size 1064
diff --git a/checkpoint-50/special_tokens_map.json b/checkpoint-50/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-50/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-50/tokenizer.model b/checkpoint-50/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-50/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-50/tokenizer_config.json b/checkpoint-50/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-50/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..739306d50fc951c681d527a2d355ce130e33bb45
--- /dev/null
+++ b/checkpoint-50/trainer_state.json
@@ -0,0 +1,415 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.9900990099009901,
+ "eval_steps": 13,
+ "global_step": 50,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.019801980198019802,
+ "grad_norm": 1.15625,
+ "learning_rate": 2e-05,
+ "loss": 2.0919,
+ "step": 1
+ },
+ {
+ "epoch": 0.019801980198019802,
+ "eval_loss": 2.079954147338867,
+ "eval_runtime": 13.8908,
+ "eval_samples_per_second": 8.999,
+ "eval_steps_per_second": 4.535,
+ "step": 1
+ },
+ {
+ "epoch": 0.039603960396039604,
+ "grad_norm": 1.203125,
+ "learning_rate": 4e-05,
+ "loss": 2.0814,
+ "step": 2
+ },
+ {
+ "epoch": 0.0594059405940594,
+ "grad_norm": 1.1953125,
+ "learning_rate": 6e-05,
+ "loss": 2.0499,
+ "step": 3
+ },
+ {
+ "epoch": 0.07920792079207921,
+ "grad_norm": 1.0859375,
+ "learning_rate": 8e-05,
+ "loss": 2.0153,
+ "step": 4
+ },
+ {
+ "epoch": 0.09900990099009901,
+ "grad_norm": 1.0390625,
+ "learning_rate": 0.0001,
+ "loss": 1.9548,
+ "step": 5
+ },
+ {
+ "epoch": 0.1188118811881188,
+ "grad_norm": 0.89453125,
+ "learning_rate": 0.00012,
+ "loss": 1.8982,
+ "step": 6
+ },
+ {
+ "epoch": 0.13861386138613863,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00014,
+ "loss": 1.8226,
+ "step": 7
+ },
+ {
+ "epoch": 0.15841584158415842,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.00016,
+ "loss": 1.7572,
+ "step": 8
+ },
+ {
+ "epoch": 0.1782178217821782,
+ "grad_norm": 0.78515625,
+ "learning_rate": 0.00018,
+ "loss": 1.7074,
+ "step": 9
+ },
+ {
+ "epoch": 0.19801980198019803,
+ "grad_norm": 0.73828125,
+ "learning_rate": 0.0002,
+ "loss": 1.6317,
+ "step": 10
+ },
+ {
+ "epoch": 0.21782178217821782,
+ "grad_norm": 0.484375,
+ "learning_rate": 0.0001999863304992469,
+ "loss": 1.5801,
+ "step": 11
+ },
+ {
+ "epoch": 0.2376237623762376,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019994532573409262,
+ "loss": 1.5721,
+ "step": 12
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "grad_norm": 0.6953125,
+ "learning_rate": 0.00019987699691483048,
+ "loss": 1.5479,
+ "step": 13
+ },
+ {
+ "epoch": 0.25742574257425743,
+ "eval_loss": 1.5341482162475586,
+ "eval_runtime": 13.8795,
+ "eval_samples_per_second": 9.006,
+ "eval_steps_per_second": 4.539,
+ "step": 13
+ },
+ {
+ "epoch": 0.27722772277227725,
+ "grad_norm": 0.65234375,
+ "learning_rate": 0.00019978136272187747,
+ "loss": 1.534,
+ "step": 14
+ },
+ {
+ "epoch": 0.297029702970297,
+ "grad_norm": 0.515625,
+ "learning_rate": 0.000199658449300667,
+ "loss": 1.4804,
+ "step": 15
+ },
+ {
+ "epoch": 0.31683168316831684,
+ "grad_norm": 0.439453125,
+ "learning_rate": 0.00019950829025450114,
+ "loss": 1.4805,
+ "step": 16
+ },
+ {
+ "epoch": 0.33663366336633666,
+ "grad_norm": 0.361328125,
+ "learning_rate": 0.00019933092663536382,
+ "loss": 1.3809,
+ "step": 17
+ },
+ {
+ "epoch": 0.3564356435643564,
+ "grad_norm": 0.3125,
+ "learning_rate": 0.00019912640693269752,
+ "loss": 1.3837,
+ "step": 18
+ },
+ {
+ "epoch": 0.37623762376237624,
+ "grad_norm": 0.337890625,
+ "learning_rate": 0.00019889478706014687,
+ "loss": 1.3673,
+ "step": 19
+ },
+ {
+ "epoch": 0.39603960396039606,
+ "grad_norm": 0.298828125,
+ "learning_rate": 0.00019863613034027224,
+ "loss": 1.366,
+ "step": 20
+ },
+ {
+ "epoch": 0.4158415841584158,
+ "grad_norm": 0.34375,
+ "learning_rate": 0.00019835050748723824,
+ "loss": 1.3318,
+ "step": 21
+ },
+ {
+ "epoch": 0.43564356435643564,
+ "grad_norm": 0.341796875,
+ "learning_rate": 0.00019803799658748094,
+ "loss": 1.2741,
+ "step": 22
+ },
+ {
+ "epoch": 0.45544554455445546,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019769868307835994,
+ "loss": 1.2978,
+ "step": 23
+ },
+ {
+ "epoch": 0.4752475247524752,
+ "grad_norm": 0.291015625,
+ "learning_rate": 0.0001973326597248006,
+ "loss": 1.2733,
+ "step": 24
+ },
+ {
+ "epoch": 0.49504950495049505,
+ "grad_norm": 0.306640625,
+ "learning_rate": 0.00019694002659393305,
+ "loss": 1.2302,
+ "step": 25
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "grad_norm": 0.318359375,
+ "learning_rate": 0.00019652089102773488,
+ "loss": 1.2083,
+ "step": 26
+ },
+ {
+ "epoch": 0.5148514851485149,
+ "eval_loss": 1.224540114402771,
+ "eval_runtime": 13.8695,
+ "eval_samples_per_second": 9.013,
+ "eval_steps_per_second": 4.542,
+ "step": 26
+ },
+ {
+ "epoch": 0.5346534653465347,
+ "grad_norm": 0.26953125,
+ "learning_rate": 0.00019607536761368484,
+ "loss": 1.1761,
+ "step": 27
+ },
+ {
+ "epoch": 0.5544554455445545,
+ "grad_norm": 0.296875,
+ "learning_rate": 0.00019560357815343577,
+ "loss": 1.1751,
+ "step": 28
+ },
+ {
+ "epoch": 0.5742574257425742,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 1.2002,
+ "step": 29
+ },
+ {
+ "epoch": 0.594059405940594,
+ "grad_norm": 0.287109375,
+ "learning_rate": 0.00019458172417006347,
+ "loss": 1.1544,
+ "step": 30
+ },
+ {
+ "epoch": 0.6138613861386139,
+ "grad_norm": 0.365234375,
+ "learning_rate": 0.00019403193901161613,
+ "loss": 1.1384,
+ "step": 31
+ },
+ {
+ "epoch": 0.6336633663366337,
+ "grad_norm": 0.236328125,
+ "learning_rate": 0.0001934564464599461,
+ "loss": 1.0999,
+ "step": 32
+ },
+ {
+ "epoch": 0.6534653465346535,
+ "grad_norm": 0.326171875,
+ "learning_rate": 0.00019285540384897073,
+ "loss": 1.1576,
+ "step": 33
+ },
+ {
+ "epoch": 0.6732673267326733,
+ "grad_norm": 0.310546875,
+ "learning_rate": 0.00019222897549773848,
+ "loss": 1.091,
+ "step": 34
+ },
+ {
+ "epoch": 0.693069306930693,
+ "grad_norm": 0.2578125,
+ "learning_rate": 0.00019157733266550575,
+ "loss": 1.056,
+ "step": 35
+ },
+ {
+ "epoch": 0.7128712871287128,
+ "grad_norm": 0.267578125,
+ "learning_rate": 0.00019090065350491626,
+ "loss": 1.1068,
+ "step": 36
+ },
+ {
+ "epoch": 0.7326732673267327,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.00019019912301329592,
+ "loss": 1.0583,
+ "step": 37
+ },
+ {
+ "epoch": 0.7524752475247525,
+ "grad_norm": 0.2734375,
+ "learning_rate": 0.00018947293298207635,
+ "loss": 1.0671,
+ "step": 38
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 0.0001887222819443612,
+ "loss": 1.0851,
+ "step": 39
+ },
+ {
+ "epoch": 0.7722772277227723,
+ "eval_loss": 1.060703158378601,
+ "eval_runtime": 13.878,
+ "eval_samples_per_second": 9.007,
+ "eval_steps_per_second": 4.54,
+ "step": 39
+ },
+ {
+ "epoch": 0.7920792079207921,
+ "grad_norm": 0.22265625,
+ "learning_rate": 0.0001879473751206489,
+ "loss": 1.0343,
+ "step": 40
+ },
+ {
+ "epoch": 0.8118811881188119,
+ "grad_norm": 0.1796875,
+ "learning_rate": 0.00018714842436272773,
+ "loss": 0.9789,
+ "step": 41
+ },
+ {
+ "epoch": 0.8316831683168316,
+ "grad_norm": 0.248046875,
+ "learning_rate": 0.00018632564809575742,
+ "loss": 1.0174,
+ "step": 42
+ },
+ {
+ "epoch": 0.8514851485148515,
+ "grad_norm": 0.2294921875,
+ "learning_rate": 0.0001854792712585539,
+ "loss": 1.0004,
+ "step": 43
+ },
+ {
+ "epoch": 0.8712871287128713,
+ "grad_norm": 0.228515625,
+ "learning_rate": 0.00018460952524209355,
+ "loss": 1.0281,
+ "step": 44
+ },
+ {
+ "epoch": 0.8910891089108911,
+ "grad_norm": 0.220703125,
+ "learning_rate": 0.00018371664782625287,
+ "loss": 0.9992,
+ "step": 45
+ },
+ {
+ "epoch": 0.9108910891089109,
+ "grad_norm": 0.2138671875,
+ "learning_rate": 0.00018280088311480201,
+ "loss": 0.9635,
+ "step": 46
+ },
+ {
+ "epoch": 0.9306930693069307,
+ "grad_norm": 0.265625,
+ "learning_rate": 0.00018186248146866927,
+ "loss": 1.006,
+ "step": 47
+ },
+ {
+ "epoch": 0.9504950495049505,
+ "grad_norm": 0.2451171875,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.9891,
+ "step": 48
+ },
+ {
+ "epoch": 0.9702970297029703,
+ "grad_norm": 0.28515625,
+ "learning_rate": 0.0001799187996894925,
+ "loss": 0.9809,
+ "step": 49
+ },
+ {
+ "epoch": 0.9900990099009901,
+ "grad_norm": 0.212890625,
+ "learning_rate": 0.00017891405093963938,
+ "loss": 0.9646,
+ "step": 50
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 200,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 50,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.04177049010176e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30
--- /dev/null
+++ b/checkpoint-50/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..573775b64764c233a2364b2f362979bdf95694c2
--- /dev/null
+++ b/config.json
@@ -0,0 +1,44 @@
+{
+ "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 5632,
+ "max_position_embeddings": 4096,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 22,
+ "num_key_value_heads": 4,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_storage": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.41.1",
+ "use_cache": false,
+ "vocab_size": 32000
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}