Taiel26 committed on Jun 1, 2024

Commit

6a46f8c

verified ·

1 Parent(s): 92cd1d8

Upload 51 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +143 -3
adapter_config.json +34 -0
adapter_model.bin +3 -0
checkpoint-100/README.md +202 -0
checkpoint-100/adapter_config.json +34 -0
checkpoint-100/adapter_model.safetensors +3 -0
checkpoint-100/optimizer.pt +3 -0
checkpoint-100/rng_state.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/special_tokens_map.json +24 -0
checkpoint-100/tokenizer.model +3 -0
checkpoint-100/tokenizer_config.json +44 -0
checkpoint-100/trainer_state.json +797 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-150/README.md +202 -0
checkpoint-150/adapter_config.json +34 -0
checkpoint-150/adapter_model.safetensors +3 -0
checkpoint-150/optimizer.pt +3 -0
checkpoint-150/rng_state.pth +3 -0
checkpoint-150/scheduler.pt +3 -0
checkpoint-150/special_tokens_map.json +24 -0
checkpoint-150/tokenizer.model +3 -0
checkpoint-150/tokenizer_config.json +44 -0
checkpoint-150/trainer_state.json +1179 -0
checkpoint-150/training_args.bin +3 -0
checkpoint-200/README.md +202 -0
checkpoint-200/adapter_config.json +34 -0
checkpoint-200/adapter_model.safetensors +3 -0
checkpoint-200/optimizer.pt +3 -0
checkpoint-200/rng_state.pth +3 -0
checkpoint-200/scheduler.pt +3 -0
checkpoint-200/special_tokens_map.json +24 -0
checkpoint-200/tokenizer.model +3 -0
checkpoint-200/tokenizer_config.json +44 -0
checkpoint-200/trainer_state.json +1561 -0
checkpoint-200/training_args.bin +3 -0
checkpoint-50/README.md +202 -0
checkpoint-50/adapter_config.json +34 -0
checkpoint-50/adapter_model.safetensors +3 -0
checkpoint-50/optimizer.pt +3 -0
checkpoint-50/rng_state.pth +3 -0
checkpoint-50/scheduler.pt +3 -0
checkpoint-50/special_tokens_map.json +24 -0
checkpoint-50/tokenizer.model +3 -0
checkpoint-50/tokenizer_config.json +44 -0
checkpoint-50/trainer_state.json +415 -0
checkpoint-50/training_args.bin +3 -0
config.json +44 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,143 @@
----
-license: unknown
----

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model-index:
+- name: outputs/qlora-out
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.1`
+```yaml
+adapter: qlora
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+bf16: auto
+dataset_prepared_path: null
+datasets:
+- path: Taiel26/plm_2500_uniref
+  type: alpaca
+debug: null
+deepspeed: null
+early_stopping_patience: null
+eval_sample_packing: false
+evals_per_epoch: 4
+flash_attention: true
+fp16: null
+fsdp: null
+fsdp_config: null
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+group_by_length: false
+learning_rate: 0.0002
+load_in_4bit: true
+load_in_8bit: false
+local_rank: null
+logging_steps: 1
+lora_alpha: 16
+lora_dropout: 0.05
+lora_fan_in_fan_out: null
+lora_model_dir: null
+lora_r: 32
+lora_target_linear: true
+lora_target_modules: null
+lr_scheduler: cosine
+micro_batch_size: 2
+model_type: LlamaForCausalLM
+num_epochs: 4
+optimizer: paged_adamw_32bit
+output_dir: ./outputs/qlora-out
+pad_to_sequence_len: true
+resume_from_checkpoint: null
+sample_packing: true
+saves_per_epoch: 1
+sequence_len: 4096
+special_tokens: null
+strict: false
+tf32: false
+tokenizer_type: LlamaTokenizer
+train_on_inputs: false
+val_set_size: 0.05
+wandb_entity: null
+wandb_log_model: null
+wandb_name: null
+wandb_project: null
+wandb_watch: null
+warmup_steps: 10
+weight_decay: 0.0
+xformers_attention: null
+```
+</details><br>
+# outputs/qlora-out
+This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the [Taiel26/plm_2500_uniref](https://huggingface.co/datasets/Taiel26/plm_2500_uniref) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.8586
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: paged_adamw_32bit (Adam with betas=(0.9,0.999) and epsilon=1e-08)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 4
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 2.0919        | 0.0198 | 1    | 2.0800          |
+| 1.5479        | 0.2574 | 13   | 1.5341          |
+| 1.2083        | 0.5149 | 26   | 1.2245          |
+| 1.0851        | 0.7723 | 39   | 1.0607          |
+| 0.9432        | 1.0297 | 52   | 0.9755          |
+| 0.9007        | 1.2178 | 65   | 0.9334          |
+| 0.8765        | 1.4752 | 78   | 0.9084          |
+| 0.8789        | 1.7327 | 91   | 0.8891          |
+| 0.8304        | 1.9901 | 104  | 0.8779          |
+| 0.8194        | 2.1782 | 117  | 0.8714          |
+| 0.848         | 2.4356 | 130  | 0.8665          |
+| 0.8354        | 2.6931 | 143  | 0.8627          |
+| 0.8476        | 2.9505 | 156  | 0.8605          |
+| 0.811         | 3.1386 | 169  | 0.8590          |
+| 0.8178        | 3.3960 | 182  | 0.8588          |
+| 0.8073        | 3.6535 | 195  | 0.8586          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.41.1
+- Pytorch 2.1.2+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffc2779c22b7eae997dc6203abde8f60a5e25d728ff0372f233c318ba1fdff97
+size 50573978

checkpoint-100/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-100/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-100/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b85a4fdc0abdb0ae863b99d8dbbc0f4de78e0d9fbd7bcb1ddcd7575e55dd73e
+size 50503848

checkpoint-100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b8f5c81e295185d82b95402d9e8aa5ba7f3db7c0d3626b29a8ce3a7f38899ae
+size 202035450

checkpoint-100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b71df2f60f93f95a69126d2a7bc1e1cccfa69f1b8fa8d99a58b0ccfa00747f6f
+size 14244

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc7800513a1b4dd006c457152c700dd768bb49ee4ed8e4d9665a4e42095b054
+size 1064

checkpoint-100/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-100/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-100/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,797 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9108910891089108,
+  "eval_steps": 13,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.019801980198019802,
+      "grad_norm": 1.15625,
+      "learning_rate": 2e-05,
+      "loss": 2.0919,
+      "step": 1
+    },
+    {
+      "epoch": 0.019801980198019802,
+      "eval_loss": 2.079954147338867,
+      "eval_runtime": 13.8908,
+      "eval_samples_per_second": 8.999,
+      "eval_steps_per_second": 4.535,
+      "step": 1
+    },
+    {
+      "epoch": 0.039603960396039604,
+      "grad_norm": 1.203125,
+      "learning_rate": 4e-05,
+      "loss": 2.0814,
+      "step": 2
+    },
+    {
+      "epoch": 0.0594059405940594,
+      "grad_norm": 1.1953125,
+      "learning_rate": 6e-05,
+      "loss": 2.0499,
+      "step": 3
+    },
+    {
+      "epoch": 0.07920792079207921,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8e-05,
+      "loss": 2.0153,
+      "step": 4
+    },
+    {
+      "epoch": 0.09900990099009901,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0001,
+      "loss": 1.9548,
+      "step": 5
+    },
+    {
+      "epoch": 0.1188118811881188,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00012,
+      "loss": 1.8982,
+      "step": 6
+    },
+    {
+      "epoch": 0.13861386138613863,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00014,
+      "loss": 1.8226,
+      "step": 7
+    },
+    {
+      "epoch": 0.15841584158415842,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00016,
+      "loss": 1.7572,
+      "step": 8
+    },
+    {
+      "epoch": 0.1782178217821782,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00018,
+      "loss": 1.7074,
+      "step": 9
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0002,
+      "loss": 1.6317,
+      "step": 10
+    },
+    {
+      "epoch": 0.21782178217821782,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001999863304992469,
+      "loss": 1.5801,
+      "step": 11
+    },
+    {
+      "epoch": 0.2376237623762376,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00019994532573409262,
+      "loss": 1.5721,
+      "step": 12
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.00019987699691483048,
+      "loss": 1.5479,
+      "step": 13
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "eval_loss": 1.5341482162475586,
+      "eval_runtime": 13.8795,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 13
+    },
+    {
+      "epoch": 0.27722772277227725,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.00019978136272187747,
+      "loss": 1.534,
+      "step": 14
+    },
+    {
+      "epoch": 0.297029702970297,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.4804,
+      "step": 15
+    },
+    {
+      "epoch": 0.31683168316831684,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00019950829025450114,
+      "loss": 1.4805,
+      "step": 16
+    },
+    {
+      "epoch": 0.33663366336633666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00019933092663536382,
+      "loss": 1.3809,
+      "step": 17
+    },
+    {
+      "epoch": 0.3564356435643564,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00019912640693269752,
+      "loss": 1.3837,
+      "step": 18
+    },
+    {
+      "epoch": 0.37623762376237624,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00019889478706014687,
+      "loss": 1.3673,
+      "step": 19
+    },
+    {
+      "epoch": 0.39603960396039606,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.366,
+      "step": 20
+    },
+    {
+      "epoch": 0.4158415841584158,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00019835050748723824,
+      "loss": 1.3318,
+      "step": 21
+    },
+    {
+      "epoch": 0.43564356435643564,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00019803799658748094,
+      "loss": 1.2741,
+      "step": 22
+    },
+    {
+      "epoch": 0.45544554455445546,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019769868307835994,
+      "loss": 1.2978,
+      "step": 23
+    },
+    {
+      "epoch": 0.4752475247524752,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001973326597248006,
+      "loss": 1.2733,
+      "step": 24
+    },
+    {
+      "epoch": 0.49504950495049505,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.2302,
+      "step": 25
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00019652089102773488,
+      "loss": 1.2083,
+      "step": 26
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "eval_loss": 1.224540114402771,
+      "eval_runtime": 13.8695,
+      "eval_samples_per_second": 9.013,
+      "eval_steps_per_second": 4.542,
+      "step": 26
+    },
+    {
+      "epoch": 0.5346534653465347,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00019607536761368484,
+      "loss": 1.1761,
+      "step": 27
+    },
+    {
+      "epoch": 0.5544554455445545,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00019560357815343577,
+      "loss": 1.1751,
+      "step": 28
+    },
+    {
+      "epoch": 0.5742574257425742,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 1.2002,
+      "step": 29
+    },
+    {
+      "epoch": 0.594059405940594,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1544,
+      "step": 30
+    },
+    {
+      "epoch": 0.6138613861386139,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00019403193901161613,
+      "loss": 1.1384,
+      "step": 31
+    },
+    {
+      "epoch": 0.6336633663366337,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0001934564464599461,
+      "loss": 1.0999,
+      "step": 32
+    },
+    {
+      "epoch": 0.6534653465346535,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019285540384897073,
+      "loss": 1.1576,
+      "step": 33
+    },
+    {
+      "epoch": 0.6732673267326733,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019222897549773848,
+      "loss": 1.091,
+      "step": 34
+    },
+    {
+      "epoch": 0.693069306930693,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.056,
+      "step": 35
+    },
+    {
+      "epoch": 0.7128712871287128,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00019090065350491626,
+      "loss": 1.1068,
+      "step": 36
+    },
+    {
+      "epoch": 0.7326732673267327,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00019019912301329592,
+      "loss": 1.0583,
+      "step": 37
+    },
+    {
+      "epoch": 0.7524752475247525,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00018947293298207635,
+      "loss": 1.0671,
+      "step": 38
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001887222819443612,
+      "loss": 1.0851,
+      "step": 39
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "eval_loss": 1.060703158378601,
+      "eval_runtime": 13.878,
+      "eval_samples_per_second": 9.007,
+      "eval_steps_per_second": 4.54,
+      "step": 39
+    },
+    {
+      "epoch": 0.7920792079207921,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.0343,
+      "step": 40
+    },
+    {
+      "epoch": 0.8118811881188119,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00018714842436272773,
+      "loss": 0.9789,
+      "step": 41
+    },
+    {
+      "epoch": 0.8316831683168316,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00018632564809575742,
+      "loss": 1.0174,
+      "step": 42
+    },
+    {
+      "epoch": 0.8514851485148515,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001854792712585539,
+      "loss": 1.0004,
+      "step": 43
+    },
+    {
+      "epoch": 0.8712871287128713,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00018460952524209355,
+      "loss": 1.0281,
+      "step": 44
+    },
+    {
+      "epoch": 0.8910891089108911,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 0.9992,
+      "step": 45
+    },
+    {
+      "epoch": 0.9108910891089109,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00018280088311480201,
+      "loss": 0.9635,
+      "step": 46
+    },
+    {
+      "epoch": 0.9306930693069307,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00018186248146866927,
+      "loss": 1.006,
+      "step": 47
+    },
+    {
+      "epoch": 0.9504950495049505,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9891,
+      "step": 48
+    },
+    {
+      "epoch": 0.9702970297029703,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001799187996894925,
+      "loss": 0.9809,
+      "step": 49
+    },
+    {
+      "epoch": 0.9900990099009901,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 0.9646,
+      "step": 50
+    },
+    {
+      "epoch": 1.00990099009901,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00017788772787621126,
+      "loss": 0.9553,
+      "step": 51
+    },
+    {
+      "epoch": 1.0297029702970297,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00017684011108568592,
+      "loss": 0.9432,
+      "step": 52
+    },
+    {
+      "epoch": 1.0297029702970297,
+      "eval_loss": 0.9755253195762634,
+      "eval_runtime": 13.879,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 52
+    },
+    {
+      "epoch": 1.0495049504950495,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001757714869760335,
+      "loss": 0.9631,
+      "step": 53
+    },
+    {
+      "epoch": 1.0693069306930694,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001746821476984154,
+      "loss": 0.9539,
+      "step": 54
+    },
+    {
+      "epoch": 1.0198019801980198,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00017357239106731317,
+      "loss": 0.9559,
+      "step": 55
+    },
+    {
+      "epoch": 1.0396039603960396,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00017244252047910892,
+      "loss": 0.9111,
+      "step": 56
+    },
+    {
+      "epoch": 1.0594059405940595,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00017129284482913972,
+      "loss": 0.9503,
+      "step": 57
+    },
+    {
+      "epoch": 1.0792079207920793,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00017012367842724887,
+      "loss": 0.911,
+      "step": 58
+    },
+    {
+      "epoch": 1.099009900990099,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0001689353409118566,
+      "loss": 0.9041,
+      "step": 59
+    },
+    {
+      "epoch": 1.118811881188119,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 0.9117,
+      "step": 60
+    },
+    {
+      "epoch": 1.1386138613861387,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001665024572113848,
+      "loss": 0.9351,
+      "step": 61
+    },
+    {
+      "epoch": 1.1584158415841583,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00016525857615241687,
+      "loss": 0.9438,
+      "step": 62
+    },
+    {
+      "epoch": 1.1782178217821782,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00016399685405033167,
+      "loss": 0.9075,
+      "step": 63
+    },
+    {
+      "epoch": 1.198019801980198,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001627176358473537,
+      "loss": 0.8983,
+      "step": 64
+    },
+    {
+      "epoch": 1.2178217821782178,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001614212712689668,
+      "loss": 0.9007,
+      "step": 65
+    },
+    {
+      "epoch": 1.2178217821782178,
+      "eval_loss": 0.9333999156951904,
+      "eval_runtime": 13.8668,
+      "eval_samples_per_second": 9.014,
+      "eval_steps_per_second": 4.543,
+      "step": 65
+    },
+    {
+      "epoch": 1.2376237623762376,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00016010811472830252,
+      "loss": 0.9108,
+      "step": 66
+    },
+    {
+      "epoch": 1.2574257425742574,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.9177,
+      "step": 67
+    },
+    {
+      "epoch": 1.2772277227722773,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00015743286626829437,
+      "loss": 0.9,
+      "step": 68
+    },
+    {
+      "epoch": 1.297029702970297,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001560715057351673,
+      "loss": 0.9096,
+      "step": 69
+    },
+    {
+      "epoch": 1.316831683168317,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 0.8946,
+      "step": 70
+    },
+    {
+      "epoch": 1.3366336633663367,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001533031728727994,
+      "loss": 0.8995,
+      "step": 71
+    },
+    {
+      "epoch": 1.3564356435643563,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00015189695737812152,
+      "loss": 0.922,
+      "step": 72
+    },
+    {
+      "epoch": 1.3762376237623761,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001504765537734844,
+      "loss": 0.885,
+      "step": 73
+    },
+    {
+      "epoch": 1.396039603960396,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00014904235038305083,
+      "loss": 0.895,
+      "step": 74
+    },
+    {
+      "epoch": 1.4158415841584158,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00014759473930370736,
+      "loss": 0.892,
+      "step": 75
+    },
+    {
+      "epoch": 1.4356435643564356,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001461341162978688,
+      "loss": 0.8277,
+      "step": 76
+    },
+    {
+      "epoch": 1.4554455445544554,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00014466088068528068,
+      "loss": 0.8687,
+      "step": 77
+    },
+    {
+      "epoch": 1.4752475247524752,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00014317543523384928,
+      "loss": 0.8765,
+      "step": 78
+    },
+    {
+      "epoch": 1.4752475247524752,
+      "eval_loss": 0.9083698391914368,
+      "eval_runtime": 13.8834,
+      "eval_samples_per_second": 9.004,
+      "eval_steps_per_second": 4.538,
+      "step": 78
+    },
+    {
+      "epoch": 1.495049504950495,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00014167818604952906,
+      "loss": 0.8797,
+      "step": 79
+    },
+    {
+      "epoch": 1.5148514851485149,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 0.905,
+      "step": 80
+    },
+    {
+      "epoch": 1.5346534653465347,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00013864991692924523,
+      "loss": 0.8575,
+      "step": 81
+    },
+    {
+      "epoch": 1.5544554455445545,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00013711972489182208,
+      "loss": 0.8957,
+      "step": 82
+    },
+    {
+      "epoch": 1.5742574257425743,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00013557938469225167,
+      "loss": 0.8792,
+      "step": 83
+    },
+    {
+      "epoch": 1.5940594059405941,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00013402931744416433,
+      "loss": 0.889,
+      "step": 84
+    },
+    {
+      "epoch": 1.613861386138614,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00013246994692046836,
+      "loss": 0.8657,
+      "step": 85
+    },
+    {
+      "epoch": 1.6336633663366338,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8784,
+      "step": 86
+    },
+    {
+      "epoch": 1.6534653465346536,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001293250037384465,
+      "loss": 0.8822,
+      "step": 87
+    },
+    {
+      "epoch": 1.6732673267326734,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00012774029087618446,
+      "loss": 0.9092,
+      "step": 88
+    },
+    {
+      "epoch": 1.693069306930693,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00012614799409538198,
+      "loss": 0.8813,
+      "step": 89
+    },
+    {
+      "epoch": 1.7128712871287128,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 0.8975,
+      "step": 90
+    },
+    {
+      "epoch": 1.7326732673267327,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00012294239200467516,
+      "loss": 0.8789,
+      "step": 91
+    },
+    {
+      "epoch": 1.7326732673267327,
+      "eval_loss": 0.8891416788101196,
+      "eval_runtime": 13.872,
+      "eval_samples_per_second": 9.011,
+      "eval_steps_per_second": 4.542,
+      "step": 91
+    },
+    {
+      "epoch": 1.7524752475247525,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001213299630743747,
+      "loss": 0.9184,
+      "step": 92
+    },
+    {
+      "epoch": 1.7722772277227723,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00011971170274514802,
+      "loss": 0.8854,
+      "step": 93
+    },
+    {
+      "epoch": 1.7920792079207921,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000118088053433211,
+      "loss": 0.8688,
+      "step": 94
+    },
+    {
+      "epoch": 1.811881188118812,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00011645945902807341,
+      "loss": 0.8281,
+      "step": 95
+    },
+    {
+      "epoch": 1.8316831683168315,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001148263647711842,
+      "loss": 0.8488,
+      "step": 96
+    },
+    {
+      "epoch": 1.8514851485148514,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00011318921713420691,
+      "loss": 0.8742,
+      "step": 97
+    },
+    {
+      "epoch": 1.8712871287128712,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00011154846369695863,
+      "loss": 0.8586,
+      "step": 98
+    },
+    {
+      "epoch": 1.891089108910891,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001099045530250463,
+      "loss": 0.8776,
+      "step": 99
+    },
+    {
+      "epoch": 1.9108910891089108,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 0.8563,
+      "step": 100
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.08354098020352e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944

checkpoint-150/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-150/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-150/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:97264f01540b1ad5acd25f27b627a7352dbda77c960c2b3c7b157d05035d6ac6
+size 50503848

checkpoint-150/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24e5de270e966edc3891231b22ee3b34b5d5573183750ce1a8ecca10a2b62423
+size 202035450

checkpoint-150/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3129c63169712c776c1e0e28d8711e276143acd2c2f061fb6eb052c04856ba72
+size 14244

checkpoint-150/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd5d42bb0afda20ec4c83d38c6af1131541c335ecab229c74e7f418894f3c13b
+size 1064

checkpoint-150/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-150/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-150/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-150/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1179 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.8316831683168315,
+  "eval_steps": 13,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.019801980198019802,
+      "grad_norm": 1.15625,
+      "learning_rate": 2e-05,
+      "loss": 2.0919,
+      "step": 1
+    },
+    {
+      "epoch": 0.019801980198019802,
+      "eval_loss": 2.079954147338867,
+      "eval_runtime": 13.8908,
+      "eval_samples_per_second": 8.999,
+      "eval_steps_per_second": 4.535,
+      "step": 1
+    },
+    {
+      "epoch": 0.039603960396039604,
+      "grad_norm": 1.203125,
+      "learning_rate": 4e-05,
+      "loss": 2.0814,
+      "step": 2
+    },
+    {
+      "epoch": 0.0594059405940594,
+      "grad_norm": 1.1953125,
+      "learning_rate": 6e-05,
+      "loss": 2.0499,
+      "step": 3
+    },
+    {
+      "epoch": 0.07920792079207921,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8e-05,
+      "loss": 2.0153,
+      "step": 4
+    },
+    {
+      "epoch": 0.09900990099009901,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0001,
+      "loss": 1.9548,
+      "step": 5
+    },
+    {
+      "epoch": 0.1188118811881188,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00012,
+      "loss": 1.8982,
+      "step": 6
+    },
+    {
+      "epoch": 0.13861386138613863,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00014,
+      "loss": 1.8226,
+      "step": 7
+    },
+    {
+      "epoch": 0.15841584158415842,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00016,
+      "loss": 1.7572,
+      "step": 8
+    },
+    {
+      "epoch": 0.1782178217821782,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00018,
+      "loss": 1.7074,
+      "step": 9
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0002,
+      "loss": 1.6317,
+      "step": 10
+    },
+    {
+      "epoch": 0.21782178217821782,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001999863304992469,
+      "loss": 1.5801,
+      "step": 11
+    },
+    {
+      "epoch": 0.2376237623762376,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00019994532573409262,
+      "loss": 1.5721,
+      "step": 12
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.00019987699691483048,
+      "loss": 1.5479,
+      "step": 13
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "eval_loss": 1.5341482162475586,
+      "eval_runtime": 13.8795,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 13
+    },
+    {
+      "epoch": 0.27722772277227725,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.00019978136272187747,
+      "loss": 1.534,
+      "step": 14
+    },
+    {
+      "epoch": 0.297029702970297,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.4804,
+      "step": 15
+    },
+    {
+      "epoch": 0.31683168316831684,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00019950829025450114,
+      "loss": 1.4805,
+      "step": 16
+    },
+    {
+      "epoch": 0.33663366336633666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00019933092663536382,
+      "loss": 1.3809,
+      "step": 17
+    },
+    {
+      "epoch": 0.3564356435643564,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00019912640693269752,
+      "loss": 1.3837,
+      "step": 18
+    },
+    {
+      "epoch": 0.37623762376237624,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00019889478706014687,
+      "loss": 1.3673,
+      "step": 19
+    },
+    {
+      "epoch": 0.39603960396039606,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.366,
+      "step": 20
+    },
+    {
+      "epoch": 0.4158415841584158,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00019835050748723824,
+      "loss": 1.3318,
+      "step": 21
+    },
+    {
+      "epoch": 0.43564356435643564,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00019803799658748094,
+      "loss": 1.2741,
+      "step": 22
+    },
+    {
+      "epoch": 0.45544554455445546,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019769868307835994,
+      "loss": 1.2978,
+      "step": 23
+    },
+    {
+      "epoch": 0.4752475247524752,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001973326597248006,
+      "loss": 1.2733,
+      "step": 24
+    },
+    {
+      "epoch": 0.49504950495049505,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.2302,
+      "step": 25
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00019652089102773488,
+      "loss": 1.2083,
+      "step": 26
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "eval_loss": 1.224540114402771,
+      "eval_runtime": 13.8695,
+      "eval_samples_per_second": 9.013,
+      "eval_steps_per_second": 4.542,
+      "step": 26
+    },
+    {
+      "epoch": 0.5346534653465347,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00019607536761368484,
+      "loss": 1.1761,
+      "step": 27
+    },
+    {
+      "epoch": 0.5544554455445545,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00019560357815343577,
+      "loss": 1.1751,
+      "step": 28
+    },
+    {
+      "epoch": 0.5742574257425742,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 1.2002,
+      "step": 29
+    },
+    {
+      "epoch": 0.594059405940594,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1544,
+      "step": 30
+    },
+    {
+      "epoch": 0.6138613861386139,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00019403193901161613,
+      "loss": 1.1384,
+      "step": 31
+    },
+    {
+      "epoch": 0.6336633663366337,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0001934564464599461,
+      "loss": 1.0999,
+      "step": 32
+    },
+    {
+      "epoch": 0.6534653465346535,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019285540384897073,
+      "loss": 1.1576,
+      "step": 33
+    },
+    {
+      "epoch": 0.6732673267326733,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019222897549773848,
+      "loss": 1.091,
+      "step": 34
+    },
+    {
+      "epoch": 0.693069306930693,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.056,
+      "step": 35
+    },
+    {
+      "epoch": 0.7128712871287128,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00019090065350491626,
+      "loss": 1.1068,
+      "step": 36
+    },
+    {
+      "epoch": 0.7326732673267327,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00019019912301329592,
+      "loss": 1.0583,
+      "step": 37
+    },
+    {
+      "epoch": 0.7524752475247525,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00018947293298207635,
+      "loss": 1.0671,
+      "step": 38
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001887222819443612,
+      "loss": 1.0851,
+      "step": 39
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "eval_loss": 1.060703158378601,
+      "eval_runtime": 13.878,
+      "eval_samples_per_second": 9.007,
+      "eval_steps_per_second": 4.54,
+      "step": 39
+    },
+    {
+      "epoch": 0.7920792079207921,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.0343,
+      "step": 40
+    },
+    {
+      "epoch": 0.8118811881188119,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00018714842436272773,
+      "loss": 0.9789,
+      "step": 41
+    },
+    {
+      "epoch": 0.8316831683168316,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00018632564809575742,
+      "loss": 1.0174,
+      "step": 42
+    },
+    {
+      "epoch": 0.8514851485148515,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001854792712585539,
+      "loss": 1.0004,
+      "step": 43
+    },
+    {
+      "epoch": 0.8712871287128713,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00018460952524209355,
+      "loss": 1.0281,
+      "step": 44
+    },
+    {
+      "epoch": 0.8910891089108911,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 0.9992,
+      "step": 45
+    },
+    {
+      "epoch": 0.9108910891089109,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00018280088311480201,
+      "loss": 0.9635,
+      "step": 46
+    },
+    {
+      "epoch": 0.9306930693069307,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00018186248146866927,
+      "loss": 1.006,
+      "step": 47
+    },
+    {
+      "epoch": 0.9504950495049505,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9891,
+      "step": 48
+    },
+    {
+      "epoch": 0.9702970297029703,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001799187996894925,
+      "loss": 0.9809,
+      "step": 49
+    },
+    {
+      "epoch": 0.9900990099009901,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 0.9646,
+      "step": 50
+    },
+    {
+      "epoch": 1.00990099009901,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00017788772787621126,
+      "loss": 0.9553,
+      "step": 51
+    },
+    {
+      "epoch": 1.0297029702970297,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00017684011108568592,
+      "loss": 0.9432,
+      "step": 52
+    },
+    {
+      "epoch": 1.0297029702970297,
+      "eval_loss": 0.9755253195762634,
+      "eval_runtime": 13.879,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 52
+    },
+    {
+      "epoch": 1.0495049504950495,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001757714869760335,
+      "loss": 0.9631,
+      "step": 53
+    },
+    {
+      "epoch": 1.0693069306930694,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001746821476984154,
+      "loss": 0.9539,
+      "step": 54
+    },
+    {
+      "epoch": 1.0198019801980198,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00017357239106731317,
+      "loss": 0.9559,
+      "step": 55
+    },
+    {
+      "epoch": 1.0396039603960396,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00017244252047910892,
+      "loss": 0.9111,
+      "step": 56
+    },
+    {
+      "epoch": 1.0594059405940595,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00017129284482913972,
+      "loss": 0.9503,
+      "step": 57
+    },
+    {
+      "epoch": 1.0792079207920793,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00017012367842724887,
+      "loss": 0.911,
+      "step": 58
+    },
+    {
+      "epoch": 1.099009900990099,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0001689353409118566,
+      "loss": 0.9041,
+      "step": 59
+    },
+    {
+      "epoch": 1.118811881188119,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 0.9117,
+      "step": 60
+    },
+    {
+      "epoch": 1.1386138613861387,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001665024572113848,
+      "loss": 0.9351,
+      "step": 61
+    },
+    {
+      "epoch": 1.1584158415841583,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00016525857615241687,
+      "loss": 0.9438,
+      "step": 62
+    },
+    {
+      "epoch": 1.1782178217821782,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00016399685405033167,
+      "loss": 0.9075,
+      "step": 63
+    },
+    {
+      "epoch": 1.198019801980198,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001627176358473537,
+      "loss": 0.8983,
+      "step": 64
+    },
+    {
+      "epoch": 1.2178217821782178,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001614212712689668,
+      "loss": 0.9007,
+      "step": 65
+    },
+    {
+      "epoch": 1.2178217821782178,
+      "eval_loss": 0.9333999156951904,
+      "eval_runtime": 13.8668,
+      "eval_samples_per_second": 9.014,
+      "eval_steps_per_second": 4.543,
+      "step": 65
+    },
+    {
+      "epoch": 1.2376237623762376,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00016010811472830252,
+      "loss": 0.9108,
+      "step": 66
+    },
+    {
+      "epoch": 1.2574257425742574,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.9177,
+      "step": 67
+    },
+    {
+      "epoch": 1.2772277227722773,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00015743286626829437,
+      "loss": 0.9,
+      "step": 68
+    },
+    {
+      "epoch": 1.297029702970297,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001560715057351673,
+      "loss": 0.9096,
+      "step": 69
+    },
+    {
+      "epoch": 1.316831683168317,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 0.8946,
+      "step": 70
+    },
+    {
+      "epoch": 1.3366336633663367,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001533031728727994,
+      "loss": 0.8995,
+      "step": 71
+    },
+    {
+      "epoch": 1.3564356435643563,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00015189695737812152,
+      "loss": 0.922,
+      "step": 72
+    },
+    {
+      "epoch": 1.3762376237623761,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001504765537734844,
+      "loss": 0.885,
+      "step": 73
+    },
+    {
+      "epoch": 1.396039603960396,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00014904235038305083,
+      "loss": 0.895,
+      "step": 74
+    },
+    {
+      "epoch": 1.4158415841584158,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00014759473930370736,
+      "loss": 0.892,
+      "step": 75
+    },
+    {
+      "epoch": 1.4356435643564356,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001461341162978688,
+      "loss": 0.8277,
+      "step": 76
+    },
+    {
+      "epoch": 1.4554455445544554,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00014466088068528068,
+      "loss": 0.8687,
+      "step": 77
+    },
+    {
+      "epoch": 1.4752475247524752,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00014317543523384928,
+      "loss": 0.8765,
+      "step": 78
+    },
+    {
+      "epoch": 1.4752475247524752,
+      "eval_loss": 0.9083698391914368,
+      "eval_runtime": 13.8834,
+      "eval_samples_per_second": 9.004,
+      "eval_steps_per_second": 4.538,
+      "step": 78
+    },
+    {
+      "epoch": 1.495049504950495,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00014167818604952906,
+      "loss": 0.8797,
+      "step": 79
+    },
+    {
+      "epoch": 1.5148514851485149,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 0.905,
+      "step": 80
+    },
+    {
+      "epoch": 1.5346534653465347,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00013864991692924523,
+      "loss": 0.8575,
+      "step": 81
+    },
+    {
+      "epoch": 1.5544554455445545,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00013711972489182208,
+      "loss": 0.8957,
+      "step": 82
+    },
+    {
+      "epoch": 1.5742574257425743,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00013557938469225167,
+      "loss": 0.8792,
+      "step": 83
+    },
+    {
+      "epoch": 1.5940594059405941,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00013402931744416433,
+      "loss": 0.889,
+      "step": 84
+    },
+    {
+      "epoch": 1.613861386138614,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00013246994692046836,
+      "loss": 0.8657,
+      "step": 85
+    },
+    {
+      "epoch": 1.6336633663366338,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8784,
+      "step": 86
+    },
+    {
+      "epoch": 1.6534653465346536,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001293250037384465,
+      "loss": 0.8822,
+      "step": 87
+    },
+    {
+      "epoch": 1.6732673267326734,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00012774029087618446,
+      "loss": 0.9092,
+      "step": 88
+    },
+    {
+      "epoch": 1.693069306930693,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00012614799409538198,
+      "loss": 0.8813,
+      "step": 89
+    },
+    {
+      "epoch": 1.7128712871287128,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 0.8975,
+      "step": 90
+    },
+    {
+      "epoch": 1.7326732673267327,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00012294239200467516,
+      "loss": 0.8789,
+      "step": 91
+    },
+    {
+      "epoch": 1.7326732673267327,
+      "eval_loss": 0.8891416788101196,
+      "eval_runtime": 13.872,
+      "eval_samples_per_second": 9.011,
+      "eval_steps_per_second": 4.542,
+      "step": 91
+    },
+    {
+      "epoch": 1.7524752475247525,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001213299630743747,
+      "loss": 0.9184,
+      "step": 92
+    },
+    {
+      "epoch": 1.7722772277227723,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00011971170274514802,
+      "loss": 0.8854,
+      "step": 93
+    },
+    {
+      "epoch": 1.7920792079207921,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000118088053433211,
+      "loss": 0.8688,
+      "step": 94
+    },
+    {
+      "epoch": 1.811881188118812,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00011645945902807341,
+      "loss": 0.8281,
+      "step": 95
+    },
+    {
+      "epoch": 1.8316831683168315,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001148263647711842,
+      "loss": 0.8488,
+      "step": 96
+    },
+    {
+      "epoch": 1.8514851485148514,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00011318921713420691,
+      "loss": 0.8742,
+      "step": 97
+    },
+    {
+      "epoch": 1.8712871287128712,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00011154846369695863,
+      "loss": 0.8586,
+      "step": 98
+    },
+    {
+      "epoch": 1.891089108910891,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001099045530250463,
+      "loss": 0.8776,
+      "step": 99
+    },
+    {
+      "epoch": 1.9108910891089108,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 0.8563,
+      "step": 100
+    },
+    {
+      "epoch": 1.9306930693069306,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00010660905843256994,
+      "loss": 0.8381,
+      "step": 101
+    },
+    {
+      "epoch": 1.9504950495049505,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00010495837546732224,
+      "loss": 0.847,
+      "step": 102
+    },
+    {
+      "epoch": 1.9702970297029703,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00010330633693173082,
+      "loss": 0.8512,
+      "step": 103
+    },
+    {
+      "epoch": 1.99009900990099,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00010165339447663587,
+      "loss": 0.8304,
+      "step": 104
+    },
+    {
+      "epoch": 1.99009900990099,
+      "eval_loss": 0.8779018521308899,
+      "eval_runtime": 13.8827,
+      "eval_samples_per_second": 9.004,
+      "eval_steps_per_second": 4.538,
+      "step": 104
+    },
+    {
+      "epoch": 2.00990099009901,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.8523,
+      "step": 105
+    },
+    {
+      "epoch": 2.0297029702970297,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 9.834660552336415e-05,
+      "loss": 0.8109,
+      "step": 106
+    },
+    {
+      "epoch": 2.0495049504950495,
+      "grad_norm": 0.224609375,
+      "learning_rate": 9.669366306826919e-05,
+      "loss": 0.8394,
+      "step": 107
+    },
+    {
+      "epoch": 2.0693069306930694,
+      "grad_norm": 0.283203125,
+      "learning_rate": 9.504162453267777e-05,
+      "loss": 0.8524,
+      "step": 108
+    },
+    {
+      "epoch": 2.01980198019802,
+      "grad_norm": 0.22265625,
+      "learning_rate": 9.339094156743007e-05,
+      "loss": 0.8391,
+      "step": 109
+    },
+    {
+      "epoch": 2.0396039603960396,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 9.174206545276677e-05,
+      "loss": 0.8317,
+      "step": 110
+    },
+    {
+      "epoch": 2.0594059405940595,
+      "grad_norm": 0.22265625,
+      "learning_rate": 9.009544697495374e-05,
+      "loss": 0.833,
+      "step": 111
+    },
+    {
+      "epoch": 2.0792079207920793,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 8.845153630304139e-05,
+      "loss": 0.8408,
+      "step": 112
+    },
+    {
+      "epoch": 2.099009900990099,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 8.681078286579311e-05,
+      "loss": 0.8459,
+      "step": 113
+    },
+    {
+      "epoch": 2.118811881188119,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 8.517363522881579e-05,
+      "loss": 0.8177,
+      "step": 114
+    },
+    {
+      "epoch": 2.1386138613861387,
+      "grad_norm": 0.2265625,
+      "learning_rate": 8.35405409719266e-05,
+      "loss": 0.8451,
+      "step": 115
+    },
+    {
+      "epoch": 2.1584158415841586,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 8.191194656678904e-05,
+      "loss": 0.8543,
+      "step": 116
+    },
+    {
+      "epoch": 2.1782178217821784,
+      "grad_norm": 0.22265625,
+      "learning_rate": 8.028829725485199e-05,
+      "loss": 0.8194,
+      "step": 117
+    },
+    {
+      "epoch": 2.1782178217821784,
+      "eval_loss": 0.8713971972465515,
+      "eval_runtime": 13.8976,
+      "eval_samples_per_second": 8.994,
+      "eval_steps_per_second": 4.533,
+      "step": 117
+    },
+    {
+      "epoch": 2.198019801980198,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 7.867003692562534e-05,
+      "loss": 0.808,
+      "step": 118
+    },
+    {
+      "epoch": 2.217821782178218,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 7.705760799532485e-05,
+      "loss": 0.8073,
+      "step": 119
+    },
+    {
+      "epoch": 2.237623762376238,
+      "grad_norm": 0.201171875,
+      "learning_rate": 7.54514512859201e-05,
+      "loss": 0.8392,
+      "step": 120
+    },
+    {
+      "epoch": 2.2574257425742577,
+      "grad_norm": 0.25,
+      "learning_rate": 7.385200590461803e-05,
+      "loss": 0.8574,
+      "step": 121
+    },
+    {
+      "epoch": 2.2772277227722775,
+      "grad_norm": 0.271484375,
+      "learning_rate": 7.225970912381556e-05,
+      "loss": 0.8338,
+      "step": 122
+    },
+    {
+      "epoch": 2.297029702970297,
+      "grad_norm": 0.294921875,
+      "learning_rate": 7.067499626155354e-05,
+      "loss": 0.8788,
+      "step": 123
+    },
+    {
+      "epoch": 2.3168316831683167,
+      "grad_norm": 0.2265625,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8297,
+      "step": 124
+    },
+    {
+      "epoch": 2.3366336633663365,
+      "grad_norm": 0.267578125,
+      "learning_rate": 6.753005307953167e-05,
+      "loss": 0.8125,
+      "step": 125
+    },
+    {
+      "epoch": 2.3564356435643563,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 6.59706825558357e-05,
+      "loss": 0.814,
+      "step": 126
+    },
+    {
+      "epoch": 2.376237623762376,
+      "grad_norm": 0.27734375,
+      "learning_rate": 6.442061530774834e-05,
+      "loss": 0.8335,
+      "step": 127
+    },
+    {
+      "epoch": 2.396039603960396,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 6.28802751081779e-05,
+      "loss": 0.8512,
+      "step": 128
+    },
+    {
+      "epoch": 2.4158415841584158,
+      "grad_norm": 0.224609375,
+      "learning_rate": 6.135008307075481e-05,
+      "loss": 0.8297,
+      "step": 129
+    },
+    {
+      "epoch": 2.4356435643564356,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 5.983045753470308e-05,
+      "loss": 0.848,
+      "step": 130
+    },
+    {
+      "epoch": 2.4356435643564356,
+      "eval_loss": 0.8665071129798889,
+      "eval_runtime": 13.8735,
+      "eval_samples_per_second": 9.01,
+      "eval_steps_per_second": 4.541,
+      "step": 130
+    },
+    {
+      "epoch": 2.4554455445544554,
+      "grad_norm": 0.2265625,
+      "learning_rate": 5.832181395047098e-05,
+      "loss": 0.8203,
+      "step": 131
+    },
+    {
+      "epoch": 2.4752475247524752,
+      "grad_norm": 0.287109375,
+      "learning_rate": 5.6824564766150726e-05,
+      "loss": 0.8519,
+      "step": 132
+    },
+    {
+      "epoch": 2.495049504950495,
+      "grad_norm": 0.21484375,
+      "learning_rate": 5.533911931471936e-05,
+      "loss": 0.83,
+      "step": 133
+    },
+    {
+      "epoch": 2.514851485148515,
+      "grad_norm": 0.2109375,
+      "learning_rate": 5.386588370213124e-05,
+      "loss": 0.842,
+      "step": 134
+    },
+    {
+      "epoch": 2.5346534653465347,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 5.240526069629265e-05,
+      "loss": 0.8419,
+      "step": 135
+    },
+    {
+      "epoch": 2.5544554455445545,
+      "grad_norm": 0.267578125,
+      "learning_rate": 5.095764961694922e-05,
+      "loss": 0.8458,
+      "step": 136
+    },
+    {
+      "epoch": 2.5742574257425743,
+      "grad_norm": 0.203125,
+      "learning_rate": 4.952344622651566e-05,
+      "loss": 0.8133,
+      "step": 137
+    },
+    {
+      "epoch": 2.594059405940594,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 4.810304262187852e-05,
+      "loss": 0.8103,
+      "step": 138
+    },
+    {
+      "epoch": 2.613861386138614,
+      "grad_norm": 0.20703125,
+      "learning_rate": 4.669682712720065e-05,
+      "loss": 0.8105,
+      "step": 139
+    },
+    {
+      "epoch": 2.633663366336634,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 4.530518418775733e-05,
+      "loss": 0.8305,
+      "step": 140
+    },
+    {
+      "epoch": 2.6534653465346536,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 4.392849426483274e-05,
+      "loss": 0.7881,
+      "step": 141
+    },
+    {
+      "epoch": 2.6732673267326734,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 4.256713373170564e-05,
+      "loss": 0.8204,
+      "step": 142
+    },
+    {
+      "epoch": 2.693069306930693,
+      "grad_norm": 0.263671875,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8354,
+      "step": 143
+    },
+    {
+      "epoch": 2.693069306930693,
+      "eval_loss": 0.8626759648323059,
+      "eval_runtime": 13.8585,
+      "eval_samples_per_second": 9.02,
+      "eval_steps_per_second": 4.546,
+      "step": 143
+    },
+    {
+      "epoch": 2.7128712871287126,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 3.9891885271697496e-05,
+      "loss": 0.8441,
+      "step": 144
+    },
+    {
+      "epoch": 2.7326732673267324,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 3.857872873103322e-05,
+      "loss": 0.8084,
+      "step": 145
+    },
+    {
+      "epoch": 2.7524752475247523,
+      "grad_norm": 0.18359375,
+      "learning_rate": 3.7282364152646297e-05,
+      "loss": 0.8184,
+      "step": 146
+    },
+    {
+      "epoch": 2.772277227722772,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 3.600314594966834e-05,
+      "loss": 0.8302,
+      "step": 147
+    },
+    {
+      "epoch": 2.792079207920792,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 3.4741423847583134e-05,
+      "loss": 0.8503,
+      "step": 148
+    },
+    {
+      "epoch": 2.8118811881188117,
+      "grad_norm": 0.2265625,
+      "learning_rate": 3.349754278861517e-05,
+      "loss": 0.8273,
+      "step": 149
+    },
+    {
+      "epoch": 2.8316831683168315,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 3.227184283742591e-05,
+      "loss": 0.8332,
+      "step": 150
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.1227070440800256e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-150/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944

checkpoint-200/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:800103a9f27876d14f8e9f0fb64fb81af3a478d54bbaea5587ecbd0592ad4142
+size 50503848

checkpoint-200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0dafd7ff9d5c3c564b22c4a0593f1078a408837f37261ad73caf0c7e062c6a39
+size 202035450

checkpoint-200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45ca197c3706eaaadf2931079a5ebf26b215b3f60f60a6755cc111301c7ac7f6
+size 14244

checkpoint-200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca9a25c72339c898b564e0c464a3f6fc75bbeec408008928b7ed05533156b98c
+size 1064

checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-200/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1561 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.7524752475247523,
+  "eval_steps": 13,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.019801980198019802,
+      "grad_norm": 1.15625,
+      "learning_rate": 2e-05,
+      "loss": 2.0919,
+      "step": 1
+    },
+    {
+      "epoch": 0.019801980198019802,
+      "eval_loss": 2.079954147338867,
+      "eval_runtime": 13.8908,
+      "eval_samples_per_second": 8.999,
+      "eval_steps_per_second": 4.535,
+      "step": 1
+    },
+    {
+      "epoch": 0.039603960396039604,
+      "grad_norm": 1.203125,
+      "learning_rate": 4e-05,
+      "loss": 2.0814,
+      "step": 2
+    },
+    {
+      "epoch": 0.0594059405940594,
+      "grad_norm": 1.1953125,
+      "learning_rate": 6e-05,
+      "loss": 2.0499,
+      "step": 3
+    },
+    {
+      "epoch": 0.07920792079207921,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8e-05,
+      "loss": 2.0153,
+      "step": 4
+    },
+    {
+      "epoch": 0.09900990099009901,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0001,
+      "loss": 1.9548,
+      "step": 5
+    },
+    {
+      "epoch": 0.1188118811881188,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00012,
+      "loss": 1.8982,
+      "step": 6
+    },
+    {
+      "epoch": 0.13861386138613863,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00014,
+      "loss": 1.8226,
+      "step": 7
+    },
+    {
+      "epoch": 0.15841584158415842,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00016,
+      "loss": 1.7572,
+      "step": 8
+    },
+    {
+      "epoch": 0.1782178217821782,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00018,
+      "loss": 1.7074,
+      "step": 9
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0002,
+      "loss": 1.6317,
+      "step": 10
+    },
+    {
+      "epoch": 0.21782178217821782,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001999863304992469,
+      "loss": 1.5801,
+      "step": 11
+    },
+    {
+      "epoch": 0.2376237623762376,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00019994532573409262,
+      "loss": 1.5721,
+      "step": 12
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.00019987699691483048,
+      "loss": 1.5479,
+      "step": 13
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "eval_loss": 1.5341482162475586,
+      "eval_runtime": 13.8795,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 13
+    },
+    {
+      "epoch": 0.27722772277227725,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.00019978136272187747,
+      "loss": 1.534,
+      "step": 14
+    },
+    {
+      "epoch": 0.297029702970297,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.4804,
+      "step": 15
+    },
+    {
+      "epoch": 0.31683168316831684,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00019950829025450114,
+      "loss": 1.4805,
+      "step": 16
+    },
+    {
+      "epoch": 0.33663366336633666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00019933092663536382,
+      "loss": 1.3809,
+      "step": 17
+    },
+    {
+      "epoch": 0.3564356435643564,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00019912640693269752,
+      "loss": 1.3837,
+      "step": 18
+    },
+    {
+      "epoch": 0.37623762376237624,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00019889478706014687,
+      "loss": 1.3673,
+      "step": 19
+    },
+    {
+      "epoch": 0.39603960396039606,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.366,
+      "step": 20
+    },
+    {
+      "epoch": 0.4158415841584158,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00019835050748723824,
+      "loss": 1.3318,
+      "step": 21
+    },
+    {
+      "epoch": 0.43564356435643564,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00019803799658748094,
+      "loss": 1.2741,
+      "step": 22
+    },
+    {
+      "epoch": 0.45544554455445546,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019769868307835994,
+      "loss": 1.2978,
+      "step": 23
+    },
+    {
+      "epoch": 0.4752475247524752,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001973326597248006,
+      "loss": 1.2733,
+      "step": 24
+    },
+    {
+      "epoch": 0.49504950495049505,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.2302,
+      "step": 25
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00019652089102773488,
+      "loss": 1.2083,
+      "step": 26
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "eval_loss": 1.224540114402771,
+      "eval_runtime": 13.8695,
+      "eval_samples_per_second": 9.013,
+      "eval_steps_per_second": 4.542,
+      "step": 26
+    },
+    {
+      "epoch": 0.5346534653465347,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00019607536761368484,
+      "loss": 1.1761,
+      "step": 27
+    },
+    {
+      "epoch": 0.5544554455445545,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00019560357815343577,
+      "loss": 1.1751,
+      "step": 28
+    },
+    {
+      "epoch": 0.5742574257425742,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 1.2002,
+      "step": 29
+    },
+    {
+      "epoch": 0.594059405940594,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1544,
+      "step": 30
+    },
+    {
+      "epoch": 0.6138613861386139,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00019403193901161613,
+      "loss": 1.1384,
+      "step": 31
+    },
+    {
+      "epoch": 0.6336633663366337,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0001934564464599461,
+      "loss": 1.0999,
+      "step": 32
+    },
+    {
+      "epoch": 0.6534653465346535,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019285540384897073,
+      "loss": 1.1576,
+      "step": 33
+    },
+    {
+      "epoch": 0.6732673267326733,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019222897549773848,
+      "loss": 1.091,
+      "step": 34
+    },
+    {
+      "epoch": 0.693069306930693,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.056,
+      "step": 35
+    },
+    {
+      "epoch": 0.7128712871287128,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00019090065350491626,
+      "loss": 1.1068,
+      "step": 36
+    },
+    {
+      "epoch": 0.7326732673267327,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00019019912301329592,
+      "loss": 1.0583,
+      "step": 37
+    },
+    {
+      "epoch": 0.7524752475247525,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00018947293298207635,
+      "loss": 1.0671,
+      "step": 38
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001887222819443612,
+      "loss": 1.0851,
+      "step": 39
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "eval_loss": 1.060703158378601,
+      "eval_runtime": 13.878,
+      "eval_samples_per_second": 9.007,
+      "eval_steps_per_second": 4.54,
+      "step": 39
+    },
+    {
+      "epoch": 0.7920792079207921,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.0343,
+      "step": 40
+    },
+    {
+      "epoch": 0.8118811881188119,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00018714842436272773,
+      "loss": 0.9789,
+      "step": 41
+    },
+    {
+      "epoch": 0.8316831683168316,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00018632564809575742,
+      "loss": 1.0174,
+      "step": 42
+    },
+    {
+      "epoch": 0.8514851485148515,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001854792712585539,
+      "loss": 1.0004,
+      "step": 43
+    },
+    {
+      "epoch": 0.8712871287128713,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00018460952524209355,
+      "loss": 1.0281,
+      "step": 44
+    },
+    {
+      "epoch": 0.8910891089108911,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 0.9992,
+      "step": 45
+    },
+    {
+      "epoch": 0.9108910891089109,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00018280088311480201,
+      "loss": 0.9635,
+      "step": 46
+    },
+    {
+      "epoch": 0.9306930693069307,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00018186248146866927,
+      "loss": 1.006,
+      "step": 47
+    },
+    {
+      "epoch": 0.9504950495049505,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9891,
+      "step": 48
+    },
+    {
+      "epoch": 0.9702970297029703,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001799187996894925,
+      "loss": 0.9809,
+      "step": 49
+    },
+    {
+      "epoch": 0.9900990099009901,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 0.9646,
+      "step": 50
+    },
+    {
+      "epoch": 1.00990099009901,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00017788772787621126,
+      "loss": 0.9553,
+      "step": 51
+    },
+    {
+      "epoch": 1.0297029702970297,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00017684011108568592,
+      "loss": 0.9432,
+      "step": 52
+    },
+    {
+      "epoch": 1.0297029702970297,
+      "eval_loss": 0.9755253195762634,
+      "eval_runtime": 13.879,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 52
+    },
+    {
+      "epoch": 1.0495049504950495,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001757714869760335,
+      "loss": 0.9631,
+      "step": 53
+    },
+    {
+      "epoch": 1.0693069306930694,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001746821476984154,
+      "loss": 0.9539,
+      "step": 54
+    },
+    {
+      "epoch": 1.0198019801980198,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00017357239106731317,
+      "loss": 0.9559,
+      "step": 55
+    },
+    {
+      "epoch": 1.0396039603960396,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00017244252047910892,
+      "loss": 0.9111,
+      "step": 56
+    },
+    {
+      "epoch": 1.0594059405940595,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.00017129284482913972,
+      "loss": 0.9503,
+      "step": 57
+    },
+    {
+      "epoch": 1.0792079207920793,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.00017012367842724887,
+      "loss": 0.911,
+      "step": 58
+    },
+    {
+      "epoch": 1.099009900990099,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0001689353409118566,
+      "loss": 0.9041,
+      "step": 59
+    },
+    {
+      "epoch": 1.118811881188119,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 0.9117,
+      "step": 60
+    },
+    {
+      "epoch": 1.1386138613861387,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001665024572113848,
+      "loss": 0.9351,
+      "step": 61
+    },
+    {
+      "epoch": 1.1584158415841583,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.00016525857615241687,
+      "loss": 0.9438,
+      "step": 62
+    },
+    {
+      "epoch": 1.1782178217821782,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00016399685405033167,
+      "loss": 0.9075,
+      "step": 63
+    },
+    {
+      "epoch": 1.198019801980198,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001627176358473537,
+      "loss": 0.8983,
+      "step": 64
+    },
+    {
+      "epoch": 1.2178217821782178,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001614212712689668,
+      "loss": 0.9007,
+      "step": 65
+    },
+    {
+      "epoch": 1.2178217821782178,
+      "eval_loss": 0.9333999156951904,
+      "eval_runtime": 13.8668,
+      "eval_samples_per_second": 9.014,
+      "eval_steps_per_second": 4.543,
+      "step": 65
+    },
+    {
+      "epoch": 1.2376237623762376,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00016010811472830252,
+      "loss": 0.9108,
+      "step": 66
+    },
+    {
+      "epoch": 1.2574257425742574,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.9177,
+      "step": 67
+    },
+    {
+      "epoch": 1.2772277227722773,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00015743286626829437,
+      "loss": 0.9,
+      "step": 68
+    },
+    {
+      "epoch": 1.297029702970297,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001560715057351673,
+      "loss": 0.9096,
+      "step": 69
+    },
+    {
+      "epoch": 1.316831683168317,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 0.8946,
+      "step": 70
+    },
+    {
+      "epoch": 1.3366336633663367,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001533031728727994,
+      "loss": 0.8995,
+      "step": 71
+    },
+    {
+      "epoch": 1.3564356435643563,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00015189695737812152,
+      "loss": 0.922,
+      "step": 72
+    },
+    {
+      "epoch": 1.3762376237623761,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001504765537734844,
+      "loss": 0.885,
+      "step": 73
+    },
+    {
+      "epoch": 1.396039603960396,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00014904235038305083,
+      "loss": 0.895,
+      "step": 74
+    },
+    {
+      "epoch": 1.4158415841584158,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00014759473930370736,
+      "loss": 0.892,
+      "step": 75
+    },
+    {
+      "epoch": 1.4356435643564356,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001461341162978688,
+      "loss": 0.8277,
+      "step": 76
+    },
+    {
+      "epoch": 1.4554455445544554,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00014466088068528068,
+      "loss": 0.8687,
+      "step": 77
+    },
+    {
+      "epoch": 1.4752475247524752,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00014317543523384928,
+      "loss": 0.8765,
+      "step": 78
+    },
+    {
+      "epoch": 1.4752475247524752,
+      "eval_loss": 0.9083698391914368,
+      "eval_runtime": 13.8834,
+      "eval_samples_per_second": 9.004,
+      "eval_steps_per_second": 4.538,
+      "step": 78
+    },
+    {
+      "epoch": 1.495049504950495,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00014167818604952906,
+      "loss": 0.8797,
+      "step": 79
+    },
+    {
+      "epoch": 1.5148514851485149,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 0.905,
+      "step": 80
+    },
+    {
+      "epoch": 1.5346534653465347,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.00013864991692924523,
+      "loss": 0.8575,
+      "step": 81
+    },
+    {
+      "epoch": 1.5544554455445545,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00013711972489182208,
+      "loss": 0.8957,
+      "step": 82
+    },
+    {
+      "epoch": 1.5742574257425743,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00013557938469225167,
+      "loss": 0.8792,
+      "step": 83
+    },
+    {
+      "epoch": 1.5940594059405941,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.00013402931744416433,
+      "loss": 0.889,
+      "step": 84
+    },
+    {
+      "epoch": 1.613861386138614,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00013246994692046836,
+      "loss": 0.8657,
+      "step": 85
+    },
+    {
+      "epoch": 1.6336633663366338,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.8784,
+      "step": 86
+    },
+    {
+      "epoch": 1.6534653465346536,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001293250037384465,
+      "loss": 0.8822,
+      "step": 87
+    },
+    {
+      "epoch": 1.6732673267326734,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.00012774029087618446,
+      "loss": 0.9092,
+      "step": 88
+    },
+    {
+      "epoch": 1.693069306930693,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.00012614799409538198,
+      "loss": 0.8813,
+      "step": 89
+    },
+    {
+      "epoch": 1.7128712871287128,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 0.8975,
+      "step": 90
+    },
+    {
+      "epoch": 1.7326732673267327,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00012294239200467516,
+      "loss": 0.8789,
+      "step": 91
+    },
+    {
+      "epoch": 1.7326732673267327,
+      "eval_loss": 0.8891416788101196,
+      "eval_runtime": 13.872,
+      "eval_samples_per_second": 9.011,
+      "eval_steps_per_second": 4.542,
+      "step": 91
+    },
+    {
+      "epoch": 1.7524752475247525,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001213299630743747,
+      "loss": 0.9184,
+      "step": 92
+    },
+    {
+      "epoch": 1.7722772277227723,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00011971170274514802,
+      "loss": 0.8854,
+      "step": 93
+    },
+    {
+      "epoch": 1.7920792079207921,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.000118088053433211,
+      "loss": 0.8688,
+      "step": 94
+    },
+    {
+      "epoch": 1.811881188118812,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00011645945902807341,
+      "loss": 0.8281,
+      "step": 95
+    },
+    {
+      "epoch": 1.8316831683168315,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001148263647711842,
+      "loss": 0.8488,
+      "step": 96
+    },
+    {
+      "epoch": 1.8514851485148514,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00011318921713420691,
+      "loss": 0.8742,
+      "step": 97
+    },
+    {
+      "epoch": 1.8712871287128712,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00011154846369695863,
+      "loss": 0.8586,
+      "step": 98
+    },
+    {
+      "epoch": 1.891089108910891,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001099045530250463,
+      "loss": 0.8776,
+      "step": 99
+    },
+    {
+      "epoch": 1.9108910891089108,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 0.8563,
+      "step": 100
+    },
+    {
+      "epoch": 1.9306930693069306,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00010660905843256994,
+      "loss": 0.8381,
+      "step": 101
+    },
+    {
+      "epoch": 1.9504950495049505,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.00010495837546732224,
+      "loss": 0.847,
+      "step": 102
+    },
+    {
+      "epoch": 1.9702970297029703,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.00010330633693173082,
+      "loss": 0.8512,
+      "step": 103
+    },
+    {
+      "epoch": 1.99009900990099,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.00010165339447663587,
+      "loss": 0.8304,
+      "step": 104
+    },
+    {
+      "epoch": 1.99009900990099,
+      "eval_loss": 0.8779018521308899,
+      "eval_runtime": 13.8827,
+      "eval_samples_per_second": 9.004,
+      "eval_steps_per_second": 4.538,
+      "step": 104
+    },
+    {
+      "epoch": 2.00990099009901,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.8523,
+      "step": 105
+    },
+    {
+      "epoch": 2.0297029702970297,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 9.834660552336415e-05,
+      "loss": 0.8109,
+      "step": 106
+    },
+    {
+      "epoch": 2.0495049504950495,
+      "grad_norm": 0.224609375,
+      "learning_rate": 9.669366306826919e-05,
+      "loss": 0.8394,
+      "step": 107
+    },
+    {
+      "epoch": 2.0693069306930694,
+      "grad_norm": 0.283203125,
+      "learning_rate": 9.504162453267777e-05,
+      "loss": 0.8524,
+      "step": 108
+    },
+    {
+      "epoch": 2.01980198019802,
+      "grad_norm": 0.22265625,
+      "learning_rate": 9.339094156743007e-05,
+      "loss": 0.8391,
+      "step": 109
+    },
+    {
+      "epoch": 2.0396039603960396,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 9.174206545276677e-05,
+      "loss": 0.8317,
+      "step": 110
+    },
+    {
+      "epoch": 2.0594059405940595,
+      "grad_norm": 0.22265625,
+      "learning_rate": 9.009544697495374e-05,
+      "loss": 0.833,
+      "step": 111
+    },
+    {
+      "epoch": 2.0792079207920793,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 8.845153630304139e-05,
+      "loss": 0.8408,
+      "step": 112
+    },
+    {
+      "epoch": 2.099009900990099,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 8.681078286579311e-05,
+      "loss": 0.8459,
+      "step": 113
+    },
+    {
+      "epoch": 2.118811881188119,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 8.517363522881579e-05,
+      "loss": 0.8177,
+      "step": 114
+    },
+    {
+      "epoch": 2.1386138613861387,
+      "grad_norm": 0.2265625,
+      "learning_rate": 8.35405409719266e-05,
+      "loss": 0.8451,
+      "step": 115
+    },
+    {
+      "epoch": 2.1584158415841586,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 8.191194656678904e-05,
+      "loss": 0.8543,
+      "step": 116
+    },
+    {
+      "epoch": 2.1782178217821784,
+      "grad_norm": 0.22265625,
+      "learning_rate": 8.028829725485199e-05,
+      "loss": 0.8194,
+      "step": 117
+    },
+    {
+      "epoch": 2.1782178217821784,
+      "eval_loss": 0.8713971972465515,
+      "eval_runtime": 13.8976,
+      "eval_samples_per_second": 8.994,
+      "eval_steps_per_second": 4.533,
+      "step": 117
+    },
+    {
+      "epoch": 2.198019801980198,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 7.867003692562534e-05,
+      "loss": 0.808,
+      "step": 118
+    },
+    {
+      "epoch": 2.217821782178218,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 7.705760799532485e-05,
+      "loss": 0.8073,
+      "step": 119
+    },
+    {
+      "epoch": 2.237623762376238,
+      "grad_norm": 0.201171875,
+      "learning_rate": 7.54514512859201e-05,
+      "loss": 0.8392,
+      "step": 120
+    },
+    {
+      "epoch": 2.2574257425742577,
+      "grad_norm": 0.25,
+      "learning_rate": 7.385200590461803e-05,
+      "loss": 0.8574,
+      "step": 121
+    },
+    {
+      "epoch": 2.2772277227722775,
+      "grad_norm": 0.271484375,
+      "learning_rate": 7.225970912381556e-05,
+      "loss": 0.8338,
+      "step": 122
+    },
+    {
+      "epoch": 2.297029702970297,
+      "grad_norm": 0.294921875,
+      "learning_rate": 7.067499626155354e-05,
+      "loss": 0.8788,
+      "step": 123
+    },
+    {
+      "epoch": 2.3168316831683167,
+      "grad_norm": 0.2265625,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.8297,
+      "step": 124
+    },
+    {
+      "epoch": 2.3366336633663365,
+      "grad_norm": 0.267578125,
+      "learning_rate": 6.753005307953167e-05,
+      "loss": 0.8125,
+      "step": 125
+    },
+    {
+      "epoch": 2.3564356435643563,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 6.59706825558357e-05,
+      "loss": 0.814,
+      "step": 126
+    },
+    {
+      "epoch": 2.376237623762376,
+      "grad_norm": 0.27734375,
+      "learning_rate": 6.442061530774834e-05,
+      "loss": 0.8335,
+      "step": 127
+    },
+    {
+      "epoch": 2.396039603960396,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 6.28802751081779e-05,
+      "loss": 0.8512,
+      "step": 128
+    },
+    {
+      "epoch": 2.4158415841584158,
+      "grad_norm": 0.224609375,
+      "learning_rate": 6.135008307075481e-05,
+      "loss": 0.8297,
+      "step": 129
+    },
+    {
+      "epoch": 2.4356435643564356,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 5.983045753470308e-05,
+      "loss": 0.848,
+      "step": 130
+    },
+    {
+      "epoch": 2.4356435643564356,
+      "eval_loss": 0.8665071129798889,
+      "eval_runtime": 13.8735,
+      "eval_samples_per_second": 9.01,
+      "eval_steps_per_second": 4.541,
+      "step": 130
+    },
+    {
+      "epoch": 2.4554455445544554,
+      "grad_norm": 0.2265625,
+      "learning_rate": 5.832181395047098e-05,
+      "loss": 0.8203,
+      "step": 131
+    },
+    {
+      "epoch": 2.4752475247524752,
+      "grad_norm": 0.287109375,
+      "learning_rate": 5.6824564766150726e-05,
+      "loss": 0.8519,
+      "step": 132
+    },
+    {
+      "epoch": 2.495049504950495,
+      "grad_norm": 0.21484375,
+      "learning_rate": 5.533911931471936e-05,
+      "loss": 0.83,
+      "step": 133
+    },
+    {
+      "epoch": 2.514851485148515,
+      "grad_norm": 0.2109375,
+      "learning_rate": 5.386588370213124e-05,
+      "loss": 0.842,
+      "step": 134
+    },
+    {
+      "epoch": 2.5346534653465347,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 5.240526069629265e-05,
+      "loss": 0.8419,
+      "step": 135
+    },
+    {
+      "epoch": 2.5544554455445545,
+      "grad_norm": 0.267578125,
+      "learning_rate": 5.095764961694922e-05,
+      "loss": 0.8458,
+      "step": 136
+    },
+    {
+      "epoch": 2.5742574257425743,
+      "grad_norm": 0.203125,
+      "learning_rate": 4.952344622651566e-05,
+      "loss": 0.8133,
+      "step": 137
+    },
+    {
+      "epoch": 2.594059405940594,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 4.810304262187852e-05,
+      "loss": 0.8103,
+      "step": 138
+    },
+    {
+      "epoch": 2.613861386138614,
+      "grad_norm": 0.20703125,
+      "learning_rate": 4.669682712720065e-05,
+      "loss": 0.8105,
+      "step": 139
+    },
+    {
+      "epoch": 2.633663366336634,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 4.530518418775733e-05,
+      "loss": 0.8305,
+      "step": 140
+    },
+    {
+      "epoch": 2.6534653465346536,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 4.392849426483274e-05,
+      "loss": 0.7881,
+      "step": 141
+    },
+    {
+      "epoch": 2.6732673267326734,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 4.256713373170564e-05,
+      "loss": 0.8204,
+      "step": 142
+    },
+    {
+      "epoch": 2.693069306930693,
+      "grad_norm": 0.263671875,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.8354,
+      "step": 143
+    },
+    {
+      "epoch": 2.693069306930693,
+      "eval_loss": 0.8626759648323059,
+      "eval_runtime": 13.8585,
+      "eval_samples_per_second": 9.02,
+      "eval_steps_per_second": 4.546,
+      "step": 143
+    },
+    {
+      "epoch": 2.7128712871287126,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 3.9891885271697496e-05,
+      "loss": 0.8441,
+      "step": 144
+    },
+    {
+      "epoch": 2.7326732673267324,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 3.857872873103322e-05,
+      "loss": 0.8084,
+      "step": 145
+    },
+    {
+      "epoch": 2.7524752475247523,
+      "grad_norm": 0.18359375,
+      "learning_rate": 3.7282364152646297e-05,
+      "loss": 0.8184,
+      "step": 146
+    },
+    {
+      "epoch": 2.772277227722772,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 3.600314594966834e-05,
+      "loss": 0.8302,
+      "step": 147
+    },
+    {
+      "epoch": 2.792079207920792,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 3.4741423847583134e-05,
+      "loss": 0.8503,
+      "step": 148
+    },
+    {
+      "epoch": 2.8118811881188117,
+      "grad_norm": 0.2265625,
+      "learning_rate": 3.349754278861517e-05,
+      "loss": 0.8273,
+      "step": 149
+    },
+    {
+      "epoch": 2.8316831683168315,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 3.227184283742591e-05,
+      "loss": 0.8332,
+      "step": 150
+    },
+    {
+      "epoch": 2.8514851485148514,
+      "grad_norm": 0.185546875,
+      "learning_rate": 3.106465908814342e-05,
+      "loss": 0.8391,
+      "step": 151
+    },
+    {
+      "epoch": 2.871287128712871,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 2.9876321572751144e-05,
+      "loss": 0.8029,
+      "step": 152
+    },
+    {
+      "epoch": 2.891089108910891,
+      "grad_norm": 0.224609375,
+      "learning_rate": 2.87071551708603e-05,
+      "loss": 0.8561,
+      "step": 153
+    },
+    {
+      "epoch": 2.910891089108911,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 2.7557479520891104e-05,
+      "loss": 0.8055,
+      "step": 154
+    },
+    {
+      "epoch": 2.9306930693069306,
+      "grad_norm": 0.16796875,
+      "learning_rate": 2.6427608932686843e-05,
+      "loss": 0.8301,
+      "step": 155
+    },
+    {
+      "epoch": 2.9504950495049505,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 2.5317852301584643e-05,
+      "loss": 0.8476,
+      "step": 156
+    },
+    {
+      "epoch": 2.9504950495049505,
+      "eval_loss": 0.8605256080627441,
+      "eval_runtime": 13.8794,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 156
+    },
+    {
+      "epoch": 2.9702970297029703,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 2.422851302396655e-05,
+      "loss": 0.8483,
+      "step": 157
+    },
+    {
+      "epoch": 2.99009900990099,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 2.315988891431412e-05,
+      "loss": 0.8379,
+      "step": 158
+    },
+    {
+      "epoch": 3.00990099009901,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 2.2112272123788768e-05,
+      "loss": 0.8042,
+      "step": 159
+    },
+    {
+      "epoch": 3.0297029702970297,
+      "grad_norm": 0.416015625,
+      "learning_rate": 2.1085949060360654e-05,
+      "loss": 0.8597,
+      "step": 160
+    },
+    {
+      "epoch": 3.0495049504950495,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 2.008120031050753e-05,
+      "loss": 0.8327,
+      "step": 161
+    },
+    {
+      "epoch": 3.0693069306930694,
+      "grad_norm": 0.212890625,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.7991,
+      "step": 162
+    },
+    {
+      "epoch": 3.01980198019802,
+      "grad_norm": 0.1875,
+      "learning_rate": 1.8137518531330767e-05,
+      "loss": 0.8083,
+      "step": 163
+    },
+    {
+      "epoch": 3.0396039603960396,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 1.7199116885197995e-05,
+      "loss": 0.8321,
+      "step": 164
+    },
+    {
+      "epoch": 3.0594059405940595,
+      "grad_norm": 0.193359375,
+      "learning_rate": 1.6283352173747145e-05,
+      "loss": 0.8596,
+      "step": 165
+    },
+    {
+      "epoch": 3.0792079207920793,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 1.5390474757906446e-05,
+      "loss": 0.82,
+      "step": 166
+    },
+    {
+      "epoch": 3.099009900990099,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 1.4520728741446089e-05,
+      "loss": 0.8245,
+      "step": 167
+    },
+    {
+      "epoch": 3.118811881188119,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 1.3674351904242611e-05,
+      "loss": 0.8174,
+      "step": 168
+    },
+    {
+      "epoch": 3.1386138613861387,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 1.2851575637272262e-05,
+      "loss": 0.811,
+      "step": 169
+    },
+    {
+      "epoch": 3.1386138613861387,
+      "eval_loss": 0.8589804768562317,
+      "eval_runtime": 13.8605,
+      "eval_samples_per_second": 9.018,
+      "eval_steps_per_second": 4.545,
+      "step": 169
+    },
+    {
+      "epoch": 3.1584158415841586,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 1.2052624879351104e-05,
+      "loss": 0.8043,
+      "step": 170
+    },
+    {
+      "epoch": 3.1782178217821784,
+      "grad_norm": 0.181640625,
+      "learning_rate": 1.1277718055638819e-05,
+      "loss": 0.8117,
+      "step": 171
+    },
+    {
+      "epoch": 3.198019801980198,
+      "grad_norm": 0.205078125,
+      "learning_rate": 1.0527067017923654e-05,
+      "loss": 0.8176,
+      "step": 172
+    },
+    {
+      "epoch": 3.217821782178218,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 9.80087698670411e-06,
+      "loss": 0.7919,
+      "step": 173
+    },
+    {
+      "epoch": 3.237623762376238,
+      "grad_norm": 0.177734375,
+      "learning_rate": 9.09934649508375e-06,
+      "loss": 0.8099,
+      "step": 174
+    },
+    {
+      "epoch": 3.2574257425742577,
+      "grad_norm": 0.203125,
+      "learning_rate": 8.422667334494249e-06,
+      "loss": 0.8161,
+      "step": 175
+    },
+    {
+      "epoch": 3.2772277227722775,
+      "grad_norm": 0.208984375,
+      "learning_rate": 7.771024502261526e-06,
+      "loss": 0.8199,
+      "step": 176
+    },
+    {
+      "epoch": 3.297029702970297,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 7.144596151029303e-06,
+      "loss": 0.8077,
+      "step": 177
+    },
+    {
+      "epoch": 3.3168316831683167,
+      "grad_norm": 0.298828125,
+      "learning_rate": 6.543553540053926e-06,
+      "loss": 0.8532,
+      "step": 178
+    },
+    {
+      "epoch": 3.3366336633663365,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 5.968060988383883e-06,
+      "loss": 0.8062,
+      "step": 179
+    },
+    {
+      "epoch": 3.3564356435643563,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 5.418275829936537e-06,
+      "loss": 0.802,
+      "step": 180
+    },
+    {
+      "epoch": 3.376237623762376,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.8189,
+      "step": 181
+    },
+    {
+      "epoch": 3.396039603960396,
+      "grad_norm": 0.169921875,
+      "learning_rate": 4.3964218465642355e-06,
+      "loss": 0.8178,
+      "step": 182
+    },
+    {
+      "epoch": 3.396039603960396,
+      "eval_loss": 0.858788788318634,
+      "eval_runtime": 13.8817,
+      "eval_samples_per_second": 9.005,
+      "eval_steps_per_second": 4.538,
+      "step": 182
+    },
+    {
+      "epoch": 3.4158415841584158,
+      "grad_norm": 0.16796875,
+      "learning_rate": 3.924632386315186e-06,
+      "loss": 0.8307,
+      "step": 183
+    },
+    {
+      "epoch": 3.4356435643564356,
+      "grad_norm": 0.181640625,
+      "learning_rate": 3.4791089722651436e-06,
+      "loss": 0.8255,
+      "step": 184
+    },
+    {
+      "epoch": 3.4554455445544554,
+      "grad_norm": 0.185546875,
+      "learning_rate": 3.059973406066963e-06,
+      "loss": 0.8222,
+      "step": 185
+    },
+    {
+      "epoch": 3.4752475247524752,
+      "grad_norm": 0.19140625,
+      "learning_rate": 2.667340275199426e-06,
+      "loss": 0.8054,
+      "step": 186
+    },
+    {
+      "epoch": 3.495049504950495,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 2.3013169216400733e-06,
+      "loss": 0.8628,
+      "step": 187
+    },
+    {
+      "epoch": 3.514851485148515,
+      "grad_norm": 0.1796875,
+      "learning_rate": 1.9620034125190644e-06,
+      "loss": 0.8338,
+      "step": 188
+    },
+    {
+      "epoch": 3.5346534653465347,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 1.6494925127617634e-06,
+      "loss": 0.809,
+      "step": 189
+    },
+    {
+      "epoch": 3.5544554455445545,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 1.3638696597277679e-06,
+      "loss": 0.8328,
+      "step": 190
+    },
+    {
+      "epoch": 3.5742574257425743,
+      "grad_norm": 0.17578125,
+      "learning_rate": 1.1052129398531507e-06,
+      "loss": 0.8062,
+      "step": 191
+    },
+    {
+      "epoch": 3.594059405940594,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 8.735930673024806e-07,
+      "loss": 0.832,
+      "step": 192
+    },
+    {
+      "epoch": 3.613861386138614,
+      "grad_norm": 0.17578125,
+      "learning_rate": 6.690733646361857e-07,
+      "loss": 0.8107,
+      "step": 193
+    },
+    {
+      "epoch": 3.633663366336634,
+      "grad_norm": 0.1875,
+      "learning_rate": 4.917097454988584e-07,
+      "loss": 0.8315,
+      "step": 194
+    },
+    {
+      "epoch": 3.6534653465346536,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 3.415506993330153e-07,
+      "loss": 0.8073,
+      "step": 195
+    },
+    {
+      "epoch": 3.6534653465346536,
+      "eval_loss": 0.858626127243042,
+      "eval_runtime": 13.8621,
+      "eval_samples_per_second": 9.017,
+      "eval_steps_per_second": 4.545,
+      "step": 195
+    },
+    {
+      "epoch": 3.6732673267326734,
+      "grad_norm": 0.197265625,
+      "learning_rate": 2.1863727812254653e-07,
+      "loss": 0.8403,
+      "step": 196
+    },
+    {
+      "epoch": 3.693069306930693,
+      "grad_norm": 0.189453125,
+      "learning_rate": 1.230030851695263e-07,
+      "loss": 0.8116,
+      "step": 197
+    },
+    {
+      "epoch": 3.7128712871287126,
+      "grad_norm": 0.173828125,
+      "learning_rate": 5.467426590739511e-08,
+      "loss": 0.8115,
+      "step": 198
+    },
+    {
+      "epoch": 3.7326732673267324,
+      "grad_norm": 0.177734375,
+      "learning_rate": 1.3669500753099585e-08,
+      "loss": 0.7962,
+      "step": 199
+    },
+    {
+      "epoch": 3.7524752475247523,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0,
+      "loss": 0.8031,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.164477534181786e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944

checkpoint-50/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-50/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-50/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a02032e4ced1f76caa201d55031ab5925f6d0fb66b5d8f3092b8c5d785219b37
+size 50503848

checkpoint-50/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ff58348f44e2bde44ab7f9193c61e20dd0f8d95e056c7a292421ffd95a8c7d3
+size 202035450

checkpoint-50/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b35231a2c551e6ed40111614cd789a64fe47b38c49d5b21bea0aa24df8b78d2
+size 14244

checkpoint-50/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9e02dc10b7239989ab9b4418ee704e53fad611ad6b77ad633028bb8eb5238dd
+size 1064

checkpoint-50/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-50/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-50/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-50/trainer_state.json ADDED Viewed

	@@ -0,0 +1,415 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9900990099009901,
+  "eval_steps": 13,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.019801980198019802,
+      "grad_norm": 1.15625,
+      "learning_rate": 2e-05,
+      "loss": 2.0919,
+      "step": 1
+    },
+    {
+      "epoch": 0.019801980198019802,
+      "eval_loss": 2.079954147338867,
+      "eval_runtime": 13.8908,
+      "eval_samples_per_second": 8.999,
+      "eval_steps_per_second": 4.535,
+      "step": 1
+    },
+    {
+      "epoch": 0.039603960396039604,
+      "grad_norm": 1.203125,
+      "learning_rate": 4e-05,
+      "loss": 2.0814,
+      "step": 2
+    },
+    {
+      "epoch": 0.0594059405940594,
+      "grad_norm": 1.1953125,
+      "learning_rate": 6e-05,
+      "loss": 2.0499,
+      "step": 3
+    },
+    {
+      "epoch": 0.07920792079207921,
+      "grad_norm": 1.0859375,
+      "learning_rate": 8e-05,
+      "loss": 2.0153,
+      "step": 4
+    },
+    {
+      "epoch": 0.09900990099009901,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.0001,
+      "loss": 1.9548,
+      "step": 5
+    },
+    {
+      "epoch": 0.1188118811881188,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00012,
+      "loss": 1.8982,
+      "step": 6
+    },
+    {
+      "epoch": 0.13861386138613863,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.00014,
+      "loss": 1.8226,
+      "step": 7
+    },
+    {
+      "epoch": 0.15841584158415842,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00016,
+      "loss": 1.7572,
+      "step": 8
+    },
+    {
+      "epoch": 0.1782178217821782,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00018,
+      "loss": 1.7074,
+      "step": 9
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0002,
+      "loss": 1.6317,
+      "step": 10
+    },
+    {
+      "epoch": 0.21782178217821782,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001999863304992469,
+      "loss": 1.5801,
+      "step": 11
+    },
+    {
+      "epoch": 0.2376237623762376,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.00019994532573409262,
+      "loss": 1.5721,
+      "step": 12
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.00019987699691483048,
+      "loss": 1.5479,
+      "step": 13
+    },
+    {
+      "epoch": 0.25742574257425743,
+      "eval_loss": 1.5341482162475586,
+      "eval_runtime": 13.8795,
+      "eval_samples_per_second": 9.006,
+      "eval_steps_per_second": 4.539,
+      "step": 13
+    },
+    {
+      "epoch": 0.27722772277227725,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.00019978136272187747,
+      "loss": 1.534,
+      "step": 14
+    },
+    {
+      "epoch": 0.297029702970297,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.4804,
+      "step": 15
+    },
+    {
+      "epoch": 0.31683168316831684,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00019950829025450114,
+      "loss": 1.4805,
+      "step": 16
+    },
+    {
+      "epoch": 0.33663366336633666,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00019933092663536382,
+      "loss": 1.3809,
+      "step": 17
+    },
+    {
+      "epoch": 0.3564356435643564,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.00019912640693269752,
+      "loss": 1.3837,
+      "step": 18
+    },
+    {
+      "epoch": 0.37623762376237624,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00019889478706014687,
+      "loss": 1.3673,
+      "step": 19
+    },
+    {
+      "epoch": 0.39603960396039606,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.366,
+      "step": 20
+    },
+    {
+      "epoch": 0.4158415841584158,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00019835050748723824,
+      "loss": 1.3318,
+      "step": 21
+    },
+    {
+      "epoch": 0.43564356435643564,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00019803799658748094,
+      "loss": 1.2741,
+      "step": 22
+    },
+    {
+      "epoch": 0.45544554455445546,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019769868307835994,
+      "loss": 1.2978,
+      "step": 23
+    },
+    {
+      "epoch": 0.4752475247524752,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001973326597248006,
+      "loss": 1.2733,
+      "step": 24
+    },
+    {
+      "epoch": 0.49504950495049505,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.2302,
+      "step": 25
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.00019652089102773488,
+      "loss": 1.2083,
+      "step": 26
+    },
+    {
+      "epoch": 0.5148514851485149,
+      "eval_loss": 1.224540114402771,
+      "eval_runtime": 13.8695,
+      "eval_samples_per_second": 9.013,
+      "eval_steps_per_second": 4.542,
+      "step": 26
+    },
+    {
+      "epoch": 0.5346534653465347,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00019607536761368484,
+      "loss": 1.1761,
+      "step": 27
+    },
+    {
+      "epoch": 0.5544554455445545,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00019560357815343577,
+      "loss": 1.1751,
+      "step": 28
+    },
+    {
+      "epoch": 0.5742574257425742,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 1.2002,
+      "step": 29
+    },
+    {
+      "epoch": 0.594059405940594,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1544,
+      "step": 30
+    },
+    {
+      "epoch": 0.6138613861386139,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00019403193901161613,
+      "loss": 1.1384,
+      "step": 31
+    },
+    {
+      "epoch": 0.6336633663366337,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0001934564464599461,
+      "loss": 1.0999,
+      "step": 32
+    },
+    {
+      "epoch": 0.6534653465346535,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00019285540384897073,
+      "loss": 1.1576,
+      "step": 33
+    },
+    {
+      "epoch": 0.6732673267326733,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00019222897549773848,
+      "loss": 1.091,
+      "step": 34
+    },
+    {
+      "epoch": 0.693069306930693,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.056,
+      "step": 35
+    },
+    {
+      "epoch": 0.7128712871287128,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00019090065350491626,
+      "loss": 1.1068,
+      "step": 36
+    },
+    {
+      "epoch": 0.7326732673267327,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.00019019912301329592,
+      "loss": 1.0583,
+      "step": 37
+    },
+    {
+      "epoch": 0.7524752475247525,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00018947293298207635,
+      "loss": 1.0671,
+      "step": 38
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001887222819443612,
+      "loss": 1.0851,
+      "step": 39
+    },
+    {
+      "epoch": 0.7722772277227723,
+      "eval_loss": 1.060703158378601,
+      "eval_runtime": 13.878,
+      "eval_samples_per_second": 9.007,
+      "eval_steps_per_second": 4.54,
+      "step": 39
+    },
+    {
+      "epoch": 0.7920792079207921,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.0343,
+      "step": 40
+    },
+    {
+      "epoch": 0.8118811881188119,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.00018714842436272773,
+      "loss": 0.9789,
+      "step": 41
+    },
+    {
+      "epoch": 0.8316831683168316,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00018632564809575742,
+      "loss": 1.0174,
+      "step": 42
+    },
+    {
+      "epoch": 0.8514851485148515,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001854792712585539,
+      "loss": 1.0004,
+      "step": 43
+    },
+    {
+      "epoch": 0.8712871287128713,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.00018460952524209355,
+      "loss": 1.0281,
+      "step": 44
+    },
+    {
+      "epoch": 0.8910891089108911,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 0.9992,
+      "step": 45
+    },
+    {
+      "epoch": 0.9108910891089109,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.00018280088311480201,
+      "loss": 0.9635,
+      "step": 46
+    },
+    {
+      "epoch": 0.9306930693069307,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00018186248146866927,
+      "loss": 1.006,
+      "step": 47
+    },
+    {
+      "epoch": 0.9504950495049505,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.9891,
+      "step": 48
+    },
+    {
+      "epoch": 0.9702970297029703,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001799187996894925,
+      "loss": 0.9809,
+      "step": 49
+    },
+    {
+      "epoch": 0.9900990099009901,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 0.9646,
+      "step": 50
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.04177049010176e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-50/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f
+size 5944

config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 22,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "bfloat16",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.1",
+  "use_cache": false,
+  "vocab_size": 32000
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723