diff --git a/README.md b/README.md index ab1026189d4daa3c13229a3ffce8d7b35a755eb3..213dc0957d1ce56857a23da29983402f3e5ee424 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,143 @@ ---- -license: unknown ---- +--- +license: apache-2.0 +library_name: peft +tags: +- generated_from_trainer +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +model-index: +- name: outputs/qlora-out + results: [] +--- + + + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +
See axolotl config + +axolotl version: `0.4.1` +```yaml +adapter: qlora +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +bf16: auto +dataset_prepared_path: null +datasets: +- path: Taiel26/plm_2500_uniref + type: alpaca +debug: null +deepspeed: null +early_stopping_patience: null +eval_sample_packing: false +evals_per_epoch: 4 +flash_attention: true +fp16: null +fsdp: null +fsdp_config: null +gradient_accumulation_steps: 4 +gradient_checkpointing: true +group_by_length: false +learning_rate: 0.0002 +load_in_4bit: true +load_in_8bit: false +local_rank: null +logging_steps: 1 +lora_alpha: 16 +lora_dropout: 0.05 +lora_fan_in_fan_out: null +lora_model_dir: null +lora_r: 32 +lora_target_linear: true +lora_target_modules: null +lr_scheduler: cosine +micro_batch_size: 2 +model_type: LlamaForCausalLM +num_epochs: 4 +optimizer: paged_adamw_32bit +output_dir: ./outputs/qlora-out +pad_to_sequence_len: true +resume_from_checkpoint: null +sample_packing: true +saves_per_epoch: 1 +sequence_len: 4096 +special_tokens: null +strict: false +tf32: false +tokenizer_type: LlamaTokenizer +train_on_inputs: false +val_set_size: 0.05 +wandb_entity: null +wandb_log_model: null +wandb_name: null +wandb_project: null +wandb_watch: null +warmup_steps: 10 +weight_decay: 0.0 +xformers_attention: null + +``` + +

+ +# outputs/qlora-out + +This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the None dataset. +It achieves the following results on the evaluation set: +- Loss: 0.8586 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 8 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 10 +- num_epochs: 4 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 2.0919 | 0.0198 | 1 | 2.0800 | +| 1.5479 | 0.2574 | 13 | 1.5341 | +| 1.2083 | 0.5149 | 26 | 1.2245 | +| 1.0851 | 0.7723 | 39 | 1.0607 | +| 0.9432 | 1.0297 | 52 | 0.9755 | +| 0.9007 | 1.2178 | 65 | 0.9334 | +| 0.8765 | 1.4752 | 78 | 0.9084 | +| 0.8789 | 1.7327 | 91 | 0.8891 | +| 0.8304 | 1.9901 | 104 | 0.8779 | +| 0.8194 | 2.1782 | 117 | 0.8714 | +| 0.848 | 2.4356 | 130 | 0.8665 | +| 0.8354 | 2.6931 | 143 | 0.8627 | +| 0.8476 | 2.9505 | 156 | 0.8605 | +| 0.811 | 3.1386 | 169 | 0.8590 | +| 0.8178 | 3.3960 | 182 | 0.8588 | +| 0.8073 | 3.6535 | 195 | 0.8586 | + + +### Framework versions + +- PEFT 0.11.1 +- Transformers 4.41.1 +- Pytorch 2.1.2+cu121 +- Datasets 2.19.1 +- Tokenizers 0.19.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c380520592770a23b238e94c8e56b4b79c0847c0 --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffc2779c22b7eae997dc6203abde8f60a5e25d728ff0372f233c318ba1fdff97 +size 50573978 diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978 --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891 --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bc8c349472a911b43a61191dca50545e47c7731 --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b85a4fdc0abdb0ae863b99d8dbbc0f4de78e0d9fbd7bcb1ddcd7575e55dd73e +size 50503848 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdd1fa491d76cadd05c4b29aa7b82b6375fb6268 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8f5c81e295185d82b95402d9e8aa5ba7f3db7c0d3626b29a8ce3a7f38899ae +size 202035450 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..22b8bd3417b5dcc9c846deab82f71389c7adcb09 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b71df2f60f93f95a69126d2a7bc1e1cccfa69f1b8fa8d99a58b0ccfa00747f6f +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd2ee4e3177198ef9bb677ca214baa3bb506f0b2 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc7800513a1b4dd006c457152c700dd768bb49ee4ed8e4d9665a4e42095b054 +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a34ffc16754db68bc7066ce5eaa863821b0391b4 --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,797 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9108910891089108, + "eval_steps": 13, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.019801980198019802, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.0919, + "step": 1 + }, + { + "epoch": 0.019801980198019802, + "eval_loss": 2.079954147338867, + "eval_runtime": 13.8908, + "eval_samples_per_second": 8.999, + "eval_steps_per_second": 4.535, + "step": 1 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 1.203125, + "learning_rate": 4e-05, + "loss": 2.0814, + "step": 2 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 1.1953125, + "learning_rate": 6e-05, + "loss": 2.0499, + "step": 3 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 1.0859375, + "learning_rate": 8e-05, + "loss": 2.0153, + "step": 4 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 1.0390625, + "learning_rate": 0.0001, + "loss": 1.9548, + "step": 5 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 0.89453125, + "learning_rate": 0.00012, + "loss": 1.8982, + "step": 6 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 0.67578125, + "learning_rate": 0.00014, + "loss": 1.8226, + "step": 7 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 0.66796875, + "learning_rate": 0.00016, + "loss": 1.7572, + "step": 8 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 0.78515625, + "learning_rate": 0.00018, + "loss": 1.7074, + "step": 9 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.73828125, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 10 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 0.484375, + "learning_rate": 0.0001999863304992469, + "loss": 1.5801, + "step": 11 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 0.53125, + "learning_rate": 0.00019994532573409262, + "loss": 1.5721, + "step": 12 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 0.6953125, + "learning_rate": 0.00019987699691483048, + "loss": 1.5479, + "step": 13 + }, + { + "epoch": 0.25742574257425743, + "eval_loss": 1.5341482162475586, + "eval_runtime": 13.8795, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 13 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 0.65234375, + "learning_rate": 0.00019978136272187747, + "loss": 1.534, + "step": 14 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 0.515625, + "learning_rate": 0.000199658449300667, + "loss": 1.4804, + "step": 15 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.439453125, + "learning_rate": 0.00019950829025450114, + "loss": 1.4805, + "step": 16 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 0.361328125, + "learning_rate": 0.00019933092663536382, + "loss": 1.3809, + "step": 17 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.3125, + "learning_rate": 0.00019912640693269752, + "loss": 1.3837, + "step": 18 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.337890625, + "learning_rate": 0.00019889478706014687, + "loss": 1.3673, + "step": 19 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.298828125, + "learning_rate": 0.00019863613034027224, + "loss": 1.366, + "step": 20 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.34375, + "learning_rate": 0.00019835050748723824, + "loss": 1.3318, + "step": 21 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 0.341796875, + "learning_rate": 0.00019803799658748094, + "loss": 1.2741, + "step": 22 + }, + { + "epoch": 0.45544554455445546, + "grad_norm": 0.326171875, + "learning_rate": 0.00019769868307835994, + "loss": 1.2978, + "step": 23 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.291015625, + "learning_rate": 0.0001973326597248006, + "loss": 1.2733, + "step": 24 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.306640625, + "learning_rate": 0.00019694002659393305, + "loss": 1.2302, + "step": 25 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 0.318359375, + "learning_rate": 0.00019652089102773488, + "loss": 1.2083, + "step": 26 + }, + { + "epoch": 0.5148514851485149, + "eval_loss": 1.224540114402771, + "eval_runtime": 13.8695, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 4.542, + "step": 26 + }, + { + "epoch": 0.5346534653465347, + "grad_norm": 0.26953125, + "learning_rate": 0.00019607536761368484, + "loss": 1.1761, + "step": 27 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 0.296875, + "learning_rate": 0.00019560357815343577, + "loss": 1.1751, + "step": 28 + }, + { + "epoch": 0.5742574257425742, + "grad_norm": 0.310546875, + "learning_rate": 0.00019510565162951537, + "loss": 1.2002, + "step": 29 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.287109375, + "learning_rate": 0.00019458172417006347, + "loss": 1.1544, + "step": 30 + }, + { + "epoch": 0.6138613861386139, + "grad_norm": 0.365234375, + "learning_rate": 0.00019403193901161613, + "loss": 1.1384, + "step": 31 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.236328125, + "learning_rate": 0.0001934564464599461, + "loss": 1.0999, + "step": 32 + }, + { + "epoch": 0.6534653465346535, + "grad_norm": 0.326171875, + "learning_rate": 0.00019285540384897073, + "loss": 1.1576, + "step": 33 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 0.310546875, + "learning_rate": 0.00019222897549773848, + "loss": 1.091, + "step": 34 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.2578125, + "learning_rate": 0.00019157733266550575, + "loss": 1.056, + "step": 35 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 0.267578125, + "learning_rate": 0.00019090065350491626, + "loss": 1.1068, + "step": 36 + }, + { + "epoch": 0.7326732673267327, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019019912301329592, + "loss": 1.0583, + "step": 37 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 0.2734375, + "learning_rate": 0.00018947293298207635, + "loss": 1.0671, + "step": 38 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001887222819443612, + "loss": 1.0851, + "step": 39 + }, + { + "epoch": 0.7722772277227723, + "eval_loss": 1.060703158378601, + "eval_runtime": 13.878, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 4.54, + "step": 39 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.22265625, + "learning_rate": 0.0001879473751206489, + "loss": 1.0343, + "step": 40 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.1796875, + "learning_rate": 0.00018714842436272773, + "loss": 0.9789, + "step": 41 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.248046875, + "learning_rate": 0.00018632564809575742, + "loss": 1.0174, + "step": 42 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001854792712585539, + "loss": 1.0004, + "step": 43 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 0.228515625, + "learning_rate": 0.00018460952524209355, + "loss": 1.0281, + "step": 44 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.220703125, + "learning_rate": 0.00018371664782625287, + "loss": 0.9992, + "step": 45 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 0.2138671875, + "learning_rate": 0.00018280088311480201, + "loss": 0.9635, + "step": 46 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.265625, + "learning_rate": 0.00018186248146866927, + "loss": 1.006, + "step": 47 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018090169943749476, + "loss": 0.9891, + "step": 48 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.28515625, + "learning_rate": 0.0001799187996894925, + "loss": 0.9809, + "step": 49 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.212890625, + "learning_rate": 0.00017891405093963938, + "loss": 0.9646, + "step": 50 + }, + { + "epoch": 1.00990099009901, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017788772787621126, + "loss": 0.9553, + "step": 51 + }, + { + "epoch": 1.0297029702970297, + "grad_norm": 0.2578125, + "learning_rate": 0.00017684011108568592, + "loss": 0.9432, + "step": 52 + }, + { + "epoch": 1.0297029702970297, + "eval_loss": 0.9755253195762634, + "eval_runtime": 13.879, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 52 + }, + { + "epoch": 1.0495049504950495, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001757714869760335, + "loss": 0.9631, + "step": 53 + }, + { + "epoch": 1.0693069306930694, + "grad_norm": 0.3046875, + "learning_rate": 0.0001746821476984154, + "loss": 0.9539, + "step": 54 + }, + { + "epoch": 1.0198019801980198, + "grad_norm": 0.232421875, + "learning_rate": 0.00017357239106731317, + "loss": 0.9559, + "step": 55 + }, + { + "epoch": 1.0396039603960396, + "grad_norm": 0.283203125, + "learning_rate": 0.00017244252047910892, + "loss": 0.9111, + "step": 56 + }, + { + "epoch": 1.0594059405940595, + "grad_norm": 0.30859375, + "learning_rate": 0.00017129284482913972, + "loss": 0.9503, + "step": 57 + }, + { + "epoch": 1.0792079207920793, + "grad_norm": 0.2265625, + "learning_rate": 0.00017012367842724887, + "loss": 0.911, + "step": 58 + }, + { + "epoch": 1.099009900990099, + "grad_norm": 0.3515625, + "learning_rate": 0.0001689353409118566, + "loss": 0.9041, + "step": 59 + }, + { + "epoch": 1.118811881188119, + "grad_norm": 0.26171875, + "learning_rate": 0.00016772815716257412, + "loss": 0.9117, + "step": 60 + }, + { + "epoch": 1.1386138613861387, + "grad_norm": 0.2890625, + "learning_rate": 0.0001665024572113848, + "loss": 0.9351, + "step": 61 + }, + { + "epoch": 1.1584158415841583, + "grad_norm": 0.251953125, + "learning_rate": 0.00016525857615241687, + "loss": 0.9438, + "step": 62 + }, + { + "epoch": 1.1782178217821782, + "grad_norm": 0.2138671875, + "learning_rate": 0.00016399685405033167, + "loss": 0.9075, + "step": 63 + }, + { + "epoch": 1.198019801980198, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001627176358473537, + "loss": 0.8983, + "step": 64 + }, + { + "epoch": 1.2178217821782178, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001614212712689668, + "loss": 0.9007, + "step": 65 + }, + { + "epoch": 1.2178217821782178, + "eval_loss": 0.9333999156951904, + "eval_runtime": 13.8668, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 4.543, + "step": 65 + }, + { + "epoch": 1.2376237623762376, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016010811472830252, + "loss": 0.9108, + "step": 66 + }, + { + "epoch": 1.2574257425742574, + "grad_norm": 0.232421875, + "learning_rate": 0.00015877852522924732, + "loss": 0.9177, + "step": 67 + }, + { + "epoch": 1.2772277227722773, + "grad_norm": 0.271484375, + "learning_rate": 0.00015743286626829437, + "loss": 0.9, + "step": 68 + }, + { + "epoch": 1.297029702970297, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001560715057351673, + "loss": 0.9096, + "step": 69 + }, + { + "epoch": 1.316831683168317, + "grad_norm": 0.22265625, + "learning_rate": 0.00015469481581224272, + "loss": 0.8946, + "step": 70 + }, + { + "epoch": 1.3366336633663367, + "grad_norm": 0.31640625, + "learning_rate": 0.0001533031728727994, + "loss": 0.8995, + "step": 71 + }, + { + "epoch": 1.3564356435643563, + "grad_norm": 0.2197265625, + "learning_rate": 0.00015189695737812152, + "loss": 0.922, + "step": 72 + }, + { + "epoch": 1.3762376237623761, + "grad_norm": 0.22265625, + "learning_rate": 0.0001504765537734844, + "loss": 0.885, + "step": 73 + }, + { + "epoch": 1.396039603960396, + "grad_norm": 0.248046875, + "learning_rate": 0.00014904235038305083, + "loss": 0.895, + "step": 74 + }, + { + "epoch": 1.4158415841584158, + "grad_norm": 0.2431640625, + "learning_rate": 0.00014759473930370736, + "loss": 0.892, + "step": 75 + }, + { + "epoch": 1.4356435643564356, + "grad_norm": 0.216796875, + "learning_rate": 0.0001461341162978688, + "loss": 0.8277, + "step": 76 + }, + { + "epoch": 1.4554455445544554, + "grad_norm": 0.23828125, + "learning_rate": 0.00014466088068528068, + "loss": 0.8687, + "step": 77 + }, + { + "epoch": 1.4752475247524752, + "grad_norm": 0.228515625, + "learning_rate": 0.00014317543523384928, + "loss": 0.8765, + "step": 78 + }, + { + "epoch": 1.4752475247524752, + "eval_loss": 0.9083698391914368, + "eval_runtime": 13.8834, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 4.538, + "step": 78 + }, + { + "epoch": 1.495049504950495, + "grad_norm": 0.228515625, + "learning_rate": 0.00014167818604952906, + "loss": 0.8797, + "step": 79 + }, + { + "epoch": 1.5148514851485149, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014016954246529696, + "loss": 0.905, + "step": 80 + }, + { + "epoch": 1.5346534653465347, + "grad_norm": 0.25390625, + "learning_rate": 0.00013864991692924523, + "loss": 0.8575, + "step": 81 + }, + { + "epoch": 1.5544554455445545, + "grad_norm": 0.2451171875, + "learning_rate": 0.00013711972489182208, + "loss": 0.8957, + "step": 82 + }, + { + "epoch": 1.5742574257425743, + "grad_norm": 0.2216796875, + "learning_rate": 0.00013557938469225167, + "loss": 0.8792, + "step": 83 + }, + { + "epoch": 1.5940594059405941, + "grad_norm": 0.21484375, + "learning_rate": 0.00013402931744416433, + "loss": 0.889, + "step": 84 + }, + { + "epoch": 1.613861386138614, + "grad_norm": 0.228515625, + "learning_rate": 0.00013246994692046836, + "loss": 0.8657, + "step": 85 + }, + { + "epoch": 1.6336633663366338, + "grad_norm": 0.20703125, + "learning_rate": 0.00013090169943749476, + "loss": 0.8784, + "step": 86 + }, + { + "epoch": 1.6534653465346536, + "grad_norm": 0.265625, + "learning_rate": 0.0001293250037384465, + "loss": 0.8822, + "step": 87 + }, + { + "epoch": 1.6732673267326734, + "grad_norm": 0.2197265625, + "learning_rate": 0.00012774029087618446, + "loss": 0.9092, + "step": 88 + }, + { + "epoch": 1.693069306930693, + "grad_norm": 0.234375, + "learning_rate": 0.00012614799409538198, + "loss": 0.8813, + "step": 89 + }, + { + "epoch": 1.7128712871287128, + "grad_norm": 0.2294921875, + "learning_rate": 0.00012454854871407994, + "loss": 0.8975, + "step": 90 + }, + { + "epoch": 1.7326732673267327, + "grad_norm": 0.259765625, + "learning_rate": 0.00012294239200467516, + "loss": 0.8789, + "step": 91 + }, + { + "epoch": 1.7326732673267327, + "eval_loss": 0.8891416788101196, + "eval_runtime": 13.872, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 4.542, + "step": 91 + }, + { + "epoch": 1.7524752475247525, + "grad_norm": 0.26171875, + "learning_rate": 0.0001213299630743747, + "loss": 0.9184, + "step": 92 + }, + { + "epoch": 1.7722772277227723, + "grad_norm": 0.337890625, + "learning_rate": 0.00011971170274514802, + "loss": 0.8854, + "step": 93 + }, + { + "epoch": 1.7920792079207921, + "grad_norm": 0.2890625, + "learning_rate": 0.000118088053433211, + "loss": 0.8688, + "step": 94 + }, + { + "epoch": 1.811881188118812, + "grad_norm": 0.3515625, + "learning_rate": 0.00011645945902807341, + "loss": 0.8281, + "step": 95 + }, + { + "epoch": 1.8316831683168315, + "grad_norm": 0.26953125, + "learning_rate": 0.0001148263647711842, + "loss": 0.8488, + "step": 96 + }, + { + "epoch": 1.8514851485148514, + "grad_norm": 0.2490234375, + "learning_rate": 0.00011318921713420691, + "loss": 0.8742, + "step": 97 + }, + { + "epoch": 1.8712871287128712, + "grad_norm": 0.265625, + "learning_rate": 0.00011154846369695863, + "loss": 0.8586, + "step": 98 + }, + { + "epoch": 1.891089108910891, + "grad_norm": 0.265625, + "learning_rate": 0.0001099045530250463, + "loss": 0.8776, + "step": 99 + }, + { + "epoch": 1.9108910891089108, + "grad_norm": 0.259765625, + "learning_rate": 0.00010825793454723325, + "loss": 0.8563, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.08354098020352e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f +size 5944 diff --git a/checkpoint-150/README.md b/checkpoint-150/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978 --- /dev/null +++ b/checkpoint-150/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-150/adapter_config.json b/checkpoint-150/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891 --- /dev/null +++ b/checkpoint-150/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-150/adapter_model.safetensors b/checkpoint-150/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..799a842d6fee603753511555ac2bc5993ecebb3b --- /dev/null +++ b/checkpoint-150/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97264f01540b1ad5acd25f27b627a7352dbda77c960c2b3c7b157d05035d6ac6 +size 50503848 diff --git a/checkpoint-150/optimizer.pt b/checkpoint-150/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ffc9c31e4e29a21f02c95e78b174839a460cb94 --- /dev/null +++ b/checkpoint-150/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24e5de270e966edc3891231b22ee3b34b5d5573183750ce1a8ecca10a2b62423 +size 202035450 diff --git a/checkpoint-150/rng_state.pth b/checkpoint-150/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d31ebd7d51189a81569a9786ce90149798f188f --- /dev/null +++ b/checkpoint-150/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3129c63169712c776c1e0e28d8711e276143acd2c2f061fb6eb052c04856ba72 +size 14244 diff --git a/checkpoint-150/scheduler.pt b/checkpoint-150/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d540db88f12afedb3a1b7ff4c08ac14c3431f65 --- /dev/null +++ b/checkpoint-150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd5d42bb0afda20ec4c83d38c6af1131541c335ecab229c74e7f418894f3c13b +size 1064 diff --git a/checkpoint-150/special_tokens_map.json b/checkpoint-150/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-150/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-150/tokenizer.model b/checkpoint-150/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-150/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-150/tokenizer_config.json b/checkpoint-150/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-150/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-150/trainer_state.json b/checkpoint-150/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1b6a9be3ad59ba2c1d0d8712719a791499ffb1ec --- /dev/null +++ b/checkpoint-150/trainer_state.json @@ -0,0 +1,1179 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8316831683168315, + "eval_steps": 13, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.019801980198019802, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.0919, + "step": 1 + }, + { + "epoch": 0.019801980198019802, + "eval_loss": 2.079954147338867, + "eval_runtime": 13.8908, + "eval_samples_per_second": 8.999, + "eval_steps_per_second": 4.535, + "step": 1 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 1.203125, + "learning_rate": 4e-05, + "loss": 2.0814, + "step": 2 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 1.1953125, + "learning_rate": 6e-05, + "loss": 2.0499, + "step": 3 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 1.0859375, + "learning_rate": 8e-05, + "loss": 2.0153, + "step": 4 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 1.0390625, + "learning_rate": 0.0001, + "loss": 1.9548, + "step": 5 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 0.89453125, + "learning_rate": 0.00012, + "loss": 1.8982, + "step": 6 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 0.67578125, + "learning_rate": 0.00014, + "loss": 1.8226, + "step": 7 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 0.66796875, + "learning_rate": 0.00016, + "loss": 1.7572, + "step": 8 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 0.78515625, + "learning_rate": 0.00018, + "loss": 1.7074, + "step": 9 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.73828125, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 10 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 0.484375, + "learning_rate": 0.0001999863304992469, + "loss": 1.5801, + "step": 11 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 0.53125, + "learning_rate": 0.00019994532573409262, + "loss": 1.5721, + "step": 12 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 0.6953125, + "learning_rate": 0.00019987699691483048, + "loss": 1.5479, + "step": 13 + }, + { + "epoch": 0.25742574257425743, + "eval_loss": 1.5341482162475586, + "eval_runtime": 13.8795, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 13 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 0.65234375, + "learning_rate": 0.00019978136272187747, + "loss": 1.534, + "step": 14 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 0.515625, + "learning_rate": 0.000199658449300667, + "loss": 1.4804, + "step": 15 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.439453125, + "learning_rate": 0.00019950829025450114, + "loss": 1.4805, + "step": 16 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 0.361328125, + "learning_rate": 0.00019933092663536382, + "loss": 1.3809, + "step": 17 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.3125, + "learning_rate": 0.00019912640693269752, + "loss": 1.3837, + "step": 18 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.337890625, + "learning_rate": 0.00019889478706014687, + "loss": 1.3673, + "step": 19 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.298828125, + "learning_rate": 0.00019863613034027224, + "loss": 1.366, + "step": 20 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.34375, + "learning_rate": 0.00019835050748723824, + "loss": 1.3318, + "step": 21 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 0.341796875, + "learning_rate": 0.00019803799658748094, + "loss": 1.2741, + "step": 22 + }, + { + "epoch": 0.45544554455445546, + "grad_norm": 0.326171875, + "learning_rate": 0.00019769868307835994, + "loss": 1.2978, + "step": 23 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.291015625, + "learning_rate": 0.0001973326597248006, + "loss": 1.2733, + "step": 24 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.306640625, + "learning_rate": 0.00019694002659393305, + "loss": 1.2302, + "step": 25 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 0.318359375, + "learning_rate": 0.00019652089102773488, + "loss": 1.2083, + "step": 26 + }, + { + "epoch": 0.5148514851485149, + "eval_loss": 1.224540114402771, + "eval_runtime": 13.8695, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 4.542, + "step": 26 + }, + { + "epoch": 0.5346534653465347, + "grad_norm": 0.26953125, + "learning_rate": 0.00019607536761368484, + "loss": 1.1761, + "step": 27 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 0.296875, + "learning_rate": 0.00019560357815343577, + "loss": 1.1751, + "step": 28 + }, + { + "epoch": 0.5742574257425742, + "grad_norm": 0.310546875, + "learning_rate": 0.00019510565162951537, + "loss": 1.2002, + "step": 29 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.287109375, + "learning_rate": 0.00019458172417006347, + "loss": 1.1544, + "step": 30 + }, + { + "epoch": 0.6138613861386139, + "grad_norm": 0.365234375, + "learning_rate": 0.00019403193901161613, + "loss": 1.1384, + "step": 31 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.236328125, + "learning_rate": 0.0001934564464599461, + "loss": 1.0999, + "step": 32 + }, + { + "epoch": 0.6534653465346535, + "grad_norm": 0.326171875, + "learning_rate": 0.00019285540384897073, + "loss": 1.1576, + "step": 33 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 0.310546875, + "learning_rate": 0.00019222897549773848, + "loss": 1.091, + "step": 34 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.2578125, + "learning_rate": 0.00019157733266550575, + "loss": 1.056, + "step": 35 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 0.267578125, + "learning_rate": 0.00019090065350491626, + "loss": 1.1068, + "step": 36 + }, + { + "epoch": 0.7326732673267327, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019019912301329592, + "loss": 1.0583, + "step": 37 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 0.2734375, + "learning_rate": 0.00018947293298207635, + "loss": 1.0671, + "step": 38 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001887222819443612, + "loss": 1.0851, + "step": 39 + }, + { + "epoch": 0.7722772277227723, + "eval_loss": 1.060703158378601, + "eval_runtime": 13.878, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 4.54, + "step": 39 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.22265625, + "learning_rate": 0.0001879473751206489, + "loss": 1.0343, + "step": 40 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.1796875, + "learning_rate": 0.00018714842436272773, + "loss": 0.9789, + "step": 41 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.248046875, + "learning_rate": 0.00018632564809575742, + "loss": 1.0174, + "step": 42 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001854792712585539, + "loss": 1.0004, + "step": 43 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 0.228515625, + "learning_rate": 0.00018460952524209355, + "loss": 1.0281, + "step": 44 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.220703125, + "learning_rate": 0.00018371664782625287, + "loss": 0.9992, + "step": 45 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 0.2138671875, + "learning_rate": 0.00018280088311480201, + "loss": 0.9635, + "step": 46 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.265625, + "learning_rate": 0.00018186248146866927, + "loss": 1.006, + "step": 47 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018090169943749476, + "loss": 0.9891, + "step": 48 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.28515625, + "learning_rate": 0.0001799187996894925, + "loss": 0.9809, + "step": 49 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.212890625, + "learning_rate": 0.00017891405093963938, + "loss": 0.9646, + "step": 50 + }, + { + "epoch": 1.00990099009901, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017788772787621126, + "loss": 0.9553, + "step": 51 + }, + { + "epoch": 1.0297029702970297, + "grad_norm": 0.2578125, + "learning_rate": 0.00017684011108568592, + "loss": 0.9432, + "step": 52 + }, + { + "epoch": 1.0297029702970297, + "eval_loss": 0.9755253195762634, + "eval_runtime": 13.879, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 52 + }, + { + "epoch": 1.0495049504950495, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001757714869760335, + "loss": 0.9631, + "step": 53 + }, + { + "epoch": 1.0693069306930694, + "grad_norm": 0.3046875, + "learning_rate": 0.0001746821476984154, + "loss": 0.9539, + "step": 54 + }, + { + "epoch": 1.0198019801980198, + "grad_norm": 0.232421875, + "learning_rate": 0.00017357239106731317, + "loss": 0.9559, + "step": 55 + }, + { + "epoch": 1.0396039603960396, + "grad_norm": 0.283203125, + "learning_rate": 0.00017244252047910892, + "loss": 0.9111, + "step": 56 + }, + { + "epoch": 1.0594059405940595, + "grad_norm": 0.30859375, + "learning_rate": 0.00017129284482913972, + "loss": 0.9503, + "step": 57 + }, + { + "epoch": 1.0792079207920793, + "grad_norm": 0.2265625, + "learning_rate": 0.00017012367842724887, + "loss": 0.911, + "step": 58 + }, + { + "epoch": 1.099009900990099, + "grad_norm": 0.3515625, + "learning_rate": 0.0001689353409118566, + "loss": 0.9041, + "step": 59 + }, + { + "epoch": 1.118811881188119, + "grad_norm": 0.26171875, + "learning_rate": 0.00016772815716257412, + "loss": 0.9117, + "step": 60 + }, + { + "epoch": 1.1386138613861387, + "grad_norm": 0.2890625, + "learning_rate": 0.0001665024572113848, + "loss": 0.9351, + "step": 61 + }, + { + "epoch": 1.1584158415841583, + "grad_norm": 0.251953125, + "learning_rate": 0.00016525857615241687, + "loss": 0.9438, + "step": 62 + }, + { + "epoch": 1.1782178217821782, + "grad_norm": 0.2138671875, + "learning_rate": 0.00016399685405033167, + "loss": 0.9075, + "step": 63 + }, + { + "epoch": 1.198019801980198, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001627176358473537, + "loss": 0.8983, + "step": 64 + }, + { + "epoch": 1.2178217821782178, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001614212712689668, + "loss": 0.9007, + "step": 65 + }, + { + "epoch": 1.2178217821782178, + "eval_loss": 0.9333999156951904, + "eval_runtime": 13.8668, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 4.543, + "step": 65 + }, + { + "epoch": 1.2376237623762376, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016010811472830252, + "loss": 0.9108, + "step": 66 + }, + { + "epoch": 1.2574257425742574, + "grad_norm": 0.232421875, + "learning_rate": 0.00015877852522924732, + "loss": 0.9177, + "step": 67 + }, + { + "epoch": 1.2772277227722773, + "grad_norm": 0.271484375, + "learning_rate": 0.00015743286626829437, + "loss": 0.9, + "step": 68 + }, + { + "epoch": 1.297029702970297, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001560715057351673, + "loss": 0.9096, + "step": 69 + }, + { + "epoch": 1.316831683168317, + "grad_norm": 0.22265625, + "learning_rate": 0.00015469481581224272, + "loss": 0.8946, + "step": 70 + }, + { + "epoch": 1.3366336633663367, + "grad_norm": 0.31640625, + "learning_rate": 0.0001533031728727994, + "loss": 0.8995, + "step": 71 + }, + { + "epoch": 1.3564356435643563, + "grad_norm": 0.2197265625, + "learning_rate": 0.00015189695737812152, + "loss": 0.922, + "step": 72 + }, + { + "epoch": 1.3762376237623761, + "grad_norm": 0.22265625, + "learning_rate": 0.0001504765537734844, + "loss": 0.885, + "step": 73 + }, + { + "epoch": 1.396039603960396, + "grad_norm": 0.248046875, + "learning_rate": 0.00014904235038305083, + "loss": 0.895, + "step": 74 + }, + { + "epoch": 1.4158415841584158, + "grad_norm": 0.2431640625, + "learning_rate": 0.00014759473930370736, + "loss": 0.892, + "step": 75 + }, + { + "epoch": 1.4356435643564356, + "grad_norm": 0.216796875, + "learning_rate": 0.0001461341162978688, + "loss": 0.8277, + "step": 76 + }, + { + "epoch": 1.4554455445544554, + "grad_norm": 0.23828125, + "learning_rate": 0.00014466088068528068, + "loss": 0.8687, + "step": 77 + }, + { + "epoch": 1.4752475247524752, + "grad_norm": 0.228515625, + "learning_rate": 0.00014317543523384928, + "loss": 0.8765, + "step": 78 + }, + { + "epoch": 1.4752475247524752, + "eval_loss": 0.9083698391914368, + "eval_runtime": 13.8834, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 4.538, + "step": 78 + }, + { + "epoch": 1.495049504950495, + "grad_norm": 0.228515625, + "learning_rate": 0.00014167818604952906, + "loss": 0.8797, + "step": 79 + }, + { + "epoch": 1.5148514851485149, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014016954246529696, + "loss": 0.905, + "step": 80 + }, + { + "epoch": 1.5346534653465347, + "grad_norm": 0.25390625, + "learning_rate": 0.00013864991692924523, + "loss": 0.8575, + "step": 81 + }, + { + "epoch": 1.5544554455445545, + "grad_norm": 0.2451171875, + "learning_rate": 0.00013711972489182208, + "loss": 0.8957, + "step": 82 + }, + { + "epoch": 1.5742574257425743, + "grad_norm": 0.2216796875, + "learning_rate": 0.00013557938469225167, + "loss": 0.8792, + "step": 83 + }, + { + "epoch": 1.5940594059405941, + "grad_norm": 0.21484375, + "learning_rate": 0.00013402931744416433, + "loss": 0.889, + "step": 84 + }, + { + "epoch": 1.613861386138614, + "grad_norm": 0.228515625, + "learning_rate": 0.00013246994692046836, + "loss": 0.8657, + "step": 85 + }, + { + "epoch": 1.6336633663366338, + "grad_norm": 0.20703125, + "learning_rate": 0.00013090169943749476, + "loss": 0.8784, + "step": 86 + }, + { + "epoch": 1.6534653465346536, + "grad_norm": 0.265625, + "learning_rate": 0.0001293250037384465, + "loss": 0.8822, + "step": 87 + }, + { + "epoch": 1.6732673267326734, + "grad_norm": 0.2197265625, + "learning_rate": 0.00012774029087618446, + "loss": 0.9092, + "step": 88 + }, + { + "epoch": 1.693069306930693, + "grad_norm": 0.234375, + "learning_rate": 0.00012614799409538198, + "loss": 0.8813, + "step": 89 + }, + { + "epoch": 1.7128712871287128, + "grad_norm": 0.2294921875, + "learning_rate": 0.00012454854871407994, + "loss": 0.8975, + "step": 90 + }, + { + "epoch": 1.7326732673267327, + "grad_norm": 0.259765625, + "learning_rate": 0.00012294239200467516, + "loss": 0.8789, + "step": 91 + }, + { + "epoch": 1.7326732673267327, + "eval_loss": 0.8891416788101196, + "eval_runtime": 13.872, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 4.542, + "step": 91 + }, + { + "epoch": 1.7524752475247525, + "grad_norm": 0.26171875, + "learning_rate": 0.0001213299630743747, + "loss": 0.9184, + "step": 92 + }, + { + "epoch": 1.7722772277227723, + "grad_norm": 0.337890625, + "learning_rate": 0.00011971170274514802, + "loss": 0.8854, + "step": 93 + }, + { + "epoch": 1.7920792079207921, + "grad_norm": 0.2890625, + "learning_rate": 0.000118088053433211, + "loss": 0.8688, + "step": 94 + }, + { + "epoch": 1.811881188118812, + "grad_norm": 0.3515625, + "learning_rate": 0.00011645945902807341, + "loss": 0.8281, + "step": 95 + }, + { + "epoch": 1.8316831683168315, + "grad_norm": 0.26953125, + "learning_rate": 0.0001148263647711842, + "loss": 0.8488, + "step": 96 + }, + { + "epoch": 1.8514851485148514, + "grad_norm": 0.2490234375, + "learning_rate": 0.00011318921713420691, + "loss": 0.8742, + "step": 97 + }, + { + "epoch": 1.8712871287128712, + "grad_norm": 0.265625, + "learning_rate": 0.00011154846369695863, + "loss": 0.8586, + "step": 98 + }, + { + "epoch": 1.891089108910891, + "grad_norm": 0.265625, + "learning_rate": 0.0001099045530250463, + "loss": 0.8776, + "step": 99 + }, + { + "epoch": 1.9108910891089108, + "grad_norm": 0.259765625, + "learning_rate": 0.00010825793454723325, + "loss": 0.8563, + "step": 100 + }, + { + "epoch": 1.9306930693069306, + "grad_norm": 0.283203125, + "learning_rate": 0.00010660905843256994, + "loss": 0.8381, + "step": 101 + }, + { + "epoch": 1.9504950495049505, + "grad_norm": 0.201171875, + "learning_rate": 0.00010495837546732224, + "loss": 0.847, + "step": 102 + }, + { + "epoch": 1.9702970297029703, + "grad_norm": 0.23828125, + "learning_rate": 0.00010330633693173082, + "loss": 0.8512, + "step": 103 + }, + { + "epoch": 1.99009900990099, + "grad_norm": 0.283203125, + "learning_rate": 0.00010165339447663587, + "loss": 0.8304, + "step": 104 + }, + { + "epoch": 1.99009900990099, + "eval_loss": 0.8779018521308899, + "eval_runtime": 13.8827, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 4.538, + "step": 104 + }, + { + "epoch": 2.00990099009901, + "grad_norm": 0.283203125, + "learning_rate": 0.0001, + "loss": 0.8523, + "step": 105 + }, + { + "epoch": 2.0297029702970297, + "grad_norm": 0.2392578125, + "learning_rate": 9.834660552336415e-05, + "loss": 0.8109, + "step": 106 + }, + { + "epoch": 2.0495049504950495, + "grad_norm": 0.224609375, + "learning_rate": 9.669366306826919e-05, + "loss": 0.8394, + "step": 107 + }, + { + "epoch": 2.0693069306930694, + "grad_norm": 0.283203125, + "learning_rate": 9.504162453267777e-05, + "loss": 0.8524, + "step": 108 + }, + { + "epoch": 2.01980198019802, + "grad_norm": 0.22265625, + "learning_rate": 9.339094156743007e-05, + "loss": 0.8391, + "step": 109 + }, + { + "epoch": 2.0396039603960396, + "grad_norm": 0.2001953125, + "learning_rate": 9.174206545276677e-05, + "loss": 0.8317, + "step": 110 + }, + { + "epoch": 2.0594059405940595, + "grad_norm": 0.22265625, + "learning_rate": 9.009544697495374e-05, + "loss": 0.833, + "step": 111 + }, + { + "epoch": 2.0792079207920793, + "grad_norm": 0.2041015625, + "learning_rate": 8.845153630304139e-05, + "loss": 0.8408, + "step": 112 + }, + { + "epoch": 2.099009900990099, + "grad_norm": 0.2080078125, + "learning_rate": 8.681078286579311e-05, + "loss": 0.8459, + "step": 113 + }, + { + "epoch": 2.118811881188119, + "grad_norm": 0.2021484375, + "learning_rate": 8.517363522881579e-05, + "loss": 0.8177, + "step": 114 + }, + { + "epoch": 2.1386138613861387, + "grad_norm": 0.2265625, + "learning_rate": 8.35405409719266e-05, + "loss": 0.8451, + "step": 115 + }, + { + "epoch": 2.1584158415841586, + "grad_norm": 0.2294921875, + "learning_rate": 8.191194656678904e-05, + "loss": 0.8543, + "step": 116 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.22265625, + "learning_rate": 8.028829725485199e-05, + "loss": 0.8194, + "step": 117 + }, + { + "epoch": 2.1782178217821784, + "eval_loss": 0.8713971972465515, + "eval_runtime": 13.8976, + "eval_samples_per_second": 8.994, + "eval_steps_per_second": 4.533, + "step": 117 + }, + { + "epoch": 2.198019801980198, + "grad_norm": 0.2333984375, + "learning_rate": 7.867003692562534e-05, + "loss": 0.808, + "step": 118 + }, + { + "epoch": 2.217821782178218, + "grad_norm": 0.2470703125, + "learning_rate": 7.705760799532485e-05, + "loss": 0.8073, + "step": 119 + }, + { + "epoch": 2.237623762376238, + "grad_norm": 0.201171875, + "learning_rate": 7.54514512859201e-05, + "loss": 0.8392, + "step": 120 + }, + { + "epoch": 2.2574257425742577, + "grad_norm": 0.25, + "learning_rate": 7.385200590461803e-05, + "loss": 0.8574, + "step": 121 + }, + { + "epoch": 2.2772277227722775, + "grad_norm": 0.271484375, + "learning_rate": 7.225970912381556e-05, + "loss": 0.8338, + "step": 122 + }, + { + "epoch": 2.297029702970297, + "grad_norm": 0.294921875, + "learning_rate": 7.067499626155354e-05, + "loss": 0.8788, + "step": 123 + }, + { + "epoch": 2.3168316831683167, + "grad_norm": 0.2265625, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8297, + "step": 124 + }, + { + "epoch": 2.3366336633663365, + "grad_norm": 0.267578125, + "learning_rate": 6.753005307953167e-05, + "loss": 0.8125, + "step": 125 + }, + { + "epoch": 2.3564356435643563, + "grad_norm": 0.2431640625, + "learning_rate": 6.59706825558357e-05, + "loss": 0.814, + "step": 126 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.27734375, + "learning_rate": 6.442061530774834e-05, + "loss": 0.8335, + "step": 127 + }, + { + "epoch": 2.396039603960396, + "grad_norm": 0.2216796875, + "learning_rate": 6.28802751081779e-05, + "loss": 0.8512, + "step": 128 + }, + { + "epoch": 2.4158415841584158, + "grad_norm": 0.224609375, + "learning_rate": 6.135008307075481e-05, + "loss": 0.8297, + "step": 129 + }, + { + "epoch": 2.4356435643564356, + "grad_norm": 0.2412109375, + "learning_rate": 5.983045753470308e-05, + "loss": 0.848, + "step": 130 + }, + { + "epoch": 2.4356435643564356, + "eval_loss": 0.8665071129798889, + "eval_runtime": 13.8735, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 4.541, + "step": 130 + }, + { + "epoch": 2.4554455445544554, + "grad_norm": 0.2265625, + "learning_rate": 5.832181395047098e-05, + "loss": 0.8203, + "step": 131 + }, + { + "epoch": 2.4752475247524752, + "grad_norm": 0.287109375, + "learning_rate": 5.6824564766150726e-05, + "loss": 0.8519, + "step": 132 + }, + { + "epoch": 2.495049504950495, + "grad_norm": 0.21484375, + "learning_rate": 5.533911931471936e-05, + "loss": 0.83, + "step": 133 + }, + { + "epoch": 2.514851485148515, + "grad_norm": 0.2109375, + "learning_rate": 5.386588370213124e-05, + "loss": 0.842, + "step": 134 + }, + { + "epoch": 2.5346534653465347, + "grad_norm": 0.2412109375, + "learning_rate": 5.240526069629265e-05, + "loss": 0.8419, + "step": 135 + }, + { + "epoch": 2.5544554455445545, + "grad_norm": 0.267578125, + "learning_rate": 5.095764961694922e-05, + "loss": 0.8458, + "step": 136 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.203125, + "learning_rate": 4.952344622651566e-05, + "loss": 0.8133, + "step": 137 + }, + { + "epoch": 2.594059405940594, + "grad_norm": 0.2060546875, + "learning_rate": 4.810304262187852e-05, + "loss": 0.8103, + "step": 138 + }, + { + "epoch": 2.613861386138614, + "grad_norm": 0.20703125, + "learning_rate": 4.669682712720065e-05, + "loss": 0.8105, + "step": 139 + }, + { + "epoch": 2.633663366336634, + "grad_norm": 0.2060546875, + "learning_rate": 4.530518418775733e-05, + "loss": 0.8305, + "step": 140 + }, + { + "epoch": 2.6534653465346536, + "grad_norm": 0.2080078125, + "learning_rate": 4.392849426483274e-05, + "loss": 0.7881, + "step": 141 + }, + { + "epoch": 2.6732673267326734, + "grad_norm": 0.2216796875, + "learning_rate": 4.256713373170564e-05, + "loss": 0.8204, + "step": 142 + }, + { + "epoch": 2.693069306930693, + "grad_norm": 0.263671875, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8354, + "step": 143 + }, + { + "epoch": 2.693069306930693, + "eval_loss": 0.8626759648323059, + "eval_runtime": 13.8585, + "eval_samples_per_second": 9.02, + "eval_steps_per_second": 4.546, + "step": 143 + }, + { + "epoch": 2.7128712871287126, + "grad_norm": 0.2138671875, + "learning_rate": 3.9891885271697496e-05, + "loss": 0.8441, + "step": 144 + }, + { + "epoch": 2.7326732673267324, + "grad_norm": 0.2197265625, + "learning_rate": 3.857872873103322e-05, + "loss": 0.8084, + "step": 145 + }, + { + "epoch": 2.7524752475247523, + "grad_norm": 0.18359375, + "learning_rate": 3.7282364152646297e-05, + "loss": 0.8184, + "step": 146 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.1904296875, + "learning_rate": 3.600314594966834e-05, + "loss": 0.8302, + "step": 147 + }, + { + "epoch": 2.792079207920792, + "grad_norm": 0.2041015625, + "learning_rate": 3.4741423847583134e-05, + "loss": 0.8503, + "step": 148 + }, + { + "epoch": 2.8118811881188117, + "grad_norm": 0.2265625, + "learning_rate": 3.349754278861517e-05, + "loss": 0.8273, + "step": 149 + }, + { + "epoch": 2.8316831683168315, + "grad_norm": 0.1943359375, + "learning_rate": 3.227184283742591e-05, + "loss": 0.8332, + "step": 150 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.1227070440800256e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-150/training_args.bin b/checkpoint-150/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30 --- /dev/null +++ b/checkpoint-150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f +size 5944 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978 --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891 --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4181a1133aef7fd8af624e49e7f253a4b6ae062e --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800103a9f27876d14f8e9f0fb64fb81af3a478d54bbaea5587ecbd0592ad4142 +size 50503848 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c554fca617de0749a94a47996d59055a173c0d58 --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dafd7ff9d5c3c564b22c4a0593f1078a408837f37261ad73caf0c7e062c6a39 +size 202035450 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..436017979432f2d4cfade120bb74af668a9f1bfc --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ca197c3706eaaadf2931079a5ebf26b215b3f60f60a6755cc111301c7ac7f6 +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..53f4138ea76495c65f762a13321851ab341abfff --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9a25c72339c898b564e0c464a3f6fc75bbeec408008928b7ed05533156b98c +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2731e3b879a62aa4da83438163cd54e6b6e7b12e --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,1561 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.7524752475247523, + "eval_steps": 13, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.019801980198019802, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.0919, + "step": 1 + }, + { + "epoch": 0.019801980198019802, + "eval_loss": 2.079954147338867, + "eval_runtime": 13.8908, + "eval_samples_per_second": 8.999, + "eval_steps_per_second": 4.535, + "step": 1 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 1.203125, + "learning_rate": 4e-05, + "loss": 2.0814, + "step": 2 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 1.1953125, + "learning_rate": 6e-05, + "loss": 2.0499, + "step": 3 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 1.0859375, + "learning_rate": 8e-05, + "loss": 2.0153, + "step": 4 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 1.0390625, + "learning_rate": 0.0001, + "loss": 1.9548, + "step": 5 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 0.89453125, + "learning_rate": 0.00012, + "loss": 1.8982, + "step": 6 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 0.67578125, + "learning_rate": 0.00014, + "loss": 1.8226, + "step": 7 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 0.66796875, + "learning_rate": 0.00016, + "loss": 1.7572, + "step": 8 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 0.78515625, + "learning_rate": 0.00018, + "loss": 1.7074, + "step": 9 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.73828125, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 10 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 0.484375, + "learning_rate": 0.0001999863304992469, + "loss": 1.5801, + "step": 11 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 0.53125, + "learning_rate": 0.00019994532573409262, + "loss": 1.5721, + "step": 12 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 0.6953125, + "learning_rate": 0.00019987699691483048, + "loss": 1.5479, + "step": 13 + }, + { + "epoch": 0.25742574257425743, + "eval_loss": 1.5341482162475586, + "eval_runtime": 13.8795, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 13 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 0.65234375, + "learning_rate": 0.00019978136272187747, + "loss": 1.534, + "step": 14 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 0.515625, + "learning_rate": 0.000199658449300667, + "loss": 1.4804, + "step": 15 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.439453125, + "learning_rate": 0.00019950829025450114, + "loss": 1.4805, + "step": 16 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 0.361328125, + "learning_rate": 0.00019933092663536382, + "loss": 1.3809, + "step": 17 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.3125, + "learning_rate": 0.00019912640693269752, + "loss": 1.3837, + "step": 18 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.337890625, + "learning_rate": 0.00019889478706014687, + "loss": 1.3673, + "step": 19 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.298828125, + "learning_rate": 0.00019863613034027224, + "loss": 1.366, + "step": 20 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.34375, + "learning_rate": 0.00019835050748723824, + "loss": 1.3318, + "step": 21 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 0.341796875, + "learning_rate": 0.00019803799658748094, + "loss": 1.2741, + "step": 22 + }, + { + "epoch": 0.45544554455445546, + "grad_norm": 0.326171875, + "learning_rate": 0.00019769868307835994, + "loss": 1.2978, + "step": 23 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.291015625, + "learning_rate": 0.0001973326597248006, + "loss": 1.2733, + "step": 24 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.306640625, + "learning_rate": 0.00019694002659393305, + "loss": 1.2302, + "step": 25 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 0.318359375, + "learning_rate": 0.00019652089102773488, + "loss": 1.2083, + "step": 26 + }, + { + "epoch": 0.5148514851485149, + "eval_loss": 1.224540114402771, + "eval_runtime": 13.8695, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 4.542, + "step": 26 + }, + { + "epoch": 0.5346534653465347, + "grad_norm": 0.26953125, + "learning_rate": 0.00019607536761368484, + "loss": 1.1761, + "step": 27 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 0.296875, + "learning_rate": 0.00019560357815343577, + "loss": 1.1751, + "step": 28 + }, + { + "epoch": 0.5742574257425742, + "grad_norm": 0.310546875, + "learning_rate": 0.00019510565162951537, + "loss": 1.2002, + "step": 29 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.287109375, + "learning_rate": 0.00019458172417006347, + "loss": 1.1544, + "step": 30 + }, + { + "epoch": 0.6138613861386139, + "grad_norm": 0.365234375, + "learning_rate": 0.00019403193901161613, + "loss": 1.1384, + "step": 31 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.236328125, + "learning_rate": 0.0001934564464599461, + "loss": 1.0999, + "step": 32 + }, + { + "epoch": 0.6534653465346535, + "grad_norm": 0.326171875, + "learning_rate": 0.00019285540384897073, + "loss": 1.1576, + "step": 33 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 0.310546875, + "learning_rate": 0.00019222897549773848, + "loss": 1.091, + "step": 34 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.2578125, + "learning_rate": 0.00019157733266550575, + "loss": 1.056, + "step": 35 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 0.267578125, + "learning_rate": 0.00019090065350491626, + "loss": 1.1068, + "step": 36 + }, + { + "epoch": 0.7326732673267327, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019019912301329592, + "loss": 1.0583, + "step": 37 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 0.2734375, + "learning_rate": 0.00018947293298207635, + "loss": 1.0671, + "step": 38 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001887222819443612, + "loss": 1.0851, + "step": 39 + }, + { + "epoch": 0.7722772277227723, + "eval_loss": 1.060703158378601, + "eval_runtime": 13.878, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 4.54, + "step": 39 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.22265625, + "learning_rate": 0.0001879473751206489, + "loss": 1.0343, + "step": 40 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.1796875, + "learning_rate": 0.00018714842436272773, + "loss": 0.9789, + "step": 41 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.248046875, + "learning_rate": 0.00018632564809575742, + "loss": 1.0174, + "step": 42 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001854792712585539, + "loss": 1.0004, + "step": 43 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 0.228515625, + "learning_rate": 0.00018460952524209355, + "loss": 1.0281, + "step": 44 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.220703125, + "learning_rate": 0.00018371664782625287, + "loss": 0.9992, + "step": 45 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 0.2138671875, + "learning_rate": 0.00018280088311480201, + "loss": 0.9635, + "step": 46 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.265625, + "learning_rate": 0.00018186248146866927, + "loss": 1.006, + "step": 47 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018090169943749476, + "loss": 0.9891, + "step": 48 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.28515625, + "learning_rate": 0.0001799187996894925, + "loss": 0.9809, + "step": 49 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.212890625, + "learning_rate": 0.00017891405093963938, + "loss": 0.9646, + "step": 50 + }, + { + "epoch": 1.00990099009901, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017788772787621126, + "loss": 0.9553, + "step": 51 + }, + { + "epoch": 1.0297029702970297, + "grad_norm": 0.2578125, + "learning_rate": 0.00017684011108568592, + "loss": 0.9432, + "step": 52 + }, + { + "epoch": 1.0297029702970297, + "eval_loss": 0.9755253195762634, + "eval_runtime": 13.879, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 52 + }, + { + "epoch": 1.0495049504950495, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001757714869760335, + "loss": 0.9631, + "step": 53 + }, + { + "epoch": 1.0693069306930694, + "grad_norm": 0.3046875, + "learning_rate": 0.0001746821476984154, + "loss": 0.9539, + "step": 54 + }, + { + "epoch": 1.0198019801980198, + "grad_norm": 0.232421875, + "learning_rate": 0.00017357239106731317, + "loss": 0.9559, + "step": 55 + }, + { + "epoch": 1.0396039603960396, + "grad_norm": 0.283203125, + "learning_rate": 0.00017244252047910892, + "loss": 0.9111, + "step": 56 + }, + { + "epoch": 1.0594059405940595, + "grad_norm": 0.30859375, + "learning_rate": 0.00017129284482913972, + "loss": 0.9503, + "step": 57 + }, + { + "epoch": 1.0792079207920793, + "grad_norm": 0.2265625, + "learning_rate": 0.00017012367842724887, + "loss": 0.911, + "step": 58 + }, + { + "epoch": 1.099009900990099, + "grad_norm": 0.3515625, + "learning_rate": 0.0001689353409118566, + "loss": 0.9041, + "step": 59 + }, + { + "epoch": 1.118811881188119, + "grad_norm": 0.26171875, + "learning_rate": 0.00016772815716257412, + "loss": 0.9117, + "step": 60 + }, + { + "epoch": 1.1386138613861387, + "grad_norm": 0.2890625, + "learning_rate": 0.0001665024572113848, + "loss": 0.9351, + "step": 61 + }, + { + "epoch": 1.1584158415841583, + "grad_norm": 0.251953125, + "learning_rate": 0.00016525857615241687, + "loss": 0.9438, + "step": 62 + }, + { + "epoch": 1.1782178217821782, + "grad_norm": 0.2138671875, + "learning_rate": 0.00016399685405033167, + "loss": 0.9075, + "step": 63 + }, + { + "epoch": 1.198019801980198, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001627176358473537, + "loss": 0.8983, + "step": 64 + }, + { + "epoch": 1.2178217821782178, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001614212712689668, + "loss": 0.9007, + "step": 65 + }, + { + "epoch": 1.2178217821782178, + "eval_loss": 0.9333999156951904, + "eval_runtime": 13.8668, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 4.543, + "step": 65 + }, + { + "epoch": 1.2376237623762376, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016010811472830252, + "loss": 0.9108, + "step": 66 + }, + { + "epoch": 1.2574257425742574, + "grad_norm": 0.232421875, + "learning_rate": 0.00015877852522924732, + "loss": 0.9177, + "step": 67 + }, + { + "epoch": 1.2772277227722773, + "grad_norm": 0.271484375, + "learning_rate": 0.00015743286626829437, + "loss": 0.9, + "step": 68 + }, + { + "epoch": 1.297029702970297, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001560715057351673, + "loss": 0.9096, + "step": 69 + }, + { + "epoch": 1.316831683168317, + "grad_norm": 0.22265625, + "learning_rate": 0.00015469481581224272, + "loss": 0.8946, + "step": 70 + }, + { + "epoch": 1.3366336633663367, + "grad_norm": 0.31640625, + "learning_rate": 0.0001533031728727994, + "loss": 0.8995, + "step": 71 + }, + { + "epoch": 1.3564356435643563, + "grad_norm": 0.2197265625, + "learning_rate": 0.00015189695737812152, + "loss": 0.922, + "step": 72 + }, + { + "epoch": 1.3762376237623761, + "grad_norm": 0.22265625, + "learning_rate": 0.0001504765537734844, + "loss": 0.885, + "step": 73 + }, + { + "epoch": 1.396039603960396, + "grad_norm": 0.248046875, + "learning_rate": 0.00014904235038305083, + "loss": 0.895, + "step": 74 + }, + { + "epoch": 1.4158415841584158, + "grad_norm": 0.2431640625, + "learning_rate": 0.00014759473930370736, + "loss": 0.892, + "step": 75 + }, + { + "epoch": 1.4356435643564356, + "grad_norm": 0.216796875, + "learning_rate": 0.0001461341162978688, + "loss": 0.8277, + "step": 76 + }, + { + "epoch": 1.4554455445544554, + "grad_norm": 0.23828125, + "learning_rate": 0.00014466088068528068, + "loss": 0.8687, + "step": 77 + }, + { + "epoch": 1.4752475247524752, + "grad_norm": 0.228515625, + "learning_rate": 0.00014317543523384928, + "loss": 0.8765, + "step": 78 + }, + { + "epoch": 1.4752475247524752, + "eval_loss": 0.9083698391914368, + "eval_runtime": 13.8834, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 4.538, + "step": 78 + }, + { + "epoch": 1.495049504950495, + "grad_norm": 0.228515625, + "learning_rate": 0.00014167818604952906, + "loss": 0.8797, + "step": 79 + }, + { + "epoch": 1.5148514851485149, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014016954246529696, + "loss": 0.905, + "step": 80 + }, + { + "epoch": 1.5346534653465347, + "grad_norm": 0.25390625, + "learning_rate": 0.00013864991692924523, + "loss": 0.8575, + "step": 81 + }, + { + "epoch": 1.5544554455445545, + "grad_norm": 0.2451171875, + "learning_rate": 0.00013711972489182208, + "loss": 0.8957, + "step": 82 + }, + { + "epoch": 1.5742574257425743, + "grad_norm": 0.2216796875, + "learning_rate": 0.00013557938469225167, + "loss": 0.8792, + "step": 83 + }, + { + "epoch": 1.5940594059405941, + "grad_norm": 0.21484375, + "learning_rate": 0.00013402931744416433, + "loss": 0.889, + "step": 84 + }, + { + "epoch": 1.613861386138614, + "grad_norm": 0.228515625, + "learning_rate": 0.00013246994692046836, + "loss": 0.8657, + "step": 85 + }, + { + "epoch": 1.6336633663366338, + "grad_norm": 0.20703125, + "learning_rate": 0.00013090169943749476, + "loss": 0.8784, + "step": 86 + }, + { + "epoch": 1.6534653465346536, + "grad_norm": 0.265625, + "learning_rate": 0.0001293250037384465, + "loss": 0.8822, + "step": 87 + }, + { + "epoch": 1.6732673267326734, + "grad_norm": 0.2197265625, + "learning_rate": 0.00012774029087618446, + "loss": 0.9092, + "step": 88 + }, + { + "epoch": 1.693069306930693, + "grad_norm": 0.234375, + "learning_rate": 0.00012614799409538198, + "loss": 0.8813, + "step": 89 + }, + { + "epoch": 1.7128712871287128, + "grad_norm": 0.2294921875, + "learning_rate": 0.00012454854871407994, + "loss": 0.8975, + "step": 90 + }, + { + "epoch": 1.7326732673267327, + "grad_norm": 0.259765625, + "learning_rate": 0.00012294239200467516, + "loss": 0.8789, + "step": 91 + }, + { + "epoch": 1.7326732673267327, + "eval_loss": 0.8891416788101196, + "eval_runtime": 13.872, + "eval_samples_per_second": 9.011, + "eval_steps_per_second": 4.542, + "step": 91 + }, + { + "epoch": 1.7524752475247525, + "grad_norm": 0.26171875, + "learning_rate": 0.0001213299630743747, + "loss": 0.9184, + "step": 92 + }, + { + "epoch": 1.7722772277227723, + "grad_norm": 0.337890625, + "learning_rate": 0.00011971170274514802, + "loss": 0.8854, + "step": 93 + }, + { + "epoch": 1.7920792079207921, + "grad_norm": 0.2890625, + "learning_rate": 0.000118088053433211, + "loss": 0.8688, + "step": 94 + }, + { + "epoch": 1.811881188118812, + "grad_norm": 0.3515625, + "learning_rate": 0.00011645945902807341, + "loss": 0.8281, + "step": 95 + }, + { + "epoch": 1.8316831683168315, + "grad_norm": 0.26953125, + "learning_rate": 0.0001148263647711842, + "loss": 0.8488, + "step": 96 + }, + { + "epoch": 1.8514851485148514, + "grad_norm": 0.2490234375, + "learning_rate": 0.00011318921713420691, + "loss": 0.8742, + "step": 97 + }, + { + "epoch": 1.8712871287128712, + "grad_norm": 0.265625, + "learning_rate": 0.00011154846369695863, + "loss": 0.8586, + "step": 98 + }, + { + "epoch": 1.891089108910891, + "grad_norm": 0.265625, + "learning_rate": 0.0001099045530250463, + "loss": 0.8776, + "step": 99 + }, + { + "epoch": 1.9108910891089108, + "grad_norm": 0.259765625, + "learning_rate": 0.00010825793454723325, + "loss": 0.8563, + "step": 100 + }, + { + "epoch": 1.9306930693069306, + "grad_norm": 0.283203125, + "learning_rate": 0.00010660905843256994, + "loss": 0.8381, + "step": 101 + }, + { + "epoch": 1.9504950495049505, + "grad_norm": 0.201171875, + "learning_rate": 0.00010495837546732224, + "loss": 0.847, + "step": 102 + }, + { + "epoch": 1.9702970297029703, + "grad_norm": 0.23828125, + "learning_rate": 0.00010330633693173082, + "loss": 0.8512, + "step": 103 + }, + { + "epoch": 1.99009900990099, + "grad_norm": 0.283203125, + "learning_rate": 0.00010165339447663587, + "loss": 0.8304, + "step": 104 + }, + { + "epoch": 1.99009900990099, + "eval_loss": 0.8779018521308899, + "eval_runtime": 13.8827, + "eval_samples_per_second": 9.004, + "eval_steps_per_second": 4.538, + "step": 104 + }, + { + "epoch": 2.00990099009901, + "grad_norm": 0.283203125, + "learning_rate": 0.0001, + "loss": 0.8523, + "step": 105 + }, + { + "epoch": 2.0297029702970297, + "grad_norm": 0.2392578125, + "learning_rate": 9.834660552336415e-05, + "loss": 0.8109, + "step": 106 + }, + { + "epoch": 2.0495049504950495, + "grad_norm": 0.224609375, + "learning_rate": 9.669366306826919e-05, + "loss": 0.8394, + "step": 107 + }, + { + "epoch": 2.0693069306930694, + "grad_norm": 0.283203125, + "learning_rate": 9.504162453267777e-05, + "loss": 0.8524, + "step": 108 + }, + { + "epoch": 2.01980198019802, + "grad_norm": 0.22265625, + "learning_rate": 9.339094156743007e-05, + "loss": 0.8391, + "step": 109 + }, + { + "epoch": 2.0396039603960396, + "grad_norm": 0.2001953125, + "learning_rate": 9.174206545276677e-05, + "loss": 0.8317, + "step": 110 + }, + { + "epoch": 2.0594059405940595, + "grad_norm": 0.22265625, + "learning_rate": 9.009544697495374e-05, + "loss": 0.833, + "step": 111 + }, + { + "epoch": 2.0792079207920793, + "grad_norm": 0.2041015625, + "learning_rate": 8.845153630304139e-05, + "loss": 0.8408, + "step": 112 + }, + { + "epoch": 2.099009900990099, + "grad_norm": 0.2080078125, + "learning_rate": 8.681078286579311e-05, + "loss": 0.8459, + "step": 113 + }, + { + "epoch": 2.118811881188119, + "grad_norm": 0.2021484375, + "learning_rate": 8.517363522881579e-05, + "loss": 0.8177, + "step": 114 + }, + { + "epoch": 2.1386138613861387, + "grad_norm": 0.2265625, + "learning_rate": 8.35405409719266e-05, + "loss": 0.8451, + "step": 115 + }, + { + "epoch": 2.1584158415841586, + "grad_norm": 0.2294921875, + "learning_rate": 8.191194656678904e-05, + "loss": 0.8543, + "step": 116 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.22265625, + "learning_rate": 8.028829725485199e-05, + "loss": 0.8194, + "step": 117 + }, + { + "epoch": 2.1782178217821784, + "eval_loss": 0.8713971972465515, + "eval_runtime": 13.8976, + "eval_samples_per_second": 8.994, + "eval_steps_per_second": 4.533, + "step": 117 + }, + { + "epoch": 2.198019801980198, + "grad_norm": 0.2333984375, + "learning_rate": 7.867003692562534e-05, + "loss": 0.808, + "step": 118 + }, + { + "epoch": 2.217821782178218, + "grad_norm": 0.2470703125, + "learning_rate": 7.705760799532485e-05, + "loss": 0.8073, + "step": 119 + }, + { + "epoch": 2.237623762376238, + "grad_norm": 0.201171875, + "learning_rate": 7.54514512859201e-05, + "loss": 0.8392, + "step": 120 + }, + { + "epoch": 2.2574257425742577, + "grad_norm": 0.25, + "learning_rate": 7.385200590461803e-05, + "loss": 0.8574, + "step": 121 + }, + { + "epoch": 2.2772277227722775, + "grad_norm": 0.271484375, + "learning_rate": 7.225970912381556e-05, + "loss": 0.8338, + "step": 122 + }, + { + "epoch": 2.297029702970297, + "grad_norm": 0.294921875, + "learning_rate": 7.067499626155354e-05, + "loss": 0.8788, + "step": 123 + }, + { + "epoch": 2.3168316831683167, + "grad_norm": 0.2265625, + "learning_rate": 6.909830056250527e-05, + "loss": 0.8297, + "step": 124 + }, + { + "epoch": 2.3366336633663365, + "grad_norm": 0.267578125, + "learning_rate": 6.753005307953167e-05, + "loss": 0.8125, + "step": 125 + }, + { + "epoch": 2.3564356435643563, + "grad_norm": 0.2431640625, + "learning_rate": 6.59706825558357e-05, + "loss": 0.814, + "step": 126 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.27734375, + "learning_rate": 6.442061530774834e-05, + "loss": 0.8335, + "step": 127 + }, + { + "epoch": 2.396039603960396, + "grad_norm": 0.2216796875, + "learning_rate": 6.28802751081779e-05, + "loss": 0.8512, + "step": 128 + }, + { + "epoch": 2.4158415841584158, + "grad_norm": 0.224609375, + "learning_rate": 6.135008307075481e-05, + "loss": 0.8297, + "step": 129 + }, + { + "epoch": 2.4356435643564356, + "grad_norm": 0.2412109375, + "learning_rate": 5.983045753470308e-05, + "loss": 0.848, + "step": 130 + }, + { + "epoch": 2.4356435643564356, + "eval_loss": 0.8665071129798889, + "eval_runtime": 13.8735, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 4.541, + "step": 130 + }, + { + "epoch": 2.4554455445544554, + "grad_norm": 0.2265625, + "learning_rate": 5.832181395047098e-05, + "loss": 0.8203, + "step": 131 + }, + { + "epoch": 2.4752475247524752, + "grad_norm": 0.287109375, + "learning_rate": 5.6824564766150726e-05, + "loss": 0.8519, + "step": 132 + }, + { + "epoch": 2.495049504950495, + "grad_norm": 0.21484375, + "learning_rate": 5.533911931471936e-05, + "loss": 0.83, + "step": 133 + }, + { + "epoch": 2.514851485148515, + "grad_norm": 0.2109375, + "learning_rate": 5.386588370213124e-05, + "loss": 0.842, + "step": 134 + }, + { + "epoch": 2.5346534653465347, + "grad_norm": 0.2412109375, + "learning_rate": 5.240526069629265e-05, + "loss": 0.8419, + "step": 135 + }, + { + "epoch": 2.5544554455445545, + "grad_norm": 0.267578125, + "learning_rate": 5.095764961694922e-05, + "loss": 0.8458, + "step": 136 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.203125, + "learning_rate": 4.952344622651566e-05, + "loss": 0.8133, + "step": 137 + }, + { + "epoch": 2.594059405940594, + "grad_norm": 0.2060546875, + "learning_rate": 4.810304262187852e-05, + "loss": 0.8103, + "step": 138 + }, + { + "epoch": 2.613861386138614, + "grad_norm": 0.20703125, + "learning_rate": 4.669682712720065e-05, + "loss": 0.8105, + "step": 139 + }, + { + "epoch": 2.633663366336634, + "grad_norm": 0.2060546875, + "learning_rate": 4.530518418775733e-05, + "loss": 0.8305, + "step": 140 + }, + { + "epoch": 2.6534653465346536, + "grad_norm": 0.2080078125, + "learning_rate": 4.392849426483274e-05, + "loss": 0.7881, + "step": 141 + }, + { + "epoch": 2.6732673267326734, + "grad_norm": 0.2216796875, + "learning_rate": 4.256713373170564e-05, + "loss": 0.8204, + "step": 142 + }, + { + "epoch": 2.693069306930693, + "grad_norm": 0.263671875, + "learning_rate": 4.12214747707527e-05, + "loss": 0.8354, + "step": 143 + }, + { + "epoch": 2.693069306930693, + "eval_loss": 0.8626759648323059, + "eval_runtime": 13.8585, + "eval_samples_per_second": 9.02, + "eval_steps_per_second": 4.546, + "step": 143 + }, + { + "epoch": 2.7128712871287126, + "grad_norm": 0.2138671875, + "learning_rate": 3.9891885271697496e-05, + "loss": 0.8441, + "step": 144 + }, + { + "epoch": 2.7326732673267324, + "grad_norm": 0.2197265625, + "learning_rate": 3.857872873103322e-05, + "loss": 0.8084, + "step": 145 + }, + { + "epoch": 2.7524752475247523, + "grad_norm": 0.18359375, + "learning_rate": 3.7282364152646297e-05, + "loss": 0.8184, + "step": 146 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.1904296875, + "learning_rate": 3.600314594966834e-05, + "loss": 0.8302, + "step": 147 + }, + { + "epoch": 2.792079207920792, + "grad_norm": 0.2041015625, + "learning_rate": 3.4741423847583134e-05, + "loss": 0.8503, + "step": 148 + }, + { + "epoch": 2.8118811881188117, + "grad_norm": 0.2265625, + "learning_rate": 3.349754278861517e-05, + "loss": 0.8273, + "step": 149 + }, + { + "epoch": 2.8316831683168315, + "grad_norm": 0.1943359375, + "learning_rate": 3.227184283742591e-05, + "loss": 0.8332, + "step": 150 + }, + { + "epoch": 2.8514851485148514, + "grad_norm": 0.185546875, + "learning_rate": 3.106465908814342e-05, + "loss": 0.8391, + "step": 151 + }, + { + "epoch": 2.871287128712871, + "grad_norm": 0.1982421875, + "learning_rate": 2.9876321572751144e-05, + "loss": 0.8029, + "step": 152 + }, + { + "epoch": 2.891089108910891, + "grad_norm": 0.224609375, + "learning_rate": 2.87071551708603e-05, + "loss": 0.8561, + "step": 153 + }, + { + "epoch": 2.910891089108911, + "grad_norm": 0.2275390625, + "learning_rate": 2.7557479520891104e-05, + "loss": 0.8055, + "step": 154 + }, + { + "epoch": 2.9306930693069306, + "grad_norm": 0.16796875, + "learning_rate": 2.6427608932686843e-05, + "loss": 0.8301, + "step": 155 + }, + { + "epoch": 2.9504950495049505, + "grad_norm": 0.1943359375, + "learning_rate": 2.5317852301584643e-05, + "loss": 0.8476, + "step": 156 + }, + { + "epoch": 2.9504950495049505, + "eval_loss": 0.8605256080627441, + "eval_runtime": 13.8794, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 156 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.2138671875, + "learning_rate": 2.422851302396655e-05, + "loss": 0.8483, + "step": 157 + }, + { + "epoch": 2.99009900990099, + "grad_norm": 0.2216796875, + "learning_rate": 2.315988891431412e-05, + "loss": 0.8379, + "step": 158 + }, + { + "epoch": 3.00990099009901, + "grad_norm": 0.2373046875, + "learning_rate": 2.2112272123788768e-05, + "loss": 0.8042, + "step": 159 + }, + { + "epoch": 3.0297029702970297, + "grad_norm": 0.416015625, + "learning_rate": 2.1085949060360654e-05, + "loss": 0.8597, + "step": 160 + }, + { + "epoch": 3.0495049504950495, + "grad_norm": 0.1806640625, + "learning_rate": 2.008120031050753e-05, + "loss": 0.8327, + "step": 161 + }, + { + "epoch": 3.0693069306930694, + "grad_norm": 0.212890625, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.7991, + "step": 162 + }, + { + "epoch": 3.01980198019802, + "grad_norm": 0.1875, + "learning_rate": 1.8137518531330767e-05, + "loss": 0.8083, + "step": 163 + }, + { + "epoch": 3.0396039603960396, + "grad_norm": 0.1982421875, + "learning_rate": 1.7199116885197995e-05, + "loss": 0.8321, + "step": 164 + }, + { + "epoch": 3.0594059405940595, + "grad_norm": 0.193359375, + "learning_rate": 1.6283352173747145e-05, + "loss": 0.8596, + "step": 165 + }, + { + "epoch": 3.0792079207920793, + "grad_norm": 0.1826171875, + "learning_rate": 1.5390474757906446e-05, + "loss": 0.82, + "step": 166 + }, + { + "epoch": 3.099009900990099, + "grad_norm": 0.1806640625, + "learning_rate": 1.4520728741446089e-05, + "loss": 0.8245, + "step": 167 + }, + { + "epoch": 3.118811881188119, + "grad_norm": 0.1923828125, + "learning_rate": 1.3674351904242611e-05, + "loss": 0.8174, + "step": 168 + }, + { + "epoch": 3.1386138613861387, + "grad_norm": 0.1787109375, + "learning_rate": 1.2851575637272262e-05, + "loss": 0.811, + "step": 169 + }, + { + "epoch": 3.1386138613861387, + "eval_loss": 0.8589804768562317, + "eval_runtime": 13.8605, + "eval_samples_per_second": 9.018, + "eval_steps_per_second": 4.545, + "step": 169 + }, + { + "epoch": 3.1584158415841586, + "grad_norm": 0.1865234375, + "learning_rate": 1.2052624879351104e-05, + "loss": 0.8043, + "step": 170 + }, + { + "epoch": 3.1782178217821784, + "grad_norm": 0.181640625, + "learning_rate": 1.1277718055638819e-05, + "loss": 0.8117, + "step": 171 + }, + { + "epoch": 3.198019801980198, + "grad_norm": 0.205078125, + "learning_rate": 1.0527067017923654e-05, + "loss": 0.8176, + "step": 172 + }, + { + "epoch": 3.217821782178218, + "grad_norm": 0.1904296875, + "learning_rate": 9.80087698670411e-06, + "loss": 0.7919, + "step": 173 + }, + { + "epoch": 3.237623762376238, + "grad_norm": 0.177734375, + "learning_rate": 9.09934649508375e-06, + "loss": 0.8099, + "step": 174 + }, + { + "epoch": 3.2574257425742577, + "grad_norm": 0.203125, + "learning_rate": 8.422667334494249e-06, + "loss": 0.8161, + "step": 175 + }, + { + "epoch": 3.2772277227722775, + "grad_norm": 0.208984375, + "learning_rate": 7.771024502261526e-06, + "loss": 0.8199, + "step": 176 + }, + { + "epoch": 3.297029702970297, + "grad_norm": 0.1923828125, + "learning_rate": 7.144596151029303e-06, + "loss": 0.8077, + "step": 177 + }, + { + "epoch": 3.3168316831683167, + "grad_norm": 0.298828125, + "learning_rate": 6.543553540053926e-06, + "loss": 0.8532, + "step": 178 + }, + { + "epoch": 3.3366336633663365, + "grad_norm": 0.1826171875, + "learning_rate": 5.968060988383883e-06, + "loss": 0.8062, + "step": 179 + }, + { + "epoch": 3.3564356435643563, + "grad_norm": 0.1689453125, + "learning_rate": 5.418275829936537e-06, + "loss": 0.802, + "step": 180 + }, + { + "epoch": 3.376237623762376, + "grad_norm": 0.2001953125, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.8189, + "step": 181 + }, + { + "epoch": 3.396039603960396, + "grad_norm": 0.169921875, + "learning_rate": 4.3964218465642355e-06, + "loss": 0.8178, + "step": 182 + }, + { + "epoch": 3.396039603960396, + "eval_loss": 0.858788788318634, + "eval_runtime": 13.8817, + "eval_samples_per_second": 9.005, + "eval_steps_per_second": 4.538, + "step": 182 + }, + { + "epoch": 3.4158415841584158, + "grad_norm": 0.16796875, + "learning_rate": 3.924632386315186e-06, + "loss": 0.8307, + "step": 183 + }, + { + "epoch": 3.4356435643564356, + "grad_norm": 0.181640625, + "learning_rate": 3.4791089722651436e-06, + "loss": 0.8255, + "step": 184 + }, + { + "epoch": 3.4554455445544554, + "grad_norm": 0.185546875, + "learning_rate": 3.059973406066963e-06, + "loss": 0.8222, + "step": 185 + }, + { + "epoch": 3.4752475247524752, + "grad_norm": 0.19140625, + "learning_rate": 2.667340275199426e-06, + "loss": 0.8054, + "step": 186 + }, + { + "epoch": 3.495049504950495, + "grad_norm": 0.1826171875, + "learning_rate": 2.3013169216400733e-06, + "loss": 0.8628, + "step": 187 + }, + { + "epoch": 3.514851485148515, + "grad_norm": 0.1796875, + "learning_rate": 1.9620034125190644e-06, + "loss": 0.8338, + "step": 188 + }, + { + "epoch": 3.5346534653465347, + "grad_norm": 0.1728515625, + "learning_rate": 1.6494925127617634e-06, + "loss": 0.809, + "step": 189 + }, + { + "epoch": 3.5544554455445545, + "grad_norm": 0.1904296875, + "learning_rate": 1.3638696597277679e-06, + "loss": 0.8328, + "step": 190 + }, + { + "epoch": 3.5742574257425743, + "grad_norm": 0.17578125, + "learning_rate": 1.1052129398531507e-06, + "loss": 0.8062, + "step": 191 + }, + { + "epoch": 3.594059405940594, + "grad_norm": 0.1884765625, + "learning_rate": 8.735930673024806e-07, + "loss": 0.832, + "step": 192 + }, + { + "epoch": 3.613861386138614, + "grad_norm": 0.17578125, + "learning_rate": 6.690733646361857e-07, + "loss": 0.8107, + "step": 193 + }, + { + "epoch": 3.633663366336634, + "grad_norm": 0.1875, + "learning_rate": 4.917097454988584e-07, + "loss": 0.8315, + "step": 194 + }, + { + "epoch": 3.6534653465346536, + "grad_norm": 0.1845703125, + "learning_rate": 3.415506993330153e-07, + "loss": 0.8073, + "step": 195 + }, + { + "epoch": 3.6534653465346536, + "eval_loss": 0.858626127243042, + "eval_runtime": 13.8621, + "eval_samples_per_second": 9.017, + "eval_steps_per_second": 4.545, + "step": 195 + }, + { + "epoch": 3.6732673267326734, + "grad_norm": 0.197265625, + "learning_rate": 2.1863727812254653e-07, + "loss": 0.8403, + "step": 196 + }, + { + "epoch": 3.693069306930693, + "grad_norm": 0.189453125, + "learning_rate": 1.230030851695263e-07, + "loss": 0.8116, + "step": 197 + }, + { + "epoch": 3.7128712871287126, + "grad_norm": 0.173828125, + "learning_rate": 5.467426590739511e-08, + "loss": 0.8115, + "step": 198 + }, + { + "epoch": 3.7326732673267324, + "grad_norm": 0.177734375, + "learning_rate": 1.3669500753099585e-08, + "loss": 0.7962, + "step": 199 + }, + { + "epoch": 3.7524752475247523, + "grad_norm": 0.2099609375, + "learning_rate": 0.0, + "loss": 0.8031, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.164477534181786e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f +size 5944 diff --git a/checkpoint-50/README.md b/checkpoint-50/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1ccd431539a8f1507d8755a9c3ba5e5b2897978 --- /dev/null +++ b/checkpoint-50/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-50/adapter_config.json b/checkpoint-50/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3b6760acbcf3eaee3a7347373ee7157ecbc99891 --- /dev/null +++ b/checkpoint-50/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-50/adapter_model.safetensors b/checkpoint-50/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e224cb43e2b5c839e08901e2bd1920ba49998d55 --- /dev/null +++ b/checkpoint-50/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a02032e4ced1f76caa201d55031ab5925f6d0fb66b5d8f3092b8c5d785219b37 +size 50503848 diff --git a/checkpoint-50/optimizer.pt b/checkpoint-50/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf0bc8aae14cf3dc15a5895ddfefef58623a2ecf --- /dev/null +++ b/checkpoint-50/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff58348f44e2bde44ab7f9193c61e20dd0f8d95e056c7a292421ffd95a8c7d3 +size 202035450 diff --git a/checkpoint-50/rng_state.pth b/checkpoint-50/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..246cffea93db339a81ba0ae32aaa33c8c0ad92df --- /dev/null +++ b/checkpoint-50/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b35231a2c551e6ed40111614cd789a64fe47b38c49d5b21bea0aa24df8b78d2 +size 14244 diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..30029e06200a4e0722b03b67c9de799aa70b54ed --- /dev/null +++ b/checkpoint-50/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e02dc10b7239989ab9b4418ee704e53fad611ad6b77ad633028bb8eb5238dd +size 1064 diff --git a/checkpoint-50/special_tokens_map.json b/checkpoint-50/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-50/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-50/tokenizer.model b/checkpoint-50/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-50/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-50/tokenizer_config.json b/checkpoint-50/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-50/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..739306d50fc951c681d527a2d355ce130e33bb45 --- /dev/null +++ b/checkpoint-50/trainer_state.json @@ -0,0 +1,415 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9900990099009901, + "eval_steps": 13, + "global_step": 50, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.019801980198019802, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.0919, + "step": 1 + }, + { + "epoch": 0.019801980198019802, + "eval_loss": 2.079954147338867, + "eval_runtime": 13.8908, + "eval_samples_per_second": 8.999, + "eval_steps_per_second": 4.535, + "step": 1 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 1.203125, + "learning_rate": 4e-05, + "loss": 2.0814, + "step": 2 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 1.1953125, + "learning_rate": 6e-05, + "loss": 2.0499, + "step": 3 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 1.0859375, + "learning_rate": 8e-05, + "loss": 2.0153, + "step": 4 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 1.0390625, + "learning_rate": 0.0001, + "loss": 1.9548, + "step": 5 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 0.89453125, + "learning_rate": 0.00012, + "loss": 1.8982, + "step": 6 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 0.67578125, + "learning_rate": 0.00014, + "loss": 1.8226, + "step": 7 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 0.66796875, + "learning_rate": 0.00016, + "loss": 1.7572, + "step": 8 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 0.78515625, + "learning_rate": 0.00018, + "loss": 1.7074, + "step": 9 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.73828125, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 10 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 0.484375, + "learning_rate": 0.0001999863304992469, + "loss": 1.5801, + "step": 11 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 0.53125, + "learning_rate": 0.00019994532573409262, + "loss": 1.5721, + "step": 12 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 0.6953125, + "learning_rate": 0.00019987699691483048, + "loss": 1.5479, + "step": 13 + }, + { + "epoch": 0.25742574257425743, + "eval_loss": 1.5341482162475586, + "eval_runtime": 13.8795, + "eval_samples_per_second": 9.006, + "eval_steps_per_second": 4.539, + "step": 13 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 0.65234375, + "learning_rate": 0.00019978136272187747, + "loss": 1.534, + "step": 14 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 0.515625, + "learning_rate": 0.000199658449300667, + "loss": 1.4804, + "step": 15 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.439453125, + "learning_rate": 0.00019950829025450114, + "loss": 1.4805, + "step": 16 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 0.361328125, + "learning_rate": 0.00019933092663536382, + "loss": 1.3809, + "step": 17 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.3125, + "learning_rate": 0.00019912640693269752, + "loss": 1.3837, + "step": 18 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.337890625, + "learning_rate": 0.00019889478706014687, + "loss": 1.3673, + "step": 19 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.298828125, + "learning_rate": 0.00019863613034027224, + "loss": 1.366, + "step": 20 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.34375, + "learning_rate": 0.00019835050748723824, + "loss": 1.3318, + "step": 21 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 0.341796875, + "learning_rate": 0.00019803799658748094, + "loss": 1.2741, + "step": 22 + }, + { + "epoch": 0.45544554455445546, + "grad_norm": 0.326171875, + "learning_rate": 0.00019769868307835994, + "loss": 1.2978, + "step": 23 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.291015625, + "learning_rate": 0.0001973326597248006, + "loss": 1.2733, + "step": 24 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.306640625, + "learning_rate": 0.00019694002659393305, + "loss": 1.2302, + "step": 25 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 0.318359375, + "learning_rate": 0.00019652089102773488, + "loss": 1.2083, + "step": 26 + }, + { + "epoch": 0.5148514851485149, + "eval_loss": 1.224540114402771, + "eval_runtime": 13.8695, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 4.542, + "step": 26 + }, + { + "epoch": 0.5346534653465347, + "grad_norm": 0.26953125, + "learning_rate": 0.00019607536761368484, + "loss": 1.1761, + "step": 27 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 0.296875, + "learning_rate": 0.00019560357815343577, + "loss": 1.1751, + "step": 28 + }, + { + "epoch": 0.5742574257425742, + "grad_norm": 0.310546875, + "learning_rate": 0.00019510565162951537, + "loss": 1.2002, + "step": 29 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.287109375, + "learning_rate": 0.00019458172417006347, + "loss": 1.1544, + "step": 30 + }, + { + "epoch": 0.6138613861386139, + "grad_norm": 0.365234375, + "learning_rate": 0.00019403193901161613, + "loss": 1.1384, + "step": 31 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.236328125, + "learning_rate": 0.0001934564464599461, + "loss": 1.0999, + "step": 32 + }, + { + "epoch": 0.6534653465346535, + "grad_norm": 0.326171875, + "learning_rate": 0.00019285540384897073, + "loss": 1.1576, + "step": 33 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 0.310546875, + "learning_rate": 0.00019222897549773848, + "loss": 1.091, + "step": 34 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.2578125, + "learning_rate": 0.00019157733266550575, + "loss": 1.056, + "step": 35 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 0.267578125, + "learning_rate": 0.00019090065350491626, + "loss": 1.1068, + "step": 36 + }, + { + "epoch": 0.7326732673267327, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019019912301329592, + "loss": 1.0583, + "step": 37 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 0.2734375, + "learning_rate": 0.00018947293298207635, + "loss": 1.0671, + "step": 38 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001887222819443612, + "loss": 1.0851, + "step": 39 + }, + { + "epoch": 0.7722772277227723, + "eval_loss": 1.060703158378601, + "eval_runtime": 13.878, + "eval_samples_per_second": 9.007, + "eval_steps_per_second": 4.54, + "step": 39 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.22265625, + "learning_rate": 0.0001879473751206489, + "loss": 1.0343, + "step": 40 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.1796875, + "learning_rate": 0.00018714842436272773, + "loss": 0.9789, + "step": 41 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.248046875, + "learning_rate": 0.00018632564809575742, + "loss": 1.0174, + "step": 42 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001854792712585539, + "loss": 1.0004, + "step": 43 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 0.228515625, + "learning_rate": 0.00018460952524209355, + "loss": 1.0281, + "step": 44 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.220703125, + "learning_rate": 0.00018371664782625287, + "loss": 0.9992, + "step": 45 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 0.2138671875, + "learning_rate": 0.00018280088311480201, + "loss": 0.9635, + "step": 46 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.265625, + "learning_rate": 0.00018186248146866927, + "loss": 1.006, + "step": 47 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018090169943749476, + "loss": 0.9891, + "step": 48 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.28515625, + "learning_rate": 0.0001799187996894925, + "loss": 0.9809, + "step": 49 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.212890625, + "learning_rate": 0.00017891405093963938, + "loss": 0.9646, + "step": 50 + } + ], + "logging_steps": 1, + "max_steps": 200, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.04177049010176e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..17c58e1f9571a1e651f5ca71c5238f9d8660fc30 --- /dev/null +++ b/checkpoint-50/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab891527a9343c5fed33fded5a4528864e72798598b8a74f11bf9b63e79e156f +size 5944 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..573775b64764c233a2364b2f362979bdf95694c2 --- /dev/null +++ b/config.json @@ -0,0 +1,44 @@ +{ + "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_storage": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +}