End of training
    	
README.md CHANGED
````diff
@@ -1,6 +1,6 @@
 ---
 library_name: transformers
-base_model: minpeter/
+base_model: minpeter/tiny-ko-base
 tags:
 - axolotl
 - generated_from_trainer
@@ -13,6 +13,8 @@ datasets:
 - FreedomIntelligence/sharegpt-korean
 - coastral/korean-writing-style-instruct
 - devngho/korean-instruction-mix
+- youjunhyeok/Magpie-Pro-300K-Filtered-ko
+- youjunhyeok/smoltalk-ko-translate
 model-index:
 - name: tiny-ko-sft
   results: []
@@ -26,7 +28,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.10.0.dev0`
 ```yaml
-base_model: minpeter/
+base_model: minpeter/tiny-ko-base
 
 hub_model_id: minpeter/tiny-ko-sft
 output_dir: ./outputs/tiny-ko-sft
@@ -104,6 +106,22 @@ datasets:
       role: from
       content: value
 
+  - path: youjunhyeok/Magpie-Pro-300K-Filtered-ko
+    type: chat_template
+    split: train[:10%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+  - path: youjunhyeok/smoltalk-ko-translate
+    type: chat_template
+    name: merge_filtered
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 
@@ -111,17 +129,17 @@ save_steps: 200
 warmup_steps: 20
 eval_steps: 200
 
-sequence_len:
+sequence_len: 4096
 
 # <<<< experimental settings <<<<
-sample_packing:
-train_on_inputs:
+sample_packing: true
+train_on_inputs: false
 # >>>> experimental settings >>>
 
 pad_to_sequence_len: true
 
 gradient_accumulation_steps: 4
-micro_batch_size:
+micro_batch_size: 32
 
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
@@ -130,15 +148,6 @@ learning_rate: 1e-3
 bf16: auto
 tf32: false
 
-added_tokens_overrides:
-  128001: "<|im_end|>"
-  128002: "<|im_start|>"
-
-special_tokens:
-  bos_token: <|begin_of_text|>
-  eos_token: <|im_end|>
-  pad_token: <|im_end|>
-
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
@@ -146,7 +155,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-num_epochs:
+num_epochs: 1
 weight_decay: 0.0
 
 ```
@@ -155,9 +164,9 @@ weight_decay: 0.0
 
 # tiny-ko-sft
 
-This model is a fine-tuned version of [minpeter/
+This model is a fine-tuned version of [minpeter/tiny-ko-base](https://huggingface.co/minpeter/tiny-ko-base) on the lemon-mint/Korean-FineTome-100k, the lemon-mint/smol-koreantalk, the heegyu/open-korean-instructions-v20231020, the FreedomIntelligence/evol-instruct-korean, the FreedomIntelligence/alpaca-gpt4-korean, the FreedomIntelligence/sharegpt-korean, the coastral/korean-writing-style-instruct, the devngho/korean-instruction-mix, the youjunhyeok/Magpie-Pro-300K-Filtered-ko and the youjunhyeok/smoltalk-ko-translate datasets.
 It achieves the following results on the evaluation set:
-- Loss: 1.
+- Loss: 1.5297
 
 ## Model description
 
@@ -177,38 +186,28 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 0.001
-- train_batch_size:
-- eval_batch_size:
+- train_batch_size: 32
+- eval_batch_size: 32
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 2
 - gradient_accumulation_steps: 4
-- total_train_batch_size:
-- total_eval_batch_size:
+- total_train_batch_size: 256
+- total_eval_batch_size: 64
 - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 20
-- training_steps:
+- training_steps: 817
 
 ### Training results
 
 | Training Loss | Epoch  | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-|
-| 1.
-| 1.
-| 1.
-| 1.
-| 1.3066        | 1.0091 | 1000 | 1.5208          |
-| 1.395         | 1.2110 | 1200 | 1.5007          |
-| 1.3474        | 1.4128 | 1400 | 1.4699          |
-| 1.3025        | 1.6147 | 1600 | 1.4383          |
-| 1.2566        | 1.8166 | 1800 | 1.4117          |
-| 1.1672        | 2.0182 | 2000 | 1.4227          |
-| 1.1267        | 2.2200 | 2200 | 1.4141          |
-| 1.0195        | 2.4219 | 2400 | 1.4098          |
-| 1.084         | 2.6238 | 2600 | 1.4063          |
-| 1.1254        | 2.8256 | 2800 | 1.4059          |
+| 2.3518        | 0.0012 | 1    | 2.3640          |
+| 1.6322        | 0.2446 | 200  | 1.6913          |
+| 1.5903        | 0.4891 | 400  | 1.6003          |
+| 1.5146        | 0.7337 | 600  | 1.5392          |
+| 1.5277        | 0.9783 | 800  | 1.5297          |
 
 
 ### Framework versions
````
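The two added datasets ship their conversations in different shapes, which is why their `message_property_mappings` differ: Magpie-Pro-300K-Filtered-ko stores ShareGPT-style `from`/`value` pairs, while smoltalk-ko-translate already uses `role`/`content`. A minimal Python sketch of the remapping those mappings describe (illustrative only, not axolotl's internals):

```python
# Normalize differently-shaped message records onto one role/content schema,
# mirroring the message_property_mappings in the config above.
def normalize(messages, role_key, content_key):
    return [{"role": m[role_key], "content": m[content_key]} for m in messages]

# ShareGPT-style record (from/value keys), as in Magpie-Pro-300K-Filtered-ko:
sharegpt = [{"from": "human", "value": "안녕하세요"},
            {"from": "gpt", "value": "안녕하세요, 무엇을 도와드릴까요?"}]
print(normalize(sharegpt, role_key="from", content_key="value"))

# smoltalk-ko-translate already matches the target schema (identity mapping):
smoltalk = [{"role": "user", "content": "..."}]
print(normalize(smoltalk, role_key="role", content_key="content"))
```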
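`sequence_len` is now pinned to 4096 and `sample_packing: true` concatenates short chat examples into fixed-length sequences instead of padding each one individually. A rough sketch of the idea, assuming simple greedy packing (real implementations, axolotl's included, also keep per-example attention boundaries so packed samples don't attend to each other):

```python
# Greedy sequence packing: fill each 4096-token buffer with as many
# tokenized examples as fit, flushing when the next example would overflow.
SEQUENCE_LEN = 4096  # matches sequence_len in the config

def pack(tokenized_examples, seq_len=SEQUENCE_LEN):
    packs, current = [], []
    for tokens in tokenized_examples:
        tokens = tokens[:seq_len]  # oversize examples get truncated
        if current and len(current) + len(tokens) > seq_len:
            packs.append(current)
            current = []
        current.extend(tokens)
    if current:
        packs.append(current)
    return packs
```

With `pad_to_sequence_len: true`, each resulting pack is then padded out to exactly 4096 tokens.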
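`train_on_inputs: false` restricts the loss to response tokens; prompt and user-turn tokens are excluded. The usual convention is to set their labels to `-100`, the ignore index of PyTorch's cross-entropy loss. A sketch of that masking (assumed semantics, not axolotl's code):

```python
IGNORE_INDEX = -100  # ignored by torch.nn.CrossEntropyLoss

def build_labels(input_ids, assistant_mask):
    """assistant_mask[i] is True where token i belongs to an assistant turn."""
    return [tok if keep else IGNORE_INDEX
            for tok, keep in zip(input_ids, assistant_mask)]

assert build_labels([101, 202, 303, 404],
                    [False, False, True, True]) == [-100, -100, 303, 404]
```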
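The commit also drops the `added_tokens_overrides` and `special_tokens` blocks, so the tokenizer bundled with `minpeter/tiny-ko-base` is now used as shipped. For reference, the removed `special_tokens` block corresponds roughly to this plain-`transformers` call (a sketch of the equivalent, not axolotl's internal wiring):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("minpeter/tiny-ko-base")
# What the removed special_tokens block used to request explicitly:
tokenizer.add_special_tokens({
    "bos_token": "<|begin_of_text|>",
    "eos_token": "<|im_end|>",
    "pad_token": "<|im_end|>",
})
```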
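`optimizer: paged_adamw_8bit` together with `lr_scheduler: cosine` and `warmup_steps: 20` maps onto standard bitsandbytes and transformers components. A sketch of an equivalent manual setup, assuming the values reported above (lr 1e-3, betas (0.9, 0.999), epsilon 1e-08, weight_decay 0.0, 817 steps); this is how one would wire it outside axolotl, not axolotl's own code:

```python
import bitsandbytes as bnb
from transformers import get_cosine_schedule_with_warmup

def build_optimizer(model, training_steps=817):
    # Paged 8-bit AdamW keeps optimizer state in 8-bit, paged out on demand.
    optimizer = bnb.optim.PagedAdamW8bit(
        model.parameters(), lr=1e-3, betas=(0.9, 0.999),
        eps=1e-8, weight_decay=0.0,
    )
    # Linear warmup for 20 steps, then cosine decay over the remaining steps.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=20, num_training_steps=training_steps,
    )
    return optimizer, scheduler
```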
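The derived batch sizes in the hyperparameter list follow directly from the config values:

```python
micro_batch_size = 32             # per-device batch from the config
gradient_accumulation_steps = 4
num_devices = 2

# Effective optimizer-step batch: 32 * 4 * 2
assert micro_batch_size * gradient_accumulation_steps * num_devices == 256

# Evaluation does no gradient accumulation, hence the smaller total: 32 * 2
assert micro_batch_size * num_devices == 64

# One epoch at 817 steps of 256 sequences each implies roughly
# 817 * 256 ≈ 209k packed training sequences (exact count depends on packing).
```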