penfever committed
Commit 4744c5d · verified · 1 Parent(s): 6a654de

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 32,
+   "pad_token_id": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 130000,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers.js_config": {
+     "kv_cache_dtype": {
+       "fp16": "float16",
+       "q4f16": "float16"
+     }
+   },
+   "transformers_version": "4.45.2",
+   "use_cache": false,
+   "vocab_size": 49152
+ }
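
Since this is a stock Llama-architecture checkpoint, it loads with plain transformers. A minimal sketch; the repo id below is taken from "_name_or_path" and is an assumption, so substitute the actual Hub id of this upload:

# Hedged sketch: load the checkpoint described by config.json above.
# repo_id is assumed from "_name_or_path"; replace with this repo's actual id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
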
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 2,
+   "transformers_version": "4.45.2"
+ }
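
These generation defaults can be inspected programmatically; a small sketch (the repo id is assumed, as above):

# Hedged sketch: read the generation defaults shipped with the checkpoint.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")  # assumed id
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)  # 1 2 2
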
logs/rank_0000.log ADDED
@@ -0,0 +1,322 @@
+ [2025-02-01 18:47:39,616][oumi][rank0][pid:11750][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,618][oumi][rank0][pid:11750][MainThread][INFO]][train.py:174] TrainingConfig:
+ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='text_sft_jsonl',
+ dataset_path='data/R1/math_10k_R1_outputs.jsonl',
+ subset=None,
+ split='train',
+ dataset_kwargs={},
+ sample_count=None,
+ mixture_proportion=None,
+ shuffle=False,
+ seed=None,
+ shuffle_buffer_size=1000,
+ trust_remote_code=False,
+ transform_num_workers=None)],
+ collator_name=None,
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=42,
+ use_async_dataset=False,
+ use_torchdata=None),
+ test=DatasetSplitParams(datasets=[],
+ collator_name=None,
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=None),
+ validation=DatasetSplitParams(datasets=[],
+ collator_name=None,
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=None)),
+ model=ModelParams(model_name='HuggingFaceTB/SmolLM2-1.7B-Instruct',
+ adapter_model=None,
+ tokenizer_name=None,
+ tokenizer_pad_token=None,
+ tokenizer_kwargs={},
+ model_max_length=None,
+ load_pretrained_weights=True,
+ trust_remote_code=True,
+ torch_dtype_str='bfloat16',
+ compile=False,
+ chat_template=None,
+ attn_implementation=None,
+ device_map='auto',
+ model_kwargs={},
+ enable_liger_kernel=False,
+ shard_for_eval=False,
+ freeze_layers=[]),
+ training=TrainingParams(use_peft=False,
+ trainer_type=<TrainerType.TRL_SFT: 'trl_sft'>,
+ enable_gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ output_dir='output/smollm2-17b-distill-r1-670b-math',
+ per_device_train_batch_size=2,
+ per_device_eval_batch_size=8,
+ gradient_accumulation_steps=2,
+ max_steps=-1,
+ num_train_epochs=1,
+ save_epoch=False,
+ save_steps=0,
+ save_final_model=True,
+ seed=42,
+ run_name='smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1',
+ metrics_function=None,
+ log_level='info',
+ dep_log_level='warning',
+ enable_wandb=True,
+ enable_tensorboard=True,
+ logging_strategy='steps',
+ logging_dir=None,
+ logging_steps=10,
+ logging_first_step=False,
+ eval_strategy='no',
+ eval_steps=500,
+ learning_rate=2e-05,
+ lr_scheduler_type='linear',
+ lr_scheduler_kwargs={},
+ warmup_ratio=0.1,
+ warmup_steps=None,
+ optimizer='adamw_torch_fused',
+ weight_decay=0.0,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ sgd_momentum=0.0,
+ mixed_precision_dtype=<MixedPrecisionDtype.NONE: 'none'>,
+ compile=False,
+ include_performance_metrics=False,
+ include_alternative_mfu_metrics=False,
+ log_model_summary=False,
+ resume_from_checkpoint=None,
+ try_resume_from_last_checkpoint=False,
+ dataloader_num_workers=8,
+ dataloader_prefetch_factor=32,
+ dataloader_main_process_only=None,
+ ddp_find_unused_parameters=False,
+ max_grad_norm=10.0,
+ trainer_kwargs={},
+ profiler=ProfilerParams(save_dir=None,
+ enable_cpu_profiling=False,
+ enable_cuda_profiling=False,
+ record_shapes=False,
+ profile_memory=False,
+ with_stack=False,
+ with_flops=False,
+ with_modules=False,
+ row_limit=50,
+ schedule=ProfilerScheduleParams(enable_schedule=False,
+ wait=0,
+ warmup=1,
+ active=3,
+ repeat=1,
+ skip_first=1)),
+ telemetry=TelemetryParams(telemetry_dir='telemetry',
+ collect_telemetry_for_all_ranks=False,
+ track_gpu_temperature=False),
+ empty_device_cache_steps=1,
+ nccl_default_timeout_minutes=None),
+ peft=PeftParams(lora_r=8,
+ lora_alpha=8,
+ lora_dropout=0.0,
+ lora_target_modules=None,
+ lora_modules_to_save=None,
+ lora_bias='none',
+ lora_init_weights=<LoraWeightInitialization.DEFAULT: 'default'>,
+ lora_task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
+ q_lora=False,
+ q_lora_bits=4,
+ bnb_4bit_quant_type='fp4',
+ use_bnb_nested_quant=False,
+ bnb_4bit_quant_storage='uint8',
+ bnb_4bit_compute_dtype='float32',
+ peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
+ fsdp=FSDPParams(enable_fsdp=False,
+ sharding_strategy=<ShardingStrategy.FULL_SHARD: 'FULL_SHARD'>,
+ cpu_offload=False,
+ mixed_precision=None,
+ backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
+ forward_prefetch=False,
+ use_orig_params=None,
+ state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
+ auto_wrap_policy=<AutoWrapPolicy.NO_WRAP: 'NO_WRAP'>,
+ min_num_params=100000,
+ transformer_layer_cls=None,
+ sync_module_states=True))
+ [2025-02-01 18:47:39,903][oumi][rank0][pid:11750][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,903][oumi][rank0][pid:11750][MainThread][INFO]][models.py:185] Building model using device_map: cuda:0 (DeviceRankInfo(world_size=4, rank=0, local_world_size=4, local_rank=0))...
+ [2025-02-01 18:47:39,903][oumi][rank0][pid:11750][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,904][oumi][rank0][pid:11750][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,946][oumi][rank0][pid:11750][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,678][oumi][rank0][pid:11750][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1744.52 examples/sec. Examples: 10000. Duration: 5.7 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,943][oumi][rank0][pid:11750][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:47,998][oumi][rank0][pid:11750][MainThread][INFO]][training.py:49] SFTConfig(output_dir='output/smollm2-17b-distill-r1-670b-math',
+ overwrite_output_dir=False,
+ do_train=False,
+ do_eval=False,
+ do_predict=False,
+ eval_strategy=<IntervalStrategy.NO: 'no'>,
+ prediction_loss_only=False,
+ per_device_train_batch_size=2,
+ per_device_eval_batch_size=8,
+ per_gpu_train_batch_size=None,
+ per_gpu_eval_batch_size=None,
+ gradient_accumulation_steps=2,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ torch_empty_cache_steps=1,
+ learning_rate=2e-05,
+ weight_decay=0.0,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ max_grad_norm=10.0,
+ num_train_epochs=1,
+ max_steps=-1,
+ lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>,
+ lr_scheduler_kwargs={},
+ warmup_ratio=0.1,
+ warmup_steps=0,
+ log_level='warning',
+ log_level_replica='warning',
+ log_on_each_node=True,
+ logging_dir='output/smollm2-17b-distill-r1-670b-math/runs/Feb01_18-47-47_sky-d954-bf996-370b-head-3sxnf23v-compute',
+ logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
+ logging_first_step=False,
+ logging_steps=10,
+ logging_nan_inf_filter=True,
+ save_strategy=<IntervalStrategy.NO: 'no'>,
+ save_steps=0,
+ save_total_limit=None,
+ save_safetensors=True,
+ save_on_each_node=False,
+ save_only_model=False,
+ restore_callback_states_from_checkpoint=False,
+ no_cuda=False,
+ use_cpu=False,
+ use_mps_device=False,
+ seed=42,
+ data_seed=None,
+ jit_mode_eval=False,
+ use_ipex=False,
+ bf16=False,
+ fp16=False,
+ fp16_opt_level='O1',
+ half_precision_backend='auto',
+ bf16_full_eval=False,
+ fp16_full_eval=False,
+ tf32=None,
+ local_rank=0,
+ ddp_backend=None,
+ tpu_num_cores=None,
+ tpu_metrics_debug=False,
+ debug=[],
+ dataloader_drop_last=False,
+ eval_steps=500,
+ dataloader_num_workers=8,
+ dataloader_prefetch_factor=32,
+ past_index=-1,
+ run_name='smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1',
+ disable_tqdm=False,
+ remove_unused_columns=True,
+ label_names=None,
+ load_best_model_at_end=False,
+ metric_for_best_model=None,
+ greater_is_better=None,
+ ignore_data_skip=False,
+ fsdp=[],
+ fsdp_min_num_params=0,
+ fsdp_config={'min_num_params': 0,
+ 'xla': False,
+ 'xla_fsdp_grad_ckpt': False,
+ 'xla_fsdp_v2': False},
+ fsdp_transformer_layer_cls_to_wrap=None,
+ accelerator_config=AcceleratorConfig(split_batches=False,
+ dispatch_batches=None,
+ even_batches=True,
+ use_seedable_sampler=True,
+ non_blocking=False,
+ gradient_accumulation_kwargs=None,
+ use_configured_state=False),
+ deepspeed=None,
+ label_smoothing_factor=0.0,
+ optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>,
+ optim_args=None,
+ adafactor=False,
+ group_by_length=False,
+ length_column_name='length',
+ report_to=['wandb', 'tensorboard'],
+ ddp_find_unused_parameters=False,
+ ddp_bucket_cap_mb=None,
+ ddp_broadcast_buffers=None,
+ dataloader_pin_memory=True,
+ dataloader_persistent_workers=False,
+ skip_memory_metrics=True,
+ use_legacy_prediction_loop=False,
+ push_to_hub=False,
+ resume_from_checkpoint=None,
+ hub_model_id=None,
+ hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>,
+ hub_token=None,
+ hub_private_repo=False,
+ hub_always_push=False,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ include_inputs_for_metrics=False,
+ eval_do_concat_batches=True,
+ fp16_backend='auto',
+ evaluation_strategy=None,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=None,
+ mp_parameters='',
+ auto_find_batch_size=False,
+ full_determinism=False,
+ torchdynamo=None,
+ ray_scope='last',
+ ddp_timeout=1800,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ dispatch_batches=None,
+ split_batches=None,
+ include_tokens_per_second=False,
+ include_num_input_tokens_seen=False,
+ neftune_noise_alpha=None,
+ optim_target_modules=None,
+ batch_eval_metrics=False,
+ eval_on_start=False,
+ use_liger_kernel=False,
+ eval_use_gather_object=False,
+ dataset_text_field=None,
+ packing=False,
+ max_seq_length=None,
+ dataset_num_proc=None,
+ dataset_batch_size=1000,
+ model_init_kwargs=None,
+ dataset_kwargs=None,
+ eval_packing=None,
+ num_of_sequences=1024,
+ chars_per_token=3.6,
+ use_liger=False)
+ [2025-02-01 18:47:48,072][oumi][rank0][pid:11750][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank0][pid:11750][MainThread][INFO]][train.py:312] Training init time: 10.796s
+ [2025-02-01 18:47:48,078][oumi][rank0][pid:11750][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,471][oumi][rank0][pid:11750][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,501][oumi][rank0][pid:11750][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,501][oumi][rank0][pid:11750][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 17.43 GB
+ [2025-02-01 18:52:35,501][oumi][rank0][pid:11750][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank0][pid:11750][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,074][oumi][rank0][pid:11750][MainThread][INFO]][hf_trainer.py:102] Model has been saved at output/smollm2-17b-distill-r1-670b-math
+ [2025-02-01 18:52:43,650][oumi][rank0][pid:11750][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
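
The rank-0 timestamps above bound the training loop; a quick sketch recovering wall-clock time from the two relevant log lines:

# Hedged sketch: wall-clock training time from the rank-0 log timestamps above.
from datetime import datetime

fmt = "%Y-%m-%d %H:%M:%S,%f"
start = datetime.strptime("2025-02-01 18:47:48,078", fmt)  # "Starting training..."
end = datetime.strptime("2025-02-01 18:52:35,471", fmt)    # "Training is Complete."
print((end - start).total_seconds())  # ~287 s, consistent with train_runtime=287.0781 in trainer_state.json
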
logs/rank_0001.log ADDED
@@ -0,0 +1,19 @@
+ [2025-02-01 18:47:39,024][oumi][rank1][pid:11751][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,328][oumi][rank1][pid:11751][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,328][oumi][rank1][pid:11751][MainThread][INFO]][models.py:185] Building model using device_map: cuda:1 (DeviceRankInfo(world_size=4, rank=1, local_world_size=4, local_rank=1))...
+ [2025-02-01 18:47:39,328][oumi][rank1][pid:11751][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,530][oumi][rank1][pid:11751][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,663][oumi][rank1][pid:11751][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,716][oumi][rank1][pid:11751][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1652.20 examples/sec. Examples: 10000. Duration: 6.1 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,984][oumi][rank1][pid:11751][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:48,077][oumi][rank1][pid:11751][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank1][pid:11751][MainThread][INFO]][train.py:312] Training init time: 10.795s
+ [2025-02-01 18:47:48,078][oumi][rank1][pid:11751][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,469][oumi][rank1][pid:11751][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,498][oumi][rank1][pid:11751][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,498][oumi][rank1][pid:11751][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 17.24 GB
+ [2025-02-01 18:52:35,498][oumi][rank1][pid:11751][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank1][pid:11751][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,653][oumi][rank1][pid:11751][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0002.log ADDED
@@ -0,0 +1,19 @@
+ [2025-02-01 18:47:39,024][oumi][rank2][pid:11752][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,330][oumi][rank2][pid:11752][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,330][oumi][rank2][pid:11752][MainThread][INFO]][models.py:185] Building model using device_map: cuda:2 (DeviceRankInfo(world_size=4, rank=2, local_world_size=4, local_rank=2))...
+ [2025-02-01 18:47:39,330][oumi][rank2][pid:11752][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,533][oumi][rank2][pid:11752][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,616][oumi][rank2][pid:11752][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,667][oumi][rank2][pid:11752][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1652.64 examples/sec. Examples: 10000. Duration: 6.1 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,937][oumi][rank2][pid:11752][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:48,077][oumi][rank2][pid:11752][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank2][pid:11752][MainThread][INFO]][train.py:312] Training init time: 10.795s
+ [2025-02-01 18:47:48,078][oumi][rank2][pid:11752][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,469][oumi][rank2][pid:11752][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,496][oumi][rank2][pid:11752][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,497][oumi][rank2][pid:11752][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 17.46 GB
+ [2025-02-01 18:52:35,497][oumi][rank2][pid:11752][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank2][pid:11752][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,655][oumi][rank2][pid:11752][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0003.log ADDED
@@ -0,0 +1,19 @@
+ [2025-02-01 18:47:39,024][oumi][rank3][pid:11753][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,326][oumi][rank3][pid:11753][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,326][oumi][rank3][pid:11753][MainThread][INFO]][models.py:185] Building model using device_map: cuda:3 (DeviceRankInfo(world_size=4, rank=3, local_world_size=4, local_rank=3))...
+ [2025-02-01 18:47:39,327][oumi][rank3][pid:11753][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,529][oumi][rank3][pid:11753][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,714][oumi][rank3][pid:11753][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,694][oumi][rank3][pid:11753][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1672.23 examples/sec. Examples: 10000. Duration: 6.0 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,964][oumi][rank3][pid:11753][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:48,076][oumi][rank3][pid:11753][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank3][pid:11753][MainThread][INFO]][train.py:312] Training init time: 10.795s
+ [2025-02-01 18:47:48,078][oumi][rank3][pid:11753][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,469][oumi][rank3][pid:11753][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,496][oumi][rank3][pid:11753][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,497][oumi][rank3][pid:11753][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 16.56 GB
+ [2025-02-01 18:52:35,497][oumi][rank3][pid:11753][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank3][pid:11753][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,652][oumi][rank3][pid:11753][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:656b06593f9a7fd9830d1adb5883c539f3ccc697fa6f9b565921cb86836e32af
+ size 3422777952
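
The weights themselves live in Git LFS; the file above is only a pointer that addresses the blob by SHA-256 and size. A sketch for verifying a downloaded copy against this pointer:

# Hedged sketch: check a local model.safetensors against the LFS pointer above.
import hashlib
import os

path = "model.safetensors"  # local copy, e.g. after `git lfs pull`
assert os.path.getsize(path) == 3422777952
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert digest.hexdigest() == "656b06593f9a7fd9830d1adb5883c539f3ccc697fa6f9b565921cb86836e32af"
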
runs/Feb01_18-47-47_sky-d954-bf996-370b-head-3sxnf23v-compute/events.out.tfevents.1738435669.sky-d954-bf996-370b-head-3sxnf23v-compute.11750.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31e126e7d84ce93b41ef1338794b75bc7718945a7b308c7028f605af6bf96c30
+ size 19018
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": {
+     "content": "<|im_start|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
telemetry/devices_info.txt ADDED
@@ -0,0 +1,5 @@
+ CPU cores: 48 CUDA devices: 4
+ device(0)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 76.58GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(1)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 77.26GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(2)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 77.26GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(3)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 77.12GiB Allocated: 0.0GiB Cached: 0.0GiB]
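
A dump like this can be approximated on any CUDA host; a rough sketch with torch (field names and formatting are approximate, not oumi's exact telemetry code):

# Hedged sketch: approximate the devices_info.txt dump with torch.
import os
import torch

print(f"CPU cores: {os.cpu_count()} CUDA devices: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"device({i})={props.name!r} Capability: {(props.major, props.minor)} "
          f"Total: {props.total_memory / 2**30:.2f}GiB")
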
telemetry/training_config.yaml ADDED
@@ -0,0 +1,165 @@
+ data:
+   train:
+     datasets:
+     - dataset_name: text_sft_jsonl
+       dataset_path: data/R1/math_10k_R1_outputs.jsonl
+       subset: null
+       split: train
+       dataset_kwargs: {}
+       sample_count: null
+       mixture_proportion: null
+       shuffle: false
+       seed: null
+       shuffle_buffer_size: 1000
+       trust_remote_code: false
+       transform_num_workers: null
+     collator_name: null
+     pack: false
+     stream: false
+     target_col: null
+     mixture_strategy: first_exhausted
+     seed: 42
+     use_async_dataset: false
+     use_torchdata: null
+   test:
+     datasets: []
+     collator_name: null
+     pack: false
+     stream: false
+     target_col: null
+     mixture_strategy: first_exhausted
+     seed: null
+     use_async_dataset: false
+     use_torchdata: null
+   validation:
+     datasets: []
+     collator_name: null
+     pack: false
+     stream: false
+     target_col: null
+     mixture_strategy: first_exhausted
+     seed: null
+     use_async_dataset: false
+     use_torchdata: null
+ model:
+   model_name: HuggingFaceTB/SmolLM2-1.7B-Instruct
+   adapter_model: null
+   tokenizer_name: null
+   tokenizer_pad_token: null
+   tokenizer_kwargs: {}
+   model_max_length: null
+   load_pretrained_weights: true
+   trust_remote_code: true
+   torch_dtype_str: bfloat16
+   compile: false
+   chat_template: null
+   attn_implementation: null
+   device_map: auto
+   model_kwargs: {}
+   enable_liger_kernel: false
+   shard_for_eval: false
+   freeze_layers: []
+ training:
+   use_peft: false
+   trainer_type: TRL_SFT
+   enable_gradient_checkpointing: true
+   gradient_checkpointing_kwargs:
+     use_reentrant: false
+   output_dir: output/smollm2-17b-distill-r1-670b-math
+   per_device_train_batch_size: 2
+   per_device_eval_batch_size: 8
+   gradient_accumulation_steps: 2
+   max_steps: -1
+   num_train_epochs: 1
+   save_epoch: false
+   save_steps: 0
+   save_final_model: true
+   seed: 42
+   run_name: smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1
+   metrics_function: null
+   log_level: info
+   dep_log_level: warning
+   enable_wandb: true
+   enable_tensorboard: true
+   logging_strategy: steps
+   logging_dir: null
+   logging_steps: 10
+   logging_first_step: false
+   eval_strategy: 'no'
+   eval_steps: 500
+   learning_rate: 2.0e-05
+   lr_scheduler_type: linear
+   lr_scheduler_kwargs: {}
+   warmup_ratio: 0.1
+   warmup_steps: null
+   optimizer: adamw_torch_fused
+   weight_decay: 0.0
+   adam_beta1: 0.9
+   adam_beta2: 0.999
+   adam_epsilon: 1.0e-08
+   sgd_momentum: 0.0
+   mixed_precision_dtype: NONE
+   compile: false
+   include_performance_metrics: false
+   include_alternative_mfu_metrics: false
+   log_model_summary: false
+   resume_from_checkpoint: null
+   try_resume_from_last_checkpoint: false
+   dataloader_num_workers: 8
+   dataloader_prefetch_factor: 32
+   dataloader_main_process_only: null
+   ddp_find_unused_parameters: false
+   max_grad_norm: 10.0
+   trainer_kwargs: {}
+   profiler:
+     save_dir: null
+     enable_cpu_profiling: false
+     enable_cuda_profiling: false
+     record_shapes: false
+     profile_memory: false
+     with_stack: false
+     with_flops: false
+     with_modules: false
+     row_limit: 50
+     schedule:
+       enable_schedule: false
+       wait: 0
+       warmup: 1
+       active: 3
+       repeat: 1
+       skip_first: 1
+   telemetry:
+     telemetry_dir: telemetry
+     collect_telemetry_for_all_ranks: false
+     track_gpu_temperature: false
+   empty_device_cache_steps: 1
+   nccl_default_timeout_minutes: null
+ peft:
+   lora_r: 8
+   lora_alpha: 8
+   lora_dropout: 0.0
+   lora_target_modules: null
+   lora_modules_to_save: null
+   lora_bias: none
+   lora_init_weights: DEFAULT
+   lora_task_type: CAUSAL_LM
+   q_lora: false
+   q_lora_bits: 4
+   bnb_4bit_quant_type: fp4
+   use_bnb_nested_quant: false
+   bnb_4bit_quant_storage: uint8
+   bnb_4bit_compute_dtype: float32
+   peft_save_mode: ADAPTER_ONLY
+ fsdp:
+   enable_fsdp: false
+   sharding_strategy: FULL_SHARD
+   cpu_offload: false
+   mixed_precision: null
+   backward_prefetch: BACKWARD_PRE
+   forward_prefetch: false
+   use_orig_params: null
+   state_dict_type: FULL_STATE_DICT
+   auto_wrap_policy: NO_WRAP
+   min_num_params: 100000
+   transformer_layer_cls: null
+   sync_module_states: true
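
A consistency check on the schedule implied by this config: with 4 ranks, per_device_train_batch_size 2, and gradient_accumulation_steps 2, each optimizer step consumes 16 examples, so one epoch over the 10,000-example dataset is 625 steps, exactly the global_step/max_steps recorded in trainer_state.json below.

# Simple arithmetic implied by the config above (no assumptions beyond WORLD_SIZE=4).
per_device_bs, grad_accum, world_size, n_examples = 2, 2, 4, 10_000
effective_bs = per_device_bs * grad_accum * world_size  # 16 examples per optimizer step
steps_per_epoch = n_examples // effective_bs            # 625, matches trainer_state.json
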
telemetry/world_size.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "LOCAL_WORLD_SIZE": 4,
+   "WORLD_SIZE": 4
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<repo_name>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<file_sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<jupyter_script>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "model_max_length": 8192,
+   "pad_token": "<|im_end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
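
The chat_template above is ChatML-style with a default SmolLM system prompt. A sketch rendering it via transformers (the repo id is assumed, as before):

# Hedged sketch: render the chat template shipped in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")  # assumed id
messages = [{"role": "user", "content": "What is 7 * 6?"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # <|im_start|>system ... <|im_start|>user ... <|im_start|>assistant
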
trainer_state.json ADDED
@@ -0,0 +1,476 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 625,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.016,
+       "grad_norm": 30.125,
+       "learning_rate": 3.1746031746031746e-06,
+       "loss": 2.5254,
+       "step": 10
+     },
+     {
+       "epoch": 0.032,
+       "grad_norm": 20.25,
+       "learning_rate": 6.349206349206349e-06,
+       "loss": 2.5254,
+       "step": 20
+     },
+     {
+       "epoch": 0.048,
+       "grad_norm": 14.8125,
+       "learning_rate": 9.523809523809525e-06,
+       "loss": 2.0496,
+       "step": 30
+     },
+     {
+       "epoch": 0.064,
+       "grad_norm": 10.9375,
+       "learning_rate": 1.2698412698412699e-05,
+       "loss": 1.4853,
+       "step": 40
+     },
+     {
+       "epoch": 0.08,
+       "grad_norm": 5.625,
+       "learning_rate": 1.5873015873015872e-05,
+       "loss": 0.8731,
+       "step": 50
+     },
+     {
+       "epoch": 0.096,
+       "grad_norm": 3.109375,
+       "learning_rate": 1.904761904761905e-05,
+       "loss": 0.6086,
+       "step": 60
+     },
+     {
+       "epoch": 0.112,
+       "grad_norm": 3.015625,
+       "learning_rate": 1.9750889679715305e-05,
+       "loss": 0.5745,
+       "step": 70
+     },
+     {
+       "epoch": 0.128,
+       "grad_norm": 2.375,
+       "learning_rate": 1.939501779359431e-05,
+       "loss": 0.5121,
+       "step": 80
+     },
+     {
+       "epoch": 0.144,
+       "grad_norm": 3.28125,
+       "learning_rate": 1.903914590747331e-05,
+       "loss": 0.4875,
+       "step": 90
+     },
+     {
+       "epoch": 0.16,
+       "grad_norm": 2.703125,
+       "learning_rate": 1.8683274021352315e-05,
+       "loss": 0.4997,
+       "step": 100
+     },
+     {
+       "epoch": 0.176,
+       "grad_norm": 2.203125,
+       "learning_rate": 1.832740213523132e-05,
+       "loss": 0.5004,
+       "step": 110
+     },
+     {
+       "epoch": 0.192,
+       "grad_norm": 2.109375,
+       "learning_rate": 1.7971530249110324e-05,
+       "loss": 0.5119,
+       "step": 120
+     },
+     {
+       "epoch": 0.208,
+       "grad_norm": 2.453125,
+       "learning_rate": 1.7615658362989325e-05,
+       "loss": 0.5088,
+       "step": 130
+     },
+     {
+       "epoch": 0.224,
+       "grad_norm": 2.171875,
+       "learning_rate": 1.725978647686833e-05,
+       "loss": 0.4842,
+       "step": 140
+     },
+     {
+       "epoch": 0.24,
+       "grad_norm": 3.65625,
+       "learning_rate": 1.690391459074733e-05,
+       "loss": 0.5193,
+       "step": 150
+     },
+     {
+       "epoch": 0.256,
+       "grad_norm": 2.609375,
+       "learning_rate": 1.6548042704626336e-05,
+       "loss": 0.4984,
+       "step": 160
+     },
+     {
+       "epoch": 0.272,
+       "grad_norm": 2.28125,
+       "learning_rate": 1.619217081850534e-05,
+       "loss": 0.5011,
+       "step": 170
+     },
+     {
+       "epoch": 0.288,
+       "grad_norm": 3.46875,
+       "learning_rate": 1.583629893238434e-05,
+       "loss": 0.5493,
+       "step": 180
+     },
+     {
+       "epoch": 0.304,
+       "grad_norm": 2.46875,
+       "learning_rate": 1.5480427046263346e-05,
+       "loss": 0.4869,
+       "step": 190
+     },
+     {
+       "epoch": 0.32,
+       "grad_norm": 2.734375,
+       "learning_rate": 1.5124555160142349e-05,
+       "loss": 0.4902,
+       "step": 200
+     },
+     {
+       "epoch": 0.336,
+       "grad_norm": 1.921875,
+       "learning_rate": 1.4768683274021354e-05,
+       "loss": 0.5013,
+       "step": 210
+     },
+     {
+       "epoch": 0.352,
+       "grad_norm": 2.125,
+       "learning_rate": 1.4412811387900356e-05,
+       "loss": 0.5061,
+       "step": 220
+     },
+     {
+       "epoch": 0.368,
+       "grad_norm": 2.265625,
+       "learning_rate": 1.4056939501779361e-05,
+       "loss": 0.4956,
+       "step": 230
+     },
+     {
+       "epoch": 0.384,
+       "grad_norm": 1.6015625,
+       "learning_rate": 1.3701067615658364e-05,
+       "loss": 0.4665,
+       "step": 240
+     },
+     {
+       "epoch": 0.4,
+       "grad_norm": 2.234375,
+       "learning_rate": 1.3345195729537369e-05,
+       "loss": 0.4605,
+       "step": 250
+     },
+     {
+       "epoch": 0.416,
+       "grad_norm": 2.1875,
+       "learning_rate": 1.298932384341637e-05,
+       "loss": 0.4728,
+       "step": 260
+     },
+     {
+       "epoch": 0.432,
+       "grad_norm": 2.46875,
+       "learning_rate": 1.2633451957295374e-05,
+       "loss": 0.5115,
+       "step": 270
+     },
+     {
+       "epoch": 0.448,
+       "grad_norm": 2.765625,
+       "learning_rate": 1.2277580071174377e-05,
+       "loss": 0.4873,
+       "step": 280
+     },
+     {
+       "epoch": 0.464,
+       "grad_norm": 2.140625,
+       "learning_rate": 1.1921708185053382e-05,
+       "loss": 0.5266,
+       "step": 290
+     },
+     {
+       "epoch": 0.48,
+       "grad_norm": 2.078125,
+       "learning_rate": 1.1565836298932385e-05,
+       "loss": 0.5175,
+       "step": 300
+     },
+     {
+       "epoch": 0.496,
+       "grad_norm": 2.328125,
+       "learning_rate": 1.120996441281139e-05,
+       "loss": 0.4702,
+       "step": 310
+     },
+     {
+       "epoch": 0.512,
+       "grad_norm": 2.734375,
+       "learning_rate": 1.0854092526690392e-05,
+       "loss": 0.5071,
+       "step": 320
+     },
+     {
+       "epoch": 0.528,
+       "grad_norm": 2.03125,
+       "learning_rate": 1.0498220640569397e-05,
+       "loss": 0.5155,
+       "step": 330
+     },
+     {
+       "epoch": 0.544,
+       "grad_norm": 2.53125,
+       "learning_rate": 1.01423487544484e-05,
+       "loss": 0.4964,
+       "step": 340
+     },
+     {
+       "epoch": 0.56,
+       "grad_norm": 2.984375,
+       "learning_rate": 9.786476868327403e-06,
+       "loss": 0.4867,
+       "step": 350
+     },
+     {
+       "epoch": 0.576,
+       "grad_norm": 2.828125,
+       "learning_rate": 9.430604982206405e-06,
+       "loss": 0.545,
+       "step": 360
+     },
+     {
+       "epoch": 0.592,
+       "grad_norm": 2.546875,
+       "learning_rate": 9.07473309608541e-06,
+       "loss": 0.4832,
+       "step": 370
+     },
+     {
+       "epoch": 0.608,
+       "grad_norm": 2.6875,
+       "learning_rate": 8.718861209964413e-06,
+       "loss": 0.4826,
+       "step": 380
+     },
+     {
+       "epoch": 0.624,
+       "grad_norm": 1.765625,
+       "learning_rate": 8.362989323843418e-06,
+       "loss": 0.4652,
+       "step": 390
+     },
+     {
+       "epoch": 0.64,
+       "grad_norm": 2.359375,
+       "learning_rate": 8.00711743772242e-06,
+       "loss": 0.5069,
+       "step": 400
+     },
+     {
+       "epoch": 0.656,
+       "grad_norm": 2.453125,
+       "learning_rate": 7.651245551601423e-06,
+       "loss": 0.5131,
+       "step": 410
+     },
+     {
+       "epoch": 0.672,
+       "grad_norm": 2.3125,
+       "learning_rate": 7.295373665480427e-06,
+       "loss": 0.4937,
+       "step": 420
+     },
+     {
+       "epoch": 0.688,
+       "grad_norm": 2.109375,
+       "learning_rate": 6.939501779359431e-06,
+       "loss": 0.4878,
+       "step": 430
+     },
+     {
+       "epoch": 0.704,
+       "grad_norm": 3.546875,
+       "learning_rate": 6.5836298932384346e-06,
+       "loss": 0.4821,
+       "step": 440
+     },
+     {
+       "epoch": 0.72,
+       "grad_norm": 2.046875,
+       "learning_rate": 6.227758007117438e-06,
+       "loss": 0.4486,
+       "step": 450
+     },
+     {
+       "epoch": 0.736,
+       "grad_norm": 2.71875,
+       "learning_rate": 5.871886120996442e-06,
+       "loss": 0.5022,
+       "step": 460
+     },
+     {
+       "epoch": 0.752,
+       "grad_norm": 2.53125,
+       "learning_rate": 5.516014234875445e-06,
+       "loss": 0.4977,
+       "step": 470
+     },
+     {
+       "epoch": 0.768,
+       "grad_norm": 2.6875,
+       "learning_rate": 5.160142348754449e-06,
+       "loss": 0.5076,
+       "step": 480
+     },
+     {
+       "epoch": 0.784,
+       "grad_norm": 2.546875,
+       "learning_rate": 4.8042704626334524e-06,
+       "loss": 0.451,
+       "step": 490
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 1.734375,
+       "learning_rate": 4.448398576512456e-06,
+       "loss": 0.4576,
+       "step": 500
+     },
+     {
+       "epoch": 0.816,
+       "grad_norm": 2.765625,
+       "learning_rate": 4.09252669039146e-06,
+       "loss": 0.4914,
+       "step": 510
+     },
+     {
+       "epoch": 0.832,
+       "grad_norm": 2.59375,
+       "learning_rate": 3.7366548042704632e-06,
+       "loss": 0.4938,
+       "step": 520
+     },
+     {
+       "epoch": 0.848,
+       "grad_norm": 2.625,
+       "learning_rate": 3.3807829181494666e-06,
+       "loss": 0.5218,
+       "step": 530
+     },
+     {
+       "epoch": 0.864,
+       "grad_norm": 1.8515625,
+       "learning_rate": 3.0249110320284703e-06,
+       "loss": 0.4694,
+       "step": 540
+     },
+     {
+       "epoch": 0.88,
+       "grad_norm": 2.375,
+       "learning_rate": 2.669039145907473e-06,
+       "loss": 0.5102,
+       "step": 550
+     },
+     {
+       "epoch": 0.896,
+       "grad_norm": 2.375,
+       "learning_rate": 2.313167259786477e-06,
+       "loss": 0.506,
+       "step": 560
+     },
+     {
+       "epoch": 0.912,
+       "grad_norm": 2.875,
+       "learning_rate": 1.9572953736654807e-06,
+       "loss": 0.4982,
+       "step": 570
+     },
+     {
+       "epoch": 0.928,
+       "grad_norm": 1.9140625,
+       "learning_rate": 1.6014234875444842e-06,
+       "loss": 0.5107,
+       "step": 580
+     },
+     {
+       "epoch": 0.944,
+       "grad_norm": 2.546875,
+       "learning_rate": 1.2455516014234877e-06,
+       "loss": 0.4857,
+       "step": 590
+     },
+     {
+       "epoch": 0.96,
+       "grad_norm": 1.8359375,
+       "learning_rate": 8.896797153024913e-07,
+       "loss": 0.4821,
+       "step": 600
+     },
+     {
+       "epoch": 0.976,
+       "grad_norm": 2.609375,
+       "learning_rate": 5.338078291814947e-07,
+       "loss": 0.5166,
+       "step": 610
+     },
+     {
+       "epoch": 0.992,
+       "grad_norm": 2.578125,
+       "learning_rate": 1.7793594306049826e-07,
+       "loss": 0.5116,
+       "step": 620
+     },
+     {
+       "epoch": 1.0,
+       "step": 625,
+       "total_flos": 9900164319805440.0,
+       "train_loss": 0.6107198246002197,
+       "train_runtime": 287.0781,
+       "train_samples_per_second": 34.834,
+       "train_steps_per_second": 2.177
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 625,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 0,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": false,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 9900164319805440.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
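
The log_history above records loss every 10 steps; a short sketch summarizing it:

# Hedged sketch: summarize the loss curve stored in trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)
losses = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(losses[0], losses[-1])  # (10, 2.5254), (620, 0.5116): loss drops ~5x over the epoch
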
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca5ba33e3d7a1ff4b6185b6649d17e444660dc9c669f108ba1cf9db15a4edb0b
+ size 5624
vocab.json ADDED
The diff for this file is too large to render. See raw diff