penfever committed
Commit 4744c5d · verified · 1 Parent(s): 6a654de

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 32,
+   "pad_token_id": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 130000,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers.js_config": {
+     "kv_cache_dtype": {
+       "fp16": "float16",
+       "q4f16": "float16"
+     }
+   },
+   "transformers_version": "4.45.2",
+   "use_cache": false,
+   "vocab_size": 49152
+ }
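
Since this is a stock Llama-architecture checkpoint, it loads with plain transformers. A minimal sketch; the repo id below is taken from "_name_or_path" and is an assumption, so substitute the actual Hub id of this upload:

# Hedged sketch: load the checkpoint described by config.json above.
# repo_id is assumed from "_name_or_path"; replace with this repo's actual id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
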
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 2,
+   "transformers_version": "4.45.2"
+ }
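
These generation defaults can be inspected programmatically; a small sketch (the repo id is assumed, as above):

# Hedged sketch: read the generation defaults shipped with the checkpoint.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")  # assumed id
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)  # 1 2 2
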
logs/rank_0000.log ADDED
@@ -0,0 +1,322 @@
+ [2025-02-01 18:47:39,616][oumi][rank0][pid:11750][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,618][oumi][rank0][pid:11750][MainThread][INFO]][train.py:174] TrainingConfig:
+ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='text_sft_jsonl',
+ dataset_path='data/R1/math_10k_R1_outputs.jsonl',
+ subset=None,
+ split='train',
+ dataset_kwargs={},
+ sample_count=None,
+ mixture_proportion=None,
+ shuffle=False,
+ seed=None,
+ shuffle_buffer_size=1000,
+ trust_remote_code=False,
+ transform_num_workers=None)],
+ collator_name=None,
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=42,
+ use_async_dataset=False,
+ use_torchdata=None),
+ test=DatasetSplitParams(datasets=[],
+ collator_name=None,
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=None),
+ validation=DatasetSplitParams(datasets=[],
+ collator_name=None,
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=None)),
+ model=ModelParams(model_name='HuggingFaceTB/SmolLM2-1.7B-Instruct',
+ adapter_model=None,
+ tokenizer_name=None,
+ tokenizer_pad_token=None,
+ tokenizer_kwargs={},
+ model_max_length=None,
+ load_pretrained_weights=True,
+ trust_remote_code=True,
+ torch_dtype_str='bfloat16',
+ compile=False,
+ chat_template=None,
+ attn_implementation=None,
+ device_map='auto',
+ model_kwargs={},
+ enable_liger_kernel=False,
+ shard_for_eval=False,
+ freeze_layers=[]),
+ training=TrainingParams(use_peft=False,
+ trainer_type=<TrainerType.TRL_SFT: 'trl_sft'>,
+ enable_gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ output_dir='output/smollm2-17b-distill-r1-670b-math',
+ per_device_train_batch_size=2,
+ per_device_eval_batch_size=8,
+ gradient_accumulation_steps=2,
+ max_steps=-1,
+ num_train_epochs=1,
+ save_epoch=False,
+ save_steps=0,
+ save_final_model=True,
+ seed=42,
+ run_name='smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1',
+ metrics_function=None,
+ log_level='info',
+ dep_log_level='warning',
+ enable_wandb=True,
+ enable_tensorboard=True,
+ logging_strategy='steps',
+ logging_dir=None,
+ logging_steps=10,
+ logging_first_step=False,
+ eval_strategy='no',
+ eval_steps=500,
+ learning_rate=2e-05,
+ lr_scheduler_type='linear',
+ lr_scheduler_kwargs={},
+ warmup_ratio=0.1,
+ warmup_steps=None,
+ optimizer='adamw_torch_fused',
+ weight_decay=0.0,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ sgd_momentum=0.0,
+ mixed_precision_dtype=<MixedPrecisionDtype.NONE: 'none'>,
+ compile=False,
+ include_performance_metrics=False,
+ include_alternative_mfu_metrics=False,
+ log_model_summary=False,
+ resume_from_checkpoint=None,
+ try_resume_from_last_checkpoint=False,
+ dataloader_num_workers=8,
+ dataloader_prefetch_factor=32,
+ dataloader_main_process_only=None,
+ ddp_find_unused_parameters=False,
+ max_grad_norm=10.0,
+ trainer_kwargs={},
+ profiler=ProfilerParams(save_dir=None,
+ enable_cpu_profiling=False,
+ enable_cuda_profiling=False,
+ record_shapes=False,
+ profile_memory=False,
+ with_stack=False,
+ with_flops=False,
+ with_modules=False,
+ row_limit=50,
+ schedule=ProfilerScheduleParams(enable_schedule=False,
+ wait=0,
+ warmup=1,
+ active=3,
+ repeat=1,
+ skip_first=1)),
+ telemetry=TelemetryParams(telemetry_dir='telemetry',
+ collect_telemetry_for_all_ranks=False,
+ track_gpu_temperature=False),
+ empty_device_cache_steps=1,
+ nccl_default_timeout_minutes=None),
+ peft=PeftParams(lora_r=8,
+ lora_alpha=8,
+ lora_dropout=0.0,
+ lora_target_modules=None,
+ lora_modules_to_save=None,
+ lora_bias='none',
+ lora_init_weights=<LoraWeightInitialization.DEFAULT: 'default'>,
+ lora_task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
+ q_lora=False,
+ q_lora_bits=4,
+ bnb_4bit_quant_type='fp4',
+ use_bnb_nested_quant=False,
+ bnb_4bit_quant_storage='uint8',
+ bnb_4bit_compute_dtype='float32',
+ peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
+ fsdp=FSDPParams(enable_fsdp=False,
+ sharding_strategy=<ShardingStrategy.FULL_SHARD: 'FULL_SHARD'>,
+ cpu_offload=False,
+ mixed_precision=None,
+ backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
+ forward_prefetch=False,
+ use_orig_params=None,
+ state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
+ auto_wrap_policy=<AutoWrapPolicy.NO_WRAP: 'NO_WRAP'>,
+ min_num_params=100000,
+ transformer_layer_cls=None,
+ sync_module_states=True))
+ [2025-02-01 18:47:39,903][oumi][rank0][pid:11750][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,903][oumi][rank0][pid:11750][MainThread][INFO]][models.py:185] Building model using device_map: cuda:0 (DeviceRankInfo(world_size=4, rank=0, local_world_size=4, local_rank=0))...
+ [2025-02-01 18:47:39,903][oumi][rank0][pid:11750][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,904][oumi][rank0][pid:11750][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,946][oumi][rank0][pid:11750][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,678][oumi][rank0][pid:11750][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1744.52 examples/sec. Examples: 10000. Duration: 5.7 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,943][oumi][rank0][pid:11750][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:47,998][oumi][rank0][pid:11750][MainThread][INFO]][training.py:49] SFTConfig(output_dir='output/smollm2-17b-distill-r1-670b-math',
+ overwrite_output_dir=False,
+ do_train=False,
+ do_eval=False,
+ do_predict=False,
+ eval_strategy=<IntervalStrategy.NO: 'no'>,
+ prediction_loss_only=False,
+ per_device_train_batch_size=2,
+ per_device_eval_batch_size=8,
+ per_gpu_train_batch_size=None,
+ per_gpu_eval_batch_size=None,
+ gradient_accumulation_steps=2,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ torch_empty_cache_steps=1,
+ learning_rate=2e-05,
+ weight_decay=0.0,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ max_grad_norm=10.0,
+ num_train_epochs=1,
+ max_steps=-1,
+ lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>,
+ lr_scheduler_kwargs={},
+ warmup_ratio=0.1,
+ warmup_steps=0,
+ log_level='warning',
+ log_level_replica='warning',
+ log_on_each_node=True,
+ logging_dir='output/smollm2-17b-distill-r1-670b-math/runs/Feb01_18-47-47_sky-d954-bf996-370b-head-3sxnf23v-compute',
+ logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
+ logging_first_step=False,
+ logging_steps=10,
+ logging_nan_inf_filter=True,
+ save_strategy=<IntervalStrategy.NO: 'no'>,
+ save_steps=0,
+ save_total_limit=None,
+ save_safetensors=True,
+ save_on_each_node=False,
+ save_only_model=False,
+ restore_callback_states_from_checkpoint=False,
+ no_cuda=False,
+ use_cpu=False,
+ use_mps_device=False,
+ seed=42,
+ data_seed=None,
+ jit_mode_eval=False,
+ use_ipex=False,
+ bf16=False,
+ fp16=False,
+ fp16_opt_level='O1',
+ half_precision_backend='auto',
+ bf16_full_eval=False,
+ fp16_full_eval=False,
+ tf32=None,
+ local_rank=0,
+ ddp_backend=None,
+ tpu_num_cores=None,
+ tpu_metrics_debug=False,
+ debug=[],
+ dataloader_drop_last=False,
+ eval_steps=500,
+ dataloader_num_workers=8,
+ dataloader_prefetch_factor=32,
+ past_index=-1,
+ run_name='smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1',
+ disable_tqdm=False,
+ remove_unused_columns=True,
+ label_names=None,
+ load_best_model_at_end=False,
+ metric_for_best_model=None,
+ greater_is_better=None,
+ ignore_data_skip=False,
+ fsdp=[],
+ fsdp_min_num_params=0,
+ fsdp_config={'min_num_params': 0,
+ 'xla': False,
+ 'xla_fsdp_grad_ckpt': False,
+ 'xla_fsdp_v2': False},
+ fsdp_transformer_layer_cls_to_wrap=None,
+ accelerator_config=AcceleratorConfig(split_batches=False,
+ dispatch_batches=None,
+ even_batches=True,
+ use_seedable_sampler=True,
+ non_blocking=False,
+ gradient_accumulation_kwargs=None,
+ use_configured_state=False),
+ deepspeed=None,
+ label_smoothing_factor=0.0,
+ optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>,
+ optim_args=None,
+ adafactor=False,
+ group_by_length=False,
+ length_column_name='length',
+ report_to=['wandb', 'tensorboard'],
+ ddp_find_unused_parameters=False,
+ ddp_bucket_cap_mb=None,
+ ddp_broadcast_buffers=None,
+ dataloader_pin_memory=True,
+ dataloader_persistent_workers=False,
+ skip_memory_metrics=True,
+ use_legacy_prediction_loop=False,
+ push_to_hub=False,
+ resume_from_checkpoint=None,
+ hub_model_id=None,
+ hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>,
+ hub_token=None,
+ hub_private_repo=False,
+ hub_always_push=False,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ include_inputs_for_metrics=False,
+ eval_do_concat_batches=True,
+ fp16_backend='auto',
+ evaluation_strategy=None,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=None,
+ mp_parameters='',
+ auto_find_batch_size=False,
+ full_determinism=False,
+ torchdynamo=None,
+ ray_scope='last',
+ ddp_timeout=1800,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ dispatch_batches=None,
+ split_batches=None,
+ include_tokens_per_second=False,
+ include_num_input_tokens_seen=False,
+ neftune_noise_alpha=None,
+ optim_target_modules=None,
+ batch_eval_metrics=False,
+ eval_on_start=False,
+ use_liger_kernel=False,
+ eval_use_gather_object=False,
+ dataset_text_field=None,
+ packing=False,
+ max_seq_length=None,
+ dataset_num_proc=None,
+ dataset_batch_size=1000,
+ model_init_kwargs=None,
+ dataset_kwargs=None,
+ eval_packing=None,
+ num_of_sequences=1024,
+ chars_per_token=3.6,
+ use_liger=False)
+ [2025-02-01 18:47:48,072][oumi][rank0][pid:11750][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank0][pid:11750][MainThread][INFO]][train.py:312] Training init time: 10.796s
+ [2025-02-01 18:47:48,078][oumi][rank0][pid:11750][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,471][oumi][rank0][pid:11750][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,501][oumi][rank0][pid:11750][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,501][oumi][rank0][pid:11750][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 17.43 GB
+ [2025-02-01 18:52:35,501][oumi][rank0][pid:11750][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank0][pid:11750][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,074][oumi][rank0][pid:11750][MainThread][INFO]][hf_trainer.py:102] Model has been saved at output/smollm2-17b-distill-r1-670b-math
+ [2025-02-01 18:52:43,650][oumi][rank0][pid:11750][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
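
The rank-0 timestamps above bound the training loop; a quick sketch recovering wall-clock time from the two relevant log lines:

# Hedged sketch: wall-clock training time from the rank-0 log timestamps above.
from datetime import datetime

fmt = "%Y-%m-%d %H:%M:%S,%f"
start = datetime.strptime("2025-02-01 18:47:48,078", fmt)  # "Starting training..."
end = datetime.strptime("2025-02-01 18:52:35,471", fmt)    # "Training is Complete."
print((end - start).total_seconds())  # ~287 s, consistent with train_runtime=287.0781 in trainer_state.json
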
logs/rank_0001.log ADDED
@@ -0,0 +1,19 @@
+ [2025-02-01 18:47:39,024][oumi][rank1][pid:11751][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,328][oumi][rank1][pid:11751][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,328][oumi][rank1][pid:11751][MainThread][INFO]][models.py:185] Building model using device_map: cuda:1 (DeviceRankInfo(world_size=4, rank=1, local_world_size=4, local_rank=1))...
+ [2025-02-01 18:47:39,328][oumi][rank1][pid:11751][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,530][oumi][rank1][pid:11751][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,663][oumi][rank1][pid:11751][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,716][oumi][rank1][pid:11751][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1652.20 examples/sec. Examples: 10000. Duration: 6.1 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,984][oumi][rank1][pid:11751][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:48,077][oumi][rank1][pid:11751][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank1][pid:11751][MainThread][INFO]][train.py:312] Training init time: 10.795s
+ [2025-02-01 18:47:48,078][oumi][rank1][pid:11751][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,469][oumi][rank1][pid:11751][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,498][oumi][rank1][pid:11751][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,498][oumi][rank1][pid:11751][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 17.24 GB
+ [2025-02-01 18:52:35,498][oumi][rank1][pid:11751][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank1][pid:11751][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,653][oumi][rank1][pid:11751][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0002.log ADDED
@@ -0,0 +1,19 @@
+ [2025-02-01 18:47:39,024][oumi][rank2][pid:11752][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,330][oumi][rank2][pid:11752][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,330][oumi][rank2][pid:11752][MainThread][INFO]][models.py:185] Building model using device_map: cuda:2 (DeviceRankInfo(world_size=4, rank=2, local_world_size=4, local_rank=2))...
+ [2025-02-01 18:47:39,330][oumi][rank2][pid:11752][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,533][oumi][rank2][pid:11752][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,616][oumi][rank2][pid:11752][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,667][oumi][rank2][pid:11752][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1652.64 examples/sec. Examples: 10000. Duration: 6.1 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,937][oumi][rank2][pid:11752][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:48,077][oumi][rank2][pid:11752][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank2][pid:11752][MainThread][INFO]][train.py:312] Training init time: 10.795s
+ [2025-02-01 18:47:48,078][oumi][rank2][pid:11752][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,469][oumi][rank2][pid:11752][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,496][oumi][rank2][pid:11752][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,497][oumi][rank2][pid:11752][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 17.46 GB
+ [2025-02-01 18:52:35,497][oumi][rank2][pid:11752][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank2][pid:11752][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,655][oumi][rank2][pid:11752][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0003.log ADDED
@@ -0,0 +1,19 @@
+ [2025-02-01 18:47:39,024][oumi][rank3][pid:11753][MainThread][INFO]][train.py:144] Resolved 'training.dataloader_num_workers=auto' to 'training.dataloader_num_workers=8'
+ [2025-02-01 18:47:39,326][oumi][rank3][pid:11753][MainThread][INFO]][models.py:180] Building model for distributed training (world_size: 4)...
+ [2025-02-01 18:47:39,326][oumi][rank3][pid:11753][MainThread][INFO]][models.py:185] Building model using device_map: cuda:3 (DeviceRankInfo(world_size=4, rank=3, local_world_size=4, local_rank=3))...
+ [2025-02-01 18:47:39,327][oumi][rank3][pid:11753][MainThread][INFO]][models.py:255] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.
+ [2025-02-01 18:47:41,529][oumi][rank3][pid:11753][MainThread][INFO]][base_map_dataset.py:68] Creating map dataset (type: TextSftJsonLinesDataset) dataset_name: 'text_sft_jsonl', dataset_path: 'None'...
+ [2025-02-01 18:47:41,714][oumi][rank3][pid:11753][MainThread][INFO]][base_map_dataset.py:297] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])
+ [2025-02-01 18:47:47,694][oumi][rank3][pid:11753][MainThread][INFO]][base_map_dataset.py:361] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1672.23 examples/sec. Examples: 10000. Duration: 6.0 sec. Transform workers: 1.
+ [2025-02-01 18:47:47,964][oumi][rank3][pid:11753][MainThread][INFO]][torch_profiler_utils.py:150] PROF: Torch Profiler disabled!
+ [2025-02-01 18:47:48,076][oumi][rank3][pid:11753][MainThread][INFO]][device_utils.py:283] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=7019.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=70.637, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+ [2025-02-01 18:47:48,078][oumi][rank3][pid:11753][MainThread][INFO]][train.py:312] Training init time: 10.795s
+ [2025-02-01 18:47:48,078][oumi][rank3][pid:11753][MainThread][INFO]][train.py:313] Starting training... (TrainerType.TRL_SFT, transformers: 4.45.2)
+ [2025-02-01 18:52:35,469][oumi][rank3][pid:11753][MainThread][INFO]][train.py:320] Training is Complete.
+ [2025-02-01 18:52:35,496][oumi][rank3][pid:11753][MainThread][INFO]][device_utils.py:283] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=4, used_memory_mb=21283.0, temperature=43, fan_speed=None, fan_speeds=None, power_usage_watts=181.852, power_limit_watts=400.0, gpu_utilization=54, memory_utilization=14, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+ [2025-02-01 18:52:35,497][oumi][rank3][pid:11753][MainThread][INFO]][torch_utils.py:117] Peak GPU memory usage: 16.56 GB
+ [2025-02-01 18:52:35,497][oumi][rank3][pid:11753][MainThread][INFO]][train.py:327] Saving final state...
+ [2025-02-01 18:52:35,504][oumi][rank3][pid:11753][MainThread][INFO]][train.py:332] Saving final model...
+ [2025-02-01 18:52:43,652][oumi][rank3][pid:11753][MainThread][INFO]][train.py:339]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:656b06593f9a7fd9830d1adb5883c539f3ccc697fa6f9b565921cb86836e32af
+ size 3422777952
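
The weights themselves live in Git LFS; the file above is only a pointer that addresses the blob by SHA-256 and size. A sketch for verifying a downloaded copy against this pointer:

# Hedged sketch: check a local model.safetensors against the LFS pointer above.
import hashlib
import os

path = "model.safetensors"  # local copy, e.g. after `git lfs pull`
assert os.path.getsize(path) == 3422777952
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert digest.hexdigest() == "656b06593f9a7fd9830d1adb5883c539f3ccc697fa6f9b565921cb86836e32af"
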
runs/Feb01_18-47-47_sky-d954-bf996-370b-head-3sxnf23v-compute/events.out.tfevents.1738435669.sky-d954-bf996-370b-head-3sxnf23v-compute.11750.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31e126e7d84ce93b41ef1338794b75bc7718945a7b308c7028f605af6bf96c30
+ size 19018
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": {
+     "content": "<|im_start|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
telemetry/devices_info.txt ADDED
@@ -0,0 +1,5 @@
+ CPU cores: 48 CUDA devices: 4
+ device(0)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 76.58GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(1)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 77.26GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(2)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 77.26GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(3)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.15GiB Free: 77.12GiB Allocated: 0.0GiB Cached: 0.0GiB]
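
A dump like this can be approximated on any CUDA host; a rough sketch with torch (field names and formatting are approximate, not oumi's exact telemetry code):

# Hedged sketch: approximate the devices_info.txt dump with torch.
import os
import torch

print(f"CPU cores: {os.cpu_count()} CUDA devices: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"device({i})={props.name!r} Capability: {(props.major, props.minor)} "
          f"Total: {props.total_memory / 2**30:.2f}GiB")
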
telemetry/training_config.yaml ADDED
@@ -0,0 +1,165 @@
+ data:
+   train:
+     datasets:
+     - dataset_name: text_sft_jsonl
+       dataset_path: data/R1/math_10k_R1_outputs.jsonl
+       subset: null
+       split: train
+       dataset_kwargs: {}
+       sample_count: null
+       mixture_proportion: null
+       shuffle: false
+       seed: null
+       shuffle_buffer_size: 1000
+       trust_remote_code: false
+       transform_num_workers: null
+     collator_name: null
+     pack: false
+     stream: false
+     target_col: null
+     mixture_strategy: first_exhausted
+     seed: 42
+     use_async_dataset: false
+     use_torchdata: null
+   test:
+     datasets: []
+     collator_name: null
+     pack: false
+     stream: false
+     target_col: null
+     mixture_strategy: first_exhausted
+     seed: null
+     use_async_dataset: false
+     use_torchdata: null
+   validation:
+     datasets: []
+     collator_name: null
+     pack: false
+     stream: false
+     target_col: null
+     mixture_strategy: first_exhausted
+     seed: null
+     use_async_dataset: false
+     use_torchdata: null
+ model:
+   model_name: HuggingFaceTB/SmolLM2-1.7B-Instruct
+   adapter_model: null
+   tokenizer_name: null
+   tokenizer_pad_token: null
+   tokenizer_kwargs: {}
+   model_max_length: null
+   load_pretrained_weights: true
+   trust_remote_code: true
+   torch_dtype_str: bfloat16
+   compile: false
+   chat_template: null
+   attn_implementation: null
+   device_map: auto
+   model_kwargs: {}
+   enable_liger_kernel: false
+   shard_for_eval: false
+   freeze_layers: []
+ training:
+   use_peft: false
+   trainer_type: TRL_SFT
+   enable_gradient_checkpointing: true
+   gradient_checkpointing_kwargs:
+     use_reentrant: false
+   output_dir: output/smollm2-17b-distill-r1-670b-math
+   per_device_train_batch_size: 2
+   per_device_eval_batch_size: 8
+   gradient_accumulation_steps: 2
+   max_steps: -1
+   num_train_epochs: 1
+   save_epoch: false
+   save_steps: 0
+   save_final_model: true
+   seed: 42
+   run_name: smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1
+   metrics_function: null
+   log_level: info
+   dep_log_level: warning
+   enable_wandb: true
+   enable_tensorboard: true
+   logging_strategy: steps
+   logging_dir: null
+   logging_steps: 10
+   logging_first_step: false
+   eval_strategy: 'no'
+   eval_steps: 500
+   learning_rate: 2.0e-05
+   lr_scheduler_type: linear
+   lr_scheduler_kwargs: {}
+   warmup_ratio: 0.1
+   warmup_steps: null
+   optimizer: adamw_torch_fused
+   weight_decay: 0.0
+   adam_beta1: 0.9
+   adam_beta2: 0.999
+   adam_epsilon: 1.0e-08
+   sgd_momentum: 0.0
+   mixed_precision_dtype: NONE
+   compile: false
+   include_performance_metrics: false
+   include_alternative_mfu_metrics: false
+   log_model_summary: false
+   resume_from_checkpoint: null
+   try_resume_from_last_checkpoint: false
+   dataloader_num_workers: 8
+   dataloader_prefetch_factor: 32
+   dataloader_main_process_only: null
+   ddp_find_unused_parameters: false
+   max_grad_norm: 10.0
+   trainer_kwargs: {}
+   profiler:
+     save_dir: null
+     enable_cpu_profiling: false
+     enable_cuda_profiling: false
+     record_shapes: false
+     profile_memory: false
+     with_stack: false
+     with_flops: false
+     with_modules: false
+     row_limit: 50
+     schedule:
+       enable_schedule: false
+       wait: 0
+       warmup: 1
+       active: 3
+       repeat: 1
+       skip_first: 1
+   telemetry:
+     telemetry_dir: telemetry
+     collect_telemetry_for_all_ranks: false
+     track_gpu_temperature: false
+   empty_device_cache_steps: 1
+   nccl_default_timeout_minutes: null
+ peft:
+   lora_r: 8
+   lora_alpha: 8
+   lora_dropout: 0.0
+   lora_target_modules: null
+   lora_modules_to_save: null
+   lora_bias: none
+   lora_init_weights: DEFAULT
+   lora_task_type: CAUSAL_LM
+   q_lora: false
+   q_lora_bits: 4
+   bnb_4bit_quant_type: fp4
+   use_bnb_nested_quant: false
+   bnb_4bit_quant_storage: uint8
+   bnb_4bit_compute_dtype: float32
+   peft_save_mode: ADAPTER_ONLY
+ fsdp:
+   enable_fsdp: false
+   sharding_strategy: FULL_SHARD
+   cpu_offload: false
+   mixed_precision: null
+   backward_prefetch: BACKWARD_PRE
+   forward_prefetch: false
+   use_orig_params: null
+   state_dict_type: FULL_STATE_DICT
+   auto_wrap_policy: NO_WRAP
+   min_num_params: 100000
+   transformer_layer_cls: null
+   sync_module_states: true
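
A consistency check on the schedule implied by this config: with 4 ranks, per_device_train_batch_size 2, and gradient_accumulation_steps 2, each optimizer step consumes 16 examples, so one epoch over the 10,000-example dataset is 625 steps, exactly the global_step/max_steps recorded in trainer_state.json below.

# Simple arithmetic implied by the config above (no assumptions beyond WORLD_SIZE=4).
per_device_bs, grad_accum, world_size, n_examples = 2, 2, 4, 10_000
effective_bs = per_device_bs * grad_accum * world_size  # 16 examples per optimizer step
steps_per_epoch = n_examples // effective_bs            # 625, matches trainer_state.json
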
telemetry/world_size.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "LOCAL_WORLD_SIZE": 4,
+   "WORLD_SIZE": 4
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<repo_name>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<file_sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<jupyter_script>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "model_max_length": 8192,
+   "pad_token": "<|im_end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
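
The chat_template above is ChatML-style with a default SmolLM system prompt. A sketch rendering it via transformers (the repo id is assumed, as before):

# Hedged sketch: render the chat template shipped in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")  # assumed id
messages = [{"role": "user", "content": "What is 7 * 6?"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # <|im_start|>system ... <|im_start|>user ... <|im_start|>assistant
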
trainer_state.json ADDED
@@ -0,0 +1,476 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 625,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.016,
+       "grad_norm": 30.125,
+       "learning_rate": 3.1746031746031746e-06,
+       "loss": 2.5254,
+       "step": 10
+     },
+     {
+       "epoch": 0.032,
+       "grad_norm": 20.25,
+       "learning_rate": 6.349206349206349e-06,
+       "loss": 2.5254,
+       "step": 20
+     },
+     {
+       "epoch": 0.048,
+       "grad_norm": 14.8125,
+       "learning_rate": 9.523809523809525e-06,
+       "loss": 2.0496,
+       "step": 30
+     },
+     {
+       "epoch": 0.064,
+       "grad_norm": 10.9375,
+       "learning_rate": 1.2698412698412699e-05,
+       "loss": 1.4853,
+       "step": 40
+     },
+     {
+       "epoch": 0.08,
+       "grad_norm": 5.625,
+       "learning_rate": 1.5873015873015872e-05,
+       "loss": 0.8731,
+       "step": 50
+     },
+     {
+       "epoch": 0.096,
+       "grad_norm": 3.109375,
+       "learning_rate": 1.904761904761905e-05,
+       "loss": 0.6086,
+       "step": 60
+     },
+     {
+       "epoch": 0.112,
+       "grad_norm": 3.015625,
+       "learning_rate": 1.9750889679715305e-05,
+       "loss": 0.5745,
+       "step": 70
+     },
+     {
+       "epoch": 0.128,
+       "grad_norm": 2.375,
+       "learning_rate": 1.939501779359431e-05,
+       "loss": 0.5121,
+       "step": 80
+     },
+     {
+       "epoch": 0.144,
+       "grad_norm": 3.28125,
+       "learning_rate": 1.903914590747331e-05,
+       "loss": 0.4875,
+       "step": 90
+     },
+     {
+       "epoch": 0.16,
+       "grad_norm": 2.703125,
+       "learning_rate": 1.8683274021352315e-05,
+       "loss": 0.4997,
+       "step": 100
+     },
+     {
+       "epoch": 0.176,
+       "grad_norm": 2.203125,
+       "learning_rate": 1.832740213523132e-05,
+       "loss": 0.5004,
+       "step": 110
+     },
+     {
+       "epoch": 0.192,
+       "grad_norm": 2.109375,
+       "learning_rate": 1.7971530249110324e-05,
+       "loss": 0.5119,
+       "step": 120
+     },
+     {
+       "epoch": 0.208,
+       "grad_norm": 2.453125,
+       "learning_rate": 1.7615658362989325e-05,
+       "loss": 0.5088,
+       "step": 130
+     },
+     {
+       "epoch": 0.224,
+       "grad_norm": 2.171875,
+       "learning_rate": 1.725978647686833e-05,
+       "loss": 0.4842,
+       "step": 140
+     },
+     {
+       "epoch": 0.24,
+       "grad_norm": 3.65625,
+       "learning_rate": 1.690391459074733e-05,
+       "loss": 0.5193,
+       "step": 150
+     },
+     {
+       "epoch": 0.256,
+       "grad_norm": 2.609375,
+       "learning_rate": 1.6548042704626336e-05,
+       "loss": 0.4984,
+       "step": 160
+     },
+     {
+       "epoch": 0.272,
+       "grad_norm": 2.28125,
+       "learning_rate": 1.619217081850534e-05,
+       "loss": 0.5011,
+       "step": 170
+     },
+     {
+       "epoch": 0.288,
+       "grad_norm": 3.46875,
+       "learning_rate": 1.583629893238434e-05,
+       "loss": 0.5493,
+       "step": 180
+     },
+     {
+       "epoch": 0.304,
+       "grad_norm": 2.46875,
+       "learning_rate": 1.5480427046263346e-05,
+       "loss": 0.4869,
+       "step": 190
+     },
+     {
+       "epoch": 0.32,
+       "grad_norm": 2.734375,
+       "learning_rate": 1.5124555160142349e-05,
+       "loss": 0.4902,
+       "step": 200
+     },
+     {
+       "epoch": 0.336,
+       "grad_norm": 1.921875,
+       "learning_rate": 1.4768683274021354e-05,
+       "loss": 0.5013,
+       "step": 210
+     },
+     {
+       "epoch": 0.352,
+       "grad_norm": 2.125,
+       "learning_rate": 1.4412811387900356e-05,
+       "loss": 0.5061,
+       "step": 220
+     },
+     {
+       "epoch": 0.368,
+       "grad_norm": 2.265625,
+       "learning_rate": 1.4056939501779361e-05,
+       "loss": 0.4956,
+       "step": 230
+     },
+     {
+       "epoch": 0.384,
+       "grad_norm": 1.6015625,
+       "learning_rate": 1.3701067615658364e-05,
+       "loss": 0.4665,
+       "step": 240
+     },
+     {
+       "epoch": 0.4,
+       "grad_norm": 2.234375,
+       "learning_rate": 1.3345195729537369e-05,
+       "loss": 0.4605,
+       "step": 250
+     },
+     {
+       "epoch": 0.416,
+       "grad_norm": 2.1875,
+       "learning_rate": 1.298932384341637e-05,
+       "loss": 0.4728,
+       "step": 260
+     },
+     {
+       "epoch": 0.432,
+       "grad_norm": 2.46875,
+       "learning_rate": 1.2633451957295374e-05,
+       "loss": 0.5115,
+       "step": 270
+     },
+     {
+       "epoch": 0.448,
+       "grad_norm": 2.765625,
+       "learning_rate": 1.2277580071174377e-05,
+       "loss": 0.4873,
+       "step": 280
+     },
+     {
+       "epoch": 0.464,
+       "grad_norm": 2.140625,
+       "learning_rate": 1.1921708185053382e-05,
+       "loss": 0.5266,
+       "step": 290
+     },
+     {
+       "epoch": 0.48,
+       "grad_norm": 2.078125,
+       "learning_rate": 1.1565836298932385e-05,
+       "loss": 0.5175,
+       "step": 300
+     },
+     {
+       "epoch": 0.496,
+       "grad_norm": 2.328125,
+       "learning_rate": 1.120996441281139e-05,
+       "loss": 0.4702,
+       "step": 310
+     },
+     {
+       "epoch": 0.512,
+       "grad_norm": 2.734375,
+       "learning_rate": 1.0854092526690392e-05,
+       "loss": 0.5071,
+       "step": 320
+     },
+     {
+       "epoch": 0.528,
+       "grad_norm": 2.03125,
+       "learning_rate": 1.0498220640569397e-05,
+       "loss": 0.5155,
+       "step": 330
+     },
+     {
+       "epoch": 0.544,
+       "grad_norm": 2.53125,
+       "learning_rate": 1.01423487544484e-05,
+       "loss": 0.4964,
+       "step": 340
+     },
+     {
+       "epoch": 0.56,
+       "grad_norm": 2.984375,
+       "learning_rate": 9.786476868327403e-06,
+       "loss": 0.4867,
+       "step": 350
+     },
+     {
+       "epoch": 0.576,
+       "grad_norm": 2.828125,
+       "learning_rate": 9.430604982206405e-06,
+       "loss": 0.545,
+       "step": 360
+     },
+     {
+       "epoch": 0.592,
+       "grad_norm": 2.546875,
+       "learning_rate": 9.07473309608541e-06,
+       "loss": 0.4832,
+       "step": 370
+     },
+     {
+       "epoch": 0.608,
+       "grad_norm": 2.6875,
+       "learning_rate": 8.718861209964413e-06,
+       "loss": 0.4826,
+       "step": 380
+     },
+     {
+       "epoch": 0.624,
+       "grad_norm": 1.765625,
+       "learning_rate": 8.362989323843418e-06,
+       "loss": 0.4652,
+       "step": 390
+     },
+     {
+       "epoch": 0.64,
+       "grad_norm": 2.359375,
+       "learning_rate": 8.00711743772242e-06,
+       "loss": 0.5069,
+       "step": 400
+     },
+     {
+       "epoch": 0.656,
+       "grad_norm": 2.453125,
+       "learning_rate": 7.651245551601423e-06,
+       "loss": 0.5131,
+       "step": 410
+     },
+     {
+       "epoch": 0.672,
+       "grad_norm": 2.3125,
+       "learning_rate": 7.295373665480427e-06,
+       "loss": 0.4937,
+       "step": 420
+     },
+     {
+       "epoch": 0.688,
+       "grad_norm": 2.109375,
+       "learning_rate": 6.939501779359431e-06,
+       "loss": 0.4878,
+       "step": 430
+     },
+     {
+       "epoch": 0.704,
+       "grad_norm": 3.546875,
+       "learning_rate": 6.5836298932384346e-06,
+       "loss": 0.4821,
+       "step": 440
+     },
+     {
+       "epoch": 0.72,
+       "grad_norm": 2.046875,
+       "learning_rate": 6.227758007117438e-06,
+       "loss": 0.4486,
+       "step": 450
+     },
+     {
+       "epoch": 0.736,
+       "grad_norm": 2.71875,
+       "learning_rate": 5.871886120996442e-06,
+       "loss": 0.5022,
+       "step": 460
+     },
+     {
+       "epoch": 0.752,
+       "grad_norm": 2.53125,
+       "learning_rate": 5.516014234875445e-06,
+       "loss": 0.4977,
+       "step": 470
+     },
+     {
+       "epoch": 0.768,
+       "grad_norm": 2.6875,
+       "learning_rate": 5.160142348754449e-06,
+       "loss": 0.5076,
+       "step": 480
+     },
+     {
+       "epoch": 0.784,
+       "grad_norm": 2.546875,
+       "learning_rate": 4.8042704626334524e-06,
+       "loss": 0.451,
+       "step": 490
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 1.734375,
+       "learning_rate": 4.448398576512456e-06,
+       "loss": 0.4576,
+       "step": 500
+     },
+     {
+       "epoch": 0.816,
+       "grad_norm": 2.765625,
+       "learning_rate": 4.09252669039146e-06,
+       "loss": 0.4914,
+       "step": 510
+     },
+     {
+       "epoch": 0.832,
+       "grad_norm": 2.59375,
+       "learning_rate": 3.7366548042704632e-06,
+       "loss": 0.4938,
+       "step": 520
+     },
+     {
+       "epoch": 0.848,
+       "grad_norm": 2.625,
+       "learning_rate": 3.3807829181494666e-06,
+       "loss": 0.5218,
+       "step": 530
+     },
+     {
+       "epoch": 0.864,
+       "grad_norm": 1.8515625,
+       "learning_rate": 3.0249110320284703e-06,
+       "loss": 0.4694,
+       "step": 540
+     },
+     {
+       "epoch": 0.88,
+       "grad_norm": 2.375,
+       "learning_rate": 2.669039145907473e-06,
+       "loss": 0.5102,
+       "step": 550
+     },
+     {
+       "epoch": 0.896,
+       "grad_norm": 2.375,
+       "learning_rate": 2.313167259786477e-06,
+       "loss": 0.506,
+       "step": 560
+     },
+     {
+       "epoch": 0.912,
+       "grad_norm": 2.875,
+       "learning_rate": 1.9572953736654807e-06,
+       "loss": 0.4982,
+       "step": 570
+     },
+     {
+       "epoch": 0.928,
+       "grad_norm": 1.9140625,
+       "learning_rate": 1.6014234875444842e-06,
+       "loss": 0.5107,
+       "step": 580
+     },
+     {
+       "epoch": 0.944,
+       "grad_norm": 2.546875,
+       "learning_rate": 1.2455516014234877e-06,
+       "loss": 0.4857,
+       "step": 590
+     },
+     {
+       "epoch": 0.96,
+       "grad_norm": 1.8359375,
+       "learning_rate": 8.896797153024913e-07,
+       "loss": 0.4821,
+       "step": 600
+     },
+     {
+       "epoch": 0.976,
+       "grad_norm": 2.609375,
+       "learning_rate": 5.338078291814947e-07,
+       "loss": 0.5166,
+       "step": 610
+     },
+     {
+       "epoch": 0.992,
+       "grad_norm": 2.578125,
+       "learning_rate": 1.7793594306049826e-07,
+       "loss": 0.5116,
+       "step": 620
+     },
+     {
+       "epoch": 1.0,
+       "step": 625,
+       "total_flos": 9900164319805440.0,
+       "train_loss": 0.6107198246002197,
+       "train_runtime": 287.0781,
+       "train_samples_per_second": 34.834,
+       "train_steps_per_second": 2.177
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 625,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 0,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": false,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 9900164319805440.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
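
The log_history above records loss every 10 steps; a short sketch summarizing it:

# Hedged sketch: summarize the loss curve stored in trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)
losses = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(losses[0], losses[-1])  # (10, 2.5254), (620, 0.5116): loss drops ~5x over the epoch
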
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca5ba33e3d7a1ff4b6185b6649d17e444660dc9c669f108ba1cf9db15a4edb0b
+ size 5624
vocab.json ADDED
The diff for this file is too large to render. See raw diff