diff --git "a/train.log" "b/train.log" new file mode 100644--- /dev/null +++ "b/train.log" @@ -0,0 +1,12293 @@ +2023-08-06 00:54:15.581 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-54-15_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:54:15.582 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:54:15.582 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-54-15_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:54:15.582 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:54:15.584 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-54-15_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:54:15.585 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.740 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.741 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.741 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.742 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.743 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.743 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.749 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.750 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.750 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.751 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.752 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.753 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.754 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.756 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:55:05.755 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_00-55-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 00:55:05.757 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 00:57:00.361 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.362 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:00.364 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.364 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:00.367 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.367 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:00.367 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.368 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:00.371 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.371 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:00.374 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.374 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:00.398 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:00.398 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:01.497 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 00:57:01.497 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 00:57:37.486 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.487 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 00:57:37.790 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:38.205 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:38.329 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:38.345 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:38.479 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:38.481 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:38.603 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 00:57:39.108 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:15:27.970 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-27_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.970 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.971 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-26_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.971 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.976 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-27_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.977 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.977 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-26_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.978 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.978 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-26_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.979 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.979 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-27_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.980 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.981 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-26_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.982 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:27.982 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-15-26_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:15:27.983 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:15:45.685 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:45.685 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:45.688 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:45.688 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:45.692 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:45.693 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:45.976 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:45.977 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:45.982 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:45.983 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:46.021 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:46.021 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:46.027 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:46.028 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:15:48.519 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:15:48.519 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:16:04.920 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:04.920 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:04.921 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:05.123 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:05.173 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:05.173 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:05.310 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:05.329 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:05.391 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:05.446 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:05.674 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:05.875 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:05.898 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:06.322 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:16:07.557 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:16:07.997 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:21:44.392 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.392 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.392 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.393 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.393 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.393 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.393 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.394 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.394 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.395 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.399 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.400 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.399 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.400 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-21-43_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:21:44.400 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:21:44.401 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:22:01.817 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:01.818 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:01.962 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:01.962 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:02.008 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:02.008 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:02.017 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:02.018 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:02.280 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:02.280 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:02.285 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:02.285 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:02.301 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:02.302 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:04.771 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:22:04.772 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:22:21.199 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.350 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.350 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.389 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:21.515 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.527 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.528 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.583 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:21.877 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:21.880 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:21.933 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:21.955 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:22.043 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:22.155 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:22:23.963 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:22:24.433 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:23:53.580 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.581 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.583 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.583 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.584 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.585 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.588 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.588 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.588 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.589 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.589 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.589 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.589 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.590 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:23:53.589 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-23-52_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:23:53.590 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:24:10.722 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:10.722 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:10.751 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:10.752 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:10.760 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:10.760 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:11.032 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:11.032 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:11.034 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:11.034 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:11.045 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:11.045 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:11.125 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:11.126 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:13.675 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:24:13.675 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:24:29.725 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:29.725 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:29.856 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:30.098 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:30.174 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:30.174 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:30.248 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:30.385 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:30.392 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:30.392 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:30.401 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:30.620 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:30.827 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:30.934 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:24:32.966 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:24:33.212 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:33:48.908 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.908 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.908 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.909 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.910 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.910 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.910 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.911 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.911 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.912 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.917 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.918 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.918 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.919 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:33:48.919 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-33-48_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:33:48.920 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:34:06.657 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.658 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:06.670 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.670 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:06.672 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.673 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:06.673 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.674 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:06.696 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.696 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:06.704 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.705 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:06.739 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:06.739 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:09.632 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:34:09.632 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:34:25.804 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:25.804 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:25.805 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:25.902 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:25.986 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:26.055 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:26.080 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:26.155 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:26.216 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:26.270 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:26.333 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:26.486 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:26.491 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:27.167 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:34:29.260 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:34:29.690 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:49:30.181 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.182 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.186 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.187 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.191 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.191 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.191 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.192 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.192 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.193 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.195 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.196 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.195 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.196 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:30.197 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-49-29_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:49:30.198 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:49:47.255 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.255 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:47.324 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.325 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:47.335 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.335 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:47.353 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.354 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:47.424 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.425 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:47.446 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.446 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:47.447 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:47.448 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:49:49.944 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:49:49.944 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:50:06.581 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.581 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.612 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.636 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.785 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:06.814 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.814 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.858 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:06.867 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:06.904 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:06.952 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:07.138 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:07.392 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:07.488 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:50:08.880 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:50:08.986 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:57:39.229 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.230 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.230 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.231 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.231 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.231 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.232 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.232 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.232 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.233 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.233 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.234 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.234 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.235 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:39.243 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_01-57-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 01:57:39.245 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 01:57:56.636 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.637 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:56.729 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.729 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:56.810 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.811 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:56.819 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.820 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:56.926 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.927 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:56.939 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.940 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:56.953 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:56.953 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:57:59.560 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 01:57:59.560 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 01:58:15.874 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:15.931 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:16.253 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:16.253 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:16.257 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:16.324 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:16.324 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:16.360 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:16.749 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:16.774 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:16.781 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:16.819 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:16.942 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:17.148 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 01:58:18.826 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 01:58:19.358 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:00:35.484 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.485 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.485 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.486 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.486 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.486 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.487 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.487 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.491 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.491 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.492 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.492 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.491 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.492 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:35.493 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-00-34_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:00:35.494 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:00:52.707 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.707 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:52.711 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.712 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:52.714 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.715 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:52.886 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.887 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:52.915 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.915 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:52.928 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.928 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:52.994 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:52.995 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:00:55.198 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:00:55.199 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:01:11.999 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:11.999 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:11.999 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:11.999 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:11.999 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:12.296 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:12.380 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:12.392 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:12.396 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:12.428 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:12.433 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:12.521 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:12.860 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:12.904 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:01:14.498 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:01:14.819 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:05:56.176 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.176 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.177 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.177 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.177 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.178 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.177 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.178 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.179 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.179 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.180 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.180 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.181 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.182 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:05:56.182 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-05-55_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:05:56.183 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:06:13.379 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.380 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:13.410 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.411 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:13.495 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.496 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:13.754 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.755 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:13.761 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.762 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:13.786 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.786 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:13.820 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:13.821 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:15.990 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:06:15.991 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:06:32.755 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:32.871 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:32.871 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:32.871 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:32.871 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:32.985 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:33.124 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:33.173 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:33.238 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:33.305 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:33.320 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:33.455 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:33.467 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:33.864 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:06:35.296 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:06:35.560 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:08:05.519 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:05.519 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:05.521 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:05.522 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:05.523 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:05.525 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:05.524 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:05.526 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:05.527 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:36.392 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:36.391 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:36.392 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:05.535 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-08-04_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:08:36.394 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:36.395 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:36.395 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:08:53.657 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.657 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:53.705 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.705 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:53.777 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.778 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:53.847 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.848 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:53.935 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.936 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:53.955 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.956 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:53.973 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:53.973 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:08:56.490 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:08:56.491 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:09:13.013 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.013 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.013 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.014 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.071 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:13.158 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:13.174 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:13.232 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:13.319 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.367 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:13.368 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.688 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:13.968 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:14.381 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:09:15.535 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:09:15.920 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:29:40.020 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.021 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.023 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-39_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.024 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.024 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.025 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.027 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-39_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.028 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.027 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-39_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.028 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.032 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-39_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.033 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-38_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.034 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-29-39_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:29:40.034 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.034 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:40.035 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:29:57.188 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.188 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:57.248 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.248 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:57.260 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.261 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:57.302 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.303 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:57.312 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.313 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:57.343 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.344 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:57.356 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:57.357 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:29:59.927 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:29:59.928 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:30:16.490 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.562 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.562 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.562 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.562 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.572 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:16.629 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.826 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:16.881 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:16.978 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:16.989 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:17.024 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:17.114 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:17.419 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:30:18.947 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:30:19.306 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:08.944 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.945 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.945 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.946 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.947 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.948 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.948 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.949 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.949 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.949 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.949 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.950 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.953 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.954 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-42-07_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=300, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:42:08.954 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:08.955 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:42:26.798 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:26.798 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:26.829 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:26.829 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:26.891 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:26.892 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:26.895 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:26.896 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:26.906 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:26.906 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:26.918 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:26.919 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:27.054 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:27.054 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:29.820 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:42:29.820 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:42:46.017 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.018 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.060 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.101 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:46.112 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.264 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:46.269 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.269 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.306 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:46.459 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:46.529 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:46.615 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:46.892 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:47.282 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:42:48.921 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:42:49.345 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:22.134 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.135 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.136 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=3, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.136 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=1, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.137 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=2, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.137 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=6, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.137 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.137 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.138 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.138 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.137 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=5, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-20_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.138 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.138 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=4, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.139 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:22.141 | INFO | __main__:setup_everything:35 - train_args:TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=False, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=5, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=./train_args/ds_z3_config.json, +disable_tqdm=False, +do_eval=False, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=True, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=7, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=llama2-13b-moss-sft/runs/Aug06_02-49-21_715436, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=2, +logging_strategy=steps, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +no_cuda=False, +num_train_epochs=1, +optim=adamw_hf, +optim_args=None, +output_dir=llama2-13b-moss-sft, +overwrite_output_dir=False, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=1, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=False, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=llama2-13b-moss-sft, +save_on_each_node=False, +save_safetensors=False, +save_steps=1000, +save_strategy=steps, +save_total_limit=1, +seed=42, +sharded_ddp=[], +skip_memory_metrics=True, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.0, +warmup_steps=3000, +weight_decay=0, +xpu_backend=None, +) +2023-08-06 02:49:22.142 | INFO | __main__:init_components:45 - Initializing components... +2023-08-06 02:49:39.551 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.552 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:39.652 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.653 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:39.655 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.656 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:39.708 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.708 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:39.719 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.720 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:39.728 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.729 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:39.815 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:39.816 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:42.306 | INFO | __main__:init_components:77 - Total model params: 0.00M +2023-08-06 02:49:42.306 | INFO | component.dataset:__init__:14 - Loading data: ./data/moss-003-sft-data.jsonl +2023-08-06 02:49:58.644 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:58.698 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:58.819 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:58.821 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:59.006 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:59.092 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:59.163 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:59.169 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:59.189 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:59.213 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:59.265 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:49:59.737 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:59.801 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:49:59.954 | INFO | __main__:main:103 - *** starting training *** +2023-08-06 02:50:01.683 | INFO | component.dataset:__init__:17 - there are 1074551 data in dataset +2023-08-06 02:50:02.153 | INFO | __main__:main:103 - *** starting training ***