#!/usr/bin/env python

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

from ...utils import (
    ComputeEnvironment,
    DistributedType,
    is_deepspeed_available,
    is_mlu_available,
    is_mps_available,
    is_npu_available,
    is_transformers_available,
    is_xpu_available,
)
from ...utils.constants import (
    DEEPSPEED_MULTINODE_LAUNCHERS,
    FSDP_AUTO_WRAP_POLICY,
    FSDP_BACKWARD_PREFETCH,
    FSDP_SHARDING_STRATEGY,
    FSDP_STATE_DICT_TYPE,
    TORCH_DYNAMO_MODES,
)
from .config_args import ClusterConfig
from .config_utils import (
    DYNAMO_BACKENDS,
    _ask_field,
    _ask_options,
    _convert_distributed_mode,
    _convert_dynamo_backend,
    _convert_mixed_precision,
    _convert_yes_no_to_bool,
)


def get_cluster_input():
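    """Interactively prompt the user for a local-cluster training setup and return it as a `ClusterConfig`."""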
    distributed_type = _ask_options(
        "Which type of machine are you using?",
        ["No distributed training", "multi-CPU", "multi-XPU", "multi-GPU", "multi-NPU", "multi-MLU", "TPU"],
        _convert_distributed_mode,
    )
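    # Single-machine defaults; the multi-node prompts below overwrite them as needed.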
    machine_rank = 0
    num_machines = 1
    num_processes = 1
    gpu_ids = None
    main_process_ip = None
    main_process_port = None
    rdzv_backend = "static"
    same_network = True
    debug = False
    if distributed_type in [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_CPU,
    ]:
        num_machines = _ask_field(
            "How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
            int,
            default=1,
        )
        if num_machines > 1:
            machine_rank = _ask_options(
                "What is the rank of this machine?",
                list(range(num_machines)),
                int,
            )
            main_process_ip = _ask_field(
                "What is the IP address of the machine that will host the main process? ",
            )
            main_process_port = _ask_field(
                "What is the port you will use to communicate with the main process? ",
                int,
            )
            same_network = _ask_field(
                "Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )
            if not same_network:
                rdzv_backend = _ask_field(
                    "What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
                )
        debug = _ask_field(
            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
    if distributed_type == DistributedType.NO:
        use_cpu = _ask_field(
            "Do you want to run your training on CPU only (even if a GPU / Apple Silicon / Ascend NPU device is available)? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
    elif distributed_type == DistributedType.MULTI_CPU:
        use_cpu = True
    else:
        use_cpu = False
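    # CPU-only runs may opt in to IPEX acceleration; multi-CPU runs can also have accelerate drive mpirun.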
    ipex_config = {}
    mpirun_config = {}
    if use_cpu:
        ipex_config["ipex"] = _ask_field(
            "Do you want to use Intel Extension for PyTorch (IPEX) to speed up training on CPU? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if distributed_type == DistributedType.MULTI_CPU:
            use_mpirun = _ask_field(
                "Do you want accelerate to launch mpirun? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_mpirun:
                mpirun_hostfile = _ask_field(
                    "Please enter the path to the hostfile to use with mpirun [~/hostfile]: ",
                    str,
                    default="~/hostfile",
                )
                mpirun_config["mpirun_hostfile"] = os.path.expanduser(mpirun_hostfile.strip())
                mpirun_config["mpirun_ccl"] = _ask_field("Enter the number of oneCCL worker threads [1]: ", default=1)
    if (
        not use_cpu
        and is_xpu_available()
        and distributed_type
        not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_MLU, DistributedType.XLA]
    ):
        ipex_config["use_xpu"] = _ask_field(
            "Do you want to use the XPU plugin to speed up training on XPU? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
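    # Optional torch.compile (dynamo) settings.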
    dynamo_config = {}
    use_dynamo = _ask_field(
        "Do you wish to optimize your script with torch dynamo? [yes/NO]: ",
        _convert_yes_no_to_bool,
        default=False,
        error_message="Please enter yes or no.",
    )
    if use_dynamo:
        prefix = "dynamo_"
        dynamo_config[prefix + "backend"] = _ask_options(
            "Which dynamo backend would you like to use?",
            [x.lower() for x in DYNAMO_BACKENDS],
            _convert_dynamo_backend,
            default=2,
        )
        use_custom_options = _ask_field(
            "Do you want to customize the defaults sent to torch.compile? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_custom_options:
            dynamo_config[prefix + "mode"] = _ask_options(
                "Which mode do you want to use?",
                TORCH_DYNAMO_MODES,
                lambda x: TORCH_DYNAMO_MODES[int(x)],
                default=0,
            )
            dynamo_config[prefix + "use_fullgraph"] = _ask_field(
                "Do you want to use fullgraph mode, or is it OK to break the model into several subgraphs? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            dynamo_config[prefix + "use_dynamic"] = _ask_field(
                "Do you want to enable dynamic shape tracing? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
    use_mps = not use_cpu and is_mps_available()
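    # DeepSpeed is offered for single- and multi-accelerator setups, but not on MPS.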
    deepspeed_config = {}
    if (
        distributed_type
        in [
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_MLU,
            DistributedType.NO,
        ]
        and not use_mps
    ):
        use_deepspeed = _ask_field(
            "Do you want to use DeepSpeed? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_deepspeed:
            distributed_type = DistributedType.DEEPSPEED
            assert (
                is_deepspeed_available()
            ), "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"

    if distributed_type == DistributedType.DEEPSPEED:
        use_deepspeed_config = _ask_field(
            "Do you want to specify a json file to a DeepSpeed config? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_deepspeed_config:
            deepspeed_config["deepspeed_config_file"] = _ask_field(
                "Please enter the path to the json DeepSpeed config file: ",
                str,
                default="none",
            )
        else:
            deepspeed_config["zero_stage"] = _ask_options(
                "What should be your DeepSpeed's ZeRO optimization stage?",
                [0, 1, 2, 3],
                int,
                default=2,
            )

            deepspeed_devices = ["none", "cpu", "nvme"]
            if deepspeed_config["zero_stage"] >= 2:
                deepspeed_config["offload_optimizer_device"] = _ask_options(
                    "Where to offload optimizer states?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                )
                deepspeed_config["offload_param_device"] = _ask_options(
                    "Where to offload parameters?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                )
                if deepspeed_config["offload_param_device"] == "nvme":
                    deepspeed_config["offload_param_nvme_path"] = _ask_field(
                        "NVMe path to offload parameters?",
                        str,
                        default="/nvme",
                    )
                if deepspeed_config["offload_optimizer_device"] == "nvme":
                    deepspeed_config["offload_optimizer_nvme_path"] = _ask_field(
                        "NVMe path to offload optimizer states?",
                        str,
                        default="/nvme",
                    )
            deepspeed_config["gradient_accumulation_steps"] = _ask_field(
                "How many gradient accumulation steps are you passing in your script? [1]: ",
                int,
                default=1,
            )
            use_gradient_clipping = _ask_field(
                "Do you want to use gradient clipping? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_gradient_clipping:
                deepspeed_config["gradient_clipping"] = _ask_field(
                    "What is the gradient clipping value? [1.0]: ",
                    float,
                    default=1.0,
                )
            if deepspeed_config["zero_stage"] == 3:
                deepspeed_config["zero3_save_16bit_model"] = _ask_field(
                    "Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
        deepspeed_config["zero3_init_flag"] = _ask_field(
            "Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if deepspeed_config["zero3_init_flag"]:
            if not is_transformers_available():
                raise Exception(
                    "When `zero3_init_flag` is set, it requires Transformers to be installed. "
                    "Please run `pip3 install transformers`."
                )
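        # Multi-node DeepSpeed needs a launcher; every launcher except DEEPSPEED_MULTINODE_LAUNCHERS[1] also prompts for a hostfile.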
        if num_machines > 1:
            launcher_query = "Which type of launcher do you want to use?"
            deepspeed_config["deepspeed_multinode_launcher"] = _ask_options(
                launcher_query,
                DEEPSPEED_MULTINODE_LAUNCHERS,
                lambda x: DEEPSPEED_MULTINODE_LAUNCHERS[int(x)],
            )

            if deepspeed_config["deepspeed_multinode_launcher"] != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
                deepspeed_config["deepspeed_hostfile"] = _ask_field(
                    "DeepSpeed configures multi-node compute resources with a hostfile. "
                    "Each row is of the format `hostname slots=[num_gpus]`, e.g., `localhost slots=2`; "
                    "for more information please refer to the official documentation "
                    "(https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node). "
                    "Please specify the location of the hostfile: ",
                    str,
                )

                is_exclusion_filter = _ask_field(
                    "Do you want to specify an exclusion filter string? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if is_exclusion_filter:
                    deepspeed_config["deepspeed_exclusion_filter"] = _ask_field(
                        "DeepSpeed exclusion filter string: ",
                        str,
                    )

                is_inclusion_filter = _ask_field(
                    "Do you want to specify an inclusion filter string? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if is_inclusion_filter:
                    deepspeed_config["deepspeed_inclusion_filter"] = _ask_field(
                        "DeepSpeed inclusion filter string: ",
                        str,
                    )
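    # FSDP (PyTorch Fully Sharded Data Parallel) settings, offered for multi-accelerator runs.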
    fsdp_config = {}
    if distributed_type in [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_XPU,
    ]:
        use_fsdp = _ask_field(
            "Do you want to use FullyShardedDataParallel? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_fsdp:
            distributed_type = DistributedType.FSDP
    if distributed_type == DistributedType.FSDP:
        sharding_strategy_query = "What should be your sharding strategy?"
        fsdp_config["fsdp_sharding_strategy"] = _ask_options(
            sharding_strategy_query,
            FSDP_SHARDING_STRATEGY,
            lambda x: FSDP_SHARDING_STRATEGY[int(x)],
        )
        fsdp_config["fsdp_offload_params"] = _ask_field(
            "Do you want to offload parameters and gradients to CPU? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        fsdp_wrap_query = "What should be your auto wrap policy?"
        fsdp_config["fsdp_auto_wrap_policy"] = _ask_options(
            fsdp_wrap_query,
            FSDP_AUTO_WRAP_POLICY,
            lambda x: FSDP_AUTO_WRAP_POLICY[int(x)],
        )
        if fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[0]:
            use_no_split_modules = _ask_field(
                "Do you want to use the model's `_no_split_modules` to wrap? Only applicable for 🤗 Transformers [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if not use_no_split_modules:
                fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = _ask_field(
                    "Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap, e.g., "
                    "`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput` ...: ",
                    str,
                )
        elif fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[1]:
            fsdp_config["fsdp_min_num_params"] = _ask_field(
                "What should be your FSDP's minimum number of parameters for the default auto wrapping policy? [1e8]: ",
                int,
                default=100000000,
            )
        fsdp_backward_prefetch_query = "What should be your FSDP's backward prefetch policy?"
        fsdp_config["fsdp_backward_prefetch"] = _ask_options(
            fsdp_backward_prefetch_query,
            FSDP_BACKWARD_PREFETCH,
            lambda x: FSDP_BACKWARD_PREFETCH[int(x)],
        )
        fsdp_state_dict_type_query = "What should be your FSDP's state dict type?"
        fsdp_config["fsdp_state_dict_type"] = _ask_options(
            fsdp_state_dict_type_query,
            FSDP_STATE_DICT_TYPE,
            lambda x: FSDP_STATE_DICT_TYPE[int(x)],
            default=2,
        )
        fsdp_config["fsdp_forward_prefetch"] = _ask_field(
            "Do you want to enable FSDP's forward prefetch policy? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        fsdp_config["fsdp_use_orig_params"] = _ask_field(
            "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )
        fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field(
            "Do you want to enable CPU RAM efficient model loading? Only applicable for 🤗 Transformers models. [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )
        if fsdp_config["fsdp_cpu_ram_efficient_loading"]:
            fsdp_config["fsdp_sync_module_states"] = True
        else:
            fsdp_config["fsdp_sync_module_states"] = _ask_field(
                "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )
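    # Megatron-LM settings (tensor/pipeline parallelism), currently offered for multi-GPU only.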
    megatron_lm_config = {}
    if distributed_type in [DistributedType.MULTI_GPU]:
        use_megatron_lm = _ask_field(
            "Do you want to use Megatron-LM? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_megatron_lm:
            distributed_type = DistributedType.MEGATRON_LM
    if distributed_type == DistributedType.MEGATRON_LM:
        prefix = "megatron_lm_"
        megatron_lm_config[prefix + "tp_degree"] = _ask_field(
            "What is the Tensor Parallelism degree/size? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
        if megatron_lm_config[prefix + "tp_degree"] > 1:
            megatron_lm_config[prefix + "sequence_parallelism"] = _ask_field(
                "Do you want to enable Sequence Parallelism? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )

        megatron_lm_config[prefix + "pp_degree"] = _ask_field(
            "What is the Pipeline Parallelism degree/size? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
        if megatron_lm_config[prefix + "pp_degree"] > 1:
            megatron_lm_config[prefix + "num_micro_batches"] = _ask_field(
                "What is the number of micro-batches? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

        megatron_lm_config[prefix + "recompute_activations"] = _ask_field(
            "Do you want to enable selective activation recomputation? [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )

        megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_field(
            "Do you want to use a distributed optimizer "
            "which shards optimizer state and gradients across data-parallel ranks? [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )

        megatron_lm_config[prefix + "gradient_clipping"] = _ask_field(
            "What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: ",
            float,
            default=1.0,
        )
    # TPU specific defaults
    tpu_commands = None
    tpu_command_file = None
    tpu_downcast_bf16 = "no"
    tpu_env = []
    tpu_name = None
    tpu_vm = None
    tpu_zone = None
    tpu_use_sudo = False
    tpu_use_cluster = False

    if distributed_type in [
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_NPU,
        DistributedType.XLA,
    ]:
        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
        if machine_type == "TPU":
            machine_type += " cores"
        elif machine_type == "CPU":
            machine_type = "processes"
        else:
            machine_type += "(s)"
        num_processes = _ask_field(
            f"How many {machine_type} should be used for distributed training? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    elif distributed_type in [DistributedType.FSDP, DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
        num_processes = _ask_field(
            "How many GPU(s) should be used for distributed training? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    else:
        num_processes = 1
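    # Reject a multi-GPU selection that would launch a single process on a single machine.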
    if (distributed_type == DistributedType.MULTI_GPU) and (num_machines == 1) and (num_processes == 1):
        raise ValueError(
            f"Specified distributed type {distributed_type} but only using 1 GPU on a single machine. Please select `No distributed training` for the type of machine you are using."
        )

    if (
        distributed_type
        in [
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
            DistributedType.NO,
        ]
        and not use_cpu
        and not use_mps
    ):
        if is_npu_available():
            machine_type = "NPU(s)"
        elif is_mlu_available():
            machine_type = "MLU(s)"
        else:
            machine_type = "GPU(s)"
        gpu_ids = _ask_field(
            f"What {machine_type} (by id) should be used for training on this machine as a comma-separated list? [all]: ",
            default="all",
        )
    # CPU affinity is only supported on NVIDIA hardware for now
    enable_cpu_affinity = False
    if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
        enable_cpu_affinity = _ask_field(
            "Would you like to enable numa efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
    if distributed_type == DistributedType.XLA:
        mixed_precision = "no"
        main_training_function = _ask_field(
            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
            default="main",
        )
        tpu_use_cluster = _ask_field(
            "Are you using a TPU cluster? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if tpu_use_cluster:
            tpu_name = _ask_field(
                "What is the name of your TPU cluster? ",
                default=None,
                error_message="Please enter the name of your TPU cluster.",
            )
            tpu_zone = _ask_field(
                "What is the zone of your TPU cluster? ",
                default=None,
                error_message="Please enter the zone of your TPU cluster.",
            )
            tpu_use_sudo = _ask_field(
                "To run a python script in a TPU pod, should `sudo` be used? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            run_commands = _ask_field(
                "Do you have code you wish to run on startup in each pod? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if run_commands:
                use_command_file = _ask_field(
                    "Is this code located in a bash script? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if use_command_file:
                    tpu_command_file = _ask_field(
                        "What is the path to your bash script? ",
                        default=None,
                        error_message="Please enter the path to your bash script.",
                    )
                    tpu_command_file = os.path.abspath(tpu_command_file)
                else:
                    print("Please enter each command separately that you wish to run on startup in each pod.")
                    tpu_commands = []
                    another_command = True
                    while another_command:
                        tpu_commands.append(
                            _ask_field(
                                "Please enter a single command to be run: ",
                                default=None,
                                error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
                            )
                        )
                        another_command = _ask_field(
                            "Do you wish to add another command? [yes/NO]: ",
                            _convert_yes_no_to_bool,
                            default=False,
                            error_message="Please enter yes or no.",
                        )
            tpu_vm = _ask_field(
                "If not using an instance group, what are the names of the Compute VM instances to be used, separated by a comma: ",
                default="",
            ).split(",")
            tpu_env = _ask_field(
                "What environment variables do you wish to set in each pod, separated by a comma: ",
                default="",
            ).split(",")
    else:
        main_training_function = "main"
        if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
            mixed_precision = None
        else:
            mixed_precision = _ask_options(
                "Do you wish to use FP16 or BF16 (mixed precision)?",
                ["no", "fp16", "bf16", "fp8"],
                _convert_mixed_precision,
            )

    if use_dynamo and mixed_precision == "no" and not use_cpu:
        print(
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
        tpu_downcast_bf16 = _ask_field(
            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
        )
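    # Bundle all collected answers into a ClusterConfig for a local machine.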
    return ClusterConfig(
        compute_environment=ComputeEnvironment.LOCAL_MACHINE,
        distributed_type=distributed_type,
        num_processes=num_processes,
        gpu_ids=gpu_ids,
        mixed_precision=mixed_precision,
        downcast_bf16=tpu_downcast_bf16,
        machine_rank=machine_rank,
        num_machines=num_machines,
        main_process_ip=main_process_ip,
        main_process_port=main_process_port,
        main_training_function=main_training_function,
        deepspeed_config=deepspeed_config,
        fsdp_config=fsdp_config,
        megatron_lm_config=megatron_lm_config,
        ipex_config=ipex_config,
        mpirun_config=mpirun_config,
        use_cpu=use_cpu,
        rdzv_backend=rdzv_backend,
        same_network=same_network,
        commands=tpu_commands,
        command_file=tpu_command_file,
        tpu_env=tpu_env,
        tpu_name=tpu_name,
        tpu_vm=tpu_vm,
        tpu_zone=tpu_zone,
        tpu_use_sudo=tpu_use_sudo,
        tpu_use_cluster=tpu_use_cluster,
        dynamo_config=dynamo_config,
        debug=debug,
        enable_cpu_affinity=enable_cpu_affinity,
    )