# Spaces: Running on Zero
# If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found
# Do: ln -sf /usr/local/gcc-8.3/lib64/libstdc++.so.6 /usr/lib64/libstdc++.so.6
# ---------------------------------------------------------------------------
# NCCL / network environment for the training processes.
#
# NET_TYPE selects one of two hard-coded network profiles:
#   "low"  - socket interface eth1, a single mlx5_2 HCA port
#   other  - socket interface bond1, eight bonded mlx5 HCAs plus extra tuning
# Edit NET_TYPE below to switch profiles.
# ---------------------------------------------------------------------------
export NCCL_IB_TIMEOUT=24
export NCCL_NVLS_ENABLE=0

NET_TYPE="high"

# Settings that are identical in both profiles.
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_CHECK_DISABLE=1
export NCCL_P2P_DISABLE=0
export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1

if [[ "${NET_TYPE}" == "low" ]]; then
  export NCCL_SOCKET_IFNAME=eth1
  # NOTE(review): the same device:port is listed twice - confirm this is intended.
  export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
else
  export NCCL_IB_DISABLE=0
  export NCCL_SOCKET_IFNAME=bond1
  export UCX_NET_DEVICES=bond1
  export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6
  export NCCL_COLLNET_ENABLE=0
  export SHARP_COLL_ENABLE_SAT=0
  export NCCL_NET_GDR_LEVEL=2
  export NCCL_IB_QPS_PER_CONNECTION=4
  export NCCL_IB_TC=160
  export NCCL_PXN_DISABLE=0
fi

export NCCL_DEBUG=WARN
# ---------------------------------------------------------------------------
# Positional arguments (all five are required):
#   $1  node_num    - number of nodes
#   $2  node_rank   - rank of this node
#   $3  master_ip   - address of the master node
#   $4  config      - path to the config file
#   $5  output_dir  - directory that receives a copy of the config
# ---------------------------------------------------------------------------
if (( $# < 5 )); then
  printf 'Usage: %s <node_num> <node_rank> <master_ip> <config> <output_dir>\n' \
    "${0##*/}" >&2
  exit 2
fi

node_num=$1
node_rank=$2
master_ip=$3
config=$4
output_dir=$5
# config='configs/dit-from-scratch-overfitting-flowmatching-dinog518-bf16-lr1e4-1024.yaml'
# output_dir='output_folder/dit/overfitting_10'

printf 'node_num %s\n' "$node_num"
printf 'node_rank %s\n' "$node_rank"
printf 'master_ip %s\n' "$master_ip"
printf 'config %s\n' "$config"
printf 'output_dir %s\n' "$output_dir"

# mkdir -p succeeds when the directory already exists, so no separate
# existence test is needed. Abort if the path cannot be a directory: otherwise
# the cp below could overwrite a regular file of the same name.
if ! mkdir -p -- "$output_dir"; then
  printf 'error: cannot create output directory: %s\n' "$output_dir" >&2
  exit 1
fi

# Keep a copy of the config next to the run outputs. A failed copy (for
# example when the config already lives in output_dir) is reported but is
# not fatal.
if ! cp -- "$config" "$output_dir"; then
  printf 'warning: could not copy %s into %s\n' "$config" "$output_dir" >&2
fi
# Launch main.py with per-run environment overrides. The VAR=value prefixes
# apply to this python3 process only. All expansions are quoted so that paths
# containing spaces or glob characters reach main.py as single arguments.
#
# NOTE(review): NCCL_SOCKET_IFNAME=bond1 here takes precedence over any value
# exported earlier in the script (including eth1 from the "low" profile) -
# confirm that forcing bond1 is intended.
NODE_RANK="$node_rank" \
HF_HUB_OFFLINE=0 \
MASTER_PORT=12348 \
MASTER_ADDR="$master_ip" \
NCCL_SOCKET_IFNAME=bond1 \
NCCL_IB_GID_INDEX=3 \
NCCL_NVLS_ENABLE=0 \
python3 main.py \
  --num_nodes "$node_num" \
  --num_gpus 8 \
  --config "$config" \
  --output_dir "$output_dir" \
  --deepspeed