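#!/bin/bash

# Test driver: runs main_amp.py across opt_level / loss_scale / keep_batchnorm
# combinations, first with the Apex C++/CUDA extensions installed and then with
# the Python-only install, and finally compares the two for bitwise accuracy.
# Arguments:
#   $1 - "single_gpu" or "distributed" (selects the launch command)
#   $2 - data directory passed through to main_amp.py
#   $3 - if non-empty, USE_BASELINE is left empty (see below)
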
print_banner() {
  printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
}

print_banner "Distributed status: $1"

echo $2
DATADIR=$2
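
# If a third argument is given, run without the baseline flag; otherwise
# default to --use_baseline. (Note: the compare.py step below currently
# passes --use_baseline explicitly rather than $USE_BASELINE.)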
if [ -n "$3" ]
then
  USE_BASELINE=""
else
  USE_BASELINE="--use_baseline"
fi
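
# $1 selects the launcher: a plain single-process run, or
# torch.distributed.launch with 2 processes per node.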
| if [ "$1" == "single_gpu" ] | |
| then | |
| BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5" | |
| fi | |
| if [ "$1" == "distributed" ] | |
| then | |
| BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5" | |
| fi | |
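
# Extra flags for the FusedAdam runs exercised after the main sweep.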
ADAM_ARGS="--opt-level O2 --keep-batchnorm-fp32 False --fused-adam"
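
# Option values swept by the nested loops below. An empty string means the
# flag is omitted, so the defaults are covered as well.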
keep_batchnorms=(
  ""
  "--keep-batchnorm-fp32 True"
  "--keep-batchnorm-fp32 False"
)

loss_scales=(
  ""
  "--loss-scale 1.0"
  "--loss-scale 128.0"
  "--loss-scale dynamic"
)

opt_levels=(
  "O0"
  "O1"
  "O2"
  "O3"
)
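
# Remove True*/False* files left over from previous runs (presumably the
# per-run dumps that compare.py reads). This happens before set -e, so a
# missing match does not abort the script.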
rm True*
rm False*

set -e

print_banner "Installing Apex with --cuda_ext and --cpp_ext"

pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd
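
# First sweep: run every opt_level / loss_scale / keep_batchnorm combination
# with the extension install, tagging each run with --has-ext. O1 runs that
# explicitly set --keep-batchnorm-fp32 are skipped.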
for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"
      set -x
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR
      set +x
    done
  done
done
# Handle FusedAdam separately due to limited support.
# FusedAdam will not be tested for bitwise accuracy against the Python
# implementation; the L0 tests already do so. These tests are here to ensure
# that it actually runs, and to get an idea of performance.
for loss_scale in "${loss_scales[@]}"
do
  print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"
  set -x
  ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR
  set +x
done
print_banner "Reinstalling Apex without extensions"

pushd ../../..
pip install -v --no-cache-dir .
popd
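
# Second sweep: repeat the same combinations against the Python-only install
# (no --has-ext), producing the other half of the comparison below.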
for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"
      set -x
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR
      set +x
    done
  done
done

print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"
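
# For each configuration, compare.py checks that the Python-only and
# extension runs produced bitwise-identical results.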
for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      echo ""
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
      set -x
      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
      set +x
    done
  done
done
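
# Reinstall with the extensions enabled, restoring the build used at the
# start of the script.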
print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"

pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd