#!/bin/bash
# L1 test driver for main_amp.py: sweeps opt-levels, loss scales, and
# batchnorm settings with and without the Apex cpp/cuda extensions, then
# compares the two installs for bitwise accuracy.
# Print a highlighted (black-on-green) banner message, padded with blank
# lines, to stdout.
# Arguments: $1 - message text
print_banner() {
  # Pass the message as %s data, not as the format string, so '%' characters
  # in the message (e.g. from command lines) are printed literally.
  printf '\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n' "$1"
}
# Positional arguments:
#   $1 - run mode: "single_gpu" or "distributed"
#   $2 - path to the ImageNet data directory
#   $3 - if non-empty, suppress the --use_baseline flag
print_banner "Distributed status: $1"

echo "$2"
DATADIR=$2

# NOTE(review): USE_BASELINE is computed here but the compare loop below
# hardcodes --use_baseline; confirm which behavior is intended.
if [ -n "$3" ]
then
  USE_BASELINE=""
else
  USE_BASELINE="--use_baseline"
fi

# Select the launcher; BASE_CMD stays unset for any other mode.
if [ "$1" == "single_gpu" ]
then
  BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
fi

if [ "$1" == "distributed" ]
then
  BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
fi
# Extra flags for the separate FusedAdam runs below.
ADAM_ARGS="--opt-level O2 --keep-batchnorm-fp32 False --fused-adam"

# Sweep dimensions; an empty string means "use the default for that option".
keep_batchnorms=(
""
"--keep-batchnorm-fp32 True"
"--keep-batchnorm-fp32 False"
)

loss_scales=(
""
"--loss-scale 1.0"
"--loss-scale 128.0"
"--loss-scale dynamic"
)

opt_levels=(
"O0"
"O1"
"O2"
"O3"
)
# Remove comparison dumps left over from previous runs; -f keeps this quiet
# when no such files exist.
rm -f True* False*

# Fail fast from here on.
set -e

print_banner "Installing Apex with --cuda_ext and --cpp_ext"
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd
# Run every (opt_level, loss_scale, keep_batchnorm) combination with the
# cpp/cuda extensions installed (--has-ext).
for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      # NOTE(review): O1 with an explicit keep-batchnorm setting is skipped —
      # presumably unsupported by that opt level; confirm against amp docs.
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"
      set -x
      # Intentionally unquoted: word-splitting builds the full command line.
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR
      set +x
    done
  done
done
# Handle FusedAdam separately due to limited support.
# FusedAdam will not be tested for bitwise accuracy against the Python implementation.
# The L0 tests already do so. These tests are here to ensure that it actually runs,
# and get an idea of performance.
for loss_scale in "${loss_scales[@]}"
do
  print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"
  set -x
  # Intentionally unquoted: word-splitting builds the full command line.
  ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR
  set +x
done

# Switch to a Python-only install so the second sweep exercises the
# non-extension code paths.
print_banner "Reinstalling apex without extensions"
pushd ../../..
pip install -v --no-cache-dir .
popd
# Repeat the full sweep without the extensions (no --has-ext flag).
for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      # Same skip rule as the with-extensions sweep above.
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"
      set -x
      # Intentionally unquoted: word-splitting builds the full command line.
      ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR
      set +x
    done
  done
done
# Compare the dumps produced by the two sweeps for bitwise agreement.
print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"
for opt_level in "${opt_levels[@]}"
do
  for loss_scale in "${loss_scales[@]}"
  do
    for keep_batchnorm in "${keep_batchnorms[@]}"
    do
      echo ""
      if [ "$opt_level" == "O1" ] && [ -n "${keep_batchnorm}" ]
      then
        echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
      set -x
      # NOTE(review): --use_baseline is always passed here even though
      # USE_BASELINE is derived from $3 above; confirm whether
      # ${USE_BASELINE} was intended instead.
      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
      set +x
    done
  done
done
# Restore the extension build so the environment is left as it started.
print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd