# Spaces: Running on Zero
# Run script | |
# Settings of training & test for different tasks. | |
method="$1" | |
task=$(python3 config.py --print_task) | |
case "${task}" in | |
'DIS5K') epochs=200 && val_last=50 && step=5 ;; | |
'COD') epochs=150 && val_last=50 && step=5 ;; | |
'HRSOD') epochs=150 && val_last=50 && step=5 ;; | |
'General') epochs=250 && val_last=50 && step=2 ;; | |
'General-2K') epochs=250 && val_last=30 && step=2 ;; | |
'Matting') epochs=100 && val_last=30 && step=2 ;; | |
esac | |
testsets=NO # Non-existing folder to skip. | |
# testsets=TE-COD10K # for COD | |
# Train | |
devices=$2 | |
nproc_per_node=$(echo ${devices%%,} | grep -o "," | wc -l) | |
to_be_distributed=`echo ${nproc_per_node} | awk '{if($e > 0) print "True"; else print "False";}'` | |
echo Training started at $(date) | |
if [ ${to_be_distributed} == "True" ] | |
then | |
# Adapt the nproc_per_node by the number of GPUs. Give 8989 as the default value of master_port. | |
echo "Multi-GPU mode received..." | |
CUDA_VISIBLE_DEVICES=${devices} \ | |
torchrun --standalone --nproc_per_node $((nproc_per_node+1)) \ | |
train.py --ckpt_dir ckpt/${method} --epochs ${epochs} \ | |
--testsets ${testsets} \ | |
--dist ${to_be_distributed} \ | |
--resume xx/xx-epoch_244.pth \ | |
# --use_accelerate | |
else | |
echo "Single-GPU mode received..." | |
CUDA_VISIBLE_DEVICES=${devices} \ | |
python train.py --ckpt_dir ckpt/${method} --epochs ${epochs} \ | |
--testsets ${testsets} \ | |
--dist ${to_be_distributed} \ | |
--resume xx/xx-epoch_244.pth | |
# --use_accelerate | |
fi | |
echo Training finished at $(date) | |