# Resume all interrupted trainings found under the current working directory, including multi-GPU DDP runs

import os
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # repository root
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to sys.path so classes pickled inside checkpoints can be imported
port = 0  # --master_port counter for resumed DDP runs
path = Path("").resolve()  # current working directory
for last in path.rglob("*/**/last.pt"):
    ckpt = torch.load(last)
    if ckpt["optimizer"] is None:
        continue  # no optimizer state saved, so this run cannot be resumed

    # Load the run's training options
    with open(last.parent.parent / "opt.yaml", errors="ignore") as f:
        opt = yaml.safe_load(f)

    # Determine whether the run used multiple devices (DDP)
    d = opt["device"].split(",")  # requested devices, e.g. "0,1"
    nd = len(d)  # number of devices
    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # distributed data parallel

    if ddp:  # multi-GPU: relaunch through torch.distributed.run
        port += 1  # give each resumed DDP job its own master port
        cmd = f"python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}"
    else:  # single-GPU or CPU
        cmd = f"python train.py --resume {last}"

    cmd += " > /dev/null 2>&1 &"  # discard output and run in the background
    print(cmd)
    os.system(cmd)