from __future__ import annotations

import os
import argparse
import subprocess
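

# Tee helper: echo a line to stdout and mirror it into the benchmark log file.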
def print_and_write(outfile, line: str, flush: bool = False) -> None:
    print(line, end="", flush=flush)
    outfile.write(line)
    if flush:
        outfile.flush()


def main(args: argparse.Namespace) -> None:
    # Each benchmark container is pinned to exactly one GPU.
    assert len(args.gpu_ids) == 1

    hf_token = os.environ["HF_TOKEN"]

    # Group results under the model name, stripping any leading "models/" prefix.
    if args.model.startswith("models/"):
        outdir = f"{args.result_root}/{args.model[len('models/'):]}"
    else:
        outdir = f"{args.result_root}/{args.model}"
    os.makedirs(outdir, exist_ok=True)
    outfile = open(f"{outdir}/gpus{''.join(args.gpu_ids)}.out.txt", "w")
print_and_write(outfile, f"Benchmarking {args.model}\n") | |
print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n") | |
print_and_write(outfile, f"Power limits: {args.power_limits}\n") | |
print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n") | |
    for batch_size in args.batch_sizes:
        oom = False
        for power_limit in args.power_limits:
            for num_inference_steps in args.num_inference_steps:
                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
                cmd = [
                    "docker", "run",
                    # Docker requires the embedded double quotes when the
                    # device list contains commas, e.g. --gpus '"device=0,1"'.
                    "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
                    # SYS_ADMIN is needed to change the GPU power limit
                    # from inside the container.
                    "--cap-add", "SYS_ADMIN",
                    "--name", f"leaderboard-t2i-{''.join(args.gpu_ids)}",
                    "--rm",
                    "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
                    "-v", f"{os.getcwd()}:/workspace/text-to-image",
                    "mlenergy/leaderboard:diffusion-t2i",
                    "--result-root", args.result_root,
                    "--batch-size", batch_size,
                    "--num-batches", "10",
                    "--power-limit", power_limit,
                    "--model", args.model,
                    "--huggingface-token", hf_token,
                    "--num-inference-steps", num_inference_steps,
                ]
                if args.monitor_power:
                    cmd.append("--monitor-power")
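
                # Stream the container's combined stdout/stderr into the log,
                # flushing every 50 lines to keep the file reasonably current.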
                with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
                    if proc.stdout:
                        i = 0
                        for line in proc.stdout:
                            print_and_write(outfile, line, flush=i % 50 == 0)
                            i += 1

                # Check the exit status after the `with` block, where the
                # context manager's wait() has populated proc.returncode.
                # A non-zero status is probably an OOM, and lowering the
                # power limit won't help, so move on to the next batch size.
                if proc.returncode != 0:
                    oom = True
                    break
            if oom:
                break

    outfile.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="ID of the model to benchmark")
    parser.add_argument("--result-root", type=str, help="Root directory to store the results")
    parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
    parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
    parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="GPU power limits (W) to benchmark")
    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Numbers of inference steps to benchmark")
    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power draw over time.")
    args = parser.parse_args()
    main(args)
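
# Example invocation (the script name, model ID, result path, and GPU ID
# below are illustrative, not prescribed by this repository):
#   HF_TOKEN=<your token> python benchmark_one_gpu.py \
#       --model stabilityai/stable-diffusion-2-1 \
#       --result-root results \
#       --gpu-ids 0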