Spaces:
Build error
Build error
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. | |
import json | |
import logging | |
import math | |
import numpy as np | |
import os | |
from datetime import datetime | |
import psutil | |
import torch | |
from fvcore.common.file_io import PathManager | |
from fvcore.nn.activation_count import activation_count | |
from fvcore.nn.flop_count import flop_count | |
from matplotlib import pyplot as plt | |
from torch import nn | |
import timesformer.utils.logging as logging | |
import timesformer.utils.multiprocessing as mpu | |
from timesformer.datasets.utils import pack_pathway_output | |
from timesformer.models.batchnorm_helper import SubBatchNorm3d | |
logger = logging.get_logger(__name__) | |
def check_nan_losses(loss):
    """
    Abort when the given loss value is NaN (not a number).
    Args:
        loss (loss): value to test; it is passed directly to `math.isnan`.
    Raises:
        RuntimeError: if `loss` is NaN. The message carries the current
            timestamp.
    """
    if not math.isnan(loss):
        return
    timestamp = datetime.now()
    raise RuntimeError("ERROR: Got NaN losses {}".format(timestamp))
def params_count(model, ignore_bn=False):
    """
    Compute the number of parameters.
    Args:
        model (model): model to count the number of parameters.
        ignore_bn (bool): if True, parameters owned directly by
            `nn.BatchNorm3d` modules are left out of the count.
    Returns:
        int: total number of parameter elements. Always a Python int, also
            for a model without any parameters.
    """
    if not ignore_bn:
        # Builtin `sum` keeps the result an int even when there are no
        # parameters (np.sum over an empty list yields a float 0.0).
        return sum(p.numel() for p in model.parameters())
    # Walk every module and only look at the parameters it owns directly
    # (recurse=False) so that each parameter is attributed to one module.
    return sum(
        p.numel()
        for m in model.modules()
        if not isinstance(m, nn.BatchNorm3d)
        for p in m.parameters(recurse=False)
    )
def gpu_mem_usage():
    """
    Report GPU memory usage for the current device in gigabytes.
    The figure comes from `torch.cuda.max_memory_allocated()`; when CUDA is
    not available the usage is reported as zero.
    Returns:
        float: memory usage (GB).
    """
    bytes_per_gb = 1024 ** 3
    allocated = (
        torch.cuda.max_memory_allocated() if torch.cuda.is_available() else 0
    )
    return allocated / bytes_per_gb
def cpu_mem_usage():
    """
    Report system memory (RAM) usage in gigabytes, based on
    `psutil.virtual_memory()`.
    Returns:
        usage (float): used memory (GB), computed as total minus available.
        total (float): total memory (GB).
    """
    bytes_per_gb = 1024 ** 3
    mem = psutil.virtual_memory()
    total = mem.total / bytes_per_gb
    usage = (mem.total - mem.available) / bytes_per_gb
    return usage, total
def _get_model_analysis_input(cfg, use_train_input):
    """
    Return a dummy input for model analysis with batch size 1. The input is
    used for analyzing the model (counting flops and activations etc.).
    Args:
        cfg (CfgNode): configs. Details can be found in
            lib/config/defaults.py
        use_train_input (bool): if True, return the input for training. Otherwise,
            return the input for testing.
    Returns:
        inputs (tuple): `(model_inputs,)`, or `(model_inputs, bbox)` when
            `cfg.DETECTION.ENABLE` is set. `model_inputs` is a single tensor
            for the 'resformer' / 'vit' architectures and the output of
            `pack_pathway_output` (each entry given a batch dim) otherwise.
            Tensors are moved to GPU only when `cfg.NUM_GPUS` is non-zero.
    """
    rgb_dimension = 3
    crop_size = (
        cfg.DATA.TRAIN_CROP_SIZE if use_train_input else cfg.DATA.TEST_CROP_SIZE
    )
    input_tensors = torch.rand(
        rgb_dimension,
        cfg.DATA.NUM_FRAMES,
        crop_size,
        crop_size,
    )

    if cfg.MODEL.ARCH not in ['resformer', 'vit']:
        model_inputs = pack_pathway_output(cfg, input_tensors)
        for i, pathway in enumerate(model_inputs):
            pathway = pathway.unsqueeze(0)
            if cfg.NUM_GPUS:
                pathway = pathway.cuda(non_blocking=True)
            model_inputs[i] = pathway
    else:
        # Honour NUM_GPUS here as well; an unconditional .cuda() call
        # breaks model analysis on CPU-only runs.
        if cfg.NUM_GPUS:
            input_tensors = input_tensors.cuda(non_blocking=True)
        model_inputs = input_tensors.unsqueeze(0)

    # If detection is enabled, count flops for one proposal.
    if cfg.DETECTION.ENABLE:
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        if cfg.NUM_GPUS:
            bbox = bbox.cuda()
        return (model_inputs, bbox)
    return (model_inputs,)
def get_model_stats(model, cfg, mode, use_train_input):
    """
    Compute statistics for the current model given the config.
    Args:
        model (model): model to perform analysis.
        cfg (CfgNode): configs. Details can be found in
            lib/config/defaults.py
        mode (str): Options include `flop` or `activation`. Selects
            `flop_count` or `activation_count` as the counting function.
        use_train_input (bool): if True, compute statistics for training. Otherwise,
            compute statistics for testing.
    Returns:
        float: the total number of count of the given model, i.e. the sum
            over the per-operator counts returned by the counting function.
    """
    assert mode in [
        "flop",
        "activation",
    ], "'{}' not supported for model analysis".format(mode)
    if mode == "flop":
        model_stats_fun = flop_count
    elif mode == "activation":
        model_stats_fun = activation_count

    # Set model to evaluation mode for analysis.
    # Evaluation mode can avoid getting stuck with sync batchnorm.
    model_mode = model.training
    model.eval()
    try:
        inputs = _get_model_analysis_input(cfg, use_train_input)
        # The counting function returns the per-operator dict first; any
        # further return values are not needed here.
        count_dict, *_ = model_stats_fun(model, inputs)
    finally:
        # Restore the caller's train/eval mode even if counting fails.
        model.train(model_mode)
    return sum(count_dict.values())
def log_model_info(model, cfg, use_train_input=True):
    """
    Log info, includes number of parameters, gpu usage, gflops and activation count.
    Afterwards the output of the `nvidia-smi` command is emitted via
    `os.system`.
    Args:
        model (model): model to log the info.
        cfg (CfgNode): configs. Details can be found in
            lib/config/defaults.py
        use_train_input (bool): if True, log info for training. Otherwise,
            log info for testing.
    """
    logger.info("Model:\n{}".format(model))
    logger.info("Params: {:,}".format(params_count(model)))
    # gpu_mem_usage() reports gigabytes, so the label must say GB (not MB).
    logger.info("Mem: {:,} GB".format(gpu_mem_usage()))
    logger.info(
        "Flops: {:,} G".format(
            get_model_stats(model, cfg, "flop", use_train_input)
        )
    )
    logger.info(
        "Activations: {:,} M".format(
            get_model_stats(model, cfg, "activation", use_train_input)
        )
    )
    logger.info("nvidia-smi")
    os.system("nvidia-smi")
def is_eval_epoch(cfg, cur_epoch, multigrid_schedule):
    """
    Decide whether evaluation should run at the current epoch.
    Args:
        cfg (CfgNode): configs. Details can be found in
            lib/config/defaults.py
        cur_epoch (int): current epoch.
        multigrid_schedule (List): schedule for multigrid training, or None.
            Only the last element of each entry is read; it is compared
            against `cur_epoch` as the end epoch of that entry.
    Returns:
        bool: True if the model should be evaluated at `cur_epoch`.
    """
    next_epoch = cur_epoch + 1
    # The final epoch is always evaluated.
    if next_epoch == cfg.SOLVER.MAX_EPOCH:
        return True

    if multigrid_schedule is not None:
        stage_start = 0
        for stage in multigrid_schedule:
            stage_end = stage[-1]
            if cur_epoch >= stage_end:
                # This entry is already over; remember where the next starts.
                stage_start = stage_end
                continue
            # Evaluation period inside the entry, counted back from its end.
            stage_len = stage_end - stage_start
            period = max(stage_len // cfg.MULTIGRID.EVAL_FREQ + 1, 1)
            return (stage_end - 1 - cur_epoch) % period == 0

    return next_epoch % cfg.TRAIN.EVAL_PERIOD == 0
def plot_input(tensor, bboxes=(), texts=(), path="./tmp_vis.png"):
    """
    Plot the input tensor with the optional bounding box and save it to disk.
    Args:
        tensor (tensor): a tensor with shape of `NxCxHxW`.
        bboxes (tuple): per-image lists of bounding boxes, each box given as
            (x1, y1, x2, y2) corner coordinates.
        texts (tuple): a tuple of string to plot.
        path (str): path to the image to save to.
    """
    # Rescale values into [0, 1] for display.
    tensor = tensor.float()
    tensor = tensor - tensor.min()
    peak = tensor.max()
    # A constant tensor has peak == 0; skip the division to avoid NaNs.
    if peak > 0:
        tensor = tensor / peak

    num_images = tensor.shape[0]
    # squeeze=False always yields a 2-D axes array, so a single image
    # (N == 1) can be indexed the same way as several.
    f, axes = plt.subplots(
        nrows=1, ncols=num_images, figsize=(50, 20), squeeze=False
    )
    ax = axes[0]
    try:
        for i in range(num_images):
            ax[i].axis("off")
            # CxHxW -> HxWxC, the layout imshow expects.
            ax[i].imshow(tensor[i].permute(1, 2, 0))
            if bboxes is not None and len(bboxes) > i:
                for box in bboxes[i]:
                    x1, y1, x2, y2 = box
                    ax[i].vlines(x1, y1, y2, colors="g", linestyles="solid")
                    ax[i].vlines(x2, y1, y2, colors="g", linestyles="solid")
                    ax[i].hlines(y1, x1, x2, colors="g", linestyles="solid")
                    ax[i].hlines(y2, x1, x2, colors="g", linestyles="solid")

            if texts is not None and len(texts) > i:
                ax[i].text(0, 0, texts[i])
        f.savefig(path)
    finally:
        # Release the figure; otherwise repeated calls accumulate open
        # figures in pyplot's global state.
        plt.close(f)
def frozen_bn_stats(model):
    """
    Switch every `nn.BatchNorm3d` layer of the model to eval mode.
    Args:
        model (model): model whose bn layers are set to eval mode.
    """
    bn_layers = (
        layer for layer in model.modules() if isinstance(layer, nn.BatchNorm3d)
    )
    for layer in bn_layers:
        layer.eval()
def aggregate_sub_bn_stats(module):
    """
    Walk the module tree, call `aggregate_stats()` on every SubBN module
    encountered, and report how many were found. A SubBN module's own
    children are not visited.
    Args:
        module (nn.Module)
    Returns:
        count (int): number of SubBN module found.
    """
    found = 0
    for sub in module.children():
        if not isinstance(sub, SubBatchNorm3d):
            # Not a SubBN module itself: search beneath it.
            found += aggregate_sub_bn_stats(sub)
            continue
        sub.aggregate_stats()
        found += 1
    return found
def launch_job(cfg, init_method, func, daemon=False):
    """
    Run 'func' on one or more GPUs, specified in cfg.
    With `cfg.NUM_GPUS` of at most 1, `func(cfg=cfg)` is called directly in
    the current process; otherwise `cfg.NUM_GPUS` processes are spawned
    through `torch.multiprocessing.spawn` with `mpu.run` as the entry point.
    Args:
        cfg (CfgNode): configs. Details can be found in
            lib/config/defaults.py
        init_method (str): initialization method to launch the job with multiple
            devices.
        func (function): job to run on GPU(s)
        daemon (bool): The spawned processes' daemon flag. If set to True,
            daemonic processes will be created
    """
    num_procs = cfg.NUM_GPUS
    if num_procs <= 1:
        func(cfg=cfg)
        return

    # Positional arguments forwarded to mpu.run for each spawned process.
    spawn_args = (
        num_procs,
        func,
        init_method,
        cfg.SHARD_ID,
        cfg.NUM_SHARDS,
        cfg.DIST_BACKEND,
        cfg,
    )
    torch.multiprocessing.spawn(
        mpu.run,
        nprocs=num_procs,
        args=spawn_args,
        daemon=daemon,
    )
def get_class_names(path, parent_path=None, subset_path=None):
    """
    Read json file with entries {classname: index} and return
    an array of class names in order.
    If parent_path is provided, load and map all children to their ids.
    Args:
        path (str): path to class ids json file.
            File must be in the format {"class1": id1, "class2": id2, ...}
        parent_path (Optional[str]): path to parent-child json file.
            File must be in the format {"parent1": ["child1", "child2", ...], ...}
        subset_path (Optional[str]): path to text file containing a subset
            of class names, separated by newline characters.
    Returns:
        class_names (list of strs): list of class names, indexed by class id;
            ids missing from the file are left as None.
        class_parent (dict): a dictionary where key is the name of the parent class
            and value is a list of ids of the children classes. None when
            no parent_path is given.
        subset_ids (list of ints): list of ids of the classes provided in the
            subset file. None when no subset_path is given.
        If any of the files cannot be loaded, an error is printed and a bare
        None is returned instead of the tuple.
    """
    try:
        with PathManager.open(path, "r") as f:
            class2idx = json.load(f)
    except Exception as err:
        print("Fail to load file from {} with error {}".format(path, err))
        return

    # default=-1 makes an empty mapping produce an empty list instead of
    # raising ValueError from max().
    max_key = max(class2idx.values(), default=-1)
    class_names = [None] * (max_key + 1)
    for name, idx in class2idx.items():
        class_names[idx] = name

    class_parent = None
    if parent_path is not None and parent_path != "":
        try:
            with PathManager.open(parent_path, "r") as f:
                d_parent = json.load(f)
        # ValueError covers malformed JSON (json.JSONDecodeError), so a bad
        # parent file is reported the same way as a bad class-id file.
        except (EnvironmentError, ValueError) as err:
            print(
                "Fail to load file from {} with error {}".format(
                    parent_path, err
                )
            )
            return
        # Children unknown to the class-id mapping are skipped.
        class_parent = {
            parent: [class2idx[c] for c in children if c in class2idx]
            for parent, children in d_parent.items()
        }

    subset_ids = None
    if subset_path is not None and subset_path != "":
        try:
            with PathManager.open(subset_path, "r") as f:
                # splitlines() also handles "\r\n" endings; splitting on
                # "\n" alone would leave "\r" on each name and drop it.
                subset = f.read().splitlines()
        except EnvironmentError as err:
            print(
                "Fail to load file from {} with error {}".format(
                    subset_path, err
                )
            )
            return
        subset_ids = [class2idx[name] for name in subset if name in class2idx]

    return class_names, class_parent, subset_ids