# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

"""ResNe(X)t Head helper."""

import torch
import torch.nn as nn


class ResNetBasicHead(nn.Module):
    """
    ResNe(X)t 3D head.
    This layer performs a fully-connected projection during training, when the
    input size is 1x1x1. It performs a convolutional projection during testing
    when the input size is larger than 1x1x1. If the inputs are from multiple
    different pathways, the inputs will be concatenated after pooling.
    """
    def __init__(
        self,
        dim_in,
        num_classes,
        pool_size,
        dropout_rate=0.0,
        act_func="softmax",
    ):
        """
        The `__init__` method of any subclass should also contain these
        arguments.
        ResNetBasicHead takes p pathways as input where p in [1, infty].
        Args:
            dim_in (list): the list of channel dimensions of the p inputs to
                the ResNetHead.
            num_classes (int): the channel dimension of the output, i.e. the
                number of classes to predict.
            pool_size (list): the list of kernel sizes for the p spatiotemporal
                poolings; each entry is ordered as (temporal kernel size,
                spatial kernel size, spatial kernel size).
            dropout_rate (float): dropout rate. If equal to 0.0, perform no
                dropout.
            act_func (string): activation function to use. 'softmax': applies
                softmax on the output. 'sigmoid': applies sigmoid on the
                output.
        """
        super(ResNetBasicHead, self).__init__()
        assert (
            len({len(pool_size), len(dim_in)}) == 1
        ), "pathway dimensions are not consistent."
        self.num_pathways = len(pool_size)
        for pathway in range(self.num_pathways):
            # Use adaptive pooling when the pool size is unknown ahead of
            # time; otherwise pool with the given kernel down to 1x1x1.
            if pool_size[pathway] is None:
                avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
            else:
                avg_pool = nn.AvgPool3d(pool_size[pathway], stride=1)
            self.add_module("pathway{}_avgpool".format(pathway), avg_pool)

        if dropout_rate > 0.0:
            self.dropout = nn.Dropout(dropout_rate)
        # Perform FC in a fully convolutional manner. The FC layer will be
        # initialized with a different std compared to convolutional layers.
        self.projection = nn.Linear(sum(dim_in), num_classes, bias=True)

        # Softmax for evaluation and testing. dim=4 is the channel dimension
        # after the (N, C, T, H, W) -> (N, T, H, W, C) permute in `forward`.
        if act_func == "softmax":
            self.act = nn.Softmax(dim=4)
        elif act_func == "sigmoid":
            self.act = nn.Sigmoid()
        else:
            raise NotImplementedError(
                "{} is not supported as an activation function.".format(
                    act_func
                )
            )
    def forward(self, inputs):
        assert (
            len(inputs) == self.num_pathways
        ), "Input tensor does not contain {} pathways".format(
            self.num_pathways
        )
        pool_out = []
        for pathway in range(self.num_pathways):
            m = getattr(self, "pathway{}_avgpool".format(pathway))
            pool_out.append(m(inputs[pathway]))
        x = torch.cat(pool_out, 1)
        # (N, C, T, H, W) -> (N, T, H, W, C).
        x = x.permute((0, 2, 3, 4, 1))
        # Perform dropout.
        if hasattr(self, "dropout"):
            x = self.dropout(x)
        x = self.projection(x)

        # Perform fully convolutional inference: apply the activation and
        # average the predictions over the spatiotemporal dimensions.
        if not self.training:
            x = self.act(x)
            x = x.mean([1, 2, 3])

        x = x.view(x.shape[0], -1)
        return x
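

# Example usage (a minimal sketch, not part of the original file): building a
# two-pathway head as in SlowFast-style models. All dimensions and pool sizes
# below are illustrative assumptions, not values mandated by this module.
#
#     head = ResNetBasicHead(
#         dim_in=[2048, 256],
#         num_classes=400,
#         pool_size=[[4, 7, 7], [32, 7, 7]],
#         dropout_rate=0.5,
#     )
#     slow = torch.rand(2, 2048, 4, 7, 7)   # (N, C, T, H, W)
#     fast = torch.rand(2, 256, 32, 7, 7)
#     out = head([slow, fast])              # -> shape (2, 400)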


class X3DHead(nn.Module):
    """
    X3D head.
    This layer performs a fully-connected projection during training, when the
    input size is 1x1x1. It performs a convolutional projection during testing
    when the input size is larger than 1x1x1. The head expects a single
    pathway input (see `forward`).
    """
    def __init__(
        self,
        dim_in,
        dim_inner,
        dim_out,
        num_classes,
        pool_size,
        dropout_rate=0.0,
        act_func="softmax",
        inplace_relu=True,
        eps=1e-5,
        bn_mmt=0.1,
        norm_module=nn.BatchNorm3d,
        bn_lin5_on=False,
    ):
        """
        The `__init__` method of any subclass should also contain these
        arguments.
        X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input.
        Args:
            dim_in (int): the channel dimension C of the input.
            dim_inner (int): the channel dimension of the intermediate
                1x1x1 convolution (conv_5).
            dim_out (int): the channel dimension of the features fed to the
                classifier (lin_5).
            num_classes (int): the channel dimension of the output, i.e. the
                number of classes to predict.
            pool_size (list): a single entry list of kernel size for
                spatiotemporal pooling over the TxHxW dimensions, or None for
                adaptive pooling.
            dropout_rate (float): dropout rate. If equal to 0.0, perform no
                dropout.
            act_func (string): activation function to use. 'softmax': applies
                softmax on the output. 'sigmoid': applies sigmoid on the
                output.
            inplace_relu (bool): if True, calculate the relu on the original
                input without allocating new memory.
            eps (float): epsilon for batch norm.
            bn_mmt (float): momentum for batch norm. Note that BN momentum in
                PyTorch = 1 - BN momentum in Caffe2.
            norm_module (nn.Module): nn.Module for the normalization layer.
                The default is nn.BatchNorm3d.
            bn_lin5_on (bool): if True, perform normalization on the features
                before the classifier.
        """
        super(X3DHead, self).__init__()
        self.pool_size = pool_size
        self.dropout_rate = dropout_rate
        self.num_classes = num_classes
        self.act_func = act_func
        self.eps = eps
        self.bn_mmt = bn_mmt
        self.inplace_relu = inplace_relu
        self.bn_lin5_on = bn_lin5_on
        self._construct_head(dim_in, dim_inner, dim_out, norm_module)
    def _construct_head(self, dim_in, dim_inner, dim_out, norm_module):
        # 1x1x1 convolution projecting the backbone features to dim_inner
        # channels, followed by batch norm and ReLU.
        self.conv_5 = nn.Conv3d(
            dim_in,
            dim_inner,
            kernel_size=(1, 1, 1),
            stride=(1, 1, 1),
            padding=(0, 0, 0),
            bias=False,
        )
        self.conv_5_bn = norm_module(
            num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt
        )
        self.conv_5_relu = nn.ReLU(self.inplace_relu)

        if self.pool_size is None:
            self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        else:
            self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1)

        self.lin_5 = nn.Conv3d(
            dim_inner,
            dim_out,
            kernel_size=(1, 1, 1),
            stride=(1, 1, 1),
            padding=(0, 0, 0),
            bias=False,
        )
        if self.bn_lin5_on:
            self.lin_5_bn = norm_module(
                num_features=dim_out, eps=self.eps, momentum=self.bn_mmt
            )
        self.lin_5_relu = nn.ReLU(self.inplace_relu)

        if self.dropout_rate > 0.0:
            self.dropout = nn.Dropout(self.dropout_rate)
        # Perform FC in a fully convolutional manner. The FC layer will be
        # initialized with a different std compared to convolutional layers.
        self.projection = nn.Linear(dim_out, self.num_classes, bias=True)

        # Softmax for evaluation and testing. dim=4 is the channel dimension
        # after the (N, C, T, H, W) -> (N, T, H, W, C) permute in `forward`.
        if self.act_func == "softmax":
            self.act = nn.Softmax(dim=4)
        elif self.act_func == "sigmoid":
            self.act = nn.Sigmoid()
        else:
            raise NotImplementedError(
                "{} is not supported as an activation function.".format(
                    self.act_func
                )
            )
    def forward(self, inputs):
        # In its current design the X3D head is only usable for a single
        # pathway input.
        assert len(inputs) == 1, "Input tensor does not contain 1 pathway"
        x = self.conv_5(inputs[0])
        x = self.conv_5_bn(x)
        x = self.conv_5_relu(x)
        x = self.avg_pool(x)

        x = self.lin_5(x)
        if self.bn_lin5_on:
            x = self.lin_5_bn(x)
        x = self.lin_5_relu(x)

        # (N, C, T, H, W) -> (N, T, H, W, C).
        x = x.permute((0, 2, 3, 4, 1))
        # Perform dropout.
        if hasattr(self, "dropout"):
            x = self.dropout(x)
        x = self.projection(x)

        # Perform fully convolutional inference: apply the activation and
        # average the predictions over the spatiotemporal dimensions.
        if not self.training:
            x = self.act(x)
            x = x.mean([1, 2, 3])

        x = x.view(x.shape[0], -1)
        return x
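

# Smoke test (a minimal sketch, not part of the original file). The channel
# dimensions below roughly follow an X3D-M configuration but are illustrative
# assumptions, not values mandated by this module.
if __name__ == "__main__":
    head = X3DHead(
        dim_in=192,
        dim_inner=432,
        dim_out=2048,
        num_classes=400,
        pool_size=None,  # None -> adaptive pooling down to 1x1x1.
        dropout_rate=0.5,
    )
    clip = torch.rand(2, 192, 16, 7, 7)  # (N, C, T, H, W)
    out = head([clip])
    print(out.shape)  # torch.Size([2, 400])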