import warnings | |
from .distance import PairwiseDistance | |
from .module import Module | |
from .. import functional as F | |
from .. import _reduction as _Reduction | |
from torch import Tensor | |
from typing import Callable, Optional | |
__all__ = ['L1Loss', 'NLLLoss', 'NLLLoss2d', 'PoissonNLLLoss', 'GaussianNLLLoss', 'KLDivLoss', | |
'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss', | |
'SmoothL1Loss', 'HuberLoss', 'SoftMarginLoss', 'CrossEntropyLoss', 'MultiLabelSoftMarginLoss', | |
'CosineEmbeddingLoss', 'MarginRankingLoss', 'MultiMarginLoss', 'TripletMarginLoss', | |
'TripletMarginWithDistanceLoss', 'CTCLoss'] | |
class _Loss(Module): | |
reduction: str | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__() | |
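# The deprecated size_average / reduce flags are folded into a single reduction
# string (roughly: reduce=False -> 'none', size_average=False -> 'sum', otherwise 'mean').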
if size_average is not None or reduce is not None: | |
self.reduction: str = _Reduction.legacy_get_string(size_average, reduce) | |
else: | |
self.reduction = reduction | |
class _WeightedLoss(_Loss): | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.register_buffer('weight', weight)  # registered as a buffer: moves with the module (.to()/.cuda()) and is saved in the state_dict
self.weight: Optional[Tensor] | |
class L1Loss(_Loss): | |
r"""Creates a criterion that measures the mean absolute error (MAE) between each element in | |
the input :math:`x` and target :math:`y`. | |
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = \left| x_n - y_n \right|, | |
where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` | |
(default ``'mean'``), then: | |
.. math:: | |
\ell(x, y) = | |
\begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total | |
of :math:`n` elements each. | |
The sum operation still operates over all the elements, and divides by :math:`n`. | |
The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``. | |
Supports real-valued and complex-valued inputs. | |
Args: | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then | |
:math:`(*)`, same shape as the input. | |
Examples:: | |
>>> loss = nn.L1Loss() | |
>>> input = torch.randn(3, 5, requires_grad=True) | |
>>> target = torch.randn(3, 5) | |
>>> output = loss(input, target) | |
>>> output.backward() | |
""" | |
__constants__ = ['reduction'] | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.l1_loss(input, target, reduction=self.reduction) | |
class NLLLoss(_WeightedLoss): | |
r"""The negative log likelihood loss. It is useful to train a classification | |
problem with `C` classes. | |
If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning | |
weight to each of the classes. This is particularly useful when you have an | |
unbalanced training set. | |
The `input` given through a forward call is expected to contain | |
log-probabilities of each class. `input` has to be a Tensor of size either | |
:math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` | |
with :math:`K \geq 1` for the `K`-dimensional case. The latter is useful for | |
higher-dimensional inputs, such as computing NLL loss per-pixel for 2D images.
Obtaining log-probabilities in a neural network is easily achieved by | |
adding a `LogSoftmax` layer in the last layer of your network. | |
You may use `CrossEntropyLoss` instead, if you prefer not to add an extra | |
layer. | |
The `target` that this loss expects should be a class index in the range :math:`[0, C-1]` | |
where `C = number of classes`; if `ignore_index` is specified, this loss also accepts | |
this class index (this index may not necessarily be in the class range). | |
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = - w_{y_n} x_{n,y_n}, \quad | |
w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, | |
where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and | |
:math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` | |
(default ``'mean'``), then | |
.. math:: | |
\ell(x, y) = \begin{cases} | |
\sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & | |
\text{if reduction} = \text{`mean';}\\ | |
\sum_{n=1}^N l_n, & | |
\text{if reduction} = \text{`sum'.} | |
\end{cases} | |
Args: | |
weight (Tensor, optional): a manual rescaling weight given to each | |
class. If given, it has to be a Tensor of size `C`. Otherwise, it is | |
treated as if having all ones. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``None`` | |
ignore_index (int, optional): Specifies a target value that is ignored | |
and does not contribute to the input gradient. When | |
:attr:`size_average` is ``True``, the loss is averaged over | |
non-ignored targets. | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``None`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will | |
be applied, ``'mean'``: the weighted mean of the output is taken, | |
``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in | |
the meantime, specifying either of those two args will override | |
:attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, or | |
:math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` | |
in the case of `K`-dimensional loss. | |
- Target: :math:`(N)` or :math:`()`, where each value is | |
:math:`0 \leq \text{targets}[i] \leq C-1`, or | |
:math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of | |
K-dimensional loss. | |
- Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or | |
:math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss. | |
Otherwise, scalar. | |
Examples:: | |
>>> m = nn.LogSoftmax(dim=1) | |
>>> loss = nn.NLLLoss() | |
>>> # input is of size N x C = 3 x 5 | |
>>> input = torch.randn(3, 5, requires_grad=True) | |
>>> # each element in target has to have 0 <= value < C | |
>>> target = torch.tensor([1, 0, 4]) | |
>>> output = loss(m(input), target) | |
>>> output.backward() | |
>>> | |
>>> | |
>>> # 2D loss example (used, for example, with image inputs) | |
>>> N, C = 5, 4 | |
>>> loss = nn.NLLLoss() | |
>>> # input is of size N x C x height x width | |
>>> data = torch.randn(N, 16, 10, 10) | |
>>> conv = nn.Conv2d(16, C, (3, 3)) | |
>>> m = nn.LogSoftmax(dim=1) | |
>>> # each element in target has to have 0 <= value < C | |
>>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) | |
>>> output = loss(m(conv(data)), target) | |
>>> output.backward() | |
""" | |
__constants__ = ['ignore_index', 'reduction'] | |
ignore_index: int | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, | |
reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(weight, size_average, reduce, reduction) | |
self.ignore_index = ignore_index | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction) | |
class NLLLoss2d(NLLLoss): | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, | |
reduce=None, reduction: str = 'mean') -> None: | |
warnings.warn("NLLLoss2d has been deprecated. " | |
"Please use NLLLoss instead as a drop-in replacement and see " | |
"https://pytorch.org/docs/master/nn.html#torch.nn.NLLLoss for more details.") | |
super().__init__(weight, size_average, ignore_index, reduce, reduction) | |
class PoissonNLLLoss(_Loss): | |
r"""Negative log likelihood loss with Poisson distribution of target. | |
The loss can be described as: | |
.. math:: | |
\text{target} \sim \mathrm{Poisson}(\text{input})

\text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
+ \log(\text{target!})
The last term can be omitted or approximated with Stirling's formula. The
approximation is used for target values greater than 1. For targets less than or
equal to 1, zeros are added to the loss.
Args: | |
log_input (bool, optional): if ``True`` the loss is computed as | |
:math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is | |
:math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`. | |
full (bool, optional): whether to compute the full loss, i.e. to add the
Stirling approximation term
.. math:: | |
\text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}). | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when | |
:attr:`log_input = False`. Default: 1e-8 | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Examples:: | |
>>> loss = nn.PoissonNLLLoss() | |
>>> log_input = torch.randn(5, 2, requires_grad=True) | |
>>> target = torch.randn(5, 2) | |
>>> output = loss(log_input, target) | |
>>> output.backward() | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(*)`, | |
the same shape as the input. | |
""" | |
__constants__ = ['log_input', 'full', 'eps', 'reduction'] | |
log_input: bool | |
full: bool | |
eps: float | |
def __init__(self, log_input: bool = True, full: bool = False, size_average=None, | |
eps: float = 1e-8, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.log_input = log_input | |
self.full = full | |
self.eps = eps | |
def forward(self, log_input: Tensor, target: Tensor) -> Tensor: | |
return F.poisson_nll_loss(log_input, target, log_input=self.log_input, full=self.full, | |
eps=self.eps, reduction=self.reduction) | |
class GaussianNLLLoss(_Loss): | |
r"""Gaussian negative log likelihood loss. | |
The targets are treated as samples from Gaussian distributions with | |
expectations and variances predicted by the neural network. For a | |
``target`` tensor modelled as having a Gaussian distribution with a tensor
of expectations ``input`` and a tensor of positive variances ``var``, the loss is:
.. math:: | |
\text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var}, | |
\ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{target}\right)^2} | |
{\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.} | |
where :attr:`eps` is used for stability. By default, the constant term of | |
the loss function is omitted unless :attr:`full` is ``True``. If ``var`` is not the same | |
size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension | |
of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting. | |
Args: | |
full (bool, optional): include the constant term in the loss | |
calculation. Default: ``False``. | |
eps (float, optional): value used to clamp ``var`` (see note below), for | |
stability. Default: 1e-6. | |
reduction (str, optional): specifies the reduction to apply to the | |
output:``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction | |
will be applied, ``'mean'``: the output is the average of all batch | |
member losses, ``'sum'``: the output is the sum of all batch member | |
losses. Default: ``'mean'``. | |
Shape: | |
- Input: :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional | |
dimensions | |
- Target: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input | |
but with one dimension equal to 1 (to allow for broadcasting) | |
- Var: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but | |
with one dimension equal to 1, or same shape as the input but with one fewer | |
dimension (to allow for broadcasting) | |
- Output: scalar if :attr:`reduction` is ``'mean'`` (default) or | |
``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same | |
shape as the input | |
Examples:: | |
>>> loss = nn.GaussianNLLLoss() | |
>>> input = torch.randn(5, 2, requires_grad=True) | |
>>> target = torch.randn(5, 2) | |
>>> var = torch.ones(5, 2, requires_grad=True) # heteroscedastic | |
>>> output = loss(input, target, var) | |
>>> output.backward() | |
>>> loss = nn.GaussianNLLLoss() | |
>>> input = torch.randn(5, 2, requires_grad=True) | |
>>> target = torch.randn(5, 2) | |
>>> var = torch.ones(5, 1, requires_grad=True) # homoscedastic | |
>>> output = loss(input, target, var) | |
>>> output.backward() | |
Note: | |
The clamping of ``var`` is ignored with respect to autograd, and so the | |
gradients are unaffected by it. | |
Reference: | |
Nix, D. A. and Weigend, A. S., "Estimating the mean and variance of the | |
target probability distribution", Proceedings of 1994 IEEE International | |
Conference on Neural Networks (ICNN'94), Orlando, FL, USA, 1994, pp. 55-60 | |
vol.1, doi: 10.1109/ICNN.1994.374138. | |
""" | |
__constants__ = ['full', 'eps', 'reduction'] | |
full: bool | |
eps: float | |
def __init__(self, *, full: bool = False, eps: float = 1e-6, reduction: str = 'mean') -> None: | |
super().__init__(None, None, reduction) | |
self.full = full | |
self.eps = eps | |
def forward(self, input: Tensor, target: Tensor, var: Tensor) -> Tensor: | |
return F.gaussian_nll_loss(input, target, var, full=self.full, eps=self.eps, reduction=self.reduction) | |
class KLDivLoss(_Loss): | |
r"""The Kullback-Leibler divergence loss. | |
For tensors of the same shape :math:`y_{\text{pred}},\ y_{\text{true}}`, | |
where :math:`y_{\text{pred}}` is the :attr:`input` and :math:`y_{\text{true}}` is the | |
:attr:`target`, we define the **pointwise KL-divergence** as | |
.. math:: | |
L(y_{\text{pred}},\ y_{\text{true}}) | |
= y_{\text{true}} \cdot \log \frac{y_{\text{true}}}{y_{\text{pred}}} | |
= y_{\text{true}} \cdot (\log y_{\text{true}} - \log y_{\text{pred}}) | |
To avoid underflow issues when computing this quantity, this loss expects the argument | |
:attr:`input` in the log-space. The argument :attr:`target` may also be provided in the | |
log-space if :attr:`log_target`\ `= True`. | |
To summarise, this function is roughly equivalent to computing | |
.. code-block:: python | |
if not log_target: # default | |
loss_pointwise = target * (target.log() - input) | |
else: | |
loss_pointwise = target.exp() * (target - input) | |
and then reducing this result depending on the argument :attr:`reduction` as | |
.. code-block:: python | |
if reduction == "mean": # default | |
loss = loss_pointwise.mean() | |
elif reduction == "batchmean": # mathematically correct | |
loss = loss_pointwise.sum() / input.size(0) | |
elif reduction == "sum": | |
loss = loss_pointwise.sum() | |
else: # reduction == "none" | |
loss = loss_pointwise | |
.. note:: | |
As all the other losses in PyTorch, this function expects the first argument, | |
:attr:`input`, to be the output of the model (e.g. the neural network) | |
and the second, :attr:`target`, to be the observations in the dataset. | |
This differs from the standard mathematical notation :math:`KL(P\ ||\ Q)` where | |
:math:`P` denotes the distribution of the observations and :math:`Q` denotes the model. | |
.. warning:: | |
:attr:`reduction`\ `= "mean"` doesn't return the true KL divergence value, please use | |
:attr:`reduction`\ `= "batchmean"` which aligns with the mathematical definition. | |
Args: | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to `False`, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is `False`. Default: `True` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is `False`, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: `True` | |
reduction (str, optional): Specifies the reduction to apply to the output. Default: `"mean"` | |
log_target (bool, optional): Specifies whether `target` is passed in the log space. Default: `False`
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar by default. If :attr:`reduction` is `'none'`, then :math:`(*)`, | |
same shape as the input. | |
Examples:: | |
>>> import torch.nn.functional as F | |
>>> kl_loss = nn.KLDivLoss(reduction="batchmean") | |
>>> # input should be a distribution in the log space | |
>>> input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1) | |
>>> # Sample a batch of distributions. Usually this would come from the dataset | |
>>> target = F.softmax(torch.rand(3, 5), dim=1) | |
>>> output = kl_loss(input, target) | |
>>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True) | |
>>> log_target = F.log_softmax(torch.rand(3, 5), dim=1) | |
>>> output = kl_loss(input, log_target) | |
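>>> # Note (illustrative): for this (3, 5) input, ``reduction="batchmean"`` divides the
>>> # summed pointwise loss by the batch size 3, whereas ``reduction="mean"`` would
>>> # divide by all 15 elements.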
""" | |
__constants__ = ['reduction'] | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', log_target: bool = False) -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.log_target = log_target | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.kl_div(input, target, reduction=self.reduction, log_target=self.log_target) | |
class MSELoss(_Loss): | |
r"""Creates a criterion that measures the mean squared error (squared L2 norm) between | |
each element in the input :math:`x` and target :math:`y`. | |
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = \left( x_n - y_n \right)^2, | |
where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` | |
(default ``'mean'``), then: | |
.. math:: | |
\ell(x, y) = | |
\begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total | |
of :math:`n` elements each. | |
The mean operation still operates over all the elements, and divides by :math:`n`. | |
The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``. | |
Args: | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input.
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input.
Examples:: | |
>>> loss = nn.MSELoss() | |
>>> input = torch.randn(3, 5, requires_grad=True) | |
>>> target = torch.randn(3, 5) | |
>>> output = loss(input, target) | |
>>> output.backward() | |
""" | |
__constants__ = ['reduction'] | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.mse_loss(input, target, reduction=self.reduction) | |
class BCELoss(_WeightedLoss): | |
r"""Creates a criterion that measures the Binary Cross Entropy between the target and | |
the input probabilities: | |
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right], | |
where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` | |
(default ``'mean'``), then | |
.. math:: | |
\ell(x, y) = \begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
This is used for measuring the error of a reconstruction in, for example,
an auto-encoder. Note that the targets :math:`y` should be numbers | |
between 0 and 1. | |
Notice that if :math:`x_n` is either 0 or 1, one of the log terms would be | |
mathematically undefined in the above loss equation. PyTorch chooses to set | |
:math:`\log (0) = -\infty`, since :math:`\lim_{x\to 0} \log (x) = -\infty`. | |
However, an infinite term in the loss equation is not desirable for several reasons. | |
For one, if either :math:`y_n = 0` or :math:`(1 - y_n) = 0`, then we would be | |
multiplying 0 with infinity. Secondly, if we have an infinite loss value, then | |
we would also have an infinite term in our gradient, since | |
:math:`\lim_{x\to 0} \frac{d}{dx} \log (x) = \infty`. | |
This would make BCELoss's backward method nonlinear with respect to :math:`x_n`, | |
and using it for things like linear regression would not be straightforward.
Our solution is that BCELoss clamps its log function outputs to be greater than | |
or equal to -100. This way, we can always have a finite loss value and a linear | |
backward method. | |
Args: | |
weight (Tensor, optional): a manual rescaling weight given to the loss | |
of each batch element. If given, has to be a Tensor of size `nbatch`. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same | |
shape as input. | |
Examples:: | |
>>> m = nn.Sigmoid() | |
>>> loss = nn.BCELoss() | |
>>> input = torch.randn(3, 2, requires_grad=True) | |
>>> target = torch.rand(3, 2, requires_grad=False) | |
>>> output = loss(m(input), target) | |
>>> output.backward() | |
""" | |
__constants__ = ['reduction'] | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(weight, size_average, reduce, reduction) | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction) | |
class BCEWithLogitsLoss(_Loss): | |
r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single | |
class. This version is more numerically stable than using a plain `Sigmoid` | |
followed by a `BCELoss` as, by combining the operations into one layer, | |
we take advantage of the log-sum-exp trick for numerical stability. | |
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = - w_n \left[ y_n \cdot \log \sigma(x_n) | |
+ (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right], | |
where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` | |
(default ``'mean'``), then | |
.. math:: | |
\ell(x, y) = \begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
This is used for measuring the error of a reconstruction in, for example,
an auto-encoder. Note that the targets `t[i]` should be numbers | |
between 0 and 1. | |
It's possible to trade off recall and precision by adding weights to positive examples. | |
In the case of multi-label classification the loss can be described as: | |
.. math:: | |
\ell_c(x, y) = L_c = \{l_{1,c},\dots,l_{N,c}\}^\top, \quad | |
l_{n,c} = - w_{n,c} \left[ p_c y_{n,c} \cdot \log \sigma(x_{n,c}) | |
+ (1 - y_{n,c}) \cdot \log (1 - \sigma(x_{n,c})) \right], | |
where :math:`c` is the class number (:math:`c > 1` for multi-label binary classification, | |
:math:`c = 1` for single-label binary classification), | |
:math:`n` is the number of the sample in the batch and | |
:math:`p_c` is the weight of the positive answer for the class :math:`c`. | |
:math:`p_c > 1` increases the recall, :math:`p_c < 1` increases the precision. | |
For example, if a dataset contains 100 positive and 300 negative examples of a single class, | |
then ``pos_weight`` for the class should be equal to :math:`\frac{300}{100}=3`. | |
The loss would act as if the dataset contains :math:`3\times 100=300` positive examples. | |
Examples:: | |
>>> target = torch.ones([10, 64], dtype=torch.float32) # 64 classes, batch size = 10 | |
>>> output = torch.full([10, 64], 1.5) # A prediction (logit) | |
>>> pos_weight = torch.ones([64]) # All weights are equal to 1 | |
>>> criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) | |
>>> criterion(output, target) # -log(sigmoid(1.5)) | |
tensor(0.20...) | |
In the above example, the ``pos_weight`` tensor's elements correspond to the 64 distinct classes | |
in a multi-label binary classification scenario. Each element in ``pos_weight`` is designed to adjust the | |
loss function based on the imbalance between negative and positive samples for the respective class. | |
This approach is useful in datasets with varying levels of class imbalance, ensuring that the loss | |
calculation accurately accounts for the distribution in each class. | |
Args: | |
weight (Tensor, optional): a manual rescaling weight given to the loss | |
of each batch element. If given, has to be a Tensor of size `nbatch`. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
pos_weight (Tensor, optional): a weight of positive examples to be broadcasted with target. | |
Must be a tensor with equal size along the class dimension to the number of classes. | |
Pay close attention to PyTorch's broadcasting semantics in order to achieve the desired | |
operations. For a target of size [B, C, H, W] (where B is batch size) pos_weight of | |
size [B, C, H, W] will apply different pos_weights to each element of the batch or | |
[C, H, W] the same pos_weights across the batch. To apply the same positive weight | |
along all spatial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1].
Default: ``None`` | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same | |
shape as input. | |
Examples:: | |
>>> loss = nn.BCEWithLogitsLoss() | |
>>> input = torch.randn(3, requires_grad=True) | |
>>> target = torch.empty(3).random_(2) | |
>>> output = loss(input, target) | |
>>> output.backward() | |
""" | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', | |
pos_weight: Optional[Tensor] = None) -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.register_buffer('weight', weight) | |
self.register_buffer('pos_weight', pos_weight) | |
self.weight: Optional[Tensor] | |
self.pos_weight: Optional[Tensor] | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.binary_cross_entropy_with_logits(input, target, | |
self.weight, | |
pos_weight=self.pos_weight, | |
reduction=self.reduction) | |
class HingeEmbeddingLoss(_Loss): | |
r"""Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y` | |
(containing 1 or -1). | |
This is usually used for measuring whether two inputs are similar or | |
dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically | |
used for learning nonlinear embeddings or semi-supervised learning. | |
The loss function for the :math:`n`-th sample in the mini-batch is
.. math:: | |
l_n = \begin{cases} | |
x_n, & \text{if}\; y_n = 1,\\ | |
\max \{0, margin - x_n\}, & \text{if}\; y_n = -1, | |
\end{cases} | |
and the total loss function is
.. math:: | |
\ell(x, y) = \begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
where :math:`L = \{l_1,\dots,l_N\}^\top`. | |
Args: | |
margin (float, optional): Has a default value of `1`. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(*)` where :math:`*` means any number of dimensions. The sum operation
operates over all the elements. | |
- Target: :math:`(*)`, same shape as the input | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input | |
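Examples (a minimal usage sketch; the shapes below are illustrative)::
>>> loss = nn.HingeEmbeddingLoss()
>>> # input is typically a distance, e.g. an L1 pairwise distance
>>> input = torch.randn(3, 5, requires_grad=True)
>>> # target labels must be 1 or -1
>>> target = torch.randint(0, 2, (3, 5)).float() * 2 - 1
>>> output = loss(input, target)
>>> output.backward()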
""" | |
__constants__ = ['margin', 'reduction'] | |
margin: float | |
def __init__(self, margin: float = 1.0, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.margin = margin | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.hinge_embedding_loss(input, target, margin=self.margin, reduction=self.reduction) | |
class MultiLabelMarginLoss(_Loss): | |
r"""Creates a criterion that optimizes a multi-class multi-classification | |
hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) | |
and output :math:`y` (which is a 2D `Tensor` of target class indices). | |
For each sample in the mini-batch: | |
.. math:: | |
\text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)} | |
where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
:math:`j \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
:math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \ | |
and :math:`i \neq y[j]` for all :math:`i` and :math:`j`. | |
:math:`y` and :math:`x` must have the same size. | |
The criterion only considers a contiguous block of non-negative targets that | |
starts at the front. | |
This allows for different samples to have variable amounts of target classes. | |
Args: | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C` | |
is the number of classes. | |
- Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. | |
Examples:: | |
>>> loss = nn.MultiLabelMarginLoss() | |
>>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]]) | |
>>> # for target y, only consider labels 3 and 0, not after label -1 | |
>>> y = torch.LongTensor([[3, 0, -1, 1]]) | |
>>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4))) | |
>>> loss(x, y) | |
tensor(0.85...) | |
""" | |
__constants__ = ['reduction'] | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.multilabel_margin_loss(input, target, reduction=self.reduction) | |
class SmoothL1Loss(_Loss): | |
r"""Creates a criterion that uses a squared term if the absolute | |
element-wise error falls below beta and an L1 term otherwise. | |
It is less sensitive to outliers than :class:`torch.nn.MSELoss` and in some cases | |
prevents exploding gradients (e.g. see the paper `Fast R-CNN`_ by Ross Girshick). | |
For a batch of size :math:`N`, the unreduced loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1, ..., l_N\}^T | |
with | |
.. math:: | |
l_n = \begin{cases} | |
0.5 (x_n - y_n)^2 / beta, & \text{if } |x_n - y_n| < beta \\ | |
|x_n - y_n| - 0.5 * beta, & \text{otherwise } | |
\end{cases} | |
If `reduction` is not `none`, then: | |
.. math:: | |
\ell(x, y) = | |
\begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
.. note:: | |
Smooth L1 loss can be seen as exactly :class:`L1Loss`, but with the :math:`|x - y| < beta` | |
portion replaced with a quadratic function such that its slope is 1 at :math:`|x - y| = beta`. | |
The quadratic segment smooths the L1 loss near :math:`|x - y| = 0`. | |
.. note:: | |
Smooth L1 loss is closely related to :class:`HuberLoss`, being | |
equivalent to :math:`huber(x, y) / beta` (note that Smooth L1's beta hyper-parameter is | |
also known as delta for Huber). This leads to the following differences: | |
* As beta -> 0, Smooth L1 loss converges to :class:`L1Loss`, while :class:`HuberLoss` | |
converges to a constant 0 loss. When beta is 0, Smooth L1 loss is equivalent to L1 loss. | |
* As beta -> :math:`+\infty`, Smooth L1 loss converges to a constant 0 loss, while | |
:class:`HuberLoss` converges to :class:`MSELoss`. | |
* For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant slope of 1. | |
For :class:`HuberLoss`, the slope of the L1 segment is beta. | |
.. _`Fast R-CNN`: https://arxiv.org/abs/1504.08083 | |
Args: | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
beta (float, optional): Specifies the threshold at which to change between L1 and L2 loss. | |
The value must be non-negative. Default: 1.0 | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. | |
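Examples (a minimal usage sketch; the shapes and ``beta`` value are illustrative)::
>>> loss = nn.SmoothL1Loss(beta=1.0)
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.randn(3, 5)
>>> output = loss(input, target)
>>> output.backward()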
""" | |
__constants__ = ['reduction'] | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', beta: float = 1.0) -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.beta = beta | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta) | |
class HuberLoss(_Loss): | |
r"""Creates a criterion that uses a squared term if the absolute | |
element-wise error falls below delta and a delta-scaled L1 term otherwise. | |
This loss combines advantages of both :class:`L1Loss` and :class:`MSELoss`; the | |
delta-scaled L1 region makes the loss less sensitive to outliers than :class:`MSELoss`, | |
while the L2 region provides smoothness over :class:`L1Loss` near 0. See | |
`Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`_ for more information. | |
For a batch of size :math:`N`, the unreduced loss can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1, ..., l_N\}^T | |
with | |
.. math:: | |
l_n = \begin{cases} | |
0.5 (x_n - y_n)^2, & \text{if } |x_n - y_n| < delta \\ | |
delta * (|x_n - y_n| - 0.5 * delta), & \text{otherwise } | |
\end{cases} | |
If `reduction` is not `none`, then: | |
.. math:: | |
\ell(x, y) = | |
\begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
.. note:: | |
When delta is set to 1, this loss is equivalent to :class:`SmoothL1Loss`. | |
In general, this loss differs from :class:`SmoothL1Loss` by a factor of delta (AKA beta | |
in Smooth L1). | |
See :class:`SmoothL1Loss` for additional discussion on the differences in behavior | |
between the two losses. | |
Args: | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'`` | |
delta (float, optional): Specifies the threshold at which to change between delta-scaled L1 and L2 loss. | |
The value must be positive. Default: 1.0 | |
Shape: | |
- Input: :math:`(*)` where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. | |
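Examples (a minimal usage sketch; the shapes and ``delta`` value are illustrative)::
>>> loss = nn.HuberLoss(delta=1.0)
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.randn(3, 5)
>>> output = loss(input, target)
>>> output.backward()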
""" | |
__constants__ = ['reduction', 'delta'] | |
def __init__(self, reduction: str = 'mean', delta: float = 1.0) -> None: | |
super().__init__(reduction=reduction) | |
self.delta = delta | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.huber_loss(input, target, reduction=self.reduction, delta=self.delta) | |
class SoftMarginLoss(_Loss): | |
r"""Creates a criterion that optimizes a two-class classification | |
logistic loss between input tensor :math:`x` and target tensor :math:`y` | |
(containing 1 or -1). | |
.. math:: | |
\text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()} | |
Args: | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(*)`, where :math:`*` means any number of dimensions. | |
- Target: :math:`(*)`, same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same | |
shape as input. | |
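Examples (a minimal usage sketch; the shapes below are illustrative)::
>>> loss = nn.SoftMarginLoss()
>>> input = torch.randn(3, 5, requires_grad=True)
>>> # target labels must be 1 or -1
>>> target = torch.randint(0, 2, (3, 5)).float() * 2 - 1
>>> output = loss(input, target)
>>> output.backward()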
""" | |
__constants__ = ['reduction'] | |
def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.soft_margin_loss(input, target, reduction=self.reduction) | |
class CrossEntropyLoss(_WeightedLoss): | |
r"""This criterion computes the cross entropy loss between input logits | |
and target. | |
It is useful when training a classification problem with `C` classes. | |
If provided, the optional argument :attr:`weight` should be a 1D `Tensor` | |
assigning weight to each of the classes. | |
This is particularly useful when you have an unbalanced training set. | |
The `input` is expected to contain the unnormalized logits for each class (which do `not` need | |
to be positive or sum to 1, in general). | |
`input` has to be a Tensor of size :math:`(C)` for unbatched input, | |
:math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` for the | |
`K`-dimensional case. The latter is useful for higher-dimensional inputs, such
as computing cross entropy loss per-pixel for 2D images. | |
The `target` that this criterion expects should contain either: | |
- Class indices in the range :math:`[0, C)` where :math:`C` is the number of classes; if | |
`ignore_index` is specified, this loss also accepts this class index (this index | |
may not necessarily be in the class range). The unreduced (i.e. with :attr:`reduction` | |
set to ``'none'``) loss for this case can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})} | |
\cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\} | |
where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, | |
:math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as | |
:math:`d_1, ..., d_k` for the `K`-dimensional case. If | |
:attr:`reduction` is not ``'none'`` (default ``'mean'``), then | |
.. math:: | |
\ell(x, y) = \begin{cases} | |
\sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}} l_n, & | |
\text{if reduction} = \text{`mean';}\\ | |
\sum_{n=1}^N l_n, & | |
\text{if reduction} = \text{`sum'.} | |
\end{cases} | |
Note that this case is equivalent to applying :class:`~torch.nn.LogSoftmax` | |
on an input, followed by :class:`~torch.nn.NLLLoss`. | |
- Probabilities for each class; useful when labels beyond a single class per minibatch item | |
are required, such as for blended labels, label smoothing, etc. The unreduced (i.e. with | |
:attr:`reduction` set to ``'none'``) loss for this case can be described as: | |
.. math:: | |
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_n = - \sum_{c=1}^C w_c \log \frac{\exp(x_{n,c})}{\sum_{i=1}^C \exp(x_{n,i})} y_{n,c} | |
where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, | |
:math:`C` is the number of classes, and :math:`N` spans the minibatch dimension as well as | |
:math:`d_1, ..., d_k` for the `K`-dimensional case. If | |
:attr:`reduction` is not ``'none'`` (default ``'mean'``), then | |
.. math:: | |
\ell(x, y) = \begin{cases} | |
\frac{\sum_{n=1}^N l_n}{N}, & | |
\text{if reduction} = \text{`mean';}\\ | |
\sum_{n=1}^N l_n, & | |
\text{if reduction} = \text{`sum'.} | |
\end{cases} | |
.. note:: | |
The performance of this criterion is generally better when `target` contains class | |
indices, as this allows for optimized computation. Consider providing `target` as | |
class probabilities only when a single class label per minibatch item is too restrictive. | |
Args: | |
weight (Tensor, optional): a manual rescaling weight given to each class. | |
If given, has to be a Tensor of size `C` and floating point dtype | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
ignore_index (int, optional): Specifies a target value that is ignored | |
and does not contribute to the input gradient. When :attr:`size_average` is | |
``True``, the loss is averaged over non-ignored targets. Note that | |
:attr:`ignore_index` is only applicable when the target contains class indices. | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will | |
be applied, ``'mean'``: the weighted mean of the output is taken, | |
``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in | |
the meantime, specifying either of those two args will override | |
:attr:`reduction`. Default: ``'mean'`` | |
label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount | |
of smoothing when computing the loss, where 0.0 means no smoothing. The targets | |
become a mixture of the original ground truth and a uniform distribution as described in | |
`Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`. | |
Shape: | |
- Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` | |
in the case of `K`-dimensional loss. | |
- Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with | |
:math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. | |
If containing class probabilities, same shape as the input and each value should be between :math:`[0, 1]`. | |
- Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` | |
in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar. | |
where: | |
.. math:: | |
\begin{aligned} | |
C ={} & \text{number of classes} \\ | |
N ={} & \text{batch size} \\ | |
\end{aligned} | |
Examples:: | |
>>> # Example of target with class indices | |
>>> loss = nn.CrossEntropyLoss() | |
>>> input = torch.randn(3, 5, requires_grad=True) | |
>>> target = torch.empty(3, dtype=torch.long).random_(5) | |
>>> output = loss(input, target) | |
>>> output.backward() | |
>>> | |
>>> # Example of target with class probabilities | |
>>> input = torch.randn(3, 5, requires_grad=True) | |
>>> target = torch.randn(3, 5).softmax(dim=1) | |
>>> output = loss(input, target) | |
>>> output.backward() | |
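>>>
>>> # Example with label smoothing (the value 0.1 is an arbitrary illustration)
>>> loss = nn.CrossEntropyLoss(label_smoothing=0.1)
>>> target = torch.empty(3, dtype=torch.long).random_(5)
>>> output = loss(input, target)
>>> output.backward()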
""" | |
__constants__ = ['ignore_index', 'reduction', 'label_smoothing'] | |
ignore_index: int | |
label_smoothing: float | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, | |
reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None: | |
super().__init__(weight, size_average, reduce, reduction) | |
self.ignore_index = ignore_index | |
self.label_smoothing = label_smoothing | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.cross_entropy(input, target, weight=self.weight, | |
ignore_index=self.ignore_index, reduction=self.reduction, | |
label_smoothing=self.label_smoothing) | |
class MultiLabelSoftMarginLoss(_WeightedLoss): | |
r"""Creates a criterion that optimizes a multi-label one-versus-all | |
loss based on max-entropy, between input :math:`x` and target :math:`y` of size | |
:math:`(N, C)`. | |
For each sample in the minibatch: | |
.. math:: | |
loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1}) | |
+ (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right) | |
where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`, | |
:math:`y[i] \in \left\{0, \; 1\right\}`. | |
Args: | |
weight (Tensor, optional): a manual rescaling weight given to each | |
class. If given, it has to be a Tensor of size `C`. Otherwise, it is | |
treated as if having all ones. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes. | |
- Target: :math:`(N, C)`, label targets must have the same shape as the input. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. | |
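Examples:: | |
>>> # For illustration: three samples, five classes, with multi-hot binary targets. | |
>>> loss = nn.MultiLabelSoftMarginLoss() | |
>>> input = torch.randn(3, 5, requires_grad=True) | |
>>> target = torch.empty(3, 5).random_(2) | |
>>> output = loss(input, target) | |
>>> output.backward() | |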
""" | |
__constants__ = ['reduction'] | |
def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(weight, size_average, reduce, reduction) | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction) | |
class CosineEmbeddingLoss(_Loss): | |
r"""Creates a criterion that measures the loss given input tensors | |
:math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1. | |
Use :math:`y=1` to maximize the cosine similarity of the two inputs, and :math:`y=-1` otherwise. | |
This is typically used for learning nonlinear | |
embeddings or semi-supervised learning. | |
The loss function for each sample is: | |
.. math:: | |
\text{loss}(x, y) = | |
\begin{cases} | |
1 - \cos(x_1, x_2), & \text{if } y = 1 \\ | |
\max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1 | |
\end{cases} | |
Args: | |
margin (float, optional): Should be a number from :math:`-1` to :math:`1`; | |
:math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the | |
default value is :math:`0`. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input1: :math:`(N, D)` or :math:`(D)`, where `N` is the batch size and `D` is the embedding dimension. | |
- Input2: :math:`(N, D)` or :math:`(D)`, same shape as Input1. | |
- Target: :math:`(N)` or :math:`()`. | |
- Output: If :attr:`reduction` is ``'none'``, then :math:`(N)`, otherwise scalar. | |
Examples:: | |
>>> loss = nn.CosineEmbeddingLoss() | |
>>> input1 = torch.randn(3, 5, requires_grad=True) | |
>>> input2 = torch.randn(3, 5, requires_grad=True) | |
>>> target = torch.ones(3) | |
>>> output = loss(input1, input2, target) | |
>>> output.backward() | |
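>>> | |
>>> # For illustration: mixed labels and a nonzero margin, matching the two | |
>>> # cases of the loss formula above. | |
>>> loss = nn.CosineEmbeddingLoss(margin=0.2) | |
>>> target = torch.tensor([1., -1., 1.]) | |
>>> output = loss(input1, input2, target) | |
>>> output.backward() | |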
""" | |
__constants__ = ['margin', 'reduction'] | |
margin: float | |
def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.margin = margin | |
def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor: | |
return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction) | |
class MarginRankingLoss(_Loss): | |
r"""Creates a criterion that measures the loss given | |
inputs :math:`x1`, :math:`x2`, two 1D mini-batch or 0D `Tensors`, | |
and a label 1D mini-batch or 0D `Tensor` :math:`y` (containing 1 or -1). | |
If :math:`y = 1` then it is assumed that the first input should be ranked higher | |
(have a larger value) than the second input, and vice-versa for :math:`y = -1`. | |
The loss function for each pair of samples in the mini-batch is: | |
.. math:: | |
\text{loss}(x1, x2, y) = \max(0, -y * (x1 - x2) + \text{margin}) | |
Args: | |
margin (float, optional): Has a default value of :math:`0`. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input1: :math:`(N)` or :math:`()` where `N` is the batch size. | |
- Input2: :math:`(N)` or :math:`()`, same shape as the Input1. | |
- Target: :math:`(N)` or :math:`()`, same shape as the inputs. | |
- Output: scalar. If :attr:`reduction` is ``'none'`` and Input size is not :math:`()`, then :math:`(N)`. | |
Examples:: | |
>>> loss = nn.MarginRankingLoss() | |
>>> input1 = torch.randn(3, requires_grad=True) | |
>>> input2 = torch.randn(3, requires_grad=True) | |
>>> target = torch.randn(3).sign() | |
>>> output = loss(input1, input2, target) | |
>>> output.backward() | |
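>>> | |
>>> # For illustration: a positive margin requires x1 to exceed x2 by at least | |
>>> # the margin (for y = 1) before the hinge above reaches zero. | |
>>> loss = nn.MarginRankingLoss(margin=0.5) | |
>>> output = loss(input1, input2, target) | |
>>> output.backward() | |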
""" | |
__constants__ = ['margin', 'reduction'] | |
margin: float | |
def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(size_average, reduce, reduction) | |
self.margin = margin | |
def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor: | |
return F.margin_ranking_loss(input1, input2, target, margin=self.margin, reduction=self.reduction) | |
class MultiMarginLoss(_WeightedLoss): | |
r"""Creates a criterion that optimizes a multi-class classification hinge | |
loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and | |
target :math:`y` (which is a 1D tensor of target class indices, | |
:math:`0 \leq y \leq \text{x.size}(1)-1`): | |
For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar | |
target :math:`y` is: | |
.. math:: | |
\text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)} | |
where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}` | |
and :math:`i \neq y`. | |
Optionally, you can give non-equal weighting on the classes by passing | |
a 1D :attr:`weight` tensor into the constructor. | |
The loss function then becomes: | |
.. math:: | |
\text{loss}(x, y) = \frac{\sum_i w[y] * \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)} | |
Args: | |
p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2` | |
are the only supported values. | |
margin (float, optional): Has a default value of :math:`1`. | |
weight (Tensor, optional): a manual rescaling weight given to each | |
class. If given, it has to be a Tensor of size `C`. Otherwise, it is | |
treated as if having all ones. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes. | |
- Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`. | |
- Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target. | |
Examples:: | |
>>> loss = nn.MultiMarginLoss() | |
>>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]]) | |
>>> y = torch.tensor([3]) | |
>>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4))) | |
>>> loss(x, y) | |
tensor(0.32...) | |
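>>> | |
>>> # For illustration: per-class rescaling weights, as in the weighted formula above. | |
>>> weighted_loss = nn.MultiMarginLoss(weight=torch.tensor([0.2, 0.2, 0.3, 0.3])) | |
>>> output = weighted_loss(x, y) | |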
""" | |
__constants__ = ['p', 'margin', 'reduction'] | |
margin: float | |
p: int | |
def __init__(self, p: int = 1, margin: float = 1., weight: Optional[Tensor] = None, size_average=None, | |
reduce=None, reduction: str = 'mean') -> None: | |
super().__init__(weight, size_average, reduce, reduction) | |
if p != 1 and p != 2: | |
raise ValueError("only p == 1 and p == 2 supported") | |
if weight is not None and weight.dim() != 1: | |
raise ValueError( | |
f"MultiMarginLoss: expected weight to be None or 1D tensor, got {weight.dim()}D instead" | |
) | |
self.p = p | |
self.margin = margin | |
def forward(self, input: Tensor, target: Tensor) -> Tensor: | |
return F.multi_margin_loss(input, target, p=self.p, margin=self.margin, | |
weight=self.weight, reduction=self.reduction) | |
class TripletMarginLoss(_Loss): | |
r"""Creates a criterion that measures the triplet loss given an input | |
tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`. | |
This is used for measuring a relative similarity between samples. A triplet | |
is composed by `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative | |
examples` respectively). The shapes of all input tensors should be | |
:math:`(N, D)`. | |
The distance swap is described in detail in the paper `Learning shallow | |
convolutional feature descriptors with triplet losses`_ by | |
V. Balntas, E. Riba et al. | |
The loss function for each sample in the mini-batch is: | |
.. math:: | |
L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} | |
where | |
.. math:: | |
d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p | |
The norm is calculated using the specified p value and a small constant :math:`\varepsilon` is | |
added for numerical stability. | |
See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the | |
triplet margin loss for input tensors using a custom distance function. | |
Args: | |
margin (float, optional): Default: :math:`1`. | |
p (int, optional): The norm degree for pairwise distance. Default: :math:`2`. | |
eps (float, optional): Small constant for numerical stability. Default: :math:`1e-6`. | |
swap (bool, optional): The distance swap is described in detail in the paper | |
`Learning shallow convolutional feature descriptors with triplet losses` by | |
V. Balntas, E. Riba et al. Default: ``False``. | |
size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, | |
the losses are averaged over each loss element in the batch. Note that for | |
some losses, there are multiple elements per sample. If the field :attr:`size_average` | |
is set to ``False``, the losses are instead summed for each minibatch. Ignored | |
when :attr:`reduce` is ``False``. Default: ``True`` | |
reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the | |
losses are averaged or summed over observations for each minibatch depending | |
on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per | |
batch element instead and ignores :attr:`size_average`. Default: ``True`` | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average` | |
and :attr:`reduce` are in the process of being deprecated, and in the meantime, | |
specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(N, D)` or :math:`(D)` where :math:`D` is the vector dimension. | |
- Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and | |
input shape is :math:`(N, D)`; a scalar otherwise. | |
Examples:: | |
>>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7) | |
>>> anchor = torch.randn(100, 128, requires_grad=True) | |
>>> positive = torch.randn(100, 128, requires_grad=True) | |
>>> negative = torch.randn(100, 128, requires_grad=True) | |
>>> output = triplet_loss(anchor, positive, negative) | |
>>> output.backward() | |
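>>> | |
>>> # For illustration: enabling the distance swap described in the swap argument above. | |
>>> swap_loss = nn.TripletMarginLoss(margin=1.0, p=2, swap=True) | |
>>> output = swap_loss(anchor, positive, negative) | |
>>> output.backward() | |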
.. _Learning shallow convolutional feature descriptors with triplet losses: | |
http://www.bmva.org/bmvc/2016/papers/paper119/index.html | |
""" | |
__constants__ = ['margin', 'p', 'eps', 'swap', 'reduction'] | |
margin: float | |
p: float | |
eps: float | |
swap: bool | |
def __init__(self, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None, | |
reduce=None, reduction: str = 'mean'): | |
super().__init__(size_average, reduce, reduction) | |
self.margin = margin | |
self.p = p | |
self.eps = eps | |
self.swap = swap | |
def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: | |
return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p, | |
eps=self.eps, swap=self.swap, reduction=self.reduction) | |
class TripletMarginWithDistanceLoss(_Loss): | |
r"""Creates a criterion that measures the triplet loss given input | |
tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor, | |
positive, and negative examples, respectively), and a nonnegative, | |
real-valued function ("distance function") used to compute the relationship | |
between the anchor and positive example ("positive distance") and the | |
anchor and negative example ("negative distance"). | |
The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``) | |
can be described as: | |
.. math:: | |
\ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad | |
l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} | |
where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function | |
quantifying the closeness of two tensors, referred to as the :attr:`distance_function`; | |
and :math:`margin` is a nonnegative margin representing the minimum difference | |
between the positive and negative distances that is required for the loss to | |
be 0. The input tensors have :math:`N` elements each and can be of any shape | |
that the distance function can handle. | |
If :attr:`reduction` is not ``'none'`` | |
(default ``'mean'``), then: | |
.. math:: | |
\ell(x, y) = | |
\begin{cases} | |
\operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ | |
\operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} | |
\end{cases} | |
See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet | |
loss for input tensors using the :math:`l_p` distance as the distance function. | |
Args: | |
distance_function (Callable, optional): A nonnegative, real-valued function that | |
quantifies the closeness of two tensors. If not specified, | |
`nn.PairwiseDistance` will be used. Default: ``None`` | |
margin (float, optional): A nonnegative margin representing the minimum difference | |
between the positive and negative distances required for the loss to be 0. Larger | |
margins penalize cases where the negative examples are not distant enough from the | |
anchors, relative to the positives. Default: :math:`1`. | |
swap (bool, optional): Whether to use the distance swap described in the paper | |
`Learning shallow convolutional feature descriptors with triplet losses` by | |
V. Balntas, E. Riba et al. If True, and if the positive example is closer to the | |
negative example than the anchor is, swaps the positive example and the anchor in | |
the loss computation. Default: ``False``. | |
reduction (str, optional): Specifies the (optional) reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the sum of the output will be divided by the number of | |
elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'`` | |
Shape: | |
- Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions | |
as supported by the distance function. | |
- Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar | |
otherwise. | |
Examples:: | |
>>> # Initialize embeddings | |
>>> embedding = nn.Embedding(1000, 128) | |
>>> anchor_ids = torch.randint(0, 1000, (1,)) | |
>>> positive_ids = torch.randint(0, 1000, (1,)) | |
>>> negative_ids = torch.randint(0, 1000, (1,)) | |
>>> anchor = embedding(anchor_ids) | |
>>> positive = embedding(positive_ids) | |
>>> negative = embedding(negative_ids) | |
>>> | |
>>> # Built-in Distance Function | |
>>> triplet_loss = \ | |
...     nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance()) | |
>>> output = triplet_loss(anchor, positive, negative) | |
>>> output.backward() | |
>>> | |
>>> # Custom Distance Function | |
>>> def l_infinity(x1, x2): | |
...     return torch.max(torch.abs(x1 - x2), dim=1).values | |
>>> | |
>>> # xdoctest: +SKIP("FIXME: Would call backwards a second time") | |
>>> triplet_loss = ( | |
...     nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5)) | |
>>> output = triplet_loss(anchor, positive, negative) | |
>>> output.backward() | |
>>> | |
>>> # Custom Distance Function (Lambda) | |
>>> triplet_loss = ( | |
...     nn.TripletMarginWithDistanceLoss( | |
...         distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))) | |
>>> output = triplet_loss(anchor, positive, negative) | |
>>> output.backward() | |
Reference: | |
V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses: | |
http://www.bmva.org/bmvc/2016/papers/paper119/index.html | |
""" | |
__constants__ = ['margin', 'swap', 'reduction'] | |
margin: float | |
swap: bool | |
def __init__(self, *, distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None, | |
margin: float = 1.0, swap: bool = False, reduction: str = 'mean'): | |
super().__init__(size_average=None, reduce=None, reduction=reduction) | |
self.distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = \ | |
distance_function if distance_function is not None else PairwiseDistance() | |
self.margin = margin | |
self.swap = swap | |
def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: | |
return F.triplet_margin_with_distance_loss(anchor, positive, negative, | |
distance_function=self.distance_function, | |
margin=self.margin, swap=self.swap, reduction=self.reduction) | |
class CTCLoss(_Loss): | |
r"""The Connectionist Temporal Classification loss. | |
Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the | |
probability of possible alignments of input to target, producing a loss value which is differentiable | |
with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which | |
limits the length of the target sequence such that it must be :math:`\leq` the input length. | |
Args: | |
blank (int, optional): blank label. Default :math:`0`. | |
reduction (str, optional): Specifies the reduction to apply to the output: | |
``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, | |
``'mean'``: the output losses will be divided by the target lengths and | |
then the mean over the batch is taken, ``'sum'``: the output losses will be summed. | |
Default: ``'mean'`` | |
zero_infinity (bool, optional): | |
Whether to zero infinite losses and the associated gradients. | |
Default: ``False`` | |
Infinite losses mainly occur when the inputs are too short | |
to be aligned to the targets. | |
Shape: | |
- Log_probs: Tensor of size :math:`(T, N, C)` or :math:`(T, C)`, | |
where :math:`T = \text{input length}`, | |
:math:`N = \text{batch size}`, and | |
:math:`C = \text{number of classes (including blank)}`. | |
The logarithmized probabilities of the outputs (e.g. obtained with | |
:func:`torch.nn.functional.log_softmax`). | |
- Targets: Tensor of size :math:`(N, S)` or | |
:math:`(\operatorname{sum}(\text{target\_lengths}))`, | |
where :math:`N = \text{batch size}` and | |
:math:`S = \text{max target length, if shape is } (N, S)`. | |
It represents the target sequences. Each element in the target | |
sequence is a class index; a target index cannot be the blank index (default=0). | |
In the :math:`(N, S)` form, targets are padded to the | |
length of the longest sequence, and stacked. | |
In the :math:`(\operatorname{sum}(\text{target\_lengths}))` form, | |
the targets are assumed to be un-padded and | |
concatenated within 1 dimension. | |
- Input_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`, | |
where :math:`N = \text{batch size}`. It represents the lengths of the | |
inputs (each must be :math:`\leq T`). The lengths are specified | |
for each sequence to achieve masking under the assumption that sequences | |
are padded to equal lengths. | |
- Target_lengths: Tuple or tensor of size :math:`(N)` or :math:`()`, | |
where :math:`N = \text{batch size}`. It represents the lengths of the targets. | |
Lengths are specified for each sequence to achieve masking under the | |
assumption that sequences are padded to equal lengths. If target shape is | |
:math:`(N,S)`, target_lengths are effectively the stop index | |
:math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for | |
each target in a batch. Lengths must each be :math:`\leq S`. | |
If the targets are given as a 1d tensor that is the concatenation of individual | |
targets, the target_lengths must add up to the total length of the tensor. | |
- Output: scalar if :attr:`reduction` is ``'mean'`` (default) or | |
``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N)` if input is batched or | |
:math:`()` if input is unbatched, where :math:`N = \text{batch size}`. | |
Examples:: | |
>>> # Targets are to be padded | |
>>> T = 50 # Input sequence length | |
>>> C = 20 # Number of classes (including blank) | |
>>> N = 16 # Batch size | |
>>> S = 30 # Target sequence length of longest target in batch (padding length) | |
>>> S_min = 10 # Minimum target length, for demonstration purposes | |
>>> | |
>>> # Initialize random batch of input vectors, for *size = (T,N,C) | |
>>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() | |
>>> | |
>>> # Initialize random batch of targets (0 = blank, 1:C = classes) | |
>>> target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long) | |
>>> | |
>>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) | |
>>> target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long) | |
>>> ctc_loss = nn.CTCLoss() | |
>>> loss = ctc_loss(input, target, input_lengths, target_lengths) | |
>>> loss.backward() | |
>>> | |
>>> | |
>>> # Targets are to be un-padded | |
>>> T = 50 # Input sequence length | |
>>> C = 20 # Number of classes (including blank) | |
>>> N = 16 # Batch size | |
>>> | |
>>> # Initialize random batch of input vectors, for *size = (T,N,C) | |
>>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() | |
>>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) | |
>>> | |
>>> # Initialize random batch of targets (0 = blank, 1:C = classes) | |
>>> target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) | |
>>> target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.long) | |
>>> ctc_loss = nn.CTCLoss() | |
>>> loss = ctc_loss(input, target, input_lengths, target_lengths) | |
>>> loss.backward() | |
>>> | |
>>> | |
>>> # Targets are to be un-padded and unbatched (effectively N=1) | |
>>> T = 50 # Input sequence length | |
>>> C = 20 # Number of classes (including blank) | |
>>> | |
>>> # Initialize random batch of input vectors, for *size = (T,C) | |
>>> # xdoctest: +SKIP("FIXME: error in doctest") | |
>>> input = torch.randn(T, C).log_softmax(1).detach().requires_grad_() | |
>>> input_lengths = torch.tensor(T, dtype=torch.long) | |
>>> | |
>>> # Initialize random batch of targets (0 = blank, 1:C = classes) | |
>>> target_lengths = torch.randint(low=1, high=T, size=(), dtype=torch.long) | |
>>> target = torch.randint(low=1, high=C, size=(target_lengths,), dtype=torch.long) | |
>>> ctc_loss = nn.CTCLoss() | |
>>> loss = ctc_loss(input, target, input_lengths, target_lengths) | |
>>> loss.backward() | |
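>>> | |
>>> # For illustration: zero_infinity=True zeroes infinite losses (and their | |
>>> # gradients) that can occur when an input is too short for its target. | |
>>> ctc_loss = nn.CTCLoss(zero_infinity=True) | |
>>> loss = ctc_loss(input, target, input_lengths, target_lengths) | |
>>> loss.backward() | |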
Reference: | |
A. Graves et al.: Connectionist Temporal Classification: | |
Labelling Unsegmented Sequence Data with Recurrent Neural Networks: | |
https://www.cs.toronto.edu/~graves/icml_2006.pdf | |
Note: | |
In order to use CuDNN, the following must be satisfied: :attr:`targets` must be | |
in concatenated format, all :attr:`input_lengths` must be `T`, :math:`blank=0`, | |
:attr:`target_lengths` :math:`\leq 256`, and the integer arguments must be of | |
dtype :attr:`torch.int32`. | |
The regular implementation uses the (more common in PyTorch) `torch.long` dtype. | |
Note: | |
In some circumstances when using the CUDA backend with CuDNN, this operator | |
may select a nondeterministic algorithm to increase performance. If this is | |
undesirable, you can try to make the operation deterministic (potentially at | |
a performance cost) by setting ``torch.backends.cudnn.deterministic = | |
True``. | |
Please see the notes on :doc:`/notes/randomness` for background. | |
""" | |
__constants__ = ['blank', 'reduction'] | |
blank: int | |
zero_infinity: bool | |
def __init__(self, blank: int = 0, reduction: str = 'mean', zero_infinity: bool = False): | |
super().__init__(reduction=reduction) | |
self.blank = blank | |
self.zero_infinity = zero_infinity | |
def forward(self, log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor) -> Tensor: | |
return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction, | |
self.zero_infinity) | |
# TODO: L1HingeEmbeddingCriterion | |
# TODO: MSECriterion weight | |
# TODO: ClassSimplexCriterion | |