# Copyright 2020 Google Research. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base box coder.

Box coders convert between coordinate frames, namely image-centric
(with (0, 0) at the top left of the image) and anchor-centric (with (0, 0)
defined by a specific anchor).

Users of a BoxCoder can call two methods:
  encode: encodes a box with respect to a given anchor (or rather, a tensor
    of boxes with respect to a corresponding tensor of anchors).
  decode: inverts this encoding with a decode operation.

In both cases, the arguments are assumed to be in 1-1 correspondence already;
it is not the job of a BoxCoder to perform matching.
"""
import torch
from typing import List, Optional

from .box_list import BoxList

# Box coder types.
FASTER_RCNN = 'faster_rcnn'
KEYPOINT = 'keypoint'
MEAN_STDDEV = 'mean_stddev'
SQUARE = 'square'

EPS = 1e-8


#@torch.jit.script
class FasterRcnnBoxCoder(object):
    """Faster RCNN box coder.

    Faster RCNN box coder follows the coding schema described below:
        ty = (y - ya) / ha
        tx = (x - xa) / wa
        th = log(h / ha)
        tw = log(w / wa)
    where x, y, w, h denote the box's center coordinates, width and height
    respectively. Similarly, xa, ya, wa, ha denote the anchor's center
    coordinates, width and height. tx, ty, tw and th denote the
    anchor-encoded center, width and height respectively.

    See http://arxiv.org/abs/1506.01497 for details.
    """

    def __init__(self, scale_factors: Optional[List[float]] = None, eps: float = EPS):
        """Constructor for FasterRcnnBoxCoder.

        Args:
            scale_factors: List of 4 positive scalars to scale ty, tx, th and tw.
                If set to None, does not perform scaling. For Faster RCNN, the
                open-source implementation recommends using [10.0, 10.0, 5.0, 5.0].
            eps: Small value added to anchor and box sizes in encode() to avoid
                NaN from division by zero and log of zero.
        """
        self._scale_factors = scale_factors
        if scale_factors is not None:
            assert len(scale_factors) == 4
            for scalar in scale_factors:
                assert scalar > 0
        self.eps = eps

    #@property
    def code_size(self):
        return 4

    def encode(self, boxes: BoxList, anchors: BoxList):
        """Encode a box collection with respect to anchor collection.

        Args:
            boxes: BoxList holding N boxes to be encoded.
            anchors: BoxList of anchors.

        Returns:
            a tensor representing N anchor-encoded boxes of the format
            [ty, tx, th, tw].
        """
        # Convert anchors to the center coordinate representation.
        ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
        ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes()
        # Avoid NaN in division and log below.
        ha += self.eps
        wa += self.eps
        h += self.eps
        w += self.eps

        tx = (xcenter - xcenter_a) / wa
        ty = (ycenter - ycenter_a) / ha
        tw = torch.log(w / wa)
        th = torch.log(h / ha)
        # Scale location targets as used in the paper for joint training.
        if self._scale_factors is not None:
            ty *= self._scale_factors[0]
            tx *= self._scale_factors[1]
            th *= self._scale_factors[2]
            tw *= self._scale_factors[3]
        return torch.stack([ty, tx, th, tw]).t()

    def decode(self, rel_codes, anchors: BoxList):
        """Decode relative codes to boxes.

        Args:
            rel_codes: a tensor representing N anchor-encoded boxes.
            anchors: BoxList of anchors.

        Returns:
            boxes: BoxList holding N bounding boxes.
        """
        ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()

        ty, tx, th, tw = rel_codes.t().unbind()
        if self._scale_factors is not None:
            # Divide out-of-place: unbind() returns views of rel_codes, so
            # in-place division here would silently modify the caller's tensor.
            ty = ty / self._scale_factors[0]
            tx = tx / self._scale_factors[1]
            th = th / self._scale_factors[2]
            tw = tw / self._scale_factors[3]
        w = torch.exp(tw) * wa
        h = torch.exp(th) * ha
        ycenter = ty * ha + ycenter_a
        xcenter = tx * wa + xcenter_a
        ymin = ycenter - h / 2.
        xmin = xcenter - w / 2.
        ymax = ycenter + h / 2.
        xmax = xcenter + w / 2.
        return BoxList(torch.stack([ymin, xmin, ymax, xmax]).t())
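
# Illustrative sketch (not part of the original module): a minimal round trip
# through encode()/decode(). It assumes BoxList wraps an [N, 4] float tensor of
# [ymin, xmin, ymax, xmax] corners and exposes a `.boxes` attribute, matching
# how decode() and batch_decode() in this file use it.
def _demo_encode_decode():
    coder = FasterRcnnBoxCoder(scale_factors=[10.0, 10.0, 5.0, 5.0])
    boxes = BoxList(torch.tensor([[10.0, 10.0, 20.0, 30.0]]))
    anchors = BoxList(torch.tensor([[15.0, 12.0, 30.0, 18.0]]))
    rel_codes = coder.encode(boxes, anchors)    # shape [1, 4]: [ty, tx, th, tw]
    decoded = coder.decode(rel_codes, anchors)  # recovers `boxes` up to eps
    assert torch.allclose(decoded.boxes, boxes.boxes, atol=1e-4)
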
def batch_decode(encoded_boxes, box_coder: FasterRcnnBoxCoder, anchors: BoxList):
    """Decode a batch of encoded boxes.

    This op takes a batch of encoded bounding boxes and transforms
    them to a batch of bounding boxes specified by their corners in
    the order of [y_min, x_min, y_max, x_max].

    Args:
        encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
            code_size] representing the location of the objects.
        box_coder: a BoxCoder object.
        anchors: a BoxList of anchors used to encode `encoded_boxes`.

    Returns:
        decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
            code_size] representing the corners of the objects in the order
            of [y_min, x_min, y_max, x_max].

    Raises:
        ValueError: if the number of anchors inferred from encoded_boxes and
            anchors is inconsistent.
    """
    assert len(encoded_boxes.shape) == 3
    if encoded_boxes.shape[1] != anchors.num_boxes():
        raise ValueError(
            'The number of anchors inferred from encoded_boxes and anchors is '
            'inconsistent: shape[1] of encoded_boxes %s should be equal to the '
            'number of anchors: %s.' % (encoded_boxes.shape[1], anchors.num_boxes()))

    decoded_boxes = torch.stack([
        box_coder.decode(boxes, anchors).boxes for boxes in encoded_boxes.unbind()
    ])
    return decoded_boxes
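
# Illustrative sketch (not part of the original module): batch_decode() applied
# to all-zero codes, which should decode every anchor back to itself since
# ty = tx = th = tw = 0 leaves the anchor's center and size unchanged.
def _demo_batch_decode():
    coder = FasterRcnnBoxCoder()
    # Two anchors in [ymin, xmin, ymax, xmax] corner order.
    anchors = BoxList(torch.tensor([[0.0, 0.0, 10.0, 10.0],
                                    [5.0, 5.0, 20.0, 20.0]]))
    # A batch of 3 images, one [ty, tx, th, tw] code per anchor.
    encoded = torch.zeros(3, anchors.num_boxes(), 4)
    decoded = batch_decode(encoded, coder, anchors)  # shape [3, 2, 4]
    assert torch.allclose(decoded[0], anchors.boxes)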