""" RetinaNet / EfficientDet Anchor Gen Adapted for PyTorch from Tensorflow impl at https://github.com/google/automl/blob/6f6694cec1a48cdb33d5d1551a2d5db8ad227798/efficientdet/anchors.py Hacked together by Ross Wightman, original copyright below """ # Copyright 2020 Google Research. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Anchor definition. This module is borrowed from TPU RetinaNet implementation: https://github.com/tensorflow/tpu/blob/master/models/official/retinanet/anchors.py """ from typing import Optional, Tuple, Sequence import numpy as np import torch import torch.nn as nn #import torchvision.ops.boxes as tvb from torchvision.ops.boxes import batched_nms, remove_small_boxes from typing import List from effdet.object_detection import ArgMaxMatcher, FasterRcnnBoxCoder, BoxList, IouSimilarity, TargetAssigner from .soft_nms import batched_soft_nms # The minimum score to consider a logit for identifying detections. MIN_CLASS_SCORE = -5.0 # The score for a dummy detection _DUMMY_DETECTION_SCORE = -1e5 # The maximum number of (anchor,class) pairs to keep for non-max suppression. MAX_DETECTION_POINTS = 5000 # The maximum number of detections per image. MAX_DETECTIONS_PER_IMAGE = 100 def decode_box_outputs(rel_codes, anchors, output_xyxy: bool=False): """Transforms relative regression coordinates to absolute positions. Network predictions are normalized and relative to a given anchor; this reverses the transformation and outputs absolute coordinates for the input image. Args: rel_codes: box regression targets. anchors: anchors on all feature levels. Returns: outputs: bounding boxes. """ ycenter_a = (anchors[:, 0] + anchors[:, 2]) / 2 xcenter_a = (anchors[:, 1] + anchors[:, 3]) / 2 ha = anchors[:, 2] - anchors[:, 0] wa = anchors[:, 3] - anchors[:, 1] ty, tx, th, tw = rel_codes.unbind(dim=1) w = torch.exp(tw) * wa h = torch.exp(th) * ha ycenter = ty * ha + ycenter_a xcenter = tx * wa + xcenter_a ymin = ycenter - h / 2. xmin = xcenter - w / 2. ymax = ycenter + h / 2. xmax = xcenter + w / 2. if output_xyxy: out = torch.stack([xmin, ymin, xmax, ymax], dim=1) else: out = torch.stack([ymin, xmin, ymax, xmax], dim=1) return out def clip_boxes_xyxy(boxes: torch.Tensor, size: torch.Tensor): boxes = boxes.clamp(min=0) size = torch.cat([size, size], dim=0) boxes = boxes.min(size) return boxes def generate_detections( cls_outputs, box_outputs, anchor_boxes, indices, classes, img_scale: Optional[torch.Tensor], img_size: Optional[torch.Tensor], max_det_per_image: int = MAX_DETECTIONS_PER_IMAGE, soft_nms: bool = False): """Generates detections with RetinaNet model outputs and anchors. Args: cls_outputs: a torch tensor with shape [N, 1], which has the highest class scores on all feature levels. The N is the number of selected top-K total anchors on all levels. (k being MAX_DETECTION_POINTS) box_outputs: a torch tensor with shape [N, 4], which stacks box regression outputs on all feature levels. 


def generate_detections(
        cls_outputs, box_outputs, anchor_boxes, indices, classes,
        img_scale: Optional[torch.Tensor], img_size: Optional[torch.Tensor],
        max_det_per_image: int = MAX_DETECTIONS_PER_IMAGE, soft_nms: bool = False):
    """Generates detections with RetinaNet model outputs and anchors.

    Args:
        cls_outputs: a torch tensor with shape [N, 1], which has the highest class
            scores on all feature levels. N is the number of selected top-k anchors
            over all levels (k being MAX_DETECTION_POINTS).

        box_outputs: a torch tensor with shape [N, 4], which stacks box regression
            outputs on all feature levels. N is the number of selected top-k anchors
            over all levels (k being MAX_DETECTION_POINTS).

        anchor_boxes: a torch tensor with shape [N, 4], which stacks anchors on all
            feature levels. N is the number of selected top-k anchors over all levels.

        indices: a torch tensor with shape [N], which is the indices from top-k selection.

        classes: a torch tensor with shape [N], which represents the class prediction
            on all selected anchors from top-k selection.

        img_scale: a float tensor representing the scale between original image and
            input image for the detector. It is used to rescale detections for
            evaluating with the original groundtruth annotations.

        img_size: a tensor representing the image size; boxes are clipped to
            img_size / img_scale before NMS.

        max_det_per_image: an int constant, added as argument to make torchscript happy.

        soft_nms: if True, apply class-batched Soft-NMS (gaussian method) instead of hard NMS.

    Returns:
        detections: detection results in a tensor with shape [max_det_per_image, 6],
            each row representing [x_min, y_min, x_max, y_max, score, class]
    """
    assert box_outputs.shape[-1] == 4
    assert anchor_boxes.shape[-1] == 4
    assert cls_outputs.shape[-1] == 1

    anchor_boxes = anchor_boxes[indices, :]

    # Apply bounding box regression to anchors, boxes are converted to xyxy
    # here since PyTorch NMS expects them in that form.
    boxes = decode_box_outputs(box_outputs.float(), anchor_boxes, output_xyxy=True)
    if img_scale is not None and img_size is not None:
        boxes = clip_boxes_xyxy(boxes, img_size / img_scale)  # clip before NMS better?

    scores = cls_outputs.sigmoid().squeeze(1).float()
    if soft_nms:
        top_detection_idx, soft_scores = batched_soft_nms(
            boxes, scores, classes, method_gaussian=True, iou_threshold=0.3, score_threshold=.001)
        scores[top_detection_idx] = soft_scores
    else:
        top_detection_idx = batched_nms(boxes, scores, classes, iou_threshold=0.5)

    # keep only the top max_det_per_image scoring predictions
    top_detection_idx = top_detection_idx[:max_det_per_image]
    boxes = boxes[top_detection_idx]
    scores = scores[top_detection_idx, None]
    classes = classes[top_detection_idx, None] + 1  # back to class idx with background class = 0

    if img_scale is not None:
        boxes = boxes * img_scale

    # FIXME add option to convert boxes back to yxyx? Otherwise must be handled downstream if
    # that is the preferred output format.

    # stack em and pad out to max_det_per_image if necessary
    num_det = len(top_detection_idx)
    detections = torch.cat([boxes, scores, classes.float()], dim=1)
    if num_det < max_det_per_image:
        detections = torch.cat([
            detections,
            torch.zeros((max_det_per_image - num_det, 6), device=detections.device, dtype=detections.dtype)
        ], dim=0)
    return detections


def get_feat_sizes(image_size: Tuple[int, int], max_level: int):
    """Get feat widths and heights for all levels.

    Args:
        image_size: a tuple (H, W)

        max_level: maximum feature level.

    Returns:
        feat_sizes: a list of tuples (height, width) for each level.
    """
    feat_size = image_size
    feat_sizes = [feat_size]
    for _ in range(1, max_level + 1):
        feat_size = ((feat_size[0] - 1) // 2 + 1, (feat_size[1] - 1) // 2 + 1)
        feat_sizes.append(feat_size)
    return feat_sizes
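

# get_feat_sizes() simply halves the spatial size (with ceiling rounding) once per level,
# so level i has stride 2**i relative to the input. A quick sanity sketch, assuming a
# 512x512 input and max_level=7; the `_demo_feat_sizes` helper is illustrative only and
# not part of the original module.
def _demo_feat_sizes():
    sizes = get_feat_sizes((512, 512), max_level=7)
    # index 0 is the input itself, index i is the level-i feature map
    assert sizes[0] == (512, 512)
    assert sizes[3] == (64, 64)  # stride 8
    assert sizes[7] == (4, 4)    # stride 128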


class Anchors(nn.Module):
    """RetinaNet Anchors class."""

    def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size: Tuple[int, int]):
        """Constructs multiscale RetinaNet anchors.

        Args:
            min_level: integer number of minimum level of the output feature pyramid.

            max_level: integer number of maximum level of the output feature pyramid.

            num_scales: integer number representing intermediate scales added on each level.
                For instance, num_scales=2 adds anchor scales [2^0, 2^0.5] on each level.

            aspect_ratios: list of tuples representing the aspect ratios of anchors added
                on each level. For instance, aspect_ratios = [(1, 1), (1.4, 0.7), (0.7, 1.4)]
                adds three anchors on each level.

            anchor_scale: float number representing the scale of the base anchor size
                relative to the feature stride 2^level. Can also be a sequence with one
                scale per level.

            image_size: Sequence specifying input image size of model (H, W).
                The image_size should be divisible by the largest feature stride 2^max_level.
        """
        super(Anchors, self).__init__()
        self.min_level = min_level
        self.max_level = max_level
        self.num_scales = num_scales
        self.aspect_ratios = aspect_ratios
        if isinstance(anchor_scale, Sequence):
            assert len(anchor_scale) == max_level - min_level + 1
            self.anchor_scales = anchor_scale
        else:
            self.anchor_scales = [anchor_scale] * (max_level - min_level + 1)

        assert isinstance(image_size, Sequence) and len(image_size) == 2
        # FIXME this restriction can likely be relaxed with some additional changes
        assert image_size[0] % 2 ** max_level == 0, \
            f'Image height must be divisible by 2 ** max_level ({2 ** max_level})'
        assert image_size[1] % 2 ** max_level == 0, \
            f'Image width must be divisible by 2 ** max_level ({2 ** max_level})'
        self.image_size = tuple(image_size)
        self.feat_sizes = get_feat_sizes(image_size, max_level)
        self.config = self._generate_configs()
        self.register_buffer('boxes', self._generate_boxes())

    @classmethod
    def from_config(cls, config):
        return cls(
            config.min_level, config.max_level,
            config.num_scales, config.aspect_ratios, config.anchor_scale, config.image_size)

    def _generate_configs(self):
        """Generate configurations of anchor boxes."""
        anchor_configs = {}
        feat_sizes = self.feat_sizes
        for level in range(self.min_level, self.max_level + 1):
            anchor_configs[level] = []
            for scale_octave in range(self.num_scales):
                for aspect in self.aspect_ratios:
                    anchor_configs[level].append(
                        ((feat_sizes[0][0] // feat_sizes[level][0],
                          feat_sizes[0][1] // feat_sizes[level][1]),
                         scale_octave / float(self.num_scales), aspect,
                         self.anchor_scales[level - self.min_level]))
        return anchor_configs

    def _generate_boxes(self):
        """Generates multiscale anchor boxes."""
        boxes_all = []
        for _, configs in self.config.items():
            boxes_level = []
            for config in configs:
                stride, octave_scale, aspect, anchor_scale = config
                base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale
                base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale
                if isinstance(aspect, Sequence):
                    aspect_x = aspect[0]
                    aspect_y = aspect[1]
                else:
                    aspect_x = np.sqrt(aspect)
                    aspect_y = 1.0 / aspect_x
                anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0
                anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0

                x = np.arange(stride[1] / 2, self.image_size[1], stride[1])
                y = np.arange(stride[0] / 2, self.image_size[0], stride[0])
                xv, yv = np.meshgrid(x, y)
                xv = xv.reshape(-1)
                yv = yv.reshape(-1)

                boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
                                   yv + anchor_size_y_2, xv + anchor_size_x_2))
                boxes = np.swapaxes(boxes, 0, 1)
                boxes_level.append(np.expand_dims(boxes, axis=1))

            # concat anchors on the same level to shape NxAx4
            boxes_level = np.concatenate(boxes_level, axis=1)
            boxes_all.append(boxes_level.reshape([-1, 4]))

        anchor_boxes = np.vstack(boxes_all)
        anchor_boxes = torch.from_numpy(anchor_boxes).float()
        return anchor_boxes

    def get_anchors_per_location(self):
        return self.num_scales * len(self.aspect_ratios)
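

# Anchor layout sketch. For a 512x512 input with levels 3..7, the feature maps are
# 64, 32, 16, 8 and 4 cells on a side, and with num_scales * len(aspect_ratios) anchors
# per cell the flattened `boxes` buffer has
# 9 * (64*64 + 32*32 + 16*16 + 8*8 + 4*4) = 49104 rows in [ymin, xmin, ymax, xmax] order.
# The `_demo_anchors` helper below only illustrates that count and is not part of the
# original module; anchor_scale=4.0 is just a typical value.
def _demo_anchors():
    anchors = Anchors(
        min_level=3, max_level=7, num_scales=3,
        aspect_ratios=[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)],
        anchor_scale=4.0, image_size=(512, 512))
    assert anchors.get_anchors_per_location() == 9
    assert anchors.boxes.shape == (49104, 4)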


class AnchorLabeler(object):
    """Labeler for multiscale anchor boxes.
    """

    def __init__(self, anchors, num_classes: int, match_threshold: float = 0.5):
        """Constructs anchor labeler to assign labels to anchors.

        Args:
            anchors: an instance of class Anchors.

            num_classes: integer number representing number of classes in the dataset.

            match_threshold: float number between 0 and 1 representing the threshold
                to assign positive labels for anchors.
        """
        similarity_calc = IouSimilarity()
        matcher = ArgMaxMatcher(
            match_threshold,
            unmatched_threshold=match_threshold,
            negatives_lower_than_unmatched=True,
            force_match_for_each_row=True)
        box_coder = FasterRcnnBoxCoder()

        self.target_assigner = TargetAssigner(similarity_calc, matcher, box_coder)
        self.anchors = anchors
        self.match_threshold = match_threshold
        self.num_classes = num_classes
        self.indices_cache = {}

    def label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
        """Labels anchors with ground truth inputs.

        Args:
            gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
                For each row, it stores [y0, x0, y1, x1] for four corners of a box.

            gt_classes: An integer tensor with shape [N, 1] representing groundtruth classes.

            filter_valid: Filter out any boxes w/ gt class <= -1 before assigning.

        Returns:
            cls_targets_out: a list with one entry per level [min_level, ..., max_level].
                Each entry is a tensor with shape [height_l, width_l, num_anchors], where
                height_l and width_l are the dimensions of the class logits at the l-th level.

            box_targets_out: a list with one entry per level [min_level, ..., max_level].
                Each entry is a tensor with shape [height_l, width_l, num_anchors * 4], where
                height_l and width_l are the dimensions of the box regression output at the
                l-th level.

            num_positives: scalar tensor storing number of positives in an image.
        """
        cls_targets_out = []
        box_targets_out = []

        if filter_valid:
            valid_idx = gt_classes > -1  # filter gt targets w/ label <= -1
            gt_boxes = gt_boxes[valid_idx]
            gt_classes = gt_classes[valid_idx]

        cls_targets, box_targets, matches = self.target_assigner.assign(
            BoxList(self.anchors.boxes), BoxList(gt_boxes), gt_classes)

        # class labels start from 1 and the background class = -1
        cls_targets = (cls_targets - 1).long()

        # Unpack the flat per-anchor cls/box targets into one tensor per feature level.
        count = 0
        for level in range(self.anchors.min_level, self.anchors.max_level + 1):
            feat_size = self.anchors.feat_sizes[level]
            steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
            cls_targets_out.append(cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            box_targets_out.append(box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            count += steps

        num_positives = (matches.match_results > -1).float().sum()

        return cls_targets_out, box_targets_out, num_positives

    def batch_label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
        batch_size = len(gt_boxes)
        assert batch_size == len(gt_classes)
        num_levels = self.anchors.max_level - self.anchors.min_level + 1
        cls_targets_out = [[] for _ in range(num_levels)]
        box_targets_out = [[] for _ in range(num_levels)]
        num_positives_out = []

        anchor_box_list = BoxList(self.anchors.boxes)
        for i in range(batch_size):
            last_sample = i == batch_size - 1

            if filter_valid:
                valid_idx = gt_classes[i] > -1  # filter gt targets w/ label <= -1
                gt_box_list = BoxList(gt_boxes[i][valid_idx])
                gt_class_i = gt_classes[i][valid_idx]
            else:
                gt_box_list = BoxList(gt_boxes[i])
                gt_class_i = gt_classes[i]
            cls_targets, box_targets, matches = self.target_assigner.assign(
                anchor_box_list, gt_box_list, gt_class_i)

            # class labels start from 1 and the background class = -1
            cls_targets = (cls_targets - 1).long()

            # Unpack the flat per-anchor cls/box targets into one tensor per feature level.
"""Unpacks an array of cls/box into multiple scales.""" count = 0 for level in range(self.anchors.min_level, self.anchors.max_level + 1): level_idx = level - self.anchors.min_level feat_size = self.anchors.feat_sizes[level] steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location() cls_targets_out[level_idx].append( cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1])) box_targets_out[level_idx].append( box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1])) count += steps if last_sample: cls_targets_out[level_idx] = torch.stack(cls_targets_out[level_idx]) box_targets_out[level_idx] = torch.stack(box_targets_out[level_idx]) num_positives_out.append((matches.match_results > -1).float().sum()) if last_sample: num_positives_out = torch.stack(num_positives_out) return cls_targets_out, box_targets_out, num_positives_out