""" RetinaNet / EfficientDet Anchor Gen | |
Adapted for PyTorch from Tensorflow impl at | |
https://github.com/google/automl/blob/6f6694cec1a48cdb33d5d1551a2d5db8ad227798/efficientdet/anchors.py | |
Hacked together by Ross Wightman, original copyright below | |
""" | |
# Copyright 2020 Google Research. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Anchor definition. | |
This module is borrowed from TPU RetinaNet implementation: | |
https://github.com/tensorflow/tpu/blob/master/models/official/retinanet/anchors.py | |
""" | |
from typing import List, Optional, Sequence, Tuple

import numpy as np
import torch
import torch.nn as nn
#import torchvision.ops.boxes as tvb
from torchvision.ops.boxes import batched_nms, remove_small_boxes

from effdet.object_detection import ArgMaxMatcher, FasterRcnnBoxCoder, BoxList, IouSimilarity, TargetAssigner

from .soft_nms import batched_soft_nms

# The minimum score to consider a logit for identifying detections.
MIN_CLASS_SCORE = -5.0

# The score for a dummy detection
_DUMMY_DETECTION_SCORE = -1e5

# The maximum number of (anchor, class) pairs to keep for non-max suppression.
MAX_DETECTION_POINTS = 5000

# The maximum number of detections per image.
MAX_DETECTIONS_PER_IMAGE = 100


def decode_box_outputs(rel_codes, anchors, output_xyxy: bool = False):
    """Transforms relative regression coordinates to absolute positions.

    Network predictions are normalized and relative to a given anchor; this
    reverses the transformation and outputs absolute coordinates for the input image.

    Args:
        rel_codes: box regression targets.
        anchors: anchors on all feature levels.
        output_xyxy: if True, return boxes as [x_min, y_min, x_max, y_max] instead of
            the default [y_min, x_min, y_max, x_max].

    Returns:
        outputs: bounding boxes.
    """
    ycenter_a = (anchors[:, 0] + anchors[:, 2]) / 2
    xcenter_a = (anchors[:, 1] + anchors[:, 3]) / 2
    ha = anchors[:, 2] - anchors[:, 0]
    wa = anchors[:, 3] - anchors[:, 1]
    ty, tx, th, tw = rel_codes.unbind(dim=1)

    w = torch.exp(tw) * wa
    h = torch.exp(th) * ha
    ycenter = ty * ha + ycenter_a
    xcenter = tx * wa + xcenter_a
    ymin = ycenter - h / 2.
    xmin = xcenter - w / 2.
    ymax = ycenter + h / 2.
    xmax = xcenter + w / 2.
    if output_xyxy:
        out = torch.stack([xmin, ymin, xmax, ymax], dim=1)
    else:
        out = torch.stack([ymin, xmin, ymax, xmax], dim=1)
    return out

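
# Illustrative sketch (not part of the original module): decoding a single anchor by hand.
# The anchor value and regression codes below are made up purely for illustration.
#
#   anchors = torch.tensor([[0., 0., 100., 100.]])       # [y_min, x_min, y_max, x_max]
#   rel_codes = torch.tensor([[0.1, 0.1, 0.2, 0.2]])     # [ty, tx, th, tw]
#   boxes = decode_box_outputs(rel_codes, anchors, output_xyxy=True)
#   # the center shifts by ty * ha = 10 px in y (and likewise in x), while width/height grow
#   # to exp(0.2) * 100 ~= 122 px, giving approximately [[-1.1, -1.1, 121.1, 121.1]].
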
def clip_boxes_xyxy(boxes: torch.Tensor, size: torch.Tensor):
    boxes = boxes.clamp(min=0)
    size = torch.cat([size, size], dim=0)
    boxes = boxes.min(size)
    return boxes


def generate_detections(
        cls_outputs, box_outputs, anchor_boxes, indices, classes,
        img_scale: Optional[torch.Tensor], img_size: Optional[torch.Tensor],
        max_det_per_image: int = MAX_DETECTIONS_PER_IMAGE, soft_nms: bool = False):
    """Generates detections with RetinaNet model outputs and anchors.

    Args:
        cls_outputs: a torch tensor with shape [N, 1], which has the highest class
            scores on all feature levels. N is the number of selected
            top-k total anchors on all levels. (k being MAX_DETECTION_POINTS)
        box_outputs: a torch tensor with shape [N, 4], which stacks box regression
            outputs on all feature levels. N is the number of selected top-k
            total anchors on all levels. (k being MAX_DETECTION_POINTS)
        anchor_boxes: a torch tensor with shape [N, 4], which stacks anchors on all
            feature levels. N is the number of selected top-k total anchors on all levels.
        indices: a torch tensor with shape [N], which is the indices from top-k selection.
        classes: a torch tensor with shape [N], which represents the class
            prediction on all selected anchors from top-k selection.
        img_scale: a float tensor representing the scale between original image
            and input image for the detector. It is used to rescale detections for
            evaluating with the original groundtruth annotations.
        img_size: a tensor with the (height, width) of the model input, used together
            with img_scale to clip boxes to valid image bounds.
        max_det_per_image: an int constant, added as argument to make torchscript happy
        soft_nms: if True, use Soft-NMS (gaussian) instead of hard NMS.

    Returns:
        detections: detection results in a tensor with shape [max_det_per_image, 6],
            each row representing [x_min, y_min, x_max, y_max, score, class]
    """
    assert box_outputs.shape[-1] == 4
    assert anchor_boxes.shape[-1] == 4
    assert cls_outputs.shape[-1] == 1

    anchor_boxes = anchor_boxes[indices, :]

    # Apply bounding box regression to anchors, boxes are converted to xyxy
    # here since PyTorch NMS expects them in that form.
    boxes = decode_box_outputs(box_outputs.float(), anchor_boxes, output_xyxy=True)
    if img_scale is not None and img_size is not None:
        boxes = clip_boxes_xyxy(boxes, img_size / img_scale)  # clip before NMS better?

    scores = cls_outputs.sigmoid().squeeze(1).float()
    if soft_nms:
        top_detection_idx, soft_scores = batched_soft_nms(
            boxes, scores, classes, method_gaussian=True, iou_threshold=0.3, score_threshold=.001)
        scores[top_detection_idx] = soft_scores
    else:
        top_detection_idx = batched_nms(boxes, scores, classes, iou_threshold=0.5)

    # keep only the top max_det_per_image scoring predictions
    top_detection_idx = top_detection_idx[:max_det_per_image]
    boxes = boxes[top_detection_idx]
    scores = scores[top_detection_idx, None]
    classes = classes[top_detection_idx, None] + 1  # back to class idx with background class = 0

    if img_scale is not None:
        boxes = boxes * img_scale

    # FIXME add option to convert boxes back to yxyx? Otherwise must be handled downstream if
    # that is the preferred output format.

    # stack em and pad out to max_det_per_image if necessary
    num_det = len(top_detection_idx)
    detections = torch.cat([boxes, scores, classes.float()], dim=1)
    if num_det < max_det_per_image:
        detections = torch.cat([
            detections,
            torch.zeros((max_det_per_image - num_det, 6), device=detections.device, dtype=detections.dtype)
        ], dim=0)
    return detections

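
# Illustrative sketch (not part of the original module): typical post-processing flow for a
# single image. The variable names (cls_topk, box_topk, indices, classes, img_scale, img_size)
# are hypothetical stand-ins for the model's per-image top-k selected outputs.
#
#   anchors = Anchors(3, 7, num_scales=3, aspect_ratios=[(1., 1.), (1.4, 0.7), (0.7, 1.4)],
#                     anchor_scale=4.0, image_size=(512, 512))
#   detections = generate_detections(
#       cls_topk, box_topk, anchors.boxes, indices, classes, img_scale, img_size)
#   # detections: [max_det_per_image, 6] rows of [x_min, y_min, x_max, y_max, score, class],
#   # zero-padded when fewer than max_det_per_image boxes survive NMS.
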
def get_feat_sizes(image_size: Tuple[int, int], max_level: int):
    """Get feat widths and heights for all levels.

    Args:
        image_size: a tuple (H, W)
        max_level: maximum feature level.

    Returns:
        feat_sizes: a list of tuples (height, width) for each level.
    """
    feat_size = image_size
    feat_sizes = [feat_size]
    for _ in range(1, max_level + 1):
        feat_size = ((feat_size[0] - 1) // 2 + 1, (feat_size[1] - 1) // 2 + 1)
        feat_sizes.append(feat_size)
    return feat_sizes

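
# Illustrative sketch (not part of the original module): feature map sizes for a 512x512 input
# with max_level=7. Each level halves the previous one (rounding up):
#
#   get_feat_sizes((512, 512), max_level=7)
#   # -> [(512, 512), (256, 256), (128, 128), (64, 64), (32, 32), (16, 16), (8, 8), (4, 4)]
#   # index l of the returned list corresponds to feature level l (stride 2**l).
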
class Anchors(nn.Module):
    """RetinaNet Anchors class."""

    def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size: Tuple[int, int]):
        """Constructs multiscale RetinaNet anchors.

        Args:
            min_level: integer number of minimum level of the output feature pyramid.
            max_level: integer number of maximum level of the output feature pyramid.
            num_scales: integer number representing intermediate scales added
                on each level. For instance, num_scales=2 adds two additional
                anchor scales [2^0, 2^0.5] on each level.
            aspect_ratios: list of tuples representing the aspect ratio anchors added
                on each level. For instance, aspect_ratios =
                [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
            anchor_scale: float number representing the scale of the base anchor size
                relative to the feature stride 2^level.
            image_size: Sequence specifying input image size of model (H, W).
                The image_size must be divisible by the largest feature stride 2^max_level.
        """
        super(Anchors, self).__init__()
        self.min_level = min_level
        self.max_level = max_level
        self.num_scales = num_scales
        self.aspect_ratios = aspect_ratios
        if isinstance(anchor_scale, Sequence):
            assert len(anchor_scale) == max_level - min_level + 1
            self.anchor_scales = anchor_scale
        else:
            self.anchor_scales = [anchor_scale] * (max_level - min_level + 1)

        assert isinstance(image_size, Sequence) and len(image_size) == 2
        # FIXME this restriction can likely be relaxed with some additional changes
        assert image_size[0] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level'
        assert image_size[1] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level'

        self.image_size = tuple(image_size)
        self.feat_sizes = get_feat_sizes(image_size, max_level)
        self.config = self._generate_configs()
        self.register_buffer('boxes', self._generate_boxes())

    @classmethod
    def from_config(cls, config):
        return cls(
            config.min_level, config.max_level,
            config.num_scales, config.aspect_ratios,
            config.anchor_scale, config.image_size)

    def _generate_configs(self):
        """Generate configurations of anchor boxes."""
        anchor_configs = {}
        feat_sizes = self.feat_sizes
        for level in range(self.min_level, self.max_level + 1):
            anchor_configs[level] = []
            for scale_octave in range(self.num_scales):
                for aspect in self.aspect_ratios:
                    anchor_configs[level].append(
                        ((feat_sizes[0][0] // feat_sizes[level][0],
                          feat_sizes[0][1] // feat_sizes[level][1]),
                         scale_octave / float(self.num_scales), aspect,
                         self.anchor_scales[level - self.min_level]))
        return anchor_configs

    def _generate_boxes(self):
        """Generates multiscale anchor boxes."""
        boxes_all = []
        for _, configs in self.config.items():
            boxes_level = []
            for config in configs:
                stride, octave_scale, aspect, anchor_scale = config
                base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale
                base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale
                if isinstance(aspect, Sequence):
                    aspect_x = aspect[0]
                    aspect_y = aspect[1]
                else:
                    aspect_x = np.sqrt(aspect)
                    aspect_y = 1.0 / aspect_x
                anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0
                anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0

                x = np.arange(stride[1] / 2, self.image_size[1], stride[1])
                y = np.arange(stride[0] / 2, self.image_size[0], stride[0])
                xv, yv = np.meshgrid(x, y)
                xv = xv.reshape(-1)
                yv = yv.reshape(-1)

                boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
                                   yv + anchor_size_y_2, xv + anchor_size_x_2))
                boxes = np.swapaxes(boxes, 0, 1)
                boxes_level.append(np.expand_dims(boxes, axis=1))
            # concat anchors on the same level to shape NxAx4
            boxes_level = np.concatenate(boxes_level, axis=1)
            boxes_all.append(boxes_level.reshape([-1, 4]))

        anchor_boxes = np.vstack(boxes_all)
        anchor_boxes = torch.from_numpy(anchor_boxes).float()
        return anchor_boxes

    def get_anchors_per_location(self):
        return self.num_scales * len(self.aspect_ratios)

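
# Illustrative sketch (not part of the original module): anchors for an EfficientDet-D0-like
# configuration (these particular hyper-parameters are an assumption, not taken from this file).
#
#   anchors = Anchors(
#       min_level=3, max_level=7, num_scales=3,
#       aspect_ratios=[(1., 1.), (1.4, 0.7), (0.7, 1.4)],
#       anchor_scale=4.0, image_size=(512, 512))
#   anchors.get_anchors_per_location()   # 3 scales * 3 aspect ratios = 9
#   anchors.boxes.shape                  # torch.Size([49104, 4]) = 9 * (64^2 + 32^2 + 16^2 + 8^2 + 4^2)
#   # boxes are [y_min, x_min, y_max, x_max] in input-image pixels, stacked level by level.
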

class AnchorLabeler(object):
    """Labeler for multiscale anchor boxes."""

    def __init__(self, anchors, num_classes: int, match_threshold: float = 0.5):
        """Constructs anchor labeler to assign labels to anchors.

        Args:
            anchors: an instance of class Anchors.
            num_classes: integer number representing number of classes in the dataset.
            match_threshold: float number between 0 and 1 representing the threshold
                to assign positive labels for anchors.
        """
        similarity_calc = IouSimilarity()
        matcher = ArgMaxMatcher(
            match_threshold,
            unmatched_threshold=match_threshold,
            negatives_lower_than_unmatched=True,
            force_match_for_each_row=True)
        box_coder = FasterRcnnBoxCoder()

        self.target_assigner = TargetAssigner(similarity_calc, matcher, box_coder)
        self.anchors = anchors
        self.match_threshold = match_threshold
        self.num_classes = num_classes
        self.indices_cache = {}

    def label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
        """Labels anchors with ground truth inputs.

        Args:
            gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
                For each row, it stores [y0, x0, y1, x1] for four corners of a box.
            gt_classes: An integer tensor with shape [N, 1] representing groundtruth classes.
            filter_valid: Filter out any boxes w/ gt class <= -1 before assigning

        Returns:
            cls_targets_out: a list with one entry per level from min_level to max_level.
                Each entry is a tensor with shape [height_l, width_l, num_anchors]. The height_l
                and width_l represent the dimension of class logits at l-th level.
            box_targets_out: a list with one entry per level from min_level to max_level.
                Each entry is a tensor with shape [height_l, width_l, num_anchors * 4]. The height_l
                and width_l represent the dimension of bounding box regression output at l-th level.
            num_positives: scalar tensor storing number of positives in an image.
        """
        cls_targets_out = []
        box_targets_out = []

        if filter_valid:
            valid_idx = gt_classes > -1  # filter gt targets w/ label <= -1
            gt_boxes = gt_boxes[valid_idx]
            gt_classes = gt_classes[valid_idx]

        cls_targets, box_targets, matches = self.target_assigner.assign(
            BoxList(self.anchors.boxes), BoxList(gt_boxes), gt_classes)

        # class labels start from 1 and the background class = -1
        cls_targets = (cls_targets - 1).long()

        # Unpack labels: distribute the flat cls/box targets back into per-level maps.
        count = 0
        for level in range(self.anchors.min_level, self.anchors.max_level + 1):
            feat_size = self.anchors.feat_sizes[level]
            steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
            cls_targets_out.append(cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            box_targets_out.append(box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            count += steps

        num_positives = (matches.match_results > -1).float().sum()

        return cls_targets_out, box_targets_out, num_positives

    def batch_label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
        batch_size = len(gt_boxes)
        assert batch_size == len(gt_classes)
        num_levels = self.anchors.max_level - self.anchors.min_level + 1
        cls_targets_out = [[] for _ in range(num_levels)]
        box_targets_out = [[] for _ in range(num_levels)]
        num_positives_out = []

        anchor_box_list = BoxList(self.anchors.boxes)
        for i in range(batch_size):
            last_sample = i == batch_size - 1

            if filter_valid:
                valid_idx = gt_classes[i] > -1  # filter gt targets w/ label <= -1
                gt_box_list = BoxList(gt_boxes[i][valid_idx])
                gt_class_i = gt_classes[i][valid_idx]
            else:
                gt_box_list = BoxList(gt_boxes[i])
                gt_class_i = gt_classes[i]
            cls_targets, box_targets, matches = self.target_assigner.assign(anchor_box_list, gt_box_list, gt_class_i)

            # class labels start from 1 and the background class = -1
            cls_targets = (cls_targets - 1).long()

            # Unpack labels: distribute the flat cls/box targets back into per-level maps.
            count = 0
            for level in range(self.anchors.min_level, self.anchors.max_level + 1):
                level_idx = level - self.anchors.min_level
                feat_size = self.anchors.feat_sizes[level]
                steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
                cls_targets_out[level_idx].append(
                    cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
                box_targets_out[level_idx].append(
                    box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
                count += steps
                if last_sample:
                    cls_targets_out[level_idx] = torch.stack(cls_targets_out[level_idx])
                    box_targets_out[level_idx] = torch.stack(box_targets_out[level_idx])

            num_positives_out.append((matches.match_results > -1).float().sum())
            if last_sample:
                num_positives_out = torch.stack(num_positives_out)

        return cls_targets_out, box_targets_out, num_positives_out
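
# Illustrative sketch (not part of the original module): assigning targets for one image.
# The ground-truth values are hypothetical; boxes are [y0, x0, y1, x1] in input-image pixels
# and class labels start at 1 (0 is reserved for background).
#
#   anchors = Anchors(3, 7, num_scales=3, aspect_ratios=[(1., 1.), (1.4, 0.7), (0.7, 1.4)],
#                     anchor_scale=4.0, image_size=(512, 512))
#   labeler = AnchorLabeler(anchors, num_classes=90, match_threshold=0.5)
#   gt_boxes = torch.tensor([[10., 10., 200., 220.]])
#   gt_classes = torch.tensor([[1]])
#   cls_targets, box_targets, num_positives = labeler.label_anchors(gt_boxes, gt_classes)
#   # cls_targets / box_targets: one tensor per level (min_level..max_level) of shape
#   # [H_l, W_l, 9] and [H_l, W_l, 36]; num_positives counts anchors matched to a gt box.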