""" RetinaNet / EfficientDet Anchor Gen | |
Adapted for PyTorch from Tensorflow impl at | |
https://github.com/google/automl/blob/6f6694cec1a48cdb33d5d1551a2d5db8ad227798/efficientdet/anchors.py | |
Hacked together by Ross Wightman, original copyright below | |
""" | |
# Copyright 2020 Google Research. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Anchor definition. | |
This module is borrowed from TPU RetinaNet implementation: | |
https://github.com/tensorflow/tpu/blob/master/models/official/retinanet/anchors.py | |
""" | |
from typing import List, Optional, Sequence, Tuple

import numpy as np
import torch
import torch.nn as nn
#import torchvision.ops.boxes as tvb
from torchvision.ops.boxes import batched_nms, remove_small_boxes

from effdet.object_detection import ArgMaxMatcher, FasterRcnnBoxCoder, BoxList, IouSimilarity, TargetAssigner

from .soft_nms import batched_soft_nms

# The minimum score to consider a logit for identifying detections.
MIN_CLASS_SCORE = -5.0

# The score for a dummy detection
_DUMMY_DETECTION_SCORE = -1e5

# The maximum number of (anchor, class) pairs to keep for non-max suppression.
MAX_DETECTION_POINTS = 5000

# The maximum number of detections per image.
MAX_DETECTIONS_PER_IMAGE = 100


def decode_box_outputs(rel_codes, anchors, output_xyxy: bool = False):
    """Transforms relative regression coordinates to absolute positions.

    Network predictions are normalized and relative to a given anchor; this
    reverses the transformation and outputs absolute coordinates for the input image.

    Args:
        rel_codes: box regression targets.
        anchors: anchors on all feature levels.
        output_xyxy: if True, return boxes as [x_min, y_min, x_max, y_max] instead of
            the default [y_min, x_min, y_max, x_max].

    Returns:
        outputs: bounding boxes.
    """
    ycenter_a = (anchors[:, 0] + anchors[:, 2]) / 2
    xcenter_a = (anchors[:, 1] + anchors[:, 3]) / 2
    ha = anchors[:, 2] - anchors[:, 0]
    wa = anchors[:, 3] - anchors[:, 1]
    ty, tx, th, tw = rel_codes.unbind(dim=1)

    w = torch.exp(tw) * wa
    h = torch.exp(th) * ha
    ycenter = ty * ha + ycenter_a
    xcenter = tx * wa + xcenter_a
    ymin = ycenter - h / 2.
    xmin = xcenter - w / 2.
    ymax = ycenter + h / 2.
    xmax = xcenter + w / 2.
    if output_xyxy:
        out = torch.stack([xmin, ymin, xmax, ymax], dim=1)
    else:
        out = torch.stack([ymin, xmin, ymax, xmax], dim=1)
    return out

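
# Illustrative sketch (not part of the original module): decoding a single anchor by hand.
# The anchor value and regression codes below are made up purely for illustration.
#
#   anchors = torch.tensor([[0., 0., 100., 100.]])       # [y_min, x_min, y_max, x_max]
#   rel_codes = torch.tensor([[0.1, 0.1, 0.2, 0.2]])     # [ty, tx, th, tw]
#   boxes = decode_box_outputs(rel_codes, anchors, output_xyxy=True)
#   # the center shifts by ty * ha = 10 px in y (and likewise in x), while width/height grow
#   # to exp(0.2) * 100 ~= 122 px, giving approximately [[-1.1, -1.1, 121.1, 121.1]].
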
def clip_boxes_xyxy(boxes: torch.Tensor, size: torch.Tensor):
    boxes = boxes.clamp(min=0)
    size = torch.cat([size, size], dim=0)
    boxes = boxes.min(size)
    return boxes


def generate_detections(
        cls_outputs, box_outputs, anchor_boxes, indices, classes,
        img_scale: Optional[torch.Tensor], img_size: Optional[torch.Tensor],
        max_det_per_image: int = MAX_DETECTIONS_PER_IMAGE, soft_nms: bool = False):
    """Generates detections with RetinaNet model outputs and anchors.

    Args:
        cls_outputs: a torch tensor with shape [N, 1], which has the highest class
            scores on all feature levels. N is the number of selected
            top-k total anchors on all levels. (k being MAX_DETECTION_POINTS)
        box_outputs: a torch tensor with shape [N, 4], which stacks box regression
            outputs on all feature levels. N is the number of selected top-k
            total anchors on all levels. (k being MAX_DETECTION_POINTS)
        anchor_boxes: a torch tensor with shape [N, 4], which stacks anchors on all
            feature levels. N is the number of selected top-k total anchors on all levels.
        indices: a torch tensor with shape [N], which is the indices from top-k selection.
        classes: a torch tensor with shape [N], which represents the class
            prediction on all selected anchors from top-k selection.
        img_scale: a float tensor representing the scale between original image
            and input image for the detector. It is used to rescale detections for
            evaluating with the original groundtruth annotations.
        img_size: a tensor with the (height, width) of the model input, used together
            with img_scale to clip boxes to valid image bounds.
        max_det_per_image: an int constant, added as argument to make torchscript happy
        soft_nms: if True, use Soft-NMS (gaussian) instead of hard NMS.

    Returns:
        detections: detection results in a tensor with shape [max_det_per_image, 6],
            each row representing [x_min, y_min, x_max, y_max, score, class]
    """
    assert box_outputs.shape[-1] == 4
    assert anchor_boxes.shape[-1] == 4
    assert cls_outputs.shape[-1] == 1

    anchor_boxes = anchor_boxes[indices, :]

    # Apply bounding box regression to anchors, boxes are converted to xyxy
    # here since PyTorch NMS expects them in that form.
    boxes = decode_box_outputs(box_outputs.float(), anchor_boxes, output_xyxy=True)
    if img_scale is not None and img_size is not None:
        boxes = clip_boxes_xyxy(boxes, img_size / img_scale)  # clip before NMS better?

    scores = cls_outputs.sigmoid().squeeze(1).float()
    if soft_nms:
        top_detection_idx, soft_scores = batched_soft_nms(
            boxes, scores, classes, method_gaussian=True, iou_threshold=0.3, score_threshold=.001)
        scores[top_detection_idx] = soft_scores
    else:
        top_detection_idx = batched_nms(boxes, scores, classes, iou_threshold=0.5)

    # keep only the top max_det_per_image scoring predictions
    top_detection_idx = top_detection_idx[:max_det_per_image]
    boxes = boxes[top_detection_idx]
    scores = scores[top_detection_idx, None]
    classes = classes[top_detection_idx, None] + 1  # back to class idx with background class = 0

    if img_scale is not None:
        boxes = boxes * img_scale

    # FIXME add option to convert boxes back to yxyx? Otherwise must be handled downstream if
    # that is the preferred output format.

    # stack em and pad out to max_det_per_image if necessary
    num_det = len(top_detection_idx)
    detections = torch.cat([boxes, scores, classes.float()], dim=1)
    if num_det < max_det_per_image:
        detections = torch.cat([
            detections,
            torch.zeros((max_det_per_image - num_det, 6), device=detections.device, dtype=detections.dtype)
        ], dim=0)
    return detections

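
# Illustrative sketch (not part of the original module): typical post-processing flow for a
# single image. The variable names (cls_topk, box_topk, indices, classes, img_scale, img_size)
# are hypothetical stand-ins for the model's per-image top-k selected outputs.
#
#   anchors = Anchors(3, 7, num_scales=3, aspect_ratios=[(1., 1.), (1.4, 0.7), (0.7, 1.4)],
#                     anchor_scale=4.0, image_size=(512, 512))
#   detections = generate_detections(
#       cls_topk, box_topk, anchors.boxes, indices, classes, img_scale, img_size)
#   # detections: [max_det_per_image, 6] rows of [x_min, y_min, x_max, y_max, score, class],
#   # zero-padded when fewer than max_det_per_image boxes survive NMS.
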
def get_feat_sizes(image_size: Tuple[int, int], max_level: int):
    """Get feat widths and heights for all levels.

    Args:
        image_size: a tuple (H, W)
        max_level: maximum feature level.

    Returns:
        feat_sizes: a list of tuples (height, width) for each level.
    """
    feat_size = image_size
    feat_sizes = [feat_size]
    for _ in range(1, max_level + 1):
        feat_size = ((feat_size[0] - 1) // 2 + 1, (feat_size[1] - 1) // 2 + 1)
        feat_sizes.append(feat_size)
    return feat_sizes

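
# Illustrative sketch (not part of the original module): feature map sizes for a 512x512 input
# with max_level=7. Each level halves the previous one (rounding up):
#
#   get_feat_sizes((512, 512), max_level=7)
#   # -> [(512, 512), (256, 256), (128, 128), (64, 64), (32, 32), (16, 16), (8, 8), (4, 4)]
#   # index l of the returned list corresponds to feature level l (stride 2**l).
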
class Anchors(nn.Module):
    """RetinaNet Anchors class."""

    def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size: Tuple[int, int]):
        """Constructs multiscale RetinaNet anchors.

        Args:
            min_level: integer number of minimum level of the output feature pyramid.
            max_level: integer number of maximum level of the output feature pyramid.
            num_scales: integer number representing intermediate scales added
                on each level. For instance, num_scales=2 adds two additional
                anchor scales [2^0, 2^0.5] on each level.
            aspect_ratios: list of tuples representing the aspect ratio anchors added
                on each level. For instance, aspect_ratios =
                [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
            anchor_scale: float number representing the scale of the base anchor size
                relative to the feature stride 2^level.
            image_size: Sequence specifying input image size of model (H, W).
                The image_size must be divisible by the largest feature stride 2^max_level.
        """
        super(Anchors, self).__init__()
        self.min_level = min_level
        self.max_level = max_level
        self.num_scales = num_scales
        self.aspect_ratios = aspect_ratios
        if isinstance(anchor_scale, Sequence):
            assert len(anchor_scale) == max_level - min_level + 1
            self.anchor_scales = anchor_scale
        else:
            self.anchor_scales = [anchor_scale] * (max_level - min_level + 1)

        assert isinstance(image_size, Sequence) and len(image_size) == 2
        # FIXME this restriction can likely be relaxed with some additional changes
        assert image_size[0] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level'
        assert image_size[1] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level'

        self.image_size = tuple(image_size)
        self.feat_sizes = get_feat_sizes(image_size, max_level)
        self.config = self._generate_configs()
        self.register_buffer('boxes', self._generate_boxes())

    @classmethod
    def from_config(cls, config):
        return cls(
            config.min_level, config.max_level,
            config.num_scales, config.aspect_ratios,
            config.anchor_scale, config.image_size)

    def _generate_configs(self):
        """Generate configurations of anchor boxes."""
        anchor_configs = {}
        feat_sizes = self.feat_sizes
        for level in range(self.min_level, self.max_level + 1):
            anchor_configs[level] = []
            for scale_octave in range(self.num_scales):
                for aspect in self.aspect_ratios:
                    anchor_configs[level].append(
                        ((feat_sizes[0][0] // feat_sizes[level][0],
                          feat_sizes[0][1] // feat_sizes[level][1]),
                         scale_octave / float(self.num_scales), aspect,
                         self.anchor_scales[level - self.min_level]))
        return anchor_configs

    def _generate_boxes(self):
        """Generates multiscale anchor boxes."""
        boxes_all = []
        for _, configs in self.config.items():
            boxes_level = []
            for config in configs:
                stride, octave_scale, aspect, anchor_scale = config
                base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale
                base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale
                if isinstance(aspect, Sequence):
                    aspect_x = aspect[0]
                    aspect_y = aspect[1]
                else:
                    aspect_x = np.sqrt(aspect)
                    aspect_y = 1.0 / aspect_x
                anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0
                anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0

                x = np.arange(stride[1] / 2, self.image_size[1], stride[1])
                y = np.arange(stride[0] / 2, self.image_size[0], stride[0])
                xv, yv = np.meshgrid(x, y)
                xv = xv.reshape(-1)
                yv = yv.reshape(-1)

                boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
                                   yv + anchor_size_y_2, xv + anchor_size_x_2))
                boxes = np.swapaxes(boxes, 0, 1)
                boxes_level.append(np.expand_dims(boxes, axis=1))
            # concat anchors on the same level to shape NxAx4
            boxes_level = np.concatenate(boxes_level, axis=1)
            boxes_all.append(boxes_level.reshape([-1, 4]))

        anchor_boxes = np.vstack(boxes_all)
        anchor_boxes = torch.from_numpy(anchor_boxes).float()
        return anchor_boxes

    def get_anchors_per_location(self):
        return self.num_scales * len(self.aspect_ratios)

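
# Illustrative sketch (not part of the original module): anchors for an EfficientDet-D0-like
# configuration (these particular hyper-parameters are an assumption, not taken from this file).
#
#   anchors = Anchors(
#       min_level=3, max_level=7, num_scales=3,
#       aspect_ratios=[(1., 1.), (1.4, 0.7), (0.7, 1.4)],
#       anchor_scale=4.0, image_size=(512, 512))
#   anchors.get_anchors_per_location()   # 3 scales * 3 aspect ratios = 9
#   anchors.boxes.shape                  # torch.Size([49104, 4]) = 9 * (64^2 + 32^2 + 16^2 + 8^2 + 4^2)
#   # boxes are [y_min, x_min, y_max, x_max] in input-image pixels, stacked level by level.
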

class AnchorLabeler(object):
    """Labeler for multiscale anchor boxes."""

    def __init__(self, anchors, num_classes: int, match_threshold: float = 0.5):
        """Constructs anchor labeler to assign labels to anchors.

        Args:
            anchors: an instance of class Anchors.
            num_classes: integer number representing number of classes in the dataset.
            match_threshold: float number between 0 and 1 representing the threshold
                to assign positive labels for anchors.
        """
        similarity_calc = IouSimilarity()
        matcher = ArgMaxMatcher(
            match_threshold,
            unmatched_threshold=match_threshold,
            negatives_lower_than_unmatched=True,
            force_match_for_each_row=True)
        box_coder = FasterRcnnBoxCoder()

        self.target_assigner = TargetAssigner(similarity_calc, matcher, box_coder)
        self.anchors = anchors
        self.match_threshold = match_threshold
        self.num_classes = num_classes
        self.indices_cache = {}

    def label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
        """Labels anchors with ground truth inputs.

        Args:
            gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
                For each row, it stores [y0, x0, y1, x1] for four corners of a box.
            gt_classes: An integer tensor with shape [N, 1] representing groundtruth classes.
            filter_valid: Filter out any boxes w/ gt class <= -1 before assigning

        Returns:
            cls_targets_out: a list with one entry per level from min_level to max_level.
                Each entry is a tensor with shape [height_l, width_l, num_anchors]. The height_l
                and width_l represent the dimension of class logits at l-th level.
            box_targets_out: a list with one entry per level from min_level to max_level.
                Each entry is a tensor with shape [height_l, width_l, num_anchors * 4]. The height_l
                and width_l represent the dimension of bounding box regression output at l-th level.
            num_positives: scalar tensor storing number of positives in an image.
        """
        cls_targets_out = []
        box_targets_out = []

        if filter_valid:
            valid_idx = gt_classes > -1  # filter gt targets w/ label <= -1
            gt_boxes = gt_boxes[valid_idx]
            gt_classes = gt_classes[valid_idx]

        cls_targets, box_targets, matches = self.target_assigner.assign(
            BoxList(self.anchors.boxes), BoxList(gt_boxes), gt_classes)

        # class labels start from 1 and the background class = -1
        cls_targets = (cls_targets - 1).long()

        # Unpack labels: distribute the flat cls/box targets back into per-level maps.
        count = 0
        for level in range(self.anchors.min_level, self.anchors.max_level + 1):
            feat_size = self.anchors.feat_sizes[level]
            steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
            cls_targets_out.append(cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            box_targets_out.append(box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            count += steps

        num_positives = (matches.match_results > -1).float().sum()

        return cls_targets_out, box_targets_out, num_positives

    def batch_label_anchors(self, gt_boxes, gt_classes, filter_valid=True):
        batch_size = len(gt_boxes)
        assert batch_size == len(gt_classes)
        num_levels = self.anchors.max_level - self.anchors.min_level + 1
        cls_targets_out = [[] for _ in range(num_levels)]
        box_targets_out = [[] for _ in range(num_levels)]
        num_positives_out = []

        anchor_box_list = BoxList(self.anchors.boxes)
        for i in range(batch_size):
            last_sample = i == batch_size - 1

            if filter_valid:
                valid_idx = gt_classes[i] > -1  # filter gt targets w/ label <= -1
                gt_box_list = BoxList(gt_boxes[i][valid_idx])
                gt_class_i = gt_classes[i][valid_idx]
            else:
                gt_box_list = BoxList(gt_boxes[i])
                gt_class_i = gt_classes[i]
            cls_targets, box_targets, matches = self.target_assigner.assign(anchor_box_list, gt_box_list, gt_class_i)

            # class labels start from 1 and the background class = -1
            cls_targets = (cls_targets - 1).long()

            # Unpack labels: distribute the flat cls/box targets back into per-level maps.
            count = 0
            for level in range(self.anchors.min_level, self.anchors.max_level + 1):
                level_idx = level - self.anchors.min_level
                feat_size = self.anchors.feat_sizes[level]
                steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
                cls_targets_out[level_idx].append(
                    cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
                box_targets_out[level_idx].append(
                    box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
                count += steps
                if last_sample:
                    cls_targets_out[level_idx] = torch.stack(cls_targets_out[level_idx])
                    box_targets_out[level_idx] = torch.stack(box_targets_out[level_idx])

            num_positives_out.append((matches.match_results > -1).float().sum())
            if last_sample:
                num_positives_out = torch.stack(num_positives_out)

        return cls_targets_out, box_targets_out, num_positives_out
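
# Illustrative sketch (not part of the original module): assigning targets for one image.
# The ground-truth values are hypothetical; boxes are [y0, x0, y1, x1] in input-image pixels
# and class labels start at 1 (0 is reserved for background).
#
#   anchors = Anchors(3, 7, num_scales=3, aspect_ratios=[(1., 1.), (1.4, 0.7), (0.7, 1.4)],
#                     anchor_scale=4.0, image_size=(512, 512))
#   labeler = AnchorLabeler(anchors, num_classes=90, match_threshold=0.5)
#   gt_boxes = torch.tensor([[10., 10., 200., 220.]])
#   gt_classes = torch.tensor([[1]])
#   cls_targets, box_targets, num_positives = labeler.label_anchors(gt_boxes, gt_classes)
#   # cls_targets / box_targets: one tensor per level (min_level..max_level) of shape
#   # [H_l, W_l, 9] and [H_l, W_l, 36]; num_positives counts anchors matched to a gt box.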