Spaces:
Sleeping
Sleeping
/*
 * SPDX-License-Identifier: Apache-2.0
 */
using namespace ONNX_NAMESPACE; | |
namespace ONNX_NAMESPACE { | |
static const char* RoiAlign_ver16_doc = R"DOC( | |
Region of Interest (RoI) align operation described in the | |
[Mask R-CNN paper](https://arxiv.org/abs/1703.06870). | |
RoiAlign consumes an input tensor X and region of interests (rois) | |
to apply pooling across each RoI; it produces a 4-D tensor of shape | |
(num_rois, C, output_height, output_width). | |
RoiAlign is proposed to avoid the misalignment by removing | |
quantizations while converting from original image into feature | |
map and from feature map into RoI feature; in each ROI bin, | |
the value of the sampled locations are computed directly | |
through bilinear interpolation. | |
)DOC"; | |
// Schema definition for RoiAlign, passed to ONNX_OPERATOR_SET_SCHEMA with
// version 16. It declares:
//   - attributes: spatial_scale, output_height, output_width, sampling_ratio,
//     mode and coordinate_transformation_mode (defaults given inline below);
//   - inputs: X (T1), rois (T1) and batch_indices (T2);
//   - output: Y (T1);
//   - type constraints T1 (float16/float/double tensors) and T2 (int64 tensors);
//   - a type/shape inference function that gives Y the shape
//     {num_rois, C, output_height, output_width}.
ONNX_OPERATOR_SET_SCHEMA(
    RoiAlign,
    16,
    OpSchema()
        .SetDoc(RoiAlign_ver16_doc)
        .Attr(
            "spatial_scale",
            "Multiplicative spatial scale factor to translate ROI coordinates "
            "from their input spatial scale to the scale used when pooling, "
            "i.e., spatial scale of the input feature map X relative to the "
            "input image. E.g.; default is 1.0f. ",
            AttributeProto::FLOAT,
            1.f)
        .Attr("output_height", "default 1; Pooled output Y's height.", AttributeProto::INT, static_cast<int64_t>(1))
        .Attr("output_width", "default 1; Pooled output Y's width.", AttributeProto::INT, static_cast<int64_t>(1))
        .Attr(
            "sampling_ratio",
            "Number of sampling points in the interpolation grid used to compute "
            "the output value of each pooled output bin. If > 0, then exactly "
            "sampling_ratio x sampling_ratio grid points are used. If == 0, then "
            "an adaptive number of grid points are used (computed as "
            "ceil(roi_width / output_width), and likewise for height). Default is 0.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr(
            "mode",
            "The pooling method. Two modes are supported: 'avg' and 'max'. "
            "Default is 'avg'.",
            AttributeProto::STRING,
            std::string("avg"))
        .Attr(
            "coordinate_transformation_mode",
            "Allowed values are 'half_pixel' and 'output_half_pixel'. "
            "Use the value 'half_pixel' to pixel shift the input coordinates by -0.5 (the recommended behavior). "
            "Use the value 'output_half_pixel' to omit the pixel shift for the input (use this for a "
            "backward-compatible behavior).",
            AttributeProto::STRING,
            std::string("half_pixel"))
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "4-D feature map of shape (N, C, H, W), "
            "where N is the batch size, C is the number of channels, "
            "and H and W are the height and the width of the data.",
            "T1")
        .Input(
            1,
            "rois",
            "RoIs (Regions of Interest) to pool over; rois is "
            "2-D input of shape (num_rois, 4) given as "
            "[[x1, y1, x2, y2], ...]. "
            "The RoIs' coordinates are in the coordinate system of the input image. "
            "Each coordinate set has a 1:1 correspondence with the 'batch_indices' input.",
            "T1")
        .Input(
            2,
            "batch_indices",
            "1-D tensor of shape (num_rois,) with each element denoting "
            "the index of the corresponding image in the batch.",
            "T2")
        .Output(
            0,
            "Y",
            "RoI pooled output, 4-D tensor of shape "
            "(num_rois, C, output_height, output_width). The r-th batch element Y[r-1] "
            "is a pooled feature map corresponding to the r-th RoI X[r-1].",
            "T1")
        .TypeConstraint(
            "T1",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain types to float tensors.")
        .TypeConstraint("T2", {"tensor(int64)"}, "Constrain types to int tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // Element type of output 0 (Y) is taken from input 0 (X).
          propagateElemTypeFromInputToOutput(ctx, 0, 0);

          // Positions of the three inputs declared above: X, rois, batch_indices.
          size_t input_param = 0, rois_param = 1, batch_index_param = 2;

          // Expected ranks: X is 4-D, rois is 2-D, batch_indices is 1-D.
          checkInputRank(ctx, input_param, 4);
          checkInputRank(ctx, rois_param, 2);
          checkInputRank(ctx, batch_index_param, 1);

          // Output dimensions, initialized to an unknown-dimension-value
          Dim num_rois, C, ht, width;

          // Get value of C from dim 1 of input_param, if available
          unifyInputDim(ctx, input_param, 1, C);

          // Get value of num_rois from dim 0 of rois_param, if available
          unifyInputDim(ctx, rois_param, 0, num_rois);
          // ... or from dim 0 of batch_index_param, if available
          unifyInputDim(ctx, batch_index_param, 0, num_rois);

          // Get height from attribute, using default-value of 1
          unifyDim(ht, getAttribute(ctx, "output_height", 1));
          // Get width from attribute, using default-value of 1
          unifyDim(width, getAttribute(ctx, "output_width", 1));

          // set output shape:
          updateOutputShape(ctx, 0, {num_rois, C, ht, width});
        }));
// Documentation text attached to the NonMaxSuppression (version 11) schema via
// SetDoc(). The raw string is emitted verbatim, so it must contain only the doc
// text itself (no table/markup residue from copy-paste).
static const char* NonMaxSuppression_ver11_doc = R"DOC(
Filter out boxes that have high intersection-over-union (IOU) overlap with previously selected boxes.
Bounding boxes with score less than score_threshold are removed. Bounding box format is indicated by attribute center_point_box.
Note that this algorithm is agnostic to where the origin is in the coordinate system and more generally is invariant to
orthogonal transformations and translations of the coordinate system; thus translating or reflections of the coordinate system
result in the same boxes being selected by the algorithm.
The selected_indices output is a set of integers indexing into the input collection of bounding boxes representing the selected boxes.
The bounding box coordinates corresponding to the selected indices can then be obtained using the Gather or GatherND operation.
)DOC";
// Schema definition for NonMaxSuppression, passed to ONNX_OPERATOR_SET_SCHEMA
// with version 11. It declares:
//   - inputs: boxes and scores (float tensors), plus three inputs marked
//     OpSchema::Optional: max_output_boxes_per_class (int64), iou_threshold
//     (float) and score_threshold (float);
//   - output: selected_indices (int64 tensor);
//   - attribute: center_point_box (INT, default 0);
//   - a type/shape inference function that reports the output as INT64 with
//     two dimensions, the first left without a value and the second fixed to 3.
ONNX_OPERATOR_SET_SCHEMA(
    NonMaxSuppression,
    11,
    OpSchema()
        .Input(
            0,
            "boxes",
            "An input tensor with shape [num_batches, spatial_dimension, 4]. The single box data format is indicated by center_point_box.",
            "tensor(float)")
        .Input(1, "scores", "An input tensor with shape [num_batches, num_classes, spatial_dimension]", "tensor(float)")
        .Input(
            2,
            "max_output_boxes_per_class",
            "Integer representing the maximum number of boxes to be selected per batch per class. It is a scalar. Default to 0, which means no output.",
            "tensor(int64)",
            OpSchema::Optional)
        .Input(
            3,
            "iou_threshold",
            "Float representing the threshold for deciding whether boxes overlap too much with respect to IOU. It is scalar. Value range [0, 1]. Default to 0.",
            "tensor(float)",
            OpSchema::Optional)
        .Input(
            4,
            "score_threshold",
            "Float representing the threshold for deciding when to remove boxes based on score. It is a scalar.",
            "tensor(float)",
            OpSchema::Optional)
        .Output(
            0,
            "selected_indices",
            "selected indices from the boxes tensor. [num_selected_indices, 3], the selected index format is [batch_index, class_index, box_index].",
            "tensor(int64)")
        .Attr(
            "center_point_box",
            "Integer indicate the format of the box data. The default is 0. "
            "0 - the box data is supplied as [y1, x1, y2, x2] where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair of box corners "
            "and the coordinates can be provided as normalized (i.e., lying in the interval [0, 1]) or absolute. Mostly used for TF models. "
            "1 - the box data is supplied as [x_center, y_center, width, height]. Mostly used for Pytorch models.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .SetDoc(NonMaxSuppression_ver11_doc)
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // Type inference - Output is always of type INT64
          auto* selected_indices_type = ctx.getOutputType(0)->mutable_tensor_type();
          selected_indices_type->set_elem_type(TensorProto_DataType::TensorProto_DataType_INT64);

          // Shape inference
          // The exact shape cannot be determined, as it depends on the input
          // data and other input configurations for the op, but part of the
          // shape can be established.
          auto* selected_indices_shape = getOutputShape(ctx, 0);
          // Start from an empty shape so the two dims added below are the only ones.
          selected_indices_shape->clear_dim();

          // Output is 2D always
          // The value of the first dim is determined by input data
          // hence its value cannot be determined statically
          selected_indices_shape->add_dim();

          // The value of the second dim is 3
          selected_indices_shape->add_dim()->set_dim_value(3);
        }));
} // namespace ONNX_NAMESPACE | |