/*
 * SPDX-License-Identifier: Apache-2.0
 */

#include "onnx/defs/schema.h"

namespace ONNX_NAMESPACE {

static const char* RoiAlign_ver16_doc = R"DOC(
Region of Interest (RoI) align operation described in the
[Mask R-CNN paper](https://arxiv.org/abs/1703.06870).
RoiAlign consumes an input tensor X and regions of interest (rois)
to apply pooling across each RoI; it produces a 4-D tensor of shape
(num_rois, C, output_height, output_width).

RoiAlign is proposed to avoid the misalignment by removing the
quantizations performed while converting from the original image into
the feature map and from the feature map into the RoI feature; in each
RoI bin, the values of the sampled locations are computed directly
through bilinear interpolation.
)DOC";

ONNX_OPERATOR_SET_SCHEMA(
    RoiAlign,
    16,
    OpSchema()
        .SetDoc(RoiAlign_ver16_doc)
        .Attr(
            "spatial_scale",
            "Multiplicative spatial scale factor to translate ROI coordinates "
            "from their input spatial scale to the scale used when pooling, "
            "i.e., spatial scale of the input feature map X relative to the "
            "input image. E.g.; default is 1.0f. ",
            AttributeProto::FLOAT,
            1.f)
        .Attr("output_height", "default 1; Pooled output Y's height.", AttributeProto::INT, static_cast<int64_t>(1))
        .Attr("output_width", "default 1; Pooled output Y's width.", AttributeProto::INT, static_cast<int64_t>(1))
        .Attr(
            "sampling_ratio",
            "Number of sampling points in the interpolation grid used to compute "
            "the output value of each pooled output bin. If > 0, then exactly "
            "sampling_ratio x sampling_ratio grid points are used. If == 0, then "
            "an adaptive number of grid points are used (computed as "
            "ceil(roi_width / output_width), and likewise for height). Default is 0.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr(
            "mode",
            "The pooling method. Two modes are supported: 'avg' and 'max'. "
            "Default is 'avg'.",
            AttributeProto::STRING,
            std::string("avg"))
        .Attr(
            "coordinate_transformation_mode",
            "Allowed values are 'half_pixel' and 'output_half_pixel'. "
            "Use the value 'half_pixel' to pixel shift the input coordinates by -0.5 (the recommended behavior). "
            "Use the value 'output_half_pixel' to omit the pixel shift for the input (use this for a "
            "backward-compatible behavior).",
            AttributeProto::STRING,
            std::string("half_pixel"))
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "4-D feature map of shape (N, C, H, W), "
            "where N is the batch size, C is the number of channels, "
            "and H and W are the height and the width of the data.",
            "T1")
        .Input(
            1,
            "rois",
            "RoIs (Regions of Interest) to pool over; rois is "
            "2-D input of shape (num_rois, 4) given as "
            "[[x1, y1, x2, y2], ...]. "
            "The RoIs' coordinates are in the coordinate system of the input image. "
            "Each coordinate set has a 1:1 correspondence with the 'batch_indices' input.",
            "T1")
        .Input(
            2,
            "batch_indices",
            "1-D tensor of shape (num_rois,) with each element denoting "
            "the index of the corresponding image in the batch.",
            "T2")
        .Output(
            0,
            "Y",
            "RoI pooled output, 4-D tensor of shape "
            "(num_rois, C, output_height, output_width). The r-th batch element Y[r-1] "
            "is a pooled feature map corresponding to the r-th RoI X[r-1].",
            "T1")
        .TypeConstraint(
            "T1",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain types to float tensors.")
        .TypeConstraint("T2", {"tensor(int64)"}, "Constrain types to int tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateElemTypeFromInputToOutput(ctx, 0, 0);

          size_t input_param = 0, rois_param = 1, batch_index_param = 2;

          checkInputRank(ctx, input_param, 4);
          checkInputRank(ctx, rois_param, 2);
          checkInputRank(ctx, batch_index_param, 1);

          // Output dimensions, initialized to an unknown-dimension-value
          Dim num_rois, C, ht, width;

          // Get value of C from dim 1 of input_param, if available
          unifyInputDim(ctx, input_param, 1, C);

          // Get value of num_rois from dim 0 of rois_param, if available
          unifyInputDim(ctx, rois_param, 0, num_rois);
          // ... or from dim 0 of batch_index_param, if available
          unifyInputDim(ctx, batch_index_param, 0, num_rois);

          // Get height from attribute, using default-value of 1
          unifyDim(ht, getAttribute(ctx, "output_height", 1));

          // Get width from attribute, using default-value of 1
          unifyDim(width, getAttribute(ctx, "output_width", 1));

          // set output shape:
          updateOutputShape(ctx, 0, {num_rois, C, ht, width});
        }));
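
// Worked example (hypothetical values): with X of shape (2, 256, 32, 32),
// rois of shape (100, 4), batch_indices of shape (100,), output_height = 7,
// and output_width = 7, the inference function above yields Y of shape
// (100, 256, 7, 7). When sampling_ratio == 0, a backend would typically pick
// the grid size adaptively per axis, e.g. for the width axis:
//
//   int64_t grid_w = sampling_ratio > 0
//       ? sampling_ratio
//       : static_cast<int64_t>(std::ceil(roi_width / output_width));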

static const char* NonMaxSuppression_ver11_doc = R"DOC(
Filter out boxes that have high intersection-over-union (IOU) overlap with previously selected boxes.
Bounding boxes with score less than score_threshold are removed. Bounding box format is indicated by attribute center_point_box.
Note that this algorithm is agnostic to where the origin is in the coordinate system and, more generally, is invariant to
orthogonal transformations and translations of the coordinate system; thus translating or reflecting the coordinate system
results in the same boxes being selected by the algorithm.
The selected_indices output is a set of integers indexing into the input collection of bounding boxes representing the selected boxes.
The bounding box coordinates corresponding to the selected indices can then be obtained using the Gather or GatherND operation.
)DOC";

ONNX_OPERATOR_SET_SCHEMA(
    NonMaxSuppression,
    11,
    OpSchema()
        .Input(
            0,
            "boxes",
            "An input tensor with shape [num_batches, spatial_dimension, 4]. The single box data format is indicated by center_point_box.",
            "tensor(float)")
        .Input(1, "scores", "An input tensor with shape [num_batches, num_classes, spatial_dimension]", "tensor(float)")
        .Input(
            2,
            "max_output_boxes_per_class",
            "Integer representing the maximum number of boxes to be selected per batch per class. It is a scalar. Default to 0, which means no output.",
            "tensor(int64)",
            OpSchema::Optional)
        .Input(
            3,
            "iou_threshold",
            "Float representing the threshold for deciding whether boxes overlap too much with respect to IOU. It is scalar. Value range [0, 1]. Default to 0.",
            "tensor(float)",
            OpSchema::Optional)
        .Input(
            4,
            "score_threshold",
            "Float representing the threshold for deciding when to remove boxes based on score. It is a scalar.",
            "tensor(float)",
            OpSchema::Optional)
        .Output(
            0,
            "selected_indices",
            "selected indices from the boxes tensor. [num_selected_indices, 3], the selected index format is [batch_index, class_index, box_index].",
            "tensor(int64)")
        .Attr(
            "center_point_box",
            "Integer indicate the format of the box data. The default is 0. "
            "0 - the box data is supplied as [y1, x1, y2, x2] where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair of box corners "
            "and the coordinates can be provided as normalized (i.e., lying in the interval [0, 1]) or absolute. Mostly used for TF models. "
            "1 - the box data is supplied as [x_center, y_center, width, height]. Mostly used for Pytorch models.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .SetDoc(NonMaxSuppression_ver11_doc)
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // Type inference - Output is always of type INT64
          auto* selected_indices_type = ctx.getOutputType(0)->mutable_tensor_type();
          selected_indices_type->set_elem_type(TensorProto_DataType::TensorProto_DataType_INT64);

          // Shape inference
          // The exact shape cannot be determined statically, as it depends on
          // the input data and the other input configurations for the op, but
          // part of the shape can be established.

          auto* selected_indices_shape = getOutputShape(ctx, 0);
          selected_indices_shape->clear_dim();

          // The output is always 2-D.

          // The value of the first dim is determined by input data
          // hence its value cannot be determined statically
          selected_indices_shape->add_dim();

          // The value of the second dim is 3
          selected_indices_shape->add_dim()->set_dim_value(3);
        }));
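
// Example of the output layout (hypothetical values): for a single batch with
// one class, selected_indices might be
//   [[0, 0, 3],
//    [0, 0, 0],
//    [0, 0, 5]]
// i.e. boxes 3, 0, and 5 of batch 0 / class 0 survive suppression. As the doc
// string notes, the surviving box coordinates can then be recovered with
// Gather/GatherND, e.g. GatherND(boxes, selected_indices[:, [0, 2]]) in
// NumPy-style notation.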

} // namespace ONNX_NAMESPACE