YinuoGuo27 committed on
Commit
02f8487
·
verified ·
1 Parent(s): da524b2

Upload 96 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. difpoint/src/__init__.py +5 -0
  2. difpoint/src/__pycache__/__init__.cpython-310.pyc +0 -0
  3. difpoint/src/__pycache__/__init__.cpython-38.pyc +0 -0
  4. difpoint/src/models/XPose/__init__.py +6 -0
  5. difpoint/src/models/XPose/config_model/UniPose_SwinT.py +125 -0
  6. difpoint/src/models/XPose/config_model/__init__.py +6 -0
  7. difpoint/src/models/XPose/config_model/coco_transformer.py +8 -0
  8. difpoint/src/models/XPose/models/UniPose/__init__.py +10 -0
  9. difpoint/src/models/XPose/models/UniPose/attention.py +373 -0
  10. difpoint/src/models/XPose/models/UniPose/backbone.py +211 -0
  11. difpoint/src/models/XPose/models/UniPose/deformable_transformer.py +1230 -0
  12. difpoint/src/models/XPose/models/UniPose/fuse_modules.py +276 -0
  13. difpoint/src/models/XPose/models/UniPose/mask_generate.py +56 -0
  14. difpoint/src/models/XPose/models/UniPose/ops/__init__.py +6 -0
  15. difpoint/src/models/XPose/models/UniPose/ops/functions/__init__.py +10 -0
  16. difpoint/src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py +61 -0
  17. difpoint/src/models/XPose/models/UniPose/ops/modules/__init__.py +9 -0
  18. difpoint/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn.py +142 -0
  19. difpoint/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py +130 -0
  20. difpoint/src/models/XPose/models/UniPose/ops/setup.py +79 -0
  21. difpoint/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp +41 -0
  22. difpoint/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h +33 -0
  23. difpoint/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu +153 -0
  24. difpoint/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h +30 -0
  25. difpoint/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh +1327 -0
  26. difpoint/src/models/XPose/models/UniPose/ops/src/ms_deform_attn.h +62 -0
  27. difpoint/src/models/XPose/models/UniPose/ops/src/vision.cpp +16 -0
  28. difpoint/src/models/XPose/models/UniPose/ops/test.py +89 -0
  29. difpoint/src/models/XPose/models/UniPose/position_encoding.py +157 -0
  30. difpoint/src/models/XPose/models/UniPose/swin_transformer.py +701 -0
  31. difpoint/src/models/XPose/models/UniPose/transformer_deformable.py +595 -0
  32. difpoint/src/models/XPose/models/UniPose/transformer_vanilla.py +102 -0
  33. difpoint/src/models/XPose/models/UniPose/unipose.py +621 -0
  34. difpoint/src/models/XPose/models/UniPose/utils.py +348 -0
  35. difpoint/src/models/XPose/models/__init__.py +16 -0
  36. difpoint/src/models/XPose/models/registry.py +58 -0
  37. difpoint/src/models/XPose/predefined_keypoints.py +56 -0
  38. difpoint/src/models/XPose/transforms.py +394 -0
  39. difpoint/src/models/XPose/util/__init__.py +6 -0
  40. difpoint/src/models/XPose/util/addict.py +159 -0
  41. difpoint/src/models/XPose/util/box_ops.py +139 -0
  42. difpoint/src/models/XPose/util/config.py +425 -0
  43. difpoint/src/models/XPose/util/keypoint_ops.py +29 -0
  44. difpoint/src/models/XPose/util/misc.py +701 -0
  45. difpoint/src/models/__init__.py +13 -0
  46. difpoint/src/models/__pycache__/__init__.cpython-310.pyc +0 -0
  47. difpoint/src/models/__pycache__/__init__.cpython-38.pyc +0 -0
  48. difpoint/src/models/__pycache__/appearance_feature_extractor_model.cpython-310.pyc +0 -0
  49. difpoint/src/models/__pycache__/appearance_feature_extractor_model.cpython-38.pyc +0 -0
  50. difpoint/src/models/__pycache__/base_model.cpython-310.pyc +0 -0
difpoint/src/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Author : wenshao
3
+ # @Email : [email protected]
4
+ # @Project : FasterLivePortrait
5
+ # @FileName: __init__.py
difpoint/src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (144 Bytes). View file
 
difpoint/src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (157 Bytes). View file
 
difpoint/src/models/XPose/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py
difpoint/src/models/XPose/config_model/UniPose_SwinT.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Flat settings module for the UniPose model with a Swin-T backbone.
# Every assignment is a plain module-level name; nothing is computed here.
# NOTE(review): `_base_` is presumably resolved by the project's config
# loader (not visible in this file) -- confirm before renaming or moving it.
_base_ = ["coco_transformer.py"]

use_label_enc = True

num_classes = 2

# --- optimisation / schedule (grouping inferred from the names) ---
lr = 0.0001
param_dict_type = "default"
lr_backbone = 1e-05
lr_backbone_names = ["backbone.0"]
lr_linear_proj_names = ["reference_points", "sampling_offsets"]
lr_linear_proj_mult = 0.1
ddetr_lr_param = False
batch_size = 2
weight_decay = 0.0001
epochs = 12
lr_drop = 11
save_checkpoint_interval = 100
clip_max_norm = 0.1
onecyclelr = False
multi_step_lr = False
lr_drop_list = [33, 45]

# --- model identity ---
modelname = "UniPose"
frozen_weights = None
backbone = "swin_T_224_1k"

# --- backbone / transformer structure (grouping inferred from the names) ---
dilation = False
position_embedding = "sine"
pe_temperatureH = 20
pe_temperatureW = 20
return_interm_indices = [1, 2, 3]
backbone_freeze_keywords = None
enc_layers = 6
dec_layers = 6
unic_layers = 0
pre_norm = False
dim_feedforward = 2048
hidden_dim = 256
dropout = 0.0
nheads = 8
num_queries = 900
query_dim = 4
num_patterns = 0
pdetr3_bbox_embed_diff_each_layer = False
pdetr3_refHW = -1
random_refpoints_xy = False
fix_refpoints_hw = -1
dabdetr_yolo_like_anchor_update = False
dabdetr_deformable_encoder = False
dabdetr_deformable_decoder = False
use_deformable_box_attn = False
box_attn_type = "roi_align"
dec_layer_number = None
num_feature_levels = 4
enc_n_points = 4
dec_n_points = 4
decoder_layer_noise = False
dln_xy_noise = 0.2
dln_hw_noise = 0.2
add_channel_attention = False
add_pos_value = False
two_stage_type = "standard"
two_stage_pat_embed = 0
two_stage_add_query_num = 0
two_stage_bbox_embed_share = False
two_stage_class_embed_share = False
two_stage_learn_wh = False
two_stage_default_hw = 0.05
two_stage_keep_all_tokens = False
num_select = 50
transformer_activation = "relu"
batch_norm_type = "FrozenBatchNorm2d"
masks = False

# Options listed by the original author: 'sa', 'ca_label', 'ca_content'.
decoder_sa_type = "sa"
# Alternative named by the original author: SimpleMinsumMatcher.
matcher_type = "HungarianMatcher"
decoder_module_seq = ["sa", "ca", "ffn"]
nms_iou_threshold = -1

dec_pred_bbox_embed_share = True
dec_pred_class_embed_share = True

# --- "dn_*" settings (presumably denoising training; verify against consumer) ---
use_dn = True
dn_number = 100
dn_box_noise_scale = 1.0
dn_label_noise_ratio = 0.5
dn_label_coef = 1.0
dn_bbox_coef = 1.0
embed_init_tgt = True
dn_labelbook_size = 2000

match_unstable_error = True

# --- EMA ---
use_ema = True
ema_decay = 0.9997
ema_epoch = 0

use_detached_boxes_dec_out = False

# --- text branch ---
max_text_len = 256
shuffle_type = None

use_text_enhancer = True
use_fusion_layer = True

# The original author left the alternative value `True` as a trailing note.
use_checkpoint = False
use_transformer_ckpt = True
text_encoder_type = "bert-base-uncased"

use_text_cross_attention = True
text_dropout = 0.0
fusion_dropout = 0.0
fusion_droppath = 0.1

# --- keypoint-specific settings ---
num_body_points = 68
binary_query_selection = False
use_cdn = True
ffn_extra_layernorm = False

fix_size = False
difpoint/src/models/XPose/config_model/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py
difpoint/src/models/XPose/config_model/coco_transformer.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Data-augmentation size settings, kept as plain module-level names.
# NOTE(review): how each value is consumed is not visible in this file.
data_aug_scales = [
    480, 512, 544, 576, 608, 640,
    672, 704, 736, 768, 800,
]
data_aug_max_size = 1333
data_aug_scales2_resize = [400, 500, 600]
data_aug_scales2_crop = [384, 600]

data_aug_scale_overlap = None
8
+
difpoint/src/models/XPose/models/UniPose/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # Conditional DETR
3
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Copied from DETR (https://github.com/facebookresearch/detr)
7
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ from .unipose import build_unipose
difpoint/src/models/XPose/models/UniPose/attention.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # ED-Pose
8
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # Conditional DETR
12
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
13
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
+ # ------------------------------------------------------------------------
15
+ # Modified from codes in torch.nn
16
+ # ------------------------------------------------------------------------
17
+
18
+ """
19
+ MultiheadAttention that supports query, key, and value having different dimensions.
20
+ Query, key, and value projections are removed.
21
+
22
+ Mostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873
23
+ and https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837
24
+ """
25
+
26
+ import warnings
27
+ import torch
28
+ from torch.nn.modules.linear import Linear
29
+ from torch.nn.init import constant_
30
+ from torch.nn.modules.module import Module
31
+ from torch._jit_internal import Optional, Tuple
32
+ try:
33
+ from torch.overrides import has_torch_function, handle_torch_function
34
+ except:
35
+ from torch._overrides import has_torch_function, handle_torch_function
36
+ from torch.nn.functional import linear, pad, softmax, dropout
37
+ Tensor = torch.Tensor
38
+
39
class MultiheadAttention(Module):
    r"""Multi-head attention without input projections.

    Unlike ``torch.nn.MultiheadAttention``, this module holds no query/key/value
    projection weights (``in_proj_weight``, ``in_proj_bias`` and the separate
    ``*_proj_weight`` attributes are all ``None``). The only learned layer is
    the output projection ``out_proj``, a ``Linear(vdim, vdim)``.

    See reference: Attention Is All You Need

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    Args:
        embed_dim: feature size of the query (and of the key, which is split
            with the same per-head size). Must be divisible by ``num_heads``.
        num_heads: number of parallel attention heads.
        dropout: dropout probability applied to the attention weights.
            Default: 0.0.
        bias: accepted for signature compatibility; not used by this class
            (``out_proj`` is always created with a bias).
        add_bias_kv: accepted for signature compatibility; not used by this
            class (``bias_k``/``bias_v`` are always ``None``).
        add_zero_attn: append one all-zero entry to the key and value
            sequences before attention.
        kdim: recorded on the module; defaults to ``embed_dim``.
        vdim: feature size of the value and of the output; defaults to
            ``embed_dim``. Must be divisible by ``num_heads``.

    Raises:
        AssertionError: if ``embed_dim`` is not divisible by ``num_heads``.
        ValueError: if ``vdim`` is not divisible by ``num_heads``.

    Examples::
        >>> multihead_attn = MultiheadAttention(embed_dim, num_heads)
        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
    """
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        # The value tensor is split into `num_heads` chunks of vdim // num_heads
        # features; reject an indivisible vdim here instead of letting forward()
        # fail later with an opaque reshape error.
        if self.vdim % num_heads != 0:
            raise ValueError(
                "vdim ({}) must be divisible by num_heads ({})".format(self.vdim, num_heads)
            )

        self.out_proj = Linear(self.vdim, self.vdim)

        # No input projections and no learned key/value bias in this variant.
        self.in_proj_bias = None
        self.in_proj_weight = None
        self.bias_k = self.bias_v = None
        self.q_proj_weight = None
        self.k_proj_weight = None
        self.v_proj_weight = None

        self.add_zero_attn = add_zero_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Zero the output-projection bias (the weight keeps Linear's default init)."""
        constant_(self.out_proj.bias, 0.)

    def __setstate__(self, state):
        """Restore pickled state, defaulting ``_qkv_same_embed_dim`` when absent."""
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if '_qkv_same_embed_dim' not in state:
            state['_qkv_same_embed_dim'] = True

        super(MultiheadAttention, self).__setstate__(state)

    def forward(self, query: Tensor, key: Tensor, value: Tensor,
                key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True,
                attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
        r"""Run attention by delegating to ``multi_head_attention_forward``.

        Args:
            query, key, value: map a query and a set of key-value pairs to an output.
                See "Attention Is All You Need" for more details.
            key_padding_mask: if provided, the marked key positions are ignored
                by the attention. For a bool mask ``True`` means "ignore"; for a
                byte mask any non-zero value means "ignore".
            need_weights: also return the head-averaged attention weights.
            attn_mask: 2D or 3D mask that prevents attention to certain
                positions. A 2D mask is broadcast over the batch; a 3D mask
                specifies one mask per (batch, head) entry.

        Shape:
            - Inputs:
            - query: :math:`(L, N, E)` where L is the target sequence length,
              N is the batch size, E is ``embed_dim``.
            - key: :math:`(S, N, E)` where S is the source sequence length.
            - value: :math:`(S, N, V)` where V is ``vdim``.
            - key_padding_mask: :math:`(N, S)`.
            - attn_mask: 2D mask :math:`(L, S)` or 3D mask
              :math:`(N*\text{num_heads}, L, S)`. Bool/byte masks block the
              marked positions; a float mask is added to the attention scores.

            - Outputs:
            - attn_output: :math:`(L, N, V)`.
            - attn_output_weights: :math:`(N, L, S)`, or ``None`` when
              ``need_weights`` is ``False``.
        """
        # The original had two near-identical call sites selected by
        # `_qkv_same_embed_dim`; they differed only in these projection-related
        # keyword arguments, so a single call covers both.
        return multi_head_attention_forward(
            query, key, value, self.embed_dim, self.num_heads,
            self.in_proj_weight, self.in_proj_bias,
            self.bias_k, self.bias_v, self.add_zero_attn,
            self.dropout, self.out_proj.weight, self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask, need_weights=need_weights,
            attn_mask=attn_mask,
            use_separate_proj_weight=not self._qkv_same_embed_dim,
            q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
            v_proj_weight=self.v_proj_weight, out_dim=self.vdim)
161
+
162
+
163
def multi_head_attention_forward(query: Tensor,
                                 key: Tensor,
                                 value: Tensor,
                                 embed_dim_to_check: int,
                                 num_heads: int,
                                 in_proj_weight: Optional[Tensor],
                                 in_proj_bias: Optional[Tensor],
                                 bias_k: Optional[Tensor],
                                 bias_v: Optional[Tensor],
                                 add_zero_attn: bool,
                                 dropout_p: float,
                                 out_proj_weight: Tensor,
                                 out_proj_bias: Tensor,
                                 training: bool = True,
                                 key_padding_mask: Optional[Tensor] = None,
                                 need_weights: bool = True,
                                 attn_mask: Optional[Tensor] = None,
                                 use_separate_proj_weight: bool = False,
                                 q_proj_weight: Optional[Tensor] = None,
                                 k_proj_weight: Optional[Tensor] = None,
                                 v_proj_weight: Optional[Tensor] = None,
                                 static_k: Optional[Tensor] = None,
                                 static_v: Optional[Tensor] = None,
                                 out_dim: Optional[int] = None
                                 ) -> Tuple[Tensor, Optional[Tensor]]:
    r"""Scaled dot-product multi-head attention without input projections.

    ``query``, ``key`` and ``value`` are used as given (only the query is
    scaled by ``head_dim ** -0.5``); the attention result is passed through a
    single output projection.

    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        embed_dim_to_check: expected size of the last dimension of ``query``.
        num_heads: parallel attention heads.
        in_proj_weight, in_proj_bias: not used by the computation; only
            forwarded to a ``__torch_function__`` override.
        bias_k, bias_v: optional bias rows appended to the key and value
            sequences at dim=0. Either both or neither must be given.
        add_zero_attn: append one all-zero entry to the key and value
            sequences at dim=1 (after the per-head reshape).
        dropout_p: probability of an attention weight being zeroed.
        out_proj_weight, out_proj_bias: the output projection weight and bias.
        training: apply dropout if ``True``.
        key_padding_mask: if provided, the marked key positions are filled
            with ``-inf`` before the softmax. Byte masks are converted to bool
            with a deprecation warning.
        need_weights: also return the head-averaged attention weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions.
            A 2D mask is broadcast over the batch; a 3D mask specifies one mask
            per (batch, head) entry. Bool masks fill with ``-inf``; float masks
            are added to the scores.
        use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight:
            not used by the computation; only forwarded to a
            ``__torch_function__`` override.
        static_k, static_v: pre-shaped per-head key / value that replace the
            ones derived from ``key`` / ``value``.
        out_dim: feature size of ``value`` and of the attention output before
            the output projection. Defaults to ``value.size(-1)`` when omitted.

    Shape:
        Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is
          the batch size, E is the embedding dimension.
        - key: :math:`(S, N, E)` where S is the source sequence length.
        - value: :math:`(S, N, V)` where V is ``out_dim``.
        - key_padding_mask: :math:`(N, S)`.
        - attn_mask: 2D mask :math:`(L, S)` or 3D mask :math:`(N*num_heads, L, S)`.
        - static_k: :math:`(N*num_heads, S, E/num_heads)`.
        - static_v: :math:`(N*num_heads, S, V/num_heads)`.

        Outputs:
        - attn_output: :math:`(L, N, D)` where D is the output size of the
          output projection.
        - attn_output_weights: :math:`(N, L, S)`, or ``None`` when
          ``need_weights`` is ``False``.

    Raises:
        AssertionError: on inconsistent input sizes or unsupported mask dtype.
        RuntimeError: if ``attn_mask`` has the wrong shape or rank.
    """
    if not torch.jit.is_scripting():
        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
                    out_proj_weight, out_proj_bias)
        if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
            # out_dim is forwarded as well; the original dropped it, so an
            # override never saw the caller's value.
            return handle_torch_function(
                multi_head_attention_forward, tens_ops, query, key, value,
                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
                need_weights=need_weights, attn_mask=attn_mask,
                use_separate_proj_weight=use_separate_proj_weight,
                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
                v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v,
                out_dim=out_dim)
    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == embed_dim_to_check
    # allow MHA to have different sizes for the feature dimension
    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

    if out_dim is None:
        # `value` is reshaped below into heads of out_dim // num_heads features,
        # so its last dimension is the only consistent default.
        out_dim = value.size(-1)

    head_dim = embed_dim // num_heads
    v_head_dim = out_dim // num_heads
    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
    scaling = float(head_dim) ** -0.5

    q = query * scaling
    k = key
    v = value

    if attn_mask is not None:
        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
        if attn_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
            attn_mask = attn_mask.to(torch.bool)

        if attn_mask.dim() == 2:
            attn_mask = attn_mask.unsqueeze(0)
            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 2D attn_mask is not correct.')
        elif attn_mask.dim() == 3:
            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 3D attn_mask is not correct.')
        else:
            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
        # attn_mask's dim is 3 now.

    # convert ByteTensor key_padding_mask to bool
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
        key_padding_mask = key_padding_mask.to(torch.bool)

    if bias_k is not None and bias_v is not None:
        if static_k is None and static_v is None:
            # Append the bias as one extra source position and widen the masks
            # by one column so the shapes stay aligned.
            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = pad(key_padding_mask, (0, 1))
        else:
            assert static_k is None, "bias cannot be added to static key."
            assert static_v is None, "bias cannot be added to static value."
    else:
        assert bias_k is None
        assert bias_v is None

    # (len, N, feat) -> (N * num_heads, len, feat_per_head)
    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
    v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)

    if static_k is not None:
        assert static_k.size(0) == bsz * num_heads
        assert static_k.size(2) == head_dim
        k = static_k

    if static_v is not None:
        assert static_v.size(0) == bsz * num_heads
        assert static_v.size(2) == v_head_dim
        v = static_v

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if add_zero_attn:
        src_len += 1
        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
        else:
            attn_output_weights += attn_mask

    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        )
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

    # Subtract the row-wise maximum before the softmax (the original author's
    # replacement for a plain softmax call).
    attn_output_weights = softmax(
        attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)
    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)

    attn_output = torch.bmm(attn_output_weights, v)
    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
373
+
difpoint/src/models/XPose/models/UniPose/backbone.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # Conditional DETR
8
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # Copied from DETR (https://github.com/facebookresearch/detr)
12
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
13
+ # ------------------------------------------------------------------------
14
+
15
+ """
16
+ Backbone modules.
17
+ """
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import torchvision
22
+ from torch import nn
23
+ from torchvision.models._utils import IntermediateLayerGetter
24
+ from typing import Dict, List
25
+
26
+ from ...util.misc import NestedTensor, is_main_process
27
+
28
+ from .position_encoding import build_position_encoding
29
+ from .swin_transformer import build_swin_transformer
30
+
31
class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.

    All four tensors (``weight``, ``bias``, ``running_mean``, ``running_var``)
    are registered as buffers, not parameters.

    Args:
        n: number of channels.
        eps: value added to the running variance before the reciprocal
            square root. Default: 1e-5 (the value previously hard-coded in
            ``forward``).
    """

    # Class-level default so an instance restored without ``eps`` in its
    # ``__dict__`` still resolves ``self.eps`` to the historical value.
    eps: float = 1e-5

    def __init__(self, n, eps=1e-5):
        super(FrozenBatchNorm2d, self).__init__()
        self.eps = eps
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Drop the ``num_batches_tracked`` entry, then defer to ``nn.Module``.

        This module registers no such buffer, so the key is removed before the
        parent implementation sees the state dict.
        """
        num_batches_tracked_key = prefix + "num_batches_tracked"
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        """Apply ``x * scale + bias`` with per-channel constants from the buffers."""
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        scale = w * (rv + self.eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias
69
+
70
+
71
class BackboneBase(nn.Module):
    """Wrap a backbone so that it returns selected intermediate feature maps.

    Parameters are frozen unless ``train_backbone`` is true *and* the parameter
    name contains ``layer2``, ``layer3`` or ``layer4``. The backbone is wrapped
    in an ``IntermediateLayerGetter`` that exposes the last
    ``len(return_interm_indices)`` stages ``layer1`` .. ``layer4``; for example
    ``[1, 2, 3]`` maps ``layer2 -> "1"``, ``layer3 -> "2"``, ``layer4 -> "3"``.
    """

    def __init__(
        self,
        backbone: nn.Module,
        train_backbone: bool,
        num_channels: int,
        return_interm_indices: list,
    ):
        super().__init__()
        trainable_stages = ("layer2", "layer3", "layer4")
        for param_name, param in backbone.named_parameters():
            in_trainable_stage = any(stage in param_name for stage in trainable_stages)
            if not (train_backbone and in_trainable_stage):
                param.requires_grad_(False)

        # The returned stages are always the trailing ones, ending at layer4.
        first_stage = 5 - len(return_interm_indices)
        return_layers = {
            "layer{}".format(first_stage + offset): "{}".format(layer_index)
            for offset, layer_index in enumerate(return_interm_indices)
        }

        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        """Return a dict of ``NestedTensor``s, one per selected stage.

        The input mask is resized to each feature map's spatial size with
        ``F.interpolate`` and cast back to ``bool``.
        """
        features = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for level_name, feature in features.items():
            padding_mask = tensor_list.mask
            assert padding_mask is not None
            resized = F.interpolate(padding_mask[None].float(), size=feature.shape[-2:])
            out[level_name] = NestedTensor(feature, resized.to(torch.bool)[0])
        return out
108
+
109
+
110
class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm.

    Args:
        name: torchvision model name. ``resnet18``/``resnet34`` are recognised
            but rejected by an assertion; only ``resnet50`` and ``resnet101``
            are usable.
        train_backbone: forwarded to ``BackboneBase`` to decide which
            parameters stay trainable.
        dilation: if true, the last stage replaces its stride with dilation.
        return_interm_indices: one of ``[0, 1, 2, 3]``, ``[1, 2, 3]``, ``[3]``.
        batch_norm: normalisation layer class handed to torchvision.

    Raises:
        NotImplementedError: if ``name`` is not a known ResNet variant.
        AssertionError: for resnet18/34 or an unsupported
            ``return_interm_indices``.
    """

    def __init__(
        self,
        name: str,
        train_backbone: bool,
        dilation: bool,
        return_interm_indices: list,
        batch_norm=FrozenBatchNorm2d,
    ):
        if name not in ["resnet18", "resnet34", "resnet50", "resnet101"]:
            raise NotImplementedError("Why you can get here with name {}".format(name))
        # Validate the configuration *before* instantiating the network, so an
        # invalid request fails without building the model (and, on the main
        # process, without fetching pretrained weights first).
        # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]

        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(),
            norm_layer=batch_norm,
        )

        # Channel widths of layer1..layer4 for the bottleneck ResNets; keep the
        # trailing entries that match the returned stages.
        num_channels_all = [256, 512, 1024, 2048]
        num_channels = num_channels_all[4 - len(return_interm_indices):]
        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
135
+
136
+
137
class Joiner(nn.Sequential):
    """Sequential pair of a backbone (index 0) and a position encoder (index 1)."""

    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        """Run the backbone, then compute a position encoding per feature level.

        Returns:
            ``(out, pos)``: the list of feature levels produced by the backbone
            and, in the same order, the position encodings cast to each
            level's ``tensors`` dtype.
        """
        features = self[0](tensor_list)
        out: List[NestedTensor] = [level for _, level in features.items()]
        # position encoding
        pos = [self[1](level).to(level.tensors.dtype) for level in out]
        return out, pos
151
+
152
+
153
def build_backbone(args):
    """
    Build the visual backbone and join it with its position encoding.

    Useful args:
        - backbone: backbone name (``resnet50``, ``resnet101`` or one of the
          supported ``swin_*`` variants)
        - lr_backbone:
        - dilation
        - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
        - backbone_freeze_keywords: (not read by this function)
        - use_checkpoint: for swin only for now

    Returns:
        A ``Joiner`` whose ``num_channels`` attribute is the list of channel
        widths of the returned feature levels.

    Raises:
        NotImplementedError: if ``args.backbone`` is not a supported name.
        AssertionError: if ``return_interm_indices`` is unsupported or the
            backbone reports an inconsistent channel list.
    """
    position_embedding = build_position_encoding(args)
    # The backbone is always built as trainable here; which parameters are
    # actually frozen is decided inside the backbone wrapper.
    train_backbone = True
    return_interm_indices = args.return_interm_indices
    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
    use_checkpoint = getattr(args, "use_checkpoint", False)

    if args.backbone in ["resnet50", "resnet101"]:
        backbone = Backbone(
            args.backbone,
            train_backbone,
            args.dilation,
            return_interm_indices,
            batch_norm=FrozenBatchNorm2d,
        )
        bb_num_channels = backbone.num_channels
    elif args.backbone in [
        "swin_T_224_1k",
        "swin_B_224_22k",
        "swin_B_384_22k",
        "swin_L_224_22k",
        "swin_L_384_22k",
    ]:
        # The pretraining resolution is the second-to-last name component,
        # e.g. "swin_T_224_1k" -> 224.
        pretrain_img_size = int(args.backbone.split("_")[-2])
        backbone = build_swin_transformer(
            args.backbone,
            pretrain_img_size=pretrain_img_size,
            out_indices=tuple(return_interm_indices),
            dilation=False,
            use_checkpoint=use_checkpoint,
        )

        bb_num_channels = backbone.num_features[4 - len(return_interm_indices):]
    else:
        raise NotImplementedError("Unknown backbone {}".format(args.backbone))

    # Validate the channel list before it is attached to the model.
    assert len(bb_num_channels) == len(
        return_interm_indices
    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"
    assert isinstance(
        bb_num_channels, list
    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))

    model = Joiner(backbone, position_embedding)
    model.num_channels = bb_num_channels
    return model
difpoint/src/models/XPose/models/UniPose/deformable_transformer.py ADDED
@@ -0,0 +1,1230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # ED-Pose
8
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # DINO
12
+ # Copyright (c) 2022 IDEA. All Rights Reserved.
13
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
+ # ------------------------------------------------------------------------
15
+ # Modified from DETR (https://github.com/facebookresearch/detr)
16
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
17
+ # ------------------------------------------------------------------------
18
+
19
+ import math
20
+ import copy
21
+ import torch
22
+ import torch.utils.checkpoint as checkpoint
23
+ from torch import nn, Tensor
24
+ from typing import Optional
25
+ from ...util.misc import inverse_sigmoid
26
+
27
+ from .transformer_vanilla import TransformerEncoderLayer
28
+ from .fuse_modules import BiAttentionBlock
29
+ from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed
30
+ from .ops.modules import MSDeformAttn
31
+
32
+
33
class DeformableTransformer(nn.Module):
    """Encoder/decoder transformer over multi-scale image features and text.

    The module flattens the per-level feature maps, runs them (together with
    the text features in ``text_dict``) through ``self.encoder``, prepares the
    decoder queries (either from the encoder output when
    ``two_stage_type == 'standard'`` or from learned embeddings when
    ``two_stage_type == 'no'``) and finally runs ``self.decoder``.

    Constructor constraints enforced below (several defaults do NOT satisfy
    them, so they must be passed explicitly):
        - ``query_dim`` must be 4.
        - ``deformable_encoder`` and ``deformable_decoder`` must both be true
          (the non-deformable paths raise ``NotImplementedError``).
        - ``layer_share_type`` must be ``None``.
        - ``decoder_sa_type`` must be one of ``'sa'``, ``'ca_label'``,
          ``'ca_content'`` (the default ``'ca'`` is rejected).
        - ``normalize_before`` must be false.
        - ``learnable_tgt_init`` must be true.
        - ``two_stage_type`` must be ``'no'`` or ``'standard'``.

    ``enc_out_class_embed`` and ``enc_out_bbox_embed`` are initialised to
    ``None`` here; the two-stage path in ``forward`` calls them, so the owner
    of this module is expected to assign them before use.
    """

    def __init__(self, d_model=256, nhead=8,
                 num_queries=300,
                 num_encoder_layers=6,
                 num_unicoder_layers=0,
                 num_decoder_layers=6,
                 dim_feedforward=2048, dropout=0.0,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False, query_dim=4,
                 num_patterns=0,
                 modulate_hw_attn=False,
                 # for deformable encoder
                 deformable_encoder=False,
                 deformable_decoder=False,
                 num_feature_levels=1,
                 enc_n_points=4,
                 dec_n_points=4,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 # init query
                 learnable_tgt_init=False,
                 decoder_query_perturber=None,
                 add_channel_attention=False,
                 add_pos_value=False,
                 random_refpoints_xy=False,
                 # two stage
                 two_stage_type='no',
                 two_stage_pat_embed=0,
                 two_stage_add_query_num=0,
                 two_stage_learn_wh=False,
                 two_stage_keep_all_tokens=False,
                 # evo of #anchors
                 dec_layer_number=None,
                 rm_enc_query_scale=True,
                 rm_dec_query_scale=True,
                 rm_self_attn_layers=None,
                 key_aware_type=None,
                 # layer share
                 layer_share_type=None,
                 # for detach
                 rm_detach=None,
                 decoder_sa_type='ca',
                 module_seq=('sa', 'ca', 'ffn'),
                 # for dn
                 embed_init_tgt=False,

                 use_detached_boxes_dec_out=False,
                 use_text_enhancer=False,
                 use_fusion_layer=False,
                 use_checkpoint=False,
                 use_transformer_ckpt=False,
                 use_text_cross_attention=False,
                 text_dropout=0.1,
                 fusion_dropout=0.1,
                 fusion_droppath=0.0,

                 binary_query_selection=False,
                 ffn_extra_layernorm=False,
                 ):
        super().__init__()
        self.num_feature_levels = num_feature_levels
        self.num_encoder_layers = num_encoder_layers
        self.num_unicoder_layers = num_unicoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.deformable_encoder = deformable_encoder
        self.deformable_decoder = deformable_decoder
        self.two_stage_keep_all_tokens = two_stage_keep_all_tokens
        self.num_queries = num_queries  # useful for single stage model only
        self.random_refpoints_xy = random_refpoints_xy
        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
        self.ffn_extra_layernorm = ffn_extra_layernorm
        assert query_dim == 4

        self.binary_query_selection = binary_query_selection
        if self.binary_query_selection:
            # Scores each encoder token with a single logit for top-k selection.
            self.binary_query_selection_layer = nn.Linear(d_model, 1)

        if num_feature_levels > 1:
            assert deformable_encoder, "only support deformable_encoder for num_feature_levels > 1"
        if use_deformable_box_attn:
            # The original tested ``deformable_encoder`` twice; the second
            # operand is the decoder flag.
            assert deformable_encoder or deformable_decoder

        assert layer_share_type in [None, 'encoder', 'decoder', 'both']
        enc_layer_share = layer_share_type in ['encoder', 'both']
        dec_layer_share = layer_share_type in ['decoder', 'both']
        # Layer sharing is recognised above but not supported.
        assert layer_share_type is None

        self.decoder_sa_type = decoder_sa_type
        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']

        # choose encoder layer type
        if deformable_encoder:
            encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
                                                              dropout, activation,
                                                              num_feature_levels, nhead, enc_n_points,
                                                              add_channel_attention=add_channel_attention,
                                                              use_deformable_box_attn=use_deformable_box_attn,
                                                              box_attn_type=box_attn_type)
        else:
            raise NotImplementedError

        if use_text_enhancer:
            text_enhance_layer = TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead // 2,
                dim_feedforward=dim_feedforward // 2,
                dropout=text_dropout
            )
        else:
            text_enhance_layer = None

        if use_fusion_layer:
            feature_fusion_layer = BiAttentionBlock(
                v_dim=d_model,
                l_dim=d_model,
                embed_dim=dim_feedforward // 2,
                num_heads=nhead // 2,
                dropout=fusion_dropout,
                drop_path=fusion_droppath
            )
        else:
            feature_fusion_layer = None

        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        assert encoder_norm is None
        self.encoder = TransformerEncoder(
            encoder_layer, num_encoder_layers, d_model=d_model,
            num_queries=num_queries,
            enc_layer_share=enc_layer_share,
            text_enhance_layer=text_enhance_layer,
            feature_fusion_layer=feature_fusion_layer,
            use_checkpoint=use_checkpoint,
            use_transformer_ckpt=use_transformer_ckpt,
        )

        # choose decoder layer type
        if deformable_decoder:
            decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
                                                              dropout, activation,
                                                              num_feature_levels, nhead, dec_n_points,
                                                              use_text_cross_attention=use_text_cross_attention,
                                                              ffn_extra_layernorm=ffn_extra_layernorm, )
        else:
            raise NotImplementedError

        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec,
                                          d_model=d_model, query_dim=query_dim,
                                          modulate_hw_attn=modulate_hw_attn,
                                          num_feature_levels=num_feature_levels,
                                          deformable_decoder=deformable_decoder,
                                          decoder_query_perturber=decoder_query_perturber,
                                          dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,
                                          dec_layer_share=dec_layer_share,
                                          use_detached_boxes_dec_out=use_detached_boxes_dec_out
                                          )

        self.d_model = d_model
        self.nhead = nhead
        self.dec_layers = num_decoder_layers
        self.num_patterns = num_patterns
        if not isinstance(num_patterns, int):
            # The original built a ``Warning`` object and discarded it, so the
            # message was never shown; emit it for real.
            import warnings
            warnings.warn("num_patterns should be int but {}".format(type(num_patterns)))
            self.num_patterns = 0

        if num_feature_levels > 1:
            if self.num_encoder_layers > 0:
                # Uninitialised here; filled by ``_reset_parameters``.
                self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
            else:
                self.level_embed = None

        self.learnable_tgt_init = learnable_tgt_init
        assert learnable_tgt_init, "why not learnable_tgt_init"
        self.embed_init_tgt = embed_init_tgt
        if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):
            self.tgt_embed = nn.Embedding(self.num_queries, d_model)
            nn.init.normal_(self.tgt_embed.weight.data)
        else:
            self.tgt_embed = None

        # for two stage
        self.two_stage_type = two_stage_type
        self.two_stage_pat_embed = two_stage_pat_embed
        self.two_stage_add_query_num = two_stage_add_query_num
        self.two_stage_learn_wh = two_stage_learn_wh
        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
        if two_stage_type == 'standard':
            # anchor selection at the output of encoder
            self.enc_output = nn.Linear(d_model, d_model)
            self.enc_output_norm = nn.LayerNorm(d_model)

            if two_stage_pat_embed > 0:
                self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))
                nn.init.normal_(self.pat_embed_for_2stage)

            if two_stage_add_query_num > 0:
                self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)

        # Always define the attribute so ``_reset_parameters`` (which reads it
        # whenever ``two_stage_learn_wh`` is set) never hits a missing name.
        if two_stage_learn_wh:
            self.two_stage_wh_embedding = nn.Embedding(1, 2)
        else:
            self.two_stage_wh_embedding = None

        if two_stage_type == 'no':
            self.init_ref_points(num_queries)  # init self.refpoint_embed

        self.enc_out_class_embed = None
        self.enc_out_bbox_embed = None

        # evolution of anchors
        self.dec_layer_number = dec_layer_number
        if dec_layer_number is not None:
            if self.two_stage_type != 'no' or num_patterns == 0:
                assert dec_layer_number[
                           0] == num_queries, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})"
            else:
                assert dec_layer_number[
                           0] == num_queries * num_patterns, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})"

        self._reset_parameters()

        self.rm_self_attn_layers = rm_self_attn_layers
        if rm_self_attn_layers is not None:
            # assert len(rm_self_attn_layers) == num_decoder_layers
            print("Removing the self-attn in {} decoder layers".format(rm_self_attn_layers))
            for lid, dec_layer in enumerate(self.decoder.layers):
                if lid in rm_self_attn_layers:
                    dec_layer.rm_self_attn_modules()

        self.rm_detach = rm_detach
        if self.rm_detach:
            assert isinstance(rm_detach, list)
            assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])
        self.decoder.rm_detach = rm_detach

    def _reset_parameters(self):
        """Re-initialise weights.

        Every parameter with more than one dimension gets Xavier-uniform
        initialisation (this also overwrites earlier custom initialisation of
        such parameters), then ``MSDeformAttn`` modules apply their own reset,
        and finally the level embedding and the learned w/h embedding are set.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformAttn):
                m._reset_parameters()
        if self.num_feature_levels > 1 and self.level_embed is not None:
            nn.init.normal_(self.level_embed)

        if self.two_stage_learn_wh:
            # log(0.05 / 0.95): the logit (inverse sigmoid) of 0.05.
            nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))

    def get_valid_ratio(self, mask):
        """Return the un-padded fraction of each mask as ``[w_ratio, h_ratio]``.

        ``mask`` is ``[bs, H, W]`` with ``True`` marking padding. Valid rows
        are counted along the first column and valid columns along the first
        row. The result has shape ``[bs, 2]``.
        """
        _, H, W = mask.shape
        valid_H = torch.sum(~mask[:, :, 0], 1)
        valid_W = torch.sum(~mask[:, 0, :], 1)
        valid_ratio_h = valid_H.float() / H
        valid_ratio_w = valid_W.float() / W
        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
        return valid_ratio

    def init_ref_points(self, use_num_queries):
        """Create ``self.refpoint_embed``, one 4-d reference per query."""
        self.refpoint_embed = nn.Embedding(use_num_queries, 4)

        if self.random_refpoints_xy:
            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
            # NOTE(review): this sets the flag on a temporary slice view and
            # does not appear to freeze the embedding itself - confirm intent.
            self.refpoint_embed.weight.data[:, :2].requires_grad = False

    def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,
                dn_meta=None, targets=None, kpt_embed=None):
        """
        Input:
            - srcs: List of multi features [bs, ci, hi, wi]
            - masks: List of multi masks [bs, hi, wi]
            - refpoint_embed: [bs, num_dn, 4]. None in infer
            - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]
            - tgt: [bs, num_dn, d_model]. None in infer
            - text_dict: dict providing 'encoded_text', 'text_token_mask',
              'position_ids' and 'text_self_attention_masks'. Although the
              default is None, it is indexed unconditionally, so it must be
              supplied. Its 'encoded_text' entry is overwritten in place with
              the encoder's text output.

        Output:
            - hs: (n_dec, bs, nq, d_model)
            - references: sigmoid coordinates. (n_dec+1, bs, bq, 4)
            - hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None
            - ref_enc: sigmoid coordinates.
              (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None
            - init_box_proposal: initial boxes of the selected queries
        """
        # prepare input for encoder
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            bs, _, h, w = src.shape
            spatial_shapes.append((h, w))

            src = src.flatten(2).transpose(1, 2)  # bs, hw, c
            mask = mask.flatten(1)  # bs, hw
            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # bs, hw, c
            if self.num_feature_levels > 1 and self.level_embed is not None:
                lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
            else:
                lvl_pos_embed = pos_embed
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)
        src_flatten = torch.cat(src_flatten, 1)  # bs, \sum{hxw}, c
        mask_flatten = torch.cat(mask_flatten, 1)  # bs, \sum{hxw}
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)  # bs, \sum{hxw}, c
        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
        # Offset of each level inside the flattened token axis.
        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        #########################################################
        # Begin Encoder
        #########################################################
        memory, memory_text = self.encoder(
            src_flatten,
            pos=lvl_pos_embed_flatten,
            level_start_index=level_start_index,
            spatial_shapes=spatial_shapes,
            valid_ratios=valid_ratios,
            key_padding_mask=mask_flatten,
            memory_text=text_dict['encoded_text'],
            # we ~ the mask . False means use the token; True means pad the token
            text_attention_mask=~text_dict['text_token_mask'],
            position_ids=text_dict['position_ids'],
            text_self_attention_masks=text_dict['text_self_attention_masks'],
        )
        #########################################################
        # End Encoder
        # - memory: bs, \sum{hw}, c
        # - mask_flatten: bs, \sum{hw}
        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
        #########################################################
        text_dict['encoded_text'] = memory_text

        if self.two_stage_type == 'standard':
            if self.two_stage_learn_wh:
                input_hw = self.two_stage_wh_embedding.weight[0]
            else:
                input_hw = None
            output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,
                                                                           input_hw)
            output_memory = self.enc_output_norm(self.enc_output(output_memory))

            if self.two_stage_pat_embed > 0:
                bs, nhw, _ = output_memory.shape
                # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256
                output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)
                _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)
                output_memory = output_memory + _pats
                output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)

            if self.two_stage_add_query_num > 0:
                assert refpoint_embed is not None
                output_memory = torch.cat((output_memory, tgt), dim=1)
                output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)

            if self.binary_query_selection:
                topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)
            else:
                if text_dict is not None:
                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
                else:
                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)

                # Rank tokens by their best class logit.
                topk_logits = enc_outputs_class_unselected.max(-1)[0]
            enc_outputs_coord_unselected = self.enc_out_bbox_embed(
                output_memory) + output_proposals  # (bs, \sum{hw}, 4) unsigmoid
            topk = self.num_queries

            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq

            # gather boxes
            refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,
                                                   topk_proposals.unsqueeze(-1).repeat(1, 1, 4))  # unsigmoid
            refpoint_embed_ = refpoint_embed_undetach.detach()
            init_box_proposal = torch.gather(output_proposals, 1,
                                             topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()  # sigmoid

            # gather tgt
            tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
            if self.embed_init_tgt:
                # (nq, bs, d_model) transposed to (bs, nq, d_model)
                tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
            else:
                tgt_ = tgt_undetach.detach()

            if refpoint_embed is not None:
                # Caller-supplied queries come first, selected queries after.
                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
                tgt = torch.cat([tgt, tgt_], dim=1)
            else:
                refpoint_embed, tgt = refpoint_embed_, tgt_

        elif self.two_stage_type == 'no':
            # (nq, bs, d) transposed to (bs, nq, d)
            tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
            refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)

            if refpoint_embed is not None:
                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
                tgt = torch.cat([tgt, tgt_], dim=1)
            else:
                refpoint_embed, tgt = refpoint_embed_, tgt_

            if self.num_patterns > 0:
                # NOTE(review): ``self.patterns`` is not created anywhere in
                # this class - confirm it is assigned externally before
                # enabling num_patterns > 0.
                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,
                                                                             1)  # 1, n_q*n_pat, d_model
                tgt = tgt_embed + tgt_pat

            init_box_proposal = refpoint_embed_.sigmoid()

        else:
            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
        #########################################################
        # End preparing tgt
        # - tgt: bs, NQ, d_model
        # - refpoint_embed(unsigmoid): bs, NQ, 4
        #########################################################

        #########################################################
        # Begin Decoder
        #########################################################
        hs, references = self.decoder(
            tgt=tgt.transpose(0, 1),
            memory=memory.transpose(0, 1),
            memory_key_padding_mask=mask_flatten,
            pos=lvl_pos_embed_flatten.transpose(0, 1),
            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
            level_start_index=level_start_index,
            spatial_shapes=spatial_shapes,
            valid_ratios=valid_ratios, tgt_mask=attn_mask,
            tgt_mask2=attn_mask2,
            memory_text=text_dict['encoded_text'],
            # we ~ the mask . False means use the token; True means pad the token
            text_attention_mask=~text_dict['text_token_mask'],
            text_dict=text_dict,
            dn_meta=dn_meta,
            targets=targets,
            kpt_embed=kpt_embed
        )
        #########################################################
        # End Decoder
        # hs: n_dec, bs, nq, d_model
        # references: n_dec+1, bs, nq, query_dim
        #########################################################

        #########################################################
        # Begin postprocess
        #########################################################
        if self.two_stage_type == 'standard':
            if self.two_stage_keep_all_tokens:
                hs_enc = output_memory.unsqueeze(0)
                ref_enc = enc_outputs_coord_unselected.unsqueeze(0)
                init_box_proposal = output_proposals
            else:
                hs_enc = tgt_undetach.unsqueeze(0)
                ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
        else:
            hs_enc = ref_enc = None
        #########################################################
        # End postprocess
        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None
        # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None
        #########################################################

        return hs, references, hs_enc, ref_enc, init_box_proposal
523
+
524
+
525
+ class TransformerEncoder(nn.Module):
526
+
527
+ def __init__(self,
528
+ encoder_layer, num_layers, d_model=256,
529
+ num_queries=300,
530
+ enc_layer_share=False,
531
+ text_enhance_layer=None,
532
+ feature_fusion_layer=None,
533
+ use_checkpoint=False,
534
+ use_transformer_ckpt=False,
535
+ ):
536
+ """_summary_
537
+
538
+ Args:
539
+ encoder_layer (_type_): _description_
540
+ num_layers (_type_): _description_
541
+ norm (_type_, optional): _description_. Defaults to None.
542
+ d_model (int, optional): _description_. Defaults to 256.
543
+ num_queries (int, optional): _description_. Defaults to 300.
544
+ enc_layer_share (bool, optional): _description_. Defaults to False.
545
+
546
+ """
547
+ super().__init__()
548
+ # prepare layers
549
+ self.layers = []
550
+ self.text_layers = []
551
+ self.fusion_layers = []
552
+ if num_layers > 0:
553
+ self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)
554
+
555
+ if text_enhance_layer is not None:
556
+ self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share)
557
+ if feature_fusion_layer is not None:
558
+ self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share)
559
+ else:
560
+ self.layers = []
561
+ del encoder_layer
562
+
563
+ if text_enhance_layer is not None:
564
+ self.text_layers = []
565
+ del text_enhance_layer
566
+ if feature_fusion_layer is not None:
567
+ self.fusion_layers = []
568
+ del feature_fusion_layer
569
+
570
+ self.query_scale = None
571
+ self.num_queries = num_queries
572
+ self.num_layers = num_layers
573
+ self.d_model = d_model
574
+
575
+ self.use_checkpoint = use_checkpoint
576
+ self.use_transformer_ckpt = use_transformer_ckpt
577
+
578
+ @staticmethod
579
+ def get_reference_points(spatial_shapes, valid_ratios, device):
580
+ reference_points_list = []
581
+ for lvl, (H_, W_) in enumerate(spatial_shapes):
582
+ ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
583
+ torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),)
584
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
585
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
586
+ ref = torch.stack((ref_x, ref_y), -1)
587
+ reference_points_list.append(ref)
588
+ reference_points = torch.cat(reference_points_list, 1)
589
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
590
+ return reference_points
591
+
592
def forward(self,
            # for images
            src: Tensor,
            pos: Tensor,
            spatial_shapes: Tensor,
            level_start_index: Tensor,
            valid_ratios: Tensor,
            key_padding_mask: Tensor,
            # for texts
            memory_text: Optional[Tensor] = None,
            text_attention_mask: Optional[Tensor] = None,
            pos_text: Optional[Tensor] = None,
            text_self_attention_masks: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            ):
    """Run the image encoder layers, optionally interleaved with text layers.

    For every encoder layer the order is: feature fusion (if
    ``self.fusion_layers`` is non-empty), text enhancement (if
    ``self.text_layers`` is non-empty), then the image encoder layer itself.

    Args:
        src: image tokens, [bs, sum(hi*wi), 256].
        pos: positional embedding for ``src``, [bs, sum(hi*wi), 256].
        spatial_shapes: (h, w) of each level, [num_level, 2].
        level_start_index: [num_level], start offset of each level in
            sum(hi*wi).
        valid_ratios: [bs, num_level, 2].
        key_padding_mask: [bs, sum(hi*wi)].
        memory_text: text tokens, [bs, n_text, 256]. Required when text
            layers are configured.
        text_attention_mask: [bs, n_text]; False for no padding, True for
            padding.
        pos_text: positional embedding for the text, [bs, n_text, 256].
            Generated from token order when both ``pos_text`` and
            ``position_ids`` are None.
        text_self_attention_masks: mask for text self-attention; it is
            inverted before being handed to the text layer. ``None`` means
            no mask is passed.
        position_ids: [bs, n_text]; when given, ``pos_text`` is rebuilt from
            these ids.

    Returns:
        Tuple ``(output, memory_text)``; ``output`` is [bs, sum(hi*wi), 256].
    """
    output = src

    # Reference points are consumed only by the image layers in the loop.
    reference_points = None
    if self.num_layers > 0:
        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)

    if self.text_layers:
        # Build the positional embedding for the text tokens.
        bs, n_text, _ = memory_text.shape
        if pos_text is None and position_ids is None:
            token_order = torch.arange(n_text, device=memory_text.device).float()
            token_order = token_order.unsqueeze(0).unsqueeze(-1).repeat(bs, 1, 1)
            pos_text = get_sine_pos_embed(token_order, num_pos_feats=256, exchange_xy=False)
        if position_ids is not None:
            pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)

    # `text_self_attention_masks` defaults to None; inverting None would
    # raise TypeError, so only invert a real mask (done once, not per layer).
    text_src_mask = None
    if text_self_attention_masks is not None:
        text_src_mask = ~text_self_attention_masks  # note we use ~ for mask here

    for layer_id, layer in enumerate(self.layers):
        if self.fusion_layers:
            if self.use_checkpoint:
                output, memory_text = checkpoint.checkpoint(
                    self.fusion_layers[layer_id],
                    output,
                    memory_text,
                    key_padding_mask,
                    text_attention_mask
                )
            else:
                output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,
                                                                   attention_mask_v=key_padding_mask,
                                                                   attention_mask_l=text_attention_mask)

        if self.text_layers:
            # The text layer is fed sequence-first tensors, hence the
            # transposes around the call.
            memory_text = self.text_layers[layer_id](
                src=memory_text.transpose(0, 1),
                src_mask=text_src_mask,
                src_key_padding_mask=text_attention_mask,
                pos=(pos_text.transpose(0, 1) if pos_text is not None else None)
            ).transpose(0, 1)

        # image encoder layer
        if self.use_transformer_ckpt:
            output = checkpoint.checkpoint(
                layer,
                output,
                pos,
                reference_points,
                spatial_shapes,
                level_start_index,
                key_padding_mask
            )
        else:
            output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,
                           level_start_index=level_start_index, key_padding_mask=key_padding_mask)

    return output, memory_text
688
+
689
+
690
class TransformerDecoder(nn.Module):
    """Decoder stack with iterative refinement of box and keypoint references.

    The first ``num_box_decoder_layers`` layers refine box queries only. After
    the last of them, the top-scoring box queries are kept and each one is
    expanded into a group of one box query plus ``num_body_points`` keypoint
    queries; the remaining layers refine boxes and keypoints together.

    ``bbox_embed``, ``class_embed``, ``pose_embed`` and ``pose_hw_embed`` are
    initialised to ``None`` here and are indexed in ``forward``; presumably
    the owning model assigns them after construction -- verify against caller.
    """

    # Number of box queries kept when queries are expanded into box+keypoint
    # groups. ``kpt_index`` is built from the same value, so the two must agree.
    _INTER_SELECT_NUMBER = 50

    def __init__(self, decoder_layer, num_layers, norm=None,
                 return_intermediate=False,
                 d_model=256, query_dim=4,
                 modulate_hw_attn=False,
                 num_feature_levels=1,
                 deformable_decoder=False,
                 decoder_query_perturber=None,
                 dec_layer_number=None,  # number of queries each layer in decoder
                 rm_dec_query_scale=False,
                 dec_layer_share=False,
                 dec_layer_dropout_prob=None,
                 use_detached_boxes_dec_out=False,
                 num_box_decoder_layers=2,
                 num_body_points=68,
                 ):
        """Create the decoder.

        Args:
            decoder_layer: layer module cloned ``num_layers`` times.
            num_layers: number of decoder layers.
            norm: normalisation applied to each layer output before it is
                stored; ``None`` stores the raw output.
            return_intermediate: must be truthy (asserted).
            d_model: hidden size.
            query_dim: reference point dimensionality, 2 or 4 (asserted).
            modulate_hw_attn: with a non-deformable decoder, adds
                ``ref_anchor_head``.
            num_feature_levels: stored on the instance.
            deformable_decoder: when False, ``query_pos_sine_scale`` is built.
            decoder_query_perturber: stored on the instance.
            dec_layer_number: optional list with one entry per layer.
            rm_dec_query_scale: must be truthy; the alternative is not
                implemented.
            dec_layer_share: share one layer instance across all positions.
            dec_layer_dropout_prob: optional list of per-layer probabilities
                in [0, 1].
            use_detached_boxes_dec_out: return detached reference points.
            num_box_decoder_layers: number of leading box-only layers.
            num_body_points: keypoints per instance.

        Raises:
            NotImplementedError: if ``rm_dec_query_scale`` is falsy.
        """
        super().__init__()
        if num_layers > 0:
            self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
        else:
            self.layers = []
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate
        assert return_intermediate, "support return_intermediate only"
        self.query_dim = query_dim
        assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
        self.num_feature_levels = num_feature_levels
        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out

        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
        if not deformable_decoder:
            self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
        else:
            self.query_pos_sine_scale = None

        if not rm_dec_query_scale:
            # The query-scale MLP path was never enabled; fail loudly.
            raise NotImplementedError("query_scale is not supported; pass rm_dec_query_scale=True")
        self.query_scale = None

        self.bbox_embed = None
        self.class_embed = None
        self.pose_embed = None
        self.pose_hw_embed = None
        self.d_model = d_model
        self.modulate_hw_attn = modulate_hw_attn
        self.deformable_decoder = deformable_decoder

        if not deformable_decoder and modulate_hw_attn:
            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
        else:
            self.ref_anchor_head = None

        self.decoder_query_perturber = decoder_query_perturber
        self.box_pred_damping = None

        self.dec_layer_number = dec_layer_number
        if dec_layer_number is not None:
            assert isinstance(dec_layer_number, list)
            assert len(dec_layer_number) == num_layers

        self.dec_layer_dropout_prob = dec_layer_dropout_prob
        if dec_layer_dropout_prob is not None:
            assert isinstance(dec_layer_dropout_prob, list)
            assert len(dec_layer_dropout_prob) == num_layers
            for i in dec_layer_dropout_prob:
                assert 0.0 <= i <= 1.0

        self.rm_detach = None
        self.num_body_points = num_body_points

        # ``hw`` has 17 rows; ``hw_append`` supplies the remaining
        # ``num_body_points - 17`` rows so that their concatenation (see
        # forward) has one (w, h) scale row per body point.
        self.hw = nn.Embedding(17, 2)
        self.num_box_decoder_layers = num_box_decoder_layers
        # Positions of keypoint queries inside the flattened groups of
        # (1 box + num_body_points keypoints): everything except each
        # group's first slot.
        group_size = self.num_body_points + 1
        self.kpt_index = [x for x in range(self._INTER_SELECT_NUMBER * group_size) if x % group_size != 0]
        self.hw_append = nn.Embedding(self.num_body_points - 17, 2)

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                tgt_mask2: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2
                # for memory
                level_start_index: Optional[Tensor] = None,  # num_levels
                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                valid_ratios: Optional[Tensor] = None,
                # for text
                memory_text: Optional[Tensor] = None,
                text_attention_mask: Optional[Tensor] = None,
                text_dict: Optional[Tensor] = None,
                dn_meta: Optional[Tensor] = None,
                targets: Optional[Tensor] = None,
                kpt_embed: Optional[Tensor] = None
                ):
        """Decode the queries and refine their reference points layer by layer.

        Args:
            tgt: nq, bs, d_model. Not modified by this call.
            memory: hw, bs, d_model.
            tgt_mask: self-attention mask for the box-only layers.
            tgt_mask2: self-attention mask used once the queries have been
                expanded into box+keypoint groups.
            memory_mask: forwarded to each layer as ``cross_attn_mask``.
            tgt_key_padding_mask / memory_key_padding_mask: forwarded to
                each layer.
            pos: hw, bs, d_model.
            refpoints_unsigmoid: nq, bs, 2/4.
            level_start_index: num_levels.
            spatial_shapes / valid_ratios: bs, nlevel, 2.
            memory_text / text_attention_mask: forwarded to each layer.
            text_dict: passed to ``class_embed`` when proposals are ranked.
            dn_meta: mapping with key ``'pad_size'``; read only in training
                mode. ``None`` is treated as no denoising queries.
            targets: unused here.
            kpt_embed: keypoint embeddings used to initialise keypoint
                queries -- layout presumably (bs, num_body_points, d_model);
                TODO confirm.

        Returns:
            ``[outputs, ref_points]``: a list with one (batch-first) output
            per layer, and a list with the initial reference points followed
            by one refined set per layer.
        """
        # Zero-weighted use of ``hw``: values are unchanged. Presumably it
        # keeps ``hw`` in the autograd graph on every call -- TODO confirm.
        # Done out of place so the caller's ``tgt`` is never mutated (an
        # in-place add also fails on a leaf tensor that requires grad).
        output = tgt + self.hw.weight[0, 0] * 0.0

        intermediate = []
        reference_points = refpoints_unsigmoid.sigmoid()
        ref_points = [reference_points]
        # Number of leading denoising queries (training only).
        effect_num_dn = dn_meta['pad_size'] if (self.training and dn_meta is not None) else 0
        kpt_index = None  # index tensor built lazily, once per call
        for layer_id, layer in enumerate(self.layers):

            if reference_points.shape[-1] == 4:
                reference_points_input = reference_points[:, :, None] \
                                         * torch.cat([valid_ratios, valid_ratios], -1)[None, :]  # nq, bs, nlevel, 4
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
            query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # nq, bs, 256*2

            # conditional query
            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256
            pos_scale = self.query_scale(output) if self.query_scale is not None else 1
            query_pos = pos_scale * raw_query_pos

            # main process
            output = layer(
                tgt=output,
                tgt_query_pos=query_pos,
                tgt_query_sine_embed=query_sine_embed,
                tgt_key_padding_mask=tgt_key_padding_mask,
                tgt_reference_points=reference_points_input,

                memory_text=memory_text,
                text_attention_mask=text_attention_mask,

                memory=memory,
                memory_key_padding_mask=memory_key_padding_mask,
                memory_level_start_index=level_start_index,
                memory_spatial_shapes=spatial_shapes,
                memory_pos=pos,

                self_attn_mask=tgt_mask,
                cross_attn_mask=memory_mask
            )
            # Best-effort diagnostics for numerical blow-ups; never fatal.
            if output.isnan().any() | output.isinf().any():
                print(f"output layer_id {layer_id} is nan")
                try:
                    num_nan = output.isnan().sum().item()
                    num_inf = output.isinf().sum().item()
                    print(f"num_nan {num_nan}, num_inf {num_inf}")
                except Exception as e:
                    print(e)

            intermediate.append(self.norm(output) if self.norm is not None else output)

            # iter update (box-only layers)
            if layer_id < self.num_box_decoder_layers:
                reference_before_sigmoid = inverse_sigmoid(reference_points)
                delta_unsig = self.bbox_embed[layer_id](output)
                outputs_unsig = delta_unsig + reference_before_sigmoid
                new_reference_points = outputs_unsig.sigmoid()

            # After the last box-only layer: keep the best box queries and
            # expand each into one box query plus keypoint queries.
            if layer_id == self.num_box_decoder_layers - 1:
                dn_output = output[:effect_num_dn]
                dn_new_reference_points = new_reference_points[:effect_num_dn]
                class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,
                                   effect_num_dn:].transpose(0, 1)
                topk_proposals = torch.topk(class_unselected.max(-1)[0], self._INTER_SELECT_NUMBER, dim=0)[1]
                new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,
                                                            topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
                new_output_for_box = torch.gather(output[effect_num_dn:], 0,
                                                  topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
                keypoint_embed = kpt_embed.transpose(0, 1)

                new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0], 1, 1, 1)
                delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]
                keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()
                num_queries, _, bs, _ = keypoint_xy.shape
                # One (w, h) scale row per body point: 17 rows + the rest.
                hw_weight = torch.cat((self.hw.weight, self.hw_append.weight), dim=0)
                keypoint_wh_weight = hw_weight.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()
                keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]
                new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)
                new_reference_points = torch.cat(
                    (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)
                output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)
                new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)
                output = torch.cat((dn_output, output), dim=0)
                # The query layout changed, so switch to the matching mask.
                tgt_mask = tgt_mask2

            # iter update (joint box + keypoint layers)
            if layer_id >= self.num_box_decoder_layers:
                if kpt_index is None:
                    kpt_index = torch.tensor(self.kpt_index, device=output.device)
                group_size = self.num_body_points + 1
                reference_before_sigmoid = inverse_sigmoid(reference_points)
                output_bbox_dn = output[:effect_num_dn]
                # Every group's first slot is its box query.
                output_bbox_norm = output[effect_num_dn:][0::group_size]
                reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]
                reference_before_sigmoid_bbox_norm = reference_before_sigmoid[effect_num_dn:][0::group_size]
                delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)
                delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)
                outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn
                outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm
                new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()
                new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()
                output_kpt = output[effect_num_dn:].index_select(0, kpt_index)
                delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)
                # clone() so the in-place updates below do not write into
                # ``reference_before_sigmoid``.
                outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, kpt_index).clone()
                delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)
                outputs_unsig[..., :2] += delta_xy_unsig[..., :2]
                outputs_unsig[..., 2:] += delta_hw_unsig
                new_reference_points_for_keypoint = outputs_unsig.sigmoid()
                bs = new_reference_points_for_box_norm.shape[1]
                new_reference_points_norm = torch.cat(
                    (new_reference_points_for_box_norm.unsqueeze(1),
                     new_reference_points_for_keypoint.view(-1, self.num_body_points, bs, 4)),
                    dim=1).flatten(0, 1)
                new_reference_points = torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)

            if self.rm_detach and 'dec' in self.rm_detach:
                reference_points = new_reference_points
            else:
                reference_points = new_reference_points.detach()

            if self.use_detached_boxes_dec_out:
                ref_points.append(reference_points)
            else:
                ref_points.append(new_reference_points)

        return [
            [itm_out.transpose(0, 1) for itm_out in intermediate],
            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]
        ]
936
+
937
+
938
class DeformableTransformerEncoderLayer(nn.Module):
    """Encoder layer: multi-scale deformable self-attention followed by an FFN.

    Both sub-blocks use a residual connection followed by LayerNorm. An
    optional channel-attention stage can be appended.
    """

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 add_channel_attention=False,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 ):
        """Build the sub-modules.

        ``use_deformable_box_attn`` and ``box_attn_type`` are accepted but
        not used by this implementation.
        """
        super().__init__()

        # --- attention sub-block ---
        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # --- feed-forward sub-block ---
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # --- optional channel attention ---
        self.add_channel_attention = add_channel_attention
        if add_channel_attention:
            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
            self.norm_channel = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply the feed-forward block with residual connection and norm."""
        hidden = self.linear1(src)
        hidden = self.activation(hidden)
        hidden = self.dropout2(hidden)
        hidden = self.linear2(hidden)
        return self.norm2(src + self.dropout3(hidden))

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
        """Run attention, FFN and (optionally) channel attention on ``src``.

        The positional embedding is added to the attention query only; the
        un-embedded ``src`` is what gets passed as the attended features.
        """
        query = self.with_pos_embed(src, pos)
        attended = self.self_attn(query, reference_points, src, spatial_shapes, level_start_index,
                                  key_padding_mask)
        src = self.norm1(src + self.dropout1(attended))

        src = self.forward_ffn(src)

        if self.add_channel_attention:
            src = self.norm_channel(src + self.activ_channel(src))

        return src
994
+
995
+
996
class DeformableTransformerDecoderLayer(nn.Module):
    """Decoder layer: self-attention, optional text cross-attention,
    multi-scale deformable cross-attention over the image memory, then an FFN.

    Every sub-block uses a residual connection followed by LayerNorm.
    """

    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 use_text_feat_guide=False,
                 use_text_cross_attention=False,
                 ffn_extra_layernorm=False
                 ):
        """Build the sub-modules.

        Args:
            d_model: hidden size.
            d_ffn: inner size of the feed-forward block.
            dropout: dropout probability; ``nn.Identity`` is used when it is
                not positive.
            activation: activation name passed to ``_get_activation_fn``.
            n_levels / n_heads / n_points: deformable attention configuration.
            use_text_feat_guide: must be falsy (asserted).
            use_text_cross_attention: add a cross-attention block over the
                text memory.
            ffn_extra_layernorm: not implemented; must be falsy.

        Raises:
            NotImplementedError: if ``ffn_extra_layernorm`` is truthy.
        """
        super().__init__()

        # cross attention over the image memory
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm1 = nn.LayerNorm(d_model)

        # cross attention over the text memory
        if use_text_cross_attention:
            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
            self.catext_norm = nn.LayerNorm(d_model)

        # self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm2 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm3 = nn.LayerNorm(d_model)
        if ffn_extra_layernorm:
            raise NotImplementedError('ffn_extra_layernorm not implemented')
        self.norm_ext = None

        self.key_aware_proj = None
        self.use_text_feat_guide = use_text_feat_guide
        assert not use_text_feat_guide
        self.use_text_cross_attention = use_text_cross_attention

    def rm_self_attn_modules(self):
        """Drop the self-attention block; ``forward`` then skips it."""
        self.self_attn = None
        self.dropout2 = None
        self.norm2 = None

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt, ipdb_flag=False):
        """Apply the feed-forward block with residual connection and norm.

        The two linear layers run with CUDA autocast disabled. ``ipdb_flag``
        is unused and kept only for signature compatibility.
        """
        with torch.cuda.amp.autocast(enabled=False):
            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))

        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward(self,
                # for tgt
                tgt: Optional[Tensor],  # nq, bs, d_model
                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                tgt_key_padding_mask: Optional[Tensor] = None,
                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                memory_text: Optional[Tensor] = None,  # bs, num_token, d_model
                text_attention_mask: Optional[Tensor] = None,  # bs, num_token

                # for memory
                memory: Optional[Tensor] = None,  # hw, bs, d_model
                memory_key_padding_mask: Optional[Tensor] = None,
                memory_level_start_index: Optional[Tensor] = None,  # num_levels
                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                memory_pos: Optional[Tensor] = None,  # pos for memory

                # sa
                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                ):
        """Run one decoder layer.

        Args:
            tgt / tgt_query_pos: nq, bs, d_model.
            tgt_reference_points: reference points for the deformable
                cross-attention.
            memory_text / text_attention_mask: text memory and its padding
                mask; read only when text cross-attention is enabled.
            memory: image memory, hw, bs, d_model.
            memory_key_padding_mask / memory_level_start_index /
                memory_spatial_shapes: forwarded to the deformable attention.
            self_attn_mask: attention mask for the self-attention block.
            cross_attn_mask: must be None (asserted).

        ``tgt_query_sine_embed``, ``tgt_key_padding_mask`` and ``memory_pos``
        are accepted but not used by this layer.

        Returns:
            Updated queries with the same layout as ``tgt``.
        """
        assert cross_attn_mask is None

        # self attention (skipped after rm_self_attn_modules())
        if self.self_attn is not None:
            q = k = self.with_pos_embed(tgt, tgt_query_pos)
            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
            tgt = tgt + self.dropout2(tgt2)
            tgt = self.norm2(tgt)

        # cross attention over text; memory_text is batch-first, so it is
        # transposed to sequence-first for nn.MultiheadAttention.
        if self.use_text_cross_attention:
            tgt2 = self.ca_text(self.with_pos_embed(tgt, tgt_query_pos), memory_text.transpose(0, 1),
                                memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]
            tgt = tgt + self.catext_dropout(tgt2)
            tgt = self.catext_norm(tgt)

        # deformable cross attention over the image memory; the attention
        # module is fed batch-first tensors, hence the transposes.
        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
                               tgt_reference_points.transpose(0, 1).contiguous(),
                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
                               memory_key_padding_mask).transpose(0, 1)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # ffn
        tgt = self.forward_ffn(tgt)

        return tgt
1133
+
1134
+
1135
def _get_clones(module, N, layer_share=False):
    """Return an ``nn.ModuleList`` holding ``N`` entries built from ``module``.

    With ``layer_share`` every entry is the very same module object (shared
    weights); otherwise each entry is an independent deep copy.
    """
    entries = []
    for _ in range(N):
        entries.append(module if layer_share else copy.deepcopy(module))
    return nn.ModuleList(entries)
1141
+
1142
+
1143
def build_deformable_transformer(args):
    """Construct a ``DeformableTransformer`` from a configuration object.

    Args:
        args: object exposing the hyper-parameters as attributes. The
            attributes ``use_detached_boxes_dec_out``,
            ``binary_query_selection`` and ``ffn_extra_layernorm`` are
            optional and default to ``False``; all others read below are
            required.

    Returns:
        The configured ``DeformableTransformer`` instance.
    """

    def _optional(name, default=False):
        # Only a missing attribute/key falls back to the default; any other
        # error raised by the config object is allowed to propagate instead
        # of being silently swallowed.
        try:
            return getattr(args, name)
        except (AttributeError, KeyError):
            return default

    decoder_query_perturber = None
    if args.decoder_layer_noise:
        from .utils import RandomBoxPerturber
        decoder_query_perturber = RandomBoxPerturber(
            x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,
            w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)

    use_detached_boxes_dec_out = _optional('use_detached_boxes_dec_out')
    binary_query_selection = _optional('binary_query_selection')

    # A private sentinel distinguishes "missing" from an explicit falsy value
    # so the notice is printed only when the attribute is really absent.
    missing = object()
    ffn_extra_layernorm = _optional('ffn_extra_layernorm', missing)
    if ffn_extra_layernorm is missing:
        print('ffn_extra_layernorm not found, set to False')
        ffn_extra_layernorm = False

    return DeformableTransformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        num_queries=args.num_queries,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_unicoder_layers=args.unic_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
        query_dim=args.query_dim,
        activation=args.transformer_activation,
        num_patterns=args.num_patterns,
        modulate_hw_attn=True,

        deformable_encoder=True,
        deformable_decoder=True,
        num_feature_levels=args.num_feature_levels,
        enc_n_points=args.enc_n_points,
        dec_n_points=args.dec_n_points,
        use_deformable_box_attn=args.use_deformable_box_attn,
        box_attn_type=args.box_attn_type,

        learnable_tgt_init=True,
        decoder_query_perturber=decoder_query_perturber,

        add_channel_attention=args.add_channel_attention,
        add_pos_value=args.add_pos_value,
        random_refpoints_xy=args.random_refpoints_xy,

        # two stage
        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']
        two_stage_pat_embed=args.two_stage_pat_embed,
        two_stage_add_query_num=args.two_stage_add_query_num,
        two_stage_learn_wh=args.two_stage_learn_wh,
        two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,
        dec_layer_number=args.dec_layer_number,
        rm_self_attn_layers=None,
        key_aware_type=None,
        layer_share_type=None,

        rm_detach=None,
        decoder_sa_type=args.decoder_sa_type,
        module_seq=args.decoder_module_seq,

        embed_init_tgt=args.embed_init_tgt,
        use_detached_boxes_dec_out=use_detached_boxes_dec_out,
        use_text_enhancer=args.use_text_enhancer,
        use_fusion_layer=args.use_fusion_layer,
        use_checkpoint=args.use_checkpoint,
        use_transformer_ckpt=args.use_transformer_ckpt,
        use_text_cross_attention=args.use_text_cross_attention,

        text_dropout=args.text_dropout,
        fusion_dropout=args.fusion_dropout,
        fusion_droppath=args.fusion_droppath,

        binary_query_selection=binary_query_selection,
        ffn_extra_layernorm=ffn_extra_layernorm,
    )
difpoint/src/models/XPose/models/UniPose/fuse_modules.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ # from timm.models.layers import DropPath
6
+ from src.models.util import DropPath
7
+
8
+
9
class FeatureResizer(nn.Module):
    """Project embeddings from dimension C1 to dimension C2.

    The pipeline is a linear layer, an optional LayerNorm and dropout.
    """

    def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
        """Create the projection.

        Args:
            input_feat_size: incoming feature dimension (C1).
            output_feat_size: outgoing feature dimension (C2).
            dropout: dropout probability applied to the result.
            do_ln: apply LayerNorm after the linear layer. The LayerNorm
                module is created either way.
        """
        super().__init__()
        self.do_ln = do_ln
        # Linear projection C1 -> C2.
        self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)
        self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_features):
        """Return the resized (and optionally normalised) features."""
        projected = self.fc(encoder_features)
        normalised = self.layer_norm(projected) if self.do_ln else projected
        return self.dropout(normalised)
29
+
30
+
31
def l1norm(X, dim, eps=1e-8):
    """Divide X by its L1 norm taken along ``dim``.

    ``eps`` is added to the norm so an all-zero slice does not divide by zero.
    """
    denom = X.abs().sum(dim=dim, keepdim=True) + eps
    return X / denom
37
+
38
+
39
def l2norm(X, dim, eps=1e-8):
    """Divide X by its L2 (Euclidean) norm taken along ``dim``.

    ``eps`` is added to the norm so an all-zero slice does not divide by zero.
    """
    denom = X.pow(2).sum(dim=dim, keepdim=True).sqrt() + eps
    return X / denom
45
+
46
+
47
def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
    """Attend from each query position over the context positions.

    Args:
        query: (n_context, queryL, d)
        context: (n_context, sourceL, d)
        smooth: factor applied to the normalised scores before the final
            softmax over the context positions.
        raw_feature_norm: normalisation applied to the raw scores across the
            query axis: ``"softmax"``, ``"l2norm"`` or ``"clipped_l2norm"``
            (LeakyReLU(0.1) followed by l2norm).
        eps: unused; kept for signature compatibility.

    Returns:
        Tuple ``(weightedContext, attnT)`` with shapes
        (batch, queryL, d) and (batch, sourceL, queryL).

    Raises:
        ValueError: if ``raw_feature_norm`` is not one of the known modes.
    """
    # (batch, sourceL, d)(batch, d, queryL) --> (batch, sourceL, queryL)
    attn = torch.bmm(context, torch.transpose(query, 1, 2))

    # First normalisation: across the query axis. The softmax dimension is
    # given explicitly; relying on the implicit dim is deprecated and warns.
    if raw_feature_norm == "softmax":
        attn = attn.softmax(dim=2)
    elif raw_feature_norm == "l2norm":
        attn = l2norm(attn, 2)
    elif raw_feature_norm == "clipped_l2norm":
        attn = nn.LeakyReLU(0.1)(attn)
        attn = l2norm(attn, 2)
    else:
        raise ValueError(f"unknown first norm type: {raw_feature_norm}")

    # --> (batch, queryL, sourceL)
    attn = torch.transpose(attn, 1, 2).contiguous()
    # Second normalisation: across the context axis, sharpened by `smooth`.
    attn = (attn * smooth).softmax(dim=2)
    # --> (batch, sourceL, queryL)
    attnT = torch.transpose(attn, 1, 2).contiguous()

    # (batch, d, sourceL)(batch, sourceL, queryL) --> (batch, d, queryL)
    weightedContext = torch.bmm(torch.transpose(context, 1, 2), attnT)
    # --> (batch, queryL, d)
    weightedContext = torch.transpose(weightedContext, 1, 2)

    return weightedContext, attnT
94
+
95
+
96
+ class BiMultiHeadAttention(nn.Module):
97
+ def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
98
+ super(BiMultiHeadAttention, self).__init__()
99
+
100
+ self.embed_dim = embed_dim
101
+ self.num_heads = num_heads
102
+ self.head_dim = embed_dim // num_heads
103
+ self.v_dim = v_dim
104
+ self.l_dim = l_dim
105
+
106
+ assert (
107
+ self.head_dim * self.num_heads == self.embed_dim
108
+ ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
109
+ self.scale = self.head_dim ** (-0.5)
110
+ self.dropout = dropout
111
+
112
+ self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
113
+ self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
114
+ self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
115
+ self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
116
+
117
+ self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
118
+ self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
119
+
120
+ self.stable_softmax_2d = True
121
+ self.clamp_min_for_underflow = True
122
+ self.clamp_max_for_overflow = True
123
+
124
+ self._reset_parameters()
125
+
126
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
127
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
128
+
129
+ def _reset_parameters(self):
130
+ nn.init.xavier_uniform_(self.v_proj.weight)
131
+ self.v_proj.bias.data.fill_(0)
132
+ nn.init.xavier_uniform_(self.l_proj.weight)
133
+ self.l_proj.bias.data.fill_(0)
134
+ nn.init.xavier_uniform_(self.values_v_proj.weight)
135
+ self.values_v_proj.bias.data.fill_(0)
136
+ nn.init.xavier_uniform_(self.values_l_proj.weight)
137
+ self.values_l_proj.bias.data.fill_(0)
138
+ nn.init.xavier_uniform_(self.out_v_proj.weight)
139
+ self.out_v_proj.bias.data.fill_(0)
140
+ nn.init.xavier_uniform_(self.out_l_proj.weight)
141
+ self.out_l_proj.bias.data.fill_(0)
142
+
143
def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
    """Run bidirectional cross-attention between vision and language tokens.

    One similarity matrix is computed from the projected vision tokens
    (queries) and the projected language tokens (keys). It is soft-maxed
    over the text axis to update the vision tokens and, transposed, over
    the image axis to update the language tokens.

    Args:
        v: vision features, (bs, n_img, dim).
        l: language features, (bs, n_text, dim).
        attention_mask_v: optional (bs, n_img) mask; positions where it is
            True are set to -inf before the language-side softmax.
            Presumably a bool tensor, as required by ``masked_fill_``.
        attention_mask_l: optional (bs, n_text) mask; positions where it is
            True are set to -inf before the vision-side softmax.

    Returns:
        Tuple ``(attn_output_v, attn_output_l)``: the outputs of
        ``out_v_proj`` and ``out_l_proj`` for the vision and language
        streams respectively.

    Raises:
        ValueError: if an intermediate tensor does not have the expected size.
    """
    batch, n_img, _ = v.size()
    heads, head_dim = self.num_heads, self.head_dim
    flat_shape = (batch * heads, -1, head_dim)

    # Project both modalities, then fold the heads into the batch axis.
    q = self.v_proj(v) * self.scale
    k = self._shape(self.l_proj(l), -1, batch)
    val_v = self._shape(self.values_v_proj(v), -1, batch)
    val_l = self._shape(self.values_l_proj(l), -1, batch)

    q = self._shape(q, n_img, batch).view(*flat_shape)
    k = k.view(*flat_shape)
    val_v = val_v.view(*flat_shape)
    val_l = val_l.view(*flat_shape)

    n_txt = k.size(1)
    scores = torch.bmm(q, k.transpose(1, 2))  # bs*nhead, nimg, ntxt

    expected = (batch * heads, n_img, n_txt)
    if scores.size() != expected:
        raise ValueError(
            f"Attention weights should be of size {expected}, but is {scores.size()}"
        )

    def _clamp(t):
        # Do not widen the +/-50000 bounds: data type half has quite limited range.
        if self.clamp_min_for_underflow:
            t = torch.clamp(t, min=-50000)
        if self.clamp_max_for_overflow:
            t = torch.clamp(t, max=50000)
        return t

    def _per_head(mask):
        # (bs, n) -> (bs * heads, 1, n) so the mask broadcasts over the query axis.
        return mask[:, None, None, :].repeat(1, heads, 1, 1).flatten(0, 1)

    if self.stable_softmax_2d:
        scores = scores - scores.max()
    scores = _clamp(scores)

    # Language side: transpose, then subtract the per-row maximum before clamping.
    scores_t = scores.transpose(1, 2)
    scores_l = _clamp(scores_t - torch.max(scores_t, dim=-1, keepdim=True)[0])

    # mask vision for language
    if attention_mask_v is not None:
        scores_l.masked_fill_(_per_head(attention_mask_v), float('-inf'))
    probs_l = scores_l.softmax(dim=-1)

    # mask language for vision
    if attention_mask_l is not None:
        scores.masked_fill_(_per_head(attention_mask_l), float('-inf'))
    probs_v = scores.softmax(dim=-1)

    # Dropout order (vision first, then language) is kept so the RNG stream is unchanged.
    probs_v = F.dropout(probs_v, p=self.dropout, training=self.training)
    probs_l = F.dropout(probs_l, p=self.dropout, training=self.training)

    ctx_v = torch.bmm(probs_v, val_l)
    ctx_l = torch.bmm(probs_l, val_v)

    if ctx_v.size() != (batch * heads, n_img, head_dim):
        raise ValueError(
            f"`attn_output_v` should be of size {(batch, heads, n_img, head_dim)}, but is {ctx_v.size()}"
        )

    if ctx_l.size() != (batch * heads, n_txt, head_dim):
        raise ValueError(
            f"`attn_output_l` should be of size {(batch, heads, n_txt, head_dim)}, but is {ctx_l.size()}"
        )

    # Unfold the heads and merge them back into the embedding axis.
    ctx_v = ctx_v.view(batch, heads, n_img, head_dim).transpose(1, 2)
    ctx_v = ctx_v.reshape(batch, n_img, self.embed_dim)

    ctx_l = ctx_l.view(batch, heads, n_txt, head_dim).transpose(1, 2)
    ctx_l = ctx_l.reshape(batch, n_txt, self.embed_dim)

    return self.out_v_proj(ctx_v), self.out_l_proj(ctx_l)
239
+
240
+
241
+ # Bi-Direction MHA (text->image, image->text)
242
class BiAttentionBlock(nn.Module):
    """Pre-norm bidirectional attention block with per-channel residual scaling.

    Both streams are layer-normalised, passed through ``BiMultiHeadAttention``
    and the resulting updates are scaled by fixed ``gamma`` vectors (created
    with ``requires_grad=False``) before being added back.
    """

    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
                 drop_path=.0, init_values=1e-4, cfg=None):
        """
        Inputs:
            v_dim - Channel size of the vision tokens (LayerNorm and gamma_v size)
            l_dim - Channel size of the language tokens (LayerNorm and gamma_l size)
            embed_dim - Forwarded to BiMultiHeadAttention as ``embed_dim``
            num_heads - Number of heads to use in the Multi-Head Attention block
            dropout - Forwarded to BiMultiHeadAttention as ``dropout``
            drop_path - DropPath argument; an Identity is used unless it is > 0
            init_values - Initial value of every entry of gamma_v / gamma_l
            cfg - Accepted but not used by this block
        """
        super().__init__()

        # pre layer norm
        self.layer_norm_v = nn.LayerNorm(v_dim)
        self.layer_norm_l = nn.LayerNorm(l_dim)
        self.attn = BiMultiHeadAttention(
            v_dim=v_dim,
            l_dim=l_dim,
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout,
        )

        # add layer scale for training stability
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.gamma_v = nn.Parameter(init_values * torch.ones(v_dim), requires_grad=False)
        self.gamma_l = nn.Parameter(init_values * torch.ones(l_dim), requires_grad=False)

    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
        """Fuse the two streams and return the updated ``(v, l)`` pair.

        Note that the residual is added to the layer-normalised inputs, not
        to the raw ``v`` / ``l`` that were passed in.
        """
        normed_v = self.layer_norm_v(v)
        normed_l = self.layer_norm_l(l)
        delta_v, delta_l = self.attn(
            normed_v,
            normed_l,
            attention_mask_v=attention_mask_v,
            attention_mask_l=attention_mask_l,
        )
        fused_v = normed_v + self.drop_path(self.gamma_v * delta_v)
        fused_l = normed_l + self.drop_path(self.gamma_l * delta_l)
        return fused_v, fused_l
difpoint/src/models/XPose/models/UniPose/mask_generate.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
def prepare_for_mask(kpt_mask, num_heads=8, num_group=50, group_bbox_kpt=69):
    """Build the per-head self-attention mask for grouped box/keypoint queries.

    Queries are laid out as ``num_group`` consecutive groups of
    ``group_bbox_kpt`` entries. In the returned mask ``True`` means "blocked":

    * every query is blocked from all queries of other groups;
    * inside each diagonal block, position ``(i, j)`` is blocked exactly when
      ``kpt_mask[b, i] != kpt_mask[b, j]``.

    Args:
        kpt_mask: tensor of shape (bs, length). The diagonal blocks are laid
            out with stride ``length``, so ``length`` is presumably expected
            to equal ``group_bbox_kpt`` -- TODO confirm against the caller.
        num_heads: number of attention heads the mask is replicated for.
        num_group: number of query groups.
        group_bbox_kpt: number of queries per group.

    Returns:
        A 5-tuple ``(input_query_label, input_query_bbox, attn_mask,
        attn_mask2, dn_meta)`` in which every entry except ``attn_mask2`` is
        ``None``; ``attn_mask2`` is a bool tensor of shape
        ``(bs * num_heads, num_group * group_bbox_kpt, num_group * group_bbox_kpt)``.
    """
    bs, length = kpt_mask.shape

    # The original always allocated on 'cuda'. Keep that whenever CUDA exists,
    # but follow the input's own CUDA device and fall back to the input's
    # device on CPU-only hosts instead of crashing.
    if kpt_mask.is_cuda:
        device = kpt_mask.device
    elif torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = kpt_mask.device

    tgt_size2 = num_group * group_bbox_kpt

    # Block-diagonal structure in one shot: two queries may only see each
    # other when they belong to the same group.
    group_id = torch.arange(tgt_size2, device=device) // group_bbox_kpt
    cross_group = group_id[:, None] != group_id[None, :]
    attn_mask2 = cross_group[None, None].repeat(bs, num_heads, 1, 1)

    # Inside a block, block the pairs whose kpt_mask entries differ.
    differs = (kpt_mask[:, :, None] != kpt_mask[:, None, :]).unsqueeze(1).to(device)
    for idx in range(num_group):
        start_idx = idx * length
        end_idx = (idx + 1) * length
        attn_mask2[:, :, start_idx:end_idx, start_idx:end_idx] = differs

    input_query_label = None
    input_query_bbox = None
    attn_mask = None
    dn_meta = None

    return input_query_label, input_query_bbox, attn_mask, attn_mask2.flatten(0, 1), dn_meta
38
+
39
+
40
def post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
    """Split the leading ``pad_size`` queries off every prediction.

    When ``dn_meta`` is truthy and ``dn_meta['pad_size'] > 0``, the first
    ``pad_size`` queries (axis 1) of each entry are stored under
    ``dn_meta['output_known_lbs_bboxes']`` and the remaining queries are
    returned. Otherwise both inputs are returned untouched.

    Args:
        outputs_class: sequence of 3-d class predictions, sliced on axis 1.
        outputs_coord: sequence of 3-d coordinate predictions, sliced on axis 1.
        dn_meta: dict with a ``'pad_size'`` entry, or a falsy value.
        aux_loss: when truthy, ``'aux_outputs'`` is added to the stored dict.
        _set_aux_loss: callable applied to the split-off class/coord lists.

    Returns:
        Tuple ``(outputs_class, outputs_coord)`` without the split-off queries.
    """
    if not dn_meta:
        return outputs_class, outputs_coord

    pad = dn_meta['pad_size']
    if not pad > 0:
        return outputs_class, outputs_coord

    known_class, rest_class = [], []
    for cls_i in outputs_class:
        known_class.append(cls_i[:, :pad, :])
        rest_class.append(cls_i[:, pad:, :])

    known_coord, rest_coord = [], []
    for box_i in outputs_coord:
        known_coord.append(box_i[:, :pad, :])
        rest_coord.append(box_i[:, pad:, :])

    # Only the last entry of each list is reported as the main prediction.
    known = {'pred_logits': known_class[-1], 'pred_boxes': known_coord[-1]}
    if aux_loss:
        known['aux_outputs'] = _set_aux_loss(known_class, known_coord)
    dn_meta['output_known_lbs_bboxes'] = known

    return rest_class, rest_coord
55
+
56
+
difpoint/src/models/XPose/models/UniPose/ops/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py
difpoint/src/models/XPose/models/UniPose/ops/functions/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from .ms_deform_attn_func import MSDeformAttnFunction
10
+
difpoint/src/models/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch.autograd import Function
16
+ from torch.autograd.function import once_differentiable
17
+
18
+ import MultiScaleDeformableAttention as MSDA
19
+
20
+
21
class MSDeformAttnFunction(Function):
    """Autograd bridge to the compiled ``MultiScaleDeformableAttention`` op."""

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call ``MSDA.ms_deform_attn_forward`` and stash its inputs for backward."""
        ctx.im2col_step = im2col_step
        kernel_inputs = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        output = MSDA.ms_deform_attn_forward(*kernel_inputs, ctx.im2col_step)
        ctx.save_for_backward(*kernel_inputs)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call ``MSDA.ms_deform_attn_backward`` on the saved inputs.

        Gradients are returned for ``value``, ``sampling_locations`` and
        ``attention_weights`` only; the other three forward arguments get
        ``None``.
        """
        saved = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(*saved, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
39
+
40
+
41
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention.

    For debug and test only; the CUDA version should be used instead.

    Args:
        value: (N, S, M, D) with S the sum of H*W over all levels.
        value_spatial_shapes: iterable of (H, W) pairs, one per level.
        sampling_locations: (N, Lq, M, L, P, 2); mapped to grid_sample's
            [-1, 1] range via ``2 * x - 1``.
        attention_weights: (N, Lq, M, L, P).

    Returns:
        Contiguous tensor of shape (N, Lq, M * D).
    """
    batch, _, heads, head_dim = value.shape
    _, n_query, heads, n_levels, n_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)
    grids = 2 * sampling_locations - 1

    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # N, H*W, M, D -> N, H*W, M*D -> N, M*D, H*W -> N*M, D, H, W
        level_value = per_level_values[level].flatten(2).transpose(1, 2)
        level_value = level_value.reshape(batch * heads, head_dim, h, w)
        # N, Lq, M, P, 2 -> N, M, Lq, P, 2 -> N*M, Lq, P, 2
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # N*M, D, Lq, P
        sampled.append(
            F.grid_sample(level_value, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(batch * heads, 1, n_query, n_levels * n_points)
    stacked = torch.stack(sampled, dim=-2).flatten(-2)
    attended = (stacked * weights).sum(-1).view(batch, heads * head_dim, n_query)
    return attended.transpose(1, 2).contiguous()
difpoint/src/models/XPose/models/UniPose/ops/modules/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from .ms_deform_attn import MSDeformAttn
difpoint/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import warnings
14
+ import math, os
15
+ import sys
16
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
17
+
18
+ import torch
19
+ from torch import nn
20
+ import torch.nn.functional as F
21
+ from torch.nn.init import xavier_uniform_, constant_
22
+
23
+ from src.models.XPose.models.UniPose.ops.functions.ms_deform_attn_func import MSDeformAttnFunction
24
+
25
+
26
+ def _is_power_of_2(n):
27
+ if (not isinstance(n, int)) or (n < 0):
28
+ raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
29
+ return (n & (n-1) == 0) and n != 0
30
+
31
+
32
class MSDeformAttn(nn.Module):
    """Multi-scale deformable attention layer built on ``MSDeformAttnFunction``."""

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
        """
        Multi-Scale Deformable Attention Module
        :param d_model            hidden dimension
        :param n_levels           number of feature levels
        :param n_heads            number of attention heads
        :param n_points           number of sampling points per attention head per feature level
        :param use_4D_normalizer  with 4-d reference boxes, normalise the sampling offsets by the
                                  per-level (W, H) instead of by n_points (see forward)
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self.use_4D_normalizer = use_4D_normalizer

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections.

        The sampling-offset weights are zeroed and the bias is set so that
        each head starts with its own direction (angles evenly spaced over
        the circle), with point ``i`` placed ``i + 1`` steps along it. The
        attention-weight projection starts at zero; the value/output
        projections use Xavier-uniform weights and zero bias.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        r"""
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)
        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # Padded positions must not contribute to the sampled values.
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        # Softmax runs jointly over all levels and points of a head.
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)

        # sampling_locations: N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            # Offsets are expressed in pixels of each level: divide by (W, H).
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            if self.use_4D_normalizer:
                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
            else:
                # Offsets are relative to half the reference box size.
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))

        if value.dtype in (torch.float16, torch.bfloat16):
            # Mixed precision: run the compiled op in float32 and cast back.
            # All three floating inputs are upcast so their dtypes agree
            # (attention_weights is still half precision when the module
            # itself was converted with .half()). NOTE(review): presumably the
            # kernel is only dispatched for float/double -- confirm in the
            # CUDA source.
            output = MSDeformAttnFunction.apply(
                value.to(torch.float32), input_spatial_shapes, input_level_start_index,
                sampling_locations.to(torch.float32), attention_weights.to(torch.float32), self.im2col_step)
            output = output.to(value.dtype)
        else:
            output = MSDeformAttnFunction.apply(
                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)

        return self.output_proj(output)
difpoint/src/models/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import warnings
14
+ import math, os
15
+
16
+ import torch
17
+ from torch import nn
18
+ import torch.nn.functional as F
19
+ from torch.nn.init import xavier_uniform_, constant_
20
+
21
+ try:
22
+ from src.models.XPose.models.UniPose.ops.functions import MSDeformAttnFunction
23
+ except:
24
+ warnings.warn('Failed to import MSDeformAttnFunction.')
25
+
26
+
27
+ def _is_power_of_2(n):
28
+ if (not isinstance(n, int)) or (n < 0):
29
+ raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
30
+ return (n & (n-1) == 0) and n != 0
31
+
32
+
33
class MSDeformAttn(nn.Module):
    """Multi-scale deformable attention layer whose forward also accepts a ``key``."""

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
        """
        Multi-Scale Deformable Attention Module
        :param d_model            hidden dimension
        :param n_levels           number of feature levels
        :param n_heads            number of attention heads
        :param n_points           number of sampling points per attention head per feature level
        :param use_4D_normalizer  with 4-d reference boxes, normalise the sampling offsets by the
                                  per-level (W, H) instead of by n_points (see forward)
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self.use_4D_normalizer = use_4D_normalizer

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections.

        The sampling-offset weights are zeroed and the bias is set so that
        each head starts with its own direction (angles evenly spaced over
        the circle), with point ``i`` placed ``i + 1`` steps along it. The
        attention-weight projection starts at zero; the value/output
        projections use Xavier-uniform weights and zero bias.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, key, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        r"""
        :param query                       (N, Length_{query}, C)
        :param key                         (N, 1, C); accepted but not read by this implementation
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)
        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # Padded positions must not contribute to the sampled values.
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        # Softmax runs jointly over all levels and points of a head.
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)

        # sampling_locations: N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            # Offsets are expressed in pixels of each level: divide by (W, H).
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            if self.use_4D_normalizer:
                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
            else:
                # Offsets are relative to half the reference box size.
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))

        if value.dtype in (torch.float16, torch.bfloat16):
            # Mixed precision: run the compiled op in float32 with matching
            # input dtypes and cast the result back. NOTE(review): presumably
            # the kernel is only dispatched for float/double -- confirm in the
            # CUDA source.
            output = MSDeformAttnFunction.apply(
                value.to(torch.float32), input_spatial_shapes, input_level_start_index,
                sampling_locations.to(torch.float32), attention_weights.to(torch.float32), self.im2col_step)
            output = output.to(value.dtype)
        else:
            output = MSDeformAttnFunction.apply(
                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)

        return self.output_proj(output)
difpoint/src/models/XPose/models/UniPose/ops/setup.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ import os
10
+ import glob
11
+
12
+ import torch
13
+
14
+ from torch.utils.cpp_extension import CUDA_HOME
15
+ from torch.utils.cpp_extension import CppExtension
16
+ from torch.utils.cpp_extension import CUDAExtension
17
+
18
+ from setuptools import find_packages
19
+ from setuptools import setup
20
+
21
+ requirements = ["torch", "torchvision"]
22
+
23
def get_extensions():
    """Describe the ``MultiScaleDeformableAttention`` extension to build.

    Sources are collected from ``src/*.cpp``, ``src/cpu/*.cpp`` and
    ``src/cuda/*.cu`` next to this file.

    Returns:
        A one-element list holding the ``CUDAExtension`` to compile.

    Raises:
        NotImplementedError: if ``torch.cuda.is_available()`` is False or
            ``CUDA_HOME`` is None.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    def _find(*parts):
        return glob.glob(os.path.join(extensions_dir, *parts))

    host_sources = _find("*.cpp") + _find("cpu", "*.cpp")
    cuda_sources = _find("cuda", "*.cu")

    # Only a CUDA build is supported; bail out before describing the extension.
    if not (torch.cuda.is_available() and CUDA_HOME is not None):
        raise NotImplementedError('Cuda is not availabel')

    # Add one -gencode flag per architecture to target several CUDA architectures.
    arch_flags = [
        "-gencode=arch=compute_{0},code=sm_{0}".format(cc)
        for cc in ("60", "70", "75", "80", "86")
    ]
    extra_compile_args = {
        "cxx": [],
        "nvcc": [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ] + arch_flags,
    }
    define_macros = [("WITH_CUDA", None)]

    sources = [os.path.join(extensions_dir, s) for s in host_sources + cuda_sources]
    return [
        CUDAExtension(
            "MultiScaleDeformableAttention",
            sources,
            include_dirs=[extensions_dir],
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
69
+
70
# Build/installation entry point: registers the MultiScaleDeformableAttention
# package and compiles the extension returned by get_extensions() with
# PyTorch's BuildExtension command. Note that get_extensions() is evaluated
# when this call is made, so running the script without CUDA raises there.
setup(
    name="MultiScaleDeformableAttention",
    version="1.0",
    author="Weijie Su",
    url="https://github.com/fundamentalvision/Deformable-DETR",
    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
difpoint/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #include <vector>
12
+
13
+ #include <ATen/ATen.h>
14
+ #include <ATen/cuda/CUDAContext.h>
15
+
16
+
17
// CPU entry point for the multi-scale deformable attention forward pass.
// No CPU implementation exists: every call fails through AT_ERROR, so none
// of the arguments are read.
at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
28
+
29
// CPU entry point for the multi-scale deformable attention backward pass.
// No CPU implementation exists: every call fails through AT_ERROR, so none
// of the arguments are read and no gradients are produced.
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implement on cpu");
}
41
+
difpoint/src/models/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #pragma once
12
+ #include <torch/extension.h>
13
+
14
+ at::Tensor
15
+ ms_deform_attn_cpu_forward(
16
+ const at::Tensor &value,
17
+ const at::Tensor &spatial_shapes,
18
+ const at::Tensor &level_start_index,
19
+ const at::Tensor &sampling_loc,
20
+ const at::Tensor &attn_weight,
21
+ const int im2col_step);
22
+
23
+ std::vector<at::Tensor>
24
+ ms_deform_attn_cpu_backward(
25
+ const at::Tensor &value,
26
+ const at::Tensor &spatial_shapes,
27
+ const at::Tensor &level_start_index,
28
+ const at::Tensor &sampling_loc,
29
+ const at::Tensor &attn_weight,
30
+ const at::Tensor &grad_output,
31
+ const int im2col_step);
32
+
33
+
difpoint/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #include <vector>
12
+ #include "cuda/ms_deform_im2col_cuda.cuh"
13
+
14
+ #include <ATen/ATen.h>
15
+ #include <ATen/cuda/CUDAContext.h>
16
+ #include <cuda.h>
17
+ #include <cuda_runtime.h>
18
+
19
+
20
+ at::Tensor ms_deform_attn_cuda_forward(
21
+ const at::Tensor &value,
22
+ const at::Tensor &spatial_shapes,
23
+ const at::Tensor &level_start_index,
24
+ const at::Tensor &sampling_loc,
25
+ const at::Tensor &attn_weight,
26
+ const int im2col_step)
27
+ {
28
+ AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
29
+ AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
30
+ AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
31
+ AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
32
+ AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
33
+
34
+ AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
35
+ AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
36
+ AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
37
+ AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
38
+ AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
39
+
40
+ const int batch = value.size(0);
41
+ const int spatial_size = value.size(1);
42
+ const int num_heads = value.size(2);
43
+ const int channels = value.size(3);
44
+
45
+ const int num_levels = spatial_shapes.size(0);
46
+
47
+ const int num_query = sampling_loc.size(1);
48
+ const int num_point = sampling_loc.size(4);
49
+
50
+ const int im2col_step_ = std::min(batch, im2col_step);
51
+
52
+ AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
53
+
54
+ auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
55
+
56
+ const int batch_n = im2col_step_;
57
+ auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
58
+ auto per_value_size = spatial_size * num_heads * channels;
59
+ auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
60
+ auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
61
+ for (int n = 0; n < batch/im2col_step_; ++n)
62
+ {
63
+ auto columns = output_n.select(0, n);
64
+ AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
65
+ ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
66
+ value.data<scalar_t>() + n * im2col_step_ * per_value_size,
67
+ spatial_shapes.data<int64_t>(),
68
+ level_start_index.data<int64_t>(),
69
+ sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
70
+ attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
71
+ batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
72
+ columns.data<scalar_t>());
73
+
74
+ }));
75
+ }
76
+
77
+ output = output.view({batch, num_query, num_heads*channels});
78
+
79
+ return output;
80
+ }
81
+
82
+
83
+ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
84
+ const at::Tensor &value,
85
+ const at::Tensor &spatial_shapes,
86
+ const at::Tensor &level_start_index,
87
+ const at::Tensor &sampling_loc,
88
+ const at::Tensor &attn_weight,
89
+ const at::Tensor &grad_output,
90
+ const int im2col_step)
91
+ {
92
+
93
+ AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
94
+ AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
95
+ AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
96
+ AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
97
+ AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
98
+ AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
99
+
100
+ AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
101
+ AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
102
+ AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
103
+ AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
104
+ AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
105
+ AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
106
+
107
+ const int batch = value.size(0);
108
+ const int spatial_size = value.size(1);
109
+ const int num_heads = value.size(2);
110
+ const int channels = value.size(3);
111
+
112
+ const int num_levels = spatial_shapes.size(0);
113
+
114
+ const int num_query = sampling_loc.size(1);
115
+ const int num_point = sampling_loc.size(4);
116
+
117
+ const int im2col_step_ = std::min(batch, im2col_step);
118
+
119
+ AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
120
+
121
+ auto grad_value = at::zeros_like(value);
122
+ auto grad_sampling_loc = at::zeros_like(sampling_loc);
123
+ auto grad_attn_weight = at::zeros_like(attn_weight);
124
+
125
+ const int batch_n = im2col_step_;
126
+ auto per_value_size = spatial_size * num_heads * channels;
127
+ auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
128
+ auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
129
+ auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
130
+
131
+ for (int n = 0; n < batch/im2col_step_; ++n)
132
+ {
133
+ auto grad_output_g = grad_output_n.select(0, n);
134
+ AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
135
+ ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
136
+ grad_output_g.data<scalar_t>(),
137
+ value.data<scalar_t>() + n * im2col_step_ * per_value_size,
138
+ spatial_shapes.data<int64_t>(),
139
+ level_start_index.data<int64_t>(),
140
+ sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
141
+ attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
142
+ batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
143
+ grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
144
+ grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
145
+ grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
146
+
147
+ }));
148
+ }
149
+
150
+ return {
151
+ grad_value, grad_sampling_loc, grad_attn_weight
152
+ };
153
+ }
difpoint/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #pragma once
12
+ #include <torch/extension.h>
13
+
14
+ at::Tensor ms_deform_attn_cuda_forward(
15
+ const at::Tensor &value,
16
+ const at::Tensor &spatial_shapes,
17
+ const at::Tensor &level_start_index,
18
+ const at::Tensor &sampling_loc,
19
+ const at::Tensor &attn_weight,
20
+ const int im2col_step);
21
+
22
+ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23
+ const at::Tensor &value,
24
+ const at::Tensor &spatial_shapes,
25
+ const at::Tensor &level_start_index,
26
+ const at::Tensor &sampling_loc,
27
+ const at::Tensor &attn_weight,
28
+ const at::Tensor &grad_output,
29
+ const int im2col_step);
30
+
difpoint/src/models/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh ADDED
@@ -0,0 +1,1327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************
7
+ * Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
8
+ * Copyright (c) 2018 Microsoft
9
+ **************************************************************************
10
+ */
11
+
12
+ #include <cstdio>
13
+ #include <algorithm>
14
+ #include <cstring>
15
+
16
+ #include <ATen/ATen.h>
17
+ #include <ATen/cuda/CUDAContext.h>
18
+
19
+ #include <THC/THCAtomics.cuh>
20
+
21
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads until it reaches n.
#define CUDA_KERNEL_LOOP(i, n)                          \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
      i < (n);                                          \
      i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;

// Number of blocks of `num_threads` threads needed to cover N work items,
// i.e. ceil(N / num_threads). Returns 0 when N <= 0.
inline int GET_BLOCKS(const int N, const int num_threads)
{
  if (N <= 0)
  {
    return 0;
  }
  // (N - 1) / num_threads + 1 equals the usual (N + num_threads - 1) / num_threads
  // for positive N, but cannot overflow int when N is close to INT_MAX.
  return (N - 1) / num_threads + 1;
}
31
+
32
+
33
/*!
 * Bilinearly samples one channel of one head from a feature map stored as
 * (height, width, nheads, channels).
 *
 * (h, w) is the fractional sampling position in pixel coordinates; m and c
 * select the head and channel. Corners that fail the bounds tests below
 * contribute zero. The upper bounds tests assume the caller has already
 * rejected positions with h <= -1 or w <= -1.
 */
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
                                                   const int &height, const int &width, const int &nheads, const int &channels,
                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
    // Integer coordinates of the four surrounding pixels.
    const int y0 = floor(h);
    const int x0 = floor(w);
    const int y1 = y0 + 1;
    const int x1 = x0 + 1;

    // Fractional distances to the low corner and their complements.
    const scalar_t frac_y = h - y0;
    const scalar_t frac_x = w - x0;
    const scalar_t comp_y = 1 - frac_y;
    const scalar_t comp_x = 1 - frac_x;

    // Element strides for one step in x and in y.
    const int col_stride = nheads * channels;
    const int row_stride = width * col_stride;
    const int row0 = y0 * row_stride;
    const int row1 = row0 + row_stride;
    const int col0 = x0 * col_stride;
    const int col1 = col0 + col_stride;
    const int lane = m * channels + c;

    const bool top_ok = y0 >= 0;
    const bool bottom_ok = y1 <= height - 1;
    const bool left_ok = x0 >= 0;
    const bool right_ok = x1 <= width - 1;

    const scalar_t v_tl = (top_ok && left_ok) ? bottom_data[row0 + col0 + lane] : scalar_t(0);
    const scalar_t v_tr = (top_ok && right_ok) ? bottom_data[row0 + col1 + lane] : scalar_t(0);
    const scalar_t v_bl = (bottom_ok && left_ok) ? bottom_data[row1 + col0 + lane] : scalar_t(0);
    const scalar_t v_br = (bottom_ok && right_ok) ? bottom_data[row1 + col1 + lane] : scalar_t(0);

    const scalar_t w_tl = comp_y * comp_x;
    const scalar_t w_tr = comp_y * frac_x;
    const scalar_t w_bl = frac_y * comp_x;
    const scalar_t w_br = frac_y * frac_x;

    const scalar_t val = (w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br);
    return val;
}
85
+
86
+
87
+ template <typename scalar_t>
88
+ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
89
+ const int &height, const int &width, const int &nheads, const int &channels,
90
+ const scalar_t &h, const scalar_t &w, const int &m, const int &c,
91
+ const scalar_t &top_grad,
92
+ const scalar_t &attn_weight,
93
+ scalar_t* &grad_value,
94
+ scalar_t* grad_sampling_loc,
95
+ scalar_t* grad_attn_weight)
96
+ {
97
+ const int h_low = floor(h);
98
+ const int w_low = floor(w);
99
+ const int h_high = h_low + 1;
100
+ const int w_high = w_low + 1;
101
+
102
+ const scalar_t lh = h - h_low;
103
+ const scalar_t lw = w - w_low;
104
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
105
+
106
+ const int w_stride = nheads * channels;
107
+ const int h_stride = width * w_stride;
108
+ const int h_low_ptr_offset = h_low * h_stride;
109
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
110
+ const int w_low_ptr_offset = w_low * w_stride;
111
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
112
+ const int base_ptr = m * channels + c;
113
+
114
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
115
+ const scalar_t top_grad_value = top_grad * attn_weight;
116
+ scalar_t grad_h_weight = 0, grad_w_weight = 0;
117
+
118
+ scalar_t v1 = 0;
119
+ if (h_low >= 0 && w_low >= 0)
120
+ {
121
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
122
+ v1 = bottom_data[ptr1];
123
+ grad_h_weight -= hw * v1;
124
+ grad_w_weight -= hh * v1;
125
+ atomicAdd(grad_value+ptr1, w1*top_grad_value);
126
+ }
127
+ scalar_t v2 = 0;
128
+ if (h_low >= 0 && w_high <= width - 1)
129
+ {
130
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
131
+ v2 = bottom_data[ptr2];
132
+ grad_h_weight -= lw * v2;
133
+ grad_w_weight += hh * v2;
134
+ atomicAdd(grad_value+ptr2, w2*top_grad_value);
135
+ }
136
+ scalar_t v3 = 0;
137
+ if (h_high <= height - 1 && w_low >= 0)
138
+ {
139
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
140
+ v3 = bottom_data[ptr3];
141
+ grad_h_weight += hw * v3;
142
+ grad_w_weight -= lh * v3;
143
+ atomicAdd(grad_value+ptr3, w3*top_grad_value);
144
+ }
145
+ scalar_t v4 = 0;
146
+ if (h_high <= height - 1 && w_high <= width - 1)
147
+ {
148
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
149
+ v4 = bottom_data[ptr4];
150
+ grad_h_weight += lw * v4;
151
+ grad_w_weight += lh * v4;
152
+ atomicAdd(grad_value+ptr4, w4*top_grad_value);
153
+ }
154
+
155
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
156
+ *grad_attn_weight = top_grad * val;
157
+ *grad_sampling_loc = width * grad_w_weight * top_grad_value;
158
+ *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
159
+ }
160
+
161
+
162
+ template <typename scalar_t>
163
+ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
164
+ const int &height, const int &width, const int &nheads, const int &channels,
165
+ const scalar_t &h, const scalar_t &w, const int &m, const int &c,
166
+ const scalar_t &top_grad,
167
+ const scalar_t &attn_weight,
168
+ scalar_t* &grad_value,
169
+ scalar_t* grad_sampling_loc,
170
+ scalar_t* grad_attn_weight)
171
+ {
172
+ const int h_low = floor(h);
173
+ const int w_low = floor(w);
174
+ const int h_high = h_low + 1;
175
+ const int w_high = w_low + 1;
176
+
177
+ const scalar_t lh = h - h_low;
178
+ const scalar_t lw = w - w_low;
179
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
180
+
181
+ const int w_stride = nheads * channels;
182
+ const int h_stride = width * w_stride;
183
+ const int h_low_ptr_offset = h_low * h_stride;
184
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
185
+ const int w_low_ptr_offset = w_low * w_stride;
186
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
187
+ const int base_ptr = m * channels + c;
188
+
189
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
190
+ const scalar_t top_grad_value = top_grad * attn_weight;
191
+ scalar_t grad_h_weight = 0, grad_w_weight = 0;
192
+
193
+ scalar_t v1 = 0;
194
+ if (h_low >= 0 && w_low >= 0)
195
+ {
196
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
197
+ v1 = bottom_data[ptr1];
198
+ grad_h_weight -= hw * v1;
199
+ grad_w_weight -= hh * v1;
200
+ atomicAdd(grad_value+ptr1, w1*top_grad_value);
201
+ }
202
+ scalar_t v2 = 0;
203
+ if (h_low >= 0 && w_high <= width - 1)
204
+ {
205
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
206
+ v2 = bottom_data[ptr2];
207
+ grad_h_weight -= lw * v2;
208
+ grad_w_weight += hh * v2;
209
+ atomicAdd(grad_value+ptr2, w2*top_grad_value);
210
+ }
211
+ scalar_t v3 = 0;
212
+ if (h_high <= height - 1 && w_low >= 0)
213
+ {
214
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
215
+ v3 = bottom_data[ptr3];
216
+ grad_h_weight += hw * v3;
217
+ grad_w_weight -= lh * v3;
218
+ atomicAdd(grad_value+ptr3, w3*top_grad_value);
219
+ }
220
+ scalar_t v4 = 0;
221
+ if (h_high <= height - 1 && w_high <= width - 1)
222
+ {
223
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
224
+ v4 = bottom_data[ptr4];
225
+ grad_h_weight += lw * v4;
226
+ grad_w_weight += lh * v4;
227
+ atomicAdd(grad_value+ptr4, w4*top_grad_value);
228
+ }
229
+
230
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
231
+ atomicAdd(grad_attn_weight, top_grad * val);
232
+ atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
233
+ atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
234
+ }
235
+
236
+
237
+ template <typename scalar_t>
238
+ __global__ void ms_deformable_im2col_gpu_kernel(const int n,
239
+ const scalar_t *data_value,
240
+ const int64_t *data_spatial_shapes,
241
+ const int64_t *data_level_start_index,
242
+ const scalar_t *data_sampling_loc,
243
+ const scalar_t *data_attn_weight,
244
+ const int batch_size,
245
+ const int spatial_size,
246
+ const int num_heads,
247
+ const int channels,
248
+ const int num_levels,
249
+ const int num_query,
250
+ const int num_point,
251
+ scalar_t *data_col)
252
+ {
253
+ CUDA_KERNEL_LOOP(index, n)
254
+ {
255
+ int _temp = index;
256
+ const int c_col = _temp % channels;
257
+ _temp /= channels;
258
+ const int sampling_index = _temp;
259
+ const int m_col = _temp % num_heads;
260
+ _temp /= num_heads;
261
+ const int q_col = _temp % num_query;
262
+ _temp /= num_query;
263
+ const int b_col = _temp;
264
+
265
+ scalar_t *data_col_ptr = data_col + index;
266
+ int data_weight_ptr = sampling_index * num_levels * num_point;
267
+ int data_loc_w_ptr = data_weight_ptr << 1;
268
+ const int qid_stride = num_heads * channels;
269
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
270
+ scalar_t col = 0;
271
+
272
+ for (int l_col=0; l_col < num_levels; ++l_col)
273
+ {
274
+ const int level_start_id = data_level_start_index[l_col];
275
+ const int spatial_h_ptr = l_col << 1;
276
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
277
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
278
+ const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
279
+ for (int p_col=0; p_col < num_point; ++p_col)
280
+ {
281
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
282
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
283
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
284
+
285
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
286
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
287
+
288
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
289
+ {
290
+ col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
291
+ }
292
+
293
+ data_weight_ptr += 1;
294
+ data_loc_w_ptr += 2;
295
+ }
296
+ }
297
+ *data_col_ptr = col;
298
+ }
299
+ }
300
+
301
+ template <typename scalar_t, unsigned int blockSize>
302
+ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
303
+ const scalar_t *grad_col,
304
+ const scalar_t *data_value,
305
+ const int64_t *data_spatial_shapes,
306
+ const int64_t *data_level_start_index,
307
+ const scalar_t *data_sampling_loc,
308
+ const scalar_t *data_attn_weight,
309
+ const int batch_size,
310
+ const int spatial_size,
311
+ const int num_heads,
312
+ const int channels,
313
+ const int num_levels,
314
+ const int num_query,
315
+ const int num_point,
316
+ scalar_t *grad_value,
317
+ scalar_t *grad_sampling_loc,
318
+ scalar_t *grad_attn_weight)
319
+ {
320
+ CUDA_KERNEL_LOOP(index, n)
321
+ {
322
+ __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
323
+ __shared__ scalar_t cache_grad_attn_weight[blockSize];
324
+ unsigned int tid = threadIdx.x;
325
+ int _temp = index;
326
+ const int c_col = _temp % channels;
327
+ _temp /= channels;
328
+ const int sampling_index = _temp;
329
+ const int m_col = _temp % num_heads;
330
+ _temp /= num_heads;
331
+ const int q_col = _temp % num_query;
332
+ _temp /= num_query;
333
+ const int b_col = _temp;
334
+
335
+ const scalar_t top_grad = grad_col[index];
336
+
337
+ int data_weight_ptr = sampling_index * num_levels * num_point;
338
+ int data_loc_w_ptr = data_weight_ptr << 1;
339
+ const int grad_sampling_ptr = data_weight_ptr;
340
+ grad_sampling_loc += grad_sampling_ptr << 1;
341
+ grad_attn_weight += grad_sampling_ptr;
342
+ const int grad_weight_stride = 1;
343
+ const int grad_loc_stride = 2;
344
+ const int qid_stride = num_heads * channels;
345
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
346
+
347
+ for (int l_col=0; l_col < num_levels; ++l_col)
348
+ {
349
+ const int level_start_id = data_level_start_index[l_col];
350
+ const int spatial_h_ptr = l_col << 1;
351
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
352
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
353
+ const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
354
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
355
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
356
+
357
+ for (int p_col=0; p_col < num_point; ++p_col)
358
+ {
359
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
360
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
361
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
362
+
363
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
364
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
365
+ *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
366
+ *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
367
+ *(cache_grad_attn_weight+threadIdx.x)=0;
368
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
369
+ {
370
+ ms_deform_attn_col2im_bilinear(
371
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
372
+ top_grad, weight, grad_value_ptr,
373
+ cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
374
+ }
375
+
376
+ __syncthreads();
377
+ if (tid == 0)
378
+ {
379
+ scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
380
+ int sid=2;
381
+ for (unsigned int tid = 1; tid < blockSize; ++tid)
382
+ {
383
+ _grad_w += cache_grad_sampling_loc[sid];
384
+ _grad_h += cache_grad_sampling_loc[sid + 1];
385
+ _grad_a += cache_grad_attn_weight[tid];
386
+ sid += 2;
387
+ }
388
+
389
+
390
+ *grad_sampling_loc = _grad_w;
391
+ *(grad_sampling_loc + 1) = _grad_h;
392
+ *grad_attn_weight = _grad_a;
393
+ }
394
+ __syncthreads();
395
+
396
+ data_weight_ptr += 1;
397
+ data_loc_w_ptr += 2;
398
+ grad_attn_weight += grad_weight_stride;
399
+ grad_sampling_loc += grad_loc_stride;
400
+ }
401
+ }
402
+ }
403
+ }
404
+
405
+
406
+ template <typename scalar_t, unsigned int blockSize>
407
+ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
408
+ const scalar_t *grad_col,
409
+ const scalar_t *data_value,
410
+ const int64_t *data_spatial_shapes,
411
+ const int64_t *data_level_start_index,
412
+ const scalar_t *data_sampling_loc,
413
+ const scalar_t *data_attn_weight,
414
+ const int batch_size,
415
+ const int spatial_size,
416
+ const int num_heads,
417
+ const int channels,
418
+ const int num_levels,
419
+ const int num_query,
420
+ const int num_point,
421
+ scalar_t *grad_value,
422
+ scalar_t *grad_sampling_loc,
423
+ scalar_t *grad_attn_weight)
424
+ {
425
+ CUDA_KERNEL_LOOP(index, n)
426
+ {
427
+ __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
428
+ __shared__ scalar_t cache_grad_attn_weight[blockSize];
429
+ unsigned int tid = threadIdx.x;
430
+ int _temp = index;
431
+ const int c_col = _temp % channels;
432
+ _temp /= channels;
433
+ const int sampling_index = _temp;
434
+ const int m_col = _temp % num_heads;
435
+ _temp /= num_heads;
436
+ const int q_col = _temp % num_query;
437
+ _temp /= num_query;
438
+ const int b_col = _temp;
439
+
440
+ const scalar_t top_grad = grad_col[index];
441
+
442
+ int data_weight_ptr = sampling_index * num_levels * num_point;
443
+ int data_loc_w_ptr = data_weight_ptr << 1;
444
+ const int grad_sampling_ptr = data_weight_ptr;
445
+ grad_sampling_loc += grad_sampling_ptr << 1;
446
+ grad_attn_weight += grad_sampling_ptr;
447
+ const int grad_weight_stride = 1;
448
+ const int grad_loc_stride = 2;
449
+ const int qid_stride = num_heads * channels;
450
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
451
+
452
+ for (int l_col=0; l_col < num_levels; ++l_col)
453
+ {
454
+ const int level_start_id = data_level_start_index[l_col];
455
+ const int spatial_h_ptr = l_col << 1;
456
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
457
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
458
+ const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
459
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
460
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
461
+
462
+ for (int p_col=0; p_col < num_point; ++p_col)
463
+ {
464
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
465
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
466
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
467
+
468
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
469
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
470
+ *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
471
+ *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
472
+ *(cache_grad_attn_weight+threadIdx.x)=0;
473
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
474
+ {
475
+ ms_deform_attn_col2im_bilinear(
476
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
477
+ top_grad, weight, grad_value_ptr,
478
+ cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
479
+ }
480
+
481
+ __syncthreads();
482
+
483
+ for (unsigned int s=blockSize/2; s>0; s>>=1)
484
+ {
485
+ if (tid < s) {
486
+ const unsigned int xid1 = tid << 1;
487
+ const unsigned int xid2 = (tid + s) << 1;
488
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
489
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
490
+ cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
491
+ }
492
+ __syncthreads();
493
+ }
494
+
495
+ if (tid == 0)
496
+ {
497
+ *grad_sampling_loc = cache_grad_sampling_loc[0];
498
+ *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
499
+ *grad_attn_weight = cache_grad_attn_weight[0];
500
+ }
501
+ __syncthreads();
502
+
503
+ data_weight_ptr += 1;
504
+ data_loc_w_ptr += 2;
505
+ grad_attn_weight += grad_weight_stride;
506
+ grad_sampling_loc += grad_loc_stride;
507
+ }
508
+ }
509
+ }
510
+ }
511
+
512
+
513
+ template <typename scalar_t>
514
+ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
515
+ const scalar_t *grad_col,
516
+ const scalar_t *data_value,
517
+ const int64_t *data_spatial_shapes,
518
+ const int64_t *data_level_start_index,
519
+ const scalar_t *data_sampling_loc,
520
+ const scalar_t *data_attn_weight,
521
+ const int batch_size,
522
+ const int spatial_size,
523
+ const int num_heads,
524
+ const int channels,
525
+ const int num_levels,
526
+ const int num_query,
527
+ const int num_point,
528
+ scalar_t *grad_value,
529
+ scalar_t *grad_sampling_loc,
530
+ scalar_t *grad_attn_weight)
531
+ {
532
+ CUDA_KERNEL_LOOP(index, n)
533
+ {
534
+ extern __shared__ int _s[];
535
+ scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
536
+ scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
537
+ unsigned int tid = threadIdx.x;
538
+ int _temp = index;
539
+ const int c_col = _temp % channels;
540
+ _temp /= channels;
541
+ const int sampling_index = _temp;
542
+ const int m_col = _temp % num_heads;
543
+ _temp /= num_heads;
544
+ const int q_col = _temp % num_query;
545
+ _temp /= num_query;
546
+ const int b_col = _temp;
547
+
548
+ const scalar_t top_grad = grad_col[index];
549
+
550
+ int data_weight_ptr = sampling_index * num_levels * num_point;
551
+ int data_loc_w_ptr = data_weight_ptr << 1;
552
+ const int grad_sampling_ptr = data_weight_ptr;
553
+ grad_sampling_loc += grad_sampling_ptr << 1;
554
+ grad_attn_weight += grad_sampling_ptr;
555
+ const int grad_weight_stride = 1;
556
+ const int grad_loc_stride = 2;
557
+ const int qid_stride = num_heads * channels;
558
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
559
+
560
+ for (int l_col=0; l_col < num_levels; ++l_col)
561
+ {
562
+ const int level_start_id = data_level_start_index[l_col];
563
+ const int spatial_h_ptr = l_col << 1;
564
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
565
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
566
+ const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
567
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
568
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
569
+
570
+ for (int p_col=0; p_col < num_point; ++p_col)
571
+ {
572
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
573
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
574
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
575
+
576
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
577
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
578
+ *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
579
+ *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
580
+ *(cache_grad_attn_weight+threadIdx.x)=0;
581
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
582
+ {
583
+ ms_deform_attn_col2im_bilinear(
584
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
585
+ top_grad, weight, grad_value_ptr,
586
+ cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
587
+ }
588
+
589
+ __syncthreads();
590
+ if (tid == 0)
591
+ {
592
+ scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
593
+ int sid=2;
594
+ for (unsigned int tid = 1; tid < blockDim.x; ++tid)
595
+ {
596
+ _grad_w += cache_grad_sampling_loc[sid];
597
+ _grad_h += cache_grad_sampling_loc[sid + 1];
598
+ _grad_a += cache_grad_attn_weight[tid];
599
+ sid += 2;
600
+ }
601
+
602
+
603
+ *grad_sampling_loc = _grad_w;
604
+ *(grad_sampling_loc + 1) = _grad_h;
605
+ *grad_attn_weight = _grad_a;
606
+ }
607
+ __syncthreads();
608
+
609
+ data_weight_ptr += 1;
610
+ data_loc_w_ptr += 2;
611
+ grad_attn_weight += grad_weight_stride;
612
+ grad_sampling_loc += grad_loc_stride;
613
+ }
614
+ }
615
+ }
616
+ }
617
+
618
+ template <typename scalar_t>
619
+ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
620
+ const scalar_t *grad_col,
621
+ const scalar_t *data_value,
622
+ const int64_t *data_spatial_shapes,
623
+ const int64_t *data_level_start_index,
624
+ const scalar_t *data_sampling_loc,
625
+ const scalar_t *data_attn_weight,
626
+ const int batch_size,
627
+ const int spatial_size,
628
+ const int num_heads,
629
+ const int channels,
630
+ const int num_levels,
631
+ const int num_query,
632
+ const int num_point,
633
+ scalar_t *grad_value,
634
+ scalar_t *grad_sampling_loc,
635
+ scalar_t *grad_attn_weight)
636
+ {
637
+ CUDA_KERNEL_LOOP(index, n)
638
+ {
639
+ extern __shared__ int _s[];
640
+ scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
641
+ scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
642
+ unsigned int tid = threadIdx.x;
643
+ int _temp = index;
644
+ const int c_col = _temp % channels;
645
+ _temp /= channels;
646
+ const int sampling_index = _temp;
647
+ const int m_col = _temp % num_heads;
648
+ _temp /= num_heads;
649
+ const int q_col = _temp % num_query;
650
+ _temp /= num_query;
651
+ const int b_col = _temp;
652
+
653
+ const scalar_t top_grad = grad_col[index];
654
+
655
+ int data_weight_ptr = sampling_index * num_levels * num_point;
656
+ int data_loc_w_ptr = data_weight_ptr << 1;
657
+ const int grad_sampling_ptr = data_weight_ptr;
658
+ grad_sampling_loc += grad_sampling_ptr << 1;
659
+ grad_attn_weight += grad_sampling_ptr;
660
+ const int grad_weight_stride = 1;
661
+ const int grad_loc_stride = 2;
662
+ const int qid_stride = num_heads * channels;
663
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
664
+
665
+ for (int l_col=0; l_col < num_levels; ++l_col)
666
+ {
667
+ const int level_start_id = data_level_start_index[l_col];
668
+ const int spatial_h_ptr = l_col << 1;
669
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
670
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
671
+ const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
672
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
673
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
674
+
675
+ for (int p_col=0; p_col < num_point; ++p_col)
676
+ {
677
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
678
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
679
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
680
+
681
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
682
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
683
+ *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
684
+ *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
685
+ *(cache_grad_attn_weight+threadIdx.x)=0;
686
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
687
+ {
688
+ ms_deform_attn_col2im_bilinear(
689
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
690
+ top_grad, weight, grad_value_ptr,
691
+ cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
692
+ }
693
+
694
+ __syncthreads();
695
+
696
+ for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
697
+ {
698
+ if (tid < s) {
699
+ const unsigned int xid1 = tid << 1;
700
+ const unsigned int xid2 = (tid + s) << 1;
701
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
702
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
703
+ cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
704
+ if (tid + (s << 1) < spre)
705
+ {
706
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
707
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
708
+ cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
709
+ }
710
+ }
711
+ __syncthreads();
712
+ }
713
+
714
+ if (tid == 0)
715
+ {
716
+ *grad_sampling_loc = cache_grad_sampling_loc[0];
717
+ *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
718
+ *grad_attn_weight = cache_grad_attn_weight[0];
719
+ }
720
+ __syncthreads();
721
+
722
+ data_weight_ptr += 1;
723
+ data_loc_w_ptr += 2;
724
+ grad_attn_weight += grad_weight_stride;
725
+ grad_sampling_loc += grad_loc_stride;
726
+ }
727
+ }
728
+ }
729
+ }
730
+
731
+ template <typename scalar_t>
732
+ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
733
+ const scalar_t *grad_col,
734
+ const scalar_t *data_value,
735
+ const int64_t *data_spatial_shapes,
736
+ const int64_t *data_level_start_index,
737
+ const scalar_t *data_sampling_loc,
738
+ const scalar_t *data_attn_weight,
739
+ const int batch_size,
740
+ const int spatial_size,
741
+ const int num_heads,
742
+ const int channels,
743
+ const int num_levels,
744
+ const int num_query,
745
+ const int num_point,
746
+ scalar_t *grad_value,
747
+ scalar_t *grad_sampling_loc,
748
+ scalar_t *grad_attn_weight)
749
+ {
750
+ CUDA_KERNEL_LOOP(index, n)
751
+ {
752
+ extern __shared__ int _s[];
753
+ scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
754
+ scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
755
+ unsigned int tid = threadIdx.x;
756
+ int _temp = index;
757
+ const int c_col = _temp % channels;
758
+ _temp /= channels;
759
+ const int sampling_index = _temp;
760
+ const int m_col = _temp % num_heads;
761
+ _temp /= num_heads;
762
+ const int q_col = _temp % num_query;
763
+ _temp /= num_query;
764
+ const int b_col = _temp;
765
+
766
+ const scalar_t top_grad = grad_col[index];
767
+
768
+ int data_weight_ptr = sampling_index * num_levels * num_point;
769
+ int data_loc_w_ptr = data_weight_ptr << 1;
770
+ const int grad_sampling_ptr = data_weight_ptr;
771
+ grad_sampling_loc += grad_sampling_ptr << 1;
772
+ grad_attn_weight += grad_sampling_ptr;
773
+ const int grad_weight_stride = 1;
774
+ const int grad_loc_stride = 2;
775
+ const int qid_stride = num_heads * channels;
776
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
777
+
778
+ for (int l_col=0; l_col < num_levels; ++l_col)
779
+ {
780
+ const int level_start_id = data_level_start_index[l_col];
781
+ const int spatial_h_ptr = l_col << 1;
782
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
783
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
784
+ const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
785
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
786
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
787
+
788
+ for (int p_col=0; p_col < num_point; ++p_col)
789
+ {
790
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
791
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
792
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
793
+
794
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
795
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
796
+ *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
797
+ *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
798
+ *(cache_grad_attn_weight+threadIdx.x)=0;
799
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
800
+ {
801
+ ms_deform_attn_col2im_bilinear(
802
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
803
+ top_grad, weight, grad_value_ptr,
804
+ cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
805
+ }
806
+
807
+ __syncthreads();
808
+
809
+ for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
810
+ {
811
+ if (tid < s) {
812
+ const unsigned int xid1 = tid << 1;
813
+ const unsigned int xid2 = (tid + s) << 1;
814
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
815
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
816
+ cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
817
+ if (tid + (s << 1) < spre)
818
+ {
819
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
820
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
821
+ cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
822
+ }
823
+ }
824
+ __syncthreads();
825
+ }
826
+
827
+ if (tid == 0)
828
+ {
829
+ atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
830
+ atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
831
+ atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
832
+ }
833
+ __syncthreads();
834
+
835
+ data_weight_ptr += 1;
836
+ data_loc_w_ptr += 2;
837
+ grad_attn_weight += grad_weight_stride;
838
+ grad_sampling_loc += grad_loc_stride;
839
+ }
840
+ }
841
+ }
842
+ }
843
+
844
+
845
+ template <typename scalar_t>
846
+ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
847
+ const scalar_t *grad_col,
848
+ const scalar_t *data_value,
849
+ const int64_t *data_spatial_shapes,
850
+ const int64_t *data_level_start_index,
851
+ const scalar_t *data_sampling_loc,
852
+ const scalar_t *data_attn_weight,
853
+ const int batch_size,
854
+ const int spatial_size,
855
+ const int num_heads,
856
+ const int channels,
857
+ const int num_levels,
858
+ const int num_query,
859
+ const int num_point,
860
+ scalar_t *grad_value,
861
+ scalar_t *grad_sampling_loc,
862
+ scalar_t *grad_attn_weight)
863
+ {
864
+ CUDA_KERNEL_LOOP(index, n)
865
+ {
866
+ int _temp = index;
867
+ const int c_col = _temp % channels;
868
+ _temp /= channels;
869
+ const int sampling_index = _temp;
870
+ const int m_col = _temp % num_heads;
871
+ _temp /= num_heads;
872
+ const int q_col = _temp % num_query;
873
+ _temp /= num_query;
874
+ const int b_col = _temp;
875
+
876
+ const scalar_t top_grad = grad_col[index];
877
+
878
+ int data_weight_ptr = sampling_index * num_levels * num_point;
879
+ int data_loc_w_ptr = data_weight_ptr << 1;
880
+ const int grad_sampling_ptr = data_weight_ptr;
881
+ grad_sampling_loc += grad_sampling_ptr << 1;
882
+ grad_attn_weight += grad_sampling_ptr;
883
+ const int grad_weight_stride = 1;
884
+ const int grad_loc_stride = 2;
885
+ const int qid_stride = num_heads * channels;
886
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
887
+
888
+ for (int l_col=0; l_col < num_levels; ++l_col)
889
+ {
890
+ const int level_start_id = data_level_start_index[l_col];
891
+ const int spatial_h_ptr = l_col << 1;
892
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
893
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
894
+ const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
895
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
896
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
897
+
898
+ for (int p_col=0; p_col < num_point; ++p_col)
899
+ {
900
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
901
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
902
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
903
+
904
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
905
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
906
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
907
+ {
908
+ ms_deform_attn_col2im_bilinear_gm(
909
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
910
+ top_grad, weight, grad_value_ptr,
911
+ grad_sampling_loc, grad_attn_weight);
912
+ }
913
+ data_weight_ptr += 1;
914
+ data_loc_w_ptr += 2;
915
+ grad_attn_weight += grad_weight_stride;
916
+ grad_sampling_loc += grad_loc_stride;
917
+ }
918
+ }
919
+ }
920
+ }
921
+
922
+
923
+ template <typename scalar_t>
924
+ void ms_deformable_im2col_cuda(cudaStream_t stream,
925
+ const scalar_t* data_value,
926
+ const int64_t* data_spatial_shapes,
927
+ const int64_t* data_level_start_index,
928
+ const scalar_t* data_sampling_loc,
929
+ const scalar_t* data_attn_weight,
930
+ const int batch_size,
931
+ const int spatial_size,
932
+ const int num_heads,
933
+ const int channels,
934
+ const int num_levels,
935
+ const int num_query,
936
+ const int num_point,
937
+ scalar_t* data_col)
938
+ {
939
+ const int num_kernels = batch_size * num_query * num_heads * channels;
940
+ const int num_actual_kernels = batch_size * num_query * num_heads * channels;
941
+ const int num_threads = CUDA_NUM_THREADS;
942
+ ms_deformable_im2col_gpu_kernel<scalar_t>
943
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
944
+ 0, stream>>>(
945
+ num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
946
+ batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
947
+
948
+ cudaError_t err = cudaGetLastError();
949
+ if (err != cudaSuccess)
950
+ {
951
+ printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
952
+ }
953
+
954
+ }
955
+
956
+ template <typename scalar_t>
957
+ void ms_deformable_col2im_cuda(cudaStream_t stream,
958
+ const scalar_t* grad_col,
959
+ const scalar_t* data_value,
960
+ const int64_t * data_spatial_shapes,
961
+ const int64_t * data_level_start_index,
962
+ const scalar_t * data_sampling_loc,
963
+ const scalar_t * data_attn_weight,
964
+ const int batch_size,
965
+ const int spatial_size,
966
+ const int num_heads,
967
+ const int channels,
968
+ const int num_levels,
969
+ const int num_query,
970
+ const int num_point,
971
+ scalar_t* grad_value,
972
+ scalar_t* grad_sampling_loc,
973
+ scalar_t* grad_attn_weight)
974
+ {
975
+ const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
976
+ const int num_kernels = batch_size * num_query * num_heads * channels;
977
+ const int num_actual_kernels = batch_size * num_query * num_heads * channels;
978
+ if (channels > 1024)
979
+ {
980
+ if ((channels & 1023) == 0)
981
+ {
982
+ ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
983
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
984
+ num_threads*3*sizeof(scalar_t), stream>>>(
985
+ num_kernels,
986
+ grad_col,
987
+ data_value,
988
+ data_spatial_shapes,
989
+ data_level_start_index,
990
+ data_sampling_loc,
991
+ data_attn_weight,
992
+ batch_size,
993
+ spatial_size,
994
+ num_heads,
995
+ channels,
996
+ num_levels,
997
+ num_query,
998
+ num_point,
999
+ grad_value,
1000
+ grad_sampling_loc,
1001
+ grad_attn_weight);
1002
+ }
1003
+ else
1004
+ {
1005
+ ms_deformable_col2im_gpu_kernel_gm<scalar_t>
1006
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1007
+ 0, stream>>>(
1008
+ num_kernels,
1009
+ grad_col,
1010
+ data_value,
1011
+ data_spatial_shapes,
1012
+ data_level_start_index,
1013
+ data_sampling_loc,
1014
+ data_attn_weight,
1015
+ batch_size,
1016
+ spatial_size,
1017
+ num_heads,
1018
+ channels,
1019
+ num_levels,
1020
+ num_query,
1021
+ num_point,
1022
+ grad_value,
1023
+ grad_sampling_loc,
1024
+ grad_attn_weight);
1025
+ }
1026
+ }
1027
+ else{
1028
+ switch(channels)
1029
+ {
1030
+ case 1:
1031
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
1032
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1033
+ 0, stream>>>(
1034
+ num_kernels,
1035
+ grad_col,
1036
+ data_value,
1037
+ data_spatial_shapes,
1038
+ data_level_start_index,
1039
+ data_sampling_loc,
1040
+ data_attn_weight,
1041
+ batch_size,
1042
+ spatial_size,
1043
+ num_heads,
1044
+ channels,
1045
+ num_levels,
1046
+ num_query,
1047
+ num_point,
1048
+ grad_value,
1049
+ grad_sampling_loc,
1050
+ grad_attn_weight);
1051
+ break;
1052
+ case 2:
1053
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
1054
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1055
+ 0, stream>>>(
1056
+ num_kernels,
1057
+ grad_col,
1058
+ data_value,
1059
+ data_spatial_shapes,
1060
+ data_level_start_index,
1061
+ data_sampling_loc,
1062
+ data_attn_weight,
1063
+ batch_size,
1064
+ spatial_size,
1065
+ num_heads,
1066
+ channels,
1067
+ num_levels,
1068
+ num_query,
1069
+ num_point,
1070
+ grad_value,
1071
+ grad_sampling_loc,
1072
+ grad_attn_weight);
1073
+ break;
1074
+ case 4:
1075
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
1076
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1077
+ 0, stream>>>(
1078
+ num_kernels,
1079
+ grad_col,
1080
+ data_value,
1081
+ data_spatial_shapes,
1082
+ data_level_start_index,
1083
+ data_sampling_loc,
1084
+ data_attn_weight,
1085
+ batch_size,
1086
+ spatial_size,
1087
+ num_heads,
1088
+ channels,
1089
+ num_levels,
1090
+ num_query,
1091
+ num_point,
1092
+ grad_value,
1093
+ grad_sampling_loc,
1094
+ grad_attn_weight);
1095
+ break;
1096
+ case 8:
1097
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
1098
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1099
+ 0, stream>>>(
1100
+ num_kernels,
1101
+ grad_col,
1102
+ data_value,
1103
+ data_spatial_shapes,
1104
+ data_level_start_index,
1105
+ data_sampling_loc,
1106
+ data_attn_weight,
1107
+ batch_size,
1108
+ spatial_size,
1109
+ num_heads,
1110
+ channels,
1111
+ num_levels,
1112
+ num_query,
1113
+ num_point,
1114
+ grad_value,
1115
+ grad_sampling_loc,
1116
+ grad_attn_weight);
1117
+ break;
1118
+ case 16:
1119
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
1120
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1121
+ 0, stream>>>(
1122
+ num_kernels,
1123
+ grad_col,
1124
+ data_value,
1125
+ data_spatial_shapes,
1126
+ data_level_start_index,
1127
+ data_sampling_loc,
1128
+ data_attn_weight,
1129
+ batch_size,
1130
+ spatial_size,
1131
+ num_heads,
1132
+ channels,
1133
+ num_levels,
1134
+ num_query,
1135
+ num_point,
1136
+ grad_value,
1137
+ grad_sampling_loc,
1138
+ grad_attn_weight);
1139
+ break;
1140
+ case 32:
1141
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
1142
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1143
+ 0, stream>>>(
1144
+ num_kernels,
1145
+ grad_col,
1146
+ data_value,
1147
+ data_spatial_shapes,
1148
+ data_level_start_index,
1149
+ data_sampling_loc,
1150
+ data_attn_weight,
1151
+ batch_size,
1152
+ spatial_size,
1153
+ num_heads,
1154
+ channels,
1155
+ num_levels,
1156
+ num_query,
1157
+ num_point,
1158
+ grad_value,
1159
+ grad_sampling_loc,
1160
+ grad_attn_weight);
1161
+ break;
1162
+ case 64:
1163
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
1164
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1165
+ 0, stream>>>(
1166
+ num_kernels,
1167
+ grad_col,
1168
+ data_value,
1169
+ data_spatial_shapes,
1170
+ data_level_start_index,
1171
+ data_sampling_loc,
1172
+ data_attn_weight,
1173
+ batch_size,
1174
+ spatial_size,
1175
+ num_heads,
1176
+ channels,
1177
+ num_levels,
1178
+ num_query,
1179
+ num_point,
1180
+ grad_value,
1181
+ grad_sampling_loc,
1182
+ grad_attn_weight);
1183
+ break;
1184
+ case 128:
1185
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
1186
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1187
+ 0, stream>>>(
1188
+ num_kernels,
1189
+ grad_col,
1190
+ data_value,
1191
+ data_spatial_shapes,
1192
+ data_level_start_index,
1193
+ data_sampling_loc,
1194
+ data_attn_weight,
1195
+ batch_size,
1196
+ spatial_size,
1197
+ num_heads,
1198
+ channels,
1199
+ num_levels,
1200
+ num_query,
1201
+ num_point,
1202
+ grad_value,
1203
+ grad_sampling_loc,
1204
+ grad_attn_weight);
1205
+ break;
1206
+ case 256:
1207
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
1208
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1209
+ 0, stream>>>(
1210
+ num_kernels,
1211
+ grad_col,
1212
+ data_value,
1213
+ data_spatial_shapes,
1214
+ data_level_start_index,
1215
+ data_sampling_loc,
1216
+ data_attn_weight,
1217
+ batch_size,
1218
+ spatial_size,
1219
+ num_heads,
1220
+ channels,
1221
+ num_levels,
1222
+ num_query,
1223
+ num_point,
1224
+ grad_value,
1225
+ grad_sampling_loc,
1226
+ grad_attn_weight);
1227
+ break;
1228
+ case 512:
1229
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
1230
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1231
+ 0, stream>>>(
1232
+ num_kernels,
1233
+ grad_col,
1234
+ data_value,
1235
+ data_spatial_shapes,
1236
+ data_level_start_index,
1237
+ data_sampling_loc,
1238
+ data_attn_weight,
1239
+ batch_size,
1240
+ spatial_size,
1241
+ num_heads,
1242
+ channels,
1243
+ num_levels,
1244
+ num_query,
1245
+ num_point,
1246
+ grad_value,
1247
+ grad_sampling_loc,
1248
+ grad_attn_weight);
1249
+ break;
1250
+ case 1024:
1251
+ ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
1252
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1253
+ 0, stream>>>(
1254
+ num_kernels,
1255
+ grad_col,
1256
+ data_value,
1257
+ data_spatial_shapes,
1258
+ data_level_start_index,
1259
+ data_sampling_loc,
1260
+ data_attn_weight,
1261
+ batch_size,
1262
+ spatial_size,
1263
+ num_heads,
1264
+ channels,
1265
+ num_levels,
1266
+ num_query,
1267
+ num_point,
1268
+ grad_value,
1269
+ grad_sampling_loc,
1270
+ grad_attn_weight);
1271
+ break;
1272
+ default:
1273
+ if (channels < 64)
1274
+ {
1275
+ ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
1276
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1277
+ num_threads*3*sizeof(scalar_t), stream>>>(
1278
+ num_kernels,
1279
+ grad_col,
1280
+ data_value,
1281
+ data_spatial_shapes,
1282
+ data_level_start_index,
1283
+ data_sampling_loc,
1284
+ data_attn_weight,
1285
+ batch_size,
1286
+ spatial_size,
1287
+ num_heads,
1288
+ channels,
1289
+ num_levels,
1290
+ num_query,
1291
+ num_point,
1292
+ grad_value,
1293
+ grad_sampling_loc,
1294
+ grad_attn_weight);
1295
+ }
1296
+ else
1297
+ {
1298
+ ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
1299
+ <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1300
+ num_threads*3*sizeof(scalar_t), stream>>>(
1301
+ num_kernels,
1302
+ grad_col,
1303
+ data_value,
1304
+ data_spatial_shapes,
1305
+ data_level_start_index,
1306
+ data_sampling_loc,
1307
+ data_attn_weight,
1308
+ batch_size,
1309
+ spatial_size,
1310
+ num_heads,
1311
+ channels,
1312
+ num_levels,
1313
+ num_query,
1314
+ num_point,
1315
+ grad_value,
1316
+ grad_sampling_loc,
1317
+ grad_attn_weight);
1318
+ }
1319
+ }
1320
+ }
1321
+ cudaError_t err = cudaGetLastError();
1322
+ if (err != cudaSuccess)
1323
+ {
1324
+ printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
1325
+ }
1326
+
1327
+ }
difpoint/src/models/XPose/models/UniPose/ops/src/ms_deform_attn.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #pragma once
12
+
13
+ #include "cpu/ms_deform_attn_cpu.h"
14
+
15
+ #ifdef WITH_CUDA
16
+ #include "cuda/ms_deform_attn_cuda.h"
17
+ #endif
18
+
19
+
20
+ at::Tensor
21
+ ms_deform_attn_forward(
22
+ const at::Tensor &value,
23
+ const at::Tensor &spatial_shapes,
24
+ const at::Tensor &level_start_index,
25
+ const at::Tensor &sampling_loc,
26
+ const at::Tensor &attn_weight,
27
+ const int im2col_step)
28
+ {
29
+ if (value.type().is_cuda())
30
+ {
31
+ #ifdef WITH_CUDA
32
+ return ms_deform_attn_cuda_forward(
33
+ value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34
+ #else
35
+ AT_ERROR("Not compiled with GPU support");
36
+ #endif
37
+ }
38
+ AT_ERROR("Not implemented on the CPU");
39
+ }
40
+
41
+ std::vector<at::Tensor>
42
+ ms_deform_attn_backward(
43
+ const at::Tensor &value,
44
+ const at::Tensor &spatial_shapes,
45
+ const at::Tensor &level_start_index,
46
+ const at::Tensor &sampling_loc,
47
+ const at::Tensor &attn_weight,
48
+ const at::Tensor &grad_output,
49
+ const int im2col_step)
50
+ {
51
+ if (value.type().is_cuda())
52
+ {
53
+ #ifdef WITH_CUDA
54
+ return ms_deform_attn_cuda_backward(
55
+ value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56
+ #else
57
+ AT_ERROR("Not compiled with GPU support");
58
+ #endif
59
+ }
60
+ AT_ERROR("Not implemented on the CPU");
61
+ }
62
+
difpoint/src/models/XPose/models/UniPose/ops/src/vision.cpp ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #include "ms_deform_attn.h"
12
+
13
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14
+ m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
15
+ m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
16
+ }
difpoint/src/models/XPose/models/UniPose/ops/test.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import time
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.autograd import gradcheck
17
+
18
+ from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19
+
20
+
21
# Problem sizes shared by every check below. They are used as tensor dimensions:
# value is (N, S, M, D), sampling locations are (N, Lq, M, L, P, 2).
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2

# (height, width) of each feature level, kept on the GPU as int64.
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()

# Offset of each level inside the flattened value tensor: [0, H0*W0, ...].
_level_sizes = shapes.prod(1)
level_start_index = torch.cat((shapes.new_zeros((1, )), _level_sizes.cumsum(0)[:-1]))

# Total number of spatial positions over all levels, as a Python int.
S = sum((H * W).item() for H, W in shapes)


# Fixed seed so the random inputs are reproducible between runs.
torch.manual_seed(3)
29
+
30
+
31
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    """Compare the CUDA forward pass with the pure-PyTorch reference in float64.

    Random inputs are generated in float32 and converted to double for both
    implementations; the result of ``torch.allclose`` (default tolerances) and
    the maximum absolute / relative errors are printed.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights of each (query, head) sum to one over levels and points.
    attention_weights.div_(attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True))
    im2col_step = 2

    output_pytorch = ms_deform_attn_core_pytorch(
        value.double(),
        shapes,
        sampling_locations.double(),
        attention_weights.double(),
    ).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value.double(),
        shapes,
        level_start_index,
        sampling_locations.double(),
        attention_weights.double(),
        im2col_step,
    ).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45
+
46
+
47
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    """Compare the CUDA forward pass with the pure-PyTorch reference in float32.

    Uses looser tolerances (``rtol=1e-2``, ``atol=1e-3``) than the float64 check
    and prints the ``torch.allclose`` result plus the maximum absolute /
    relative errors.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights of each (query, head) sum to one over levels and points.
    attention_weights.div_(attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True))
    im2col_step = 2

    output_pytorch = ms_deform_attn_core_pytorch(
        value, shapes, sampling_locations, attention_weights
    ).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step
    ).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
61
+
62
+
63
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
    """Run ``torch.autograd.gradcheck`` on the CUDA op for a given channel count.

    Args:
        channels: size of the last dimension of ``value``.
        grad_value: whether ``value`` requires a gradient.
        grad_sampling_loc: whether the sampling locations require a gradient.
        grad_attn_weight: whether the attention weights require a gradient.

    The gradcheck result is printed together with the channel count.
    """
    value = torch.rand(N, S, M, channels).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights of each (query, head) sum to one over levels and points.
    attention_weights.div_(attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True))
    im2col_step = 2
    func = MSDeformAttnFunction.apply

    # Flags are set on the float32 tensors; the double copies below inherit them.
    value.requires_grad_(grad_value)
    sampling_locations.requires_grad_(grad_sampling_loc)
    attention_weights.requires_grad_(grad_attn_weight)

    gradcheck_inputs = (
        value.double(),
        shapes,
        level_start_index,
        sampling_locations.double(),
        attention_weights.double(),
        im2col_step,
    )
    gradok = gradcheck(func, gradcheck_inputs)

    print(f'* {gradok} check_gradient_numerical(D={channels})')
79
+
80
+
81
if __name__ == '__main__':
    # Forward checks first, in float64 and then float32.
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()

    # Gradient checks over a spread of channel counts, from small sizes up to
    # values above 1024.
    for channels in (30, 32, 64, 71, 1025, 2048, 3096):
        check_gradient_numerical(channels, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True)
87
+
88
+
89
+
difpoint/src/models/XPose/models/UniPose/position_encoding.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Conditional DETR
7
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
8
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
9
+ # ------------------------------------------------------------------------
10
+ # Copied from DETR (https://github.com/facebookresearch/detr)
11
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
12
+ # ------------------------------------------------------------------------
13
+
14
+ """
15
+ Various positional encodings for the transformer.
16
+ """
17
+ import math
18
+ import torch
19
+ from torch import nn
20
+
21
+ from ...util.misc import NestedTensor
22
+
23
+
24
class PositionEmbeddingSine(nn.Module):
    """
    Fixed sine/cosine position embedding for 2-D feature maps, in the spirit
    of the encoding from "Attention is all you need" extended to images.

    Args:
        num_pos_feats: Number of channels produced per axis (the output has
            ``2 * num_pos_feats`` channels).
        temperature: Base of the geometric frequency progression.
        normalize: If True, coordinates are rescaled to ``[0, scale]``.
        scale: Range used when normalising; defaults to ``2 * pi``. May only
            be given together with ``normalize=True``.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, tensor_list: NestedTensor):
        """Return the embedding for ``tensor_list`` laid out channel-first.

        Coordinates are cumulative counts of non-masked positions along
        axis 1 (rows) and axis 2 (columns) of ``tensor_list.mask``.
        """
        feats = tensor_list.tensors
        padding_mask = tensor_list.mask
        assert padding_mask is not None
        valid = ~padding_mask
        row_pos = valid.cumsum(1, dtype=torch.float32)
        col_pos = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Divide by the last cumulative count along each axis.
            row_pos = row_pos / (row_pos[:, -1:, :] + eps) * self.scale
            col_pos = col_pos / (col_pos[:, :, -1:] + eps) * self.scale

        channel_idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=feats.device)
        freq_div = self.temperature ** (2 * (channel_idx // 2) / self.num_pos_feats)

        # Row encoding first, then column encoding, matching the channel order.
        encoded = []
        for coord in (row_pos, col_pos):
            angles = coord[:, :, :, None] / freq_div
            sin_cos = torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4)
            encoded.append(sin_cos.flatten(3))
        return torch.cat(encoded, dim=3).permute(0, 3, 1, 2)
65
+
66
class PositionEmbeddingSineHW(nn.Module):
    """
    Sine/cosine position embedding for 2-D feature maps with an independent
    temperature for the height axis and the width axis.

    Args:
        num_pos_feats: Number of channels produced per axis.
        temperatureH: Frequency base used for the row (y) coordinate.
        temperatureW: Frequency base used for the column (x) coordinate.
        normalize: If True, coordinates are rescaled to ``[0, scale]``.
        scale: Range used when normalising; defaults to ``2 * pi``. May only
            be given together with ``normalize=True``.
    """
    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperatureH = temperatureH
        self.temperatureW = temperatureW
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def _axis_encoding(self, coord, temperature, device):
        """Encode one coordinate map with interleaved sin/cos channels."""
        channel_idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=device)
        freq_div = temperature ** (2 * (channel_idx // 2) / self.num_pos_feats)
        angles = coord[:, :, :, None] / freq_div
        return torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4).flatten(3)

    def forward(self, tensor_list: NestedTensor):
        """Return the embedding for ``tensor_list`` laid out channel-first.

        Coordinates are cumulative counts of non-masked positions along
        axis 1 (rows) and axis 2 (columns) of ``tensor_list.mask``.
        """
        feats = tensor_list.tensors
        padding_mask = tensor_list.mask
        assert padding_mask is not None
        valid = ~padding_mask
        row_pos = valid.cumsum(1, dtype=torch.float32)
        col_pos = valid.cumsum(2, dtype=torch.float32)

        if self.normalize:
            eps = 1e-6
            row_pos = row_pos / (row_pos[:, -1:, :] + eps) * self.scale
            col_pos = col_pos / (col_pos[:, :, -1:] + eps) * self.scale

        enc_x = self._axis_encoding(col_pos, self.temperatureW, feats.device)
        enc_y = self._axis_encoding(row_pos, self.temperatureH, feats.device)

        # Channel order is y-encoding followed by x-encoding.
        return torch.cat((enc_y, enc_x), dim=3).permute(0, 3, 1, 2)
113
+
114
class PositionEmbeddingLearned(nn.Module):
    """
    Learned absolute position embedding built from one embedding table for
    rows and one for columns. Each table holds 50 entries, so feature maps
    taller or wider than 50 cannot be indexed.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise both tables from a uniform distribution."""
        for table in (self.row_embed, self.col_embed):
            nn.init.uniform_(table.weight)

    def forward(self, tensor_list: NestedTensor):
        """Return a ``(batch, 2 * num_pos_feats, h, w)`` embedding.

        Only the spatial size, batch size and device of
        ``tensor_list.tensors`` are used; the mask is ignored.
        """
        feats = tensor_list.tensors
        height, width = feats.shape[-2:]
        col_ids = torch.arange(width, device=feats.device)
        row_ids = torch.arange(height, device=feats.device)
        col_vecs = self.col_embed(col_ids)
        row_vecs = self.row_embed(row_ids)
        # Tile column vectors down the rows and row vectors across the columns.
        col_grid = col_vecs.unsqueeze(0).repeat(height, 1, 1)
        row_grid = row_vecs.unsqueeze(1).repeat(1, width, 1)
        grid = torch.cat([col_grid, row_grid], dim=-1)
        return grid.permute(2, 0, 1).unsqueeze(0).repeat(feats.shape[0], 1, 1, 1)
140
+
141
+
142
def build_position_encoding(args):
    """Create the position-embedding module selected by ``args``.

    Args:
        args: Object exposing ``hidden_dim`` and ``position_embedding``; the
            sine variants additionally read ``pe_temperatureH`` and
            ``pe_temperatureW``.

    Returns:
        A ``PositionEmbeddingSineHW`` for ``'v2'``/``'sine'`` or a
        ``PositionEmbeddingLearned`` for ``'v3'``/``'learned'``, each built
        with ``hidden_dim // 2`` features.

    Raises:
        ValueError: If ``args.position_embedding`` is any other value.
    """
    half_dim = args.hidden_dim // 2
    kind = args.position_embedding

    if kind in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        return PositionEmbeddingSineHW(
            half_dim,
            temperatureH=args.pe_temperatureH,
            temperatureW=args.pe_temperatureW,
            normalize=True
        )

    if kind in ('v3', 'learned'):
        return PositionEmbeddingLearned(half_dim)

    raise ValueError(f"not supported {args.position_embedding}")
difpoint/src/models/XPose/models/UniPose/swin_transformer.py ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint as checkpoint
6
+ import numpy as np
7
+
8
+ from ...util.misc import NestedTensor
9
+ # from timm.models.layers import DropPath, to_2tuple, trunc_normal_
10
+ from src.models.util import DropPath, to_2tuple, trunc_normal_
11
+
12
+
13
+
14
class Mlp(nn.Module):
    """Two-layer perceptron: linear -> activation -> dropout -> linear -> dropout.

    Args:
        in_features: Size of the input's last axis.
        hidden_features: Width of the hidden layer; falls back to
            ``in_features`` when falsy.
        out_features: Size of the output's last axis; falls back to
            ``in_features`` when falsy.
        act_layer: Activation module class, instantiated without arguments.
        drop: Dropout probability shared by both dropout applications.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features
        # Creation order (fc1, act, fc2, drop) is kept so that parameter
        # initialisation and module ordering stay the same.
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the perceptron to the last axis of ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
33
+
34
+
35
def window_partition(x, window_size):
    """
    Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size
    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
    # Bring the two window-grid axes together before flattening them with the batch.
    tiled = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiled.view(-1, window_size, window_size, channels)
47
+
48
+
49
def window_reverse(windows, window_size, H, W):
    """
    Stitch windows back into a feature map; the inverse of a window split.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    rows, cols = H // window_size, W // window_size
    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
    # Interleave window rows/cols with in-window rows/cols, then merge them.
    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, H, W, -1)
63
+
64
+
65
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    A learnable bias, looked up by the relative offset between every pair of
    tokens inside a window, is added to the attention logits before softmax.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # `or` means any falsy qk_scale (None or 0) selects the default scale.
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        # One row per possible (dy, dx) offset, one column per head.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # Scale the row offset by the number of distinct column offsets so that
        # the sum below is a unique flat index into the bias table.
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        # Registered as a buffer: moves with the module but is not a parameter.
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        Returns:
            Tensor with the same (num_windows*B, N, C) shape as ``x``.
        """
        B_, N, C = x.shape
        # Single projection produces q, k and v; split into heads afterwards.
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        # Gather the per-pair bias from the table using the precomputed index.
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # The mask is shared across the batch: expose the window axis,
            # broadcast-add the mask over batch and heads, then fold back.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        # Merge the heads back into the channel axis before the output projection.
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
144
+
145
+
146
class SwinTransformerBlock(nn.Module):
    """ Swin Transformer Block.

    Applies (optionally shifted) window attention followed by an MLP, each
    wrapped in a residual connection with stochastic depth.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        # Stochastic depth is only instantiated when it would actually drop something.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # Spatial size of the incoming feature map; must be assigned by the
        # caller before every forward() call because forward() reads it.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature (read from self.H / self.W,
                not passed as arguments).
            mask_matrix: Attention mask for cyclic shift.
        Returns:
            Tensor of size (B, H*W, C).
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # pad feature maps to multiples of window size
        # F.pad takes pads for the last axis first: (C, C, W-left, W-right, H-top, H-bottom).
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            # Non-shifted blocks ignore the mask entirely.
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        # Crop away the padding added above.
        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x
244
+
245
+
246
class PatchMerging(nn.Module):
    """ Patch Merging Layer.

    Concatenates every 2x2 neighbourhood of tokens along the channel axis,
    normalises the result and projects it from ``4 * dim`` to ``2 * dim``.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """
    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        merged_dim = 4 * dim
        self.reduction = nn.Linear(merged_dim, 2 * dim, bias=False)
        self.norm = norm_layer(merged_dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
        """
        batch, num_tokens, channels = x.shape
        assert num_tokens == H * W, "input feature has wrong size"

        grid = x.view(batch, H, W, channels)

        # Odd sizes get one extra zero row/column so the 2x2 grouping is exact.
        if H % 2 == 1 or W % 2 == 1:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        # (row offset, column offset) of the four members of each 2x2 group.
        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
        quadrants = [grid[:, r::2, c::2, :] for r, c in offsets]  # each B H/2 W/2 C
        merged = torch.cat(quadrants, -1).view(batch, -1, 4 * channels)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
285
+
286
+
287
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Holds ``depth`` transformer blocks that alternate between non-shifted and
    shifted windows, followed by an optional downsampling module.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        # Even-indexed blocks use plain windows, odd-indexed blocks use windows
        # shifted by half a window. A per-block drop-path rate is only picked
        # when drop_path is a list (other sequence types are passed through whole).
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            A 6-tuple ``(x, H, W, x_next, H_next, W_next)``: the stage output
            with its resolution, followed by the tensor and resolution to feed
            into the next stage (the downsampled output when a downsample
            module exists, otherwise the same output again).
        """

        # calculate attention mask for SW-MSA
        # The mask is built on the padded resolution (rounded up to a multiple
        # of the window size), matching the padding each block applies.
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        # Three bands per axis: everything before the last window, the
        # unshifted part of the last window, and the final shift_size rows/cols.
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        # Label each of the 3x3 band intersections with a distinct id (0..8).
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        # Token pairs with different region ids get -100 added to their logits,
        # pairs within the same region get 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            # Blocks read their spatial size from attributes, not arguments.
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            # Resolution after downsampling, rounded up for odd sizes.
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
387
+
388
+
389
class PatchEmbed(nn.Module):
    """ Image to Patch Embedding.

    A strided convolution whose kernel and stride both equal the patch size,
    optionally followed by a normalisation over the embedding channels.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed a (B, C, H, W) image batch; returns a (B, embed_dim, Wh, Ww) tensor."""
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
        _, _, height, width = x.size()

        # Zero-pad on the right / bottom so both sides are multiples of the patch size.
        width_rem = width % patch_w
        if width_rem != 0:
            x = F.pad(x, (0, patch_w - width_rem))
        height_rem = height % patch_h
        if height_rem != 0:
            x = F.pad(x, (0, 0, 0, patch_h - height_rem))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm acts on the channel axis, so go token-major and back again.
        out_h, out_w = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dim, out_h, out_w)
429
+
430
+
431
class SwinTransformer(nn.Module):
    """ Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030
    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        dilation (bool): If True, the second-to-last stage is not downsampled either,
            so the final output is 16x downsampled; otherwise it is 32x downsampled.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 dilation=False,
                 use_checkpoint=False):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.dilation = dilation

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers
        self.layers = nn.ModuleList()
        # prepare downsample list: every stage but the last merges patches
        downsamplelist = [PatchMerging for i in range(self.num_layers)]
        downsamplelist[-1] = None
        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        if self.dilation:
            # Without the second-to-last merge the last stage keeps the
            # previous stage's channel width.
            downsamplelist[-2] = None
            num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=num_features[i_layer],
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                # slice of the global drop-path schedule belonging to this stage
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=downsamplelist[i_layer],
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        """Put the frozen parts in eval mode and stop their gradients.

        ``frozen_stages >= 0`` freezes the patch embedding, ``>= 1`` also the
        absolute position embedding (when present), and ``>= 2`` also the
        position dropout and the first ``frozen_stages - 1`` layers.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def forward_raw(self, x):
        """Run the backbone on a plain image tensor.

        Args:
            x: Image batch of size (B, in_chans, H, W).

        Returns:
            Tuple with one channel-first feature map per stage listed in
            ``out_indices``, in stage order.
        """
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out is this stage's output; x is what the next stage consumes.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs.append(out)
        # Example recorded by the original authors:
        # in:
        #   torch.Size([2, 3, 1024, 1024])
        # outs:
        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
        return tuple(outs)

    def forward(self, tensor_list: NestedTensor):
        """Run the backbone on a ``NestedTensor``.

        The feature maps come from ``forward_raw``; the input mask is resized
        to each map's spatial size and paired with it.

        Returns:
            Dict mapping the position of each output (0, 1, ...) to a
            ``NestedTensor`` holding the feature map and its boolean mask.
        """
        outs = self.forward_raw(tensor_list.tensors)

        # collect for nesttensors
        outs_dict = {}
        for idx, out_i in enumerate(outs):
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
            outs_dict[idx] = NestedTensor(out_i, mask)

        return outs_dict

    def train(self, mode=True):
        """Switch training mode while keeping the frozen stages frozen.

        Returns ``self`` like ``nn.Module.train`` does, so that chained calls
        such as ``model = model.eval()`` keep working.
        """
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
        return self
652
+
653
+
654
+
655
def build_swin_transformer(modelname, pretrain_img_size, **kw):
    """Build a ``SwinTransformer`` from a named preset.

    Args:
        modelname: One of the preset names checked by the assertion below.
        pretrain_img_size: Forwarded to ``SwinTransformer``.
        **kw: Extra constructor arguments; they override preset values of
            the same name.

    Returns:
        The constructed ``SwinTransformer``.
    """
    assert modelname in ['swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']

    # preset name -> (embed_dim, depths, num_heads, window_size)
    presets = {
        'swin_T_224_1k': (96, [2, 2, 6, 2], [3, 6, 12, 24], 7),
        'swin_B_224_22k': (128, [2, 2, 18, 2], [4, 8, 16, 32], 7),
        'swin_B_384_22k': (128, [2, 2, 18, 2], [4, 8, 16, 32], 12),
        'swin_L_224_22k': (192, [2, 2, 18, 2], [6, 12, 24, 48], 7),
        'swin_L_384_22k': (192, [2, 2, 18, 2], [6, 12, 24, 48], 12),
    }
    embed_dim, depths, num_heads, window_size = presets[modelname]
    config = {
        'embed_dim': embed_dim,
        'depths': depths,
        'num_heads': num_heads,
        'window_size': window_size,
    }
    config.update(kw)
    return SwinTransformer(pretrain_img_size=pretrain_img_size, **config)
694
+
695
if __name__ == "__main__":
    # Smoke test: build the largest preset with dilation and push two input
    # resolutions through the plain-tensor forward path. The interactive
    # ipdb breakpoint that used to sit between the two runs was removed so the
    # script runs unattended and needs no third-party debugger; the output
    # shapes are printed instead.
    model = build_swin_transformer('swin_L_384_22k', 384, dilation=True)
    for size in (1024, 384):
        x = torch.rand(2, 3, size, size)
        y = model.forward_raw(x)
        print(f"input {tuple(x.shape)} -> outputs {[tuple(o.shape) for o in y]}")
difpoint/src/models/XPose/models/UniPose/transformer_deformable.py ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Deformable DETR
7
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
8
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
9
+ # ------------------------------------------------------------------------
10
+ # Modified from DETR (https://github.com/facebookresearch/detr)
11
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
12
+ # ------------------------------------------------------------------------
13
+
14
+ import copy
15
+ import math
16
+ import torch
17
+ from torch import nn, Tensor
18
+ from torch.nn.init import xavier_uniform_, constant_, normal_
19
+ from typing import Optional
20
+
21
+ from ...util.misc import inverse_sigmoid
22
+ from .ops.modules import MSDeformAttn
23
+ from .utils import MLP, _get_activation_fn, gen_sineembed_for_position
24
+
25
class DeformableTransformer(nn.Module):
    """Multi-scale deformable encoder/decoder transformer.

    ``forward`` flattens the per-level feature maps, runs them through
    ``DeformableTransformerEncoder`` and then decodes with
    ``DeformableTransformerDecoder``. Decoder queries come from one of three
    sources, selected by ``two_stage`` / ``use_dab``: top-k encoder proposals,
    DAB-style query embeddings that carry their own reference points, or a
    learned query embedding whose reference points are predicted by a linear
    layer.
    """

    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu", return_intermediate_dec=False,
                 num_feature_levels=4, dec_n_points=4, enc_n_points=4,
                 two_stage=False, two_stage_num_proposals=300,
                 use_dab=False, high_dim_query_update=False, no_sine_embed=False):
        """Build the encoder, the decoder and the query-preparation heads.

        Args:
            d_model: Channel width of tokens and embeddings.
            nhead: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            dim_feedforward: Hidden width of the feed-forward sub-layers.
            dropout: Dropout probability used inside the layers.
            activation: Activation name passed to the layers.
            return_intermediate_dec: Forwarded to the decoder as
                ``return_intermediate``.
            num_feature_levels: Number of feature levels (also the number of
                rows of ``level_embed``).
            dec_n_points: Sampling points per level in the decoder layers.
            enc_n_points: Sampling points per level in the encoder layers.
            two_stage: If true, decoder queries are derived from encoder
                proposals.
            two_stage_num_proposals: Number of top-scoring proposals kept in
                two-stage mode.
            use_dab: If true, ``query_embed`` carries both content and
                reference points (see ``forward``).
            high_dim_query_update: Forwarded to the decoder.
            no_sine_embed: Forwarded to the decoder.
        """
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.two_stage = two_stage
        self.two_stage_num_proposals = two_stage_num_proposals
        self.use_dab = use_dab

        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, enc_n_points)
        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)

        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, dec_n_points)
        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec,
                                                    use_dab=use_dab, d_model=d_model, high_dim_query_update=high_dim_query_update, no_sine_embed=no_sine_embed)

        # One learned embedding per feature level; filled in _reset_parameters().
        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        if two_stage:
            self.enc_output = nn.Linear(d_model, d_model)
            self.enc_output_norm = nn.LayerNorm(d_model)
            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
        else:
            if not self.use_dab:
                # Predicts 2-d reference points from the query embedding.
                self.reference_points = nn.Linear(d_model, 2)

        self.high_dim_query_update = high_dim_query_update
        if high_dim_query_update:
            # NOTE(review): the condition rejects use_dab while the message says
            # it must be True -- confirm which of the two is intended.
            assert not self.use_dab, "use_dab must be True"

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise weights: Xavier for matrices, module-specific for MSDeformAttn."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # Let every deformable-attention module apply its own initialisation
        # after the generic Xavier pass above.
        for m in self.modules():
            if isinstance(m, MSDeformAttn):
                m._reset_parameters()
        if not self.two_stage and not self.use_dab:
            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
            constant_(self.reference_points.bias.data, 0.)
        normal_(self.level_embed)

    def get_proposal_pos_embed(self, proposals):
        """Return a sine/cosine embedding of unactivated proposal coordinates.

        Each coordinate is passed through a sigmoid, scaled by 2*pi and
        expanded to 128 interleaved sin/cos features; the per-coordinate
        features are then flattened into the last dimension.
        """
        num_pos_feats = 128
        temperature = 10000
        scale = 2 * math.pi

        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
        # N, L, 4
        proposals = proposals.sigmoid() * scale
        # N, L, 4, 128
        pos = proposals[:, :, :, None] / dim_t
        # N, L, 4, 64, 2
        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
        return pos

    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
        """Create one box proposal per encoder token for two-stage decoding.

        For every level a grid of cell centres, normalised by the number of
        unmasked rows/columns, is paired with a width/height of
        ``0.05 * 2**lvl``. Proposals are returned in inverse-sigmoid space;
        padded positions and proposals with any coordinate outside
        (0.01, 0.99) are set to ``inf``.

        Args:
            memory: Encoder output, unpacked as (N, S, C).
            memory_padding_mask: Boolean mask over the S flattened tokens;
                true marks padding.
            spatial_shapes: Iterable of (H, W) per level.

        Returns:
            ``(output_memory, output_proposals)``: the memory with invalid
            positions zeroed and passed through ``enc_output`` and
            ``enc_output_norm``, and the unactivated proposals.
        """
        N_, S_, C_ = memory.shape
        base_scale = 4.0  # not used below
        proposals = []
        _cur = 0  # start offset of the current level in the flattened sequence
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
            # Valid extent is measured along the first column / first row.
            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)

            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
            # +0.5 moves from cell index to cell centre before normalising.
            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
            proposals.append(proposal)
            _cur += (H_ * W_)
        output_proposals = torch.cat(proposals, 1)
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
        # Inverse sigmoid (logit) of the normalised proposals.
        output_proposals = torch.log(output_proposals / (1 - output_proposals))
        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

        output_memory = memory
        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
        output_memory = self.enc_output_norm(self.enc_output(output_memory))
        return output_memory, output_proposals

    def get_valid_ratio(self, mask):
        """Return the unmasked fraction of width and height for each sample.

        The counts are taken along the first column and first row of ``mask``
        (true marks padding). The result stacks ``(ratio_w, ratio_h)`` in the
        last dimension.
        """
        _, H, W = mask.shape
        valid_H = torch.sum(~mask[:, :, 0], 1)
        valid_W = torch.sum(~mask[:, 0, :], 1)
        valid_ratio_h = valid_H.float() / H
        valid_ratio_w = valid_W.float() / W
        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
        return valid_ratio

    def forward(self, srcs, masks, pos_embeds, query_embed=None):
        """
        Input:
            - srcs: List([bs, c, h, w])
            - masks: List([bs, h, w])
            - pos_embeds: per-level positional embeddings, flattened the same
              way as srcs
            - query_embed: query embedding; required unless two_stage is set
        Output:
            (hs, init_reference_out, inter_references_out,
             enc_outputs_class, enc_outputs_coord_unact); the last two are
            None unless two_stage is set.
        """
        assert self.two_stage or query_embed is not None

        # prepare input for encoder
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            bs, c, h, w = src.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)

            src = src.flatten(2).transpose(1, 2)                # bs, hw, c
            mask = mask.flatten(1)                              # bs, hw
            pos_embed = pos_embed.flatten(2).transpose(1, 2)    # bs, hw, c
            # Add the learned per-level embedding to the positional embedding.
            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)
        src_flatten = torch.cat(src_flatten, 1)     # bs, \sum{hxw}, c
        mask_flatten = torch.cat(mask_flatten, 1)   # bs, \sum{hxw}
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
        # Offset of each level's first token inside the flattened sequence.
        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)

        # prepare input for decoder
        bs, _, c = memory.shape
        if self.two_stage:
            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)

            # hack implementation for two-stage Deformable DETR
            # (class_embed / bbox_embed are set to None in the decoder's
            # constructor, so they must be attached to the decoder from
            # outside before this branch can run)
            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals

            topk = self.two_stage_num_proposals
            # Rank proposals by the score of class channel 0.
            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
            topk_coords_unact = topk_coords_unact.detach()
            reference_points = topk_coords_unact.sigmoid()
            init_reference_out = reference_points
            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
        elif self.use_dab:
            # query_embed packs content (first d_model channels) and
            # unactivated reference points (remaining channels).
            reference_points = query_embed[..., self.d_model:].sigmoid()
            tgt = query_embed[..., :self.d_model]
            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
            init_reference_out = reference_points
        else:
            query_embed, tgt = torch.split(query_embed, c, dim=1)
            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
            reference_points = self.reference_points(query_embed).sigmoid()
            # bs, num_queries, 2
            init_reference_out = reference_points

        # decoder
        hs, inter_references = self.decoder(tgt, reference_points, memory,
                                            spatial_shapes, level_start_index, valid_ratios,
                                            query_pos=query_embed if not self.use_dab else None,
                                            src_padding_mask=mask_flatten)

        inter_references_out = inter_references
        if self.two_stage:
            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact
        return hs, init_reference_out, inter_references_out, None, None
213
+
214
+
215
class DeformableTransformerEncoderLayer(nn.Module):
    """One encoder layer: deformable self-attention, FFN, optional channel attention.

    Each sub-layer is applied post-norm: residual add, then LayerNorm.
    """

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 add_channel_attention=False,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 ):
        """Create the attention, feed-forward and optional channel-attention parts.

        Args:
            d_model: Token channel width.
            d_ffn: Hidden width of the feed-forward network.
            dropout: Dropout probability for all dropout modules.
            activation: Activation name resolved by ``_get_activation_fn``.
            n_levels: Number of feature levels given to the attention module.
            n_heads: Number of attention heads.
            n_points: Sampling points (or boxes) per level.
            add_channel_attention: Enable the extra 'dyrelu' + LayerNorm stage.
            use_deformable_box_attn: Use the box-attention variant instead of
                ``MSDeformAttn``.
            box_attn_type: ``used_func`` argument of the box-attention variant.
        """
        super().__init__()

        # --- self attention ---
        self.self_attn = (
            MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
            if use_deformable_box_attn
            else MSDeformAttn(d_model, n_levels, n_heads, n_points)
        )
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # --- feed-forward network ---
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # --- optional channel attention ---
        self.add_channel_attention = add_channel_attention
        if add_channel_attention:
            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
            self.norm_channel = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply the feed-forward network with residual connection and LayerNorm."""
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(projected))

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
        """Run the layer on ``src``.

        The query is ``src`` plus ``pos`` (when given); the value input to the
        attention module is ``src`` without the positional embedding.
        """
        # self attention
        query = self.with_pos_embed(src, pos)
        attended = self.self_attn(query, reference_points, src, spatial_shapes, level_start_index, key_padding_mask)
        src = self.norm1(src + self.dropout1(attended))

        # ffn
        src = self.forward_ffn(src)

        # channel attn
        if self.add_channel_attention:
            src = self.norm_channel(src + self.activ_channel(src))

        return src
273
+
274
+
275
class DeformableTransformerEncoder(nn.Module):
    """Stack of cloned encoder layers that share one set of reference points."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        """Clone ``encoder_layer`` ``num_layers`` times.

        With ``num_layers <= 0`` the stack is an empty plain list and
        ``forward`` only applies ``norm`` (if any).
        """
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers) if num_layers > 0 else []
        self.num_layers = num_layers
        self.norm = norm

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Build the per-token reference points for every feature level.

        Cell centres of each (H, W) grid are normalised by the valid extent of
        that level and then rescaled by the valid ratios of all levels.

        Returns:
            Tensor indexed as [bs, sum(hi*wi), num_level, 2] with (x, y) in
            the last dimension.
        """
        per_level = []
        for level, (height, width) in enumerate(spatial_shapes):
            rows = torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device)
            cols = torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device)
            ys, xs = torch.meshgrid(rows, cols)
            ys = ys.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
            xs = xs.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
            per_level.append(torch.stack((xs, ys), -1))
        stacked = torch.cat(per_level, 1)
        return stacked[:, :, None] * valid_ratios[:, None]

    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
        """
        Input:
            - src: [bs, sum(hi*wi), 256]
            - spatial_shapes: h,w of each level [num_level, 2]
            - level_start_index: [num_level] start point of level in sum(hi*wi).
            - valid_ratios: [bs, num_level, 2]
            - pos: pos embed for src. [bs, sum(hi*wi), 256]
            - padding_mask: [bs, sum(hi*wi)]
        Intermediate:
            - reference_points: [bs, sum(hi*wi), num_level, 2]
        """
        hidden = src  # bs, sum(hi*wi), 256
        if self.num_layers > 0:
            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
        for layer in self.layers:
            hidden = layer(hidden, pos, reference_points, spatial_shapes, level_start_index, padding_mask)

        if self.norm is None:
            return hidden
        return self.norm(hidden)
325
+
326
+
327
class DeformableTransformerDecoderLayer(nn.Module):
    """One decoder layer made of self-attention, deformable cross-attention and FFN.

    The three sub-layers run in the order given by ``module_seq``; each is
    applied post-norm (residual add, then LayerNorm). Tensors handled by this
    layer are sequence-first, as noted on the ``forward`` parameters.
    """

    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 key_aware_type=None,
                 decoder_sa_type='sa',
                 module_seq=('sa', 'ca', 'ffn'),
                 ):
        """Create the sub-layers.

        Args:
            d_model: Token channel width.
            d_ffn: Hidden width of the feed-forward network.
            dropout: Dropout probability for all dropout modules.
            activation: Activation name resolved by ``_get_activation_fn``.
            n_levels: Number of feature levels for deformable attention.
            n_heads: Number of attention heads.
            n_points: Sampling points (or boxes) per level.
            use_deformable_box_attn: Use the box-attention variant for
                cross-attention instead of ``MSDeformAttn``.
            box_attn_type: ``used_func`` argument of the box-attention variant.
            key_aware_type: ``None``, ``'mean'`` or ``'proj_mean'``; see
                ``forward_ca``.
            decoder_sa_type: ``'sa'``, ``'ca_label'`` or ``'ca_content'``.
                The default is ``'sa'``; the previous default (``'ca'``) was
                rejected by the assertion below, so the layer could not be
                built without passing this argument explicitly.
            module_seq: Permutation of ``'sa'``, ``'ca'`` and ``'ffn'`` giving
                the execution order.

        Raises:
            AssertionError: If ``module_seq`` is not a permutation of the
                three sub-layer names or ``decoder_sa_type`` is unknown.
        """
        super().__init__()
        # Copy so that instances never share (or mutate) the caller's sequence.
        self.module_seq = list(module_seq)
        assert sorted(self.module_seq) == ['ca', 'ffn', 'sa']

        # cross attention
        if use_deformable_box_attn:
            self.cross_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
        else:
            self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

        self.key_aware_type = key_aware_type
        # NOTE(review): never assigned a module in this class, so the
        # 'proj_mean' mode only works if one is attached from outside.
        self.key_aware_proj = None
        self.decoder_sa_type = decoder_sa_type
        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']

        if decoder_sa_type == 'ca_content':
            # In this mode the "self-attention" slot attends to the memory.
            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)

    def rm_self_attn_modules(self):
        """Drop the self-attention sub-layer; ``forward_sa`` becomes a no-op."""
        self.self_attn = None
        self.dropout2 = None
        self.norm2 = None

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        """Apply the feed-forward network with residual connection and LayerNorm."""
        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_sa(self,
                   # for tgt
                   tgt: Optional[Tensor],  # nq, bs, d_model
                   tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                   tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                   tgt_key_padding_mask: Optional[Tensor] = None,
                   tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                   # for memory
                   memory: Optional[Tensor] = None,  # hw, bs, d_model
                   memory_key_padding_mask: Optional[Tensor] = None,
                   memory_level_start_index: Optional[Tensor] = None,  # num_levels
                   memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                   memory_pos: Optional[Tensor] = None,  # pos for memory

                   # sa
                   self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                   cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                   ):
        """Apply the self-attention sub-layer selected by ``decoder_sa_type``.

        Returns ``tgt`` unchanged when the self-attention modules were removed
        with ``rm_self_attn_modules``.

        Raises:
            NotImplementedError: If ``decoder_sa_type`` is not a known mode.
        """
        if self.self_attn is None:
            return tgt

        if self.decoder_sa_type == 'sa':
            q = k = self.with_pos_embed(tgt, tgt_query_pos)
            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
        elif self.decoder_sa_type == 'ca_label':
            # NOTE(review): `label_embedding` is not created in __init__;
            # presumably attached by the owning model -- confirm.
            bs = tgt.shape[1]
            k = v = self.label_embedding.weight[:, None, :].repeat(1, bs, 1)
            tgt2 = self.self_attn(tgt, k, v, attn_mask=self_attn_mask)[0]
        elif self.decoder_sa_type == 'ca_content':
            # The deformable attention call is made batch-first, hence the transposes.
            tgt2 = self.self_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
                                  tgt_reference_points.transpose(0, 1).contiguous(),
                                  memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
        else:
            raise NotImplementedError("Unknown decoder_sa_type {}".format(self.decoder_sa_type))

        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        return tgt

    def forward_ca(self,
                   # for tgt
                   tgt: Optional[Tensor],  # nq, bs, d_model
                   tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                   tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                   tgt_key_padding_mask: Optional[Tensor] = None,
                   tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                   # for memory
                   memory: Optional[Tensor] = None,  # hw, bs, d_model
                   memory_key_padding_mask: Optional[Tensor] = None,
                   memory_level_start_index: Optional[Tensor] = None,  # num_levels
                   memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                   memory_pos: Optional[Tensor] = None,  # pos for memory

                   # sa
                   self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                   cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                   ):
        """Apply deformable cross-attention from the queries to ``memory``.

        When ``key_aware_type`` is set, the mean of the memory (optionally
        projected by ``key_aware_proj``) over dimension 0 is added to ``tgt``
        first.

        Raises:
            NotImplementedError: If ``key_aware_type`` is not a known mode.
        """
        if self.key_aware_type is not None:
            if self.key_aware_type == 'mean':
                tgt = tgt + memory.mean(0, keepdim=True)
            elif self.key_aware_type == 'proj_mean':
                tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)
            else:
                raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type))
        # The deformable attention call is made batch-first, hence the transposes.
        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
                               tgt_reference_points.transpose(0, 1).contiguous(),
                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        return tgt

    def forward(self,
                # for tgt
                tgt: Optional[Tensor],  # nq, bs, d_model
                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                tgt_key_padding_mask: Optional[Tensor] = None,
                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                # for memory
                memory: Optional[Tensor] = None,  # hw, bs, d_model
                memory_key_padding_mask: Optional[Tensor] = None,
                memory_level_start_index: Optional[Tensor] = None,  # num_levels
                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                memory_pos: Optional[Tensor] = None,  # pos for memory

                # sa
                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                ):
        """Run the sub-layers in ``module_seq`` order and return the updated ``tgt``.

        Raises:
            ValueError: If ``module_seq`` contains an unknown sub-layer name.
        """
        # The attention sub-layers take the same argument set.
        shared = dict(
            tgt_query_pos=tgt_query_pos,
            tgt_query_sine_embed=tgt_query_sine_embed,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_reference_points=tgt_reference_points,
            memory=memory,
            memory_key_padding_mask=memory_key_padding_mask,
            memory_level_start_index=memory_level_start_index,
            memory_spatial_shapes=memory_spatial_shapes,
            memory_pos=memory_pos,
            self_attn_mask=self_attn_mask,
            cross_attn_mask=cross_attn_mask,
        )

        for funcname in self.module_seq:
            if funcname == 'ffn':
                tgt = self.forward_ffn(tgt)
            elif funcname == 'ca':
                tgt = self.forward_ca(tgt, **shared)
            elif funcname == 'sa':
                tgt = self.forward_sa(tgt, **shared)
            else:
                raise ValueError('unknown funcname {}'.format(funcname))

        return tgt
512
+
513
+
514
+
515
class DeformableTransformerDecoder(nn.Module):
    """Stack of decoder layers with optional iterative reference-point refinement.

    The decoder keeps its tensors batch-first ([bs, nq, ...]); the decoder
    layers document their inputs as sequence-first, so tensors are transposed
    at the layer boundary.
    """

    def __init__(self, decoder_layer, num_layers, return_intermediate=False, use_dab=False, d_model=256, query_dim=4,
                 high_dim_query_update=False, no_sine_embed=False):
        """Clone ``decoder_layer`` and create the DAB query heads if requested.

        Args:
            decoder_layer: Layer module to clone ``num_layers`` times.
            num_layers: Number of decoder layers.
            return_intermediate: Must be true; ``forward`` always returns the
                stacked per-layer outputs.
            use_dab: Derive the positional query from the reference points
                instead of taking ``query_pos`` from the caller.
            d_model: Token channel width.
            query_dim: Number of leading ``bbox_embed`` output channels that
                are treated as reference-point deltas.
            high_dim_query_update: Accepted so the transformer can forward its
                option; the feature itself is not implemented here.
            no_sine_embed: Accepted so the transformer can forward its option;
                the feature itself is not implemented here.

        Raises:
            NotImplementedError: If ``high_dim_query_update`` or
                ``no_sine_embed`` is enabled.
            AssertionError: If ``return_intermediate`` is false.
        """
        super().__init__()
        # Fail loudly instead of silently ignoring an option that would change
        # the decoder's behaviour.
        if high_dim_query_update or no_sine_embed:
            raise NotImplementedError(
                "high_dim_query_update / no_sine_embed are not supported by this decoder")
        self.high_dim_query_update = high_dim_query_update
        self.no_sine_embed = no_sine_embed

        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate
        assert return_intermediate
        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR:
        # both heads stay None here and are expected to be attached from outside.
        self.bbox_embed = None
        self.class_embed = None
        self.use_dab = use_dab
        self.d_model = d_model
        self.query_dim = query_dim
        if use_dab:
            self.query_scale = MLP(d_model, d_model, d_model, 2)
            self.ref_point_head = MLP(2 * d_model, d_model, d_model, 2)

    def forward(self, tgt, reference_points, src, src_spatial_shapes,
                src_level_start_index, src_valid_ratios,
                query_pos=None, src_padding_mask=None):
        """Decode ``tgt`` against the encoder memory ``src``.

        Args:
            tgt: Query content, batch-first.
            reference_points: Normalised reference points whose last dimension
                is 2 or 4.
            src: Encoder memory, batch-first.
            src_spatial_shapes: (h, w) of each feature level.
            src_level_start_index: Start offset of each level in ``src``.
            src_valid_ratios: Valid ratios per sample and level.
            query_pos: Positional query; must be None when ``use_dab`` is set.
            src_padding_mask: Padding mask over the memory tokens.

        Returns:
            ``(outputs, reference_points)``: the stacked per-layer outputs and
            the stacked reference points (the initial ones plus, when
            ``bbox_embed`` is set, the refined points of every layer except
            the last).
        """
        output = tgt
        if self.use_dab:
            assert query_pos is None

        intermediate = []
        intermediate_reference_points = [reference_points]
        for layer_id, layer in enumerate(self.layers):
            if reference_points.shape[-1] == 4:
                reference_points_input = reference_points[:, :, None] \
                    * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None]  # bs, nq, 4, 4
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]

            if self.use_dab:
                query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # bs, nq, 256*2
                raw_query_pos = self.ref_point_head(query_sine_embed)  # bs, nq, 256
                pos_scale = self.query_scale(output) if layer_id != 0 else 1
                query_pos = pos_scale * raw_query_pos

            # Pass everything by keyword: the layer's forward() has extra
            # parameters between these, so a positional call binds the
            # arguments to the wrong names. The layer documents its tensors as
            # sequence-first, hence the transposes in and out.
            output = layer(
                tgt=output.transpose(0, 1),
                tgt_query_pos=None if query_pos is None else query_pos.transpose(0, 1),
                tgt_reference_points=reference_points_input.transpose(0, 1),
                memory=src.transpose(0, 1),
                memory_key_padding_mask=src_padding_mask,
                memory_level_start_index=src_level_start_index,
                memory_spatial_shapes=src_spatial_shapes,
            ).transpose(0, 1)

            # hack implementation for iterative bounding box refinement
            # NOTE(review): bbox_embed is called as a single module here, while
            # the transformer's two-stage branch indexes it per layer --
            # confirm which container the owning model attaches.
            if self.bbox_embed is not None:
                box_holder = self.bbox_embed(output)
                box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points)
                new_reference_points = box_holder[..., :self.query_dim].sigmoid()
                # Detach so gradients do not flow through the refinement chain.
                reference_points = new_reference_points.detach()
                if layer_id != self.num_layers - 1:
                    intermediate_reference_points.append(new_reference_points)

            intermediate.append(output)

        return torch.stack(intermediate), torch.stack(intermediate_reference_points)
572
+
573
+
574
def _get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones
576
+
577
+
578
def build_deforamble_transformer(args):
    """Construct a ``DeformableTransformer`` from an argument namespace.

    Reads ``hidden_dim``, ``nheads``, ``enc_layers``, ``dec_layers``,
    ``dim_feedforward``, ``dropout``, ``num_queries`` and the ``ddetr_*``
    options from ``args``. The activation is fixed to ``"relu"`` and
    intermediate decoder outputs are always requested.
    """
    settings = dict(
        d_model=args.hidden_dim,
        nhead=args.nheads,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation="relu",
        return_intermediate_dec=True,
        num_feature_levels=args.ddetr_num_feature_levels,
        dec_n_points=args.ddetr_dec_n_points,
        enc_n_points=args.ddetr_enc_n_points,
        two_stage=args.ddetr_two_stage,
        two_stage_num_proposals=args.num_queries,
        use_dab=args.ddetr_use_dab,
        high_dim_query_update=args.ddetr_high_dim_query_update,
        no_sine_embed=args.ddetr_no_sine_embed,
    )
    return DeformableTransformer(**settings)
difpoint/src/models/XPose/models/UniPose/transformer_vanilla.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ """
4
+ DETR Transformer class.
5
+
6
+ Copy-paste from torch.nn.Transformer with modifications:
7
+ * positional encodings are passed in MHattention
8
+ * extra LN at the end of encoder is removed
9
+ * decoder returns a stack of activations from all decoding layers
10
+ """
11
+ import torch
12
+ from torch import Tensor, nn
13
+ from typing import List, Optional
14
+
15
+ from .utils import _get_activation_fn, _get_clones
16
+
17
+
18
class TextTransformer(nn.Module):
    """Stack of ``TransformerEncoderLayer`` modules applied to text tokens."""

    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
        """Create ``num_layers`` clones of a single encoder layer.

        Args:
            num_layers: Number of encoder layers.
            d_model: Token channel width.
            nheads: Number of attention heads.
            dim_feedforward: Hidden width of the feed-forward network.
            dropout: Dropout probability inside each layer.
        """
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.nheads = nheads
        self.dim_feedforward = dim_feedforward
        # No final normalisation is configured; forward() skips it while None.
        self.norm = None

        prototype = TransformerEncoderLayer(
            d_model=d_model,
            nhead=nheads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.layers = _get_clones(prototype, num_layers)

    def forward(self, memory_text:torch.Tensor, text_attention_mask:torch.Tensor):
        """Encode the text tokens.

        Args:
            memory_text: bs, num_token, d_model
            text_attention_mask: bs, num_token; handed to every layer as
                ``src_key_padding_mask``

        Returns:
            output: bs, num_token, d_model
        """
        # The layers work sequence-first, so swap batch and token dimensions.
        hidden = memory_text.transpose(0, 1)

        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_key_padding_mask=text_attention_mask)

        if self.norm is not None:
            hidden = self.norm(hidden)

        # Back to batch-first for the caller.
        return hidden.transpose(0, 1)
54
+
55
+
56
+
57
+
58
class TransformerEncoderLayer(nn.Module):
    """Post-norm transformer encoder layer: self-attention followed by an FFN."""

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False):
        """Create the attention and feed-forward sub-layers.

        Args:
            d_model: Token channel width.
            nhead: Number of attention heads.
            dim_feedforward: Hidden width of the feed-forward network.
            dropout: Dropout probability for attention and residual branches.
            activation: Activation name resolved by ``_get_activation_fn``.
            normalize_before: Stored on the instance; ``forward`` always
                applies the post-norm ordering.
        """
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before
        self.nhead = nhead

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Run self-attention and the feed-forward network on ``src``.

        Args:
            src: Input tokens, sequence-first (``src.shape[1]`` is the batch
                size used for the mask check below).
            src_mask: Optional attention mask. A 3-d mask whose first
                dimension equals the batch size is tiled ``nhead`` times
                before being given to the attention module.
            src_key_padding_mask: Accepted for interface compatibility; it is
                not passed to the attention module.
            pos: Optional positional embedding added to queries and keys.

        Returns:
            Tensor with the same shape as ``src``.
        """
        # repeat attn mask
        # ``src_mask`` defaults to None, so it must be checked before its
        # shape is inspected; without a mask, attention is unrestricted.
        if src_mask is not None and src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
            # bs, num_q, num_k
            # NOTE(review): `repeat` tiles the whole batch nhead times; verify
            # that this matches the batch/head ordering nn.MultiheadAttention
            # expects for 3-d masks when bs > 1.
            src_mask = src_mask.repeat(self.nhead, 1, 1)

        q = k = self.with_pos_embed(src, pos)

        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]

        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
102
+
difpoint/src/models/XPose/models/UniPose/unipose.py ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
7
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ import os
10
+ import copy
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import nn
14
+ from typing import List
15
+
16
+ from ...util.keypoint_ops import keypoint_xyzxyz_to_xyxyzz
17
+ from ...util.misc import NestedTensor, nested_tensor_from_tensor_list,inverse_sigmoid
18
+
19
+ from .utils import MLP
20
+ from .backbone import build_backbone
21
+ from ..registry import MODULE_BUILD_FUNCS
22
+ from .mask_generate import prepare_for_mask, post_process
23
+ from .deformable_transformer import build_deformable_transformer
24
+
25
+
26
class UniPose(nn.Module):
    """Detector that predicts boxes, text-matched class logits and keypoints.

    The module wraps a backbone and a deformable transformer, projects
    pre-computed text embeddings (512-d input) to the transformer width, and
    attaches box / class / keypoint prediction heads that are also shared with
    the transformer decoder.
    """

    def __init__(self, backbone, transformer, num_classes, num_queries,
                 aux_loss=False, iter_update=False,
                 query_dim=2,
                 random_refpoints_xy=False,
                 fix_refpoints_hw=-1,
                 num_feature_levels=1,
                 nheads=8,
                 # two stage
                 two_stage_type='no',  # ['no', 'standard']
                 two_stage_add_query_num=0,
                 dec_pred_class_embed_share=True,
                 dec_pred_bbox_embed_share=True,
                 two_stage_class_embed_share=True,
                 two_stage_bbox_embed_share=True,
                 decoder_sa_type='sa',
                 num_patterns=0,
                 dn_number=100,
                 dn_box_noise_scale=0.4,
                 dn_label_noise_ratio=0.5,
                 dn_labelbook_size=100,
                 use_label_enc=True,

                 text_encoder_type='bert-base-uncased',

                 binary_query_selection=False,
                 use_cdn=True,
                 sub_sentence_present=True,
                 num_body_points=68,
                 num_box_decoder_layers=2,
                 ):
        """Initialize the model and wire the prediction heads into the decoder.

        Parameters:
            backbone: feature extractor module; ``backbone.num_channels`` is read
                to size the input projections. See backbone.py
            transformer: transformer module; ``d_model``, ``num_decoder_layers``
                and ``decoder`` are read from it.
            num_classes: stored on the module; also sizes ``label_embedding``
                when ``decoder_sa_type == 'ca_label'``.
            num_queries: stored on the module; used in ``forward`` to shape the
                zero keypoint placeholder of the box-only decoder layers.
            aux_loss: stored and later handed to ``post_process``.
            iter_update: must be truthy (asserted).
            query_dim: must be 4 (asserted).
            random_refpoints_xy: see ``init_ref_points``.
            fix_refpoints_hw: -1 (default): learn w and h for each box separately
                              >0 : given fixed number
                              -2 : learn a shared w and h
            num_feature_levels: number of feature levels fed to the transformer;
                values > 1 add one 1x1 projection per backbone output plus
                stride-2 3x3 convolutions for any extra levels.
            nheads: stored on the module.
            two_stage_type: 'no' or 'standard' (asserted). Must be 'no' when
                ``num_feature_levels`` is 1.
            two_stage_add_query_num: when > 0 (and two-stage is enabled),
                ``init_ref_points`` is called with this value.
            dec_pred_class_embed_share / dec_pred_bbox_embed_share: share one head
                instance across decoder layers instead of deep-copying it.
            two_stage_class_embed_share / two_stage_bbox_embed_share: reuse the
                decoder head for the encoder output instead of a deep copy.
            decoder_sa_type: one of 'sa', 'ca_label', 'ca_content' (asserted).
            num_patterns, dn_number, dn_box_noise_scale, dn_label_noise_ratio,
            dn_labelbook_size, use_cdn, binary_query_selection,
            sub_sentence_present: stored on the module; ``dn_labelbook_size + 1``
                is the size of the ``label_enc`` embedding table.
            use_label_enc: must be truthy; False raises NotImplementedError.
            text_encoder_type: accepted but not read anywhere in this class.
            num_body_points: keypoints per instance; also pushed to the decoder.
            num_box_decoder_layers: number of leading decoder layers that only
                predict boxes; keypoints are predicted from the layers after.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim = transformer.d_model
        self.num_feature_levels = num_feature_levels
        self.nheads = nheads
        self.use_label_enc = use_label_enc
        if use_label_enc:
            self.label_enc = nn.Embedding(dn_labelbook_size + 1, hidden_dim)
        else:
            raise NotImplementedError
            # Unreachable: the raise above always fires first.
            self.label_enc = None
        self.max_text_len = 256
        self.binary_query_selection = binary_query_selection
        self.sub_sentence_present = sub_sentence_present

        # setting query dim
        self.query_dim = query_dim
        assert query_dim == 4
        self.random_refpoints_xy = random_refpoints_xy
        self.fix_refpoints_hw = fix_refpoints_hw

        # for dn training
        self.num_patterns = num_patterns
        self.dn_number = dn_number
        self.dn_box_noise_scale = dn_box_noise_scale
        self.dn_label_noise_ratio = dn_label_noise_ratio
        self.dn_labelbook_size = dn_labelbook_size
        self.use_cdn = use_cdn

        # Project 512-d text embeddings (objects / keypoints) to hidden_dim.
        self.projection = MLP(512, hidden_dim, hidden_dim, 3)

        self.projection_kpt = MLP(512, hidden_dim, hidden_dim, 3)

        # NOTE(review): `device` is only referenced by commented-out upstream code
        # that loaded a CLIP "ViT-B/32" model and froze its visual parameters.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.pos_proj = nn.Linear(hidden_dim, 768)
        self.padding = nn.Embedding(1, 768)

        # prepare input projection layers
        if num_feature_levels > 1:
            num_backbone_outs = len(backbone.num_channels)
            input_proj_list = []
            # One 1x1 conv + GroupNorm per backbone output.
            for _ in range(num_backbone_outs):
                in_channels = backbone.num_channels[_]
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
            # Extra levels: stride-2 3x3 conv. The first one consumes the last
            # backbone channel count, later ones consume hidden_dim.
            for _ in range(num_feature_levels - num_backbone_outs):
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
                in_channels = hidden_dim
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            assert two_stage_type == 'no', "two_stage_type should be no if num_feature_levels=1 !!!"
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                )])

        self.backbone = backbone
        self.aux_loss = aux_loss
        self.box_pred_damping = box_pred_damping = None

        self.iter_update = iter_update
        assert iter_update, "Why not iter_update?"

        # prepare pred layers
        self.dec_pred_class_embed_share = dec_pred_class_embed_share
        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
        # prepare class & box embed
        _class_embed = ContrastiveAssign()

        # Box head: last layer zero-initialised, so the initial delta is zero.
        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)

        # Keypoint xy head (zero-initialised last layer) and keypoint hw head
        # (left at its default initialisation).
        _pose_embed = MLP(hidden_dim, hidden_dim, 2, 3)
        _pose_hw_embed = MLP(hidden_dim, hidden_dim, 2, 3)
        nn.init.constant_(_pose_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_pose_embed.layers[-1].bias.data, 0)

        if dec_pred_bbox_embed_share:
            box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)]
        else:
            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers)]
        if dec_pred_class_embed_share:
            class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)]
        else:
            class_embed_layerlist = [copy.deepcopy(_class_embed) for i in range(transformer.num_decoder_layers)]

        # Pose heads follow the same sharing flag as the box heads. There is one
        # more xy head than there are keypoint decoder layers.
        if dec_pred_bbox_embed_share:

            pose_embed_layerlist = [_pose_embed for i in
                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
        else:
            pose_embed_layerlist = [copy.deepcopy(_pose_embed) for i in
                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]

        # The hw head is always the same shared instance.
        pose_hw_embed_layerlist = [_pose_hw_embed for i in
                                   range(transformer.num_decoder_layers - num_box_decoder_layers)]

        self.num_box_decoder_layers = num_box_decoder_layers
        self.bbox_embed = nn.ModuleList(box_embed_layerlist)
        self.class_embed = nn.ModuleList(class_embed_layerlist)
        self.num_body_points = num_body_points
        self.pose_embed = nn.ModuleList(pose_embed_layerlist)
        self.pose_hw_embed = nn.ModuleList(pose_hw_embed_layerlist)

        # Expose the same head modules to the decoder.
        self.transformer.decoder.bbox_embed = self.bbox_embed
        self.transformer.decoder.class_embed = self.class_embed

        self.transformer.decoder.pose_embed = self.pose_embed
        self.transformer.decoder.pose_hw_embed = self.pose_hw_embed

        self.transformer.decoder.num_body_points = num_body_points

        # two stage
        self.two_stage_type = two_stage_type
        self.two_stage_add_query_num = two_stage_add_query_num
        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
        if two_stage_type != 'no':
            if two_stage_bbox_embed_share:
                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
                self.transformer.enc_out_bbox_embed = _bbox_embed
            else:
                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)

            if two_stage_class_embed_share:
                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
                self.transformer.enc_out_class_embed = _class_embed
            else:
                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)

            self.refpoint_embed = None
            if self.two_stage_add_query_num > 0:
                self.init_ref_points(two_stage_add_query_num)

        self.decoder_sa_type = decoder_sa_type
        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
        if decoder_sa_type == 'ca_label':
            # One label embedding table shared by every decoder layer.
            self.label_embedding = nn.Embedding(num_classes, hidden_dim)
            for layer in self.transformer.decoder.layers:
                layer.label_embedding = self.label_embedding
        else:
            for layer in self.transformer.decoder.layers:
                layer.label_embedding = None
            self.label_embedding = None

        self._reset_parameters()

    def open_set_transfer_init(self):
        """Freeze every parameter whose name contains none of the listed substrings.

        Parameters whose name contains any of the substrings checked below are
        skipped (left as they are); all others get ``requires_grad_(False)``.
        """
        for name, param in self.named_parameters():
            if 'fusion_layers' in name:
                continue
            if 'ca_text' in name:
                continue
            if 'catext_norm' in name:
                continue
            if 'catext_dropout' in name:
                continue
            if "text_layers" in name:
                continue
            if 'bert' in name:
                continue
            if 'bbox_embed' in name:
                continue
            if 'label_enc.weight' in name:
                continue
            if 'feat_map' in name:
                continue
            if 'enc_output' in name:
                continue

            param.requires_grad_(False)

    def _reset_parameters(self):
        """Xavier-initialise the conv weight and zero the conv bias of each input projection."""
        # init input_proj
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)

    def init_ref_points(self, use_num_queries):
        """Create ``self.refpoint_embed`` and initialise it per the refpoint options.

        Args:
            use_num_queries: number of rows in the reference-point embedding.

        Raises:
            NotImplementedError: if ``fix_refpoints_hw`` is not > 0, -1 or -2.
        """
        self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)

        if self.random_refpoints_xy:
            # Uniform xy in (0, 1), stored in inverse-sigmoid space.
            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
            # NOTE(review): this sets requires_grad on a temporary slice of
            # `.data`; it presumably does not freeze the parameter itself - confirm.
            self.refpoint_embed.weight.data[:, :2].requires_grad = False

        if self.fix_refpoints_hw > 0:
            print("fix_refpoints_hw: {}".format(self.fix_refpoints_hw))
            assert self.random_refpoints_xy
            self.refpoint_embed.weight.data[:, 2:] = self.fix_refpoints_hw
            self.refpoint_embed.weight.data[:, 2:] = inverse_sigmoid(self.refpoint_embed.weight.data[:, 2:])
            self.refpoint_embed.weight.data[:, 2:].requires_grad = False
        elif int(self.fix_refpoints_hw) == -1:
            pass
        elif int(self.fix_refpoints_hw) == -2:
            print('learn a shared h and w')
            assert self.random_refpoints_xy
            # Replace the embedding with an xy-only table plus one shared scalar.
            self.refpoint_embed = nn.Embedding(use_num_queries, 2)
            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
            self.refpoint_embed.weight.data[:, :2].requires_grad = False
            self.hw_embed = nn.Embedding(1, 1)
        else:
            raise NotImplementedError('Unknown fix_refpoints_hw {}'.format(self.fix_refpoints_hw))

    def forward(self, samples: NestedTensor, targets: List = None, **kw):
        """Run the detector on a batch of images with per-image text prompts.

        Args:
            samples: the batched images. ``samples.shape`` and ``samples.device``
                are read before any conversion, and a list / tensor is converted
                with ``nested_tensor_from_tensor_list``.
                NOTE(review): presumably a plain image tensor is expected here
                despite the annotation - confirm against the caller.
            targets: one dict per image. Although the default is None, the list
                is iterated unconditionally, so it must be provided. Keys read
                here: 'instance_text_prompt', 'object_embeddings_text',
                'kpts_embeddings_text', 'kpt_vis_text'.
            **kw: ignored.

        It returns a dict with the following elements:
            - "pred_logits": class logits of the last decoder layer.
            - "pred_boxes": sigmoid-normalised box outputs of the last decoder layer.
            - "pred_keypoints": keypoint outputs of the last decoder layer.
        No "aux_outputs" entry is added by this method.
        """

        captions = [t['instance_text_prompt'] for t in targets]
        bs=len(captions)
        tensor_list = [tgt["object_embeddings_text"] for tgt in targets]
        # Zero-pad each object-text embedding along dim 0 up to 350 rows so they
        # can be stacked; tensors that already have >= 350 rows are kept as is.
        max_size = 350
        padded_tensors = [torch.cat([tensor, torch.zeros(max_size - tensor.size(0), tensor.size(1),device=tensor.device)]) if tensor.size(0) < max_size else tensor for tensor in tensor_list]
        object_embeddings_text = torch.stack(padded_tensors)

        kpts_embeddings_text = torch.stack([tgt["kpts_embeddings_text"] for tgt in targets])[:, :self.num_body_points]
        # Project text embeddings to the transformer width.
        encoded_text=self.projection(object_embeddings_text)
        kpt_embeddings_specific=self.projection_kpt(kpts_embeddings_text)

        kpt_vis = torch.stack([tgt["kpt_vis_text"] for tgt in targets])[:, :self.num_body_points]
        # Prepend a column of ones to the keypoint visibility flags.
        kpt_mask = torch.cat((torch.ones_like(kpt_vis, device=kpt_vis.device)[..., 0].unsqueeze(-1), kpt_vis), dim=-1)

        # Number of text positions after padding (dim 1 of encoded_text).
        num_classes = encoded_text.shape[1]
        # Identity matrix repeated per batch element, as a boolean mask.
        text_self_attention_masks = torch.eye(num_classes).unsqueeze(0).expand(bs, -1, -1).bool().to(samples.device)
        # True for the first len(captions[i]) positions of each row, False after.
        text_token_mask = torch.zeros(samples.shape[0],num_classes).to(samples.device)>0
        for i in range(bs):
            text_token_mask[i,:len(captions[i])]=True

        # Same prefix pattern as a float tensor: 1 inside the prefix, 0 elsewhere.
        position_ids = torch.zeros(samples.shape[0], num_classes).to(samples.device)

        for i in range(bs):
            position_ids[i,:len(captions[i])]= 1

        text_dict = {
            'encoded_text': encoded_text,
            'text_token_mask': text_token_mask,
            'position_ids': position_ids,
            'text_self_attention_masks': text_self_attention_masks
        }

        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, poss = self.backbone(samples)
        # Debug hook: drops into ipdb when this environment variable is '1'.
        if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
            import ipdb;
            ipdb.set_trace()

        # Project every backbone level to hidden_dim and collect its mask.
        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        # Build extra levels beyond what the backbone returned: the first from
        # the last backbone feature, later ones from the previous projection.
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                # Resize the image padding mask to this level's spatial size.
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                poss.append(pos_l)

        if self.label_enc is not None:
            label_enc = self.label_enc
        else:
            raise NotImplementedError
            # Unreachable: the raise above always fires first.
            label_enc = encoded_text
        if self.dn_number > 0 or targets is not None:
            input_query_label, input_query_bbox, attn_mask, attn_mask2, dn_meta = \
                prepare_for_mask(kpt_mask=kpt_mask)
        else:
            assert targets is None
            input_query_bbox = input_query_label = attn_mask = attn_mask2 = dn_meta = None

        # hs_enc, ref_enc and init_box_proposal are not used below.
        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(srcs, masks, input_query_bbox, poss,
                                                                             input_query_label, attn_mask, attn_mask2,
                                                                             text_dict, dn_meta,targets,kpt_embeddings_specific)

        # In case num object=0: add zero-valued terms that reference these
        # parameters. NOTE(review): presumably this keeps them in the autograd
        # graph - confirm.
        if self.label_enc is not None:
            hs[0] += self.label_enc.weight[0, 0] * 0.0

        hs[0] += self.pos_proj.weight[0, 0] * 0.0
        hs[0] += self.pos_proj.bias[0] * 0.0
        hs[0] += self.padding.weight[0, 0] * 0.0

        # Hard-coded number of instance groups; each group spans
        # (num_body_points + 1) consecutive query slots.
        num_group = 50
        # Leading denoising queries are only accounted for in training mode.
        effective_dn_number = dn_meta['pad_size'] if self.training else 0
        outputs_coord_list = []
        outputs_class = []

        # Boxes and class logits for every decoder layer.
        for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_cls_embed, layer_hs) in enumerate(
                zip(reference[:-1], self.bbox_embed, self.class_embed, hs)):

            if dec_lid < self.num_box_decoder_layers:
                # Box-only layers: refine the reference in inverse-sigmoid space.
                layer_delta_unsig = layer_bbox_embed(layer_hs)
                layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)
                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
                layer_cls = layer_cls_embed(layer_hs, text_dict)
                outputs_coord_list.append(layer_outputs_unsig)
                outputs_class.append(layer_cls)

            else:
                # Later layers: split off the denoising queries, then take every
                # (num_body_points + 1)-th remaining slot (the first of each
                # group) for box / class prediction.
                layer_hs_bbox_dn = layer_hs[:, :effective_dn_number, :]
                layer_hs_bbox_norm = layer_hs[:, effective_dn_number:, :][:, 0::(self.num_body_points + 1), :]
                bs = layer_ref_sig.shape[0]
                reference_before_sigmoid_bbox_dn = layer_ref_sig[:, :effective_dn_number, :]
                reference_before_sigmoid_bbox_norm = layer_ref_sig[:, effective_dn_number:, :][:,
                                                     0::(self.num_body_points + 1), :]
                layer_delta_unsig_dn = layer_bbox_embed(layer_hs_bbox_dn)
                layer_delta_unsig_norm = layer_bbox_embed(layer_hs_bbox_norm)
                layer_outputs_unsig_dn = layer_delta_unsig_dn + inverse_sigmoid(reference_before_sigmoid_bbox_dn)
                layer_outputs_unsig_dn = layer_outputs_unsig_dn.sigmoid()
                layer_outputs_unsig_norm = layer_delta_unsig_norm + inverse_sigmoid(reference_before_sigmoid_bbox_norm)
                layer_outputs_unsig_norm = layer_outputs_unsig_norm.sigmoid()
                layer_outputs_unsig = torch.cat((layer_outputs_unsig_dn, layer_outputs_unsig_norm), dim=1)
                layer_cls_dn = layer_cls_embed(layer_hs_bbox_dn, text_dict)
                layer_cls_norm = layer_cls_embed(layer_hs_bbox_norm, text_dict)
                layer_cls = torch.cat((layer_cls_dn, layer_cls_norm), dim=1)
                outputs_class.append(layer_cls)
                outputs_coord_list.append(layer_outputs_unsig)

        # update keypoints
        outputs_keypoints_list = []
        # Collected below but not part of the returned dict.
        outputs_keypoints_hw = []
        # Slot indices that are not the first slot of a group.
        kpt_index = [x for x in range(num_group * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
        for dec_lid, (layer_ref_sig, layer_hs) in enumerate(zip(reference[:-1], hs)):
            if dec_lid < self.num_box_decoder_layers:
                # Box-only layers contribute an all-zero keypoint placeholder.
                assert isinstance(layer_hs, torch.Tensor)
                bs = layer_hs.shape[0]
                layer_res = layer_hs.new_zeros((bs, self.num_queries, self.num_body_points * 3))
                outputs_keypoints_list.append(layer_res)
            else:
                bs = layer_ref_sig.shape[0]
                layer_hs_kpt = layer_hs[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
                                                                                                 device=layer_hs.device))
                delta_xy_unsig = self.pose_embed[dec_lid - self.num_box_decoder_layers](layer_hs_kpt)
                layer_ref_sig_kpt = layer_ref_sig[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
                                                                                                           device=layer_hs.device))
                layer_outputs_unsig_keypoints = delta_xy_unsig + inverse_sigmoid(layer_ref_sig_kpt[..., :2])
                # The third channel is a constant one before the sigmoid; it is
                # not predicted from the features.
                vis_xy_unsig = torch.ones_like(layer_outputs_unsig_keypoints,
                                               device=layer_outputs_unsig_keypoints.device)
                xyv = torch.cat((layer_outputs_unsig_keypoints, vis_xy_unsig[:, :, 0].unsqueeze(-1)), dim=-1)
                xyv = xyv.sigmoid()
                layer_res = xyv.reshape((bs, num_group, self.num_body_points, 3)).flatten(2, 3)
                layer_hw = layer_ref_sig_kpt[..., 2:].reshape(bs, num_group, self.num_body_points, 2).flatten(2, 3)
                layer_res = keypoint_xyzxyz_to_xyxyzz(layer_res)
                outputs_keypoints_list.append(layer_res)
                outputs_keypoints_hw.append(layer_hw)

        if self.dn_number > 0 and dn_meta is not None:
            # NOTE(review): `_set_aux_loss` is not defined in this class as
            # visible here; verify it exists before this branch can run.
            outputs_class, outputs_coord_list = \
                post_process(outputs_class, outputs_coord_list,
                             dn_meta, self.aux_loss, self._set_aux_loss)
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord_list[-1],
               'pred_keypoints': outputs_keypoints_list[-1]}

        return out
489
+
490
+
491
@MODULE_BUILD_FUNCS.registe_with_name(module_name='UniPose')
def build_unipose(args):
    """Build a UniPose model from a configuration object.

    Required attributes are read directly from ``args`` (``num_classes``,
    ``device``, ``num_queries``, ``two_stage_type``, ...). The optional ones
    listed below fall back to a default when ``args`` does not define them.

    Args:
        args: configuration/namespace object; also forwarded unchanged to
            ``build_backbone`` and ``build_deformable_transformer``.

    Returns:
        The constructed ``UniPose`` module.
    """

    def _optional(name, default):
        # Only a genuinely missing option falls back to its default. Any other
        # failure propagates instead of being hidden by a bare ``except``.
        try:
            return getattr(args, name)
        except (AttributeError, KeyError):
            return default

    num_classes = args.num_classes
    # The device object itself is not used below; constructing it keeps the
    # fail-fast behaviour when ``args.device`` is missing or malformed.
    torch.device(args.device)

    backbone = build_backbone(args)

    transformer = build_deformable_transformer(args)

    # Each option is resolved independently, so a missing unrelated option can
    # no longer discard a configured ``dn_labelbook_size``.
    dn_labelbook_size = _optional('dn_labelbook_size', num_classes)
    dec_pred_class_embed_share = _optional('dec_pred_class_embed_share', True)
    dec_pred_bbox_embed_share = _optional('dec_pred_bbox_embed_share', True)
    binary_query_selection = _optional('binary_query_selection', False)
    use_cdn = _optional('use_cdn', True)
    sub_sentence_present = _optional('sub_sentence_present', True)

    model = UniPose(
        backbone,
        transformer,
        num_classes=num_classes,
        num_queries=args.num_queries,
        aux_loss=True,
        iter_update=True,
        query_dim=4,
        random_refpoints_xy=args.random_refpoints_xy,
        fix_refpoints_hw=args.fix_refpoints_hw,
        num_feature_levels=args.num_feature_levels,
        nheads=args.nheads,
        dec_pred_class_embed_share=dec_pred_class_embed_share,
        dec_pred_bbox_embed_share=dec_pred_bbox_embed_share,
        # two stage
        two_stage_type=args.two_stage_type,
        # box_share
        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,
        two_stage_class_embed_share=args.two_stage_class_embed_share,
        decoder_sa_type=args.decoder_sa_type,
        num_patterns=args.num_patterns,
        # Denoising queries are disabled entirely when use_dn is falsy.
        dn_number=args.dn_number if args.use_dn else 0,
        dn_box_noise_scale=args.dn_box_noise_scale,
        dn_label_noise_ratio=args.dn_label_noise_ratio,
        dn_labelbook_size=dn_labelbook_size,
        use_label_enc=args.use_label_enc,

        text_encoder_type=args.text_encoder_type,

        binary_query_selection=binary_query_selection,
        use_cdn=use_cdn,
        sub_sentence_present=sub_sentence_present
    )

    return model
571
+
572
+
573
class ContrastiveAssign(nn.Module):
    """Score queries against text embeddings with a masked dot product."""

    def __init__(self, project=False, cal_bias=None, max_text_len=256):
        """Store the configuration.

        :param project: stored on the module; not read by ``forward``.
        :param cal_bias: must stay None; any other value makes ``forward``
            raise NotImplementedError.
        :param max_text_len: stored on the module; ``forward`` uses the length
            of the incoming text embeddings instead.
        """
        super().__init__()
        self.project = project
        self.cal_bias = cal_bias
        self.max_text_len = max_text_len

    def forward(self, x, text_dict):
        """Return query-to-text similarity logits.

        Args:
            x: query features; the last dimension must match the last dimension
                of ``text_dict['encoded_text']``.
            text_dict: dict providing
                'encoded_text': text embeddings, and
                'text_token_mask': boolean mask, True for used tokens and
                False for padding tokens.

        Returns:
            ``x @ encoded_text^T`` with every padded text position set to
            ``-inf``, copied into a freshly allocated tensor.

        Raises:
            NotImplementedError: if ``cal_bias`` was set.
        """
        assert isinstance(text_dict, dict)

        text_feats = text_dict['encoded_text']
        out_width = text_feats.shape[1]
        token_is_valid = text_dict['text_token_mask']

        if self.cal_bias is not None:
            raise NotImplementedError

        neg_inf = float('-inf')
        scores = torch.matmul(x, text_feats.transpose(-1, -2))
        scores = scores.masked_fill(~token_is_valid[:, None, :], neg_inf)

        # Copy into a -inf filled buffer whose last dim is the text length.
        padded = torch.full((*scores.shape[:-1], out_width), neg_inf, device=scores.device)
        padded[..., :scores.shape[-1]] = scores
        return padded
difpoint/src/models/XPose/models/UniPose/utils.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+
7
+ import copy
8
+ import torch
9
+ import random
10
+ from torch import nn, Tensor
11
+ import os
12
+ import numpy as np
13
+ import math
14
+ import torch.nn.functional as F
15
+ from torch import nn
16
+
17
+
18
+ def _get_clones(module, N, layer_share=False):
19
+ # import ipdb; ipdb.set_trace()
20
+ if layer_share:
21
+ return nn.ModuleList([module for i in range(N)])
22
+ else:
23
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
24
+
25
+
26
def get_sine_pos_embed(
    pos_tensor: torch.Tensor,
    num_pos_feats: int = 128,
    temperature: int = 10000,
    exchange_xy: bool = True,
):
    """Encode every coordinate of a position tensor with sine/cosine features.

    Args:
        pos_tensor (torch.Tensor): shape: [..., n]. The stacking below uses
            ``dim=3`` and ``flatten(2)``, so a 3-D input is expected.
        num_pos_feats (int): number of features produced per coordinate.
        temperature (int): base of the frequency schedule.
        exchange_xy (bool, optional): swap the encodings of the first two
            coordinates, e.g. input [x, y] yields [pos(y), pos(x)].
            Defaults to True.
    Returns:
        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].
    """
    two_pi = 2 * math.pi
    feat_idx = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
    # Consecutive feature pairs share one frequency.
    freqs = temperature ** (2 * torch.div(feat_idx, 2, rounding_mode="floor") / num_pos_feats)

    def _encode(coord: torch.Tensor):
        angles = coord * two_pi / freqs
        sin_part = angles[..., 0::2].sin()
        cos_part = angles[..., 1::2].cos()
        # Interleave as sin, cos, sin, cos, ...
        return torch.stack((sin_part, cos_part), dim=3).flatten(2)

    n_coords = pos_tensor.shape[-1]
    encoded = []
    for coord in pos_tensor.split([1] * n_coords, dim=-1):
        encoded.append(_encode(coord))
    if exchange_xy:
        encoded[0], encoded[1] = encoded[1], encoded[0]
    return torch.cat(encoded, dim=-1)
56
+
57
+
58
def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None):
    r"""Build one box proposal per encoder token and mask out unusable tokens.

    Input:
        - memory: bs, \sum{hw}, d_model
        - memory_padding_mask: bs, \sum{hw} (True on padded tokens)
        - spatial_shapes: nlevel, 2 (height, width of every level)
        - learnedwh: 2 (optional; its sigmoid replaces the fixed 0.05 base size)
    Output:
        - output_memory: bs, \sum{hw}, d_model (zeroed at padded / invalid tokens)
        - output_proposals: bs, \sum{hw}, 4 in inverse-sigmoid space
          (``inf`` at padded / invalid tokens)
    """
    # Unpacking three values also enforces that memory is 3-D.
    N_, _, _ = memory.shape
    proposals = []
    _cur = 0
    for lvl, (H_, W_) in enumerate(spatial_shapes):
        # Mask slice of this level, reshaped to its spatial layout.
        mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
        # Count unpadded rows / columns along the first column / first row.
        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

        grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
                                        torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2

        # Cell centres, normalised by the valid (unpadded) extent per image.
        scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale

        # Proposal size doubles with every level.
        if learnedwh is not None:
            wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)
        else:
            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)

        proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
        proposals.append(proposal)
        _cur += (H_ * W_)

    output_proposals = torch.cat(proposals, 1)
    # A proposal is valid only if all four values lie strictly inside (0.01, 0.99).
    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid
    output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
    output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

    output_memory = memory
    output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
    output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))

    return output_memory, output_proposals
114
+
115
+
116
class RandomBoxPerturber():
    """Multiplicatively jitter reference anchors with uniform random noise."""

    def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
        """Store one noise scale per anchor coordinate, in x, y, w, h order."""
        per_coord = [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]
        self.noise_scale = torch.Tensor(per_coord)

    def __call__(self, refanchors: Tensor) -> Tensor:
        """Return ``refanchors`` scaled by ``1 + (u - 0.5) * scale``, clamped to [0, 1].

        ``u`` is uniform noise drawn with ``torch.rand_like``. ``refanchors``
        must be 3-D; only the first ``query_dim`` (last-dimension size) noise
        scales are used.
        """
        _, _, query_dim = refanchors.shape
        uniform = torch.rand_like(refanchors)
        scales = self.noise_scale.to(refanchors.device)[:query_dim]
        factor = 1 + (uniform - 0.5) * scales
        perturbed = refanchors * factor
        return perturbed.clamp_(0, 1)
129
+
130
+
131
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False):
    """
    Sigmoid focal loss as used in RetinaNet: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of logits, one prediction per element.
        targets: A float tensor with the same shape as inputs holding the
                 binary label of every element
                 (0 for the negative class and 1 for the positive class).
        num_boxes: Normaliser applied to the reduced loss.
        alpha: Weighting factor balancing positive vs negative examples.
               Default = 0.25; a negative value disables the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) that balances
               easy vs hard examples.
        no_reduction: When True, return the element-wise loss unreduced.
    Returns:
        Loss tensor: element-wise if ``no_reduction``, otherwise
        ``loss.mean(1).sum() / num_boxes``.
    """
    probs = inputs.sigmoid()
    bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # Probability the model assigns to the true label of each element.
    p_true = probs * targets + (1 - probs) * (1 - targets)
    focal = bce * ((1 - p_true) ** gamma)

    if alpha >= 0:
        class_weight = alpha * targets + (1 - alpha) * (1 - targets)
        focal = class_weight * focal

    if not no_reduction:
        return focal.mean(1).sum() / num_boxes
    return focal
160
+
161
+
162
class MLP(nn.Module):
    """ Plain feed-forward network: Linear layers with ReLU between them."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Create ``num_layers`` Linear layers mapping input_dim -> hidden_dim ... -> output_dim."""
        super().__init__()
        self.num_layers = num_layers
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        """Apply every layer in order; ReLU follows all layers except the last."""
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x
175
+
176
+
177
+ def _get_activation_fn(activation, d_model=256, batch_dim=0):
178
+ """Return an activation function given a string"""
179
+ if activation == "relu":
180
+ return F.relu
181
+ if activation == "gelu":
182
+ return F.gelu
183
+ if activation == "glu":
184
+ return F.glu
185
+ if activation == "prelu":
186
+ return nn.PReLU()
187
+ if activation == "selu":
188
+ return F.selu
189
+
190
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
191
+
192
+
193
def gen_sineembed_for_position(pos_tensor):
    """Build sinusoidal embeddings for 2-value or 4-value positions.

    ``pos_tensor`` is indexed as a 3-D tensor whose last axis holds either
    2 or 4 values (presumably (n_query, bs, 2|4) -- confirm with callers).
    Each value yields 128 features (interleaved sin/cos); the per-value
    embeddings are concatenated in the order (index 1, index 0) for the
    2-value case and (index 1, index 0, index 2, index 3) for the 4-value case.

    Raises:
        ValueError: If the last axis has a size other than 2 or 4.
    """
    scale = 2 * math.pi
    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
    dim_t = 10000 ** (2 * (dim_t // 2) / 128)

    def _embed(channel):
        # sin on the even frequency slots, cos on the odd ones, interleaved.
        phases = (pos_tensor[:, :, channel] * scale)[:, :, None] / dim_t
        return torch.stack((phases[:, :, 0::2].sin(), phases[:, :, 1::2].cos()), dim=3).flatten(2)

    # The first two channels are always embedded before the size check.
    pos_x = _embed(0)
    pos_y = _embed(1)

    last_dim = pos_tensor.size(-1)
    if last_dim == 2:
        return torch.cat((pos_y, pos_x), dim=2)
    if last_dim == 4:
        pos_w = _embed(2)
        pos_h = _embed(3)
        return torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
    raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
220
+
221
+
222
def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
    """Compute the object keypoint similarity between predicted and gt poses.

    Args:
        kpt_preds: Predicted keypoints; the last axis is reshaped to (K, 2).
        kpt_gts: Ground-truth keypoints with the same leading size as kpt_preds.
        kpt_valids: Per-keypoint weights multiplied into the similarity and
            summed for the normaliser (presumably 0/1 validity flags).
        kpt_areas: One area value per pose, used to scale the distances.
        sigmas: Per-keypoint sigmas (any sequence accepted by ``new_tensor``).

    Returns:
        One similarity value per pose.
    """
    sigmas = kpt_preds.new_tensor(sigmas)
    variances = (sigmas * 2) ** 2

    assert kpt_preds.size(0) == kpt_gts.size(0)
    pred_xy = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2)
    gt_xy = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2)

    delta_x = pred_xy[:, :, 0] - gt_xy[:, :, 0]
    delta_y = pred_xy[:, :, 1] - gt_xy[:, :, 1]
    sq_dist = delta_x ** 2 + delta_y ** 2

    # Distances are normalised by pose area and per-keypoint variance.
    scaled = sq_dist / (kpt_areas[:, None] * variances[None, :] * 2)
    similarity = torch.exp(-scaled) * kpt_valids

    # The small constant keeps poses with no valid keypoint from dividing by zero.
    return similarity.sum(dim=1) / (kpt_valids.sum(dim=1) + 1e-6)
241
+
242
+
243
def oks_loss(pred, target, valid=None, area=None, linear=False, sigmas=None, eps=1e-6):
    """OKS loss between predicted poses and target poses.

    By default the loss is the negative log of the OKS; with ``linear`` it
    is ``1 - oks`` instead.

    Args:
        pred (torch.Tensor): Predicted poses of format (x1, y1, x2, y2, ...),
            shape (n, 2K).
        target (torch.Tensor): Corresponding gt poses, shape (n, 2K).
        valid: Forwarded to ``oks_overlaps`` as the keypoint validity.
        area: Forwarded to ``oks_overlaps`` as the pose areas.
        linear (bool, optional): Use the linear form instead of the log
            form. Default: False.
        sigmas: Forwarded to ``oks_overlaps`` as the per-keypoint sigmas.
        eps (float): Lower clamp on the OKS, which avoids log(0).

    Return:
        torch.Tensor: Loss tensor.
    """
    similarity = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)
    return 1 - similarity if linear else -similarity.log()
269
+
270
+
271
class OKSLoss(nn.Module):
    """OKS loss module.

    Computes the OKS loss between a set of predicted poses and target poses.

    Args:
        linear (bool): If True, use linear scale of loss instead of log scale.
            Default: False.
        num_keypoints (int): Number of keypoints per pose; 17 and 68 are
            supported. Default: 17.
        eps (float): Eps to avoid log(0).
        reduction (str): Options are "none", "mean" and "sum". Stored and
            used only for the all-zero-weight shortcut in ``forward``; the
            regular path returns the loss unreduced.
        loss_weight (float): Weight of loss.

    Raises:
        ValueError: If ``num_keypoints`` is not supported.
    """

    # Unscaled sigmas of the 17 body keypoints. They are also the first 17
    # entries of the 68-keypoint table, whose other 51 entries are all 0.25.
    _BODY_SIGMAS = (
        .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
        1.07, .87, .87, .89, .89,
    )

    def __init__(self,
                 linear=False,
                 num_keypoints=17,
                 eps=1e-6,
                 reduction='mean',
                 loss_weight=1.0):
        super(OKSLoss, self).__init__()
        self.linear = linear
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight
        if num_keypoints == 17:
            # The default used to fall into the "unsupported" branch, so
            # OKSLoss() could not be constructed with its own defaults.
            raw_sigmas = self._BODY_SIGMAS
        elif num_keypoints == 68:
            raw_sigmas = self._BODY_SIGMAS + (.25,) * 51
        else:
            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
        self.sigmas = np.array(raw_sigmas, dtype=np.float32) / 10.0

    def forward(self,
                pred,
                target,
                valid,
                area,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            valid (torch.Tensor): The visible flag of the target pose.
            area (torch.Tensor): The area of the target pose.
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None. It only triggers the
                all-zero shortcut below; it is not multiplied into the
                returned loss.
            avg_factor (int, optional): Accepted for interface
                compatibility; not used by this implementation.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None. Options are "none", "mean" and "sum".

        Returns:
            torch.Tensor: ``loss_weight`` times the per-pose OKS loss
            (unreduced), or a zero-valued scalar when every weight is
            non-positive and the reduction is not "none".
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if (weight is not None) and (not torch.any(weight > 0)) and (
                reduction != 'none'):
            # No positive weight: return a zero that is still connected to pred.
            if pred.dim() == weight.dim() + 1:
                weight = weight.unsqueeze(1)
            return (pred * weight).sum()  # 0
        if weight is not None and weight.dim() > 1:
            # TODO: remove this in the future
            # reduce a per-coordinate weight to one value per pose so it
            # matches the per-pose loss shape (n,)
            assert weight.shape == pred.shape
            weight = weight.mean(-1)
        loss = self.loss_weight * oks_loss(
            pred,
            target,
            valid=valid,
            area=area,
            linear=self.linear,
            sigmas=self.sigmas,
            eps=self.eps)
        return loss
difpoint/src/models/XPose/models/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
7
+ from .UniPose.unipose import build_unipose
8
+
9
def build_model(args):
    """Build the model whose build function is registered under ``args.modelname``.

    The registry is imported lazily inside the function. The name must be
    present in ``MODULE_BUILD_FUNCS``; the registered build function is
    called with ``args`` and its result is returned.
    """
    # we use register to maintain models from catdet6 on.
    from .registry import MODULE_BUILD_FUNCS

    model_name = args.modelname
    assert model_name in MODULE_BUILD_FUNCS.module_dict
    return MODULE_BUILD_FUNCS.get(model_name)(args)
+ return model
difpoint/src/models/XPose/models/registry.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Author: Yihao Chen
3
+ # @Date: 2021-08-16 16:03:17
4
+ # @Last Modified by: Shilong Liu
5
+ # @Last Modified time: 2022-01-23 15:26
6
+ # modified from mmcv
7
+
8
+ import inspect
9
+ from functools import partial
10
+
11
+
12
class Registry(object):
    """Name -> build-function mapping (adapted from mmcv's registry)."""

    def __init__(self, name):
        self._name = name
        self._module_dict = dict()

    def __repr__(self):
        registered = list(self._module_dict.keys())
        return self.__class__.__name__ + '(name={}, items={})'.format(self._name, registered)

    def __len__(self):
        return len(self._module_dict)

    @property
    def name(self):
        return self._name

    @property
    def module_dict(self):
        return self._module_dict

    def get(self, key):
        """Return the build function registered under ``key``, or None."""
        return self._module_dict.get(key, None)

    def registe_with_name(self, module_name=None, force=False):
        """Return ``register`` with ``module_name`` and ``force`` pre-bound.

        The result takes the build function as its only remaining argument,
        so it can be used as a decorator.
        """
        return partial(self.register, module_name=module_name, force=force)

    def register(self, module_build_function, module_name=None, force=False):
        """Register a module build function.

        Args:
            module_build_function: Plain function to register.
            module_name: Key to register under; defaults to the function's
                ``__name__``.
            force: If True, overwrite an existing entry with the same key.

        Returns:
            The function that was passed in.

        Raises:
            TypeError: If ``module_build_function`` is not a function.
            KeyError: If the key is already registered and ``force`` is False.
        """
        if not inspect.isfunction(module_build_function):
            raise TypeError('module_build_function must be a function, but got {}'.format(
                type(module_build_function)))

        key = module_build_function.__name__ if module_name is None else module_name
        if key in self._module_dict and not force:
            raise KeyError('{} is already registered in {}'.format(
                key, self.name))

        self._module_dict[key] = module_build_function
        return module_build_function


MODULE_BUILD_FUNCS = Registry('model build functions')
58
+
difpoint/src/models/XPose/predefined_keypoints.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Predefined keypoint definitions, one module-level dict per object category.
# Every dict has two keys:
#   "keypoints": list of keypoint names, in keypoint-index order
#   "skeleton":  list of [a, b] index pairs (empty for many categories)
# NOTE(review): the skeleton index base does not look uniform across this
# module -- the pairs below run 1..17 (apparently 1-based for 17 keypoints),
# while other tables such as animal_in_AnimalKindom and car contain index 0.
# Confirm the convention a consumer expects before using "skeleton".
person = {"keypoints":['nose', 'left eye', 'right eye', 'left ear', 'right ear', 'left shoulder', 'right shoulder', 'left elbow', 'right elbow', 'left wrist', 'right wrist', 'left hip', 'right hip', 'left knee', 'right knee', 'left ankle', 'right ankle'],"skeleton": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]}
2
+
3
+ face = {"keypoints": ['right cheekbone 1', 'right cheekbone 2', 'right cheek 1', 'right cheek 2', 'right cheek 3', 'right cheek 4', 'right cheek 5', 'right chin', 'chin center', 'left chin', 'left cheek 5', 'left cheek 4', 'left cheek 3', 'left cheek 2', 'left cheek 1', 'left cheekbone 2', 'left cheekbone 1', 'right eyebrow 1', 'right eyebrow 2', 'right eyebrow 3', 'right eyebrow 4', 'right eyebrow 5', 'left eyebrow 1', 'left eyebrow 2', 'left eyebrow 3', 'left eyebrow 4', 'left eyebrow 5', 'nasal bridge 1', 'nasal bridge 2', 'nasal bridge 3', 'nasal bridge 4', 'right nasal wing 1', 'right nasal wing 2', 'nasal wing center', 'left nasal wing 1', 'left nasal wing 2', 'right eye eye corner 1', 'right eye upper eyelid 1', 'right eye upper eyelid 2', 'right eye eye corner 2', 'right eye lower eyelid 2', 'right eye lower eyelid 1', 'left eye eye corner 1', 'left eye upper eyelid 1', 'left eye upper eyelid 2', 'left eye eye corner 2', 'left eye lower eyelid 2', 'left eye lower eyelid 1', 'right mouth corner', 'upper lip outer edge 1', 'upper lip outer edge 2', 'upper lip outer edge 3', 'upper lip outer edge 4', 'upper lip outer edge 5', 'left mouth corner', 'lower lip outer edge 5', 'lower lip outer edge 4', 'lower lip outer edge 3', 'lower lip outer edge 2', 'lower lip outer edge 1', 'upper lip inter edge 1', 'upper lip inter edge 2', 'upper lip inter edge 3', 'upper lip inter edge 4', 'upper lip inter edge 5', 'lower lip inter edge 3', 'lower lip inter edge 2', 'lower lip inter edge 1'], "skeleton": []}
4
+
5
+ hand = {"keypoints":['wrist', 'thumb root', "thumb's third knuckle", "thumb's second knuckle", 'thumb’s first knuckle', "forefinger's root", "forefinger's third knuckle", "forefinger's second knuckle", "forefinger's first knuckle", "middle finger's root", "middle finger's third knuckle", "middle finger's second knuckle", "middle finger's first knuckle", "ring finger's root", "ring finger's third knuckle", "ring finger's second knuckle", "ring finger's first knuckle", "pinky finger's root", "pinky finger's third knuckle", "pinky finger's second knuckle", "pinky finger's first knuckle"],"skeleton": []}
6
+
7
+ animal_in_AnimalKindom = {"keypoints":['head mid top', 'eye left', 'eye right', 'mouth front top', 'mouth back left', 'mouth back right', 'mouth front bottom', 'shoulder left', 'shoulder right', 'elbow left', 'elbow right', 'wrist left', 'wrist right', 'torso mid back', 'hip left', 'hip right', 'knee left', 'knee right', 'ankle left ', 'ankle right', 'tail top back', 'tail mid back', 'tail end back'],"skeleton": [[1, 0], [2, 0], [3, 4], [3, 5], [4, 6], [5, 6], [0, 7], [0, 8], [7, 9], [8, 10], [9, 11], [10, 12], [0, 13], [13, 20], [20, 14], [20, 15], [14, 16], [15, 17], [16, 18], [17, 19], [20, 21], [21, 22]]}
8
+
9
+ animal_in_AP10K = {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
10
+
11
+ animal= {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
12
+
13
+ animal_face = {"keypoints": ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip'], "skeleton": []}
14
+
15
+ fly = {"keypoints": ['head', 'eye left', 'eye right', 'neck', 'thorax', 'abdomen', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'wing left', 'wing right'], "skeleton": [[2, 1], [3, 1], [4, 1], [5, 4], [6, 5], [8, 7], [9, 8], [10, 9], [12, 11], [13, 12], [14, 13], [16, 15], [17, 16], [18, 17], [20, 19], [21, 20], [22, 21], [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], [30, 29], [31, 4], [32, 4]]}
16
+
17
+ locust = {"keypoints": ['head', 'neck', 'thorax', 'abdomen1', 'abdomen2', 'anttip left', 'antbase left', 'eye left', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'anttip right', 'antbase right', 'eye right', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip'],"skeleton": [[2, 1], [3, 2], [4, 3], [5, 4], [7, 6], [8, 7], [10, 9], [11, 10], [12, 11], [14, 13], [15, 14],[16, 15], [18, 17], [19, 18], [20, 19], [22, 21], [23, 22], [25, 24], [26, 25], [27, 26],[29, 28], [30, 29], [31, 30], [33, 32], [34, 33], [35, 34]]}
18
+
19
+ car ={"keypoints": ['right front wheel center', 'left front wheel center', 'right rear wheel center', 'left rear wheel center', 'front right', 'front left', 'back right', 'back left', 'none', 'roof front right', 'roof front left', 'roof back right', 'roof back left', 'none'],"skeleton": [[0, 2], [1, 3], [0, 1], [2, 3], [9, 11], [10, 12], [9, 10], [11, 12], [4, 0], [4, 9], [4, 5], [5, 1], [5, 10], [6, 2], [6, 11], [7, 3], [7, 12], [6, 7]]}
20
+
21
+ short_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
22
+
23
+ long_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'right sleeve inside 3', 'right sleeve inside 4', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 4', 'left sleeve inside 3', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
24
+
25
+ short_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
26
+
27
+ sling={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
28
+
29
+ vest = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
30
+
31
+ long_sleeved_dress={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'center hem', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
32
+
33
+ long_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
34
+
35
+ trousers = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right side outside 3', 'right cuff outside', 'right cuff inside', 'right side inside 1', 'crotch', 'left side inside 1', 'left cuff inside', 'left cuff outside', 'left side outside 3', 'left side outside 2'], 'skeleton': []}
36
+
37
+ sling_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
38
+
39
+ vest_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
40
+
41
+ skirt = {'keypoints': ['right side 1', 'upper center', 'left side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2'], 'skeleton': []}
42
+
43
+ short_sleeved_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'left side 1', 'left side 2', 'left side 3', 'left side 4', 'left side 5', 'center hem', 'right side 5', 'right side 4', 'right side 3', 'right side 2', 'right side 1', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
44
+
45
+ shorts = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right cuff outside', 'right cuff inside', 'crotch', 'left cuff inside', 'left cuff outside', 'left side outside 2'], 'skeleton': []}
46
+
47
+ table = {'keypoints': ['desktop corner 1', 'desktop corner 2', 'desktop corner 3', 'desktop corner 4', 'table leg 1', 'table leg 2', 'table leg 3', 'table leg 4'], 'skeleton': []}
48
+
49
+ chair = {'keypoints': ['legs righttopcorner', 'legs lefttopcorner', 'legs leftbottomcorner', 'legs rightbottomcorner', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'headboard righttop', 'headboard lefttop'], 'skeleton': []}
50
+
51
+ bed = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'backrest lefttop'], 'skeleton': []}
52
+
53
+ sofa = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'armrests rightbottomcorner', 'armrests righttopcorner', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'armrests leftbottomcorner', 'armrests lefttopcorner', 'backrest lefttop'], 'skeleton': []}
54
+
55
+ swivelchair = {'keypoints': ['rotatingbase 1', 'rotatingbase 2', 'rotatingbase 3', 'rotatingbase 4', 'rotatingbase 5', 'rotatingbase center', 'base center', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'backrest righttop', 'backrest lefttop'], 'skeleton': []}
56
+
difpoint/src/models/XPose/transforms.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Transforms and data augmentation for both image + bbox.
4
+ """
5
+ import os
6
+ import sys
7
+ import random
8
+
9
+ import PIL
10
+ import torch
11
+ import torchvision.transforms as T
12
+ import torchvision.transforms.functional as F
13
+
14
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
15
+ from util.box_ops import box_xyxy_to_cxcywh
16
+ from util.misc import interpolate
17
+
18
+
19
def crop(image, target, region):
    """Crop ``image`` to ``region`` and update the annotations in ``target``.

    Args:
        image: Image accepted by torchvision's functional ``crop``.
        target: Annotation dict or None. When given it must contain
            "id2catname" and "caption_list"; "boxes", "masks", "keypoints"
            and the per-instance fields are updated when present.
        region: ``(top, left, height, width)`` of the crop window.

    Returns:
        ``(cropped_image, target)`` where ``target`` is a shallow copy with
        updated entries (or None if None was passed in).
    """
    cropped_image = F.crop(image, *region)
    if target is None:
        return cropped_image, target

    target = target.copy()
    top, left, height, width = region
    id2catname = target["id2catname"]
    caption_list = target["caption_list"]
    target["size"] = torch.tensor([height, width])

    # Per-instance entries that are filtered together with boxes/masks.
    fields = ["labels", "area", "iscrowd", "positive_map", "keypoints"]

    if "boxes" in target:
        limit = torch.as_tensor([width, height], dtype=torch.float32)
        shifted = target["boxes"] - torch.as_tensor([left, top, left, top])
        # Clip both corners of every box to the crop window.
        corners = torch.min(shifted.reshape(-1, 2, 2), limit).clamp(min=0)
        clipped_area = (corners[:, 1, :] - corners[:, 0, :]).prod(dim=1)
        target["boxes"] = corners.reshape(-1, 4)
        target["area"] = clipped_area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, top:top + height, left:left + width]
        fields.append("masks")

    # Drop instances whose box (or, without boxes, mask) has zero area.
    has_boxes = "boxes" in target
    if has_boxes or "masks" in target:
        # Boxes take precedence when deciding which instances to keep;
        # this is compatible with the previous implementation.
        if has_boxes:
            corners = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(corners[:, 1, :] > corners[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for name in fields:
            if name in target:
                target[name] = target[name][keep]

        # For debug and visualization only.
        debug_enabled = os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO'
        if debug_enabled and 'strings_positive' in target:
            target['strings_positive'] = [
                text for text, kept in zip(target['strings_positive'], keep) if kept
            ]

    if "keypoints" in target:
        limit = torch.as_tensor([width, height], dtype=torch.float32)
        keypoints = target["keypoints"]
        triples = keypoints.view(-1, 3)
        # Shift x/y into crop coordinates and clamp them to the window;
        # the third channel is carried over unchanged.
        xy = triples[:, :2] - torch.as_tensor([left, top])
        xy = torch.min(xy, limit).clamp(min=0)
        merged = torch.cat([xy, triples[:, 2].unsqueeze(1)], dim=1)
        target["keypoints"] = merged.view(keypoints.shape[0], keypoints.shape[1], 3)

    target["id2catname"] = id2catname
    target["caption_list"] = caption_list

    return cropped_image, target
81
+
82
+
83
# Keypoint index pairs that trade places when an image is mirrored
# horizontally, keyed by target["dataset_name"]. An empty list means the
# dataset is supported but has no pairs to swap.
_KEYPOINT_FLIP_PAIRS = {
    "coco_person": [[1, 2], [3, 4], [5, 6], [7, 8],
                    [9, 10], [11, 12], [13, 14], [15, 16]],
    "macaque": [[1, 2], [3, 4], [5, 6], [7, 8],
                [9, 10], [11, 12], [13, 14], [15, 16]],
    "animalkindom_ak_P1_animal": [[1, 2], [4, 5], [7, 8], [9, 10], [11, 12], [14, 15], [16, 17], [18, 19]],
    "animalweb_animal": [[0, 3], [1, 2], [5, 6]],
    "face": [
        [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
        [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
        [31, 35], [32, 34],
        [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
        [48, 54], [49, 53], [50, 52],
        [55, 59], [56, 58],
        [60, 64], [61, 63],
        [65, 67],
    ],
    "hand": [],
    "foot": [],
    "locust": [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]],
    "fly": [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], [16, 28], [17, 29], [30, 31]],
    "ap_36k_animal": [[0, 1], [5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]],
    "ap_10k_animal": [[0, 1], [5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]],
}


def hflip(image, target):
    """Mirror ``image`` horizontally and update the annotations in ``target``.

    Args:
        image: Image accepted by torchvision's functional ``hflip``; its
            ``size`` attribute is read as ``(width, height)``.
        target: Annotation dict or None. "boxes", "masks" and "keypoints"
            are updated when present; "dataset_name" is required whenever
            "keypoints" is present.

    Returns:
        ``(flipped_image, target)`` where ``target`` is a shallow copy with
        updated entries (or None if None was passed in).

    Raises:
        ValueError: If keypoints are present but no flip pairs are defined
            for ``target["dataset_name"]``. This case previously failed with
            an unbound-variable error after the keypoints had already been
            modified.
    """
    flipped_image = F.hflip(image)

    w, h = image.size

    if target is None:
        return flipped_image, target

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # Swap x_min/x_max, negate them and shift by the image width.
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    if "keypoints" in target:
        dataset_name = target["dataset_name"]
        if dataset_name not in _KEYPOINT_FLIP_PAIRS:
            raise ValueError(
                f"hflip: no keypoint flip pairs defined for dataset {dataset_name!r}")
        flip_pairs = _KEYPOINT_FLIP_PAIRS[dataset_name]

        # target.copy() is shallow, so work on a clone to leave the
        # caller's keypoint tensor untouched.
        keypoints = target["keypoints"].clone()
        keypoints[:, :, 0] = w - keypoints[:, :, 0] - 1
        for first, second in flip_pairs:
            # Indexing with a list yields a copy on the right-hand side,
            # so the two rows are exchanged safely.
            keypoints[:, [first, second], :] = keypoints[:, [second, first], :]
        target["keypoints"] = keypoints
    return flipped_image, target
146
+
147
+
148
def resize(image, target, size, max_size=None):
    """Resize ``image`` and rescale the annotations in ``target`` to match.

    Args:
        image: Image accepted by torchvision's functional ``resize``; its
            ``size`` attribute is read as ``(width, height)``.
        target: Annotation dict or None. "boxes", "area", "keypoints" and
            "masks" are rescaled when present and "size" is set to the new
            ``[height, width]``.
        size: Either a scalar (target length of the shorter side) or a
            ``(w, h)`` list/tuple.
        max_size: Optional cap on the longer side; only used with a scalar
            ``size``.

    Returns:
        ``(rescaled_image, target)``; ``target`` is None when None was
        passed in, otherwise a shallow copy with updated entries.
    """

    def _aspect_preserving_size(image_size, short_side, long_limit=None):
        w, h = image_size
        if long_limit is not None:
            shorter = float(min((w, h)))
            longer = float(max((w, h)))
            # Shrink the requested short side if the long side would exceed the cap.
            if longer / shorter * short_side > long_limit:
                short_side = int(round(long_limit * shorter / longer))

        # The shorter side already has the requested length: keep the size.
        if (w <= h and w == short_side) or (h <= w and h == short_side):
            return (h, w)

        if w < h:
            return (int(short_side * h / w), short_side)
        return (short_side, int(short_side * w / h))

    if isinstance(size, (list, tuple)):
        # (w, h) -> (h, w), the order torchvision expects.
        size = size[::-1]
    else:
        size = _aspect_preserving_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratio_width, ratio_height = (
        float(new) / float(old) for new, old in zip(rescaled_image.size, image.size)
    )

    target = target.copy()
    if "boxes" in target:
        box_scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = target["boxes"] * box_scale

    if "area" in target:
        target["area"] = target["area"] * (ratio_width * ratio_height)

    if "keypoints" in target:
        # Scale x and y; the third channel is left as is.
        target["keypoints"] = target["keypoints"] * torch.as_tensor([ratio_width, ratio_height, 1])

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        resized_masks = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")
        target['masks'] = resized_masks[:, 0] > 0.5

    return rescaled_image, target
211
+
212
+
213
def pad(image, target, padding):
    """Pad ``image`` and update ``target`` accordingly.

    ``padding`` is indexed as ``(padding[0], padding[1])``; both values are
    forwarded to ``F.pad`` as the last two entries of its padding tuple and to
    ``torch.nn.functional.pad`` for the masks. The original author's note says
    padding is assumed to happen only on the bottom-right corner.

    Returns ``(padded_image, target)``; ``target`` is a shallow copy with a
    refreshed ``"size"`` entry (and padded ``"masks"`` if present), or ``None``
    when no target was given.
    """
    pad_x, pad_y = padding[0], padding[1]
    padded_image = F.pad(image, (0, 0, pad_x, pad_y))
    if target is None:
        return padded_image, None

    updated = target.copy()
    # should we do something wrt the original size?
    updated["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in updated:
        updated['masks'] = torch.nn.functional.pad(updated['masks'], (0, pad_x, 0, pad_y))
    return padded_image, updated
224
+
225
+
226
class ResizeDebug(object):
    """Transform that always resizes to the fixed ``size`` given at construction."""

    def __init__(self, size):
        # Stored untouched; interpreted by ``resize`` at call time.
        self.size = size

    def __call__(self, img, target):
        """Return ``resize(img, target, self.size)``."""
        fixed_size = self.size
        return resize(img, target, fixed_size)
232
+
233
+
234
class RandomCrop(object):
    """Crop a randomly positioned region of a fixed ``size`` from image and target."""

    def __init__(self, size):
        # Passed as the output size to ``T.RandomCrop.get_params``.
        self.size = size

    def __call__(self, img, target):
        """Sample a crop region and apply it through ``crop``."""
        crop_region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, crop_region)
241
+
242
+
243
class RandomSizeCrop(object):
    """Crop a random region whose sides are drawn from ``[min_size, max_size]``.

    Each side is additionally capped by the corresponding image dimension.
    With ``respect_boxes=True`` the crop is re-sampled (at most 10 attempts)
    until the cropped target still holds as many boxes as the input target;
    the last attempt is returned regardless.
    """

    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
        # respect_boxes: True  -> retry until the crop keeps all boxes
        #                False -> tolerate boxes being filtered out by the crop
        self.min_size = min_size
        self.max_size = max_size
        self.respect_boxes = respect_boxes

    def __call__(self, img: PIL.Image.Image, target: dict):
        """Return ``(cropped_img, cropped_target)`` for a randomly sized crop."""
        has_boxes = target is not None and "boxes" in target
        init_boxes = len(target["boxes"]) if has_boxes else 0
        # Retrying only makes sense when there are boxes whose count can be
        # compared. Without this guard a box-less target raised KeyError on
        # ``result_target["boxes"]`` and a ``None`` target burned through all
        # attempts only to return the last crop anyway.
        must_check = self.respect_boxes and has_boxes
        max_patience = 10
        for _ in range(max_patience):
            w = random.randint(self.min_size, min(img.width, self.max_size))
            h = random.randint(self.min_size, min(img.height, self.max_size))
            region = T.RandomCrop.get_params(img, [h, w])
            result_img, result_target = crop(img, target, region)
            if not must_check or len(result_target["boxes"]) == init_boxes:
                return result_img, result_target
        # Patience exhausted: fall back to the most recent crop.
        return result_img, result_target
263
+
264
+
265
class CenterCrop(object):
    """Crop the central ``(height, width)`` region of an image and its target."""

    def __init__(self, size):
        # ``size`` is unpacked as (crop_height, crop_width) at call time.
        self.size = size

    def __call__(self, img, target):
        """Compute the centred region and delegate to ``crop``."""
        img_w, img_h = img.size
        out_h, out_w = self.size
        # Offsets are rounded so the crop is centred to the nearest pixel.
        top = int(round((img_h - out_h) / 2.))
        left = int(round((img_w - out_w) / 2.))
        region = (top, left, out_h, out_w)
        return crop(img, target, region)
275
+
276
+
277
class RandomHorizontalFlip(object):
    """Apply ``hflip`` to image and target with probability ``p``."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        """Return the flipped pair when the random draw is below ``p``, else the inputs."""
        should_flip = random.random() < self.p
        if not should_flip:
            return img, target
        return hflip(img, target)
285
+
286
+
287
class RandomResize(object):
    """Resize to a size picked uniformly at random from ``sizes``."""

    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        """Pick one entry of ``self.sizes`` and forward it to ``resize``."""
        chosen = random.choice(self.sizes)
        return resize(img, target, chosen, self.max_size)
296
+
297
+
298
class RandomPad(object):
    """Pad by two independent random amounts drawn from ``[0, max_pad]``."""

    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        """Draw the x amount, then the y amount, and delegate to ``pad``."""
        limit = self.max_pad
        # Order of the two draws is kept (x first) so seeded runs are unchanged.
        pad_x = random.randint(0, limit)
        pad_y = random.randint(0, limit)
        return pad(img, target, (pad_x, pad_y))
306
+
307
+
308
class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        """Run exactly one of the two transforms on ``(img, target)``."""
        pick_first = random.random() < self.p
        chosen = self.transforms1 if pick_first else self.transforms2
        return chosen(img, target)
322
+
323
+
324
class ToTensor(object):
    """Convert the image with ``F.to_tensor``; the target is passed through untouched."""

    def __call__(self, img, target):
        tensor_img = F.to_tensor(img)
        return tensor_img, target
327
+
328
+
329
class RandomErasing(object):
    """Pair-style wrapper around ``T.RandomErasing``; the target is left unchanged."""

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded verbatim to the wrapped eraser.
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        erased = self.eraser(img)
        return erased, target
336
+
337
+
338
class Normalize(object):
    """Normalize the image and convert annotations to relative coordinates.

    The image goes through ``F.normalize`` with the stored ``mean``/``std``.
    When a target is given, a shallow copy is returned in which:

    * ``"boxes"`` are converted with ``box_xyxy_to_cxcywh`` and divided by
      ``[w, h, w, h]``;
    * ``"area"`` is divided by ``w * h``;
    * ``"keypoints"`` (indexed as ``[:, :, 0:2]`` coordinates and ``[:, :, 2]``
      flag) are flattened per row to ``[x1, y1, ..., flag1, flag2, ...]``,
      coordinates divided by ``(w, h)``, flag value 2 mapped to 1, and both
      parts zero-padded to ``MAX_KEYPOINTS`` slots. ``"valid_kpt_num"``
      records the number of keypoints before padding.
    """

    # Number of keypoint slots every target is padded to.
    # NOTE(review): presumably the model's fixed keypoint capacity -- confirm.
    MAX_KEYPOINTS = 68

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = box_xyxy_to_cxcywh(target["boxes"])
            target["boxes"] = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)

        if "area" in target:
            area = target["area"]
            target["area"] = area / (torch.tensor(w, dtype=torch.float32)
                                     * torch.tensor(h, dtype=torch.float32))

        if "keypoints" in target:
            keypoints = target["keypoints"]
            # Clone before editing: the slice is a view, and ``target.copy()``
            # is shallow, so writing into it would silently modify the
            # caller's original keypoint tensor.
            V = keypoints[:, :, 2].clone()
            V[V == 2] = 1
            num_kpts = V.shape[-1]
            Z = keypoints[:, :, :2].contiguous().view(-1, 2 * num_kpts)
            Z = Z / torch.tensor([w, h] * num_kpts, dtype=torch.float32)
            target["valid_kpt_num"] = V.shape[1]
            # Zero-pad coordinates and flags up to the fixed slot count.
            Z_pad = torch.zeros(Z.shape[0], self.MAX_KEYPOINTS * 2 - Z.shape[1])
            V_pad = torch.zeros(V.shape[0], self.MAX_KEYPOINTS - V.shape[1])
            V = torch.cat([V, V_pad], dim=1)
            Z = torch.cat([Z, Z_pad], dim=1)
            target["keypoints"] = torch.cat([Z, V], dim=1)

        return image, target
377
+
378
+
379
class Compose(object):
    """Chain several ``(image, target) -> (image, target)`` transforms."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        """Feed the pair through every transform in order and return the result."""
        pair = (image, target)
        for transform in self.transforms:
            pair = transform(*pair)
        image, target = pair
        return image, target

    def __repr__(self):
        # One line per transform, wrapped in "ClassName(" ... ")".
        pieces = [self.__class__.__name__ + "("]
        for transform in self.transforms:
            pieces.append("\n")
            pieces.append(" {0}".format(transform))
        pieces.append("\n)")
        return "".join(pieces)
difpoint/src/models/XPose/util/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Time : 2024/8/5 21:58
3
+ # @Author : shaoguowen
4
+ # @Email : [email protected]
5
+ # @Project : FasterLivePortrait
6
+ # @FileName: __init__.py
difpoint/src/models/XPose/util/addict.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+
3
+
4
class Dict(dict):
    """``dict`` subclass whose items can also be read and written as attributes.

    Nested plain dicts passed to the constructor are converted to this class
    (see ``_hook``). Reading a missing key creates a detached child ``Dict``
    that attaches itself to its parent on first assignment, which is what
    makes ``d.a.b = 1`` work on an empty instance. ``freeze`` disables both
    the auto-creation and the addition of new keys.

    Bookkeeping values are stored as instance attributes named ``'__parent'``,
    ``'__key'`` and ``'__frozen'`` via ``object.__setattr__`` so that they do
    not show up as dictionary items.
    """

    def __init__(__self, *args, **kwargs):
        """Build the mapping from dicts, ``(key, value)`` tuples, iterables of pairs and kwargs."""
        # The reserved keyword arguments are popped first so they never become items.
        object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))
        object.__setattr__(__self, '__key', kwargs.pop('__key', None))
        object.__setattr__(__self, '__frozen', False)
        for arg in args:
            if not arg:
                # Empty / falsy positional arguments contribute nothing.
                continue
            elif isinstance(arg, dict):
                for key, val in arg.items():
                    __self[key] = __self._hook(val)
            elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)):
                # A single (key, value) tuple.
                __self[arg[0]] = __self._hook(arg[1])
            else:
                # Any other iterable is expected to yield (key, value) pairs.
                for key, val in iter(arg):
                    __self[key] = __self._hook(val)

        for key, val in kwargs.items():
            __self[key] = __self._hook(val)

    def __setattr__(self, name, value):
        """Store ``value`` as item ``name`` unless ``name`` is an attribute of the class."""
        if hasattr(self.__class__, name):
            # Protects methods such as ``keys``/``update`` from being shadowed.
            raise AttributeError("'Dict' object attribute "
                                 "'{0}' is read-only".format(name))
        else:
            self[name] = value

    def __setitem__(self, name, value):
        """Set an item; refuse new keys when frozen; attach to a pending parent."""
        # ``hasattr`` guards the case where '__frozen' has not been set on the
        # instance (presumably when items are set before ``__init__`` ran,
        # e.g. while restoring a copy -- confirm).
        isFrozen = (hasattr(self, '__frozen') and
                    object.__getattribute__(self, '__frozen'))
        if isFrozen and name not in super(Dict, self).keys():
            raise KeyError(name)
        super(Dict, self).__setitem__(name, value)
        try:
            p = object.__getattribute__(self, '__parent')
            key = object.__getattribute__(self, '__key')
        except AttributeError:
            p = None
            key = None
        if p is not None:
            # This instance was created by the parent's ``__missing__``: now
            # that it holds data, register it in the parent and drop the link
            # attributes so this happens only once.
            p[key] = self
            object.__delattr__(self, '__parent')
            object.__delattr__(self, '__key')

    def __add__(self, other):
        """Return ``other`` when this mapping is empty; otherwise raise ``TypeError``."""
        if not self.keys():
            return other
        else:
            self_type = type(self).__name__
            other_type = type(other).__name__
            msg = "unsupported operand type(s) for +: '{}' and '{}'"
            raise TypeError(msg.format(self_type, other_type))

    @classmethod
    def _hook(cls, item):
        """Recursively convert dicts (also inside lists/tuples) to ``cls``."""
        if isinstance(item, dict):
            return cls(item)
        elif isinstance(item, (list, tuple)):
            # Rebuild the container with the same type, hooking each element.
            return type(item)(cls._hook(elem) for elem in item)
        return item

    def __getattr__(self, item):
        """Attribute access falls back to item access (and thus ``__missing__``)."""
        return self.__getitem__(item)

    def __missing__(self, name):
        """Return a fresh child linked to ``self`` under ``name``; ``KeyError`` if frozen."""
        if object.__getattribute__(self, '__frozen'):
            raise KeyError(name)
        # The child is not stored yet; see ``__setitem__`` for the attachment.
        return self.__class__(__parent=self, __key=name)

    def __delattr__(self, name):
        """Delete item ``name``."""
        del self[name]

    def to_dict(self):
        """Return a plain ``dict`` copy, converting nested instances recursively."""
        base = {}
        for key, value in self.items():
            if isinstance(value, type(self)):
                base[key] = value.to_dict()
            elif isinstance(value, (list, tuple)):
                # Only direct elements of the sequence are converted.
                base[key] = type(value)(
                    item.to_dict() if isinstance(item, type(self)) else
                    item for item in value)
            else:
                base[key] = value
        return base

    def copy(self):
        """Return ``copy.copy(self)``."""
        return copy.copy(self)

    def deepcopy(self):
        """Return ``copy.deepcopy(self)``."""
        return copy.deepcopy(self)

    def __deepcopy__(self, memo):
        """Deep-copy keys and values into a new instance of the same class."""
        other = self.__class__()
        # Registered before recursing so self-references resolve to ``other``.
        memo[id(self)] = other
        for key, value in self.items():
            other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)
        return other

    def update(self, *args, **kwargs):
        """Merge another mapping/kwargs into this one, recursing where both sides are dicts."""
        other = {}
        if args:
            if len(args) > 1:
                raise TypeError()
            other.update(args[0])
        other.update(kwargs)
        for k, v in other.items():
            # Overwrite unless BOTH the existing and the incoming value are
            # dicts, in which case the existing one is updated in place.
            if ((k not in self) or
                (not isinstance(self[k], dict)) or
                (not isinstance(v, dict))):
                self[k] = v
            else:
                self[k].update(v)

    def __getnewargs__(self):
        """Pickle/copy support: the items as a tuple of ``(key, value)`` pairs."""
        return tuple(self.items())

    def __getstate__(self):
        """Pickle/copy support: the mapping itself is its own state."""
        return self

    def __setstate__(self, state):
        """Pickle/copy support: restore by merging ``state`` via ``update``."""
        self.update(state)

    def __or__(self, other):
        """``self | other``: new ``Dict`` holding ``self`` updated with ``other``."""
        if not isinstance(other, (Dict, dict)):
            return NotImplemented
        new = Dict(self)
        new.update(other)
        return new

    def __ror__(self, other):
        """``other | self``: new ``Dict`` holding ``other`` updated with ``self``."""
        if not isinstance(other, (Dict, dict)):
            return NotImplemented
        new = Dict(other)
        new.update(self)
        return new

    def __ior__(self, other):
        """``self |= other``: in-place ``update``."""
        self.update(other)
        return self

    def setdefault(self, key, default=None):
        """Return ``self[key]`` if present, else store and return ``default``."""
        if key in self:
            return self[key]
        else:
            self[key] = default
            return default

    def freeze(self, shouldFreeze=True):
        """Set the frozen flag on this instance and on directly nested ``Dict`` values."""
        object.__setattr__(self, '__frozen', shouldFreeze)
        for key, val in self.items():
            # Nested instances recurse themselves; Dicts inside lists/tuples
            # are not visited by this loop.
            if isinstance(val, Dict):
                val.freeze(shouldFreeze)

    def unfreeze(self):
        """Shortcut for ``freeze(False)``."""
        self.freeze(False)
+ self.freeze(False)
difpoint/src/models/XPose/util/box_ops.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Utilities for bounding box manipulation and GIoU.
4
+ """
5
+ import torch, os
6
+ from torchvision.ops.boxes import box_area
7
+
8
+
9
def box_cxcywh_to_xyxy(x):
    """Convert boxes from ``(cx, cy, w, h)`` to ``(x0, y0, x1, y1)`` along the last dim."""
    cx, cy, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
    return torch.stack(corners, dim=-1)
14
+
15
+
16
def box_xyxy_to_cxcywh(x):
    """Convert boxes from ``(x0, y0, x1, y1)`` to ``(cx, cy, w, h)`` along the last dim."""
    left, top, right, bottom = x.unbind(-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return torch.stack((center_x, center_y, width, height), dim=-1)
21
+
22
+
23
+ # modified from torchvision to also return the union
24
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of ``(x0, y0, x1, y1)`` boxes.

    Returns ``(iou, union)``, both of shape ``[N, M]`` with ``N = len(boxes1)``
    and ``M = len(boxes2)``. A small constant is added to the denominator so
    zero-area unions do not divide by zero.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Corners of the intersection rectangle for every (i, j) pair.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Negative extents mean "no overlap" and are clamped to zero.
    extent = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = extent[:, :, 0] * extent[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    return inter / (union + 1e-6), union
39
+
40
+
41
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # Degenerate boxes (x1 < x0 or y1 < y0) would give inf / nan results,
    # so both inputs are checked up front.
    for boxes in (boxes1, boxes2):
        assert (boxes[:, 2:] >= boxes[:, :2]).all()

    iou, union = box_iou(boxes1, boxes2)

    # Smallest rectangle enclosing each (i, j) pair of boxes.
    enclose_lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    enclose_wh = (enclose_rb - enclose_lt).clamp(min=0)  # [N,M,2]
    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]

    penalty = (enclose_area - union) / (enclose_area + 1e-6)
    return iou - penalty
65
+
66
+
67
+
68
+ # modified from torchvision to also return the union
69
def box_iou_pairwise(boxes1, boxes2):
    """Element-wise IoU between matching rows of two ``[N, 4]`` xyxy box tensors.

    Unlike ``box_iou`` this compares ``boxes1[i]`` only with ``boxes2[i]``.

    Returns:
        ``(iou, union)``, each of shape ``[N]``.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]

    wh = (rb - lt).clamp(min=0)  # [N,2]
    inter = wh[:, 0] * wh[:, 1]  # [N]

    union = area1 + area2 - inter

    # Same epsilon as ``box_iou``: without it a pair of zero-area boxes gives
    # 0 / 0 = NaN, which then propagates into any loss built on this value.
    iou = inter / (union + 1e-6)
    return iou, union
83
+
84
+
85
def generalized_box_iou_pairwise(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    Input:
        - boxes1, boxes2: N,4 in [x0, y0, x1, y1] format
    Output:
        - giou: N (one value per matching pair of rows)
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    assert boxes1.shape == boxes2.shape
    iou, union = box_iou_pairwise(boxes1, boxes2)  # both [N]

    # Smallest rectangle enclosing each pair of boxes.
    lt = torch.min(boxes1[:, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,2]
    area = wh[:, 0] * wh[:, 1]

    # Same epsilon as ``generalized_box_iou``: two identical zero-area boxes
    # have an enclosing area of 0 and would otherwise produce NaN.
    return iou - (area - union) / (area + 1e-6)
108
+
109
def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks

    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.

    Returns a [N, 4] tensors, with the boxes in xyxy format
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # Coordinate vectors live on the same device as ``masks`` (they used to be
    # created on the CPU, which fails for masks on another device) and are
    # shaped for broadcasting, so no full H x W grid or ``torch.meshgrid``
    # call is needed.
    y = torch.arange(0, h, dtype=torch.float, device=masks.device).view(h, 1)
    x = torch.arange(0, w, dtype=torch.float, device=masks.device).view(1, w)

    foreground = masks.bool()

    # Max over "coordinate where mask is set, 0 elsewhere"; min over
    # "coordinate where mask is set, huge value elsewhere".
    x_mask = masks * x
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~foreground, 1e8).flatten(1).min(-1)[0]

    y_mask = masks * y
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~foreground, 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)
134
+
135
if __name__ == '__main__':
    # Manual smoke test: pairwise IoU of random tensors with shapes 5x4 and 3x4.
    # The results are printed instead of dropping into an ``ipdb`` breakpoint,
    # so running this module does not require a third-party debugger.
    x = torch.rand(5, 4)
    y = torch.rand(3, 4)
    iou, union = box_iou(x, y)
    print(iou)
    print(union)
difpoint/src/models/XPose/util/config.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==========================================================
2
+ # Modified from mmcv
3
+ # ==========================================================
4
+ import sys
5
+ import os.path as osp
6
+ import ast
7
+ import tempfile
8
+ import shutil
9
+ from importlib import import_module
10
+ from argparse import Action
11
+
12
+ from .addict import Dict
13
+ import os
14
+
15
+ BASE_KEY = '_base_'
16
+ DELETE_KEY = '_delete_'
17
+ RESERVED_KEYS = ['filename', 'text', 'pretty_text', 'get', 'dump', 'merge_from_dict']
18
+
19
+
20
def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
    """Raise ``FileNotFoundError`` unless ``filename`` is an existing regular file.

    ``msg_tmpl`` is formatted with ``filename`` to build the error message.
    """
    if osp.isfile(filename):
        return
    raise FileNotFoundError(msg_tmpl.format(filename))
23
+
24
class ConfigDict(Dict):
    """``Dict`` variant that raises on missing keys instead of auto-creating them.

    Missing items raise ``KeyError``; missing attributes raise
    ``AttributeError``.
    """

    def __missing__(self, name):
        # Disable the parent's "create a child on first access" behaviour.
        raise KeyError(name)

    def __getattr__(self, name):
        try:
            return super(ConfigDict, self).__getattr__(name)
        except KeyError:
            # Built here but raised after the handler so the AttributeError is
            # not chained to the KeyError.
            error = AttributeError(f"'{self.__class__.__name__}' object has no "
                                   f"attribute '{name}'")
        raise error
40
+
41
+
42
class Config(object):
    """Attribute-style access to a configuration loaded from a dict or a file.

    Modified from ``mmcv.utils.config``. ``fromfile`` reads ``.py`` files by
    importing them, and ``.yml``/``.yaml``/``.json`` files through
    ``.slio.slload``. A file may name parent files under the ``_base_`` key;
    its own values are merged on top of them.

    Example:
        >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1])))
        >>> cfg.a
        1
        >>> cfg.b
        {'b1': [0, 1]}
        >>> cfg.b.b1
        [0, 1]
        >>> cfg = Config.fromfile('tests/data/config/a.py')
        >>> cfg.filename
        "/home/kchen/projects/mmcv/tests/data/config/a.py"
        >>> cfg.item4
        'test'
        >>> cfg
        "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: "
        "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}"
    """

    @staticmethod
    def _validate_py_syntax(filename):
        """Raise ``SyntaxError`` naming ``filename`` if it is not valid Python."""
        with open(filename) as f:
            content = f.read()
        try:
            ast.parse(content)
        except SyntaxError:
            raise SyntaxError('There are syntax errors in config '
                              f'file {filename}')

    @staticmethod
    def _file2dict(filename):
        """Load ``filename`` (and its ``_base_`` files) into ``(cfg_dict, cfg_text)``.

        Raises:
            FileNotFoundError: the file does not exist.
            SyntaxError: a ``.py`` config does not parse.
            IOError: unsupported file extension.
            KeyError: two base files define the same top-level key.
        """
        filename = osp.abspath(osp.expanduser(filename))
        check_file_exist(filename)
        if filename.lower().endswith('.py'):
            # Fail on bad syntax before any import state is touched.
            Config._validate_py_syntax(filename)
            with tempfile.TemporaryDirectory() as temp_config_dir:
                # mkstemp is used instead of NamedTemporaryFile; the file
                # descriptor is closed immediately because only the unique
                # name is needed.
                fd, temp_path = tempfile.mkstemp(dir=temp_config_dir, suffix='.py')
                os.close(fd)
                temp_config_name = os.path.basename(temp_path)
                shutil.copyfile(filename, temp_path)
                temp_module_name = os.path.splitext(temp_config_name)[0]
                sys.path.insert(0, temp_config_dir)
                try:
                    mod = import_module(temp_module_name)
                    cfg_dict = {
                        name: value
                        for name, value in mod.__dict__.items()
                        if not name.startswith('__')
                    }
                finally:
                    # Always undo the import-state changes, even when the
                    # config raises while being imported; otherwise a deleted
                    # temp dir would stay at the front of sys.path.
                    try:
                        sys.path.remove(temp_config_dir)
                    except ValueError:
                        pass
                    # delete imported module
                    sys.modules.pop(temp_module_name, None)
        elif filename.lower().endswith(('.yml', '.yaml', '.json')):
            from .slio import slload
            cfg_dict = slload(filename)
        else:
            raise IOError('Only py/yml/yaml/json type are supported now!')

        cfg_text = filename + '\n'
        with open(filename, 'r') as f:
            cfg_text += f.read()

        # parse the base file
        if BASE_KEY in cfg_dict:
            cfg_dir = osp.dirname(filename)
            base_filename = cfg_dict.pop(BASE_KEY)
            base_filename = base_filename if isinstance(
                base_filename, list) else [base_filename]

            cfg_dict_list = list()
            cfg_text_list = list()
            for f in base_filename:
                _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f))
                cfg_dict_list.append(_cfg_dict)
                cfg_text_list.append(_cfg_text)

            base_cfg_dict = dict()
            for c in cfg_dict_list:
                if len(base_cfg_dict.keys() & c.keys()) > 0:
                    raise KeyError('Duplicate key is not allowed among bases')
                    # TODO Allow the duplicate key while warnning user
                base_cfg_dict.update(c)

            # Values of this file win over the values of its bases.
            base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict)
            cfg_dict = base_cfg_dict

            # merge cfg_text
            cfg_text_list.append(cfg_text)
            cfg_text = '\n'.join(cfg_text_list)

        return cfg_dict, cfg_text

    @staticmethod
    def _merge_a_into_b(a, b):
        """Return a copy of ``b`` with ``a`` merged on top of it.

        Values in ``a`` overwrite those in ``b``; nested dicts are merged
        recursively unless the nested dict in ``a`` carries a truthy
        ``_delete_`` entry, in which case it replaces the value in ``b``.
        When ``b`` is a list, the keys of ``a`` are used as integer indices.
        ``b`` is shallow-copied first, but note that the ``_delete_`` marker is
        popped from the nested dicts of ``a`` as a side effect.

        Args:
            a: overriding value (returned unchanged if it is not a dict).
            b (dict | list): base container.

        Returns:
            The merged copy of ``b`` (or ``a`` itself when ``a`` is not a dict).

        Raises:
            TypeError: a dict in ``a`` would be merged into a non-dict,
                non-list value of ``b``, or a list index is not int-like.
        """
        if not isinstance(a, dict):
            return a

        b = b.copy()
        for k, v in a.items():
            if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False):

                if not isinstance(b[k], dict) and not isinstance(b[k], list):
                    raise TypeError(
                        f'{k}={v} in child config cannot inherit from base '
                        f'because {k} is a dict in the child config but is of '
                        f'type {type(b[k])} in base config. You may set '
                        f'`{DELETE_KEY}=True` to ignore the base config')
                b[k] = Config._merge_a_into_b(v, b[k])
            elif isinstance(b, list):
                try:
                    _ = int(k)
                except (TypeError, ValueError):
                    # Only conversion failures are translated; anything else
                    # (e.g. KeyboardInterrupt) must propagate untouched.
                    raise TypeError(
                        f'b is a list, '
                        f'index {k} should be an int when input but {type(k)}'
                    )
                b[int(k)] = Config._merge_a_into_b(v, b[int(k)])
            else:
                b[k] = v

        return b

    @staticmethod
    def fromfile(filename):
        """Build a ``Config`` from a config file (see ``_file2dict``)."""
        cfg_dict, cfg_text = Config._file2dict(filename)
        return Config(cfg_dict, cfg_text=cfg_text, filename=filename)

    def __init__(self, cfg_dict=None, cfg_text=None, filename=None):
        """Wrap ``cfg_dict`` in a ``ConfigDict``.

        Raises:
            TypeError: ``cfg_dict`` is neither ``None`` nor a dict.
            KeyError: ``cfg_dict`` uses a name listed in ``RESERVED_KEYS``.
        """
        if cfg_dict is None:
            cfg_dict = dict()
        elif not isinstance(cfg_dict, dict):
            raise TypeError('cfg_dict must be a dict, but '
                            f'got {type(cfg_dict)}')
        for key in cfg_dict:
            if key in RESERVED_KEYS:
                raise KeyError(f'{key} is reserved for config file')

        # The base-class setter is used because ``__setattr__`` below routes
        # every assignment into the config dict itself.
        super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict))
        super(Config, self).__setattr__('_filename', filename)
        if cfg_text:
            text = cfg_text
        elif filename:
            with open(filename, 'r') as f:
                text = f.read()
        else:
            text = ''
        super(Config, self).__setattr__('_text', text)

    @property
    def filename(self):
        """File name given at construction time, or ``None``."""
        return self._filename

    @property
    def text(self):
        """Source text recorded for this config (may be empty)."""
        return self._text

    @property
    def pretty_text(self):
        """Render the config as ``key=value`` / ``dict(...)`` style text."""

        indent = 4

        def _indent(s_, num_spaces):
            # Indent every line except the first.
            s = s_.split('\n')
            if len(s) == 1:
                return s_
            first = s.pop(0)
            s = [(num_spaces * ' ') + line for line in s]
            s = '\n'.join(s)
            s = first + '\n' + s
            return s

        def _format_basic_types(k, v, use_mapping=False):
            if isinstance(v, str):
                v_str = f"'{v}'"
            else:
                v_str = str(v)

            if use_mapping:
                k_str = f"'{k}'" if isinstance(k, str) else str(k)
                attr_str = f'{k_str}: {v_str}'
            else:
                attr_str = f'{str(k)}={v_str}'
            attr_str = _indent(attr_str, indent)

            return attr_str

        def _format_list(k, v, use_mapping=False):
            # check if all items in the list are dict
            if all(isinstance(_, dict) for _ in v):
                v_str = '[\n'
                v_str += '\n'.join(
                    f'dict({_indent(_format_dict(v_), indent)}),'
                    for v_ in v).rstrip(',')
                if use_mapping:
                    k_str = f"'{k}'" if isinstance(k, str) else str(k)
                    attr_str = f'{k_str}: {v_str}'
                else:
                    attr_str = f'{str(k)}={v_str}'
                attr_str = _indent(attr_str, indent) + ']'
            else:
                attr_str = _format_basic_types(k, v, use_mapping)
            return attr_str

        def _contain_invalid_identifier(dict_str):
            # True when some key cannot be written as ``key=value``.
            contain_invalid_identifier = False
            for key_name in dict_str:
                contain_invalid_identifier |= \
                    (not str(key_name).isidentifier())
            return contain_invalid_identifier

        def _format_dict(input_dict, outest_level=False):
            r = ''
            s = []

            # Fall back to ``{'key': value}`` syntax when keys are not identifiers.
            use_mapping = _contain_invalid_identifier(input_dict)
            if use_mapping:
                r += '{'
            for idx, (k, v) in enumerate(input_dict.items()):
                is_last = idx >= len(input_dict) - 1
                end = '' if outest_level or is_last else ','
                if isinstance(v, dict):
                    v_str = '\n' + _format_dict(v)
                    if use_mapping:
                        k_str = f"'{k}'" if isinstance(k, str) else str(k)
                        attr_str = f'{k_str}: dict({v_str}'
                    else:
                        attr_str = f'{str(k)}=dict({v_str}'
                    attr_str = _indent(attr_str, indent) + ')' + end
                elif isinstance(v, list):
                    attr_str = _format_list(k, v, use_mapping) + end
                else:
                    attr_str = _format_basic_types(k, v, use_mapping) + end

                s.append(attr_str)
            r += '\n'.join(s)
            if use_mapping:
                r += '}'
            return r

        cfg_dict = self._cfg_dict.to_dict()
        text = _format_dict(cfg_dict, outest_level=True)
        return text

    def __repr__(self):
        return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}'

    def __len__(self):
        return len(self._cfg_dict)

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped config dict."""
        # Fetch ``_cfg_dict`` without going through ``__getattr__`` again: on
        # an instance that has not been initialised yet (e.g. while it is
        # being unpickled) ``self._cfg_dict`` would recurse until
        # RecursionError instead of reporting a missing attribute.
        try:
            cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
        except AttributeError:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no "
                f"attribute '{name}'") from None
        return getattr(cfg_dict, name)

    def __getitem__(self, name):
        return self._cfg_dict.__getitem__(name)

    def __setattr__(self, name, value):
        if isinstance(value, dict):
            value = ConfigDict(value)
        self._cfg_dict.__setattr__(name, value)

    def __setitem__(self, name, value):
        if isinstance(value, dict):
            value = ConfigDict(value)
        self._cfg_dict.__setitem__(name, value)

    def __iter__(self):
        return iter(self._cfg_dict)

    def dump(self, file=None):
        """Return ``pretty_text``, or write it to ``file`` when a path is given."""
        if file is None:
            return self.pretty_text
        else:
            with open(file, 'w') as f:
                f.write(self.pretty_text)

    def merge_from_dict(self, options):
        """Merge list into cfg_dict

        Merge the dict parsed by MultipleKVAction into this cfg.

        Examples:
            >>> options = {'model.backbone.depth': 50,
            ...            'model.backbone.with_cp':True}
            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))
            >>> cfg.merge_from_dict(options)
            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
            >>> assert cfg_dict == dict(
            ...     model=dict(backbone=dict(depth=50, with_cp=True)))

        Args:
            options (dict): dict of configs to merge from.
        """
        option_cfg_dict = {}
        for full_key, v in options.items():
            # Expand 'a.b.c' into nested dicts {'a': {'b': {'c': v}}}.
            d = option_cfg_dict
            key_list = full_key.split('.')
            for subkey in key_list[:-1]:
                d.setdefault(subkey, ConfigDict())
                d = d[subkey]
            subkey = key_list[-1]
            d[subkey] = v

        cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
        super(Config, self).__setattr__(
            '_cfg_dict', Config._merge_a_into_b(option_cfg_dict, cfg_dict))

    # for multiprocess
    def __getstate__(self):
        """Pickle state: ``(cfg_dict, filename, text)``.

        Defined explicitly so the state does not depend on the interpreter:
        Python 3.11+ supplies a default ``object.__getstate__`` returning the
        instance ``__dict__``, which the old ``__setstate__`` then fed to
        ``__init__`` as if it were the config content.
        """
        return (self._cfg_dict, self._filename, self._text)

    def __setstate__(self, state):
        """Restore from ``__getstate__`` output, also accepting older state shapes."""
        if isinstance(state, tuple):
            cfg_dict, filename, text = state
        elif isinstance(state, dict) and '_cfg_dict' in state:
            # Instance ``__dict__`` produced by the default __getstate__.
            cfg_dict = state['_cfg_dict']
            filename = state.get('_filename')
            text = state.get('_text', '')
        else:
            # Legacy state: the bare config mapping.
            self.__init__(state)
            return
        setter = super(Config, self).__setattr__
        setter('_cfg_dict', cfg_dict)
        setter('_filename', filename)
        setter('_text', text)

    def __copy__(self):
        """``copy.copy`` support; same result as :meth:`copy`."""
        return self.copy()

    def __deepcopy__(self, memo):
        """``copy.deepcopy`` support.

        Without this hook the lookup of ``__deepcopy__`` falls through
        ``__getattr__`` to the wrapped dict, so ``copy.deepcopy(cfg)`` returned
        a bare ``ConfigDict`` rather than a ``Config``.
        """
        other = Config(self._cfg_dict.__deepcopy__(memo))
        memo[id(self)] = other
        return other

    def copy(self):
        """Return a new ``Config`` built from a shallow copy of the config dict."""
        return Config(self._cfg_dict.copy())

    def deepcopy(self):
        """Return a new ``Config`` built from a deep copy of the config dict."""
        return Config(self._cfg_dict.deepcopy())
391
+
392
+
393
class DictAction(Action):
    """
    argparse action to split an argument into KEY=VALUE form
    on the first = and append to a dictionary. List options should
    be passed as comma separated values, i.e KEY=V1,V2,V3
    """

    @staticmethod
    def _parse_int_float_bool(val):
        """Convert ``val`` to int, float, bool or ``None`` when it looks like one; else return it unchanged."""
        # Numeric interpretations are tried first, int before float.
        for cast in (int, float):
            try:
                return cast(val)
            except ValueError:
                continue
        lowered = val.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        if lowered in ('none', 'null'):
            return None
        return val

    def __call__(self, parser, namespace, values, option_string=None):
        """Parse every ``KEY=VALUE`` entry and store the resulting dict on ``namespace``."""
        parsed = {}
        for item in values:
            key, raw = item.split('=', maxsplit=1)
            converted = [self._parse_int_float_bool(piece) for piece in raw.split(',')]
            # A single value is stored bare, several values as a list.
            parsed[key] = converted[0] if len(converted) == 1 else converted
        setattr(namespace, self.dest, parsed)
425
+
difpoint/src/models/XPose/util/keypoint_ops.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, os
2
+
3
def keypoint_xyxyzz_to_xyzxyz(keypoints: torch.Tensor):
    """Reorder the last dimension from ``x1 y1 x2 y2 ... z1 z2 ...`` to ``x1 y1 z1 x2 y2 z2 ...``.

    Args:
        keypoints (torch.Tensor): ..., 3 * num_points (e.g. 51 for 17 points);
            the first two thirds hold interleaved x/y values, the last third
            one extra value per point.

    Returns:
        torch.Tensor: new tensor of the same shape with per-point triplets.
    """
    num_points = keypoints.shape[-1] // 3
    split = 2 * num_points
    coords = keypoints[..., :split]
    extra = keypoints[..., split:]

    out = torch.zeros_like(keypoints)
    # Channel c of every triplet is filled from x, y and the trailing block.
    channels = (coords[..., 0::2], coords[..., 1::2], extra)
    for offset, channel in enumerate(channels):
        out[..., offset::3] = channel
    return out
17
+
18
def keypoint_xyzxyz_to_xyxyzz(keypoints: torch.Tensor):
    """Reorder the last dimension from ``x1 y1 z1 x2 y2 z2 ...`` to ``x1 y1 x2 y2 ... z1 z2 ...``.

    Args:
        keypoints (torch.Tensor): ..., 3 * num_points (e.g. 51 for 17 points),
            stored as per-point triplets.

    Returns:
        torch.Tensor: new tensor of the same shape with all interleaved x/y
        values first, followed by the third value of every point.
    """
    num_points = keypoints.shape[-1] // 3
    split = 2 * num_points

    out = torch.zeros_like(keypoints)
    # x and y go to the even / odd slots of the leading block ...
    for offset in (0, 1):
        out[..., offset:split:2] = keypoints[..., offset::3]
    # ... and the third component of each triplet fills the trailing block.
    out[..., split:] = keypoints[..., 2::3]
    return out
difpoint/src/models/XPose/util/misc.py ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Misc functions, including distributed helpers.
4
+
5
+ Mostly copy-paste from torchvision references.
6
+ """
7
+ import functools
8
+ import io
9
+ import os
10
+ import random
11
+ import subprocess
12
+ import time
13
+ from collections import OrderedDict, defaultdict, deque
14
+ import datetime
15
+ import pickle
16
+ from typing import Optional, List
17
+
18
+ import json, time
19
+ import numpy as np
20
+ import torch
21
+ import torch.distributed as dist
22
+ from torch import Tensor
23
+
24
+ import colorsys
25
+
26
+ # needed due to empty tensor bug in pytorch and torchvision 0.5
27
+ import torchvision
28
+ __torchvision_need_compat_flag = float(torchvision.__version__.split('.')[1]) < 7
29
+ if __torchvision_need_compat_flag:
30
+ from torchvision.ops import _new_empty_tensor
31
+ from torchvision.ops.misc import _output_size
32
+
33
+
34
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.

    Args:
        window_size (int): number of most recent values kept for the windowed
            statistics (``median``, ``avg``, ``max``, ``value``).
        fmt (str | None): ``str.format`` template used by ``__str__``; it may
            reference ``median``, ``avg``, ``global_avg``, ``max`` and ``value``.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        """Record ``value``; it counts ``n`` times towards the global average."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` over all ranks.

        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        """Median of the current window, or 0 when the window is empty."""
        d = torch.tensor(list(self.deque))
        if d.shape[0] == 0:
            return 0
        return d.median().item()

    @property
    def avg(self):
        """Mean of the current window (NaN when the window is empty)."""
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        """Average over every recorded value; eps guards against count == 0."""
        if os.environ.get("SHILONG_AMP", None) == '1':
            eps = 1e-4
        else:
            eps = 1e-6
        return self.total / (self.count + eps)

    @property
    def max(self):
        """Largest value in the current window, or 0 when the window is empty."""
        # An empty window used to raise ValueError here, which made str() of a
        # fresh meter crash; mirror ``median`` and fall back to 0 instead.
        if not self.deque:
            return 0
        return max(self.deque)

    @property
    def value(self):
        """Most recent value, or 0 when nothing has been recorded yet."""
        if not self.deque:
            return 0
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)
100
+
101
@functools.lru_cache()
def _get_global_gloo_group():
    """
    Build (once) and return a gloo-backed process group spanning all ranks.

    When the default backend is already something other than nccl, the
    default WORLD group is returned as-is. The result is cached.
    """
    uses_nccl = dist.get_backend() == "nccl"
    return dist.new_group(backend="gloo") if uses_nccl else dist.group.WORLD
112
+
113
def all_gather_cpu(data):
    """
    Gather arbitrary picklable objects from every rank through the CPU group.

    Args:
        data: any picklable object
    Returns:
        list[data]: one entry per rank, in rank order
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    cpu_group = _get_global_gloo_group()
    device = "cuda" if cpu_group is None else "cpu"

    def _gather(dst, src):
        # Route through the gloo group whenever one is available.
        if cpu_group is None:
            dist.all_gather(dst, src)
        else:
            dist.all_gather(dst, src, group=cpu_group)

    # Serialise the payload into a flat byte tensor.
    stream = io.BytesIO()
    torch.save(data, stream)
    tensor = torch.ByteTensor(stream.getbuffer()).to(device)

    # Exchange payload sizes first so every rank can allocate receive buffers.
    local_size = torch.tensor([tensor.numel()], device=device, dtype=torch.long)
    size_list = [torch.tensor([0], device=device, dtype=torch.long) for _ in range(world_size)]
    if cpu_group is not None:
        print("gathering on cpu")
    _gather(size_list, local_size)
    size_list = [int(entry.item()) for entry in size_list]
    max_size = max(size_list)
    assert isinstance(local_size.item(), int)
    local_size = int(local_size.item())

    # all_gather needs equal shapes on every rank, so pad up to the largest
    # payload before exchanging.
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device=device)
        for _ in size_list
    ]
    if local_size != max_size:
        filler = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device=device)
        tensor = torch.cat((tensor, filler), dim=0)
    _gather(tensor_list, tensor)

    # Strip the padding and deserialise each rank's payload.
    data_list = []
    for size, received in zip(size_list, tensor_list):
        payload = torch.split(received, [size, max_size - size], dim=0)[0]
        data_list.append(torch.load(io.BytesIO(payload.cpu().numpy())))

    return data_list
169
+
170
+
171
def all_gather(data):
    """
    Gather arbitrary picklable objects from every rank.

    Delegates to ``all_gather_cpu`` when the ``CPU_REDUCE`` environment
    variable equals ``"1"``; otherwise the exchange runs on CUDA tensors.

    Args:
        data: any picklable object
    Returns:
        list[data]: one entry per rank, in rank order
    """
    if os.getenv("CPU_REDUCE") == "1":
        return all_gather_cpu(data)

    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # Pickle the payload and move the raw bytes onto the GPU.
    payload = pickle.dumps(data)
    tensor = torch.ByteTensor(torch.ByteStorage.from_buffer(payload)).to("cuda")

    # Exchange payload sizes so every rank knows how much to allocate.
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(entry.item()) for entry in size_list]
    max_size = max(size_list)

    # all_gather needs equal shapes on every rank, so pad up to the largest
    # payload before exchanging.
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device="cuda")
        for _ in size_list
    ]
    if local_size != max_size:
        filler = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, filler), dim=0)
    dist.all_gather(tensor_list, tensor)

    # NOTE: pickle.loads is applied to bytes received from peer ranks; this is
    # only safe because all ranks belong to the same trusted job.
    return [
        pickle.loads(received.cpu().numpy().tobytes()[:size])
        for size, received in zip(size_list, tensor_list)
    ]
218
+
219
+
220
def reduce_dict(input_dict, average=True):
    """
    Reduce the tensor values of a dict across all processes.

    Args:
        input_dict (dict): all the values will be reduced
        average (bool): divide the summed values by the world size when True,
            otherwise keep the plain sum
    Returns:
        dict: same keys as ``input_dict`` with reduced values; the input dict
        itself is returned untouched when fewer than two processes run.
    """
    n_procs = get_world_size()
    if n_procs < 2:
        return input_dict
    with torch.no_grad():
        # Sorted keys guarantee the same stacking order on every process.
        keys = sorted(input_dict.keys())
        stacked = torch.stack([input_dict[key] for key in keys], dim=0)
        dist.all_reduce(stacked)
        if average:
            stacked /= n_procs
        reduced_dict = dict(zip(keys, stacked))
    return reduced_dict
245
+
246
+
247
class MetricLogger(object):
    """Collect named ``SmoothedValue`` meters and print periodic progress lines.

    Args:
        delimiter (str): separator placed between the fields of a log line.
    """

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Feed one scalar per keyword into the meter of the same name."""
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # __getattr__ only runs after normal lookup failed. Read ``meters``
        # straight from the instance dict: going through ``self.meters`` would
        # re-enter __getattr__ and recurse forever whenever the attribute is
        # not set yet (e.g. while an instance is being copied or unpickled).
        meters = self.__dict__.get("meters")
        if meters is not None and attr in meters:
            return meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        # Meters that never received a value are left out of the summary.
        loss_str = [
            "{}: {}".format(name, str(meter))
            for name, meter in self.meters.items()
            if meter.count > 0
        ]
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        """Synchronize every meter's global statistics across processes."""
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        """Register (or replace) the meter stored under ``name``."""
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None, logger=None):
        """Yield the items of ``iterable`` while logging progress.

        Args:
            iterable: sized iterable (``len()`` is required).
            print_freq (int): emit a line every ``print_freq`` iterations and
                on the last one.
            header (str | None): prefix of every log line.
            logger: object with an ``info`` method; ``print`` is used when None.
        """
        print_func = print if logger is None else logger.info

        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        total_iters = len(iterable)
        space_fmt = ':' + str(len(str(total_iters))) + 'd'
        use_cuda = torch.cuda.is_available()

        fields = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if use_cuda:
            fields.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(fields)

        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == total_iters - 1:
                eta_seconds = iter_time.global_avg * (total_iters - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                values = dict(
                    eta=eta_string,
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if use_cuda:
                    values['memory'] = torch.cuda.max_memory_allocated() / MB
                print_func(log_msg.format(i, total_iters, **values))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(..., 1) keeps the summary from dividing by zero on an empty iterable.
        print_func('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(total_iters, 1)))
344
+
345
+
346
def get_sha():
    """Describe the git state of the checkout containing this file.

    Every git query is best-effort: when any command fails, the fields not
    yet resolved keep their defaults ('N/A' / "clean").

    Returns:
        str: ``"sha: ..., status: ..., branch: ..."``.
    """
    here = os.path.dirname(os.path.abspath(__file__))

    def git(*argv):
        raw = subprocess.check_output(['git', *argv], cwd=here)
        return raw.decode('ascii').strip()

    sha, diff, branch = 'N/A', "clean", 'N/A'
    try:
        sha = git('rev-parse', 'HEAD')
        subprocess.check_output(['git', 'diff'], cwd=here)
        pending = git('diff-index', 'HEAD')
        diff = "has uncommited changes" if pending else "clean"
        branch = git('rev-parse', '--abbrev-ref', 'HEAD')
    except Exception:
        # Deliberately silent: no git binary / not a repository is fine.
        pass
    return f"sha: {sha}, status: {diff}, branch: {branch}"
364
+
365
+
366
def collate_fn(batch):
    """Transpose a list of samples and pad the first field into a NestedTensor.

    Args:
        batch: sequence of per-sample tuples whose first element is an image
            tensor.
    Returns:
        tuple: one entry per sample field; entry 0 is the padded NestedTensor.
    """
    columns = [*zip(*batch)]
    columns[0] = nested_tensor_from_tensor_list(columns[0])
    return tuple(columns)
371
+
372
+
373
+ def _max_by_axis(the_list):
374
+ # type: (List[List[int]]) -> List[int]
375
+ maxes = the_list[0]
376
+ for sublist in the_list[1:]:
377
+ for index, item in enumerate(sublist):
378
+ maxes[index] = max(maxes[index], item)
379
+ return maxes
380
+
381
+
382
class NestedTensor(object):
    """Bundle a padded tensor with the boolean mask marking its padding.

    In the mask, ``True`` denotes padded positions and ``False`` valid ones
    (this is how ``imgsize`` / ``to_img_list`` interpret it).

    Args:
        tensors: 3-D (C, H, W) or 4-D (B, C, H, W) tensor.
        mask: matching mask, ``None``, or the string ``'auto'`` to create an
            all-False mask with the channel dimension removed.
    """

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask
        if mask == 'auto':
            blank = torch.zeros_like(tensors).to(tensors.device)
            self.mask = blank
            rank = blank.dim()
            if rank == 3:
                self.mask = blank.sum(0).to(bool)
            elif rank == 4:
                self.mask = blank.sum(1).to(bool)
            else:
                raise ValueError("tensors dim must be 3 or 4 but {}({})".format(self.tensors.dim(), self.tensors.shape))

    @staticmethod
    def _valid_extent(mask):
        # Height/width of the un-padded region, derived from the mask alone.
        valid = ~mask
        return valid.sum(0).max(), valid.sum(1).max()

    def imgsize(self):
        """Return one ``[H, W]`` tensor of valid extents per batch entry."""
        sizes = []
        for idx in range(self.tensors.shape[0]):
            maxH, maxW = self._valid_extent(self.mask[idx])
            sizes.append(torch.Tensor([maxH, maxW]))
        return sizes

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        """Return a new NestedTensor with tensors (and mask, if any) on ``device``."""
        moved_mask = None if self.mask is None else self.mask.to(device)
        return NestedTensor(self.tensors.to(device), moved_mask)

    def to_img_list_single(self, tensor, mask):
        """Crop one (C, H, W) tensor down to its un-padded region."""
        assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format(tensor.dim())
        maxH, maxW = self._valid_extent(mask)
        return tensor[:, :maxH, :maxW]

    def to_img_list(self):
        """remove the padding and convert to img list

        Returns:
            A single cropped tensor for 3-D input, otherwise a list with one
            cropped tensor per batch entry.
        """
        if self.tensors.dim() == 3:
            return self.to_img_list_single(self.tensors, self.mask)
        return [
            self.to_img_list_single(self.tensors[idx], self.mask[idx])
            for idx in range(self.tensors.shape[0])
        ]

    @property
    def device(self):
        return self.tensors.device

    def decompose(self):
        """Return the ``(tensors, mask)`` pair."""
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)

    @property
    def shape(self):
        return {
            'tensors.shape': self.tensors.shape,
            'mask.shape': self.mask.shape
        }
454
+
455
+
456
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    """Zero-pad a list of (C, H, W) images to a common size and batch them.

    Args:
        tensor_list: images; only 3-D tensors are supported.
    Returns:
        NestedTensor: batched tensor plus a mask that is False on real pixels
        and True on padding.
    Raises:
        ValueError: when the first tensor is not 3-dimensional.
    """
    # TODO make this more general
    first = tensor_list[0]
    if first.ndim != 3:
        raise ValueError('not supported')

    if torchvision._is_tracing():
        # nested_tensor_from_tensor_list() does not export well to ONNX
        # call _onnx_nested_tensor_from_tensor_list() instead
        return _onnx_nested_tensor_from_tensor_list(tensor_list)

    # TODO make it support different-sized images
    max_size = _max_by_axis([list(img.shape) for img in tensor_list])
    batch_shape = [len(tensor_list)] + max_size
    b, _, h, w = batch_shape
    tensor = torch.zeros(batch_shape, dtype=first.dtype, device=first.device)
    mask = torch.ones((b, h, w), dtype=torch.bool, device=first.device)
    for idx, img in enumerate(tensor_list):
        ch, ht, wd = img.shape[0], img.shape[1], img.shape[2]
        # Copy into the top-left corner; everything else stays zero / masked.
        tensor[idx, :ch, :ht, :wd].copy_(img)
        mask[idx, :ht, :wd] = False
    return NestedTensor(tensor, mask)
479
+
480
+
481
+ # _onnx_nested_tensor_from_tensor_list() is an implementation of
482
+ # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
483
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Tracing-friendly variant of ``nested_tensor_from_tensor_list``.

    Sliced in-place copies are not supported by ONNX export, so padding is
    expressed with ``torch.nn.functional.pad`` instead.
    """
    # Per-axis maximum, computed with tensor ops so it is captured by tracing.
    max_size = tuple(
        torch.max(torch.stack([img.shape[axis] for img in tensor_list]).to(torch.float32)).to(torch.int64)
        for axis in range(tensor_list[0].dim())
    )

    padded_imgs, padded_masks = [], []
    for img in tensor_list:
        deficit = [(target - current) for target, current in zip(max_size, tuple(img.shape))]
        # F.pad lists the last dimension first: (W, H, C).
        padded_imgs.append(
            torch.nn.functional.pad(img, (0, deficit[2], 0, deficit[1], 0, deficit[0]))
        )

        # Valid pixels are 0, padded area is filled with 1, then cast to bool.
        valid = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(valid, (0, deficit[2], 0, deficit[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    return NestedTensor(torch.stack(padded_imgs), mask=torch.stack(padded_masks))
510
+
511
+
512
def setup_for_distributed(is_master):
    """
    Replace the builtin ``print`` so that only the master process prints.

    Any process can still print by passing ``force=True`` to ``print``.
    """
    import builtins as __builtin__
    original_print = __builtin__.print

    def print(*args, **kwargs):
        forced = kwargs.pop('force', False)
        if not (is_master or forced):
            return
        original_print(*args, **kwargs)

    __builtin__.print = print
525
+
526
+
527
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()
533
+
534
+
535
def get_world_size():
    """Return the number of processes, or 1 outside distributed mode."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1
539
+
540
+
541
def get_rank():
    """Return this process's rank, or 0 outside distributed mode."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0
545
+
546
+
547
def is_main_process():
    """Return True when the current process has rank 0."""
    rank = get_rank()
    return rank == 0
549
+
550
+
551
def save_on_master(*args, **kwargs):
    """Forward the arguments to ``torch.save`` on the main process only."""
    if not is_main_process():
        return
    torch.save(*args, **kwargs)
554
+
555
def init_distributed_mode(args):
    """Fill in the distributed fields of ``args`` and start the process group.

    Detection order:
      1. ``WORLD_SIZE`` set and non-empty (torch.distributed.launch style):
         rank / world size / local rank come from ``RANK``, ``WORLD_SIZE``
         and ``LOCAL_RANK``.
      2. ``SLURM_PROCID`` set: values come from the SLURM variables and,
         unless ``HAND_DEFINE_DIST_URL`` equals ``'1'``, ``args.dist_url`` is
         derived from the job's node list.
      3. Otherwise ``args`` is marked as non-distributed and the function
         returns without touching torch.distributed.

    Launch examples for case 1:
      python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 1 --rank 0 ...
      python -m torch.distributed.launch --nproc_per_node=8 main.py --world-size 2 --rank 0 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' ...
    """
    env = os.environ
    via_launcher = 'WORLD_SIZE' in env and env['WORLD_SIZE'] != ''
    via_slurm = 'SLURM_PROCID' in env

    if not via_launcher and not via_slurm:
        print('Not using distributed mode')
        args.distributed = False
        args.world_size = 1
        args.rank = 0
        args.local_rank = 0
        return

    if via_launcher:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = args.local_rank = int(env['LOCAL_RANK'])

        print('world size: {}, rank: {}, local rank: {}'.format(args.world_size, args.rank, args.local_rank))
        print(json.dumps(dict(env), indent=2))
    else:
        args.rank = int(env['SLURM_PROCID'])
        args.gpu = args.local_rank = int(env['SLURM_LOCALID'])
        args.world_size = int(env['SLURM_NPROCS'])

        if env.get('HAND_DEFINE_DIST_URL', 0) != '1':
            import util.hostlist as uh
            nodenames = uh.parse_nodelist(env['SLURM_JOB_NODELIST'])
            # Node names are expected to end in a numeric id after a
            # three-character prefix; the smallest id offsets the port.
            gpu_ids = [int(node[3:]) for node in nodenames]
            fixid = int(env.get('FIX_DISTRIBUTED_PORT_NUMBER', 0))
            port = str(3137 + int(min(gpu_ids)) + fixid)
            args.dist_url = "tcp://{ip}:{port}".format(ip=uh.nodename_to_ip(nodenames[0]), port=port)

        print('world size: {}, world rank: {}, local rank: {}, device_count: {}'.format(args.world_size, args.rank, args.local_rank, torch.cuda.device_count()))

    print("world_size:{} rank:{} local_rank:{}".format(args.world_size, args.rank, args.local_rank))
    args.distributed = True
    torch.cuda.set_device(args.local_rank)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)

    torch.distributed.init_process_group(
        backend=args.dist_backend,
        world_size=args.world_size,
        rank=args.rank,
        init_method=args.dist_url,
    )

    print("Before torch.distributed.barrier()")
    torch.distributed.barrier()
    print("End torch.distributed.barrier()")
    # From here on only rank 0 prints (unless force=True is passed).
    setup_for_distributed(args.rank == 0)
618
+
619
+
620
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: (N, C) tensor of class scores.
        target: (N,) tensor of ground-truth class indices.
        topk: iterable of k values to evaluate.

    Returns:
        list of 0-dim tensors, one per entry of ``topk``, each holding the
        percentage of samples whose target is among the k best predictions.
        For an empty ``target`` every entry is zero.
    """
    if target.numel() == 0:
        # One zero per requested k, so callers can unpack the result the same
        # way as in the non-empty case.
        return [torch.zeros([], device=output.device) for _ in topk]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # ``correct`` inherits the transposed (non-contiguous) layout of
        # ``pred``, so ``view(-1)`` raises for k > 1; ``reshape`` copies when
        # a view is not possible.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
637
+
638
@torch.no_grad()
def accuracy_onehot(pred, gt):
    """Percentage of rows where ``pred`` matches ``gt`` across all columns.

    A row counts as correct when the summed absolute difference over the
    last dimension is below 1e-4.

    Args:
        pred: n, c
        gt: n, c

    Returns:
        0-dim tensor with the accuracy in percent.
    """
    row_l1 = torch.abs(pred - gt).sum(-1)
    n_exact = (row_l1 < 1e-4).float().sum()
    return n_exact / gt.shape[0] * 100
649
+
650
+
651
+
652
+
653
+
654
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes
    on old torchvision (< 0.7) installations.

    On current PyTorch the call is forwarded straight to
    ``torch.nn.functional.interpolate``.
    """
    # ``__torchvision_need_compat_flag`` is a bool. The previous test
    # ``flag < 0.7`` inverted it, sending modern installs through the legacy
    # branch, whose helpers are only imported when the flag is True.
    if __torchvision_need_compat_flag:
        if input.numel() > 0:
            return torch.nn.functional.interpolate(
                input, size, scale_factor, mode, align_corners
            )

        # Empty input on a legacy install: build an empty tensor of the
        # output shape using the torchvision compat helpers.
        output_shape = _output_size(2, input, size, scale_factor)
        output_shape = list(input.shape[:-2]) + list(output_shape)
        return _new_empty_tensor(input, output_shape)

    return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners)
672
+
673
+
674
+
675
class color_sys():
    """Palette of ``num_colors`` RGB tuples with evenly spaced hues.

    Lightness and saturation are jittered with ``np.random.rand`` inside
    [0.5, 0.6) and [0.9, 1.0) respectively; calling the instance with an
    index returns the corresponding ``(r, g, b)`` tuple of ints.
    """

    def __init__(self, num_colors) -> None:
        self.num_colors = num_colors
        palette = []
        step = 360. / num_colors
        for degrees in np.arange(0., 360., step):
            hue = degrees / 360.
            lightness = (50 + np.random.rand() * 10) / 100.
            saturation = (90 + np.random.rand() * 10) / 100.
            rgb = colorsys.hls_to_rgb(hue, lightness, saturation)
            palette.append(tuple([int(channel * 255) for channel in rgb]))
        self.colors = palette

    def __call__(self, idx):
        return self.colors[idx]
688
+
689
def inverse_sigmoid(x, eps=1e-3):
    """Numerically safe logit: ``log(x / (1 - x))`` with clamping.

    ``x`` is first clamped to [0, 1]; numerator and denominator are then
    clamped from below by ``eps`` so the logarithm stays finite.
    """
    bounded = torch.clamp(x, min=0, max=1)
    numerator = torch.clamp(bounded, min=eps)
    denominator = torch.clamp(1 - bounded, min=eps)
    return torch.log(numerator / denominator)
694
+
695
def clean_state_dict(state_dict):
    """Return an OrderedDict copy of ``state_dict`` without 'module.' prefixes.

    Only a leading ``'module.'`` is stripped from each key; the order of the
    entries is preserved.
    """
    prefix = 'module.'
    cleaned = OrderedDict()
    for name, value in state_dict.items():
        stripped = name[len(prefix):] if name.startswith(prefix) else name
        cleaned[stripped] = value
    return cleaned
difpoint/src/models/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Author : wenshao
3
+ # @Email : [email protected]
4
+ # @Project : FasterLivePortrait
5
+ # @FileName: __init__.py.py
6
+
7
+ from .warping_spade_model import WarpingSpadeModel
8
+ from .motion_extractor_model import MotionExtractorModel
9
+ from .appearance_feature_extractor_model import AppearanceFeatureExtractorModel
10
+ from .landmark_model import LandmarkModel
11
+ from .face_analysis_model import FaceAnalysisModel
12
+ from .stitching_model import StitchingModel
13
+ from .mediapipe_face_model import MediaPipeFaceModel
difpoint/src/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (599 Bytes). View file
 
difpoint/src/models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (612 Bytes). View file
 
difpoint/src/models/__pycache__/appearance_feature_extractor_model.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
difpoint/src/models/__pycache__/appearance_feature_extractor_model.cpython-38.pyc ADDED
Binary file (2.17 kB). View file
 
difpoint/src/models/__pycache__/base_model.cpython-310.pyc ADDED
Binary file (1.61 kB). View file