diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..3bcbe277a8218bc25b31b3d27b42cc659c27b23e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.whl filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d1d9a561b76aaa8c22a6f446fb505150d5a305ab --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +**pycache** diff --git a/NOTES.md b/NOTES.md new file mode 100644 index 0000000000000000000000000000000000000000..47351a4dd132b031ef7e8f4693d1bf130de74390 --- /dev/null +++ b/NOTES.md @@ -0,0 +1,11 @@ +## Create wheel for mmcv +``` +cd ./external/engine +python setup.py bdist_wheel + +cd ./external/cv +MMCV_WITH_OPS=1 python setup.py bdist_wheel + +cd ./external/det +python setup.py bdist_wheel +``` \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0211f40f12d6e730786bd34036869b44bd61454c --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +--- +title: Sapiens Pose +emoji: 📊 +colorFrom: pink +colorTo: yellow +sdk: gradio +sdk_version: 4.42.0 +app_file: app.py +pinned: false +license: cc-by-nc-4.0 +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..30cbf3cd8306056525ce204c459df24c8bceb9ba --- /dev/null +++ b/app.py @@ -0,0 +1,453 @@ +import os +from typing import List +import spaces +import gradio as gr +import numpy as np +import torch +import json +import tempfile +import torch.nn.functional as F +from torchvision import transforms +from PIL import Image +import cv2 +from gradio.themes.utils import sizes +from classes_and_palettes import ( + COCO_KPTS_COLORS, + COCO_WHOLEBODY_KPTS_COLORS, + 
GOLIATH_KPTS_COLORS, + GOLIATH_SKELETON_INFO, + GOLIATH_KEYPOINTS +) + +import os +import sys +import subprocess +import importlib.util + +def is_package_installed(package_name): + return importlib.util.find_spec(package_name) is not None + +def find_wheel(package_path): + dist_dir = os.path.join(package_path, "dist") + if os.path.exists(dist_dir): + wheel_files = [f for f in os.listdir(dist_dir) if f.endswith('.whl')] + if wheel_files: + return os.path.join(dist_dir, wheel_files[0]) + return None + +def install_from_wheel(package_name, package_path): + wheel_file = find_wheel(package_path) + if wheel_file: + print(f"Installing {package_name} from wheel: {wheel_file}") + subprocess.check_call([sys.executable, "-m", "pip", "install", wheel_file]) + else: + print(f"{package_name} wheel not found in {package_path}. Please build it first.") + sys.exit(1) + +def install_local_packages(): + packages = [ + ("mmengine", "./external/engine"), + ("mmcv", "./external/cv"), + ("mmdet", "./external/det") + ] + + for package_name, package_path in packages: + if not is_package_installed(package_name): + print(f"Installing {package_name}...") + install_from_wheel(package_name, package_path) + else: + print(f"{package_name} is already installed.") + +# Run the installation at the start of your app +install_local_packages() + +from detector_utils import ( + adapt_mmdet_pipeline, + init_detector, + process_images_detector, + ) + +class Config: + ASSETS_DIR = os.path.join(os.path.dirname(__file__), 'assets') + CHECKPOINTS_DIR = os.path.join(ASSETS_DIR, "checkpoints") + CHECKPOINTS = { + "0.3b": "sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2", + "1b": "sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2", + } + DETECTION_CHECKPOINT = os.path.join(CHECKPOINTS_DIR, 'rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth') + DETECTION_CONFIG = os.path.join(ASSETS_DIR, 'rtmdet_m_640-8xb32_coco-person_no_nms.py') + +class ModelManager: + @staticmethod + def load_model(checkpoint_name: str): + if checkpoint_name is None: + return None + checkpoint_path = os.path.join(Config.CHECKPOINTS_DIR, checkpoint_name) + model = torch.jit.load(checkpoint_path) + model.eval() + model.to("cuda") + return model + + @staticmethod + @torch.inference_mode() + def run_model(model, input_tensor): + return model(input_tensor) + +class ImageProcessor: + def __init__(self): + self.transform = transforms.Compose([ + transforms.Resize((1024, 768)), + transforms.ToTensor(), + transforms.Normalize(mean=[123.5/255, 116.5/255, 103.5/255], + std=[58.5/255, 57.0/255, 57.5/255]) + ]) + self.detector = init_detector( + Config.DETECTION_CONFIG, Config.DETECTION_CHECKPOINT, device='cpu' + ) + self.detector.cfg = adapt_mmdet_pipeline(self.detector.cfg) + + def detect_persons(self, image: Image.Image): + # Convert PIL Image to tensor + image = np.array(image) + image = np.expand_dims(image, axis=0) + + # Perform person detection + bboxes_batch = process_images_detector( + image, + self.detector + ) + bboxes = self.get_person_bboxes(bboxes_batch[0]) # Get bboxes for the first (and only) image + + return bboxes + + def get_person_bboxes(self, bboxes_batch, score_thr=0.3): + person_bboxes = [] + for bbox in bboxes_batch: + if len(bbox) == 5: # [x1, y1, x2, y2, score] + if bbox[4] > score_thr: + person_bboxes.append(bbox) + elif len(bbox) == 4: # [x1, y1, x2, y2] + person_bboxes.append(bbox + [1.0]) # Add a default score of 1.0 + return person_bboxes + + @spaces.GPU + @torch.inference_mode() + def estimate_pose(self, image: Image.Image, bboxes: 
List[List[float]], model_name: str, kpt_threshold: float): + pose_model = ModelManager.load_model(Config.CHECKPOINTS[model_name]) + + result_image = image.copy() + all_keypoints = [] # List to store keypoints for all persons + + for bbox in bboxes: + cropped_img = self.crop_image(result_image, bbox) + input_tensor = self.transform(cropped_img).unsqueeze(0).to("cuda") + heatmaps = ModelManager.run_model(pose_model, input_tensor) + keypoints = self.heatmaps_to_keypoints(heatmaps[0].cpu().numpy()) + all_keypoints.append(keypoints) # Collect keypoints + result_image = self.draw_keypoints(result_image, keypoints, bbox, kpt_threshold) + + return result_image, all_keypoints + + def process_image(self, image: Image.Image, model_name: str, kpt_threshold: str): + bboxes = self.detect_persons(image) + result_image, keypoints = self.estimate_pose(image, bboxes, model_name, float(kpt_threshold)) + return result_image, keypoints + + def crop_image(self, image, bbox): + if len(bbox) == 4: + x1, y1, x2, y2 = map(int, bbox) + elif len(bbox) >= 5: + x1, y1, x2, y2, _ = map(int, bbox[:5]) + else: + raise ValueError(f"Unexpected bbox format: {bbox}") + + crop = image.crop((x1, y1, x2, y2)) + return crop + + @staticmethod + def heatmaps_to_keypoints(heatmaps): + num_joints = heatmaps.shape[0] # Should be 308 + keypoints = {} + for i, name in enumerate(GOLIATH_KEYPOINTS): + if i < num_joints: + heatmap = heatmaps[i] + y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape) + conf = heatmap[y, x] + keypoints[name] = (float(x), float(y), float(conf)) + return keypoints + + @staticmethod + def draw_keypoints(image, keypoints, bbox, kpt_threshold): + image = np.array(image) + + # Handle both 4 and 5-element bounding boxes + if len(bbox) == 4: + x1, y1, x2, y2 = map(int, bbox) + elif len(bbox) >= 5: + x1, y1, x2, y2, _ = map(int, bbox[:5]) + else: + raise ValueError(f"Unexpected bbox format: {bbox}") + + # Calculate adaptive radius and thickness based on bounding box size + bbox_width = x2 - x1 + bbox_height = y2 - y1 + bbox_size = np.sqrt(bbox_width * bbox_height) + + radius = max(1, int(bbox_size * 0.006)) # minimum 1 pixel + thickness = max(1, int(bbox_size * 0.006)) # minimum 1 pixel + bbox_thickness = max(1, thickness//4) + + cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), bbox_thickness) + + # Draw keypoints + for i, (name, (x, y, conf)) in enumerate(keypoints.items()): + if conf > kpt_threshold and i < len(GOLIATH_KPTS_COLORS): + x_coord = int(x * bbox_width / 192) + x1 + y_coord = int(y * bbox_height / 256) + y1 + color = GOLIATH_KPTS_COLORS[i] + cv2.circle(image, (x_coord, y_coord), radius, color, -1) + + # Draw skeleton + for _, link_info in GOLIATH_SKELETON_INFO.items(): + pt1_name, pt2_name = link_info['link'] + color = link_info['color'] + + if pt1_name in keypoints and pt2_name in keypoints: + pt1 = keypoints[pt1_name] + pt2 = keypoints[pt2_name] + if pt1[2] > kpt_threshold and pt2[2] > kpt_threshold: + x1_coord = int(pt1[0] * bbox_width / 192) + x1 + y1_coord = int(pt1[1] * bbox_height / 256) + y1 + x2_coord = int(pt2[0] * bbox_width / 192) + x1 + y2_coord = int(pt2[1] * bbox_height / 256) + y1 + cv2.line(image, (x1_coord, y1_coord), (x2_coord, y2_coord), color, thickness=thickness) + + return Image.fromarray(image) + +class GradioInterface: + def __init__(self): + self.image_processor = ImageProcessor() + + def create_interface(self): + app_styles = """ + + """ + + header_html = f""" + + + {app_styles} +
+ Sapiens: Pose Estimation
+ ECCV 2024 (Oral)
+ Meta presents Sapiens, foundation models for human tasks pretrained on 300 million human images.
+ This demo showcases the finetuned pose estimation model.
+ """ + + js_func = """ + function refresh() { + const url = new URL(window.location); + if (url.searchParams.get('__theme') !== 'dark') { + url.searchParams.set('__theme', 'dark'); + window.location.href = url.href; + } + } + """ + + def process_image(image, model_name, kpt_threshold): + result_image, keypoints = self.image_processor.process_image(image, model_name, kpt_threshold) + with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w') as json_file: + json.dump(keypoints, json_file) + json_file_path = json_file.name + return result_image, json_file_path + + with gr.Blocks(js=js_func, theme=gr.themes.Default()) as demo: + gr.HTML(header_html) + with gr.Row(elem_classes="content-container"): + with gr.Column(): + input_image = gr.Image(label="Input Image", type="pil", format="png", elem_classes="image-preview") + with gr.Row(): + model_name = gr.Dropdown( + label="Model Size", + choices=list(Config.CHECKPOINTS.keys()), + value="1b", + ) + kpt_threshold = gr.Dropdown( + label="Min Keypoint Confidence", + choices=["0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9"], + value="0.3", + ) + example_model = gr.Examples( + inputs=input_image, + examples_per_page=14, + examples=[ + os.path.join(Config.ASSETS_DIR, "images", img) + for img in os.listdir(os.path.join(Config.ASSETS_DIR, "images")) + ], + ) + with gr.Column(): + result_image = gr.Image(label="Pose-308 Result", type="pil", elem_classes="image-preview") + json_output = gr.File(label="Pose-308 Output (.json)") + run_button = gr.Button("Run") + + run_button.click( + fn=process_image, + inputs=[input_image, model_name, kpt_threshold], + outputs=[result_image, json_output], + ) + + return demo + +def main(): + if torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + interface = GradioInterface() + demo = interface.create_interface() + demo.launch(share=False) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/assets/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth b/assets/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth new file mode 100644 index 0000000000000000000000000000000000000000..35c573c1f7ea4c44ee9af0917710a21c3c38434a --- /dev/null +++ b/assets/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b66b27072c6a3cd4f093882df440921987076131fb78a7df7b1cf92d67f41509 +size 99149914 diff --git a/assets/checkpoints/sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2 b/assets/checkpoints/sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2 new file mode 100644 index 0000000000000000000000000000000000000000..7e320f245c82df4f06f26b4e8908f5e9090af4b5 --- /dev/null +++ b/assets/checkpoints/sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21cf7e3e723720d847bee6d3b321bfcdb33268c9f1418d7552552264ae0a5a9b +size 1319579523 diff --git a/assets/checkpoints/sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2 b/assets/checkpoints/sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2 new file mode 100644 index 0000000000000000000000000000000000000000..3ea09bc91c4b6c449451fce529e91260af5ef198 --- /dev/null +++ b/assets/checkpoints/sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6218c6be17697157f9e65ee34054a94ab8ca0f637380fa5748c18e04814976e +size 
4677162331 diff --git a/assets/images/68204.png b/assets/images/68204.png new file mode 100644 index 0000000000000000000000000000000000000000..6584b288fafd94166c2877b7e43a3f387016a434 --- /dev/null +++ b/assets/images/68204.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0268cb801ed164864a4b5f6d131e0ac5cc2fbd149a6467d5d0c97da47122c2 +size 4285020 diff --git a/assets/images/68210.png b/assets/images/68210.png new file mode 100644 index 0000000000000000000000000000000000000000..a0c34954cd7483f373026408b083c8195d165489 --- /dev/null +++ b/assets/images/68210.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbe5f80498af4ebd1ff09ae4184f37c20ba981e53bd554c3cc78d39ae0ee7fd7 +size 3933143 diff --git a/assets/images/68658.png b/assets/images/68658.png new file mode 100644 index 0000000000000000000000000000000000000000..24dd9477f8cdb5d92d96db34a8932a0d24da334e --- /dev/null +++ b/assets/images/68658.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a68b619bd17235e683324f2826ce0693322e45ab8c86f1c057851ecb333ac7 +size 5096267 diff --git a/assets/images/68666.png b/assets/images/68666.png new file mode 100644 index 0000000000000000000000000000000000000000..95e7ae11dc90d22afc15fa3b41cbfc60ac4cda91 --- /dev/null +++ b/assets/images/68666.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea3047e6c2ccb485fdb3966aa2325e803cbf49c27c0bff00287b44bc16f18914 +size 4562681 diff --git a/assets/images/68691.png b/assets/images/68691.png new file mode 100644 index 0000000000000000000000000000000000000000..9c688716c962b891073e1feea115a7838f72fcba --- /dev/null +++ b/assets/images/68691.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae39e4055c1b297af7068cdddfeeba8d685363281b839d8c5afac1980204b57 +size 3736765 diff --git a/assets/images/68956.png b/assets/images/68956.png new file mode 100644 index 0000000000000000000000000000000000000000..d8a83b85cdb8d999f65677278a28deaa08352a57 --- /dev/null +++ b/assets/images/68956.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee1f27082b10999d0fa848121ecb06cda3386b1a864b9aa0f59ae78261f8908 +size 4147008 diff --git a/assets/images/pexels-amresh444-17315601.png b/assets/images/pexels-amresh444-17315601.png new file mode 100644 index 0000000000000000000000000000000000000000..8453dc7bb9885c43733d798c46c877779cc8ba15 --- /dev/null +++ b/assets/images/pexels-amresh444-17315601.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e17ee1b229147e4b52e8348a6ef426bc9e9a2f90738e776e15b26b325abb9b3 +size 3503065 diff --git a/assets/images/pexels-gabby-k-6311686.png b/assets/images/pexels-gabby-k-6311686.png new file mode 100644 index 0000000000000000000000000000000000000000..9add365bb9485f5085155dbbbab2232a0b533449 --- /dev/null +++ b/assets/images/pexels-gabby-k-6311686.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f10eded3fb05ab04b963f7b9fd2e183d8d4e81b20569b1c6b0653549639421f +size 3651731 diff --git a/assets/images/pexels-julia-m-cameron-4145040.png b/assets/images/pexels-julia-m-cameron-4145040.png new file mode 100644 index 0000000000000000000000000000000000000000..ff67ab842aabe6ebb6c0d5c8630c2c081c7a40ba --- /dev/null +++ b/assets/images/pexels-julia-m-cameron-4145040.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459cf0280667b028ffbca16aa11188780d7a0205c0defec02916ff3cbaeecb72 +size 2924608 diff --git 
a/assets/images/pexels-marcus-aurelius-6787357.png b/assets/images/pexels-marcus-aurelius-6787357.png new file mode 100644 index 0000000000000000000000000000000000000000..c48247aeacbdb3e0e81b2a0dd1376bf26b687817 --- /dev/null +++ b/assets/images/pexels-marcus-aurelius-6787357.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d35452f76492125eaf7d5783aa9fd6b0d5990ebe0579fe9dfd58a9d634f4955 +size 3297473 diff --git a/assets/images/pexels-mo-saeed-3616599-5409085.png b/assets/images/pexels-mo-saeed-3616599-5409085.png new file mode 100644 index 0000000000000000000000000000000000000000..ac7017af6d97524a95a6e43940f95ce10193cc9f --- /dev/null +++ b/assets/images/pexels-mo-saeed-3616599-5409085.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1ca7afd6c2a654e94ef59d5fb56fca4f3cde5fb5216f6b218c34a7b8c143dc +size 3125143 diff --git a/assets/images/pexels-riedelmax-27355495.png b/assets/images/pexels-riedelmax-27355495.png new file mode 100644 index 0000000000000000000000000000000000000000..20a059e38001957319449e3c1892b3a9bae0ab94 --- /dev/null +++ b/assets/images/pexels-riedelmax-27355495.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4141d2f5f718f162ea1f6710c06b28b5cb51fd69598fde35948f8f3491228164 +size 3732680 diff --git a/assets/images/pexels-sergeymakashin-5368660.png b/assets/images/pexels-sergeymakashin-5368660.png new file mode 100644 index 0000000000000000000000000000000000000000..5b0d1554db9cdcaec20efba4c0628a0ab55867f4 --- /dev/null +++ b/assets/images/pexels-sergeymakashin-5368660.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8f5a8f26dd102d87d94c1be36ec903791fe8e6d951c68ebb9ebcfc6d7397bb +size 4075879 diff --git a/assets/images/pexels-vinicius-wiesehofer-289347-4219918.png b/assets/images/pexels-vinicius-wiesehofer-289347-4219918.png new file mode 100644 index 0000000000000000000000000000000000000000..95aa28be407ccc9b63706bdfb961f99a67319dad --- /dev/null +++ b/assets/images/pexels-vinicius-wiesehofer-289347-4219918.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6eef5eee15b81fe65ea95627e9a46040b9889466689b3c1ca6ed273e02fe84f +size 3627053 diff --git a/assets/rtmdet_m_640-8xb32_coco-person_no_nms.py b/assets/rtmdet_m_640-8xb32_coco-person_no_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..59b2b465444f5162e3bd859f63608146ba9437a4 --- /dev/null +++ b/assets/rtmdet_m_640-8xb32_coco-person_no_nms.py @@ -0,0 +1,20 @@ +_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + bbox_head=dict(num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=None, + max_per_img=100)) + +train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) + +val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader \ No newline at end of file diff --git a/build_wheel.py b/build_wheel.py new file mode 100644 index 0000000000000000000000000000000000000000..7d4520ce8c84e4d0923db97c9743ad2f3e187483 --- /dev/null +++ b/build_wheel.py @@ -0,0 +1,26 @@ +import os +import subprocess +import sys + +def build_wheel(package_path): + current_dir = os.getcwd() + os.chdir(package_path) + try: + 
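+ # setuptools' bdist_wheel drops the built .whl into ./dist inside the package,
+ # which is the directory that find_wheel() in app.py searches for wheels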
subprocess.check_call([sys.executable, "setup.py", "bdist_wheel"]) + finally: + os.chdir(current_dir) + +def main(): + packages = [ + "./external/engine", + "./external/cv", + "./external/det" + ] + + for package in packages: + print(f"Building wheel for {package}...") + build_wheel(package) + print(f"Wheel built for {package}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/classes_and_palettes.py b/classes_and_palettes.py new file mode 100644 index 0000000000000000000000000000000000000000..ff650bb7e0f738f4b453590554c9f995feb8235c --- /dev/null +++ b/classes_and_palettes.py @@ -0,0 +1,1024 @@ +COCO_KPTS_COLORS = [ + [51, 153, 255], # 0: nose + [51, 153, 255], # 1: left_eye + [51, 153, 255], # 2: right_eye + [51, 153, 255], # 3: left_ear + [51, 153, 255], # 4: right_ear + [0, 255, 0], # 5: left_shoulder + [255, 128, 0], # 6: right_shoulder + [0, 255, 0], # 7: left_elbow + [255, 128, 0], # 8: right_elbow + [0, 255, 0], # 9: left_wrist + [255, 128, 0], # 10: right_wrist + [0, 255, 0], # 11: left_hip + [255, 128, 0], # 12: right_hip + [0, 255, 0], # 13: left_knee + [255, 128, 0], # 14: right_knee + [0, 255, 0], # 15: left_ankle + [255, 128, 0], # 16: right_ankle +] + +COCO_WHOLEBODY_KPTS_COLORS = [ + [51, 153, 255], # 0: nose + [51, 153, 255], # 1: left_eye + [51, 153, 255], # 2: right_eye + [51, 153, 255], # 3: left_ear + [51, 153, 255], # 4: right_ear + [0, 255, 0], # 5: left_shoulder + [255, 128, 0], # 6: right_shoulder + [0, 255, 0], # 7: left_elbow + [255, 128, 0], # 8: right_elbow + [0, 255, 0], # 9: left_wrist + [255, 128, 0], # 10: right_wrist + [0, 255, 0], # 11: left_hip + [255, 128, 0], # 12: right_hip + [0, 255, 0], # 13: left_knee + [255, 128, 0], # 14: right_knee + [0, 255, 0], # 15: left_ankle + [255, 128, 0], # 16: right_ankle + [255, 128, 0], # 17: left_big_toe + [255, 128, 0], # 18: left_small_toe + [255, 128, 0], # 19: left_heel + [255, 128, 0], # 20: right_big_toe + [255, 128, 0], # 21: right_small_toe + [255, 128, 0], # 22: right_heel + [255, 255, 255], # 23: face-0 + [255, 255, 255], # 24: face-1 + [255, 255, 255], # 25: face-2 + [255, 255, 255], # 26: face-3 + [255, 255, 255], # 27: face-4 + [255, 255, 255], # 28: face-5 + [255, 255, 255], # 29: face-6 + [255, 255, 255], # 30: face-7 + [255, 255, 255], # 31: face-8 + [255, 255, 255], # 32: face-9 + [255, 255, 255], # 33: face-10 + [255, 255, 255], # 34: face-11 + [255, 255, 255], # 35: face-12 + [255, 255, 255], # 36: face-13 + [255, 255, 255], # 37: face-14 + [255, 255, 255], # 38: face-15 + [255, 255, 255], # 39: face-16 + [255, 255, 255], # 40: face-17 + [255, 255, 255], # 41: face-18 + [255, 255, 255], # 42: face-19 + [255, 255, 255], # 43: face-20 + [255, 255, 255], # 44: face-21 + [255, 255, 255], # 45: face-22 + [255, 255, 255], # 46: face-23 + [255, 255, 255], # 47: face-24 + [255, 255, 255], # 48: face-25 + [255, 255, 255], # 49: face-26 + [255, 255, 255], # 50: face-27 + [255, 255, 255], # 51: face-28 + [255, 255, 255], # 52: face-29 + [255, 255, 255], # 53: face-30 + [255, 255, 255], # 54: face-31 + [255, 255, 255], # 55: face-32 + [255, 255, 255], # 56: face-33 + [255, 255, 255], # 57: face-34 + [255, 255, 255], # 58: face-35 + [255, 255, 255], # 59: face-36 + [255, 255, 255], # 60: face-37 + [255, 255, 255], # 61: face-38 + [255, 255, 255], # 62: face-39 + [255, 255, 255], # 63: face-40 + [255, 255, 255], # 64: face-41 + [255, 255, 255], # 65: face-42 + [255, 255, 255], # 66: face-43 + [255, 255, 255], # 67: face-44 + [255, 255, 255], # 68: face-45 + [255, 255, 255], # 69: 
face-46 + [255, 255, 255], # 70: face-47 + [255, 255, 255], # 71: face-48 + [255, 255, 255], # 72: face-49 + [255, 255, 255], # 73: face-50 + [255, 255, 255], # 74: face-51 + [255, 255, 255], # 75: face-52 + [255, 255, 255], # 76: face-53 + [255, 255, 255], # 77: face-54 + [255, 255, 255], # 78: face-55 + [255, 255, 255], # 79: face-56 + [255, 255, 255], # 80: face-57 + [255, 255, 255], # 81: face-58 + [255, 255, 255], # 82: face-59 + [255, 255, 255], # 83: face-60 + [255, 255, 255], # 84: face-61 + [255, 255, 255], # 85: face-62 + [255, 255, 255], # 86: face-63 + [255, 255, 255], # 87: face-64 + [255, 255, 255], # 88: face-65 + [255, 255, 255], # 89: face-66 + [255, 255, 255], # 90: face-67 + [255, 255, 255], # 91: left_hand_root + [255, 128, 0], # 92: left_thumb1 + [255, 128, 0], # 93: left_thumb2 + [255, 128, 0], # 94: left_thumb3 + [255, 128, 0], # 95: left_thumb4 + [255, 153, 255], # 96: left_forefinger1 + [255, 153, 255], # 97: left_forefinger2 + [255, 153, 255], # 98: left_forefinger3 + [255, 153, 255], # 99: left_forefinger4 + [102, 178, 255], # 100: left_middle_finger1 + [102, 178, 255], # 101: left_middle_finger2 + [102, 178, 255], # 102: left_middle_finger3 + [102, 178, 255], # 103: left_middle_finger4 + [255, 51, 51], # 104: left_ring_finger1 + [255, 51, 51], # 105: left_ring_finger2 + [255, 51, 51], # 106: left_ring_finger3 + [255, 51, 51], # 107: left_ring_finger4 + [0, 255, 0], # 108: left_pinky_finger1 + [0, 255, 0], # 109: left_pinky_finger2 + [0, 255, 0], # 110: left_pinky_finger3 + [0, 255, 0], # 111: left_pinky_finger4 + [255, 255, 255], # 112: right_hand_root + [255, 128, 0], # 113: right_thumb1 + [255, 128, 0], # 114: right_thumb2 + [255, 128, 0], # 115: right_thumb3 + [255, 128, 0], # 116: right_thumb4 + [255, 153, 255], # 117: right_forefinger1 + [255, 153, 255], # 118: right_forefinger2 + [255, 153, 255], # 119: right_forefinger3 + [255, 153, 255], # 120: right_forefinger4 + [102, 178, 255], # 121: right_middle_finger1 + [102, 178, 255], # 122: right_middle_finger2 + [102, 178, 255], # 123: right_middle_finger3 + [102, 178, 255], # 124: right_middle_finger4 + [255, 51, 51], # 125: right_ring_finger1 + [255, 51, 51], # 126: right_ring_finger2 + [255, 51, 51], # 127: right_ring_finger3 + [255, 51, 51], # 128: right_ring_finger4 + [0, 255, 0], # 129: right_pinky_finger1 + [0, 255, 0], # 130: right_pinky_finger2 + [0, 255, 0], # 131: right_pinky_finger3 + [0, 255, 0], # 132: right_pinky_finger4 +] + + +GOLIATH_KPTS_COLORS = [ + [51, 153, 255], # 0: nose + [51, 153, 255], # 1: left_eye + [51, 153, 255], # 2: right_eye + [51, 153, 255], # 3: left_ear + [51, 153, 255], # 4: right_ear + [51, 153, 255], # 5: left_shoulder + [51, 153, 255], # 6: right_shoulder + [51, 153, 255], # 7: left_elbow + [51, 153, 255], # 8: right_elbow + [51, 153, 255], # 9: left_hip + [51, 153, 255], # 10: right_hip + [51, 153, 255], # 11: left_knee + [51, 153, 255], # 12: right_knee + [51, 153, 255], # 13: left_ankle + [51, 153, 255], # 14: right_ankle + [51, 153, 255], # 15: left_big_toe + [51, 153, 255], # 16: left_small_toe + [51, 153, 255], # 17: left_heel + [51, 153, 255], # 18: right_big_toe + [51, 153, 255], # 19: right_small_toe + [51, 153, 255], # 20: right_heel + [51, 153, 255], # 21: right_thumb4 + [51, 153, 255], # 22: right_thumb3 + [51, 153, 255], # 23: right_thumb2 + [51, 153, 255], # 24: right_thumb_third_joint + [51, 153, 255], # 25: right_forefinger4 + [51, 153, 255], # 26: right_forefinger3 + [51, 153, 255], # 27: right_forefinger2 + [51, 153, 255], # 28: 
right_forefinger_third_joint + [51, 153, 255], # 29: right_middle_finger4 + [51, 153, 255], # 30: right_middle_finger3 + [51, 153, 255], # 31: right_middle_finger2 + [51, 153, 255], # 32: right_middle_finger_third_joint + [51, 153, 255], # 33: right_ring_finger4 + [51, 153, 255], # 34: right_ring_finger3 + [51, 153, 255], # 35: right_ring_finger2 + [51, 153, 255], # 36: right_ring_finger_third_joint + [51, 153, 255], # 37: right_pinky_finger4 + [51, 153, 255], # 38: right_pinky_finger3 + [51, 153, 255], # 39: right_pinky_finger2 + [51, 153, 255], # 40: right_pinky_finger_third_joint + [51, 153, 255], # 41: right_wrist + [51, 153, 255], # 42: left_thumb4 + [51, 153, 255], # 43: left_thumb3 + [51, 153, 255], # 44: left_thumb2 + [51, 153, 255], # 45: left_thumb_third_joint + [51, 153, 255], # 46: left_forefinger4 + [51, 153, 255], # 47: left_forefinger3 + [51, 153, 255], # 48: left_forefinger2 + [51, 153, 255], # 49: left_forefinger_third_joint + [51, 153, 255], # 50: left_middle_finger4 + [51, 153, 255], # 51: left_middle_finger3 + [51, 153, 255], # 52: left_middle_finger2 + [51, 153, 255], # 53: left_middle_finger_third_joint + [51, 153, 255], # 54: left_ring_finger4 + [51, 153, 255], # 55: left_ring_finger3 + [51, 153, 255], # 56: left_ring_finger2 + [51, 153, 255], # 57: left_ring_finger_third_joint + [51, 153, 255], # 58: left_pinky_finger4 + [51, 153, 255], # 59: left_pinky_finger3 + [51, 153, 255], # 60: left_pinky_finger2 + [51, 153, 255], # 61: left_pinky_finger_third_joint + [51, 153, 255], # 62: left_wrist + [51, 153, 255], # 63: left_olecranon + [51, 153, 255], # 64: right_olecranon + [51, 153, 255], # 65: left_cubital_fossa + [51, 153, 255], # 66: right_cubital_fossa + [51, 153, 255], # 67: left_acromion + [51, 153, 255], # 68: right_acromion + [51, 153, 255], # 69: neck + [255, 255, 255], # 70: center_of_glabella + [255, 255, 255], # 71: center_of_nose_root + [255, 255, 255], # 72: tip_of_nose_bridge + [255, 255, 255], # 73: midpoint_1_of_nose_bridge + [255, 255, 255], # 74: midpoint_2_of_nose_bridge + [255, 255, 255], # 75: midpoint_3_of_nose_bridge + [255, 255, 255], # 76: center_of_labiomental_groove + [255, 255, 255], # 77: tip_of_chin + [255, 255, 255], # 78: upper_startpoint_of_r_eyebrow + [255, 255, 255], # 79: lower_startpoint_of_r_eyebrow + [255, 255, 255], # 80: end_of_r_eyebrow + [255, 255, 255], # 81: upper_midpoint_1_of_r_eyebrow + [255, 255, 255], # 82: lower_midpoint_1_of_r_eyebrow + [255, 255, 255], # 83: upper_midpoint_2_of_r_eyebrow + [255, 255, 255], # 84: upper_midpoint_3_of_r_eyebrow + [255, 255, 255], # 85: lower_midpoint_2_of_r_eyebrow + [255, 255, 255], # 86: lower_midpoint_3_of_r_eyebrow + [255, 255, 255], # 87: upper_startpoint_of_l_eyebrow + [255, 255, 255], # 88: lower_startpoint_of_l_eyebrow + [255, 255, 255], # 89: end_of_l_eyebrow + [255, 255, 255], # 90: upper_midpoint_1_of_l_eyebrow + [255, 255, 255], # 91: lower_midpoint_1_of_l_eyebrow + [255, 255, 255], # 92: upper_midpoint_2_of_l_eyebrow + [255, 255, 255], # 93: upper_midpoint_3_of_l_eyebrow + [255, 255, 255], # 94: lower_midpoint_2_of_l_eyebrow + [255, 255, 255], # 95: lower_midpoint_3_of_l_eyebrow + [192, 64, 128], # 96: l_inner_end_of_upper_lash_line + [192, 64, 128], # 97: l_outer_end_of_upper_lash_line + [192, 64, 128], # 98: l_centerpoint_of_upper_lash_line + [192, 64, 128], # 99: l_midpoint_2_of_upper_lash_line + [192, 64, 128], # 100: l_midpoint_1_of_upper_lash_line + [192, 64, 128], # 101: l_midpoint_6_of_upper_lash_line + [192, 64, 128], # 102: l_midpoint_5_of_upper_lash_line + [192, 
64, 128], # 103: l_midpoint_4_of_upper_lash_line + [192, 64, 128], # 104: l_midpoint_3_of_upper_lash_line + [192, 64, 128], # 105: l_outer_end_of_upper_eyelid_line + [192, 64, 128], # 106: l_midpoint_6_of_upper_eyelid_line + [192, 64, 128], # 107: l_midpoint_2_of_upper_eyelid_line + [192, 64, 128], # 108: l_midpoint_5_of_upper_eyelid_line + [192, 64, 128], # 109: l_centerpoint_of_upper_eyelid_line + [192, 64, 128], # 110: l_midpoint_4_of_upper_eyelid_line + [192, 64, 128], # 111: l_midpoint_1_of_upper_eyelid_line + [192, 64, 128], # 112: l_midpoint_3_of_upper_eyelid_line + [192, 64, 128], # 113: l_midpoint_6_of_upper_crease_line + [192, 64, 128], # 114: l_midpoint_2_of_upper_crease_line + [192, 64, 128], # 115: l_midpoint_5_of_upper_crease_line + [192, 64, 128], # 116: l_centerpoint_of_upper_crease_line + [192, 64, 128], # 117: l_midpoint_4_of_upper_crease_line + [192, 64, 128], # 118: l_midpoint_1_of_upper_crease_line + [192, 64, 128], # 119: l_midpoint_3_of_upper_crease_line + [64, 32, 192], # 120: r_inner_end_of_upper_lash_line + [64, 32, 192], # 121: r_outer_end_of_upper_lash_line + [64, 32, 192], # 122: r_centerpoint_of_upper_lash_line + [64, 32, 192], # 123: r_midpoint_1_of_upper_lash_line + [64, 32, 192], # 124: r_midpoint_2_of_upper_lash_line + [64, 32, 192], # 125: r_midpoint_3_of_upper_lash_line + [64, 32, 192], # 126: r_midpoint_4_of_upper_lash_line + [64, 32, 192], # 127: r_midpoint_5_of_upper_lash_line + [64, 32, 192], # 128: r_midpoint_6_of_upper_lash_line + [64, 32, 192], # 129: r_outer_end_of_upper_eyelid_line + [64, 32, 192], # 130: r_midpoint_3_of_upper_eyelid_line + [64, 32, 192], # 131: r_midpoint_1_of_upper_eyelid_line + [64, 32, 192], # 132: r_midpoint_4_of_upper_eyelid_line + [64, 32, 192], # 133: r_centerpoint_of_upper_eyelid_line + [64, 32, 192], # 134: r_midpoint_5_of_upper_eyelid_line + [64, 32, 192], # 135: r_midpoint_2_of_upper_eyelid_line + [64, 32, 192], # 136: r_midpoint_6_of_upper_eyelid_line + [64, 32, 192], # 137: r_midpoint_3_of_upper_crease_line + [64, 32, 192], # 138: r_midpoint_1_of_upper_crease_line + [64, 32, 192], # 139: r_midpoint_4_of_upper_crease_line + [64, 32, 192], # 140: r_centerpoint_of_upper_crease_line + [64, 32, 192], # 141: r_midpoint_5_of_upper_crease_line + [64, 32, 192], # 142: r_midpoint_2_of_upper_crease_line + [64, 32, 192], # 143: r_midpoint_6_of_upper_crease_line + [64, 192, 128], # 144: l_inner_end_of_lower_lash_line + [64, 192, 128], # 145: l_outer_end_of_lower_lash_line + [64, 192, 128], # 146: l_centerpoint_of_lower_lash_line + [64, 192, 128], # 147: l_midpoint_2_of_lower_lash_line + [64, 192, 128], # 148: l_midpoint_1_of_lower_lash_line + [64, 192, 128], # 149: l_midpoint_6_of_lower_lash_line + [64, 192, 128], # 150: l_midpoint_5_of_lower_lash_line + [64, 192, 128], # 151: l_midpoint_4_of_lower_lash_line + [64, 192, 128], # 152: l_midpoint_3_of_lower_lash_line + [64, 192, 128], # 153: l_outer_end_of_lower_eyelid_line + [64, 192, 128], # 154: l_midpoint_6_of_lower_eyelid_line + [64, 192, 128], # 155: l_midpoint_2_of_lower_eyelid_line + [64, 192, 128], # 156: l_midpoint_5_of_lower_eyelid_line + [64, 192, 128], # 157: l_centerpoint_of_lower_eyelid_line + [64, 192, 128], # 158: l_midpoint_4_of_lower_eyelid_line + [64, 192, 128], # 159: l_midpoint_1_of_lower_eyelid_line + [64, 192, 128], # 160: l_midpoint_3_of_lower_eyelid_line + [64, 192, 32], # 161: r_inner_end_of_lower_lash_line + [64, 192, 32], # 162: r_outer_end_of_lower_lash_line + [64, 192, 32], # 163: r_centerpoint_of_lower_lash_line + [64, 192, 32], # 164: 
r_midpoint_1_of_lower_lash_line + [64, 192, 32], # 165: r_midpoint_2_of_lower_lash_line + [64, 192, 32], # 166: r_midpoint_3_of_lower_lash_line + [64, 192, 32], # 167: r_midpoint_4_of_lower_lash_line + [64, 192, 32], # 168: r_midpoint_5_of_lower_lash_line + [64, 192, 32], # 169: r_midpoint_6_of_lower_lash_line + [64, 192, 32], # 170: r_outer_end_of_lower_eyelid_line + [64, 192, 32], # 171: r_midpoint_3_of_lower_eyelid_line + [64, 192, 32], # 172: r_midpoint_1_of_lower_eyelid_line + [64, 192, 32], # 173: r_midpoint_4_of_lower_eyelid_line + [64, 192, 32], # 174: r_centerpoint_of_lower_eyelid_line + [64, 192, 32], # 175: r_midpoint_5_of_lower_eyelid_line + [64, 192, 32], # 176: r_midpoint_2_of_lower_eyelid_line + [64, 192, 32], # 177: r_midpoint_6_of_lower_eyelid_line + [0, 192, 0], # 178: tip_of_nose + [0, 192, 0], # 179: bottom_center_of_nose + [0, 192, 0], # 180: r_outer_corner_of_nose + [0, 192, 0], # 181: l_outer_corner_of_nose + [0, 192, 0], # 182: inner_corner_of_r_nostril + [0, 192, 0], # 183: outer_corner_of_r_nostril + [0, 192, 0], # 184: upper_corner_of_r_nostril + [0, 192, 0], # 185: inner_corner_of_l_nostril + [0, 192, 0], # 186: outer_corner_of_l_nostril + [0, 192, 0], # 187: upper_corner_of_l_nostril + [192, 0, 0], # 188: r_outer_corner_of_mouth + [192, 0, 0], # 189: l_outer_corner_of_mouth + [192, 0, 0], # 190: center_of_cupid_bow + [192, 0, 0], # 191: center_of_lower_outer_lip + [192, 0, 0], # 192: midpoint_1_of_upper_outer_lip + [192, 0, 0], # 193: midpoint_2_of_upper_outer_lip + [192, 0, 0], # 194: midpoint_1_of_lower_outer_lip + [192, 0, 0], # 195: midpoint_2_of_lower_outer_lip + [192, 0, 0], # 196: midpoint_3_of_upper_outer_lip + [192, 0, 0], # 197: midpoint_4_of_upper_outer_lip + [192, 0, 0], # 198: midpoint_5_of_upper_outer_lip + [192, 0, 0], # 199: midpoint_6_of_upper_outer_lip + [192, 0, 0], # 200: midpoint_3_of_lower_outer_lip + [192, 0, 0], # 201: midpoint_4_of_lower_outer_lip + [192, 0, 0], # 202: midpoint_5_of_lower_outer_lip + [192, 0, 0], # 203: midpoint_6_of_lower_outer_lip + [0, 192, 192], # 204: r_inner_corner_of_mouth + [0, 192, 192], # 205: l_inner_corner_of_mouth + [0, 192, 192], # 206: center_of_upper_inner_lip + [0, 192, 192], # 207: center_of_lower_inner_lip + [0, 192, 192], # 208: midpoint_1_of_upper_inner_lip + [0, 192, 192], # 209: midpoint_2_of_upper_inner_lip + [0, 192, 192], # 210: midpoint_1_of_lower_inner_lip + [0, 192, 192], # 211: midpoint_2_of_lower_inner_lip + [0, 192, 192], # 212: midpoint_3_of_upper_inner_lip + [0, 192, 192], # 213: midpoint_4_of_upper_inner_lip + [0, 192, 192], # 214: midpoint_5_of_upper_inner_lip + [0, 192, 192], # 215: midpoint_6_of_upper_inner_lip + [0, 192, 192], # 216: midpoint_3_of_lower_inner_lip + [0, 192, 192], # 217: midpoint_4_of_lower_inner_lip + [0, 192, 192], # 218: midpoint_5_of_lower_inner_lip + [0, 192, 192], # 219: midpoint_6_of_lower_inner_lip. 
teeths removed + [200, 200, 0], # 256: l_top_end_of_inferior_crus + [200, 200, 0], # 257: l_top_end_of_superior_crus + [200, 200, 0], # 258: l_start_of_antihelix + [200, 200, 0], # 259: l_end_of_antihelix + [200, 200, 0], # 260: l_midpoint_1_of_antihelix + [200, 200, 0], # 261: l_midpoint_1_of_inferior_crus + [200, 200, 0], # 262: l_midpoint_2_of_antihelix + [200, 200, 0], # 263: l_midpoint_3_of_antihelix + [200, 200, 0], # 264: l_point_1_of_inner_helix + [200, 200, 0], # 265: l_point_2_of_inner_helix + [200, 200, 0], # 266: l_point_3_of_inner_helix + [200, 200, 0], # 267: l_point_4_of_inner_helix + [200, 200, 0], # 268: l_point_5_of_inner_helix + [200, 200, 0], # 269: l_point_6_of_inner_helix + [200, 200, 0], # 270: l_point_7_of_inner_helix + [200, 200, 0], # 271: l_highest_point_of_antitragus + [200, 200, 0], # 272: l_bottom_point_of_tragus + [200, 200, 0], # 273: l_protruding_point_of_tragus + [200, 200, 0], # 274: l_top_point_of_tragus + [200, 200, 0], # 275: l_start_point_of_crus_of_helix + [200, 200, 0], # 276: l_deepest_point_of_concha + [200, 200, 0], # 277: l_tip_of_ear_lobe + [200, 200, 0], # 278: l_midpoint_between_22_15 + [200, 200, 0], # 279: l_bottom_connecting_point_of_ear_lobe + [200, 200, 0], # 280: l_top_connecting_point_of_helix + [200, 200, 0], # 281: l_point_8_of_inner_helix + [0, 200, 200], # 282: r_top_end_of_inferior_crus + [0, 200, 200], # 283: r_top_end_of_superior_crus + [0, 200, 200], # 284: r_start_of_antihelix + [0, 200, 200], # 285: r_end_of_antihelix + [0, 200, 200], # 286: r_midpoint_1_of_antihelix + [0, 200, 200], # 287: r_midpoint_1_of_inferior_crus + [0, 200, 200], # 288: r_midpoint_2_of_antihelix + [0, 200, 200], # 289: r_midpoint_3_of_antihelix + [0, 200, 200], # 290: r_point_1_of_inner_helix + [0, 200, 200], # 291: r_point_8_of_inner_helix + [0, 200, 200], # 292: r_point_3_of_inner_helix + [0, 200, 200], # 293: r_point_4_of_inner_helix + [0, 200, 200], # 294: r_point_5_of_inner_helix + [0, 200, 200], # 295: r_point_6_of_inner_helix + [0, 200, 200], # 296: r_point_7_of_inner_helix + [0, 200, 200], # 297: r_highest_point_of_antitragus + [0, 200, 200], # 298: r_bottom_point_of_tragus + [0, 200, 200], # 299: r_protruding_point_of_tragus + [0, 200, 200], # 300: r_top_point_of_tragus + [0, 200, 200], # 301: r_start_point_of_crus_of_helix + [0, 200, 200], # 302: r_deepest_point_of_concha + [0, 200, 200], # 303: r_tip_of_ear_lobe + [0, 200, 200], # 304: r_midpoint_between_22_15 + [0, 200, 200], # 305: r_bottom_connecting_point_of_ear_lobe + [0, 200, 200], # 306: r_top_connecting_point_of_helix + [0, 200, 200], # 307: r_point_2_of_inner_helix + [128, 192, 64], # 308: l_center_of_iris + [128, 192, 64], # 309: l_border_of_iris_3 + [128, 192, 64], # 310: l_border_of_iris_midpoint_1 + [128, 192, 64], # 311: l_border_of_iris_12 + [128, 192, 64], # 312: l_border_of_iris_midpoint_4 + [128, 192, 64], # 313: l_border_of_iris_9 + [128, 192, 64], # 314: l_border_of_iris_midpoint_3 + [128, 192, 64], # 315: l_border_of_iris_6 + [128, 192, 64], # 316: l_border_of_iris_midpoint_2 + [192, 32, 64], # 317: r_center_of_iris + [192, 32, 64], # 318: r_border_of_iris_3 + [192, 32, 64], # 319: r_border_of_iris_midpoint_1 + [192, 32, 64], # 320: r_border_of_iris_12 + [192, 32, 64], # 321: r_border_of_iris_midpoint_4 + [192, 32, 64], # 322: r_border_of_iris_9 + [192, 32, 64], # 323: r_border_of_iris_midpoint_3 + [192, 32, 64], # 324: r_border_of_iris_6 + [192, 32, 64], # 325: r_border_of_iris_midpoint_2 + [192, 128, 64], # 326: l_center_of_pupil + [192, 128, 64], # 327: 
l_border_of_pupil_3 + [192, 128, 64], # 328: l_border_of_pupil_midpoint_1 + [192, 128, 64], # 329: l_border_of_pupil_12 + [192, 128, 64], # 330: l_border_of_pupil_midpoint_4 + [192, 128, 64], # 331: l_border_of_pupil_9 + [192, 128, 64], # 332: l_border_of_pupil_midpoint_3 + [192, 128, 64], # 333: l_border_of_pupil_6 + [192, 128, 64], # 334: l_border_of_pupil_midpoint_2 + [32, 192, 192], # 335: r_center_of_pupil + [32, 192, 192], # 336: r_border_of_pupil_3 + [32, 192, 192], # 337: r_border_of_pupil_midpoint_1 + [32, 192, 192], # 338: r_border_of_pupil_12 + [32, 192, 192], # 339: r_border_of_pupil_midpoint_4 + [32, 192, 192], # 340: r_border_of_pupil_9 + [32, 192, 192], # 341: r_border_of_pupil_midpoint_3 + [32, 192, 192], # 342: r_border_of_pupil_6 + [32, 192, 192], # 343: r_border_of_pupil_midpoint_2 +] + +GOLIATH_KEYPOINTS = [ + "nose", + "left_eye", + "right_eye", + "left_ear", + "right_ear", + "left_shoulder", + "right_shoulder", + "left_elbow", + "right_elbow", + "left_hip", + "right_hip", + "left_knee", + "right_knee", + "left_ankle", + "right_ankle", + "left_big_toe", + "left_small_toe", + "left_heel", + "right_big_toe", + "right_small_toe", + "right_heel", + "right_thumb4", + "right_thumb3", + "right_thumb2", + "right_thumb_third_joint", + "right_forefinger4", + "right_forefinger3", + "right_forefinger2", + "right_forefinger_third_joint", + "right_middle_finger4", + "right_middle_finger3", + "right_middle_finger2", + "right_middle_finger_third_joint", + "right_ring_finger4", + "right_ring_finger3", + "right_ring_finger2", + "right_ring_finger_third_joint", + "right_pinky_finger4", + "right_pinky_finger3", + "right_pinky_finger2", + "right_pinky_finger_third_joint", + "right_wrist", + "left_thumb4", + "left_thumb3", + "left_thumb2", + "left_thumb_third_joint", + "left_forefinger4", + "left_forefinger3", + "left_forefinger2", + "left_forefinger_third_joint", + "left_middle_finger4", + "left_middle_finger3", + "left_middle_finger2", + "left_middle_finger_third_joint", + "left_ring_finger4", + "left_ring_finger3", + "left_ring_finger2", + "left_ring_finger_third_joint", + "left_pinky_finger4", + "left_pinky_finger3", + "left_pinky_finger2", + "left_pinky_finger_third_joint", + "left_wrist", + "left_olecranon", + "right_olecranon", + "left_cubital_fossa", + "right_cubital_fossa", + "left_acromion", + "right_acromion", + "neck", + "center_of_glabella", + "center_of_nose_root", + "tip_of_nose_bridge", + "midpoint_1_of_nose_bridge", + "midpoint_2_of_nose_bridge", + "midpoint_3_of_nose_bridge", + "center_of_labiomental_groove", + "tip_of_chin", + "upper_startpoint_of_r_eyebrow", + "lower_startpoint_of_r_eyebrow", + "end_of_r_eyebrow", + "upper_midpoint_1_of_r_eyebrow", + "lower_midpoint_1_of_r_eyebrow", + "upper_midpoint_2_of_r_eyebrow", + "upper_midpoint_3_of_r_eyebrow", + "lower_midpoint_2_of_r_eyebrow", + "lower_midpoint_3_of_r_eyebrow", + "upper_startpoint_of_l_eyebrow", + "lower_startpoint_of_l_eyebrow", + "end_of_l_eyebrow", + "upper_midpoint_1_of_l_eyebrow", + "lower_midpoint_1_of_l_eyebrow", + "upper_midpoint_2_of_l_eyebrow", + "upper_midpoint_3_of_l_eyebrow", + "lower_midpoint_2_of_l_eyebrow", + "lower_midpoint_3_of_l_eyebrow", + "l_inner_end_of_upper_lash_line", + "l_outer_end_of_upper_lash_line", + "l_centerpoint_of_upper_lash_line", + "l_midpoint_2_of_upper_lash_line", + "l_midpoint_1_of_upper_lash_line", + "l_midpoint_6_of_upper_lash_line", + "l_midpoint_5_of_upper_lash_line", + "l_midpoint_4_of_upper_lash_line", + "l_midpoint_3_of_upper_lash_line", + 
"l_outer_end_of_upper_eyelid_line", + "l_midpoint_6_of_upper_eyelid_line", + "l_midpoint_2_of_upper_eyelid_line", + "l_midpoint_5_of_upper_eyelid_line", + "l_centerpoint_of_upper_eyelid_line", + "l_midpoint_4_of_upper_eyelid_line", + "l_midpoint_1_of_upper_eyelid_line", + "l_midpoint_3_of_upper_eyelid_line", + "l_midpoint_6_of_upper_crease_line", + "l_midpoint_2_of_upper_crease_line", + "l_midpoint_5_of_upper_crease_line", + "l_centerpoint_of_upper_crease_line", + "l_midpoint_4_of_upper_crease_line", + "l_midpoint_1_of_upper_crease_line", + "l_midpoint_3_of_upper_crease_line", + "r_inner_end_of_upper_lash_line", + "r_outer_end_of_upper_lash_line", + "r_centerpoint_of_upper_lash_line", + "r_midpoint_1_of_upper_lash_line", + "r_midpoint_2_of_upper_lash_line", + "r_midpoint_3_of_upper_lash_line", + "r_midpoint_4_of_upper_lash_line", + "r_midpoint_5_of_upper_lash_line", + "r_midpoint_6_of_upper_lash_line", + "r_outer_end_of_upper_eyelid_line", + "r_midpoint_3_of_upper_eyelid_line", + "r_midpoint_1_of_upper_eyelid_line", + "r_midpoint_4_of_upper_eyelid_line", + "r_centerpoint_of_upper_eyelid_line", + "r_midpoint_5_of_upper_eyelid_line", + "r_midpoint_2_of_upper_eyelid_line", + "r_midpoint_6_of_upper_eyelid_line", + "r_midpoint_3_of_upper_crease_line", + "r_midpoint_1_of_upper_crease_line", + "r_midpoint_4_of_upper_crease_line", + "r_centerpoint_of_upper_crease_line", + "r_midpoint_5_of_upper_crease_line", + "r_midpoint_2_of_upper_crease_line", + "r_midpoint_6_of_upper_crease_line", + "l_inner_end_of_lower_lash_line", + "l_outer_end_of_lower_lash_line", + "l_centerpoint_of_lower_lash_line", + "l_midpoint_2_of_lower_lash_line", + "l_midpoint_1_of_lower_lash_line", + "l_midpoint_6_of_lower_lash_line", + "l_midpoint_5_of_lower_lash_line", + "l_midpoint_4_of_lower_lash_line", + "l_midpoint_3_of_lower_lash_line", + "l_outer_end_of_lower_eyelid_line", + "l_midpoint_6_of_lower_eyelid_line", + "l_midpoint_2_of_lower_eyelid_line", + "l_midpoint_5_of_lower_eyelid_line", + "l_centerpoint_of_lower_eyelid_line", + "l_midpoint_4_of_lower_eyelid_line", + "l_midpoint_1_of_lower_eyelid_line", + "l_midpoint_3_of_lower_eyelid_line", + "r_inner_end_of_lower_lash_line", + "r_outer_end_of_lower_lash_line", + "r_centerpoint_of_lower_lash_line", + "r_midpoint_1_of_lower_lash_line", + "r_midpoint_2_of_lower_lash_line", + "r_midpoint_3_of_lower_lash_line", + "r_midpoint_4_of_lower_lash_line", + "r_midpoint_5_of_lower_lash_line", + "r_midpoint_6_of_lower_lash_line", + "r_outer_end_of_lower_eyelid_line", + "r_midpoint_3_of_lower_eyelid_line", + "r_midpoint_1_of_lower_eyelid_line", + "r_midpoint_4_of_lower_eyelid_line", + "r_centerpoint_of_lower_eyelid_line", + "r_midpoint_5_of_lower_eyelid_line", + "r_midpoint_2_of_lower_eyelid_line", + "r_midpoint_6_of_lower_eyelid_line", + "tip_of_nose", + "bottom_center_of_nose", + "r_outer_corner_of_nose", + "l_outer_corner_of_nose", + "inner_corner_of_r_nostril", + "outer_corner_of_r_nostril", + "upper_corner_of_r_nostril", + "inner_corner_of_l_nostril", + "outer_corner_of_l_nostril", + "upper_corner_of_l_nostril", + "r_outer_corner_of_mouth", + "l_outer_corner_of_mouth", + "center_of_cupid_bow", + "center_of_lower_outer_lip", + "midpoint_1_of_upper_outer_lip", + "midpoint_2_of_upper_outer_lip", + "midpoint_1_of_lower_outer_lip", + "midpoint_2_of_lower_outer_lip", + "midpoint_3_of_upper_outer_lip", + "midpoint_4_of_upper_outer_lip", + "midpoint_5_of_upper_outer_lip", + "midpoint_6_of_upper_outer_lip", + "midpoint_3_of_lower_outer_lip", + "midpoint_4_of_lower_outer_lip", + 
"midpoint_5_of_lower_outer_lip", + "midpoint_6_of_lower_outer_lip", + "r_inner_corner_of_mouth", + "l_inner_corner_of_mouth", + "center_of_upper_inner_lip", + "center_of_lower_inner_lip", + "midpoint_1_of_upper_inner_lip", + "midpoint_2_of_upper_inner_lip", + "midpoint_1_of_lower_inner_lip", + "midpoint_2_of_lower_inner_lip", + "midpoint_3_of_upper_inner_lip", + "midpoint_4_of_upper_inner_lip", + "midpoint_5_of_upper_inner_lip", + "midpoint_6_of_upper_inner_lip", + "midpoint_3_of_lower_inner_lip", + "midpoint_4_of_lower_inner_lip", + "midpoint_5_of_lower_inner_lip", + "midpoint_6_of_lower_inner_lip", + "l_top_end_of_inferior_crus", + "l_top_end_of_superior_crus", + "l_start_of_antihelix", + "l_end_of_antihelix", + "l_midpoint_1_of_antihelix", + "l_midpoint_1_of_inferior_crus", + "l_midpoint_2_of_antihelix", + "l_midpoint_3_of_antihelix", + "l_point_1_of_inner_helix", + "l_point_2_of_inner_helix", + "l_point_3_of_inner_helix", + "l_point_4_of_inner_helix", + "l_point_5_of_inner_helix", + "l_point_6_of_inner_helix", + "l_point_7_of_inner_helix", + "l_highest_point_of_antitragus", + "l_bottom_point_of_tragus", + "l_protruding_point_of_tragus", + "l_top_point_of_tragus", + "l_start_point_of_crus_of_helix", + "l_deepest_point_of_concha", + "l_tip_of_ear_lobe", + "l_midpoint_between_22_15", + "l_bottom_connecting_point_of_ear_lobe", + "l_top_connecting_point_of_helix", + "l_point_8_of_inner_helix", + "r_top_end_of_inferior_crus", + "r_top_end_of_superior_crus", + "r_start_of_antihelix", + "r_end_of_antihelix", + "r_midpoint_1_of_antihelix", + "r_midpoint_1_of_inferior_crus", + "r_midpoint_2_of_antihelix", + "r_midpoint_3_of_antihelix", + "r_point_1_of_inner_helix", + "r_point_8_of_inner_helix", + "r_point_3_of_inner_helix", + "r_point_4_of_inner_helix", + "r_point_5_of_inner_helix", + "r_point_6_of_inner_helix", + "r_point_7_of_inner_helix", + "r_highest_point_of_antitragus", + "r_bottom_point_of_tragus", + "r_protruding_point_of_tragus", + "r_top_point_of_tragus", + "r_start_point_of_crus_of_helix", + "r_deepest_point_of_concha", + "r_tip_of_ear_lobe", + "r_midpoint_between_22_15", + "r_bottom_connecting_point_of_ear_lobe", + "r_top_connecting_point_of_helix", + "r_point_2_of_inner_helix", + "l_center_of_iris", + "l_border_of_iris_3", + "l_border_of_iris_midpoint_1", + "l_border_of_iris_12", + "l_border_of_iris_midpoint_4", + "l_border_of_iris_9", + "l_border_of_iris_midpoint_3", + "l_border_of_iris_6", + "l_border_of_iris_midpoint_2", + "r_center_of_iris", + "r_border_of_iris_3", + "r_border_of_iris_midpoint_1", + "r_border_of_iris_12", + "r_border_of_iris_midpoint_4", + "r_border_of_iris_9", + "r_border_of_iris_midpoint_3", + "r_border_of_iris_6", + "r_border_of_iris_midpoint_2", + "l_center_of_pupil", + "l_border_of_pupil_3", + "l_border_of_pupil_midpoint_1", + "l_border_of_pupil_12", + "l_border_of_pupil_midpoint_4", + "l_border_of_pupil_9", + "l_border_of_pupil_midpoint_3", + "l_border_of_pupil_6", + "l_border_of_pupil_midpoint_2", + "r_center_of_pupil", + "r_border_of_pupil_3", + "r_border_of_pupil_midpoint_1", + "r_border_of_pupil_12", + "r_border_of_pupil_midpoint_4", + "r_border_of_pupil_9", + "r_border_of_pupil_midpoint_3", + "r_border_of_pupil_6", + "r_border_of_pupil_midpoint_2" +] + +GOLIATH_SKELETON_INFO = { + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, 
color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_wrist', 'left_thumb_third_joint'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb_third_joint', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_wrist', 'left_forefinger_third_joint'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger_third_joint', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_wrist', 'left_middle_finger_third_joint'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger_third_joint', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_wrist', 'left_ring_finger_third_joint'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger_third_joint', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_wrist', 'left_pinky_finger_third_joint'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger_third_joint', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 
0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_wrist', 'right_thumb_third_joint'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb_third_joint', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_wrist', 'right_forefinger_third_joint'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger_third_joint', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_wrist', 'right_middle_finger_third_joint'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger_third_joint', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_wrist', 'right_ring_finger_third_joint'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger_third_joint', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_wrist', 'right_pinky_finger_third_joint'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger_third_joint', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + } \ No newline at end of file diff --git a/detector_utils.py b/detector_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..adb8f2a42ed53cc516e506fb3a852d528118d4de --- /dev/null +++ b/detector_utils.py @@ -0,0 +1,196 @@ +from typing import List, Optional, Sequence, Union + +import torch +import cv2 +import numpy as np +from mmcv.ops import RoIPool +from mmengine.dataset import Compose, pseudo_collate +from mmengine.device import get_device +from mmengine.registry import init_default_scope +from mmdet.apis import inference_detector, init_detector +from mmdet.structures import DetDataSample, SampleList +from mmdet.utils import get_test_pipeline_cfg + + +ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]] + +def nms(dets: np.ndarray, thr: float): + """Greedily select boxes with high confidence and overlap <= thr. + Args: + dets (np.ndarray): [[x1, y1, x2, y2, score]]. + thr (float): Retain overlap < thr. + Returns: + list: Indexes to keep. 
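+
+     Example (illustrative values):
+         dets = np.array([[0, 0, 10, 10, 0.9],
+                          [1, 1, 11, 11, 0.8],
+                          [50, 50, 60, 60, 0.7]])
+         keep = nms(dets, 0.5)  # -> [0, 2]; box 1 overlaps box 0 above thr and is dropped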
+ """ + if len(dets) == 0: + return [] + + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thr)[0] + order = order[inds + 1] + + return keep + +def adapt_mmdet_pipeline(cfg): + """Converts pipeline types in MMDetection's test dataloader to use the + 'mmdet' namespace. + + Args: + cfg (ConfigDict): Configuration dictionary for MMDetection. + + Returns: + ConfigDict: Configuration dictionary with updated pipeline types. + """ + # use lazy import to avoid hard dependence on mmdet + from mmdet.datasets import transforms + + if 'test_dataloader' not in cfg: + return cfg + + pipeline = cfg.test_dataloader.dataset.pipeline + for trans in pipeline: + if trans['type'] in dir(transforms): + trans['type'] = 'mmdet.' + trans['type'] + + return cfg + + +def inference_detector( + model: torch.nn.Module, + imgs: ImagesType, + test_pipeline: Optional[Compose] = None, + text_prompt: Optional[str] = None, + custom_entities: bool = False, +) -> Union[DetDataSample, SampleList]: + """Inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + imgs (str, ndarray, Sequence[str/ndarray]): + Either image files or loaded images. + test_pipeline (:obj:`Compose`): Test pipeline. + + Returns: + :obj:`DetDataSample` or list[:obj:`DetDataSample`]: + If imgs is a list or tuple, the same length list type results + will be returned, otherwise return the detection results directly. + """ + if isinstance(imgs, torch.Tensor): + if imgs.is_cuda: + imgs = imgs.cpu() + + # Remove batch dimension and transpose + imgs = imgs.squeeze(0).permute(1, 2, 0).numpy() + + # Ensure the data type is appropriate (uint8 for most image processing functions) + imgs = (imgs * 255).astype(np.uint8) + + if isinstance(imgs, (list, tuple)) or (isinstance(imgs, np.ndarray) and len(imgs.shape) == 4): + is_batch = True + else: + imgs = [imgs] + is_batch = False + + cfg = model.cfg + + if test_pipeline is None: + cfg = cfg.copy() + test_pipeline = get_test_pipeline_cfg(cfg) + if isinstance(imgs[0], np.ndarray): + # Calling this method across libraries will result + # in module unregistered error if not prefixed with mmdet. + test_pipeline[0].type = "mmdet.LoadImageFromNDArray" + + test_pipeline = Compose(test_pipeline) + + if model.data_preprocessor.device.type == "cpu": + for m in model.modules(): + assert not isinstance( + m, RoIPool + ), "CPU inference with RoIPool is not supported currently." + + result_list = [] + for i, img in enumerate(imgs): + # prepare data + if isinstance(img, np.ndarray): + # TODO: remove img_id. + data_ = dict(img=img, img_id=0) + else: + # TODO: remove img_id. 
+ data_ = dict(img_path=img, img_id=0) + + if text_prompt: + data_["text"] = text_prompt + data_["custom_entities"] = custom_entities + + # build the data pipeline + data_ = test_pipeline(data_) + + data_["inputs"] = [data_["inputs"]] + data_["data_samples"] = [data_["data_samples"]] + + # forward the model + with torch.no_grad(), torch.autocast(device_type=get_device(), dtype=torch.bfloat16): + results = model.test_step(data_)[0] + + result_list.append(results) + + if not is_batch: + return result_list[0] + else: + return result_list + + +def process_one_image_bbox(pred_instance, det_cat_id, bbox_thr, nms_thr): + bboxes = np.concatenate( + (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1 + ) + bboxes = bboxes[ + np.logical_and( + pred_instance.labels == det_cat_id, + pred_instance.scores > bbox_thr, + ) + ] + bboxes = bboxes[nms(bboxes, nms_thr), :4] + return bboxes + + +def process_images_detector(imgs, detector): + """Visualize predicted keypoints (and heatmaps) of one image.""" + # predict bbox + det_results = inference_detector(detector, imgs) + pred_instances = list( + map(lambda det_result: det_result.pred_instances.numpy(), det_results) + ) + bboxes_batch = list( + map( + lambda pred_instance: process_one_image_bbox( + pred_instance, 0, 0.3, 0.3 ## argparse.Namespace(det_cat_id=0, bbox_thr=0.3, nms_thr=0.3), + ), + pred_instances, + ) + ) + + return bboxes_batch diff --git a/external/cv/.gitignore b/external/cv/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a787c67d6eec893ae8f4e07b6123a5170f2593fe --- /dev/null +++ b/external/cv/.gitignore @@ -0,0 +1,125 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# PyTorch checkpoint +*.pth + +# Distribution / packaging +.Python +build/ +develop-eggs/ +#dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +mlu-ops/ +mlu-ops.* + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/en/_build/ +docs/en/api/generated/ +docs/zh_cn/_build/ +docs/zh_cn/api/generated/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# editors and IDEs +.idea/ +.vscode/ + +# custom +.DS_Store + +# datasets and logs and checkpoints +data/ +work_dir/ + +src/ diff --git a/external/cv/MANIFEST.in b/external/cv/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..622635caa1ec01f78d95c684b87658df87c63b38 --- /dev/null +++ b/external/cv/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements/runtime.txt +include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp +include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp +include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp +include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm +recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm diff --git a/external/cv/dist/sapiens_cv-1.0.0-cp310-cp310-linux_x86_64.whl b/external/cv/dist/sapiens_cv-1.0.0-cp310-cp310-linux_x86_64.whl new file mode 100644 index 0000000000000000000000000000000000000000..eecbde2ca8eee264c31762fc1ee936277a76ca12 --- /dev/null +++ b/external/cv/dist/sapiens_cv-1.0.0-cp310-cp310-linux_x86_64.whl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746f2be13eefdfe43a59d9c415e03a4b0b922e6ce487b76a572a376ae76c9300 +size 30006791 diff --git a/external/cv/mmcv/__init__.py b/external/cv/mmcv/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e7bb7ac46d9bc1fe6c1dd5b6f74044776df805a --- /dev/null +++ b/external/cv/mmcv/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# flake8: noqa +from .arraymisc import * +from .image import * +from .transforms import * +from .version import * +from .video import * +from .visualization import * + +# The following modules are not imported to this level, so mmcv may be used +# without PyTorch. +# - op +# - utils diff --git a/external/cv/mmcv/arraymisc/__init__.py b/external/cv/mmcv/arraymisc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad965788fdc37610d7c56a7d0c3c9bbeed3bc98 --- /dev/null +++ b/external/cv/mmcv/arraymisc/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
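As a quick sanity check of the greedy `nms` helper defined in `detector_utils.py` above: it keeps the highest-scoring box and drops any remaining box whose IoU with a kept box exceeds `thr`. A minimal sketch, assuming the repository root is on the import path and using made-up boxes:

```python
import numpy as np
from detector_utils import nms  # the greedy IoU-suppression helper defined above

dets = np.array([
    [10., 10., 50., 50., 0.9],      # highest score, always kept
    [12., 12., 52., 52., 0.8],      # IoU with the first box ~0.83 > 0.3, suppressed
    [100., 100., 150., 150., 0.7],  # disjoint from the first box, kept
])
keep = nms(dets, thr=0.3)
assert keep == [0, 2]
```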
+ +from .quantization import dequantize, quantize + +__all__ = ['quantize', 'dequantize'] diff --git a/external/cv/mmcv/arraymisc/quantization.py b/external/cv/mmcv/arraymisc/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..06fc55c930754eb43f7e26fb5401afc6106cbb09 --- /dev/null +++ b/external/cv/mmcv/arraymisc/quantization.py @@ -0,0 +1,70 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Union + +import numpy as np + + +def quantize(arr: np.ndarray, + min_val: Union[int, float], + max_val: Union[int, float], + levels: int, + dtype=np.int64) -> tuple: + """Quantize an array of (-inf, inf) to [0, levels-1]. + + Args: + arr (ndarray): Input array. + min_val (int or float): Minimum value to be clipped. + max_val (int or float): Maximum value to be clipped. + levels (int): Quantization levels. + dtype (np.type): The type of the quantized array. + + Returns: + tuple: Quantized array. + """ + if not (isinstance(levels, int) and levels > 1): + raise ValueError( + f'levels must be a positive integer, but got {levels}') + if min_val >= max_val: + raise ValueError( + f'min_val ({min_val}) must be smaller than max_val ({max_val})') + + arr = np.clip(arr, min_val, max_val) - min_val + quantized_arr = np.minimum( + np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1) + + return quantized_arr + + +def dequantize(arr: np.ndarray, + min_val: Union[int, float], + max_val: Union[int, float], + levels: int, + dtype=np.float64) -> tuple: + """Dequantize an array. + + Args: + arr (ndarray): Input array. + min_val (int or float): Minimum value to be clipped. + max_val (int or float): Maximum value to be clipped. + levels (int): Quantization levels. + dtype (np.type): The type of the dequantized array. + + Returns: + tuple: Dequantized array. + """ + if not (isinstance(levels, int) and levels > 1): + raise ValueError( + f'levels must be a positive integer, but got {levels}') + if min_val >= max_val: + raise ValueError( + f'min_val ({min_val}) must be smaller than max_val ({max_val})') + + dequantized_arr = (arr + 0.5).astype(dtype) * (max_val - + min_val) / levels + min_val + + return dequantized_arr diff --git a/external/cv/mmcv/cnn/__init__.py b/external/cv/mmcv/cnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2b24b7783a5f4c3523dd9465eadbd2646dea994a --- /dev/null +++ b/external/cv/mmcv/cnn/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
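The `quantize`/`dequantize` pair in `mmcv/arraymisc/quantization.py` above maps clipped floats to integer bins and back to bin centres, so the round-trip error is bounded by half a bin width. A minimal sketch with arbitrary values:

```python
import numpy as np
from mmcv.arraymisc import quantize, dequantize

arr = np.array([-0.5, 0.0, 0.3, 0.8])
q = quantize(arr, min_val=-1.0, max_val=1.0, levels=256)    # int64 bin indices in [0, 255]
rec = dequantize(q, min_val=-1.0, max_val=1.0, levels=256)  # float64 bin centres

half_bin = (1.0 - (-1.0)) / 256 / 2
assert np.all(np.abs(rec - arr) <= half_bin + 1e-12)
```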
+ +from .alexnet import AlexNet +# yapf: disable +from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, + ConvTranspose2d, ConvTranspose3d, ConvWS2d, + DepthwiseSeparableConvModule, GeneralizedAttention, + HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, + NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish, + build_activation_layer, build_conv_layer, + build_norm_layer, build_padding_layer, build_plugin_layer, + build_upsample_layer, conv_ws_2d, is_norm) +# yapf: enable +from .resnet import ResNet, make_res_layer +from .rfsearch import Conv2dRFSearchOp, RFSearchHook +from .utils import fuse_conv_bn, get_model_complexity_info +from .vgg import VGG, make_vgg_layer + +__all__ = [ + 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', + 'ConvModule', 'build_activation_layer', 'build_conv_layer', + 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', + 'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', + 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention', + 'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', + 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d', + 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn', + 'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook' +] diff --git a/external/cv/mmcv/cnn/alexnet.py b/external/cv/mmcv/cnn/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..caf385e2c1af131bf7cf4cc481eea285d803fd6b --- /dev/null +++ b/external/cv/mmcv/cnn/alexnet.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.runner import load_checkpoint + + +class AlexNet(nn.Module): + """AlexNet backbone. + + Args: + num_classes (int): number of classes for classification. 
+ """ + + def __init__(self, num_classes: int = -1): + super().__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + # use default initializer + pass + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + + x = self.features(x) + if self.num_classes > 0: + x = x.view(x.size(0), 256 * 6 * 6) + x = self.classifier(x) + + return x diff --git a/external/cv/mmcv/cnn/bricks/__init__.py b/external/cv/mmcv/cnn/bricks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e607bb4e65199d9f537709a4f59ff50c0f44ebfe --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
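The classifier head above flattens a 256x6x6 feature map, which corresponds to a 224x224 input. A minimal forward-pass sketch with arbitrary batch and class counts:

```python
import torch
from mmcv.cnn import AlexNet

model = AlexNet(num_classes=10)
model.init_weights()               # no-op without a checkpoint path; pass one to load pretrained weights
x = torch.randn(1, 3, 224, 224)    # 224x224 input yields the expected 256*6*6 feature map
logits = model(x)
assert logits.shape == (1, 10)
```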
+ +from .activation import build_activation_layer +from .context_block import ContextBlock +from .conv import build_conv_layer +from .conv2d_adaptive_padding import Conv2dAdaptivePadding +from .conv_module import ConvModule +from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d +from .depthwise_separable_conv_module import DepthwiseSeparableConvModule +from .drop import Dropout, DropPath +from .generalized_attention import GeneralizedAttention +from .hsigmoid import HSigmoid +from .hswish import HSwish +from .non_local import NonLocal1d, NonLocal2d, NonLocal3d +from .norm import build_norm_layer, is_norm +from .padding import build_padding_layer +from .plugin import build_plugin_layer +from .scale import LayerScale, Scale +from .swish import Swish +from .upsample import build_upsample_layer +from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, + Linear, MaxPool2d, MaxPool3d) + +__all__ = [ + 'ConvModule', 'build_activation_layer', 'build_conv_layer', + 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', + 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', + 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', + 'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d', + 'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding', + 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', + 'Conv3d', 'Dropout', 'DropPath', 'LayerScale' +] diff --git a/external/cv/mmcv/cnn/bricks/activation.py b/external/cv/mmcv/cnn/bricks/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..397c541eb0036b4f9068ee590a9aa426bcdddad3 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/activation.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.registry import MODELS +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION + +for module in [ + nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, + nn.Sigmoid, nn.Tanh +]: + MODELS.register_module(module=module) + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + MODELS.register_module(module=nn.SiLU, name='SiLU') +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Liner Unit.""" + + def __init__(self, inplace=False): + super().__init__() + self.inplace = inplace + + def forward(self, inputs) -> torch.Tensor: + if self.inplace: + return inputs.mul_(torch.sigmoid(inputs)) + else: + return inputs * torch.sigmoid(inputs) + + MODELS.register_module(module=SiLU, name='SiLU') + + +@MODELS.register_module(name='Clip') +@MODELS.register_module() +class Clamp(nn.Module): + """Clamp activation layer. + + This activation function is to clamp the feature map value within + :math:`[min, max]`. More details can be found in ``torch.clamp()``. + + Args: + min (Number | optional): Lower-bound of the range to be clamped to. + Default to -1. + max (Number | optional): Upper-bound of the range to be clamped to. + Default to 1. + """ + + def __init__(self, min: float = -1., max: float = 1.): + super().__init__() + self.min = min + self.max = max + + def forward(self, x) -> torch.Tensor: + """Forward function. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: Clamped tensor. 
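Because `Clamp` is registered under both 'Clamp' and 'Clip', it can be built from a config dict like any other registered activation; a small illustrative sketch with arbitrary bounds:

```python
import torch
from mmcv.cnn import build_activation_layer

relu = build_activation_layer(dict(type='ReLU', inplace=True))
clip = build_activation_layer(dict(type='Clip', min=0.0, max=6.0))  # alias of Clamp

x = torch.tensor([-3.0, 2.0, 9.0])
assert torch.equal(clip(x), torch.tensor([0.0, 2.0, 6.0]))
assert torch.equal(relu(x.clone()), torch.tensor([0.0, 2.0, 9.0]))
```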
+ """ + return torch.clamp(x, min=self.min, max=self.max) + + +class GELU(nn.Module): + r"""Applies the Gaussian Error Linear Units function: + + .. math:: + \text{GELU}(x) = x * \Phi(x) + where :math:`\Phi(x)` is the Cumulative Distribution Function for + Gaussian Distribution. + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.gelu(input) + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.4')): + MODELS.register_module(module=GELU) +else: + MODELS.register_module(module=nn.GELU) + + +def build_activation_layer(cfg: Dict) -> nn.Module: + """Build activation layer. + + Args: + cfg (dict): The activation layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate an activation layer. + + Returns: + nn.Module: Created activation layer. + """ + return MODELS.build(cfg) diff --git a/external/cv/mmcv/cnn/bricks/context_block.py b/external/cv/mmcv/cnn/bricks/context_block.py new file mode 100644 index 0000000000000000000000000000000000000000..364b8c3dfbb3f80689e538a586eb656a9f7e77ba --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/context_block.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Union + +import torch +from mmengine.model import constant_init, kaiming_init +from mmengine.registry import MODELS +from torch import nn + + +def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None: + if isinstance(m, nn.Sequential): + constant_init(m[-1], val=0) + else: + constant_init(m, val=0) + + +@MODELS.register_module() +class ContextBlock(nn.Module): + """ContextBlock module in GCNet. + + See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + (https://arxiv.org/abs/1904.11492) for details. + + Args: + in_channels (int): Channels of the input feature map. + ratio (float): Ratio of channels of transform bottleneck + pooling_type (str): Pooling method for context modeling. + Options are 'att' and 'avg', stand for attention pooling and + average pooling respectively. Default: 'att'. + fusion_types (Sequence[str]): Fusion method for feature fusion, + Options are 'channels_add', 'channel_mul', stand for channelwise + addition and multiplication respectively. 
Default: ('channel_add',) + """ + + _abbr_ = 'context_block' + + def __init__(self, + in_channels: int, + ratio: float, + pooling_type: str = 'att', + fusion_types: tuple = ('channel_add', )): + super().__init__() + assert pooling_type in ['avg', 'att'] + assert isinstance(fusion_types, (list, tuple)) + valid_fusion_types = ['channel_add', 'channel_mul'] + assert all([f in valid_fusion_types for f in fusion_types]) + assert len(fusion_types) > 0, 'at least one fusion should be used' + self.in_channels = in_channels + self.ratio = ratio + self.planes = int(in_channels * ratio) + self.pooling_type = pooling_type + self.fusion_types = fusion_types + if pooling_type == 'att': + self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) + self.softmax = nn.Softmax(dim=2) + else: + self.avg_pool = nn.AdaptiveAvgPool2d(1) + if 'channel_add' in fusion_types: + self.channel_add_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_add_conv = None + if 'channel_mul' in fusion_types: + self.channel_mul_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_mul_conv = None + self.reset_parameters() + + def reset_parameters(self): + if self.pooling_type == 'att': + kaiming_init(self.conv_mask, mode='fan_in') + self.conv_mask.inited = True + + if self.channel_add_conv is not None: + last_zero_init(self.channel_add_conv) + if self.channel_mul_conv is not None: + last_zero_init(self.channel_mul_conv) + + def spatial_pool(self, x: torch.Tensor) -> torch.Tensor: + batch, channel, height, width = x.size() + if self.pooling_type == 'att': + input_x = x + # [N, C, H * W] + input_x = input_x.view(batch, channel, height * width) + # [N, 1, C, H * W] + input_x = input_x.unsqueeze(1) + # [N, 1, H, W] + context_mask = self.conv_mask(x) + # [N, 1, H * W] + context_mask = context_mask.view(batch, 1, height * width) + # [N, 1, H * W] + context_mask = self.softmax(context_mask) + # [N, 1, H * W, 1] + context_mask = context_mask.unsqueeze(-1) + # [N, 1, C, 1] + context = torch.matmul(input_x, context_mask) + # [N, C, 1, 1] + context = context.view(batch, channel, 1, 1) + else: + # [N, C, 1, 1] + context = self.avg_pool(x) + + return context + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # [N, C, 1, 1] + context = self.spatial_pool(x) + + out = x + if self.channel_mul_conv is not None: + # [N, C, 1, 1] + channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) + out = out * channel_mul_term + if self.channel_add_conv is not None: + # [N, C, 1, 1] + channel_add_term = self.channel_add_conv(context) + out = out + channel_add_term + + return out diff --git a/external/cv/mmcv/cnn/bricks/conv.py b/external/cv/mmcv/cnn/bricks/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..7f6da2ece9d77137e02f825633b50c992ff6df5b --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/conv.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
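`ContextBlock` above pools a single global context vector and fuses it back into every spatial position, so the output keeps the input shape. A minimal sketch with arbitrary channel counts:

```python
import torch
from mmcv.cnn import ContextBlock

gc_block = ContextBlock(in_channels=64, ratio=1. / 4,
                        pooling_type='att', fusion_types=('channel_add', ))
x = torch.randn(2, 64, 32, 32)
out = gc_block(x)
assert out.shape == x.shape  # the pooled context is broadcast back onto the feature map
```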
+ +import inspect +from typing import Dict, Optional + +from mmengine.registry import MODELS +from torch import nn + +MODELS.register_module('Conv1d', module=nn.Conv1d) +MODELS.register_module('Conv2d', module=nn.Conv2d) +MODELS.register_module('Conv3d', module=nn.Conv3d) +MODELS.register_module('Conv', module=nn.Conv2d) + + +def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module: + """Build convolution layer. + + Args: + cfg (None or dict): The conv layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an conv layer. + args (argument list): Arguments passed to the `__init__` + method of the corresponding conv layer. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding conv layer. + + Returns: + nn.Module: Created conv layer. + """ + if cfg is None: + cfg_ = dict(type='Conv2d') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if inspect.isclass(layer_type): + return layer_type(*args, **kwargs, **cfg_) # type: ignore + # Switch registry to the target scope. If `conv_layer` cannot be found + # in the registry, fallback to search `conv_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + conv_layer = registry.get(layer_type) + if conv_layer is None: + raise KeyError(f'Cannot find {conv_layer} in registry under scope ' + f'name {registry.scope}') + layer = conv_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/external/cv/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/external/cv/mmcv/cnn/bricks/conv2d_adaptive_padding.py new file mode 100644 index 0000000000000000000000000000000000000000..e6376f4ca558ed721633ec6063ef1349cafdcba4 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/conv2d_adaptive_padding.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Tuple, Union + +import torch +from mmengine.registry import MODELS +from torch import nn +from torch.nn import functional as F + + +@MODELS.register_module() +class Conv2dAdaptivePadding(nn.Conv2d): + """Implementation of 2D convolution in tensorflow with `padding` as "same", + which applies padding to input (if needed) so that input image gets fully + covered by filter and stride you specified. For stride 1, this will ensure + that output image size is same as input. For stride of 2, output dimensions + will be half, for example. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. 
Default: ``True`` + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, + dilation, groups, bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + img_h, img_w = x.size()[-2:] + kernel_h, kernel_w = self.weight.size()[-2:] + stride_h, stride_w = self.stride + output_h = math.ceil(img_h / stride_h) + output_w = math.ceil(img_w / stride_w) + pad_h = ( + max((output_h - 1) * self.stride[0] + + (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) + pad_w = ( + max((output_w - 1) * self.stride[1] + + (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) diff --git a/external/cv/mmcv/cnn/bricks/conv_module.py b/external/cv/mmcv/cnn/bricks/conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..eb2f05361cf2578f4742d1379dc675474816e134 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/conv_module.py @@ -0,0 +1,343 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import warnings +from functools import partial +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.model import constant_init, kaiming_init +from mmengine.registry import MODELS +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm + +from .activation import build_activation_layer +from .conv import build_conv_layer +from .norm import build_norm_layer +from .padding import build_padding_layer + + +def efficient_conv_bn_eval_forward(bn: _BatchNorm, + conv: nn.modules.conv._ConvNd, + x: torch.Tensor): + """ + Implementation based on https://arxiv.org/abs/2305.11624 + "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" + It leverages the associative law between convolution and affine transform, + i.e., normalize (weight conv feature) = (normalize weight) conv feature. + It works for Eval mode of ConvBN blocks during validation, and can be used + for training as well. It reduces memory and computation cost. + + Args: + bn (_BatchNorm): a BatchNorm module. + conv (nn._ConvNd): a conv module + x (torch.Tensor): Input feature map. 
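A quick way to convince yourself of that identity is to compare the fused forward against a plain conv followed by BatchNorm in eval mode; a sketch with arbitrary shapes and randomized running statistics:

```python
import torch
import torch.nn as nn
from mmcv.cnn.bricks.conv_module import efficient_conv_bn_eval_forward

conv = nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=True)
bn = nn.BatchNorm2d(16).eval()       # the fusion assumes frozen running statistics
bn.running_mean.uniform_(-1.0, 1.0)  # give the stats non-trivial values
bn.running_var.uniform_(0.5, 2.0)

x = torch.randn(2, 8, 32, 32)
with torch.no_grad():
    reference = bn(conv(x))
    fused = efficient_conv_bn_eval_forward(bn, conv, x)
torch.testing.assert_close(fused, reference, rtol=1e-4, atol=1e-5)
```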
+ """ + # These lines of code are designed to deal with various cases + # like bn without affine transform, and conv without bias + weight_on_the_fly = conv.weight + if conv.bias is not None: + bias_on_the_fly = conv.bias + else: + bias_on_the_fly = torch.zeros_like(bn.running_var) + + if bn.weight is not None: + bn_weight = bn.weight + else: + bn_weight = torch.ones_like(bn.running_var) + + if bn.bias is not None: + bn_bias = bn.bias + else: + bn_bias = torch.zeros_like(bn.running_var) + + # shape of [C_out, 1, 1, 1] in Conv2d + weight_coeff = torch.rsqrt(bn.running_var + + bn.eps).reshape([-1] + [1] * + (len(conv.weight.shape) - 1)) + # shape of [C_out, 1, 1, 1] in Conv2d + coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff + + # shape of [C_out, C_in, k, k] in Conv2d + weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly + # shape of [C_out] in Conv2d + bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() *\ + (bias_on_the_fly - bn.running_mean) + + return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) + + +@MODELS.register_module() +class ConvModule(nn.Module): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + inplace (bool): Whether to use inplace mode for activation. + Default: True. + with_spectral_norm (bool): Whether use spectral norm in conv module. + Default: False. + padding_mode (str): If the `padding_mode` has not been supported by + current `Conv2d` in PyTorch, we will use our own padding layer + instead. Currently, we support ['zeros', 'circular'] with official + implementation and ['reflect'] with our own implementation. + Default: 'zeros'. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). 
+ Default: ('conv', 'norm', 'act'). + efficient_conv_bn_eval (bool): Whether use efficient conv when the + consecutive bn is in eval mode (either training or testing), as + proposed in https://arxiv.org/abs/2305.11624 . Default: `False`. + """ + + _abbr_ = 'conv_block' + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: Union[bool, str] = 'auto', + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = dict(type='ReLU'), + inplace: bool = True, + with_spectral_norm: bool = False, + padding_mode: str = 'zeros', + order: tuple = ('conv', 'norm', 'act'), + efficient_conv_bn_eval: bool = False): + super().__init__() + assert conv_cfg is None or isinstance(conv_cfg, dict) + assert norm_cfg is None or isinstance(norm_cfg, dict) + assert act_cfg is None or isinstance(act_cfg, dict) + official_padding_mode = ['zeros', 'circular'] + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.inplace = inplace + self.with_spectral_norm = with_spectral_norm + self.with_explicit_padding = padding_mode not in official_padding_mode + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == {'conv', 'norm', 'act'} + + self.with_norm = norm_cfg is not None + self.with_activation = act_cfg is not None + # if the conv layer is before a norm layer, bias is unnecessary. + if bias == 'auto': + bias = not self.with_norm + self.with_bias = bias + + if self.with_explicit_padding: + pad_cfg = dict(type=padding_mode) + self.padding_layer = build_padding_layer(pad_cfg, padding) + + # reset padding to 0 for conv module + conv_padding = 0 if self.with_explicit_padding else padding + # build convolution layer + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=conv_padding, + dilation=dilation, + groups=groups, + bias=bias) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + if self.with_spectral_norm: + self.conv = nn.utils.spectral_norm(self.conv) + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index('norm') > order.index('conv'): + norm_channels = out_channels + else: + norm_channels = in_channels + self.norm_name, norm = build_norm_layer( + norm_cfg, norm_channels) # type: ignore + self.add_module(self.norm_name, norm) + if self.with_bias: + if isinstance(norm, (_BatchNorm, _InstanceNorm)): + warnings.warn( + 'Unnecessary conv bias before batch/instance norm') + else: + self.norm_name = None # type: ignore + + self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) + + # build activation layer + if self.with_activation: + act_cfg_ = act_cfg.copy() # type: ignore + # nn.Tanh has no 'inplace' argument + if act_cfg_['type'] not in [ + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU' + ]: + act_cfg_.setdefault('inplace', inplace) + self.activate = build_activation_layer(act_cfg_) + + # Use msra init by 
default + self.init_weights() + + @property + def norm(self): + if self.norm_name: + return getattr(self, self.norm_name) + else: + return None + + def init_weights(self): + # 1. It is mainly for customized conv layers with their own + # initialization manners by calling their own ``init_weights()``, + # and we do not want ConvModule to override the initialization. + # 2. For customized conv layers without their own initialization + # manners (that is, they don't have their own ``init_weights()``) + # and PyTorch's conv layers, they will be initialized by + # this method with default ``kaiming_init``. + # Note: For PyTorch's conv layers, they will be overwritten by our + # initialization implementation using default ``kaiming_init``. + if not hasattr(self.conv, 'init_weights'): + if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': + nonlinearity = 'leaky_relu' + a = self.act_cfg.get('negative_slope', 0.01) + else: + nonlinearity = 'relu' + a = 0 + kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) + if self.with_norm: + constant_init(self.norm, 1, bias=0) + + def forward(self, + x: torch.Tensor, + activate: bool = True, + norm: bool = True) -> torch.Tensor: + layer_index = 0 + while layer_index < len(self.order): + layer = self.order[layer_index] + if layer == 'conv': + if self.with_explicit_padding: + x = self.padding_layer(x) + # if the next operation is norm and we have a norm layer in + # eval mode and we have enabled `efficient_conv_bn_eval` for + # the conv operator, then activate the optimized forward and + # skip the next norm operator since it has been fused + if layer_index + 1 < len(self.order) and \ + self.order[layer_index + 1] == 'norm' and norm and \ + self.with_norm and not self.norm.training and \ + self.efficient_conv_bn_eval_forward is not None: + self.conv.forward = partial( + self.efficient_conv_bn_eval_forward, self.norm, + self.conv) + layer_index += 1 + x = self.conv(x) + del self.conv.forward + else: + x = self.conv(x) + elif layer == 'norm' and norm and self.with_norm: + x = self.norm(x) + elif layer == 'act' and activate and self.with_activation: + x = self.activate(x) + layer_index += 1 + return x + + def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True): + # efficient_conv_bn_eval works for conv + bn + # with `track_running_stats` option + if efficient_conv_bn_eval and self.norm \ + and isinstance(self.norm, _BatchNorm) \ + and self.norm.track_running_stats: + self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward # noqa: E501 + else: + self.efficient_conv_bn_eval_forward = None # type: ignore + + @staticmethod + def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd, + bn: torch.nn.modules.batchnorm._BatchNorm, + efficient_conv_bn_eval=True) -> 'ConvModule': + """Create a ConvModule from a conv and a bn module.""" + self = ConvModule.__new__(ConvModule) + super(ConvModule, self).__init__() + + self.conv_cfg = None + self.norm_cfg = None + self.act_cfg = None + self.inplace = False + self.with_spectral_norm = False + self.with_explicit_padding = False + self.order = ('conv', 'norm', 'act') + + self.with_norm = True + self.with_activation = False + self.with_bias = conv.bias is not None + + # build convolution layer + self.conv = conv + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = self.conv.padding + 
self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + # build normalization layers + self.norm_name, norm = 'bn', bn + self.add_module(self.norm_name, norm) + + self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) + + return self diff --git a/external/cv/mmcv/cnn/bricks/conv_ws.py b/external/cv/mmcv/cnn/bricks/conv_ws.py new file mode 100644 index 0000000000000000000000000000000000000000..901cd8002a95bcbb12f9d2723fd34a341017ae91 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/conv_ws.py @@ -0,0 +1,158 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.registry import MODELS + + +def conv_ws_2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + eps: float = 1e-5) -> torch.Tensor: + c_in = weight.size(0) + weight_flat = weight.view(c_in, -1) + mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) + std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) + weight = (weight - mean) / (std + eps) + return F.conv2d(input, weight, bias, stride, padding, dilation, groups) + + +@MODELS.register_module('ConvWS') +class ConvWS2d(nn.Conv2d): + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True, + eps: float = 1e-5): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups, self.eps) + + +@MODELS.register_module(name='ConvAWS') +class ConvAWS2d(nn.Conv2d): + """AWS (Adaptive Weight Standardization) + + This is a variant of Weight Standardization + (https://arxiv.org/pdf/1903.10520.pdf) + It is used in DetectoRS to avoid NaN + (https://arxiv.org/pdf/2006.02334.pdf) + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the conv kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If set True, adds a learnable bias to the + output. 
Default: True + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.register_buffer('weight_gamma', + torch.ones(self.out_channels, 1, 1, 1)) + self.register_buffer('weight_beta', + torch.zeros(self.out_channels, 1, 1, 1)) + + def _get_weight(self, weight: torch.Tensor) -> torch.Tensor: + weight_flat = weight.view(weight.size(0), -1) + mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) + std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) + weight = (weight - mean) / std + weight = self.weight_gamma * weight + self.weight_beta + return weight + + def forward(self, x: torch.Tensor) -> torch.Tensor: + weight = self._get_weight(self.weight) + return F.conv2d(x, weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) + + def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str, + local_metadata: Dict, strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str]) -> None: + """Override default load function. + + AWS overrides the function _load_from_state_dict to recover + weight_gamma and weight_beta if they are missing. If weight_gamma and + weight_beta are found in the checkpoint, this function will return + after super()._load_from_state_dict. Otherwise, it will compute the + mean and std of the pretrained weights and store them in weight_beta + and weight_gamma. + """ + + self.weight_gamma.data.fill_(-1) + local_missing_keys: List = [] + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, local_missing_keys, + unexpected_keys, error_msgs) + if self.weight_gamma.data.mean() > 0: + for k in local_missing_keys: + missing_keys.append(k) + return + weight = self.weight.data + weight_flat = weight.view(weight.size(0), -1) + mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) + std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) + self.weight_beta.data.copy_(mean) + self.weight_gamma.data.copy_(std) + missing_gamma_beta = [ + k for k in local_missing_keys + if k.endswith('weight_gamma') or k.endswith('weight_beta') + ] + for k in missing_gamma_beta: + local_missing_keys.remove(k) + for k in local_missing_keys: + missing_keys.append(k) diff --git a/external/cv/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/external/cv/mmcv/cnn/bricks/depthwise_separable_conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..d66c8951c32f9c40f50ceb3d394aaf0fd4252150 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/depthwise_separable_conv_module.py @@ -0,0 +1,104 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from .conv_module import ConvModule + + +class DepthwiseSeparableConvModule(nn.Module): + """Depthwise separable convolution module. + + See https://arxiv.org/pdf/1704.04861.pdf for details. + + This module can replace a ConvModule with the conv block replaced by two + conv block: depthwise conv block and pointwise conv block. 
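The parameter saving from this split is easy to see by counting weights against a plain `ConvModule`; a rough sketch with arbitrary channel sizes (both modules are exported from `mmcv.cnn`):

```python
import torch
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule

plain = ConvModule(64, 128, 3, padding=1, norm_cfg=dict(type='BN'))
dw_sep = DepthwiseSeparableConvModule(64, 128, 3, padding=1, norm_cfg=dict(type='BN'))

n_plain = sum(p.numel() for p in plain.parameters())
n_dw_sep = sum(p.numel() for p in dw_sep.parameters())
print(n_plain, n_dw_sep)  # the depthwise-separable variant is roughly 8x smaller here

x = torch.randn(1, 64, 56, 56)
assert plain(x).shape == dw_sep(x).shape == (1, 128, 56, 56)
```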
The depthwise + conv block contains depthwise-conv/norm/activation layers. The pointwise + conv block contains pointwise-conv/norm/activation layers. It should be + noted that there will be norm/activation layer in the depthwise conv block + if `norm_cfg` and `act_cfg` are specified. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. Default: 1. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. Default: 0. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. Default: 1. + norm_cfg (dict): Default norm config for both depthwise ConvModule and + pointwise ConvModule. Default: None. + act_cfg (dict): Default activation config for both depthwise ConvModule + and pointwise ConvModule. Default: dict(type='ReLU'). + dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is + 'default', it will be the same as `norm_cfg`. Default: 'default'. + dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is + 'default', it will be the same as `act_cfg`. Default: 'default'. + pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is + 'default', it will be the same as `norm_cfg`. Default: 'default'. + pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is + 'default', it will be the same as `act_cfg`. Default: 'default'. + kwargs (optional): Other shared arguments for depthwise and pointwise + ConvModule. See ConvModule for ref. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + norm_cfg: Optional[Dict] = None, + act_cfg: Dict = dict(type='ReLU'), + dw_norm_cfg: Union[Dict, str] = 'default', + dw_act_cfg: Union[Dict, str] = 'default', + pw_norm_cfg: Union[Dict, str] = 'default', + pw_act_cfg: Union[Dict, str] = 'default', + **kwargs): + super().__init__() + assert 'groups' not in kwargs, 'groups should not be specified' + + # if norm/activation config of depthwise/pointwise ConvModule is not + # specified, use default config. 
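+        # e.g. with norm_cfg=dict(type='BN') and the dw_/pw_ overrides left as 'default',
+        # both branches get their own BatchNorm; passing dw_norm_cfg=None instead
+        # disables the norm (and re-enables the conv bias) in the depthwise branch only.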
+ dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 + dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg + pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 + pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg + + # depthwise convolution + self.depthwise_conv = ConvModule( + in_channels, + in_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + norm_cfg=dw_norm_cfg, # type: ignore + act_cfg=dw_act_cfg, # type: ignore + **kwargs) + + self.pointwise_conv = ConvModule( + in_channels, + out_channels, + 1, + norm_cfg=pw_norm_cfg, # type: ignore + act_cfg=pw_act_cfg, # type: ignore + **kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x diff --git a/external/cv/mmcv/cnn/bricks/drop.py b/external/cv/mmcv/cnn/bricks/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..4e2853a17994f62c9e4db1e2704673c24b3cf670 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/drop.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, Optional + +import torch +import torch.nn as nn +from mmengine.registry import MODELS + + +def drop_path(x: torch.Tensor, + drop_prob: float = 0., + training: bool = False) -> torch.Tensor: + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + """ + if not training: + return x + keep_prob = 1 - drop_prob + # handle tensors with different dimensions, not just 4D tensors. + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + output = x.div(keep_prob) * random_tensor.floor() + return output + + +@MODELS.register_module() +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + + Args: + drop_prob (float): Probability of the path to be zeroed. Default: 0.1 + """ + + def __init__(self, drop_prob: float = 0.1): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + +@MODELS.register_module() +class Dropout(nn.Dropout): + """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of + ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with + ``DropPath`` + + Args: + drop_prob (float): Probability of the elements to be + zeroed. Default: 0.5. + inplace (bool): Do the operation inplace or not. Default: False. 
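The `drop_path` helper above zeroes entire samples and rescales the survivors by `1 / keep_prob`, so the expectation is preserved; at inference time `DropPath` is the identity. A small sketch with an arbitrary drop probability:

```python
import torch
from mmcv.cnn.bricks.drop import DropPath, drop_path

x = torch.ones(1000, 16)
y = drop_path(x, drop_prob=0.2, training=True)

survivors = y[:, 0] != 0                      # each row is either all zeros or all 1/0.8
assert torch.allclose(y[survivors], torch.full_like(y[survivors], 1.25))
print(survivors.float().mean())               # roughly 0.8

layer = DropPath(drop_prob=0.2).eval()
assert torch.equal(layer(x), x)               # identity at inference time
```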
+ """ + + def __init__(self, drop_prob: float = 0.5, inplace: bool = False): + super().__init__(p=drop_prob, inplace=inplace) + + +def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any: + """Builder for drop out layers.""" + return MODELS.build(cfg, default_args=default_args) diff --git a/external/cv/mmcv/cnn/bricks/generalized_attention.py b/external/cv/mmcv/cnn/bricks/generalized_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..4e1b466afe31f577723316214b39df6114882195 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/generalized_attention.py @@ -0,0 +1,416 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import kaiming_init +from mmengine.registry import MODELS + + +@MODELS.register_module() +class GeneralizedAttention(nn.Module): + """GeneralizedAttention module. + + See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' + (https://arxiv.org/abs/1904.05873) for details. + + Args: + in_channels (int): Channels of the input feature map. + spatial_range (int): The spatial range. -1 indicates no spatial range + constraint. Default: -1. + num_heads (int): The head number of empirical_attention module. + Default: 9. + position_embedding_dim (int): The position embedding dimension. + Default: -1. + position_magnitude (int): A multiplier acting on coord difference. + Default: 1. + kv_stride (int): The feature stride acting on key/value feature map. + Default: 2. + q_stride (int): The feature stride acting on query feature map. + Default: 1. + attention_type (str): A binary indicator string for indicating which + items in generalized empirical_attention module are used. + Default: '1111'. + + - '1000' indicates 'query and key content' (appr - appr) item, + - '0100' indicates 'query content and relative position' + (appr - position) item, + - '0010' indicates 'key content only' (bias - appr) item, + - '0001' indicates 'relative position only' (bias - position) item. 
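For instance, the saliency-only variant ('0010') keeps just the key-content term and is the cheapest configuration; a shape-level sketch, with the head count chosen to divide the channels evenly:

```python
import torch
from mmcv.cnn import GeneralizedAttention

attn = GeneralizedAttention(in_channels=256, num_heads=8, attention_type='0010')
x = torch.randn(1, 256, 14, 14)
out = attn(x)                 # gamma-scaled attention output added to the residual input
assert out.shape == x.shape
```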
+ """ + + _abbr_ = 'gen_attention_block' + + def __init__(self, + in_channels: int, + spatial_range: int = -1, + num_heads: int = 9, + position_embedding_dim: int = -1, + position_magnitude: int = 1, + kv_stride: int = 2, + q_stride: int = 1, + attention_type: str = '1111'): + + super().__init__() + + # hard range means local range for non-local operation + self.position_embedding_dim = ( + position_embedding_dim + if position_embedding_dim > 0 else in_channels) + + self.position_magnitude = position_magnitude + self.num_heads = num_heads + self.in_channels = in_channels + self.spatial_range = spatial_range + self.kv_stride = kv_stride + self.q_stride = q_stride + self.attention_type = [bool(int(_)) for _ in attention_type] + self.qk_embed_dim = in_channels // num_heads + out_c = self.qk_embed_dim * num_heads + + if self.attention_type[0] or self.attention_type[1]: + self.query_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_c, + kernel_size=1, + bias=False) + self.query_conv.kaiming_init = True + + if self.attention_type[0] or self.attention_type[2]: + self.key_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_c, + kernel_size=1, + bias=False) + self.key_conv.kaiming_init = True + + self.v_dim = in_channels // num_heads + self.value_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=self.v_dim * num_heads, + kernel_size=1, + bias=False) + self.value_conv.kaiming_init = True + + if self.attention_type[1] or self.attention_type[3]: + self.appr_geom_fc_x = nn.Linear( + self.position_embedding_dim // 2, out_c, bias=False) + self.appr_geom_fc_x.kaiming_init = True + + self.appr_geom_fc_y = nn.Linear( + self.position_embedding_dim // 2, out_c, bias=False) + self.appr_geom_fc_y.kaiming_init = True + + if self.attention_type[2]: + stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) + appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv + self.appr_bias = nn.Parameter(appr_bias_value) + + if self.attention_type[3]: + stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) + geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv + self.geom_bias = nn.Parameter(geom_bias_value) + + self.proj_conv = nn.Conv2d( + in_channels=self.v_dim * num_heads, + out_channels=in_channels, + kernel_size=1, + bias=True) + self.proj_conv.kaiming_init = True + self.gamma = nn.Parameter(torch.zeros(1)) + + if self.spatial_range >= 0: + # only works when non local is after 3*3 conv + if in_channels == 256: + max_len = 84 + elif in_channels == 512: + max_len = 42 + + max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) + local_constraint_map = np.ones( + (max_len, max_len, max_len_kv, max_len_kv), dtype=int) + for iy in range(max_len): + for ix in range(max_len): + local_constraint_map[ + iy, ix, + max((iy - self.spatial_range) // + self.kv_stride, 0):min((iy + self.spatial_range + + 1) // self.kv_stride + + 1, max_len), + max((ix - self.spatial_range) // + self.kv_stride, 0):min((ix + self.spatial_range + + 1) // self.kv_stride + + 1, max_len)] = 0 + + self.local_constraint_map = nn.Parameter( + torch.from_numpy(local_constraint_map).byte(), + requires_grad=False) + + if self.q_stride > 1: + self.q_downsample = nn.AvgPool2d( + kernel_size=1, stride=self.q_stride) + else: + self.q_downsample = None + + if self.kv_stride > 1: + self.kv_downsample = nn.AvgPool2d( + kernel_size=1, stride=self.kv_stride) + else: + self.kv_downsample = None + + self.init_weights() + + def get_position_embedding(self, + h, + w, + h_kv, + w_kv, + q_stride, + kv_stride, + device, + dtype, + feat_dim, + 
wave_length=1000): + # the default type of Tensor is float32, leading to type mismatch + # in fp16 mode. Cast it to support fp16 mode. + h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype) + h_idxs = h_idxs.view((h, 1)) * q_stride + + w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype) + w_idxs = w_idxs.view((w, 1)) * q_stride + + h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to( + device=device, dtype=dtype) + h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride + + w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to( + device=device, dtype=dtype) + w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride + + # (h, h_kv, 1) + h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) + h_diff *= self.position_magnitude + + # (w, w_kv, 1) + w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) + w_diff *= self.position_magnitude + + feat_range = torch.arange(0, feat_dim / 4).to( + device=device, dtype=dtype) + + dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype) + dim_mat = dim_mat**((4. / feat_dim) * feat_range) + dim_mat = dim_mat.view((1, 1, -1)) + + embedding_x = torch.cat( + ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) + + embedding_y = torch.cat( + ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) + + return embedding_x, embedding_y + + def forward(self, x_input: torch.Tensor) -> torch.Tensor: + num_heads = self.num_heads + + # use empirical_attention + if self.q_downsample is not None: + x_q = self.q_downsample(x_input) + else: + x_q = x_input + n, _, h, w = x_q.shape + + if self.kv_downsample is not None: + x_kv = self.kv_downsample(x_input) + else: + x_kv = x_input + _, _, h_kv, w_kv = x_kv.shape + + if self.attention_type[0] or self.attention_type[1]: + proj_query = self.query_conv(x_q).view( + (n, num_heads, self.qk_embed_dim, h * w)) + proj_query = proj_query.permute(0, 1, 3, 2) + + if self.attention_type[0] or self.attention_type[2]: + proj_key = self.key_conv(x_kv).view( + (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) + + if self.attention_type[1] or self.attention_type[3]: + position_embed_x, position_embed_y = self.get_position_embedding( + h, w, h_kv, w_kv, self.q_stride, self.kv_stride, + x_input.device, x_input.dtype, self.position_embedding_dim) + # (n, num_heads, w, w_kv, dim) + position_feat_x = self.appr_geom_fc_x(position_embed_x).\ + view(1, w, w_kv, num_heads, self.qk_embed_dim).\ + permute(0, 3, 1, 2, 4).\ + repeat(n, 1, 1, 1, 1) + + # (n, num_heads, h, h_kv, dim) + position_feat_y = self.appr_geom_fc_y(position_embed_y).\ + view(1, h, h_kv, num_heads, self.qk_embed_dim).\ + permute(0, 3, 1, 2, 4).\ + repeat(n, 1, 1, 1, 1) + + position_feat_x /= math.sqrt(2) + position_feat_y /= math.sqrt(2) + + # accelerate for saliency only + if (np.sum(self.attention_type) == 1) and self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim).\ + repeat(n, 1, 1, 1) + + energy = torch.matmul(appr_bias, proj_key).\ + view(n, num_heads, 1, h_kv * w_kv) + + h = 1 + w = 1 + else: + # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for + if not self.attention_type[0]: + energy = torch.zeros( + n, + num_heads, + h, + w, + h_kv, + w_kv, + dtype=x_input.dtype, + device=x_input.device) + + # attention_type[0]: appr - appr + # attention_type[1]: appr - position + # attention_type[2]: bias - appr + # attention_type[3]: bias - position + if self.attention_type[0] or self.attention_type[2]: + if self.attention_type[0] and self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, 
num_heads, 1, self.qk_embed_dim) + energy = torch.matmul(proj_query + appr_bias, proj_key).\ + view(n, num_heads, h, w, h_kv, w_kv) + + elif self.attention_type[0]: + energy = torch.matmul(proj_query, proj_key).\ + view(n, num_heads, h, w, h_kv, w_kv) + + elif self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim).\ + repeat(n, 1, 1, 1) + + energy += torch.matmul(appr_bias, proj_key).\ + view(n, num_heads, 1, 1, h_kv, w_kv) + + if self.attention_type[1] or self.attention_type[3]: + if self.attention_type[1] and self.attention_type[3]: + geom_bias = self.geom_bias.\ + view(1, num_heads, 1, self.qk_embed_dim) + + proj_query_reshape = (proj_query + geom_bias).\ + view(n, num_heads, h, w, self.qk_embed_dim) + + energy_x = torch.matmul( + proj_query_reshape.permute(0, 1, 3, 2, 4), + position_feat_x.permute(0, 1, 2, 4, 3)) + energy_x = energy_x.\ + permute(0, 1, 3, 2, 4).unsqueeze(4) + + energy_y = torch.matmul( + proj_query_reshape, + position_feat_y.permute(0, 1, 2, 4, 3)) + energy_y = energy_y.unsqueeze(5) + + energy += energy_x + energy_y + + elif self.attention_type[1]: + proj_query_reshape = proj_query.\ + view(n, num_heads, h, w, self.qk_embed_dim) + proj_query_reshape = proj_query_reshape.\ + permute(0, 1, 3, 2, 4) + position_feat_x_reshape = position_feat_x.\ + permute(0, 1, 2, 4, 3) + position_feat_y_reshape = position_feat_y.\ + permute(0, 1, 2, 4, 3) + + energy_x = torch.matmul(proj_query_reshape, + position_feat_x_reshape) + energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) + + energy_y = torch.matmul(proj_query_reshape, + position_feat_y_reshape) + energy_y = energy_y.unsqueeze(5) + + energy += energy_x + energy_y + + elif self.attention_type[3]: + geom_bias = self.geom_bias.\ + view(1, num_heads, self.qk_embed_dim, 1).\ + repeat(n, 1, 1, 1) + + position_feat_x_reshape = position_feat_x.\ + view(n, num_heads, w * w_kv, self.qk_embed_dim) + + position_feat_y_reshape = position_feat_y.\ + view(n, num_heads, h * h_kv, self.qk_embed_dim) + + energy_x = torch.matmul(position_feat_x_reshape, geom_bias) + energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) + + energy_y = torch.matmul(position_feat_y_reshape, geom_bias) + energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) + + energy += energy_x + energy_y + + energy = energy.view(n, num_heads, h * w, h_kv * w_kv) + + if self.spatial_range >= 0: + cur_local_constraint_map = \ + self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ + contiguous().\ + view(1, 1, h*w, h_kv*w_kv) + + energy = energy.masked_fill_(cur_local_constraint_map, + float('-inf')) + + attention = F.softmax(energy, 3) + + proj_value = self.value_conv(x_kv) + proj_value_reshape = proj_value.\ + view((n, num_heads, self.v_dim, h_kv * w_kv)).\ + permute(0, 1, 3, 2) + + out = torch.matmul(attention, proj_value_reshape).\ + permute(0, 1, 3, 2).\ + contiguous().\ + view(n, self.v_dim * self.num_heads, h, w) + + out = self.proj_conv(out) + + # output is downsampled, upsample back to input size + if self.q_downsample is not None: + out = F.interpolate( + out, + size=x_input.shape[2:], + mode='bilinear', + align_corners=False) + + out = self.gamma * out + x_input + return out + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'kaiming_init') and m.kaiming_init: + kaiming_init( + m, + mode='fan_in', + nonlinearity='leaky_relu', + bias=0, + distribution='uniform', + a=1) diff --git a/external/cv/mmcv/cnn/bricks/hsigmoid.py b/external/cv/mmcv/cnn/bricks/hsigmoid.py new file mode 100644 index 
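A minimal forward-pass sketch for the `GeneralizedAttention` block defined above. Shapes and hyper-parameters are illustrative, and the import path assumes the vendored `external/cv` tree installs as `mmcv`.

```python
import torch
from mmcv.cnn.bricks.generalized_attention import GeneralizedAttention

# '1111' enables all four attention terms; 8 heads over 256 channels gives
# 32-dimensional query/key embeddings per head.
attn = GeneralizedAttention(in_channels=256, num_heads=8, attention_type='1111')
feat = torch.rand(2, 256, 32, 32)
out = attn(feat)
# gamma is zero-initialized, so the block starts as an identity residual.
assert out.shape == feat.shape
```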
0000000000000000000000000000000000000000..2c371cd460a7ca9801dd0b3bfb7968c9e92978f3 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/hsigmoid.py @@ -0,0 +1,55 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import warnings + +import torch +import torch.nn as nn +from mmengine.registry import MODELS + + +@MODELS.register_module() +class HSigmoid(nn.Module): + """Hard Sigmoid Module. Apply the hard sigmoid function: + Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) + Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1) + + Note: + In MMCV v1.4.4, we modified the default value of args to align with + PyTorch official. + + Args: + bias (float): Bias of the input feature map. Default: 3.0. + divisor (float): Divisor of the input feature map. Default: 6.0. + min_value (float): Lower bound value. Default: 0.0. + max_value (float): Upper bound value. Default: 1.0. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + bias: float = 3.0, + divisor: float = 6.0, + min_value: float = 0.0, + max_value: float = 1.0): + super().__init__() + warnings.warn( + 'In MMCV v1.4.4, we modified the default value of args to align ' + 'with PyTorch official. Previous Implementation: ' + 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). ' + 'Current Implementation: ' + 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).') + self.bias = bias + self.divisor = divisor + assert self.divisor != 0 + self.min_value = min_value + self.max_value = max_value + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = (x + self.bias) / self.divisor + + return x.clamp_(self.min_value, self.max_value) diff --git a/external/cv/mmcv/cnn/bricks/hswish.py b/external/cv/mmcv/cnn/bricks/hswish.py new file mode 100644 index 0000000000000000000000000000000000000000..d06d7bcb2aee2f1b8a275e83175bf5edd8a24acb --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/hswish.py @@ -0,0 +1,44 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from mmengine.registry import MODELS +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION + + +class HSwish(nn.Module): + """Hard Swish Module. + + This module applies the hard swish function: + + .. math:: + Hswish(x) = x * ReLU6(x + 3) / 6 + + Args: + inplace (bool): can optionally do the operation in-place. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, inplace: bool = False): + super().__init__() + self.act = nn.ReLU6(inplace) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * self.act(x + 3) / 6 + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.7')): + # Hardswish is not supported when PyTorch version < 1.6. + # And Hardswish in PyTorch 1.6 does not support inplace. + MODELS.register_module(module=HSwish) +else: + MODELS.register_module(module=nn.Hardswish, name='HSwish') diff --git a/external/cv/mmcv/cnn/bricks/non_local.py b/external/cv/mmcv/cnn/bricks/non_local.py new file mode 100644 index 0000000000000000000000000000000000000000..4828f3097bfa1f7ff230a8a60f26c97ff08935d3 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/non_local.py @@ -0,0 +1,313 @@ +# Copyright (c) Meta Platforms, Inc. 
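Quick numerical check of the two hard activations above (illustrative; the direct module imports assume the vendored tree installs as `mmcv`). Note that instantiating `HSigmoid` emits the warning shown in its constructor.

```python
import torch
from mmcv.cnn.bricks.hsigmoid import HSigmoid
from mmcv.cnn.bricks.hswish import HSwish

x = torch.tensor([-4.0, 0.0, 4.0])
print(HSigmoid()(x))   # min(max((x + 3) / 6, 0), 1) -> tensor([0.0000, 0.5000, 1.0000])
print(HSwish()(x))     # x * relu6(x + 3) / 6       -> tensor([-0., 0., 4.])
```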
and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from abc import ABCMeta +from typing import Dict, Optional + +import torch +import torch.nn as nn +from mmengine.model import constant_init, normal_init +from mmengine.registry import MODELS + +from .conv_module import ConvModule + + +class _NonLocalNd(nn.Module, metaclass=ABCMeta): + """Basic Non-local module. + + This module is proposed in + "Non-local Neural Networks" + Paper reference: https://arxiv.org/abs/1711.07971 + Code reference: https://github.com/AlexHex7/Non-local_pytorch + + Args: + in_channels (int): Channels of the input feature map. + reduction (int): Channel reduction ratio. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. + Default: True. + conv_cfg (None | dict): The config dict for convolution layers. + If not specified, it will use `nn.Conv2d` for convolution layers. + Default: None. + norm_cfg (None | dict): The config dict for normalization layers. + Default: None. (This parameter is only applicable to conv_out.) + mode (str): Options are `gaussian`, `concatenation`, + `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. + """ + + def __init__(self, + in_channels: int, + reduction: int = 2, + use_scale: bool = True, + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + mode: str = 'embedded_gaussian', + **kwargs): + super().__init__() + self.in_channels = in_channels + self.reduction = reduction + self.use_scale = use_scale + self.inter_channels = max(in_channels // reduction, 1) + self.mode = mode + + if mode not in [ + 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' + ]: + raise ValueError("Mode should be in 'gaussian', 'concatenation', " + f"'embedded_gaussian' or 'dot_product', but got " + f'{mode} instead.') + + # g, theta, phi are defaulted as `nn.ConvNd`. + # Here we use ConvModule for potential usage. 
+ self.g = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) # type: ignore + self.conv_out = ConvModule( + self.inter_channels, + self.in_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if self.mode != 'gaussian': + self.theta = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + self.phi = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + + if self.mode == 'concatenation': + self.concat_project = ConvModule( + self.inter_channels * 2, + 1, + kernel_size=1, + stride=1, + padding=0, + bias=False, + act_cfg=dict(type='ReLU')) + + self.init_weights(**kwargs) + + def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None: + if self.mode != 'gaussian': + for m in [self.g, self.theta, self.phi]: + normal_init(m.conv, std=std) + else: + normal_init(self.g.conv, std=std) + if zeros_init: + if self.conv_out.norm_cfg is None: + constant_init(self.conv_out.conv, 0) + else: + constant_init(self.conv_out.norm, 0) + else: + if self.conv_out.norm_cfg is None: + normal_init(self.conv_out.conv, std=std) + else: + normal_init(self.conv_out.norm, std=std) + + def gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def embedded_gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + if self.use_scale: + # theta_x.shape[-1] is `self.inter_channels` + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def dot_product(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def concatenation(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + h = theta_x.size(2) + w = phi_x.size(3) + theta_x = theta_x.repeat(1, 1, 1, w) + phi_x = phi_x.repeat(1, 1, h, 1) + + concat_feature = torch.cat([theta_x, phi_x], dim=1) + pairwise_weight = self.concat_project(concat_feature) + n, _, h, w = pairwise_weight.size() + pairwise_weight = pairwise_weight.view(n, h, w) + pairwise_weight /= pairwise_weight.shape[-1] + + return pairwise_weight + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Assume `reduction = 1`, then `inter_channels = C` + # or `inter_channels = C` when `mode="gaussian"` + + # NonLocal1d x: [N, C, H] + # NonLocal2d x: [N, C, H, W] + # NonLocal3d x: [N, C, T, H, W] + n = x.size(0) + + # NonLocal1d g_x: [N, H, C] + # NonLocal2d g_x: [N, HxW, C] + # NonLocal3d g_x: [N, TxHxW, C] + g_x = self.g(x).view(n, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # NonLocal1d theta_x: 
[N, H, C], phi_x: [N, C, H] + # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW] + # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW] + if self.mode == 'gaussian': + theta_x = x.view(n, self.in_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + if self.sub_sample: + phi_x = self.phi(x).view(n, self.in_channels, -1) + else: + phi_x = x.view(n, self.in_channels, -1) + elif self.mode == 'concatenation': + theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) + phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) + else: + theta_x = self.theta(x).view(n, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + phi_x = self.phi(x).view(n, self.inter_channels, -1) + + pairwise_func = getattr(self, self.mode) + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = pairwise_func(theta_x, phi_x) + + # NonLocal1d y: [N, H, C] + # NonLocal2d y: [N, HxW, C] + # NonLocal3d y: [N, TxHxW, C] + y = torch.matmul(pairwise_weight, g_x) + # NonLocal1d y: [N, C, H] + # NonLocal2d y: [N, C, H, W] + # NonLocal3d y: [N, C, T, H, W] + y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, + *x.size()[2:]) + + output = x + self.conv_out(y) + + return output + + +class NonLocal1d(_NonLocalNd): + """1D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv1d'). + """ + + def __init__(self, + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv1d'), + **kwargs): + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool1d(kernel_size=2) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + +@MODELS.register_module() +class NonLocal2d(_NonLocalNd): + """2D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv2d'). + """ + + _abbr_ = 'nonlocal_block' + + def __init__(self, + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv2d'), + **kwargs): + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + +class NonLocal3d(_NonLocalNd): + """3D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv3d'). 
+ """ + + def __init__(self, + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv3d'), + **kwargs): + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer diff --git a/external/cv/mmcv/cnn/bricks/norm.py b/external/cv/mmcv/cnn/bricks/norm.py new file mode 100644 index 0000000000000000000000000000000000000000..a67f3ffa7cebe9cd485a57da9a6fca43ef6d3540 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/norm.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import inspect +from typing import Dict, Tuple, Union + +import torch.nn as nn +from mmengine.registry import MODELS +from mmengine.utils import is_tuple_of +from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm, _BatchNorm, + _InstanceNorm) + +MODELS.register_module('BN', module=nn.BatchNorm2d) +MODELS.register_module('BN1d', module=nn.BatchNorm1d) +MODELS.register_module('BN2d', module=nn.BatchNorm2d) +MODELS.register_module('BN3d', module=nn.BatchNorm3d) +MODELS.register_module('SyncBN', module=SyncBatchNorm) +MODELS.register_module('GN', module=nn.GroupNorm) +MODELS.register_module('LN', module=nn.LayerNorm) +MODELS.register_module('IN', module=nn.InstanceNorm2d) +MODELS.register_module('IN1d', module=nn.InstanceNorm1d) +MODELS.register_module('IN2d', module=nn.InstanceNorm2d) +MODELS.register_module('IN3d', module=nn.InstanceNorm3d) + + +def infer_abbr(class_type): + """Infer abbreviation from the class name. + + When we build a norm layer with `build_norm_layer()`, we want to preserve + the norm type in variable names, e.g, self.bn1, self.gn. This method will + infer the abbreviation to map class types to abbreviations. + + Rule 1: If the class has the property "_abbr_", return the property. + Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or + InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and + "in" respectively. + Rule 3: If the class name contains "batch", "group", "layer" or "instance", + the abbreviation of this layer will be "bn", "gn", "ln" and "in" + respectively. + Rule 4: Otherwise, the abbreviation falls back to "norm". + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ + if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN + return 'in' + elif issubclass(class_type, _BatchNorm): + return 'bn' + elif issubclass(class_type, nn.GroupNorm): + return 'gn' + elif issubclass(class_type, nn.LayerNorm): + return 'ln' + else: + class_name = class_type.__name__.lower() + if 'batch' in class_name: + return 'bn' + elif 'group' in class_name: + return 'gn' + elif 'layer' in class_name: + return 'ln' + elif 'instance' in class_name: + return 'in' + else: + return 'norm_layer' + + +def build_norm_layer(cfg: Dict, + num_features: int, + postfix: Union[int, str] = '') -> Tuple[str, nn.Module]: + """Build normalization layer. 
+ + Args: + cfg (dict): The norm layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate a norm layer. + - requires_grad (bool, optional): Whether stop gradient updates. + num_features (int): Number of input channels. + postfix (int | str): The postfix to be appended into norm abbreviation + to create named layer. + + Returns: + tuple[str, nn.Module]: The first element is the layer name consisting + of abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + + if inspect.isclass(layer_type): + norm_layer = layer_type + else: + # Switch registry to the target scope. If `norm_layer` cannot be found + # in the registry, fallback to search `norm_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + norm_layer = registry.get(layer_type) + if norm_layer is None: + raise KeyError(f'Cannot find {norm_layer} in registry under ' + f'scope name {registry.scope}') + abbr = infer_abbr(norm_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + requires_grad = cfg_.pop('requires_grad', True) + cfg_.setdefault('eps', 1e-5) + if norm_layer is not nn.GroupNorm: + layer = norm_layer(num_features, **cfg_) + if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): + layer._specify_ddp_gpu_num(1) + else: + assert 'num_groups' in cfg_ + layer = norm_layer(num_channels=num_features, **cfg_) + + for param in layer.parameters(): + param.requires_grad = requires_grad + + return name, layer + + +def is_norm(layer: nn.Module, + exclude: Union[type, tuple, None] = None) -> bool: + """Check if a layer is a normalization layer. + + Args: + layer (nn.Module): The layer to be checked. + exclude (type | tuple[type]): Types to be excluded. + + Returns: + bool: Whether the layer is a norm layer. + """ + if exclude is not None: + if not isinstance(exclude, tuple): + exclude = (exclude, ) + if not is_tuple_of(exclude, type): + raise TypeError( + f'"exclude" must be either None or type or a tuple of types, ' + f'but got {type(exclude)}: {exclude}') + + if exclude and isinstance(layer, exclude): + return False + + all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) + return isinstance(layer, all_norm_bases) diff --git a/external/cv/mmcv/cnn/bricks/padding.py b/external/cv/mmcv/cnn/bricks/padding.py new file mode 100644 index 0000000000000000000000000000000000000000..77218481cca68272252582265788830cebe36b40 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/padding.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import inspect +from typing import Dict + +import torch.nn as nn +from mmengine.registry import MODELS + +MODELS.register_module('zero', module=nn.ZeroPad2d) +MODELS.register_module('reflect', module=nn.ReflectionPad2d) +MODELS.register_module('replicate', module=nn.ReplicationPad2d) + + +def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module: + """Build padding layer. + + Args: + cfg (dict): The padding layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate a padding layer. 
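Two illustrative calls to the `build_norm_layer`/`is_norm` helpers above; the returned name is the inferred abbreviation plus the optional postfix (import path assumes the vendored tree installs as `mmcv`):

```python
import torch.nn as nn
from mmcv.cnn.bricks.norm import build_norm_layer, is_norm

name, bn = build_norm_layer(dict(type='BN', requires_grad=True), num_features=64)
assert name == 'bn' and isinstance(bn, nn.BatchNorm2d)

name, gn = build_norm_layer(dict(type='GN', num_groups=8), 64, postfix=2)
assert name == 'gn2' and is_norm(gn)
```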
+ + Returns: + nn.Module: Created padding layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + + cfg_ = cfg.copy() + padding_type = cfg_.pop('type') + if inspect.isclass(padding_type): + return padding_type(*args, **kwargs, **cfg_) + # Switch registry to the target scope. If `padding_layer` cannot be found + # in the registry, fallback to search `padding_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + padding_layer = registry.get(padding_type) + if padding_layer is None: + raise KeyError(f'Cannot find {padding_layer} in registry under scope ' + f'name {registry.scope}') + layer = padding_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/external/cv/mmcv/cnn/bricks/plugin.py b/external/cv/mmcv/cnn/bricks/plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..682b4877dc628b727c9d1090aa4a9066daa8eedf --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/plugin.py @@ -0,0 +1,106 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import inspect +import platform +from typing import Dict, Tuple, Union + +import torch.nn as nn +from mmengine.registry import MODELS + +if platform.system() == 'Windows': + import regex as re # type: ignore +else: + import re # type: ignore + + +def infer_abbr(class_type: type) -> str: + """Infer abbreviation from the class name. + + This method will infer the abbreviation to map class types to + abbreviations. + + Rule 1: If the class has the property "abbr", return the property. + Rule 2: Otherwise, the abbreviation falls back to snake case of class + name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``. + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + + def camel2snack(word): + """Convert camel case word into snack case. + + Modified from `inflection lib + `_. + + Example:: + + >>> camel2snack("FancyBlock") + 'fancy_block' + """ + + word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word) + word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word) + word = word.replace('-', '_') + return word.lower() + + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ # type: ignore + else: + return camel2snack(class_type.__name__) + + +def build_plugin_layer(cfg: Dict, + postfix: Union[int, str] = '', + **kwargs) -> Tuple[str, nn.Module]: + """Build plugin layer. + + Args: + cfg (dict): cfg should contain: + + - type (str): identify plugin layer type. + - layer args: args needed to instantiate a plugin layer. + postfix (int, str): appended into norm abbreviation to + create named layer. Default: ''. + + Returns: + tuple[str, nn.Module]: The first one is the concatenation of + abbreviation and postfix. The second is the created plugin layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if inspect.isclass(layer_type): + plugin_layer = layer_type + else: + # Switch registry to the target scope. 
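Illustrative call to `build_padding_layer` above; positional arguments after the config dict are forwarded to the padding layer's constructor:

```python
import torch
from mmcv.cnn.bricks.padding import build_padding_layer

pad = build_padding_layer(dict(type='reflect'), 2)   # -> nn.ReflectionPad2d(2)
x = torch.rand(1, 3, 8, 8)
assert pad(x).shape == (1, 3, 12, 12)                # 2 px added on each side
```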
If `plugin_layer` cannot be + # found in the registry, fallback to search `plugin_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + plugin_layer = registry.get(layer_type) + if plugin_layer is None: + raise KeyError( + f'Cannot find {plugin_layer} in registry under scope ' + f'name {registry.scope}') + abbr = infer_abbr(plugin_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + layer = plugin_layer(**kwargs, **cfg_) + + return name, layer diff --git a/external/cv/mmcv/cnn/bricks/scale.py b/external/cv/mmcv/cnn/bricks/scale.py new file mode 100644 index 0000000000000000000000000000000000000000..5facada835b892969e2c8bbe072b0fa6985b8872 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/scale.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + + +class Scale(nn.Module): + """A learnable scale parameter. + + This layer scales the input by a learnable factor. It multiplies a + learnable scale parameter of shape (1,) with input of any shape. + + Args: + scale (float): Initial value of scale factor. Default: 1.0 + """ + + def __init__(self, scale: float = 1.0): + super().__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * self.scale + + +class LayerScale(nn.Module): + """LayerScale layer. + + Args: + dim (int): Dimension of input features. + inplace (bool): Whether performs operation in-place. + Default: `False`. + data_format (str): The input data format, could be 'channels_last' + or 'channels_first', representing (B, C, H, W) and + (B, N, C) format data respectively. Default: 'channels_last'. + scale (float): Initial value of scale factor. Default: 1.0 + """ + + def __init__(self, + dim: int, + inplace: bool = False, + data_format: str = 'channels_last', + scale: float = 1e-5): + super().__init__() + assert data_format in ('channels_last', 'channels_first'), \ + "'data_format' could only be channels_last or channels_first." + self.inplace = inplace + self.data_format = data_format + self.weight = nn.Parameter(torch.ones(dim) * scale) + + def forward(self, x) -> torch.Tensor: + if self.data_format == 'channels_first': + shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2)))) + else: + shape = tuple((*(1 for _ in range(x.dim() - 1)), -1)) + if self.inplace: + return x.mul_(self.weight.view(*shape)) + else: + return x * self.weight.view(*shape) diff --git a/external/cv/mmcv/cnn/bricks/swish.py b/external/cv/mmcv/cnn/bricks/swish.py new file mode 100644 index 0000000000000000000000000000000000000000..6e5d33ad7612b53a548b582f97c7e4f6f4b4acc6 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/swish.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from mmengine.registry import MODELS + + +@MODELS.register_module() +class Swish(nn.Module): + """Swish Module. + + This module applies the swish function: + + .. math:: + Swish(x) = x * Sigmoid(x) + + Returns: + Tensor: The output tensor. 
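Illustrative sketch combining `build_plugin_layer` with the `Scale`/`LayerScale` bricks above. The plugin example assumes `NonLocal2d` has already been registered by importing the bricks package, as in upstream mmcv; all values are made up.

```python
import torch
from mmcv.cnn.bricks.plugin import build_plugin_layer
from mmcv.cnn.bricks.scale import Scale, LayerScale

name, nl = build_plugin_layer(dict(type='NonLocal2d', in_channels=16), postfix=1)
assert name == 'nonlocal_block1'           # NonLocal2d._abbr_ + postfix

s = Scale(scale=2.0)                       # single learnable scalar
x = torch.rand(4, 8)
assert torch.allclose(s(x), 2.0 * x)

ls = LayerScale(dim=8)                     # per-channel scale, init 1e-5, channels_last
tokens = torch.rand(2, 16, 8)              # (B, N, C)
assert ls(tokens).shape == tokens.shape
```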
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(x) diff --git a/external/cv/mmcv/cnn/bricks/transformer.py b/external/cv/mmcv/cnn/bricks/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..997fb6a5b74e139f7129891431256386cc43b09a --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/transformer.py @@ -0,0 +1,956 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import math +import warnings +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.registry import MODELS +from mmengine.utils import deprecated_api_warning, to_2tuple + +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, + build_norm_layer) +from .drop import build_dropout +from .scale import LayerScale + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import \ + MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) + +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv`` rather than ``mmcv-lite`` ' + 'if you need this module. ') + + +def build_positional_encoding(cfg, default_args=None): + """Builder for Position Encoding.""" + return MODELS.build(cfg, default_args=default_args) + + +def build_attention(cfg, default_args=None): + """Builder for attention.""" + return MODELS.build(cfg, default_args=default_args) + + +def build_feedforward_network(cfg, default_args=None): + """Builder for feed-forward network (FFN).""" + return MODELS.build(cfg, default_args=default_args) + + +def build_transformer_layer(cfg, default_args=None): + """Builder for transformer layer.""" + return MODELS.build(cfg, default_args=default_args) + + +def build_transformer_layer_sequence(cfg, default_args=None): + """Builder for transformer encoder and transformer decoder.""" + return MODELS.build(cfg, default_args=default_args) + + +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". 
+ + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). + + Returns: + Tensor: The tensor with adaptive padding + """ + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv2d". + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: 16. + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv2d', + kernel_size=16, + stride=16, + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map ((used in Swin Transformer)). + Our implementation uses `nn.Unfold` to + merge patches, which is about 25% faster than the original + implementation. However, we need to modify pretrained + models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. 
+ norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding='corner', + dilation=1, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adaptive_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). + """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + + if self.adaptive_padding: + x = self.adaptive_padding(x) + H, W = x.shape[-2:] + + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + x = self.sampler(x) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + +@MODELS.register_module() +class MultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. 
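Forward-pass sketch for the `PatchMerging` module above; a 56x56 token grid is merged 2x2 into a 28x28 grid (values illustrative):

```python
import torch
from mmcv.cnn.bricks.transformer import PatchMerging

merge = PatchMerging(in_channels=96, out_channels=192)   # kernel_size=2 by default
x = torch.rand(2, 56 * 56, 96)                           # (B, H*W, C)
out, out_size = merge(x, input_size=(56, 56))
assert out_size == (28, 28)
assert out.shape == (2, 28 * 28, 192)
```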
+ """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super().__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + # @deprecated_api_warning({'residual': 'identity'}, + # cls_name='MultiheadAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + +@MODELS.register_module() +class FFN(BaseModule): + """Implements feed-forward networks (FFNs) with identity connection. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + layer_scale_init_value (float): Initial value of scale factor in + LayerScale. Default: 1.0 + """ + + # @deprecated_api_warning( + # { + # 'dropout': 'ffn_drop', + # 'add_residual': 'add_identity' + # }, + # cls_name='FFN') + def __init__(self, + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0., + dropout_layer=None, + add_identity=True, + init_cfg=None, + layer_scale_init_value=0.): + super().__init__(init_cfg) + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append( + Sequential( + Linear(in_channels, feedforward_channels), + build_activation_layer(act_cfg), nn.Dropout(ffn_drop))) + in_channels = feedforward_channels + layers.append(Linear(feedforward_channels, embed_dims)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + if layer_scale_init_value > 0: + self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value) + else: + self.gamma2 = nn.Identity() + + # @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') + def forward(self, x, identity=None): + """Forward function for `FFN`. + + The function would add x to the output tensor if residue is None. 
+ """ + out = self.layers(x) + out = self.gamma2(out) + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +@MODELS.register_module() +class BaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ', DeprecationWarning) + ffn_cfgs[new_name] = kwargs[ori_name] + + super().__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & { + 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' 
+ + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index], + dict(type='FFN'))) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + +@MODELS.register_module() +class TransformerLayerSequence(BaseModule): + """Base class for TransformerEncoder and TransformerDecoder in vision + transformer. + + As base-class of Encoder and Decoder in vision transformer. + Support customization such as specifying different kind + of `transformer_layer` in `transformer_coder`. + + Args: + transformerlayer (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict`): Config of transformerlayer + in TransformerCoder. If it is obj:`mmcv.ConfigDict`, + it would be repeated `num_layer` times to a + list[`mmcv.ConfigDict`]. Default: None. + num_layers (int): The number of `TransformerLayer`. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + super().__init__(init_cfg) + if isinstance(transformerlayers, dict): + transformerlayers = [ + copy.deepcopy(transformerlayers) for _ in range(num_layers) + ] + else: + assert isinstance(transformerlayers, list) and \ + len(transformerlayers) == num_layers + self.num_layers = num_layers + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append(build_transformer_layer(transformerlayers[i])) + self.embed_dims = self.layers[0].embed_dims + self.pre_norm = self.layers[0].pre_norm + + def forward(self, + query, + key, + value, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerCoder`. + + Args: + query (Tensor): Input query with shape + `(num_queries, bs, embed_dims)`. + key (Tensor): The key tensor with shape + `(num_keys, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_keys, bs, embed_dims)`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor], optional): Each element is 2D Tensor + which is used in calculation of corresponding attention in + operation_order. 
Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in self-attention + Default: None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: results with shape [num_queries, bs, embed_dims]. + """ + for layer in self.layers: + query = layer( + query, + key, + value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + **kwargs) + return query diff --git a/external/cv/mmcv/cnn/bricks/upsample.py b/external/cv/mmcv/cnn/bricks/upsample.py new file mode 100644 index 0000000000000000000000000000000000000000..e73fc73c1832f97da9ff5b917436e619aa954150 --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/upsample.py @@ -0,0 +1,99 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import inspect +from typing import Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import xavier_init +from mmengine.registry import MODELS + +MODELS.register_module('nearest', module=nn.Upsample) +MODELS.register_module('bilinear', module=nn.Upsample) + + +@MODELS.register_module(name='pixel_shuffle') +class PixelShufflePack(nn.Module): + """Pixel Shuffle upsample layer. + + This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to + achieve a simple upsampling with pixel shuffle. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scale_factor (int): Upsample ratio. + upsample_kernel (int): Kernel size of the conv layer to expand the + channels. + """ + + def __init__(self, in_channels: int, out_channels: int, scale_factor: int, + upsample_kernel: int): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.scale_factor = scale_factor + self.upsample_kernel = upsample_kernel + self.upsample_conv = nn.Conv2d( + self.in_channels, + self.out_channels * scale_factor * scale_factor, + self.upsample_kernel, + padding=(self.upsample_kernel - 1) // 2) + self.init_weights() + + def init_weights(self): + xavier_init(self.upsample_conv, distribution='uniform') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.upsample_conv(x) + x = F.pixel_shuffle(x, self.scale_factor) + return x + + +def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: + """Build upsample layer. + + Args: + cfg (dict): The upsample layer config, which should contain: + + - type (str): Layer type. + - scale_factor (int): Upsample ratio, which is not applicable to + deconv. + - layer args: Args needed to instantiate a upsample layer. + args (argument list): Arguments passed to the ``__init__`` + method of the corresponding conv layer. + kwargs (keyword arguments): Keyword arguments passed to the + ``__init__`` method of the corresponding conv layer. + + Returns: + nn.Module: Created upsample layer. + """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + raise KeyError( + f'the cfg dict must contain the key "type", but got {cfg}') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + + if inspect.isclass(layer_type): + upsample = layer_type + # Switch registry to the target scope. 
If `upsample` cannot be found + # in the registry, fallback to search `upsample` in the + # mmengine.MODELS. + else: + with MODELS.switch_scope_and_registry(None) as registry: + upsample = registry.get(layer_type) + if upsample is None: + raise KeyError(f'Cannot find {upsample} in registry under scope ' + f'name {registry.scope}') + if upsample is nn.Upsample: + cfg_['mode'] = layer_type + layer = upsample(*args, **kwargs, **cfg_) + return layer diff --git a/external/cv/mmcv/cnn/bricks/wrappers.py b/external/cv/mmcv/cnn/bricks/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c4502b1b232d637edc7ff9f2f7b3ed51a082db --- /dev/null +++ b/external/cv/mmcv/cnn/bricks/wrappers.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 + +Wrap some nn modules to support empty tensor input. Currently, these wrappers +are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask +heads are trained on only positive RoIs. +""" +import math + +import torch +import torch.nn as nn +from mmengine.registry import MODELS +from torch.nn.modules.utils import _pair, _triple + +if torch.__version__ == 'parrots': + TORCH_VERSION = torch.__version__ +else: + # torch.__version__ could be 1.3.1+cu92, we only need the first two + # for comparison + TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) + + +def obsolete_torch_version(torch_version, version_threshold) -> bool: + return torch_version == 'parrots' or torch_version <= version_threshold + + +class NewEmptyTensorOp(torch.autograd.Function): + + @staticmethod + def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad: torch.Tensor) -> tuple: + shape = ctx.shape + return NewEmptyTensorOp.apply(grad, shape), None + + +@MODELS.register_module('Conv', force=True) +class Conv2d(nn.Conv2d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@MODELS.register_module('Conv3d', force=True) +class Conv3d(nn.Conv3d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@MODELS.register_module() +@MODELS.register_module('deconv') +class ConvTranspose2d(nn.ConvTranspose2d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@MODELS.register_module() +@MODELS.register_module('deconv3d') +class ConvTranspose3d(nn.ConvTranspose3d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +class MaxPool2d(nn.MaxPool2d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch 1.9 does not support empty tensor inference yet + if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), + _pair(self.padding), _pair(self.stride), + _pair(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class MaxPool3d(nn.MaxPool3d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch 1.9 does not support empty tensor inference yet + if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), + _triple(self.padding), + _triple(self.stride), + _triple(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class Linear(torch.nn.Linear): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # empty tensor forward of Linear layer is supported in Pytorch 1.6 + if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_features] + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) diff --git a/external/cv/mmcv/cnn/resnet.py b/external/cv/mmcv/cnn/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..d2d2990b44381c8fb3c4b416356f452f1fb278a6 --- /dev/null +++ b/external/cv/mmcv/cnn/resnet.py @@ -0,0 +1,326 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import Optional, Sequence, Tuple, Union + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmengine.model import constant_init, kaiming_init +from mmengine.runner import load_checkpoint +from torch import Tensor + + +def conv3x3(in_planes: int, + out_planes: int, + stride: int = 1, + dilation: int = 1): + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): + super().__init__() + assert style in ['pytorch', 'caffe'] + self.conv1 = conv3x3(inplanes, planes, stride, dilation) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + assert not with_cp + + def forward(self, x: Tensor) -> Tensor: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): + """Bottleneck block. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. 
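+
+        Example (illustrative sketch; the channel sizes are assumptions chosen
+        only to show the stride-two behaviour):
+            >>> import torch
+            >>> import torch.nn as nn
+            >>> downsample = nn.Sequential(
+            ...     nn.Conv2d(64, 256, kernel_size=1, stride=2, bias=False),
+            ...     nn.BatchNorm2d(256))
+            >>> block = Bottleneck(64, 64, stride=2, downsample=downsample)
+            >>> block(torch.rand(1, 64, 56, 56)).shape
+            torch.Size([1, 256, 28, 28])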
+ """ + super().__init__() + assert style in ['pytorch', 'caffe'] + if style == 'pytorch': + conv1_stride = 1 + conv2_stride = stride + else: + conv1_stride = stride + conv2_stride = 1 + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d( + planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + def forward(self, x: Tensor) -> Tensor: + + def _inner_forward(x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + with_cp: bool = False) -> nn.Module: + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + style=style, + with_cp=with_cp)) + inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) + + return nn.Sequential(*layers) + + +class ResNet(nn.Module): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + num_stages (int): Resnet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth: int, + num_stages: int = 4, + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3), + style: str = 'pytorch', + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + with_cp: bool = False): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + assert num_stages >= 1 and num_stages <= 4 + block, stage_blocks = self.arch_settings[depth] + stage_blocks = stage_blocks[:num_stages] # type: ignore + assert len(strides) == len(dilations) == num_stages + assert max(out_indices) < num_stages + + self.out_indices = out_indices + self.style = style + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + self.with_cp = with_cp + + self.inplanes: int = 64 + self.conv1 = nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.res_layers = [] + for i, num_blocks in enumerate(stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = 64 * 2**i + res_layer = make_res_layer( + block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + with_cp=with_cp) + self.inplanes = planes * block.expansion # type: ignore + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = block.expansion * 64 * 2**( # type: ignore + len(stage_blocks) - 1) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode: bool = True) -> None: + super().train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + if mode and self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for param in self.bn1.parameters(): + param.requires_grad = False + self.bn1.eval() + self.bn1.weight.requires_grad = False + self.bn1.bias.requires_grad = False + for i in range(1, self.frozen_stages + 1): + mod = getattr(self, f'layer{i}') + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/external/cv/mmcv/cnn/rfsearch/__init__.py b/external/cv/mmcv/cnn/rfsearch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4526227ca748bb50edc8c3a254ce54b0cc06a2dc --- 
/dev/null +++ b/external/cv/mmcv/cnn/rfsearch/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp +from .search import RFSearchHook + +__all__ = ['BaseConvRFSearchOp', 'Conv2dRFSearchOp', 'RFSearchHook'] diff --git a/external/cv/mmcv/cnn/rfsearch/operator.py b/external/cv/mmcv/cnn/rfsearch/operator.py new file mode 100644 index 0000000000000000000000000000000000000000..50222f5742c312e98d5470c4d0d6877f2f8fc310 --- /dev/null +++ b/external/cv/mmcv/cnn/rfsearch/operator.py @@ -0,0 +1,174 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import copy + +import numpy as np +import torch +import torch.nn as nn +from mmengine.logging import print_log +from mmengine.model import BaseModule +from torch import Tensor + +from .utils import expand_rates, get_single_padding + + +class BaseConvRFSearchOp(BaseModule): + """Based class of ConvRFSearchOp. + + Args: + op_layer (nn.Module): pytorch module, e,g, Conv2d + global_config (dict): config dict. + """ + + def __init__(self, op_layer: nn.Module, global_config: dict): + super().__init__() + self.op_layer = op_layer + self.global_config = global_config + + def normlize(self, weights: nn.Parameter) -> nn.Parameter: + """Normalize weights. + + Args: + weights (nn.Parameter): Weights to be normalized. + + Returns: + nn.Parameters: Normalized weights. + """ + abs_weights = torch.abs(weights) + normalized_weights = abs_weights / torch.sum(abs_weights) + return normalized_weights + + +class Conv2dRFSearchOp(BaseConvRFSearchOp): + """Enable Conv2d with receptive field searching ability. + + Args: + op_layer (nn.Module): pytorch module, e,g, Conv2d + global_config (dict): config dict. Defaults to None. + By default this must include: + + - "init_alphas": The value for initializing weights of each branch. + - "num_branches": The controller of the size of + search space (the number of branches). + - "exp_rate": The controller of the sparsity of search space. + - "mmin": The minimum dilation rate. + - "mmax": The maximum dilation rate. + + Extra keys may exist, but are used by RFSearchHook, e.g., "step", + "max_step", "search_interval", and "skip_layer". + verbose (bool): Determines whether to print rf-next + related logging messages. + Defaults to True. 
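+
+    Example (illustrative sketch; the config values are assumptions chosen
+    only for demonstration):
+        >>> import torch
+        >>> cfg = dict(init_alphas=0.01, num_branches=3, exp_rate=0.5,
+        ...            mmin=1, mmax=24)
+        >>> conv = torch.nn.Conv2d(16, 16, kernel_size=3, padding=1)
+        >>> op = Conv2dRFSearchOp(conv, cfg, verbose=False)
+        >>> op(torch.rand(1, 16, 32, 32)).shape
+        torch.Size([1, 16, 32, 32])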
+ """ + + def __init__(self, + op_layer: nn.Module, + global_config: dict, + verbose: bool = True): + super().__init__(op_layer, global_config) + assert global_config is not None, 'global_config is None' + self.num_branches = global_config['num_branches'] + assert self.num_branches in [2, 3] + self.verbose = verbose + init_dilation = op_layer.dilation + self.dilation_rates = expand_rates(init_dilation, global_config) + if self.op_layer.kernel_size[ + 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0: + self.dilation_rates = [(op_layer.dilation[0], r[1]) + for r in self.dilation_rates] + if self.op_layer.kernel_size[ + 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0: + self.dilation_rates = [(r[0], op_layer.dilation[1]) + for r in self.dilation_rates] + + self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches)) + if self.verbose: + print_log(f'Expand as {self.dilation_rates}', 'current') + nn.init.constant_(self.branch_weights, global_config['init_alphas']) + + def forward(self, input: Tensor) -> Tensor: + norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)]) + if len(self.dilation_rates) == 1: + outputs = [ + nn.functional.conv2d( + input, + weight=self.op_layer.weight, + bias=self.op_layer.bias, + stride=self.op_layer.stride, + padding=self.get_padding(self.dilation_rates[0]), + dilation=self.dilation_rates[0], + groups=self.op_layer.groups, + ) + ] + else: + outputs = [ + nn.functional.conv2d( + input, + weight=self.op_layer.weight, + bias=self.op_layer.bias, + stride=self.op_layer.stride, + padding=self.get_padding(r), + dilation=r, + groups=self.op_layer.groups, + ) * norm_w[i] for i, r in enumerate(self.dilation_rates) + ] + output = outputs[0] + for i in range(1, len(self.dilation_rates)): + output += outputs[i] + return output + + def estimate_rates(self) -> None: + """Estimate new dilation rate based on trained branch_weights.""" + norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)]) + if self.verbose: + print_log( + 'Estimate dilation {} with weight {}.'.format( + self.dilation_rates, + norm_w.detach().cpu().numpy().tolist()), 'current') + + sum0, sum1, w_sum = 0, 0, 0 + for i in range(len(self.dilation_rates)): + sum0 += norm_w[i].item() * self.dilation_rates[i][0] + sum1 += norm_w[i].item() * self.dilation_rates[i][1] + w_sum += norm_w[i].item() + estimated = [ + np.clip( + int(round(sum0 / w_sum)), self.global_config['mmin'], + self.global_config['mmax']).item(), + np.clip( + int(round(sum1 / w_sum)), self.global_config['mmin'], + self.global_config['mmax']).item() + ] + self.op_layer.dilation = tuple(estimated) + self.op_layer.padding = self.get_padding(self.op_layer.dilation) + self.dilation_rates = [tuple(estimated)] + if self.verbose: + print_log(f'Estimate as {tuple(estimated)}', 'current') + + def expand_rates(self) -> None: + """Expand dilation rate.""" + dilation = self.op_layer.dilation + dilation_rates = expand_rates(dilation, self.global_config) + if self.op_layer.kernel_size[ + 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0: + dilation_rates = [(dilation[0], r[1]) for r in dilation_rates] + if self.op_layer.kernel_size[ + 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0: + dilation_rates = [(r[0], dilation[1]) for r in dilation_rates] + + self.dilation_rates = copy.deepcopy(dilation_rates) + if self.verbose: + print_log(f'Expand as {self.dilation_rates}', 'current') + nn.init.constant_(self.branch_weights, + self.global_config['init_alphas']) + + def get_padding(self, dilation) -> tuple: + padding = 
(get_single_padding(self.op_layer.kernel_size[0], + self.op_layer.stride[0], dilation[0]), + get_single_padding(self.op_layer.kernel_size[1], + self.op_layer.stride[1], dilation[1])) + return padding diff --git a/external/cv/mmcv/cnn/rfsearch/search.py b/external/cv/mmcv/cnn/rfsearch/search.py new file mode 100644 index 0000000000000000000000000000000000000000..f4f77ca9b4d8cb44e26b0b76cb5d74f6a172fbc5 --- /dev/null +++ b/external/cv/mmcv/cnn/rfsearch/search.py @@ -0,0 +1,244 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +from typing import Dict, Optional + +import mmengine +import torch # noqa +import torch.nn as nn +from mmengine.hooks import Hook +from mmengine.logging import print_log +from mmengine.registry import HOOKS + +from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp # noqa +from .utils import get_single_padding, write_to_json + + +@HOOKS.register_module() +class RFSearchHook(Hook): + """Rcecptive field search via dilation rates. + + Please refer to `RF-Next: Efficient Receptive Field + Search for Convolutional Neural Networks + `_ for more details. + + + Args: + mode (str, optional): It can be set to the following types: + 'search', 'fixed_single_branch', or 'fixed_multi_branch'. + Defaults to 'search'. + config (Dict, optional): config dict of search. + By default this config contains "search", + and config["search"] must include: + + - "step": recording the current searching step. + - "max_step": The maximum number of searching steps + to update the structures. + - "search_interval": The interval (epoch/iteration) + between two updates. + - "exp_rate": The controller of the sparsity of search space. + - "init_alphas": The value for initializing weights of each branch. + - "mmin": The minimum dilation rate. + - "mmax": The maximum dilation rate. + - "num_branches": The controller of the size of + search space (the number of branches). + - "skip_layer": The modules in skip_layer will be ignored + during the receptive field search. + rfstructure_file (str, optional): Path to load searched receptive + fields of the model. Defaults to None. + by_epoch (bool, optional): Determine to perform step by epoch or + by iteration. If set to True, it will step by epoch. Otherwise, by + iteration. Defaults to True. + verbose (bool): Determines whether to print rf-next related logging + messages. Defaults to True. + """ + + def __init__(self, + mode: str = 'search', + config: Dict = {}, + rfstructure_file: Optional[str] = None, + by_epoch: bool = True, + verbose: bool = True): + assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch'] + assert config is not None + self.config = config + self.config['structure'] = {} + self.verbose = verbose + if rfstructure_file is not None: + rfstructure = mmengine.load(rfstructure_file)['structure'] + self.config['structure'] = rfstructure + self.mode = mode + self.num_branches = self.config['search']['num_branches'] + self.by_epoch = by_epoch + + def init_model(self, model: nn.Module): + """init model with search ability. 
+ + Args: + model (nn.Module): pytorch model + + Raises: + NotImplementedError: only support three modes: + search/fixed_single_branch/fixed_multi_branch + """ + if self.verbose: + print_log('RFSearch init begin.', 'current') + if self.mode == 'search': + if self.config['structure']: + self.set_model(model, search_op='Conv2d') + self.wrap_model(model, search_op='Conv2d') + elif self.mode == 'fixed_single_branch': + self.set_model(model, search_op='Conv2d') + elif self.mode == 'fixed_multi_branch': + self.set_model(model, search_op='Conv2d') + self.wrap_model(model, search_op='Conv2d') + else: + raise NotImplementedError + if self.verbose: + print_log('RFSearch init end.', 'current') + + def after_train_epoch(self, runner): + """Performs a dilation searching step after one training epoch.""" + if self.by_epoch and self.mode == 'search': + self.step(runner.model, runner.work_dir) + + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + """Performs a dilation searching step after one training iteration.""" + if not self.by_epoch and self.mode == 'search': + self.step(runner.model, runner.work_dir) + + def step(self, model: nn.Module, work_dir: str) -> None: + """Performs a dilation searching step. + + Args: + model (nn.Module): pytorch model + work_dir (str): Directory to save the searching results. + """ + self.config['search']['step'] += 1 + if (self.config['search']['step'] + ) % self.config['search']['search_interval'] == 0 and (self.config[ + 'search']['step']) < self.config['search']['max_step']: + self.estimate_and_expand(model) + for name, module in model.named_modules(): + if isinstance(module, BaseConvRFSearchOp): + self.config['structure'][name] = module.op_layer.dilation + + write_to_json( + self.config, + os.path.join( + work_dir, + 'local_search_config_step%d.json' % + self.config['search']['step'], + ), + ) + + def estimate_and_expand(self, model: nn.Module) -> None: + """estimate and search for RFConvOp. + + Args: + model (nn.Module): pytorch model + """ + for module in model.modules(): + if isinstance(module, BaseConvRFSearchOp): + module.estimate_rates() + module.expand_rates() + + def wrap_model(self, + model: nn.Module, + search_op: str = 'Conv2d', + prefix: str = '') -> None: + """wrap model to support searchable conv op. + + Args: + model (nn.Module): pytorch model + search_op (str): The module that uses RF search. + Defaults to 'Conv2d'. + init_rates (int, optional): Set to other initial dilation rates. + Defaults to None. + prefix (str): Prefix for function recursion. Defaults to ''. + """ + op = 'torch.nn.' + search_op + for name, module in model.named_children(): + if prefix == '': + fullname = 'module.' + name + else: + fullname = prefix + '.' + name + if self.config['search']['skip_layer'] is not None: + if any(layer in fullname + for layer in self.config['search']['skip_layer']): + continue + if isinstance(module, eval(op)): + if 1 < module.kernel_size[0] and \ + 0 != module.kernel_size[0] % 2 or \ + 1 < module.kernel_size[1] and \ + 0 != module.kernel_size[1] % 2: + moduleWrap = eval(search_op + 'RFSearchOp')( + module, self.config['search'], self.verbose) + moduleWrap = moduleWrap.to(module.weight.device) + if self.verbose: + print_log( + 'Wrap model %s to %s.' 
% + (str(module), str(moduleWrap)), 'current') + setattr(model, name, moduleWrap) + elif not isinstance(module, BaseConvRFSearchOp): + self.wrap_model(module, search_op, fullname) + + def set_model(self, + model: nn.Module, + search_op: str = 'Conv2d', + init_rates: Optional[int] = None, + prefix: str = '') -> None: + """set model based on config. + + Args: + model (nn.Module): pytorch model + config (Dict): config file + search_op (str): The module that uses RF search. + Defaults to 'Conv2d'. + init_rates (int, optional): Set to other initial dilation rates. + Defaults to None. + prefix (str): Prefix for function recursion. Defaults to ''. + """ + op = 'torch.nn.' + search_op + for name, module in model.named_children(): + if prefix == '': + fullname = 'module.' + name + else: + fullname = prefix + '.' + name + if self.config['search']['skip_layer'] is not None: + if any(layer in fullname + for layer in self.config['search']['skip_layer']): + continue + if isinstance(module, eval(op)): + if 1 < module.kernel_size[0] and \ + 0 != module.kernel_size[0] % 2 or \ + 1 < module.kernel_size[1] and \ + 0 != module.kernel_size[1] % 2: + if isinstance(self.config['structure'][fullname], int): + self.config['structure'][fullname] = [ + self.config['structure'][fullname], + self.config['structure'][fullname] + ] + module.dilation = ( + self.config['structure'][fullname][0], + self.config['structure'][fullname][1], + ) + module.padding = ( + get_single_padding( + module.kernel_size[0], module.stride[0], + self.config['structure'][fullname][0]), + get_single_padding( + module.kernel_size[1], module.stride[1], + self.config['structure'][fullname][1])) + setattr(model, name, module) + if self.verbose: + print_log( + 'Set module %s dilation as: [%d %d]' % + (fullname, module.dilation[0], module.dilation[1]), + 'current') + elif not isinstance(module, BaseConvRFSearchOp): + self.set_model(module, search_op, init_rates, fullname) diff --git a/external/cv/mmcv/cnn/rfsearch/utils.py b/external/cv/mmcv/cnn/rfsearch/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..67d301a4784a2a11110676011acc02f2937317de --- /dev/null +++ b/external/cv/mmcv/cnn/rfsearch/utils.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import mmengine +import numpy as np + + +def write_to_json(config: dict, filename: str): + """save config to json file. + + Args: + config (dict): Config to be saved. + filename (str): Path to save config. + """ + + with open(filename, 'w', encoding='utf-8') as f: + mmengine.dump(config, f, file_format='json') + + +def expand_rates(dilation: tuple, config: dict) -> list: + """expand dilation rate according to config. 
+ + Args: + dilation (int): _description_ + config (dict): config dict + + Returns: + list: list of expanded dilation rates + """ + exp_rate = config['exp_rate'] + + large_rates = [] + small_rates = [] + for _ in range(config['num_branches'] // 2): + large_rates.append( + tuple([ + np.clip( + int(round((1 + exp_rate) * dilation[0])), config['mmin'], + config['mmax']).item(), + np.clip( + int(round((1 + exp_rate) * dilation[1])), config['mmin'], + config['mmax']).item() + ])) + small_rates.append( + tuple([ + np.clip( + int(round((1 - exp_rate) * dilation[0])), config['mmin'], + config['mmax']).item(), + np.clip( + int(round((1 - exp_rate) * dilation[1])), config['mmin'], + config['mmax']).item() + ])) + + small_rates.reverse() + + if config['num_branches'] % 2 == 0: + rate_list = small_rates + large_rates + else: + rate_list = small_rates + [dilation] + large_rates + + unique_rate_list = list(set(rate_list)) + unique_rate_list.sort(key=rate_list.index) + return unique_rate_list + + +def get_single_padding(kernel_size: int, + stride: int = 1, + dilation: int = 1) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding diff --git a/external/cv/mmcv/cnn/utils/__init__.py b/external/cv/mmcv/cnn/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0822f2a0c5f12efdbcf78aa52c14e334a07c56fb --- /dev/null +++ b/external/cv/mmcv/cnn/utils/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .flops_counter import get_model_complexity_info +from .fuse_conv_bn import fuse_conv_bn + +__all__ = ['get_model_complexity_info', 'fuse_conv_bn'] diff --git a/external/cv/mmcv/cnn/utils/flops_counter.py b/external/cv/mmcv/cnn/utils/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..f18607485bdb7a26065f765debdeb143865da974 --- /dev/null +++ b/external/cv/mmcv/cnn/utils/flops_counter.py @@ -0,0 +1,610 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# Modified from flops-counter.pytorch by Vladislav Sovrasov +# original repo: https://github.com/sovrasov/flops-counter.pytorch + +# MIT License + +# Copyright (c) 2018 Vladislav Sovrasov + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sys +import warnings +from functools import partial +from typing import Any, Callable, Dict, Optional, TextIO, Tuple + +import numpy as np +import torch +import torch.nn as nn + +from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear, + MaxPool2d, MaxPool3d) + + +def get_model_complexity_info(model: nn.Module, + input_shape: tuple, + print_per_layer_stat: bool = True, + as_strings: bool = True, + input_constructor: Optional[Callable] = None, + flush: bool = False, + ost: TextIO = sys.stdout) -> tuple: + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, + ``nn.LeakyReLU``, ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. + """ + assert type(input_shape) is tuple + assert len(input_shape) >= 1 + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. 
+ batch = torch.ones(()).new_empty((1, *input_shape)) + + _ = flops_model(batch) + + flops_count, params_count = flops_model.compute_average_flops_cost() + if print_per_layer_stat: + print_model_with_flops( + flops_model, flops_count, params_count, ost=ost, flush=flush) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops: float, + units: Optional[str] = 'GFLOPs', + precision: int = 2) -> str: + """Convert FLOPs number into a string. + + Note that Here we take a multiply-add counts as one FLOP. + + Args: + flops (float): FLOPs number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', + 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically + choose the most suitable unit for FLOPs. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted FLOPs number with units. + + Examples: + >>> flops_to_string(1e9) + '1.0 GFLOPs' + >>> flops_to_string(2e5, 'MFLOPs') + '0.2 MFLOPs' + >>> flops_to_string(3e-9, None) + '3e-09 FLOPs' + """ + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GFLOPs' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MFLOPs' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KFLOPs' + else: + return str(flops) + ' FLOPs' + else: + if units == 'GFLOPs': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPs': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPs': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPs' + + +def params_to_string(num_params: float, + units: Optional[str] = None, + precision: int = 2) -> str: + """Convert parameter number into a string. + + Args: + num_params (float): Parameter number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'M', + 'K' and ''. If set to None, it will automatically choose the most + suitable unit for Parameter number. Default: None. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted parameter number with units. + + Examples: + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if units is None: + if num_params // 10**6 > 0: + return str(round(num_params / 10**6, precision)) + ' M' + elif num_params // 10**3: + return str(round(num_params / 10**3, precision)) + ' k' + else: + return str(num_params) + else: + if units == 'M': + return str(round(num_params / 10.**6, precision)) + ' ' + units + elif units == 'K': + return str(round(num_params / 10.**3, precision)) + ' ' + units + else: + return str(num_params) + + +def print_model_with_flops(model: nn.Module, + total_flops: float, + total_params: float, + units: Optional[str] = 'GFLOPs', + precision: int = 3, + ost: TextIO = sys.stdout, + flush: bool = False) -> None: + """Print a model with FLOPs for each layer. + + Args: + model (nn.Module): The model to be printed. + total_flops (float): Total FLOPs of the model. + total_params (float): Total parameter counts of the model. + units (str | None): Converted FLOPs units. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 3. + ost (stream): same as `file` param in :func:`print`. 
+ Default: sys.stdout. + flush (bool): same as that in :func:`print`. Default: False. + + Example: + >>> class ExampleModel(nn.Module): + + >>> def __init__(self): + >>> super().__init__() + >>> self.conv1 = nn.Conv2d(3, 8, 3) + >>> self.conv2 = nn.Conv2d(8, 256, 3) + >>> self.conv3 = nn.Conv2d(256, 8, 3) + >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Linear(8, 1) + + >>> def forward(self, x): + >>> x = self.conv1(x) + >>> x = self.conv2(x) + >>> x = self.conv3(x) + >>> x = self.avg_pool(x) + >>> x = self.flatten(x) + >>> x = self.fc(x) + >>> return x + + >>> model = ExampleModel() + >>> x = (3, 16, 16) + to print the complexity information state for each layer, you can use + >>> get_model_complexity_info(model, x) + or directly use + >>> print_model_with_flops(model, 4579784.0, 37361) + ExampleModel( + 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, + (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 + (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) + (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) + (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) + (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) + (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) + ) + """ + + def accumulate_params(self): + if is_supported_instance(self): + return self.__params__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_params() + return sum + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_num_params = self.accumulate_params() + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + params_to_string( + accumulated_num_params, units='M', precision=precision), + f'{accumulated_num_params / total_params:.3%} Params', + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + f'{accumulated_flops_cost / total_flops:.3%} FLOPs', + self.original_extra_repr() + ]) + + def add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + m.accumulate_params = accumulate_params.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost, flush=flush) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model: nn.Module) -> float: + """Calculate parameter number of a model. + + Args: + model (nn.module): The model for parameter number calculation. + + Returns: + float: Parameter number of the model. 
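+
+    Example (illustrative):
+        >>> import torch.nn as nn
+        >>> get_model_parameters_number(nn.Linear(10, 5))
+        55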
+ """ + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return num_params + + +def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501 + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501 + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501 + net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501 + net_main_module) + + net_main_module.reset_flops_count() + + return net_main_module + + +def compute_average_flops_cost(self) -> Tuple[float, float]: + """Compute average FLOPs cost. + + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self) -> None: + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module: nn.Module) -> None: + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self) -> None: + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self) -> None: + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +# ---- Internal functions +def empty_flops_counter_hook(module: nn.Module, input: tuple, + output: Any) -> None: + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + output_last_dim = output.shape[ + -1] # pytorch checks dimensions, so here we don't care much + module.__flops__ += int(np.prod(input[0].shape) * output_last_dim) + + +def pool_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + module.__flops__ += int(np.prod(input[0].shape)) + + +def norm_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + batch_flops = np.prod(input[0].shape) + if (getattr(module, 'affine', False) + or getattr(module, 'elementwise_affine', False)): + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + # Can have multiple inputs, getting the first one + batch_size = input[0].shape[0] + input_height, input_width = input[0].shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_width + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + # Can have multiple inputs, getting the first one + batch_size = input[0].shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = int( + np.prod(kernel_dims)) * in_channels * filters_per_channel + + active_elements_count = batch_size * int(np.prod(output_dims)) + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + batch_size = len(input[0]) + else: + warnings.warn('No positional inputs found 
for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module: nn.Module) -> None: + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module: nn.Module) -> None: + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module: nn.Module) -> None: + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module: nn.Module) -> None: + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + warnings.warn('variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module: nn.Module) -> bool: + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module: nn.Module) -> None: + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping() -> Dict: + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + MaxPool2d: pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + ConvTranspose2d: deconv_flops_counter_hook, + } diff --git a/external/cv/mmcv/cnn/utils/fuse_conv_bn.py b/external/cv/mmcv/cnn/utils/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..07aab3bd21bf0d695272bc9faa55ddf91861e206 --- /dev/null +++ b/external/cv/mmcv/cnn/utils/fuse_conv_bn.py @@ -0,0 +1,64 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
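# A minimal sketch of the count that conv_flops_counter_hook above assigns to an
# nn.Conv2d, recomputed by hand; the layer and input sizes here are arbitrary
# assumptions, not values taken from the surrounding code.
import numpy as np
import torch.nn as nn

conv = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1, bias=True)
batch, out_h, out_w = 2, 64, 64  # output resolution for a 2 x 16 x 64 x 64 input
per_position = int(np.prod(conv.kernel_size)) * conv.in_channels * (conv.out_channels // conv.groups)
active_positions = batch * out_h * out_w
total = per_position * active_positions + conv.out_channels * active_positions  # conv + bias terms
print(total)  # the value the hook would add to conv.__flops__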
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + + +def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: + """Fuse conv and bn into one module. + + Args: + conv (nn.Module): Conv to be fused. + bn (nn.Module): BN to be fused. + + Returns: + nn.Module: Fused module. + """ + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_conv_bn(module: nn.Module) -> nn.Module: + """Recursively fuse conv and bn in a module. + + During inference, the functionary of batch norm layers is turned off + but only the mean and var alone channels are used, which exposes the + chance to fuse it with the preceding conv layers to save computations and + simplify network structures. + + Args: + module (nn.Module): Module to be fused. + + Returns: + nn.Module: Fused module. + """ + last_conv = None + last_conv_name = None + + for name, child in module.named_children(): + if isinstance(child, + (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = _fuse_conv_bn(last_conv, child) + module._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. + module._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_conv_bn(child) + return module diff --git a/external/cv/mmcv/cnn/vgg.py b/external/cv/mmcv/cnn/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..013ff815de3c6f8a29207e752fd8a1e7ac14abc2 --- /dev/null +++ b/external/cv/mmcv/cnn/vgg.py @@ -0,0 +1,181 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import List, Optional, Sequence, Tuple, Union + +import torch.nn as nn +from mmengine.model import constant_init, kaiming_init, normal_init +from mmengine.runner import load_checkpoint +from torch import Tensor + + +def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + padding=dilation, + dilation=dilation) + + +def make_vgg_layer(inplanes: int, + planes: int, + num_blocks: int, + dilation: int = 1, + with_bn: bool = False, + ceil_mode: bool = False) -> List[nn.Module]: + layers = [] + for _ in range(num_blocks): + layers.append(conv3x3(inplanes, planes, dilation)) + if with_bn: + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + inplanes = planes + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +class VGG(nn.Module): + """VGG backbone. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_bn (bool): Use BatchNorm or not. + num_classes (int): number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. 
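# A minimal sketch, assuming the _fuse_conv_bn/fuse_conv_bn helpers above are
# importable from this module: folding BN into the preceding conv should leave
# the eval-mode output unchanged (up to floating-point error).
import copy

import torch
import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU()).eval()
x = torch.randn(1, 3, 16, 16)
fused = fuse_conv_bn(copy.deepcopy(net))  # the BN child is replaced by nn.Identity
assert torch.allclose(net(x), fused(x), atol=1e-5)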
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. + """ + + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth: int, + with_bn: bool = False, + num_classes: int = -1, + num_stages: int = 5, + dilations: Sequence[int] = (1, 1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3, 4), + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + ceil_mode: bool = False, + with_last_pool: bool = True): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + assert max(out_indices) <= num_stages + + self.num_classes = num_classes + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + + self.inplanes = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks * (2 + with_bn) + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + planes = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.inplanes, + planes, + num_blocks, + dilation=dilation, + with_bn=with_bn, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.inplanes = planes + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]: + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode: bool = True) -> None: + super().train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + vgg_layers = getattr(self, self.module_name) + if mode and self.frozen_stages >= 0: + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + 
mod = vgg_layers[j] + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/external/cv/mmcv/image/__init__.py b/external/cv/mmcv/image/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f946caa2750b1da93e6b5b8086f954912e89c06e --- /dev/null +++ b/external/cv/mmcv/image/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, + gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, + rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) +from .geometric import (cutout, imcrop, imflip, imflip_, impad, + impad_to_multiple, imrescale, imresize, imresize_like, + imresize_to_multiple, imrotate, imshear, imtranslate, + rescale_size) +from .io import imfrombytes, imread, imwrite, supported_backends, use_backend +from .misc import tensor2imgs +from .photometric import (adjust_brightness, adjust_color, adjust_contrast, + adjust_hue, adjust_lighting, adjust_sharpness, + auto_contrast, clahe, imdenormalize, imequalize, + iminvert, imnormalize, imnormalize_, lut_transform, + posterize, solarize) + +__all__ = [ + 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', + 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', + 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', + 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', + 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', + 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', + 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', + 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', + 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting', + 'adjust_hue' +] diff --git a/external/cv/mmcv/image/colorspace.py b/external/cv/mmcv/image/colorspace.py new file mode 100644 index 0000000000000000000000000000000000000000..5ee1d2aab48d95782115245580876937e49cc6c0 --- /dev/null +++ b/external/cv/mmcv/image/colorspace.py @@ -0,0 +1,314 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Union + +import cv2 +import numpy as np + + +def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: + """Convert an image from the src colorspace to dst colorspace. + + Args: + img (ndarray): The input image. + src (str): The source colorspace, e.g., 'rgb', 'hsv'. + dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. + + Returns: + ndarray: The converted image. + """ + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + out_img = cv2.cvtColor(img, code) + return out_img + + +def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: + """Convert a BGR image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. 
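# A minimal usage sketch, assuming the VGG backbone defined in vgg.py above is
# importable; out_indices selects which of the five stages are returned as
# feature maps, and the input resolution below is an arbitrary choice.
import torch

backbone = VGG(depth=16, out_indices=(2, 3, 4))
backbone.init_weights()  # random init, since no pretrained checkpoint is given
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
print([f.shape for f in feats])  # (1, 256, 28, 28), (1, 512, 14, 14), (1, 512, 7, 7)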
+ """ + out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: + """Convert a RGB image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def gray2bgr(img: np.ndarray) -> np.ndarray: + """Convert a grayscale image to BGR image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted BGR image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + return out_img + + +def gray2rgb(img: np.ndarray) -> np.ndarray: + """Convert a grayscale image to RGB image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted RGB image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) + return out_img + + +def _convert_input_type_range(img: np.ndarray) -> np.ndarray: + """Convert the type and range of the input image. + + It converts the input image to np.float32 type and range of [0, 1]. + It is mainly used for pre-processing the input image in colorspace + conversion functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + (ndarray): The converted image with type of np.float32 and range of + [0, 1]. + """ + img_type = img.dtype + img = img.astype(np.float32) + if img_type == np.float32: + pass + elif img_type == np.uint8: + img /= 255. + else: + raise TypeError('The img type should be np.float32 or np.uint8, ' + f'but got {img_type}') + return img + + +def _convert_output_type_range( + img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray: + """Convert the type and range of the image according to dst_type. + + It converts the image to desired type and range. If `dst_type` is np.uint8, + images will be converted to np.uint8 type with range [0, 255]. If + `dst_type` is np.float32, it converts the image to np.float32 type with + range [0, 1]. + It is mainly used for post-processing images in colorspace conversion + functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The image to be converted with np.float32 type and + range [0, 255]. + dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it + converts the image to np.uint8 type with range [0, 255]. If + dst_type is np.float32, it converts the image to np.float32 type + with range [0, 1]. + + Returns: + (ndarray): The converted image with desired type and range. + """ + if dst_type not in (np.uint8, np.float32): + raise TypeError('The dst_type should be np.float32 or np.uint8, ' + f'but got {dst_type}') + if dst_type == np.uint8: + img = img.round() + else: + img /= 255. + return img.astype(dst_type) + + +def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: + """Convert a RGB image to YCbCr image. + + This function produces the same results as Matlab's `rgb2ycbcr` function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. 
+ + It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 + else: + out_img = np.matmul( + img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], + [24.966, 112.0, -18.214]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: + """Convert a BGR image to YCbCr image. + + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2rgb(img: np.ndarray) -> np.ndarray: + """Convert a YCbCr image to RGB image. + + This function produces the same results as Matlab's ycbcr2rgb function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted RGB image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0, -0.00153632, 0.00791071], + [0.00625893, -0.00318811, 0]]) * 255.0 + [ + -222.921, 135.576, -276.836 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2bgr(img: np.ndarray) -> np.ndarray: + """Convert a YCbCr image to BGR image. + + The bgr version of ycbcr2rgb. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. + In OpenCV, it implements a JPEG conversion. 
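# A quick numeric check of the BT.601 matrices above: in studio-swing YCbCr,
# pure black maps to Y = 16 and pure white to Y = 235.
import numpy as np

white = np.full((1, 1, 3), 255, dtype=np.uint8)
black = np.zeros((1, 1, 3), dtype=np.uint8)
print(rgb2ycbcr(white, y_only=True))  # [[235]]
print(rgb2ycbcr(black, y_only=True))  # [[16]]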
See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted BGR image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0.00791071, -0.00153632, 0], + [0, -0.00318811, 0.00625893]]) * 255.0 + [ + -276.836, 135.576, -222.921 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def convert_color_factory(src: str, dst: str) -> Callable: + + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + + def convert_color(img: np.ndarray) -> np.ndarray: + out_img = cv2.cvtColor(img, code) + return out_img + + convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} + image. + + Args: + img (ndarray or str): The input image. + + Returns: + ndarray: The converted {dst.upper()} image. + """ + + return convert_color + + +bgr2rgb = convert_color_factory('bgr', 'rgb') + +rgb2bgr = convert_color_factory('rgb', 'bgr') + +bgr2hsv = convert_color_factory('bgr', 'hsv') + +hsv2bgr = convert_color_factory('hsv', 'bgr') + +bgr2hls = convert_color_factory('bgr', 'hls') + +hls2bgr = convert_color_factory('hls', 'bgr') diff --git a/external/cv/mmcv/image/geometric.py b/external/cv/mmcv/image/geometric.py new file mode 100644 index 0000000000000000000000000000000000000000..1fd1029e42148d4093b553bafa52616f186c55fa --- /dev/null +++ b/external/cv/mmcv/image/geometric.py @@ -0,0 +1,793 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numbers +from typing import List, Optional, Tuple, Union, no_type_check + +import cv2 +import numpy as np +from mmengine.utils import to_2tuple + +from .io import imread_backend + +try: + from PIL import Image +except ImportError: + Image = None + + +def _scale_size( + size: Tuple[int, int], + scale: Union[float, int, Tuple[float, float], Tuple[int, int]], +) -> Tuple[int, int]: + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | int | tuple(float) | tuple(int)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) + + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + +cv2_border_modes = { + 'constant': cv2.BORDER_CONSTANT, + 'replicate': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT, + 'wrap': cv2.BORDER_WRAP, + 'reflect_101': cv2.BORDER_REFLECT_101, + 'transparent': cv2.BORDER_TRANSPARENT, + 'isolated': cv2.BORDER_ISOLATED +} + +# Pillow >=v9.1.0 use a slightly different naming scheme for filters. +# Set pillow_interp_codes according to the naming scheme used. 
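# A tiny sketch of the rounding rule in _scale_size above, which rescale_size and
# imrescale later rely on: each side is scaled and rounded to the nearest integer
# via the +0.5 trick.
print(_scale_size((640, 480), 0.5))          # (320, 240)
print(_scale_size((640, 480), (0.75, 1.5)))  # (480, 720)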
+if Image is not None: + if hasattr(Image, 'Resampling'): + pillow_interp_codes = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } + else: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + +def imresize( + img: np.ndarray, + size: Tuple[int, int], + return_scale: bool = False, + interpolation: str = 'bilinear', + out: Optional[np.ndarray] = None, + backend: Optional[str] = None +) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + """Resize image to a given size. + + Args: + img (ndarray): The input image. + size (tuple[int]): Target size (w, h). + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +@no_type_check +def imresize_to_multiple( + img: np.ndarray, + divisor: Union[int, Tuple[int, int]], + size: Union[int, Tuple[int, int], None] = None, + scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int], + None] = None, + keep_ratio: bool = False, + return_scale: bool = False, + interpolation: str = 'bilinear', + out: Optional[np.ndarray] = None, + backend: Optional[str] = None +) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + """Resize image according to a given size or scale factor and then rounds + up the the resized or rescaled image size to the nearest value that can be + divided by the divisor. + + Args: + img (ndarray): The input image. + divisor (int | tuple): Resized image size will be a multiple of + divisor. If divisor is a tuple, divisor should be + (w_divisor, h_divisor). + size (None | int | tuple[int]): Target size (w, h). Default: None. + scale_factor (None | float | int | tuple[float] | tuple[int]): + Multiplier for spatial size. Should match input size if it is a + tuple and the 2D style is (w_scale_factor, h_scale_factor). + Default: None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Default: False. + return_scale (bool): Whether to return `w_scale` and `h_scale`. 
+ interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if size is not None and scale_factor is not None: + raise ValueError('only one of size or scale_factor should be defined') + elif size is None and scale_factor is None: + raise ValueError('one of size or scale_factor should be defined') + elif size is not None: + size = to_2tuple(size) + if keep_ratio: + size = rescale_size((w, h), size, return_scale=False) + else: + size = _scale_size((w, h), scale_factor) + + divisor = to_2tuple(divisor) + size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor)) + resized_img, w_scale, h_scale = imresize( + img, + size, + return_scale=True, + interpolation=interpolation, + out=out, + backend=backend) + if return_scale: + return resized_img, w_scale, h_scale + else: + return resized_img + + +def imresize_like( + img: np.ndarray, + dst_img: np.ndarray, + return_scale: bool = False, + interpolation: str = 'bilinear', + backend: Optional[str] = None +) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + """Resize image to the same size of a given image. + + Args: + img (ndarray): The input image. + dst_img (ndarray): The target image. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = dst_img.shape[:2] + return imresize(img, (w, h), return_scale, interpolation, backend=backend) + + +def rescale_size(old_size: tuple, + scale: Union[float, int, Tuple[int, int]], + return_scale: bool = False) -> tuple: + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | int | tuple[int]): The scaling factor or maximum size. + If it is a float number or an integer, then the image will be + rescaled by this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale( + img: np.ndarray, + scale: Union[float, int, Tuple[int, int]], + return_scale: bool = False, + interpolation: str = 'bilinear', + backend: Optional[str] = None +) -> Union[np.ndarray, Tuple[np.ndarray, float]]: + """Resize image while keeping the aspect ratio. 
+ + Args: + img (ndarray): The input image. + scale (float | int | tuple[int]): The scaling factor or maximum size. + If it is a float number or an integer, then the image will be + rescaled by this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: + """Flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image. + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return np.flip(img, axis=1) + elif direction == 'vertical': + return np.flip(img, axis=0) + else: + return np.flip(img, axis=(0, 1)) + + +def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def imrotate(img: np.ndarray, + angle: float, + center: Optional[Tuple[float, float]] = None, + scale: float = 1.0, + border_value: int = 0, + interpolation: str = 'bilinear', + auto_bound: bool = False, + border_mode: str = 'constant') -> np.ndarray: + """Rotate an image. + + Args: + img (np.ndarray): Image to be rotated. + angle (float): Rotation angle in degrees, positive values mean + clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. + scale (float): Isotropic scale factor. + border_value (int): Border value used in case of a constant border. + Defaults to 0. + interpolation (str): Same as :func:`resize`. + auto_bound (bool): Whether to adjust the image size to cover the whole + rotated image. + border_mode (str): Pixel extrapolation method. Defaults to 'constant'. + + Returns: + np.ndarray: The rotated image. 
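# A small check of the flip directions handled above: 'horizontal' flips along
# the width axis, 'vertical' along the height axis, and 'diagonal' along both.
import numpy as np

img = np.arange(12).reshape(3, 4)
assert np.array_equal(imflip(img, 'horizontal'), img[:, ::-1])
assert np.array_equal(imflip(img, 'vertical'), img[::-1, :])
assert np.array_equal(imflip(img, 'diagonal'), img[::-1, ::-1])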
+ """ + if center is not None and auto_bound: + raise ValueError('`auto_bound` conflicts with `center`') + h, w = img.shape[:2] + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + assert isinstance(center, tuple) + + matrix = cv2.getRotationMatrix2D(center, -angle, scale) + if auto_bound: + cos = np.abs(matrix[0, 0]) + sin = np.abs(matrix[0, 1]) + new_w = h * sin + w * cos + new_h = h * cos + w * sin + matrix[0, 2] += (new_w - w) * 0.5 + matrix[1, 2] += (new_h - h) * 0.5 + w = int(np.round(new_w)) + h = int(np.round(new_h)) + rotated = cv2.warpAffine( + img, + matrix, (w, h), + flags=cv2_interp_codes[interpolation], + borderMode=cv2_border_modes[border_mode], + borderValue=border_value) + return rotated + + +def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray: + """Clip bboxes to fit the image shape. + + Args: + bboxes (ndarray): Shape (..., 4*k) + img_shape (tuple[int]): (height, width) of the image. + + Returns: + ndarray: Clipped bboxes. + """ + assert bboxes.shape[-1] % 4 == 0 + cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) + cmin[0::2] = img_shape[1] - 1 + cmin[1::2] = img_shape[0] - 1 + clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) + return clipped_bboxes + + +def bbox_scaling(bboxes: np.ndarray, + scale: float, + clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray: + """Scaling bboxes w.r.t the box center. + + Args: + bboxes (ndarray): Shape(..., 4). + scale (float): Scaling factor. + clip_shape (tuple[int], optional): If specified, bboxes that exceed the + boundary will be clipped according to the given shape (h, w). + + Returns: + ndarray: Scaled bboxes. + """ + if float(scale) == 1.0: + scaled_bboxes = bboxes.copy() + else: + w = bboxes[..., 2] - bboxes[..., 0] + 1 + h = bboxes[..., 3] - bboxes[..., 1] + 1 + dw = (w * (scale - 1)) * 0.5 + dh = (h * (scale - 1)) * 0.5 + scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) + if clip_shape is not None: + return bbox_clip(scaled_bboxes, clip_shape) + else: + return scaled_bboxes + + +def imcrop( + img: np.ndarray, + bboxes: np.ndarray, + scale: float = 1.0, + pad_fill: Union[float, list, None] = None +) -> Union[np.ndarray, List[np.ndarray]]: + """Crop image patches. + + 3 steps: scale the bboxes -> clip bboxes -> crop and pad. + + Args: + img (ndarray): Image to be cropped. + bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. + scale (float, optional): Scale ratio of bboxes, the default value + 1.0 means no scaling. + pad_fill (Number | list[Number]): Value to be filled for padding. + Default: None, which means no padding. + + Returns: + list[ndarray] | ndarray: The cropped image patches. + """ + chn = 1 if img.ndim == 2 else img.shape[2] + if pad_fill is not None: + if isinstance(pad_fill, (int, float)): + pad_fill = [pad_fill for _ in range(chn)] + assert len(pad_fill) == chn + + _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes + scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) + clipped_bbox = bbox_clip(scaled_bboxes, img.shape) + + patches = [] + for i in range(clipped_bbox.shape[0]): + x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) + if pad_fill is None: + patch = img[y1:y2 + 1, x1:x2 + 1, ...] 
+ else: + _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) + patch_h = _y2 - _y1 + 1 + patch_w = _x2 - _x1 + 1 + if chn == 1: + patch_shape = (patch_h, patch_w) + else: + patch_shape = (patch_h, patch_w, chn) # type: ignore + patch = np.array( + pad_fill, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + x_start = 0 if _x1 >= 0 else -_x1 + y_start = 0 if _y1 >= 0 else -_y1 + w = x2 - x1 + 1 + h = y2 - y1 + 1 + patch[y_start:y_start + h, x_start:x_start + w, + ...] = img[y1:y1 + h, x1:x1 + w, ...] + patches.append(patch) + + if bboxes.ndim == 1: + return patches[0] + else: + return patches + + +def impad(img: np.ndarray, + *, + shape: Optional[Tuple[int, int]] = None, + padding: Union[int, tuple, None] = None, + pad_val: Union[float, List] = 0, + padding_mode: str = 'constant') -> np.ndarray: + """Pad the given image to a certain shape or pad on all sides with + specified padding mode and padding value. + + Args: + img (ndarray): Image to be padded. + shape (tuple[int]): Expected padding shape (h, w). Default: None. + padding (int or tuple[int]): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 is + provided this is the padding on left/right and top/bottom + respectively. If a tuple of length 4 is provided this is the + padding for the left, top, right and bottom borders respectively. + Default: None. Note that `shape` and `padding` can not be both + set. + pad_val (Number | Sequence[Number]): Values to be filled in padding + areas when padding_mode is 'constant'. Default: 0. + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Default: constant. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + ndarray: The padded image. + """ + + assert (shape is not None) ^ (padding is not None) + if shape is not None: + width = max(shape[1] - img.shape[1], 0) + height = max(shape[0] - img.shape[0], 0) + padding = (0, 0, width, height) + + # check pad_val + if isinstance(pad_val, tuple): + assert len(pad_val) == img.shape[-1] + elif not isinstance(pad_val, numbers.Number): + raise TypeError('pad_val must be a int or a tuple. ' + f'But received {type(pad_val)}') + + # check padding + if isinstance(padding, tuple) and len(padding) in [2, 4]: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + elif isinstance(padding, numbers.Number): + padding = (padding, padding, padding, padding) + else: + raise ValueError('Padding must be a int or a 2, or 4 element tuple.' 
+ f'But received {padding}') + + # check padding mode + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + + border_type = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT + } + img = cv2.copyMakeBorder( + img, + padding[1], + padding[3], + padding[0], + padding[2], + border_type[padding_mode], + value=pad_val) + + return img + + +def impad_to_multiple(img: np.ndarray, + divisor: int, + pad_val: Union[float, List] = 0) -> np.ndarray: + """Pad an image to ensure each edge to be multiple to some number. + + Args: + img (ndarray): Image to be padded. + divisor (int): Padded image edges will be multiple to divisor. + pad_val (Number | Sequence[Number]): Same as :func:`impad`. + + Returns: + ndarray: The padded image. + """ + pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor + pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor + return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) + + +def cutout(img: np.ndarray, + shape: Union[int, Tuple[int, int]], + pad_val: Union[int, float, tuple] = 0) -> np.ndarray: + """Randomly cut out a rectangle from the original img. + + Args: + img (ndarray): Image to be cutout. + shape (int | tuple[int]): Expected cutout shape (h, w). If given as a + int, the value will be used for both h and w. + pad_val (int | float | tuple[int | float]): Values to be filled in the + cut area. Defaults to 0. + + Returns: + ndarray: The cutout image. + """ + + channels = 1 if img.ndim == 2 else img.shape[2] + if isinstance(shape, int): + cut_h, cut_w = shape, shape + else: + assert isinstance(shape, tuple) and len(shape) == 2, \ + f'shape must be a int or a tuple with length 2, but got type ' \ + f'{type(shape)} instead.' + cut_h, cut_w = shape + if isinstance(pad_val, (int, float)): + pad_val = tuple([pad_val] * channels) + elif isinstance(pad_val, tuple): + assert len(pad_val) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(pad_val), channels) + else: + raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') + + img_h, img_w = img.shape[:2] + y0 = np.random.uniform(img_h) + x0 = np.random.uniform(img_w) + + y1 = int(max(0, y0 - cut_h / 2.)) + x1 = int(max(0, x0 - cut_w / 2.)) + y2 = min(img_h, y1 + cut_h) + x2 = min(img_w, x1 + cut_w) + + if img.ndim == 2: + patch_shape = (y2 - y1, x2 - x1) + else: + patch_shape = (y2 - y1, x2 - x1, channels) # type: ignore + + img_cutout = img.copy() + patch = np.array( + pad_val, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + img_cutout[y1:y2, x1:x2, ...] = patch + + return img_cutout + + +def _get_shear_matrix(magnitude: Union[int, float], + direction: str = 'horizontal') -> np.ndarray: + """Generate the shear matrix for transformation. + + Args: + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + + Returns: + ndarray: The shear matrix with dtype float32. + """ + if direction == 'horizontal': + shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) + elif direction == 'vertical': + shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) + return shear_matrix + + +def imshear(img: np.ndarray, + magnitude: Union[int, float], + direction: str = 'horizontal', + border_value: Union[int, Tuple[int, int]] = 0, + interpolation: str = 'bilinear') -> np.ndarray: + """Shear an image. 
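# A quick sketch of impad_to_multiple above: height and width are padded (bottom
# and right, via impad's `shape` mode) up to the next multiple of the divisor.
import numpy as np

img = np.zeros((37, 50, 3), dtype=np.uint8)
print(impad_to_multiple(img, divisor=32).shape)  # (64, 64, 3)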
+ + Args: + img (ndarray): Image to be sheared with format (h, w) + or (h, w, c). + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The sheared image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) # type: ignore + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`') + shear_matrix = _get_shear_matrix(magnitude, direction) + sheared = cv2.warpAffine( + img, + shear_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. shearing masks whose channels large + # than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], # type: ignore + flags=cv2_interp_codes[interpolation]) + return sheared + + +def _get_translate_matrix(offset: Union[int, float], + direction: str = 'horizontal') -> np.ndarray: + """Generate the translate matrix. + + Args: + offset (int | float): The offset used for translate. + direction (str): The translate direction, either + "horizontal" or "vertical". + + Returns: + ndarray: The translate matrix with dtype float32. + """ + if direction == 'horizontal': + translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) + elif direction == 'vertical': + translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) + return translate_matrix + + +def imtranslate(img: np.ndarray, + offset: Union[int, float], + direction: str = 'horizontal', + border_value: Union[int, tuple] = 0, + interpolation: str = 'bilinear') -> np.ndarray: + """Translate an image. + + Args: + img (ndarray): Image to be translated with format + (h, w) or (h, w, c). + offset (int | float): The offset used for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The translated image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`.') + translate_matrix = _get_translate_matrix(offset, direction) + translated = cv2.warpAffine( + img, + translate_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. 
translating masks whose channels + # large than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return translated diff --git a/external/cv/mmcv/image/io.py b/external/cv/mmcv/image/io.py new file mode 100644 index 0000000000000000000000000000000000000000..a69eb5c6940b96d6a839572100f3caf8f3375587 --- /dev/null +++ b/external/cv/mmcv/image/io.py @@ -0,0 +1,369 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import io +import os.path as osp +import warnings +from pathlib import Path +from typing import Optional, Union + +import cv2 +import mmengine.fileio as fileio +import numpy as np +from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, + IMREAD_UNCHANGED) +from mmengine.utils import is_filepath, is_str + +try: + from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG +except ImportError: + TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None + +try: + from PIL import Image, ImageOps +except ImportError: + Image = None + +try: + import tifffile +except ImportError: + tifffile = None + +jpeg = None +supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] + +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED, + 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, + 'grayscale_ignore_orientation': + IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE +} + +imread_backend = 'cv2' + + +def use_backend(backend: str) -> None: + """Select a backend for image decoding. + + Args: + backend (str): The image decoding backend type. Options are `cv2`, + `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) + and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` + file format. + """ + assert backend in supported_backends + global imread_backend + imread_backend = backend + if imread_backend == 'turbojpeg': + if TurboJPEG is None: + raise ImportError('`PyTurboJPEG` is not installed') + global jpeg + if jpeg is None: + jpeg = TurboJPEG() + elif imread_backend == 'pillow': + if Image is None: + raise ImportError('`Pillow` is not installed') + elif imread_backend == 'tifffile': + if tifffile is None: + raise ImportError('`tifffile` is not installed') + + +def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'): + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'color': + if channel_order == 'bgr': + return TJPF_BGR + elif channel_order == 'rgb': + return TJCS_RGB + elif flag == 'grayscale': + return TJPF_GRAY + else: + raise ValueError('flag must be "color" or "grayscale"') + + +def _pillow2array(img, + flag: str = 'color', + channel_order: str = 'bgr') -> np.ndarray: + """Convert a pillow image to numpy array. + + Args: + img (:obj:`PIL.Image.Image`): The image loaded using PIL + flag (str): Flags specifying the color type of a loaded image, + candidates are 'color', 'grayscale' and 'unchanged'. + Default to 'color'. + channel_order (str): The channel order of the output image array, + candidates are 'bgr' and 'rgb'. Default to 'bgr'. 
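# A minimal sketch of switching the global decoding backend declared above;
# 'pillow' only needs PIL installed, while 'turbojpeg' additionally requires
# PyTurboJPEG and only handles JPEG files.
use_backend('pillow')  # subsequent imread/imfrombytes calls decode via PIL
use_backend('cv2')     # restore the default backend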
+ + Returns: + np.ndarray: The converted numpy array + """ + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # Handle exif orientation tag + if flag in ['color', 'grayscale']: + img = ImageOps.exif_transpose(img) + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. + # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag in ['color', 'color_ignore_orientation']: + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag in ['grayscale', 'grayscale_ignore_orientation']: + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale", "unchanged", ' + f'"color_ignore_orientation" or "grayscale_ignore_orientation"' + f' but got {flag}') + return array + + +def imread(img_or_path: Union[np.ndarray, str, Path], + flag: str = 'color', + channel_order: str = 'bgr', + backend: Optional[str] = None, + file_client_args: Optional[dict] = None, + *, + backend_args: Optional[dict] = None) -> np.ndarray: + """Read an image. + + Args: + img_or_path (ndarray or str or Path): Either a numpy array or str or + pathlib.Path. If it is a numpy array (loaded image), then + it will be returned as is. + flag (str): Flags specifying the color type of a loaded image, + candidates are `color`, `grayscale`, `unchanged`, + `color_ignore_orientation` and `grayscale_ignore_orientation`. + By default, `cv2` and `pillow` backend would rotate the image + according to its EXIF info unless called with `unchanged` or + `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend + always ignore image's EXIF info regardless of the flag. + The `turbojpeg` backend only supports `color` and `grayscale`. + channel_order (str): Order of channel, candidates are `bgr` and `rgb`. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. + If backend is None, the global imread_backend specified by + ``mmcv.use_backend()`` will be used. Default: None. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Default: None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + + Returns: + ndarray: Loaded image array. 
+ + Examples: + >>> import mmcv + >>> img_path = '/path/to/img.jpg' + >>> img = mmcv.imread(img_path) + >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb', + ... backend='cv2') + >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr', + ... backend='pillow') + >>> s3_img_path = 's3://bucket/img.jpg' + >>> # infer the file backend by the prefix s3 + >>> img = mmcv.imread(s3_img_path) + >>> # manually set the file backend petrel + >>> img = mmcv.imread(s3_img_path, backend_args={ + ... 'backend': 'petrel'}) + >>> http_img_path = 'http://path/to/img.jpg' + >>> img = mmcv.imread(http_img_path) + >>> img = mmcv.imread(http_img_path, backend_args={ + ... 'backend': 'http'}) + """ + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + + if isinstance(img_or_path, Path): + img_or_path = str(img_or_path) + + if isinstance(img_or_path, np.ndarray): + return img_or_path + elif is_str(img_or_path): + if file_client_args is not None: + file_client = fileio.FileClient.infer_client( + file_client_args, img_or_path) + img_bytes = file_client.get(img_or_path) + else: + img_bytes = fileio.get(img_or_path, backend_args=backend_args) + return imfrombytes(img_bytes, flag, channel_order, backend) + else: + raise TypeError('"img" must be a numpy array or a str or ' + 'a pathlib.Path object') + + +def imfrombytes(content: bytes, + flag: str = 'color', + channel_order: str = 'bgr', + backend: Optional[str] = None) -> np.ndarray: + """Read an image from bytes. + + Args: + content (bytes): Image bytes got from files or other streams. + flag (str): Same as :func:`imread`. + channel_order (str): The channel order of the output, candidates + are 'bgr' and 'rgb'. Default to 'bgr'. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is + None, the global imread_backend specified by ``mmcv.use_backend()`` + will be used. Default: None. + + Returns: + ndarray: Loaded image array. + + Examples: + >>> img_path = '/path/to/img.jpg' + >>> with open(img_path, 'rb') as f: + >>> img_buff = f.read() + >>> img = mmcv.imfrombytes(img_buff) + >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb') + >>> img = mmcv.imfrombytes(img_buff, backend='pillow') + >>> img = mmcv.imfrombytes(img_buff, backend='cv2') + """ + + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError( + f'backend: {backend} is not supported. 
Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'") + if backend == 'turbojpeg': + img = jpeg.decode( # type: ignore + content, _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + with io.BytesIO(content) as buff: + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + with io.BytesIO(content) as buff: + img = tifffile.imread(buff) + return img + else: + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + +def imwrite(img: np.ndarray, + file_path: str, + params: Optional[list] = None, + auto_mkdir: Optional[bool] = None, + file_client_args: Optional[dict] = None, + *, + backend_args: Optional[dict] = None) -> bool: + """Write image to file. + + Warning: + The parameter `auto_mkdir` will be deprecated in the future and every + file clients will make directory automatically. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. It will be deprecated. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Default: None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + + Returns: + bool: Successful or not. + + Examples: + >>> # write to hard disk client + >>> ret = mmcv.imwrite(img, '/path/to/img.jpg') + >>> # infer the file backend by the prefix s3 + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg') + >>> # manually set the file backend petrel + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={ + ... 'backend': 'petrel'}) + """ + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + + assert is_filepath(file_path) + file_path = str(file_path) + if auto_mkdir is not None: + warnings.warn( + 'The parameter `auto_mkdir` will be deprecated in the future and ' + 'every file clients will make directory automatically.') + + img_ext = osp.splitext(file_path)[-1] + # Encode image according to image suffix. + # For example, if image path is '/path/your/img.jpg', the encode + # format is '.jpg'. 
+ flag, img_buff = cv2.imencode(img_ext, img, params) + + if file_client_args is not None: + file_client = fileio.FileClient.infer_client(file_client_args, + file_path) + file_client.put(img_buff.tobytes(), file_path) + else: + fileio.put(img_buff.tobytes(), file_path, backend_args=backend_args) + + return flag diff --git a/external/cv/mmcv/image/misc.py b/external/cv/mmcv/image/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..1c8d5bfb16028ed03cc60be344c5202390c29f83 --- /dev/null +++ b/external/cv/mmcv/image/misc.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional + +import numpy as np + +import mmcv + +try: + import torch +except ImportError: + torch = None + + +def tensor2imgs(tensor, + mean: Optional[tuple] = None, + std: Optional[tuple] = None, + to_rgb: bool = True) -> list: + """Convert tensor to 3-channel images or 1-channel gray images. + + Args: + tensor (torch.Tensor): Tensor that contains multiple images, shape ( + N, C, H, W). :math:`C` can be either 3 or 1. + mean (tuple[float], optional): Mean of images. If None, + (0, 0, 0) will be used for tensor with 3-channel, + while (0, ) for tensor with 1-channel. Defaults to None. + std (tuple[float], optional): Standard deviation of images. If None, + (1, 1, 1) will be used for tensor with 3-channel, + while (1, ) for tensor with 1-channel. Defaults to None. + to_rgb (bool, optional): Whether the tensor was converted to RGB + format in the first place. If so, convert it back to BGR. + For the tensor with 1 channel, it must be False. Defaults to True. + + Returns: + list[np.ndarray]: A list that contains multiple images. + """ + + if torch is None: + raise RuntimeError('pytorch is not installed') + assert torch.is_tensor(tensor) and tensor.ndim == 4 + channels = tensor.size(1) + assert channels in [1, 3] + if mean is None: + mean = (0, ) * channels + if std is None: + std = (1, ) * channels + assert (channels == len(mean) == len(std) == 3) or \ + (channels == len(mean) == len(std) == 1 and not to_rgb) + + num_imgs = tensor.size(0) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + imgs = [] + for img_id in range(num_imgs): + img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) + img = mmcv.imdenormalize( + img, mean, std, to_bgr=to_rgb).astype(np.uint8) + imgs.append(np.ascontiguousarray(img)) + return imgs diff --git a/external/cv/mmcv/image/photometric.py b/external/cv/mmcv/image/photometric.py new file mode 100644 index 0000000000000000000000000000000000000000..3b8c231f1ef403f60918f8bdd0e980e4c9e3dcec --- /dev/null +++ b/external/cv/mmcv/image/photometric.py @@ -0,0 +1,566 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import warnings +from typing import Optional + +import cv2 +import numpy as np +from mmengine.utils import is_tuple_of +from PIL import Image, ImageEnhance + +from .colorspace import bgr2gray, gray2bgr +from .io import imread_backend + + +def imnormalize(img, mean, std, to_rgb=True): + """Normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. 
+ to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + img = img.copy().astype(np.float32) + return imnormalize_(img, mean, std, to_rgb) + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +def imdenormalize(img, mean, std, to_bgr=True): + assert img.dtype != np.uint8 + mean = mean.reshape(1, -1).astype(np.float64) + std = std.reshape(1, -1).astype(np.float64) + img = cv2.multiply(img, std) # make a copy + cv2.add(img, mean, img) # inplace + if to_bgr: + cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace + return img + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +def solarize(img, thr=128): + """Solarize an image (invert all pixel values above a threshold) + + Args: + img (ndarray): Image to be solarized. + thr (int): Threshold for solarizing (0 - 255). + + Returns: + ndarray: The solarized image. + """ + img = np.where(img < thr, img, 255 - img) + return img + + +def posterize(img, bits): + """Posterize an image (reduce the number of bits for each color channel) + + Args: + img (ndarray): Image to be posterized. + bits (int): Number of bits (1 to 8) to use for posterizing. + + Returns: + ndarray: The posterized image. + """ + shift = 8 - bits + img = np.left_shift(np.right_shift(img, shift), shift) + return img + + +def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None): + r"""It blends the source image and its gray image: + + .. math:: + output = img * alpha + gray\_img * beta + gamma + + Args: + img (ndarray): The input source image. + alpha (int | float): Weight for the source image. Default 1. + beta (int | float): Weight for the converted gray image. + If None, it's assigned the value (1 - `alpha`). + gamma (int | float): Scalar added to each sum. + Same as :func:`cv2.addWeighted`. Default 0. + backend (str | None): The image processing backend type. Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: Colored image which has the same size and dtype as input. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + warnings.warn("Only use 'alpha' for pillow backend.") + # Image.fromarray defaultly supports RGB, not BGR. 
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + enhancer = ImageEnhance.Color(pil_image) + pil_image = enhancer.enhance(alpha) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + gray_img = bgr2gray(img) + gray_img = np.tile(gray_img[..., None], [1, 1, 3]) + if beta is None: + beta = 1 - alpha + colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) + if not colored_img.dtype == np.uint8: + # Note when the dtype of `img` is not the default `np.uint8` + # (e.g. np.float32), the value in `colored_img` got from cv2 + # is not guaranteed to be in range [0, 255], so here clip + # is needed. + colored_img = np.clip(colored_img, 0, 255) + return colored_img.astype(img.dtype) + + +def imequalize(img): + """Equalize the image histogram. + + This function applies a non-linear mapping to the input image, + in order to create a uniform distribution of grayscale values + in the output image. + + Args: + img (ndarray): Image to be equalized. + + Returns: + ndarray: The equalized image. + """ + + def _scale_channel(im, c): + """Scale the data in the corresponding channel.""" + im = im[:, :, c] + # Compute the histogram of the image channel. + histo = np.histogram(im, 256, (0, 255))[0] + # For computing the step, filter out the nonzeros. + nonzero_histo = histo[histo > 0] + step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 + if not step: + lut = np.array(range(256)) + else: + # Compute the cumulative sum, shifted by step // 2 + # and then normalized by step. + lut = (np.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = np.concatenate([[0], lut[:-1]], 0) + # handle potential integer overflow + lut[lut > 255] = 255 + # If step is zero, return the original image. + # Otherwise, index from lut. + return np.where(np.equal(step, 0), im, lut[im]) + + # Scales each channel independently and then stacks + # the result. + s1 = _scale_channel(img, 0) + s2 = _scale_channel(img, 1) + s3 = _scale_channel(img, 2) + equalized_img = np.stack([s1, s2, s3], axis=-1) + return equalized_img.astype(img.dtype) + + +def adjust_brightness(img, factor=1., backend=None): + """Adjust image brightness. + + This function controls the brightness of an image. An + enhancement factor of 0.0 gives a black image. + A factor of 1.0 gives the original image. This function + blends the source image and the degenerated black image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be brightened. + factor (float): A value controls the enhancement. + Factor 1.0 returns the original image, lower + factors mean less color (brightness, contrast, + etc), and higher values more. Default 1. + backend (str | None): The image processing backend type. Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: The brightened image. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + # Image.fromarray defaultly supports RGB, not BGR. 
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + enhancer = ImageEnhance.Brightness(pil_image) + pil_image = enhancer.enhance(factor) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + degenerated = np.zeros_like(img) + # Note manually convert the dtype to np.float32, to + # achieve as close results as PIL.ImageEnhance.Brightness. + # Set beta=1-factor, and gamma=0 + brightened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + brightened_img = np.clip(brightened_img, 0, 255) + return brightened_img.astype(img.dtype) + + +def adjust_contrast(img, factor=1., backend=None): + """Adjust image contrast. + + This function controls the contrast of an image. An + enhancement factor of 0.0 gives a solid grey + image. A factor of 1.0 gives the original image. It + blends the source image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be contrasted. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + backend (str | None): The image processing backend type. Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: The contrasted image. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + # Image.fromarray defaultly supports RGB, not BGR. + pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + enhancer = ImageEnhance.Contrast(pil_image) + pil_image = enhancer.enhance(factor) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + gray_img = bgr2gray(img) + hist = np.histogram(gray_img, 256, (0, 255))[0] + mean = round(np.sum(gray_img) / np.sum(hist)) + degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) + degenerated = gray2bgr(degenerated) + contrasted_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + contrasted_img = np.clip(contrasted_img, 0, 255) + return contrasted_img.astype(img.dtype) + + +def auto_contrast(img, cutoff=0): + """Auto adjust image contrast. + + This function maximize (normalize) image contrast by first removing cutoff + percent of the lightest and darkest pixels from the histogram and remapping + the image so that the darkest pixel becomes black (0), and the lightest + becomes white (255). + + Args: + img (ndarray): Image to be contrasted. BGR order. + cutoff (int | float | tuple): The cutoff percent of the lightest and + darkest pixels to be removed. If given as tuple, it shall be + (low, high). Otherwise, the single value will be used for both. + Defaults to 0. + + Returns: + ndarray: The contrasted image. + """ + + def _auto_contrast_channel(im, c, cutoff): + im = im[:, :, c] + # Compute the histogram of the image channel. 
+ histo = np.histogram(im, 256, (0, 255))[0] + # Remove cut-off percent pixels from histo + histo_sum = np.cumsum(histo) + cut_low = histo_sum[-1] * cutoff[0] // 100 + cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 + histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low + histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) + + # Compute mapping + low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] + # If all the values have been cut off, return the origin img + if low >= high: + return im + scale = 255.0 / (high - low) + offset = -low * scale + lut = np.array(range(256)) + lut = lut * scale + offset + lut = np.clip(lut, 0, 255) + return lut[im] + + if isinstance(cutoff, (int, float)): + cutoff = (cutoff, cutoff) + else: + assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ + f'float or tuple, but got {type(cutoff)} instead.' + # Auto adjusts contrast for each channel independently and then stacks + # the result. + s1 = _auto_contrast_channel(img, 0, cutoff) + s2 = _auto_contrast_channel(img, 1, cutoff) + s3 = _auto_contrast_channel(img, 2, cutoff) + contrasted_img = np.stack([s1, s2, s3], axis=-1) + return contrasted_img.astype(img.dtype) + + +def adjust_sharpness(img, factor=1., kernel=None): + """Adjust image sharpness. + + This function controls the sharpness of an image. An + enhancement factor of 0.0 gives a blurred image. A + factor of 1.0 gives the original image. And a factor + of 2.0 gives a sharpened image. It blends the source + image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be sharpened. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + kernel (np.ndarray, optional): Filter kernel to be applied on the img + to obtain the degenerated img. Defaults to None. + + Note: + No value sanity check is enforced on the kernel set by users. So with + an inappropriate kernel, the ``adjust_sharpness`` may fail to perform + the function its name indicates but end up performing whatever + transform determined by the kernel. + + Returns: + ndarray: The sharpened image. + """ + + if kernel is None: + # adopted from PIL.ImageFilter.SMOOTH + kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 + assert isinstance(kernel, np.ndarray), \ + f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' + assert kernel.ndim == 2, \ + f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' + + degenerated = cv2.filter2D(img, -1, kernel) + sharpened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + sharpened_img = np.clip(sharpened_img, 0, 255) + return sharpened_img.astype(img.dtype) + + +def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): + """AlexNet-style PCA jitter. + + This data augmentation is proposed in `ImageNet Classification with Deep + Convolutional Neural Networks + `_. + + Args: + img (ndarray): Image to be adjusted lighting. BGR order. + eigval (ndarray): the eigenvalue of the convariance matrix of pixel + values, respectively. + eigvec (ndarray): the eigenvector of the convariance matrix of pixel + values, respectively. + alphastd (float): The standard deviation for distribution of alpha. + Defaults to 0.1 + to_rgb (bool): Whether to convert img to rgb. + + Returns: + ndarray: The adjusted image. 
+ """ + assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ + f'eigval and eigvec should both be of type np.ndarray, got ' \ + f'{type(eigval)} and {type(eigvec)} instead.' + + assert eigval.ndim == 1 and eigvec.ndim == 2 + assert eigvec.shape == (3, eigval.shape[0]) + n_eigval = eigval.shape[0] + assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ + f'got {type(alphastd)} instead.' + + img = img.copy().astype(np.float32) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + + alpha = np.random.normal(0, alphastd, n_eigval) + alter = eigvec \ + * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ + * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) + alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) + img_adjusted = img + alter + return img_adjusted + + +def lut_transform(img, lut_table): + """Transform array by look-up table. + + The function lut_transform fills the output array with values from the + look-up table. Indices of the entries are taken from the input array. + + Args: + img (ndarray): Image to be transformed. + lut_table (ndarray): look-up table of 256 elements; in case of + multi-channel input array, the table should either have a single + channel (in this case the same table is used for all channels) or + the same number of channels as in the input array. + + Returns: + ndarray: The transformed image. + """ + assert isinstance(img, np.ndarray) + assert 0 <= np.min(img) and np.max(img) <= 255 + assert isinstance(lut_table, np.ndarray) + assert lut_table.shape == (256, ) + + return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) + + +def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): + """Use CLAHE method to process the image. + + See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. + Graphics Gems, 1994:474-485.` for more information. + + Args: + img (ndarray): Image to be processed. + clip_limit (float): Threshold for contrast limiting. Default: 40.0. + tile_grid_size (tuple[int]): Size of grid for histogram equalization. + Input image will be divided into equally sized rectangular tiles. + It defines the number of tiles in row and column. Default: (8, 8). + + Returns: + ndarray: The processed image. + """ + assert isinstance(img, np.ndarray) + assert img.ndim == 2 + assert isinstance(clip_limit, (float, int)) + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + + clahe = cv2.createCLAHE(clip_limit, tile_grid_size) + return clahe.apply(np.array(img, dtype=np.uint8)) + + +def adjust_hue(img: np.ndarray, + hue_factor: float, + backend: Optional[str] = None) -> np.ndarray: + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and cyclically + shifting the intensities in the hue channel (H). The image is then + converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + Modified from + https://github.com/pytorch/vision/blob/main/torchvision/ + transforms/functional.py + + Args: + img (ndarray): Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + backend (str | None): The image processing backend type. 
Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: Hue adjusted image. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].') + if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})): + raise TypeError('img should be ndarray with dim=[2 or 3].') + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + # Image.fromarray defaultly supports RGB, not BGR. + pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + input_mode = pil_image.mode + if input_mode in {'L', '1', 'I', 'F'}: + return pil_image + + h, s, v = pil_image.convert('HSV').split() + + np_h = np.array(h, dtype=np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + np_h += np.uint8(hue_factor * 255) + h = Image.fromarray(np_h, 'L') + + pil_image = Image.merge('HSV', (h, s, v)).convert(input_mode) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + dtype = img.dtype + img = img.astype(np.uint8) + hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) + h, s, v = cv2.split(hsv_img) + h = h.astype(np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + h += np.uint8(hue_factor * 255) + hsv_img = cv2.merge([h, s, v]) + return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) diff --git a/external/cv/mmcv/ops/__init__.py b/external/cv/mmcv/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0a772151cff3454c03da2659d62f621151d03f88 --- /dev/null +++ b/external/cv/mmcv/ops/__init__.py @@ -0,0 +1,123 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
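Before the ops package below, a minimal usage sketch for the photometric helpers defined above (`adjust_brightness`, `adjust_contrast`, `adjust_hue`). The import path is an assumption based on this file's location in the vendored copy; the random BGR image is purely illustrative.

```
import numpy as np
from mmcv.image import adjust_brightness, adjust_contrast, adjust_hue  # assumed import path

img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)  # BGR, uint8

bright = adjust_brightness(img, factor=1.2)   # >1.0 brightens, 0.0 gives a black image
flat = adjust_contrast(img, factor=0.5)       # 0.0 gives a solid grey image
shifted = adjust_hue(img, hue_factor=0.25)    # shift hue; +/-0.5 give complementary colors

assert shifted.shape == img.shape and shifted.dtype == img.dtype
```

The cv2 backend is used by default; passing `backend='pillow'` routes the call through PIL's `ImageEnhance` instead, as the branches above show.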
+ +from mmcv.utils import IS_MLU_AVAILABLE +from .active_rotated_filter import active_rotated_filter +from .assign_score_withk import assign_score_withk +from .ball_query import ball_query +from .bbox import bbox_overlaps +from .bezier_align import BezierAlign, bezier_align +from .bias_act import bias_act +from .border_align import BorderAlign, border_align +from .box_iou_quadri import box_iou_quadri +from .box_iou_rotated import box_iou_rotated +from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive +from .cc_attention import CrissCrossAttention +from .chamfer_distance import chamfer_distance +from .contour_expand import contour_expand +from .conv2d_gradfix import conv2d, conv_transpose2d +from .convex_iou import convex_giou, convex_iou +from .corner_pool import CornerPool +from .correlation import Correlation +from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d +from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, + ModulatedDeformRoIPoolPack, deform_roi_pool) +from .deprecated_wrappers import Conv2d_deprecated as Conv2d +from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d +from .deprecated_wrappers import Linear_deprecated as Linear +from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d +from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d +from .filtered_lrelu import filtered_lrelu +from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, + sigmoid_focal_loss, softmax_focal_loss) +from .furthest_point_sample import (furthest_point_sample, + furthest_point_sample_with_dist) +from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu +from .gather_points import gather_points +from .group_points import GroupAll, QueryAndGroup, grouping_operation +from .info import get_compiler_version, get_compiling_cuda_version +from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d, + nms3d_normal, nms_bev, nms_normal_bev) +from .knn import knn +from .masked_conv import MaskedConv2d, masked_conv2d +from .min_area_polygons import min_area_polygons +from .modulated_deform_conv import (ModulatedDeformConv2d, + ModulatedDeformConv2dPack, + modulated_deform_conv2d) +from .multi_scale_deform_attn import MultiScaleDeformableAttention +from .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms +from .pixel_group import pixel_group +from .point_sample import (SimpleRoIAlign, point_sample, + rel_roi_point_to_rel_img_point) +from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, + points_in_boxes_part) +from .points_in_polygons import points_in_polygons +from .points_sampler import PointsSampler +from .prroi_pool import PrRoIPool, prroi_pool +from .psa_mask import PSAMask +from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated +from .roi_align import RoIAlign, roi_align +from .roi_align_rotated import RoIAlignRotated, roi_align_rotated +from .roi_pool import RoIPool, roi_pool +from .roiaware_pool3d import RoIAwarePool3d +from .roipoint_pool3d import RoIPointPool3d +from .rotated_feature_align import rotated_feature_align +from .saconv import SAConv2d +from .scatter_points import DynamicScatter, dynamic_scatter +from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SubMConv2d, SubMConv3d) +from .sparse_modules import SparseModule, SparseSequential +from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d +from .sparse_structure 
import SparseConvTensor, scatter_nd +from .sync_bn import SyncBatchNorm +from .three_interpolate import three_interpolate +from .three_nn import three_nn +from .tin_shift import TINShift, tin_shift +from .upfirdn2d import filter2d, upfirdn2d, upsample2d +from .voxelize import Voxelization, voxelization + +__all__ = [ + 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', + 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', + 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', + 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', + 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', + 'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d', + 'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', + 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', + 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', + 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', + 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', + 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', + 'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated', + 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', + 'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated', + 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', + 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', + 'contour_expand', 'three_nn', 'three_interpolate', + 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', + 'gather_points', 'furthest_point_sample', 'nms_quadri', + 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', + 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', + 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization', + 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', + 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', + 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', + 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', + 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', + 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', + 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', + 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', + 'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d', + 'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align' +] + +if IS_MLU_AVAILABLE: + from .deform_conv import DeformConv2dPack_MLU # noqa:F401 + from .modulated_deform_conv import \ + ModulatedDeformConv2dPack_MLU # noqa:F401 + __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU']) diff --git a/external/cv/mmcv/ops/active_rotated_filter.py b/external/cv/mmcv/ops/active_rotated_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..43e46ae50457892d2a6a8933649304e4b53a8a72 --- /dev/null +++ b/external/cv/mmcv/ops/active_rotated_filter.py @@ -0,0 +1,69 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
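The `__all__` list above (plus the MLU-only additions) is what the locally built `mmcv` wheel from `external/cv` exposes under `mmcv.ops`. As a quick smoke test for the compiled extensions, a sketch assuming that wheel is installed:

```
import torch
from mmcv.ops import nms  # requires the compiled _ext module

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])

dets, keep = nms(boxes, scores, iou_threshold=0.5)
# Heavily overlapping boxes collapse onto the higher-scoring one;
# `dets` holds the kept boxes with scores appended, `keep` their indices.
print(keep.tolist())  # e.g. [0, 2]
```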
+ +from typing import Tuple + +import torch +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', + ['active_rotated_filter_forward', 'active_rotated_filter_backward']) + + +class ActiveRotatedFilterFunction(Function): + """Encoding the orientation information and generating orientation- + sensitive features. + + The details are described in the paper `Align Deep Features for Oriented + Object Detection _`. + """ + + @staticmethod + def forward(ctx, input: torch.Tensor, + indices: torch.Tensor) -> torch.Tensor: + """ + Args: + input (torch.Tensor): Input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. + indices (torch.Tensor): Indices with shape + [num_orientations, H, W, num_rotations]. + + Returns: + torch.Tensor: Refined features with shape [num_output_planes * + num_rotations, num_input_planes * num_orientations, H, W]. + """ + ctx.save_for_backward(input, indices) + op, ip, o, h, w = input.size() + o, h, w, r = indices.size() + output = input.new_zeros((op * r, ip * o, h, w)) + ext_module.active_rotated_filter_forward(input, indices, output) + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]: + """ + Args: + grad_output (torch.Tensor): The gradient of output features + with shape [num_output_planes * num_rotations, + num_input_planes * num_orientations, H, W]. + + Returns: + torch.Tensor: The gradient of input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. + """ + input, indices = ctx.saved_tensors + grad_in = torch.zeros_like(input) + ext_module.active_rotated_filter_backward(grad_out, indices, grad_in) + return grad_in, None + + +active_rotated_filter = ActiveRotatedFilterFunction.apply diff --git a/external/cv/mmcv/ops/assign_score_withk.py b/external/cv/mmcv/ops/assign_score_withk.py new file mode 100644 index 0000000000000000000000000000000000000000..5f18f5ec9918fd14e60d3efa6666c01ccb7a868c --- /dev/null +++ b/external/cv/mmcv/ops/assign_score_withk.py @@ -0,0 +1,137 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward']) + + +class AssignScoreWithK(Function): + r"""Perform weighted sum to generate output features according to scores. + Modified from `PAConv `_. + + This is a memory-efficient CUDA implementation of assign_scores operation, + which first transform all point features with weight bank, then assemble + neighbor features with ``knn_idx`` and perform weighted sum of ``scores``. + + See the `paper `_ appendix Sec. D for + more detailed descriptions. + + Note: + This implementation assumes using ``neighbor`` kernel input, which is + (point_features - center_features, point_features). + See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/ + pointnet2/paconv.py#L128 for more details. 
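Every wrapper in this `ops` package follows the same structure as `ActiveRotatedFilterFunction` above: allocate the output, call into the compiled `_ext` module, save tensors for the backward pass, and mark the backward as `once_differentiable`. A self-contained toy version of that pattern, with the extension calls replaced by plain PyTorch so it runs anywhere (purely illustrative, not part of mmcv):

```
import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable


class ScaleBy2(Function):
    """Toy op mirroring the layout of the mmcv extension wrappers."""

    @staticmethod
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        out = x * 2                  # a real op calls ext_module.<op>_forward here
        ctx.save_for_backward(x)
        return out

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_out: torch.Tensor) -> torch.Tensor:
        (x,) = ctx.saved_tensors     # a real op calls ext_module.<op>_backward here
        return grad_out * 2


scale_by_2 = ScaleBy2.apply
y = scale_by_2(torch.ones(3, requires_grad=True))
y.sum().backward()                   # gradient of 2 * x w.r.t. x is 2
```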
+ """ + + @staticmethod + def forward(ctx, + scores: torch.Tensor, + point_features: torch.Tensor, + center_features: torch.Tensor, + knn_idx: torch.Tensor, + aggregate: str = 'sum') -> torch.Tensor: + """ + Args: + scores (torch.Tensor): (B, npoint, K, M), predicted scores to + aggregate weight matrices in the weight bank. + ``npoint`` is the number of sampled centers. + ``K`` is the number of queried neighbors. + ``M`` is the number of weight matrices in the weight bank. + point_features (torch.Tensor): (B, N, M, out_dim) + Pre-computed point features to be aggregated. + center_features (torch.Tensor): (B, N, M, out_dim) + Pre-computed center features to be aggregated. + knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN. + We assume the first idx in each row is the idx of the center. + aggregate (str, optional): Aggregation method. + Can be 'sum', 'avg' or 'max'. Defaults: 'sum'. + + Returns: + torch.Tensor: (B, out_dim, npoint, K), the aggregated features. + """ + agg = {'sum': 0, 'avg': 1, 'max': 2} + + B, N, M, out_dim = point_features.size() + _, npoint, K, _ = scores.size() + + output = point_features.new_zeros((B, out_dim, npoint, K)) + ext_module.assign_score_withk_forward( + point_features.contiguous(), + center_features.contiguous(), + scores.contiguous(), + knn_idx.contiguous(), + output, + B=B, + N0=N, + N1=npoint, + M=M, + K=K, + O=out_dim, + aggregate=agg[aggregate]) + + ctx.save_for_backward(output, point_features, center_features, scores, + knn_idx) + ctx.agg = agg[aggregate] + + return output + + @staticmethod + def backward( + ctx, grad_out: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]: + """ + Args: + grad_out (torch.Tensor): (B, out_dim, npoint, K) + + Returns: + tuple[torch.Tensor]: A tuple contains five elements. The first one + is the gradient of ``scores`` whose shape is (B, npoint, K, M). The + second is the gradient of ``point_features`` whose shape is + (B, N, M, out_dim). The third is the gradient of + ``center_features`` with the shape of (B, N, M, out_dim). The last + two are ``None``. + """ + _, point_features, center_features, scores, knn_idx = ctx.saved_tensors + + agg = ctx.agg + + B, N, M, out_dim = point_features.size() + _, npoint, K, _ = scores.size() + + grad_point_features = point_features.new_zeros(point_features.shape) + grad_center_features = center_features.new_zeros(center_features.shape) + grad_scores = scores.new_zeros(scores.shape) + + ext_module.assign_score_withk_backward( + grad_out.contiguous(), + point_features.contiguous(), + center_features.contiguous(), + scores.contiguous(), + knn_idx.contiguous(), + grad_point_features, + grad_center_features, + grad_scores, + B=B, + N0=N, + N1=npoint, + M=M, + K=K, + O=out_dim, + aggregate=agg) + + return grad_scores, grad_point_features, \ + grad_center_features, None, None + + +assign_score_withk = AssignScoreWithK.apply diff --git a/external/cv/mmcv/ops/ball_query.py b/external/cv/mmcv/ops/ball_query.py new file mode 100644 index 0000000000000000000000000000000000000000..f26c36b82f6eed6b6c61306eeba61c4f91c609b4 --- /dev/null +++ b/external/cv/mmcv/ops/ball_query.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional, Tuple + +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['ball_query_forward', 'stack_ball_query_forward']) + + +class BallQuery(Function): + """Find nearby points in spherical space.""" + + @staticmethod + def forward( + ctx, + min_radius: float, + max_radius: float, + sample_num: int, + xyz: torch.Tensor, + center_xyz: torch.Tensor, + xyz_batch_cnt: Optional[torch.Tensor] = None, + center_xyz_batch_cnt: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """ + Args: + min_radius (float): minimum radius of the balls. + max_radius (float): maximum radius of the balls. + sample_num (int): maximum number of features in the balls. + xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features, + or staked input (N1 + N2 ..., 3). + center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball + query, or staked input (M1 + M2 ..., 3). + xyz_batch_cnt: (batch_size): Stacked input xyz coordinates nums in + each batch, just like (N1, N2, ...). Defaults to None. + New in version 1.7.0. + center_xyz_batch_cnt: (batch_size): Stacked centers coordinates + nums in each batch, just line (M1, M2, ...). Defaults to None. + New in version 1.7.0. + + Returns: + torch.Tensor: (B, npoint, nsample) tensor with the indices of the + features that form the query balls. + """ + assert center_xyz.is_contiguous() + assert xyz.is_contiguous() + assert min_radius < max_radius + if xyz_batch_cnt is not None and center_xyz_batch_cnt is not None: + assert xyz_batch_cnt.dtype == torch.int + assert center_xyz_batch_cnt.dtype == torch.int + idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num), + dtype=torch.int32) + ext_module.stack_ball_query_forward( + center_xyz, + center_xyz_batch_cnt, + xyz, + xyz_batch_cnt, + idx, + max_radius=max_radius, + nsample=sample_num, + ) + else: + B, N, _ = xyz.size() + npoint = center_xyz.size(1) + idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int32) + ext_module.ball_query_forward( + center_xyz, + xyz, + idx, + b=B, + n=N, + m=npoint, + min_radius=min_radius, + max_radius=max_radius, + nsample=sample_num) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(idx) + return idx + + @staticmethod + def backward(ctx, a=None) -> Tuple[None, None, None, None]: + return None, None, None, None + + +ball_query = BallQuery.apply diff --git a/external/cv/mmcv/ops/bbox.py b/external/cv/mmcv/ops/bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ea5b96bebec84f9e6167278f60a9add3178156 --- /dev/null +++ b/external/cv/mmcv/ops/bbox.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
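A shape-level sketch of the batched `ball_query` call defined above (assumes the compiled extension and a CUDA device; the sizes and radii are illustrative only):

```
import torch
from mmcv.ops import ball_query  # requires the compiled _ext module

B, N, npoint, nsample = 2, 1024, 128, 16
xyz = torch.rand(B, N, 3, device='cuda')            # full point cloud
centers = xyz[:, :npoint, :].contiguous()           # query centers, any (B, npoint, 3) works

idx = ball_query(0.0, 0.2, nsample, xyz, centers)   # min_radius, max_radius, nsample, xyz, center_xyz
assert idx.shape == (B, npoint, nsample) and idx.dtype == torch.int32
```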
+ +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) + + +def _bbox_overlaps_cpu(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + offset: int = 0) -> torch.Tensor: + assert mode in ['iou', 'iof'] + + if aligned: + lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] + rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] + + wh = (rb - lt + offset).clamp(min=0) # [rows, 2] + overlap = wh[:, 0] * wh[:, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( + bboxes1[:, 3] - bboxes1[:, 1] + offset) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( + bboxes2[:, 3] - bboxes2[:, 1] + offset) + ious = overlap / (area1 + area2 - overlap) + else: + ious = overlap / area1 + else: + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] + + wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2] + overlap = wh[:, :, 0] * wh[:, :, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( + bboxes1[:, 3] - bboxes1[:, 1] + offset) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( + bboxes2[:, 3] - bboxes2[:, 1] + offset) + ious = overlap / (area1[:, None] + area2 - overlap) + else: + ious = overlap / (area1[:, None]) + + return ious + + +def bbox_overlaps(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + offset: int = 0) -> torch.Tensor: + """Calculate overlap between two set of bboxes. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): shape (m, 4) in format or + empty. + bboxes2 (torch.Tensor): shape (n, 4) in format or + empty. If aligned is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (m, n) else (m, 1). 
+ + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> bbox_overlaps(bboxes1, bboxes2) + tensor([[0.5000, 0.0000, 0.0000], + [0.0000, 0.0000, 1.0000], + [0.0000, 0.0000, 0.0000]]) + + Example: + >>> empty = torch.FloatTensor([]) + >>> nonempty = torch.FloatTensor([ + >>> [0, 0, 10, 9], + >>> ]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + mode_dict = {'iou': 0, 'iof': 1} + assert mode in mode_dict.keys() + mode_flag = mode_dict[mode] + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + assert offset == 1 or offset == 0 + + rows = bboxes1.size(0) + cols = bboxes2.size(0) + + if aligned: + assert rows == cols + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows, cols)) + + if rows * cols == 0: + return ious + + if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots': + return _bbox_overlaps_cpu( + bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset) + + ext_module.bbox_overlaps( + bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) + + return ious diff --git a/external/cv/mmcv/ops/bezier_align.py b/external/cv/mmcv/ops/bezier_align.py new file mode 100644 index 0000000000000000000000000000000000000000..e53d0fd2bcc53525e1482d167d6dffb831b2941a --- /dev/null +++ b/external/cv/mmcv/ops/bezier_align.py @@ -0,0 +1,142 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
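Complementing the docstring examples above, the `aligned=True` path compares row *i* of `bboxes1` only against row *i* of `bboxes2` instead of building the full pairwise matrix. A short sketch, assuming the built wheel is installed:

```
import torch
from mmcv.ops import bbox_overlaps  # requires the compiled _ext module

b1 = torch.tensor([[0., 0., 10., 10.],
                   [10., 10., 20., 20.]])
b2 = torch.tensor([[0., 0., 10., 20.],
                   [10., 10., 20., 20.]])

print(bbox_overlaps(b1, b2))                 # (2, 2) pairwise IoU matrix
print(bbox_overlaps(b1, b2, aligned=True))   # one IoU per aligned pair: [0.5, 1.0]
```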
+ +from typing import Tuple, Union + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['bezier_align_forward', 'bezier_align_backward']) + + +class BezierAlignFunction(Function): + + @staticmethod + def forward(ctx, + input: torch.Tensor, + beziers: torch.Tensor, + output_size: Union[int, Tuple[int, int]], + spatial_scale: Union[int, float] = 1.0, + sampling_ratio: int = 0, + aligned: bool = True) -> torch.Tensor: + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.input_shape = input.size() + ctx.sampling_ratio = sampling_ratio + ctx.aligned = aligned + + assert beziers.size(1) == 17 + output_shape = (beziers.size(0), input.size(1), ctx.output_size[0], + ctx.output_size[1]) + output = input.new_zeros(output_shape) + ext_module.bezier_align_forward( + input, + beziers, + output, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + aligned=ctx.aligned) + + ctx.save_for_backward(beziers) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output: torch.Tensor): + beziers = ctx.saved_tensors[0] + grad_input = grad_output.new_zeros(ctx.input_shape) + grad_output = grad_output.contiguous() + ext_module.bezier_align_backward( + grad_output, + beziers, + grad_input, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + aligned=ctx.aligned) + return grad_input, None, None, None, None, None + + +bezier_align = BezierAlignFunction.apply + + +class BezierAlign(nn.Module): + """Bezier align pooling layer. + + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + + Note: + The implementation of BezierAlign is modified from + https://github.com/aim-uofa/AdelaiDet + + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel + indices (in our pixel model) are computed by floor(c - 0.5) and + ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete + indices [0] and [1] (which are sampled from the underlying signal + at continuous coordinates 0.5 and 1.5). But the original roi_align + (aligned=False) does not subtract the 0.5 when computing + neighboring pixel indices and therefore it uses pixels with a + slightly incorrect alignment (relative to our pixel model) when + performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; + + The difference does not make a difference to the model's + performance if ROIAlign is used together with conv layers. 
+ """ + + def __init__( + self, + output_size: Tuple, + spatial_scale: Union[int, float], + sampling_ratio: int, + aligned: bool = True, + ) -> None: + super().__init__() + + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + self.sampling_ratio = int(sampling_ratio) + self.aligned = aligned + + def forward(self, input: torch.Tensor, + beziers: torch.Tensor) -> torch.Tensor: + """BezierAlign forward. + + Args: + inputs (Tensor): input features. + beziers (Tensor): beziers for align. + """ + return bezier_align(input, beziers, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.aligned) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(output_size={self.output_size}, ' + s += f'spatial_scale={self.spatial_scale})' + s += f'sampling_ratio={self.sampling_ratio})' + s += f'aligned={self.aligned})' + return s diff --git a/external/cv/mmcv/ops/bias_act.py b/external/cv/mmcv/ops/bias_act.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed5f3d4c60a8c11a0662d278f1cf8bab7519159 --- /dev/null +++ b/external/cv/mmcv/ops/bias_act.py @@ -0,0 +1,381 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# Modified from +# https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py + +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
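A shape-level sketch of driving `BezierAlign` above. Assumptions: the compiled extension supports the device in use, and each row of `beziers` is a batch index followed by 16 control-point coordinates (two cubic Beziers for the top and bottom edges), which is what the `beziers.size(1) == 17` assertion implies; the control points here are made up.

```
import torch
from mmcv.ops import BezierAlign  # requires the compiled _ext module

feat = torch.rand(1, 16, 64, 64)        # (N, C, H, W) feature map
beziers = torch.tensor([[0.,            # batch index
                         5., 5., 20., 5., 40., 5., 60., 5.,        # top curve control points
                         60., 30., 40., 30., 20., 30., 5., 30.]])  # bottom curve control points

align = BezierAlign(output_size=(8, 32), spatial_scale=1.0, sampling_ratio=1)
out = align(feat, beziers)
assert out.shape == (1, 16, 8, 32)      # (num_beziers, C, *output_size)
```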
+ +# source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa +"""Custom PyTorch ops for efficient bias and activation.""" + +from typing import Any, Dict, Optional, Union + +import numpy as np +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['bias_act']) + + +class EasyDict(dict): + """Convenience class that behaves like a dict but allows access with the + attribute syntax.""" + + def __getattr__(self, name: str) -> Any: + try: + return self[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name: str, value: Any) -> None: + self[name] = value + + def __delattr__(self, name: str) -> None: + del self[name] + + +activation_funcs = { + 'linear': + EasyDict( + func=lambda x, **_: x, + def_alpha=0, + def_gain=1, + cuda_idx=1, + ref='', + has_2nd_grad=False), + 'relu': + EasyDict( + func=lambda x, **_: torch.nn.functional.relu(x), + def_alpha=0, + def_gain=np.sqrt(2), + cuda_idx=2, + ref='y', + has_2nd_grad=False), + 'lrelu': + EasyDict( + func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), + def_alpha=0.2, + def_gain=np.sqrt(2), + cuda_idx=3, + ref='y', + has_2nd_grad=False), + 'tanh': + EasyDict( + func=lambda x, **_: torch.tanh(x), + def_alpha=0, + def_gain=1, + cuda_idx=4, + ref='y', + has_2nd_grad=True), + 'sigmoid': + EasyDict( + func=lambda x, **_: torch.sigmoid(x), + def_alpha=0, + def_gain=1, + cuda_idx=5, + ref='y', + has_2nd_grad=True), + 'elu': + EasyDict( + func=lambda x, **_: torch.nn.functional.elu(x), + def_alpha=0, + def_gain=1, + cuda_idx=6, + ref='y', + has_2nd_grad=True), + 'selu': + EasyDict( + func=lambda x, **_: torch.nn.functional.selu(x), + def_alpha=0, + def_gain=1, + cuda_idx=7, + ref='y', + has_2nd_grad=True), + 'softplus': + EasyDict( + func=lambda x, **_: torch.nn.functional.softplus(x), + def_alpha=0, + def_gain=1, + cuda_idx=8, + ref='y', + has_2nd_grad=True), + 'swish': + EasyDict( + func=lambda x, **_: torch.sigmoid(x) * x, + def_alpha=0, + def_gain=np.sqrt(2), + cuda_idx=9, + ref='x', + has_2nd_grad=True), +} + +_null_tensor = torch.empty([0]) + + +def bias_act(input: torch.Tensor, + bias: Optional[torch.Tensor] = None, + dim: int = 1, + act: str = 'linear', + alpha: Optional[Union[float, int]] = None, + gain: Optional[float] = None, + clamp: Optional[float] = None, + use_custom_op: bool = True): + r"""Fused bias and activation function. + + Adds `bias` to activation tensor `input`, and evaluates activation + function `act`, and scales the result by `gain`. Each of the steps is + optional. + + In most cases, the fused op is considerably more efficient than performing + the same calculation using standard PyTorch ops. It supports first and + second order gradients, but not third order gradients. + + Args: + input (torch.Tensor): Input activation tensor. Can be of any shape. + bias (torch.Tensor): Bias vector, or `None` to disable. + Must be a 1D tensor of the same type as `input`. The shape must + be known, and it must match the dimension of `input` corresponding + to `dim`. Defaults to None. + dim (int): The dimension in `input` corresponding to the elements of + `bias`. The value of `dim` is ignored if `b` is not specified. + Defaults to 1. + act (str): Name of the activation function to evaluate, or `"linear"` + to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", + "swish", etc. See `activation_funcs` for a full list. `None` is not + allowed. Defaults to `linear`. 
+ alpha (float or int): Shape parameter for the activation + function, or `None` to use the default. Defaults to None. + gain (float): Scaling factor for the output tensor, or `None` + to use default. See `activation_funcs` for the default scaling of + each activation function. If unsure, consider specifying 1. + Defaults to None. + clamp (float): Clamp the output values to `[-clamp, +clamp]`, + or `None` to disable the clamping (default). Defaults to None. + use_custom_op (bool): Whether to use customized op. + Defaults to True. + + Returns: + torch.Tensor: Tensor of the same shape and datatype as `input`. + """ + assert isinstance(input, torch.Tensor) + if use_custom_op and input.is_cuda: + return _bias_act_cuda( + dim=dim, act=act, alpha=alpha, gain=gain, + clamp=clamp).apply(input, bias) + return _bias_act_ref( + input=input, + bias=bias, + dim=dim, + act=act, + alpha=alpha, + gain=gain, + clamp=clamp) + + +def _bias_act_ref(input: torch.Tensor, + bias: Optional[torch.Tensor] = None, + dim: int = 1, + act: str = 'linear', + alpha: Optional[Union[float, int]] = None, + gain: Optional[float] = None, + clamp: Optional[float] = None): + """Slow reference implementation of `bias_act()` using standard PyTorch + ops. + + Adds `bias` to activation tensor `input`, and evaluates activation + function `act`, and scales the result by `gain`. Each of the steps is + optional. + + In most cases, the fused op is considerably more efficient than performing + the same calculation using standard PyTorch ops. It supports first and + second order gradients, but not third order gradients. + + Args: + input (torch.Tensor): Input activation tensor. Can be of any shape. + bias (torch.Tensor): Bias vector, or `None` to disable. + Must be a 1D tensor of the same type as `input`. The shape must + be known, and it must match the dimension of `input` corresponding + to `dim`. Defaults to None. + dim (int): The dimension in `input` corresponding to the elements of + `bias`. The value of `dim` is ignored if `b` is not specified. + Defaults to 1. + act (str): Name of the activation function to evaluate, or `"linear"` + to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", + "swish", etc. See `activation_funcs` for a full list. `None` is not + allowed. Defaults to `linear`. + alpha (float or int): Shape parameter for the activation + function, or `None` to use the default. Defaults to None. + gain (float): Scaling factor for the output tensor, or `None` + to use default. See `activation_funcs` for the default scaling of + each activation function. If unsure, consider specifying 1. + Defaults to None. + clamp (float): Clamp the output values to + `[-clamp, +clamp]`, or `None` to disable the clamping (default). + Defaults to None. + + Returns: + torch.Tensor: Tensor of the same shape and datatype as `input`. + """ + assert isinstance(input, torch.Tensor) + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Add bias. + if bias is not None: + assert isinstance(bias, torch.Tensor) and bias.ndim == 1 + assert 0 <= dim < input.ndim + assert bias.shape[0] == input.shape[dim] + input = input + bias.reshape( + [-1 if i == dim else 1 for i in range(input.ndim)]) + + # Evaluate activation function. + alpha = float(alpha) + output = spec.func(input, alpha=alpha) + + # Scale by gain. 
+ gain = float(gain) + if gain != 1: + output = output * gain + + # Clamp. + if clamp >= 0: + # pylint: disable=invalid-unary-operand-type + output = output.clamp(-clamp, clamp) + return output + + +_bias_act_cuda_cache: Dict = dict() + + +def _bias_act_cuda(dim: int = 1, + act: str = 'linear', + alpha: Optional[Union[float, int]] = None, + gain: Optional[float] = None, + clamp: Optional[float] = None): + """"Fast CUDA implementation of `bias_act()` using custom ops. + + Args: + dim (int): The dimension in `x` corresponding to the elements of `b`. + The value of `dim` is ignored if `b` is not specified. + Defaults to 1. + act (str): Name of the activation function to evaluate, or `"linear"` + to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", + "swish", etc. See `activation_funcs` for a full list. `None` is not + allowed. Defaults to `linear`. + alpha (float | int): Shape parameter for the activation + function, or `None` to use the default. Defaults to None. + gain (float): Scaling factor for the output tensor, or `None` + to use default. See `activation_funcs` for the default scaling of + each activation function. If unsure, consider specifying 1. + Defaults to None. + clamp (float): Clamp the output values to `[-clamp, +clamp]`, + or `None` to disable the clamping (default). Defaults to None. + + Returns: + torch.Tensor: Tensor of the same shape and datatype as `x`. + """ + # Parse arguments. + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Lookup from cache. + key = (dim, act, alpha, gain, clamp) + if key in _bias_act_cuda_cache: + return _bias_act_cuda_cache[key] + + # Forward op. + class BiasActCuda(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, b): # pylint: disable=arguments-differ + ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride( + 1) == 1 else torch.contiguous_format + x = x.contiguous(memory_format=ctx.memory_format) + b = b.contiguous() if b is not None else _null_tensor.to(x.device) + y = x + if act != 'linear' or gain != 1 or clamp >= 0 or ( + b is not _null_tensor.to(x.device)): + y = ext_module.bias_act(x, b, _null_tensor.to(x.device), + _null_tensor.to(x.device), + _null_tensor.to(x.device), 0, dim, + spec.cuda_idx, alpha, gain, clamp) + ctx.save_for_backward( + x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to( + x.device), b if 'x' in spec.ref or spec.has_2nd_grad else + _null_tensor.to(x.device), + y if 'y' in spec.ref else _null_tensor.to(x.device)) + return y + + @staticmethod + def backward(ctx, dy): # pylint: disable=arguments-differ + dy = dy.contiguous(memory_format=ctx.memory_format) + x, b, y = ctx.saved_tensors + dx = None + db = None + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + dx = dy + if act != 'linear' or gain != 1 or clamp >= 0: + dx = BiasActCudaGrad.apply(dy, x, b, y) + + if ctx.needs_input_grad[1]: + db = dx.sum([i for i in range(dx.ndim) if i != dim]) + + return dx, db + + # Backward op. 
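+    # BiasActCudaGrad (below) evaluates the gradient of the fused op w.r.t.
+    # its input via the same CUDA kernel (mode 1). It is itself an
+    # autograd.Function, which is what enables second-order gradients for the
+    # activations flagged with has_2nd_grad.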
+ class BiasActCudaGrad(torch.autograd.Function): + + @staticmethod + def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ + ctx.memory_format = torch.channels_last if dy.ndim > 2 and ( + dy.stride(1) == 1) else torch.contiguous_format + dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1, + dim, spec.cuda_idx, alpha, gain, clamp) + ctx.save_for_backward( + dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b, + y) + return dx + + @staticmethod + def backward(ctx, d_dx): # pylint: disable=arguments-differ + d_dx = d_dx.contiguous(memory_format=ctx.memory_format) + dy, x, b, y = ctx.saved_tensors + d_dy = None + d_x = None + d_b = None + d_y = None + + if ctx.needs_input_grad[0]: + d_dy = BiasActCudaGrad.apply(d_dx, x, b, y) + + if spec.has_2nd_grad and (ctx.needs_input_grad[1] + or ctx.needs_input_grad[2]): + d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim, + spec.cuda_idx, alpha, gain, clamp) + + if spec.has_2nd_grad and ctx.needs_input_grad[2]: + d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim]) + + return d_dy, d_x, d_b, d_y + + # Add to cache. + _bias_act_cuda_cache[key] = BiasActCuda + return BiasActCuda diff --git a/external/cv/mmcv/ops/border_align.py b/external/cv/mmcv/ops/border_align.py new file mode 100644 index 0000000000000000000000000000000000000000..27b2dcf45b69126cabd02df4a95191af33d70333 --- /dev/null +++ b/external/cv/mmcv/ops/border_align.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# modified from +# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py + +from typing import Tuple + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['border_align_forward', 'border_align_backward']) + + +class BorderAlignFunction(Function): + + @staticmethod + def symbolic(g, input, boxes, pool_size): + return g.op( + 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) + + @staticmethod + def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, + pool_size: int) -> torch.Tensor: + ctx.pool_size = pool_size + ctx.input_shape = input.size() + + assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]' + assert boxes.size(2) == 4, \ + 'the last dimension of boxes must be (x1, y1, x2, y2)' + assert input.size(1) % 4 == 0, \ + 'the channel for input feature must be divisible by factor 4' + + # [B, C//4, H*W, 4] + output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4) + output = input.new_zeros(output_shape) + # `argmax_idx` only used for backward + argmax_idx = input.new_zeros(output_shape).to(torch.int) + + ext_module.border_align_forward( + input, boxes, output, argmax_idx, pool_size=ctx.pool_size) + + ctx.save_for_backward(boxes, argmax_idx) + return output + + @staticmethod + @once_differentiable + def backward(ctx, + grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]: + boxes, argmax_idx = ctx.saved_tensors + grad_input = grad_output.new_zeros(ctx.input_shape) + # complex head architecture may cause grad_output uncontiguous + grad_output = grad_output.contiguous() + ext_module.border_align_backward( + grad_output, + boxes, + argmax_idx, + grad_input, + pool_size=ctx.pool_size) + return grad_input, None, None + + +border_align = 
BorderAlignFunction.apply + + +class BorderAlign(nn.Module): + r"""Border align pooling layer. + + Applies border_align over the input feature based on predicted bboxes. + The details were described in the paper + `BorderDet: Border Feature for Dense Object Detection + `_. + + For each border line (e.g. top, left, bottom or right) of each box, + border_align does the following: + + 1. uniformly samples ``pool_size`` +1 positions on this line, involving + the start and end points. + 2. the corresponding features on these points are computed by bilinear + interpolation. + 3. max pooling over all the ``pool_size`` +1 positions are used for + computing pooled feature. + + Args: + pool_size (int): number of positions sampled over the boxes' borders + (e.g. top, bottom, left, right). + """ + + def __init__(self, pool_size: int): + super().__init__() + self.pool_size = pool_size + + def forward(self, input: torch.Tensor, + boxes: torch.Tensor) -> torch.Tensor: + """ + Args: + input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), + [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, + right features respectively. + boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). + + Returns: + torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is + (top,left,bottom,right) for the last dimension. + """ + return border_align(input, boxes, self.pool_size) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(pool_size={self.pool_size})' + return s diff --git a/external/cv/mmcv/ops/box_iou_quadri.py b/external/cv/mmcv/ops/box_iou_quadri.py new file mode 100644 index 0000000000000000000000000000000000000000..ac70c5bb9968397b07c208aa4431180561bc457a --- /dev/null +++ b/external/cv/mmcv/ops/box_iou_quadri.py @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri']) + + +def box_iou_quadri(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x1, y1, ..., x4, y4) format. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8), + indicating (x1, y1, ..., x4, y4) for each row. + bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8), + indicating (x1, y1, ..., x4, y4) for each row. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (N, M) else (N,). 
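+
+    Example:
+        >>> # A minimal sketch; values are illustrative and the compiled
+        >>> # ``_ext`` op must be available. Two identical unit squares
+        >>> # expressed as quadrilaterals have an IoU of 1.
+        >>> bboxes1 = torch.tensor([[0., 0., 1., 0., 1., 1., 0., 1.]])
+        >>> bboxes2 = torch.tensor([[0., 0., 1., 0., 1., 1., 0., 1.]])
+        >>> box_iou_quadri(bboxes1, bboxes2).shape
+        torch.Size([1, 1])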
+ """ + assert mode in ['iou', 'iof'] + mode_dict = {'iou': 0, 'iof': 1} + mode_flag = mode_dict[mode] + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros(rows * cols) + bboxes1 = bboxes1.contiguous() + bboxes2 = bboxes2.contiguous() + ext_module.box_iou_quadri( + bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) + if not aligned: + ious = ious.view(rows, cols) + return ious diff --git a/external/cv/mmcv/ops/box_iou_rotated.py b/external/cv/mmcv/ops/box_iou_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..664551350370b30b29169403cb8ef151b318078b --- /dev/null +++ b/external/cv/mmcv/ops/box_iou_rotated.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) + + +def box_iou_rotated(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + clockwise: bool = True) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x_center, y_center, width, height, angle) format. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + .. note:: + The operator assumes: + + 1) The positive direction along x axis is left -> right. + + 2) The positive direction along y axis is top -> down. + + 3) The w border is in parallel with x axis when angle = 0. + + However, there are 2 opposite definitions of the positive angular + direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports + both definitions and uses CW by default. + + Please set ``clockwise=False`` if you are using the CCW definition. + + The coordinate system when ``clockwise`` is ``True`` (default) + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha + \\\\ + y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + + The coordinate system when ``clockwise`` is ``False`` + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (-pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. 
math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha + \\\\ + y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + Args: + boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. + boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + clockwise (bool): flag indicating whether the positive angular + orientation is clockwise. default True. + `New in version 1.4.3.` + + Returns: + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (N, M) else (N,). + """ + assert mode in ['iou', 'iof'] + mode_dict = {'iou': 0, 'iof': 1} + mode_flag = mode_dict[mode] + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if aligned: + ious = bboxes1.new_zeros(rows) + else: + if bboxes1.device.type == 'mlu': + ious = bboxes1.new_zeros([rows, cols]) + else: + ious = bboxes1.new_zeros(rows * cols) + if not clockwise: + flip_mat = bboxes1.new_ones(bboxes1.shape[-1]) + flip_mat[-1] = -1 + bboxes1 = bboxes1 * flip_mat + bboxes2 = bboxes2 * flip_mat + if bboxes1.device.type == 'npu': + scale_mat = bboxes1.new_ones(bboxes1.shape[-1]) + scale_mat[-1] = 1.0 / 0.01745329252 + bboxes1 = bboxes1 * scale_mat + bboxes2 = bboxes2 * scale_mat + bboxes1 = bboxes1.contiguous() + bboxes2 = bboxes2.contiguous() + ext_module.box_iou_rotated( + bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) + if not aligned: + ious = ious.view(rows, cols) + return ious diff --git a/external/cv/mmcv/ops/carafe.py b/external/cv/mmcv/ops/carafe.py new file mode 100644 index 0000000000000000000000000000000000000000..5562c8dc2c272f7f1b42f6e99dbd4aa9e1a1516e --- /dev/null +++ b/external/cv/mmcv/ops/carafe.py @@ -0,0 +1,305 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
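+
+# Usage sketch (illustrative only; assumes the compiled `_ext` op and a CUDA
+# device). `CARAFE` defined below upsamples a feature map with caller-supplied
+# reassembly kernels that are already normalized over the kernel dimension:
+#   up = CARAFE(kernel_size=5, group_size=1, scale_factor=2)
+#   x = torch.randn(2, 16, 8, 8, device='cuda')
+#   masks = torch.softmax(torch.randn(2, 25, 16, 16, device='cuda'), dim=1)
+#   y = up(x, masks)   # -> shape (2, 16, 16, 16)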
+ +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import normal_init, xavier_init +from mmengine.registry import MODELS +from torch import Tensor +from torch.autograd import Function +from torch.nn.modules.module import Module + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward', + 'carafe_backward' +]) + + +class CARAFENaiveFunction(Function): + + @staticmethod + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + return g.op( + 'mmcv::MMCVCARAFENaive', + features, + masks, + kernel_size_i=kernel_size, + group_size_i=group_size, + scale_factor_f=scale_factor) + + @staticmethod + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + assert scale_factor >= 1 + assert masks.size(1) == kernel_size * kernel_size * group_size + assert masks.size(-1) == features.size(-1) * scale_factor + assert masks.size(-2) == features.size(-2) * scale_factor + assert features.size(1) % group_size == 0 + assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 + ctx.kernel_size = kernel_size + ctx.group_size = group_size + ctx.scale_factor = scale_factor + ctx.feature_size = features.size() + ctx.mask_size = masks.size() + + n, c, h, w = features.size() + output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) + ext_module.carafe_naive_forward( + features, + masks, + output, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': + ctx.save_for_backward(features, masks) + return output + + @staticmethod + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: + assert grad_output.is_cuda + + features, masks = ctx.saved_tensors + kernel_size = ctx.kernel_size + group_size = ctx.group_size + scale_factor = ctx.scale_factor + + grad_input = torch.zeros_like(features) + grad_masks = torch.zeros_like(masks) + ext_module.carafe_naive_backward( + grad_output.contiguous(), + features, + masks, + grad_input, + grad_masks, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + return grad_input, grad_masks, None, None, None + + +carafe_naive = CARAFENaiveFunction.apply + + +class CARAFENaive(Module): + + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() + + assert isinstance(kernel_size, int) and isinstance( + group_size, int) and isinstance(scale_factor, int) + self.kernel_size = kernel_size + self.group_size = group_size + self.scale_factor = scale_factor + + def forward(self, features: Tensor, masks: Tensor) -> Tensor: + return carafe_naive(features, masks, self.kernel_size, self.group_size, + self.scale_factor) + + +class CARAFEFunction(Function): + + @staticmethod + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + return g.op( + 'mmcv::MMCVCARAFE', + features, + masks, + kernel_size_i=kernel_size, + group_size_i=group_size, + scale_factor_f=scale_factor) + + @staticmethod + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + assert scale_factor >= 1 + assert masks.size(1) == kernel_size * kernel_size * group_size + assert masks.size(-1) == 
features.size(-1) * scale_factor + assert masks.size(-2) == features.size(-2) * scale_factor + assert features.size(1) % group_size == 0 + assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 + ctx.kernel_size = kernel_size + ctx.group_size = group_size + ctx.scale_factor = scale_factor + ctx.feature_size = features.size() + ctx.mask_size = masks.size() + + n, c, h, w = features.size() + output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) + routput = features.new_zeros(output.size(), requires_grad=False) + rfeatures = features.new_zeros(features.size(), requires_grad=False) + rmasks = masks.new_zeros(masks.size(), requires_grad=False) + ext_module.carafe_forward( + features, + masks, + rfeatures, + routput, + rmasks, + output, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': + ctx.save_for_backward(features, masks, rfeatures) + return output + + @staticmethod + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: + features, masks, rfeatures = ctx.saved_tensors + kernel_size = ctx.kernel_size + group_size = ctx.group_size + scale_factor = ctx.scale_factor + + rgrad_output = torch.zeros_like(grad_output, requires_grad=False) + rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) + rgrad_input = torch.zeros_like(features, requires_grad=False) + rgrad_masks = torch.zeros_like(masks, requires_grad=False) + grad_input = torch.zeros_like(features, requires_grad=False) + grad_masks = torch.zeros_like(masks, requires_grad=False) + ext_module.carafe_backward( + grad_output.contiguous(), + rfeatures, + masks, + rgrad_output, + rgrad_input_hs, + rgrad_input, + rgrad_masks, + grad_input, + grad_masks, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + return grad_input, grad_masks, None, None, None + + +carafe = CARAFEFunction.apply + + +class CARAFE(Module): + """ CARAFE: Content-Aware ReAssembly of FEatures + + Please refer to `CARAFE: Content-Aware ReAssembly of FEatures + `_ for more details. + + Args: + kernel_size (int): reassemble kernel size + group_size (int): reassemble group size + scale_factor (int): upsample ratio + + Returns: + upsampled feature map + """ + + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() + + assert isinstance(kernel_size, int) and isinstance( + group_size, int) and isinstance(scale_factor, int) + self.kernel_size = kernel_size + self.group_size = group_size + self.scale_factor = scale_factor + + def forward(self, features: Tensor, masks: Tensor) -> Tensor: + return carafe(features, masks, self.kernel_size, self.group_size, + self.scale_factor) + + +@MODELS.register_module(name='carafe') +class CARAFEPack(nn.Module): + """A unified package of CARAFE upsampler that contains: 1) channel + compressor 2) content encoder 3) CARAFE op. + + Official implementation of ICCV 2019 paper + `CARAFE: Content-Aware ReAssembly of FEatures + `_. 
+ + Args: + channels (int): input feature channels + scale_factor (int): upsample ratio + up_kernel (int): kernel size of CARAFE op + up_group (int): group size of CARAFE op + encoder_kernel (int): kernel size of content encoder + encoder_dilation (int): dilation of content encoder + compressed_channels (int): output channels of channels compressor + + Returns: + upsampled feature map + """ + + def __init__(self, + channels: int, + scale_factor: int, + up_kernel: int = 5, + up_group: int = 1, + encoder_kernel: int = 3, + encoder_dilation: int = 1, + compressed_channels: int = 64): + super().__init__() + self.channels = channels + self.scale_factor = scale_factor + self.up_kernel = up_kernel + self.up_group = up_group + self.encoder_kernel = encoder_kernel + self.encoder_dilation = encoder_dilation + self.compressed_channels = compressed_channels + self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, + 1) + self.content_encoder = nn.Conv2d( + self.compressed_channels, + self.up_kernel * self.up_kernel * self.up_group * + self.scale_factor * self.scale_factor, + self.encoder_kernel, + padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), + dilation=self.encoder_dilation, + groups=1) + self.init_weights() + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + normal_init(self.content_encoder, std=0.001) + + def kernel_normalizer(self, mask: Tensor) -> Tensor: + mask = F.pixel_shuffle(mask, self.scale_factor) + n, mask_c, h, w = mask.size() + # use float division explicitly, + # to void inconsistency while exporting to onnx + mask_channel = int(mask_c / float(self.up_kernel**2)) + mask = mask.view(n, mask_channel, -1, h, w) + + mask = F.softmax(mask, dim=2, dtype=mask.dtype) + mask = mask.view(n, mask_c, h, w).contiguous() + + return mask + + def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: + x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) + return x + + def forward(self, x: Tensor) -> Tensor: + compressed_x = self.channel_compressor(x) + mask = self.content_encoder(compressed_x) + mask = self.kernel_normalizer(mask) + + x = self.feature_reassemble(x, mask) + return x diff --git a/external/cv/mmcv/ops/cc_attention.py b/external/cv/mmcv/ops/cc_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..06637969f86b9efb1697d78ea7260a76063e0d53 --- /dev/null +++ b/external/cv/mmcv/ops/cc_attention.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.registry import MODELS + +from mmcv.cnn import Scale + + +def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: + """Returns a diagonal matrix of size [n, n]. + + The diagonal are all "-inf". This is for avoiding calculating the + overlapped element in the Criss-Cross twice. + """ + return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) + + +@MODELS.register_module() +class CrissCrossAttention(nn.Module): + """Criss-Cross Attention Module. + + .. note:: + Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch + to a pure PyTorch and equivalent implementation. For more + details, please refer to https://github.com/open-mmlab/mmcv/pull/1201. 
+ + Speed comparison for one forward pass + + - Input size: [2,512,97,97] + - Device: 1 NVIDIA GeForce RTX 2080 Ti + + +-----------------------+---------------+------------+---------------+ + | |PyTorch version|CUDA version|Relative speed | + +=======================+===============+============+===============+ + |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x | + +-----------------------+---------------+------------+---------------+ + |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x | + +-----------------------+---------------+------------+---------------+ + + Args: + in_channels (int): Channels of the input feature map. + """ + + def __init__(self, in_channels: int) -> None: + super().__init__() + self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) + self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) + self.value_conv = nn.Conv2d(in_channels, in_channels, 1) + self.gamma = Scale(0.) + self.in_channels = in_channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """forward function of Criss-Cross Attention. + + Args: + x (torch.Tensor): Input feature with the shape of + (batch_size, in_channels, height, width). + + Returns: + torch.Tensor: Output of the layer, with the shape of + (batch_size, in_channels, height, width) + """ + B, C, H, W = x.size() + query = self.query_conv(x) + key = self.key_conv(x) + value = self.value_conv(x) + energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG( + H, query.device) + energy_H = energy_H.transpose(1, 2) + energy_W = torch.einsum('bchw,bchj->bhwj', query, key) + attn = F.softmax( + torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)] + out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H]) + out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:]) + + out = self.gamma(out) + x + out = out.contiguous() + + return out + + def __repr__(self) -> str: + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels})' + return s diff --git a/external/cv/mmcv/ops/chamfer_distance.py b/external/cv/mmcv/ops/chamfer_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..5d5f179d0a6238fab03f692e97d4325f67aeefc4 --- /dev/null +++ b/external/cv/mmcv/ops/chamfer_distance.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Sequence, Tuple + +import torch +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward']) + + +class ChamferDistanceFunction(Function): + """This is an implementation of the 2D Chamfer Distance. + + It has been used in the paper `Oriented RepPoints for Aerial Object + Detection (CVPR 2022) _`. + """ + + @staticmethod + def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]: + """ + Args: + xyz1 (Tensor): Point set with shape (B, N, 2). + xyz2 (Tensor): Point set with shape (B, N, 2). + + Returns: + Sequence[Tensor]: + + - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with + shape (B, N). + - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with + shape (B, N). + - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) + with shape (B, N), which be used in compute gradient. 
+ - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz2) + with shape (B, N), which be used in compute gradient. + """ + batch_size, n, _ = xyz1.size() + _, m, _ = xyz2.size() + device = xyz1.device + xyz1 = xyz1.contiguous() + xyz2 = xyz2.contiguous() + + dist1 = torch.zeros(batch_size, n).to(device) + dist2 = torch.zeros(batch_size, m).to(device) + idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device) + idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device) + + ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, + idx2) + ctx.save_for_backward(xyz1, xyz2, idx1, idx2) + return dist1, dist2, idx1, idx2 + + @staticmethod + @once_differentiable + def backward(ctx, + grad_dist1: Tensor, + grad_dist2: Tensor, + grad_idx1=None, + grad_idx2=None) -> Tuple[Tensor, Tensor]: + """ + + Args: + grad_dist1 (Tensor): Gradient of chamfer distance + (xyz1 to xyz2) with shape (B, N). + grad_dist2 (Tensor): Gradient of chamfer distance + (xyz2 to xyz1) with shape (B, N). + + Returns: + Tuple[Tensor, Tensor]: + + - grad_xyz1 (Tensor): Gradient of the point set with shape \ + (B, N, 2). + - grad_xyz2 (Tensor):Gradient of the point set with shape \ + (B, N, 2). + """ + xyz1, xyz2, idx1, idx2 = ctx.saved_tensors + device = grad_dist1.device + grad_dist1 = grad_dist1.contiguous() + grad_dist2 = grad_dist2.contiguous() + grad_xyz1 = torch.zeros(xyz1.size()).to(device) + grad_xyz2 = torch.zeros(xyz2.size()).to(device) + + ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2, + grad_dist1, grad_dist2, grad_xyz1, + grad_xyz2) + return grad_xyz1, grad_xyz2 + + +chamfer_distance = ChamferDistanceFunction.apply diff --git a/external/cv/mmcv/ops/contour_expand.py b/external/cv/mmcv/ops/contour_expand.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2ff0e2ceebab2c5479683572ffc0be4438d28c --- /dev/null +++ b/external/cv/mmcv/ops/contour_expand.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Union + +import numpy as np +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['contour_expand']) + + +def contour_expand(kernel_mask: Union[np.array, torch.Tensor], + internal_kernel_label: Union[np.array, torch.Tensor], + min_kernel_area: int, kernel_num: int) -> list: + """Expand kernel contours so that foreground pixels are assigned into + instances. + + Args: + kernel_mask (np.array or torch.Tensor): The instance kernel mask with + size hxw. + internal_kernel_label (np.array or torch.Tensor): The instance internal + kernel label with size hxw. + min_kernel_area (int): The minimum kernel area. + kernel_num (int): The instance kernel number. + + Returns: + list: The instance index map with size hxw. 
+ """ + assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) + assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) + assert isinstance(min_kernel_area, int) + assert isinstance(kernel_num, int) + + if isinstance(kernel_mask, np.ndarray): + kernel_mask = torch.from_numpy(kernel_mask) + if isinstance(internal_kernel_label, np.ndarray): + internal_kernel_label = torch.from_numpy(internal_kernel_label) + + if torch.__version__ == 'parrots': + if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0: + label = [] + else: + label = ext_module.contour_expand( + kernel_mask, + internal_kernel_label, + min_kernel_area=min_kernel_area, + kernel_num=kernel_num) + label = label.tolist() # type: ignore + else: + label = ext_module.contour_expand(kernel_mask, internal_kernel_label, + min_kernel_area, kernel_num) + return label diff --git a/external/cv/mmcv/ops/conv2d_gradfix.py b/external/cv/mmcv/ops/conv2d_gradfix.py new file mode 100644 index 0000000000000000000000000000000000000000..74536014ca62d05832de715c2e0c1a82c659b6ab --- /dev/null +++ b/external/cv/mmcv/ops/conv2d_gradfix.py @@ -0,0 +1,344 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa +"""Custom replacement for `torch.nn.functional.conv2d` that supports +arbitrarily high order gradients with zero performance penalty.""" + +import contextlib +import warnings +from typing import Dict, Optional, Tuple, Union + +import torch +from mmengine.utils import digit_version +from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch + +enabled = True +weight_gradients_disabled = False + + +@contextlib.contextmanager +def no_weight_gradients(disable=True): + global weight_gradients_disabled + old = weight_gradients_disabled + if disable: + weight_gradients_disabled = True + yield + weight_gradients_disabled = old + + +def conv2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1): + flag = True + if digit_version(torch.__version__) >= digit_version('1.10.0'): + warnings.warn('Since ' + 'aten:cudnn_convolution_backward_weight is ' + f'not supported in torch=={torch.__version__},' + ' rolling back to `torch.nn.functional.conv2d`') + flag = False + if _should_use_custom_op(input) and flag: + return _conv2d_gradfix( + transpose=False, + weight_shape=weight.shape, + stride=stride, + padding=padding, + output_padding=0, + dilation=dilation, + groups=groups).apply(input, weight, bias) + return torch.nn.functional.conv2d( + input=input, + weight=weight, + bias=bias, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups) + + +def conv_transpose2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + output_padding: Union[int, Tuple[int, ...]] = 0, + groups: int = 1, + dilation: Union[int, Tuple[int, ...]] = 1): + if _should_use_custom_op(input): + return _conv2d_gradfix( + transpose=True, + weight_shape=weight.shape, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + dilation=dilation).apply(input, weight, bias) + return torch.nn.functional.conv_transpose2d( + input=input, + weight=weight, + bias=bias, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + dilation=dilation) + + +def _should_use_custom_op(input): + assert isinstance(input, torch.Tensor) + if (not enabled) or (not torch.backends.cudnn.enabled): + return False + if input.device.type != 'cuda': + return False + return True + + +def _to_tuple(x, ndim): + xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim + assert len(xs) == ndim + assert all(isinstance(x, int) for x in xs) + return xs + + +_conv2d_gradfix_cache: Dict = dict() +_null_tensor = torch.empty([0]) + + +def _conv2d_gradfix( + transpose: bool, + weight_shape: Tuple[int, ...], + stride: Union[int, Tuple[int, ...]], + padding: Union[int, Tuple[int, ...]], + output_padding: Union[int, Tuple[int, ...]], + dilation: Union[int, Tuple[int, ...]], + groups: int, +): + # Parse arguments. + ndim = 2 + weight_shape = tuple(weight_shape) + stride = _to_tuple(stride, ndim) + padding = _to_tuple(padding, ndim) + output_padding = _to_tuple(output_padding, ndim) + dilation = _to_tuple(dilation, ndim) + + # Lookup from cache. 
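+    # Each distinct parameter combination gets its own specialized
+    # autograd.Function subclass, so the classes are memoized on the full
+    # argument tuple to avoid rebuilding them on every call.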
+ key = (transpose, weight_shape, stride, padding, output_padding, dilation, + groups) + if key in _conv2d_gradfix_cache: + return _conv2d_gradfix_cache[key] + + # Validate arguments. + + assert groups >= 1 + assert len(weight_shape) == ndim + 2 + assert all(stride[i] >= 1 for i in range(ndim)) # type: ignore + assert all(padding[i] >= 0 for i in range(ndim)) # type: ignore + assert all(dilation[i] >= 0 for i in range(ndim)) # type: ignore + if not transpose: + assert all(output_padding[i] == 0 for i in range(ndim)) # type: ignore + else: # transpose + for i in range(ndim): + assert 0 <= output_padding[i] < max( # type: ignore + stride[i], # type: ignore + dilation[i]) # type: ignore + + # Helpers. + common_kwargs = dict( + stride=stride, padding=padding, dilation=dilation, groups=groups) + + def calc_output_padding(input_shape, output_shape): + if transpose: + return [0, 0] + return [ + input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] - + (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1) + for i in range(ndim) + ] + + # Forward & backward. + class Conv2d(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, weight, bias): + assert weight.shape == weight_shape + ctx.save_for_backward( + input if weight.requires_grad else _null_tensor, + weight if input.requires_grad else _null_tensor, + ) + ctx.input_shape = input.shape + + # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere). + if weight_shape[2:] == stride == dilation == ( + 1, 1) and padding == ( + 0, 0) and torch.cuda.get_device_capability( + input.device) < (8, 0): + a = weight.reshape(groups, weight_shape[0] // groups, + weight_shape[1]) + b = input.reshape(input.shape[0], groups, + input.shape[1] // groups, -1) + c = (a.transpose(1, 2) if transpose else a) @ b.permute( + 1, 2, 0, 3).flatten(2) + c = c.reshape(-1, input.shape[0], + *input.shape[2:]).transpose(0, 1) + c = c if bias is None else c + bias.unsqueeze(0).unsqueeze( + 2).unsqueeze(3) + return c.contiguous( + memory_format=(torch.channels_last if input.stride(1) == + 1 else torch.contiguous_format)) + + # General case => cuDNN. + if transpose: + return torch.nn.functional.conv_transpose2d( + input=input, + weight=weight, + bias=bias, + output_padding=output_padding, + **common_kwargs) + return torch.nn.functional.conv2d( + input=input, weight=weight, bias=bias, **common_kwargs) + + @staticmethod + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + input_shape = ctx.input_shape + grad_input = None + grad_weight = None + grad_bias = None + + if ctx.needs_input_grad[0]: + p = calc_output_padding( + input_shape=input_shape, output_shape=grad_output.shape) + op = _conv2d_gradfix( + transpose=(not transpose), + weight_shape=weight_shape, + output_padding=p, + **common_kwargs) + grad_input = op.apply(grad_output, weight, None) + assert grad_input.shape == input_shape + + if ctx.needs_input_grad[1] and not weight_gradients_disabled: + grad_weight = Conv2dGradWeight.apply(grad_output, input) + assert grad_weight.shape == weight_shape + + if ctx.needs_input_grad[2]: + grad_bias = grad_output.sum([0, 2, 3]) + + return grad_input, grad_weight, grad_bias + + # Gradient with respect to the weights. 
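+    # Conv2dGradWeight computes dL/dweight from (grad_output, input). Keeping
+    # it as a separate autograd.Function lets its own backward reuse
+    # Conv2d.apply / _conv2d_gradfix recursively, which is what makes
+    # arbitrarily high order gradients work.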
+ class Conv2dGradWeight(torch.autograd.Function): + + @staticmethod + def forward(ctx, grad_output, input): + ctx.save_for_backward( + grad_output if input.requires_grad else _null_tensor, + input if grad_output.requires_grad else _null_tensor, + ) + ctx.grad_output_shape = grad_output.shape + ctx.input_shape = input.shape + + # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere). + if weight_shape[2:] == stride == dilation == ( + 1, 1) and padding == (0, 0): + a = grad_output.reshape(grad_output.shape[0], groups, + grad_output.shape[1] // groups, + -1).permute(1, 2, 0, 3).flatten(2) + b = input.reshape(input.shape[0], groups, + input.shape[1] // groups, + -1).permute(1, 2, 0, 3).flatten(2) + c = (b @ a.transpose(1, 2) if transpose else + a @ b.transpose(1, 2)).reshape(weight_shape) + return c.contiguous( + memory_format=(torch.channels_last if input.stride(1) == + 1 else torch.contiguous_format)) + + # PyTorch consolidated convolution backward API in PR: + # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501 + # Enhance the code referring to the discussion: + # https://github.com/pytorch/pytorch/issues/74437 + if digit_version(torch.__version__) >= digit_version('1.11.0'): + empty_weight = torch.tensor( + 0.0, dtype=input.dtype, + device=input.device).expand(weight_shape) + output_padding = calc_output_padding(input.shape, + grad_output.shape) + return torch.ops.aten.convolution_backward( + grad_output, + input, + empty_weight, + None, + stride=stride, + dilation=dilation, + transposed=transpose, + padding=padding, + groups=groups, + output_padding=output_padding, + output_mask=[0, 1, 0])[1] + else: + if is_rocm_pytorch(): + name = 'aten::miopen_convolution_transpose_backward_weight' + if not transpose: + name = 'aten::miopen_convolution_backward_weight' + flags = [ + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic + ] + else: + # General case => cuDNN. + name = ('aten::cudnn_convolution_transpose_backward_weight' + if transpose else + 'aten::cudnn_convolution_backward_weight') + flags = [ + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic, + torch.backends.cudnn.allow_tf32 + ] + return torch._C._jit_get_operation(name)(weight_shape, + grad_output, input, + padding, stride, + dilation, groups, + *flags) + + @staticmethod + def backward(ctx, grad2_grad_weight): + grad_output, input = ctx.saved_tensors + grad_output_shape = ctx.grad_output_shape + input_shape = ctx.input_shape + grad2_grad_output = None + grad2_input = None + + if ctx.needs_input_grad[0]: + grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, + None) + assert grad2_grad_output.shape == grad_output_shape + + if ctx.needs_input_grad[1]: + p = calc_output_padding( + input_shape=input_shape, output_shape=grad_output_shape) + op = _conv2d_gradfix( + transpose=(not transpose), + weight_shape=weight_shape, + output_padding=p, + **common_kwargs) + grad2_input = op.apply(grad_output, grad2_grad_weight, None) + assert grad2_input.shape == input_shape + + return grad2_grad_output, grad2_input + + _conv2d_gradfix_cache[key] = Conv2d + return Conv2d diff --git a/external/cv/mmcv/ops/convex_iou.py b/external/cv/mmcv/ops/convex_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..395ec62f98d204c3509c49dc69984f313ddadef7 --- /dev/null +++ b/external/cv/mmcv/ops/convex_iou.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou']) + + +def convex_giou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return generalized intersection-over-union (Jaccard index) between point + sets and polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (N, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + tuple[torch.Tensor, torch.Tensor]: The first element is the gious + between point sets and polygons with the shape (N,). The second + element is the gradient of point sets with the shape (N, 18). + """ + output = pointsets.new_zeros((pointsets.size(0), 19)) + ext_module.convex_giou(pointsets, polygons, output) + convex_giou = output[:, -1] + points_grad = output[:, 0:-1] + return convex_giou, points_grad + + +def convex_iou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) between point sets and + polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (K, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + torch.Tensor: Return the ious between point sets and polygons with the + shape (N, K). + """ + N, K = pointsets.size(0), polygons.size(0) + ious = pointsets.new_zeros((N, K)) + ext_module.convex_iou(pointsets, polygons, ious) + return ious diff --git a/external/cv/mmcv/ops/corner_pool.py b/external/cv/mmcv/ops/corner_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..669a399393f906429bb83ac9de92f08b93ada5a7 --- /dev/null +++ b/external/cv/mmcv/ops/corner_pool.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from mmengine.utils import digit_version +from torch import Tensor, nn + +_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} + + +def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: + size = x.size(dim) + output = x.clone() + + ind = 1 + while ind < size: + if flip: + cur_start = 0 + cur_len = size - ind + next_start = ind + next_len = size - ind + else: + cur_start = ind + cur_len = size - ind + next_start = 0 + next_len = size - ind + + # max_temp should be cloned for backward computation + max_temp = output.narrow(dim, cur_start, cur_len).clone() + cur_temp = output.narrow(dim, cur_start, cur_len) + next_temp = output.narrow(dim, next_start, next_len) + + cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) + + ind = ind << 1 + + return output + + +class CornerPool(nn.Module): + """Corner Pooling. + + Corner Pooling is a new type of pooling layer that helps a + convolutional network better localize corners of bounding boxes. + + Please refer to `CornerNet: Detecting Objects as Paired Keypoints + `_ for more details. + + Code is modified from https://github.com/princeton-vl/CornerNet-Lite. 
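+
+    In this implementation, ``'left'`` pooling scans each row from right to
+    left so that every position holds the maximum over itself and all
+    positions to its right; the other modes apply the same running maximum
+    along their own axis and direction (see ``cummax_dim_flip``).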
+ + Args: + mode (str): Pooling orientation for the pooling layer + + - 'bottom': Bottom Pooling + - 'left': Left Pooling + - 'right': Right Pooling + - 'top': Top Pooling + + Returns: + Feature map after pooling. + """ + + cummax_dim_flip = { + 'bottom': (2, False), + 'left': (3, True), + 'right': (3, False), + 'top': (2, True), + } + + def __init__(self, mode: str): + super().__init__() + assert mode in self.cummax_dim_flip + self.mode = mode + + def forward(self, x: Tensor) -> Tensor: + if (torch.__version__ != 'parrots' and + digit_version(torch.__version__) >= digit_version('1.5.0')): + dim, flip = self.cummax_dim_flip[self.mode] + if flip: + x = x.flip(dim) + pool_tensor, _ = torch.cummax(x, dim=dim) + if flip: + pool_tensor = pool_tensor.flip(dim) + return pool_tensor + else: + dim, flip = self.cummax_dim_flip[self.mode] + return _corner_pool(x, dim, flip) diff --git a/external/cv/mmcv/ops/correlation.py b/external/cv/mmcv/ops/correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..bebba1c5871da6ac12abb1b29305dcdc3f389c36 --- /dev/null +++ b/external/cv/mmcv/ops/correlation.py @@ -0,0 +1,205 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['correlation_forward', 'correlation_backward']) + + +class CorrelationFunction(Function): + + @staticmethod + def forward(ctx, + input1: Tensor, + input2: Tensor, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 1, + dilation: int = 1, + dilation_patch: int = 1) -> Tensor: + + ctx.save_for_backward(input1, input2) + + kH, kW = ctx.kernel_size = _pair(kernel_size) + patch_size = max_displacement * 2 + 1 + ctx.patch_size = patch_size + dH, dW = ctx.stride = _pair(stride) + padH, padW = ctx.padding = _pair(padding) + dilationH, dilationW = ctx.dilation = _pair(dilation) + dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair( + dilation_patch) + + output_size = CorrelationFunction._output_size(ctx, input1) + + output = input1.new_zeros(output_size) + + ext_module.correlation_forward( + input1, + input2, + output, + kH=kH, + kW=kW, + patchH=patch_size, + patchW=patch_size, + padH=padH, + padW=padW, + dilationH=dilationH, + dilationW=dilationW, + dilation_patchH=dilation_patchH, + dilation_patchW=dilation_patchW, + dH=dH, + dW=dW) + + return output + + @staticmethod + @once_differentiable + def backward( + ctx, grad_output: Tensor + ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]: + input1, input2 = ctx.saved_tensors + + kH, kW = ctx.kernel_size + patch_size = ctx.patch_size + padH, padW = ctx.padding + dilationH, dilationW = ctx.dilation + dilation_patchH, dilation_patchW = ctx.dilation_patch + dH, dW = ctx.stride + grad_input1 = torch.zeros_like(input1) + grad_input2 = torch.zeros_like(input2) + + ext_module.correlation_backward( + grad_output, + input1, + input2, + grad_input1, + grad_input2, + kH=kH, + kW=kW, + patchH=patch_size, + patchW=patch_size, + padH=padH, + padW=padW, + dilationH=dilationH, + dilationW=dilationW, + dilation_patchH=dilation_patchH, + dilation_patchW=dilation_patchW, + dH=dH, + dW=dW) + return 
grad_input1, grad_input2, None, None, None, None, None, None + + @staticmethod + def _output_size(ctx, input1): + iH, iW = input1.size(2), input1.size(3) + batch_size = input1.size(0) + kH, kW = ctx.kernel_size + patch_size = ctx.patch_size + dH, dW = ctx.stride + padH, padW = ctx.padding + dilationH, dilationW = ctx.dilation + dilatedKH = (kH - 1) * dilationH + 1 + dilatedKW = (kW - 1) * dilationW + 1 + + oH = int((iH + 2 * padH - dilatedKH) / dH + 1) + oW = int((iW + 2 * padW - dilatedKW) / dW + 1) + + output_size = (batch_size, patch_size, patch_size, oH, oW) + return output_size + + +class Correlation(nn.Module): + r"""Correlation operator + + This correlation operator works for optical flow correlation computation. + + There are two batched tensors with shape :math:`(N, C, H, W)`, + and the correlation output's shape is :math:`(N, max\_displacement \times + 2 + 1, max\_displacement * 2 + 1, H_{out}, W_{out})` + + where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding - + dilation \times (kernel\_size - 1) - 1} + {stride} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation + \times (kernel\_size - 1) - 1} + {stride} + 1\right\rfloor + + the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding + window convolution between input1 and shifted input2, + + .. math:: + Corr(N_i, dx, dy) = + \sum_{c=0}^{C-1} + input1(N_i, c) \star + \mathcal{S}(input2(N_i, c), dy, dx) + + where :math:`\star` is the valid 2d sliding window convolution operator, + and :math:`\mathcal{S}` means shifting the input features (auto-complete + zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy \in + [-max\_displacement \times dilation\_patch, max\_displacement \times + dilation\_patch]`. + + Args: + kernel_size (int): The size of sliding window i.e. local neighborhood + representing the center points and involved in correlation + computation. Defaults to 1. + max_displacement (int): The radius for computing correlation volume, + but the actual working space can be dilated by dilation_patch. + Defaults to 1. + stride (int): The stride of the sliding blocks in the input spatial + dimensions. Defaults to 1. + padding (int): Zero padding added to all four sides of the input1. + Defaults to 0. + dilation (int): The spacing of local neighborhood that will involved + in correlation. Defaults to 1. + dilation_patch (int): The spacing between position need to compute + correlation. Defaults to 1. 
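+
+    Example:
+        >>> # A minimal sketch (shapes only); assumes the compiled ``_ext``
+        >>> # op and a CUDA device.
+        >>> corr = Correlation(max_displacement=3)
+        >>> feat1 = torch.randn(1, 32, 16, 16, device='cuda')
+        >>> feat2 = torch.randn(1, 32, 16, 16, device='cuda')
+        >>> corr(feat1, feat2).shape
+        torch.Size([1, 7, 7, 16, 16])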
+ """ + + def __init__(self, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + dilation_patch: int = 1) -> None: + super().__init__() + self.kernel_size = kernel_size + self.max_displacement = max_displacement + self.stride = stride + self.padding = padding + self.dilation = dilation + self.dilation_patch = dilation_patch + + def forward(self, input1: Tensor, input2: Tensor) -> Tensor: + return CorrelationFunction.apply(input1, input2, self.kernel_size, + self.max_displacement, self.stride, + self.padding, self.dilation, + self.dilation_patch) + + def __repr__(self) -> str: + s = self.__class__.__name__ + s += f'(kernel_size={self.kernel_size}, ' + s += f'max_displacement={self.max_displacement}, ' + s += f'stride={self.stride}, ' + s += f'padding={self.padding}, ' + s += f'dilation={self.dilation}, ' + s += f'dilation_patch={self.dilation_patch})' + return s diff --git a/external/cv/mmcv/ops/csrc/README.md b/external/cv/mmcv/ops/csrc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8fcc6eb1a3260148aa7448470967684f8c9f0365 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/README.md @@ -0,0 +1,162 @@ +# Code Structure of CUDA operators + +This folder contains all non-python code for MMCV custom ops. Please follow the same architecture if you want to add new ops. + +## Directories Tree + +```folder +. +├── common +│ ├── box_iou_rotated_utils.hpp +│ ├── parrots_cpp_helper.hpp +│ ├── parrots_cuda_helper.hpp +│ ├── pytorch_cpp_helper.hpp +│ ├── pytorch_cuda_helper.hpp +│ ├── pytorch_device_registry.hpp +│   ├── cuda +│   │ ├── common_cuda_helper.hpp +│   │ ├── parrots_cudawarpfunction.cuh +│   │ ├── ... +│   │ └── ops_cuda_kernel.cuh +|   ├── mps +│   │ ├── MPSLibrary.h +│   │ ├── ... +│   │ └── MPSUtils.h +|   ├── mlu +│   │ └── ... +|   └── utils +│   │ └── ... +├── parrots +│   ├── ... +│   ├── ops.cpp +│   ├── ops_parrots.cpp +│   └── ops_pytorch.h +└── pytorch +    ├── info.cpp +    ├── pybind.cpp +    ├── ... +    ├── ops.cpp +    ├── cuda +    │   ├── ... +    │   └── ops_cuda.cu +    ├── cpu +    │   ├── ... +    │   └── ops.cpp +    ├── mps +    │   ├── ... +    |   └── op_mps.mm +    └── mlu +       ├── ... +       └── op_mlu.cpp +``` + +## Components + +- `common`: This directory contains all tools and shared codes. + - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. + - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**. + - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device. + - `utils`: The kernels and utils of spconv. +- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. +- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. + - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. + - `cpu`: This directory contain cpu implementations of corresponding custom ops. + - `mlu`: This directory contain launchers of each MLU kernels. + - `mps`: MPS ops implementation and launchers. + +## How to add new PyTorch ops? + +1. (Optional) Add shared kernel in `common` to support special hardware platform. 
+ + ```c++ + // src/common/cuda/new_ops_cuda_kernel.cuh + + template + __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { + // forward here + } + + ``` + + Add cuda kernel launcher in `pytorch/cuda`. + + ```c++ + // src/pytorch/cuda + #include + + void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ + // initialize + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + ... + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { + new_ops_forward_cuda_kernel + <<>>( + input.data_ptr(), output.data_ptr(),...); + })); + AT_CUDA_CHECK(cudaGetLastError()); + } + ``` + +2. Register implementation for different devices. + + ```c++ + // src/pytorch/cuda/cudabind.cpp + ... + + Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ + // implement cuda forward here + // use `NewOpsForwardCUDAKernelLauncher` here + } + // declare interface here. + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); + // register the implementation for given device (CUDA here). + REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); + ``` + +3. Add ops implementation in `pytorch` directory. Select different implementations according to device type. + + ```c++ + // src/pytorch/new_ops.cpp + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ + // dispatch the implementation according to the device type of input. + DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); + } + ... + + Tensor new_ops_forward(Tensor input, Tensor output, ...){ + return new_ops_forward_impl(input, output, ...); + } + ``` + +4. Binding the implementation in `pytorch/pybind.cpp` + + ```c++ + // src/pytorch/pybind.cpp + + ... + + Tensor new_ops_forward(Tensor input, Tensor output, ...); + + ... + + // bind with pybind11 + m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", + py::arg("input"), py::arg("output"), ...); + + ... + + ``` + +5. Build MMCV again. Enjoy new ops in python + + ```python + from ..utils import ext_loader + ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) + + ... + + ext_module.new_ops_forward(input, output, ...) + + ``` diff --git a/external/cv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/external/cv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a8453eaa8d3638394df8a0b169d8df01dfc27a11 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp @@ -0,0 +1,426 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h +#pragma once +#include +#include + +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +namespace { + +template +struct RotatedBox { + T x_ctr, y_ctr, w, h, a; +}; + +template +struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point& p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point& operator+=(const Point& p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point& p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { + return A.x * B.x + A.y * B.y; +} + +template +HOST_DEVICE_INLINE T cross_2d(const Point& A, const Point& B) { + return A.x * B.y - B.x * A.y; +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox& box, + Point (&pts)[4]) { + // M_PI / 180. == 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template +HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], + const Point (&pts2)[4], + Point (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. 
P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template +HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], + const int& num_in, Point (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. + int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + +#ifdef __CUDACC__ + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + std::sort(q + 1, q + num_in, + [](const Point& A, const Point& B) -> bool { + T temp = cross_2d(A, B); + if (fabs(temp) < 1e-6) { + return dot_2d(A, A) < dot_2d(B, B); + } else { + return temp > 0; + } + }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. 
+ // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template +HOST_DEVICE_INLINE T quadri_box_area(const Point (&q)[4]) { + T area = 0; +#pragma unroll + for (int i = 1; i < 3; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox& box1, + const RotatedBox& box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +template +HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point (&pts1)[4], + const Point (&pts2)[4]) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. 
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +} // namespace + +template +HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, + T const* const box2_raw, + const int mode_flag) { + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = rotated_boxes_intersection(box1, box2); + T baseS = 1.0; + if (mode_flag == 0) { + baseS = (area1 + area2 - intersection); + } else if (mode_flag == 1) { + baseS = area1; + } + const T iou = intersection / baseS; + return iou; +} + +template +HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw, + T const* const pts2_raw, + const int mode_flag) { + // shift center to the middle point to achieve higher precision in result + Point pts1[4], pts2[4]; + + auto center_shift_x = + (pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] + + pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) / + 8.0; + auto center_shift_y = + (pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] + + pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) / + 8.0; + pts1[0].x = pts1_raw[0] - center_shift_x; + pts1[0].y = pts1_raw[1] - center_shift_y; + pts1[1].x = pts1_raw[2] - center_shift_x; + pts1[1].y = pts1_raw[3] - center_shift_y; + pts1[2].x = pts1_raw[4] - center_shift_x; + pts1[2].y = pts1_raw[5] - center_shift_y; + pts1[3].x = pts1_raw[6] - center_shift_x; + pts1[3].y = pts1_raw[7] - center_shift_y; + pts2[0].x = pts2_raw[0] - center_shift_x; + pts2[0].y = pts2_raw[1] - center_shift_y; + pts2[1].x = pts2_raw[2] - center_shift_x; + pts2[1].y = pts2_raw[3] - center_shift_y; + pts2[2].x = pts2_raw[4] - center_shift_x; + pts2[2].y = pts2_raw[5] - center_shift_y; + pts2[3].x = pts2_raw[6] - center_shift_x; + pts2[3].y = pts2_raw[7] - center_shift_y; + + const T area1 = quadri_box_area(pts1); + const T area2 = quadri_box_area(pts2); + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = quadri_boxes_intersection(pts1, pts2); + T baseS = 1.0; + if (mode_flag == 0) { + baseS = (area1 + area2 - intersection); + } else if (mode_flag == 1) { + baseS = area1; + } + const T iou = intersection / baseS; + return iou; +} diff --git a/external/cv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..36e41107ebd52d3cf5e9a71cffe6eddeed4f0765 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh @@ -0,0 +1,59 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+// Modified from +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu +#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH +#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void active_rotated_filter_forward_cuda_kernel( + const int nthreads, const scalar_t* weight_data, const int* indices_data, + const int num_input_planes, const int num_output_planes, + const int num_orientations, const int num_rotations, const int nEntry, + scalar_t* output_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t val = *(weight_data + index); + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t* target = output_data + + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx; + *target = val; + } + } +} + +template +__global__ void active_rotated_filter_backward_cuda_kernel( + const int nthreads, const scalar_t* gradWeight_data, + const int* indices_data, const int num_input_planes, + const int num_output_planes, const int num_orientations, + const int num_rotations, const int nEntry, scalar_t* weight_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t* val = weight_data + index; + *val = 0; + scalar_t tmp = 0; + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t target = + *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx); + tmp = tmp + target; + } + *val = tmp; + } +} +#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9f9250844b9ceeca0df0377640c3d28e3f61cecc --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh @@ -0,0 +1,116 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH +#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K) +// output: fout(B,O,N) +// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j) +// i(k) = idx(b,i,k) +// sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j) +// avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k +// max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j))) + +template +__global__ void assign_score_withk_forward_cuda_kernel( + const int B, const int N0, const int N1, const int M, const int K, + const int O, const int aggregate, const T* points, const T* centers, + const T* scores, const int64_t* knn_idx, T* output) { + // ----- parallel loop for B, N1, K and O --------- + CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { + // ------- loop for M ---------- + const int b = (int)(i / (O * N1 * K)); + const int o = (int)(i % (O * N1 * K) / (N1 * K)); + const int n = (int)(i % (N1 * K) / K); + const int k = (int)(i % K); + const int cn = (int)knn_idx[b * K * N1 + n * K + + 0]; // The first neighbor is the center point + const int kn = (int)knn_idx[b * K * N1 + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + assert(b < B); + assert(kn < N0); + assert(cn < N0); + assert(o < O); + assert(n < N1); + const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; + T val = output[out_idx]; + for (int m = 0; m < M; m++) { + val += points[b * N0 * M * O + kn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m] - + centers[b * N0 * M * O + cn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m]; + } + output[out_idx] = val; + } +} + +template +__global__ void assign_score_withk_points_backward_cuda_kernel( + const int B, const int N0, const int N, const int M, const int K, + const int O, const int aggregate, const T* grad_out, const T* scores, + const int64_t* knn_idx, T* grad_points, T* grad_centers) { + // ----- parallel loop for B, M, O --------- + CUDA_1D_KERNEL_LOOP(i, B * M * O) { + int b = (int)(i / (M * O)); + int m = (int)(i % (M * O) / O); + int o = (int)(i % O); + + // ----- loop for N,K --------- + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + int kn = knn_idx[b * N * K + n * K + k]; + int cn = knn_idx[b * N * K + n * K + 0]; + if (kn >= N0 || kn < 0) { // if index overflows, it is out of the + // neighborhood range + continue; + } + atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, + scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, + -scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + } + } + } +} + +template +__global__ void assign_score_withk_scores_backward_cuda_kernel( + const int B, const int N0, const int N, const int M, const int K, + const int O, const int aggregate, const T* grad_out, const T* points, + const T* centers, const int64_t* knn_idx, T* grad_scores) { + // ----- parallel loop for B, N, K, M --------- + CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { + const int b = (int)(i / (N * M * K)); + const int n = (int)(i % (N * M * K) / M / K); + const int k = (int)(i % (M * K) / M); + const int m = (int)(i % M); + const int cn = knn_idx[b * N * K 
+ n * K + 0]; + const int kn = knn_idx[b * N * K + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + + // -------------- loop for O ------------------------ + const int out_idx = b * N * K * M + n * K * M + k * M + m; + T val = grad_scores[out_idx]; + for (int o = 0; o < O; o++) { + val += (points[b * N0 * M * O + kn * M * O + m * O + o] - + centers[b * N0 * M * O + cn * M * O + m * O + o]) * + grad_out[b * O * N * K + o * N * K + n * K + k]; + } + grad_scores[out_idx] = val; + } +} + +#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..632b5c4940b33a9d8d839fa3f3b92e7b6a2bd29e --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu +#ifndef BALL_QUERY_CUDA_KERNEL_CUH +#define BALL_QUERY_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void ball_query_forward_cuda_kernel(int b, int n, int m, + float min_radius, + float max_radius, int nsample, + const T* new_xyz, const T* xyz, + int* idx) { + // new_xyz: (B, M, 3) + // xyz: (B, N, 3) + // output: + // idx: (B, M, nsample) + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + + float max_radius2 = max_radius * max_radius; + float min_radius2 = min_radius * min_radius; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; + + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = xyz[k * 3 + 0]; + T y = xyz[k * 3 + 1]; + T z = xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[l] = k; + } + } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; + } + } + } +} + +#endif // BALL_QUERY_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..15bd91eca629895d3a99dde3fe6614036ca31dc9 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh @@ -0,0 +1,147 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH +#define BBOX_OVERLAPS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, + T& y1, T& x2, T& y2) { + x1 = bbox[base]; + y1 = bbox[base + 1]; + x2 = bbox[base + 2]; + y2 = bbox[base + 3]; +} + +template <> +__device__ __forceinline__ void load_bbox(const float* bbox, + const int base, float& x1, + float& y1, float& x2, + float& y2) { + const float4 bbox_offset = reinterpret_cast(bbox + base)[0]; + x1 = bbox_offset.x; + y1 = bbox_offset.y; + x2 = bbox_offset.z; + y2 = bbox_offset.w; +} + +template +__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, + T* ious, const int num_bbox1, + const int num_bbox2, const int mode, + const bool aligned, + const int offset) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, num_bbox1) { + const int b1 = index; + const int b2 = index; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); + ious[index] = interS / baseS; + } + } else { + CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { + const int b1 = index / num_bbox2; + const int b2 = index % num_bbox2; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); + ious[index] = interS / baseS; + } + } +} + +#if __CUDA_ARCH__ >= 530 +__device__ __forceinline__ __half __half_area(const __half x1, const __half y1, + const __half x2, const __half y2, + const __half offset) { + const __half half_w = __hadd(__hsub(x2, x1), offset); + const __half half_h = __hadd(__hsub(y2, y1), offset); + return __hmul(half_w, half_h); +} + +__device__ __forceinline__ __half __half_max(const __half a, const __half b) { + return __hge(a, b) ? a : b; +} + +__device__ __forceinline__ __half __half_min(const __half a, const __half b) { + return __hle(a, b) ? a : b; +} + +// fp16 won't provide much increase when aligned==true. It is useful when +// aligned==false, which would give you ~40% bonus. 
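As a companion to the generic kernel above, here is a sketch of how `bbox_overlaps_cuda_kernel<float>` might be launched. This is illustrative only; the real launcher lives under `pytorch/cuda` and its block size may differ. Because the kernel body uses `CUDA_1D_KERNEL_LOOP`, it is grid-stride and tolerates any 1-D launch configuration; one logical thread is needed per output IoU.

```c++
// Hypothetical host-side launcher sketch (not the actual mmcv code); assumed
// to live in a .cu translation unit that already includes this header.
void BBoxOverlapsSketch(const float* bbox1, const float* bbox2, float* ious,
                        int num_bbox1, int num_bbox2, int mode, bool aligned,
                        int offset, cudaStream_t stream) {
  // aligned == true writes one IoU per corresponding box pair; otherwise a
  // full num_bbox1 x num_bbox2 matrix is written row-major into `ious`.
  const int output_size = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
  const int threads = 256;  // assumed block size
  const int blocks = (output_size + threads - 1) / threads;
  bbox_overlaps_cuda_kernel<float><<<blocks, threads, 0, stream>>>(
      bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
}
```

The `__half` specialization that follows targets the unaligned case, where, as the comment above notes, fp16 gives the most benefit.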
+__device__ void bbox_overlaps_cuda_kernel_half( + const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, + const int num_bbox2, const int mode, const bool aligned, const int offset) { + const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2; + const __half h_offset = __int2half_rn(offset); + CUDA_1D_KERNEL_LOOP(index, num_output) { + const int b1 = aligned ? index : index / num_bbox2; + const int b2 = aligned ? index : index % num_bbox2; + + const int base1 = b1 << 2; + __half b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); + + const int base2 = b2 << 2; + __half b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); + + const __half left = __half_max(b1_x1, b2_x1), + right = __half_min(b1_x2, b2_x2); + const __half top = __half_max(b1_y1, b2_y1), + bottom = __half_min(b1_y2, b2_y2); + const __half width = + __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); + const __half height = + __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); + const __half interS = __hmul(width, height); + + const __half baseS = __half_max( + mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, + h_offset); + ious[index] = __hdiv(interS, baseS); + } +} +#endif // __CUDA_ARCH__ >= 530 + +#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..537610416e16aae8979d0843972e090d127b0d43 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh @@ -0,0 +1,230 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu +#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH +#define BEZIER_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +template +__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3, + const T u) { + return ((1. - u) * (1. - u) * (1. - u) * p0 + + 3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. 
- u) * p2 + + u * u * u * p3); +} + +template +__global__ void bezier_align_forward_cuda_kernel( + const int nthreads, + const T *bottom_data, // inputs + const T *bottom_rois, // bottom rois contains the bezier curve + T *top_data, // outputs + const int pooled_height, const int pooled_width, const T spatial_scale, + const int sampling_ratio, bool aligned, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + // beziers have size Nx(1+8*2) = Nx17 + const T *offset_bottom_rois = bottom_rois + n * 17; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + + // TODO: avoid this by using parallel annotation, for good + T p0_x = offset_bottom_rois[1] * spatial_scale; + T p0_y = offset_bottom_rois[2] * spatial_scale; + T p1_x = offset_bottom_rois[3] * spatial_scale; + T p1_y = offset_bottom_rois[4] * spatial_scale; + T p2_x = offset_bottom_rois[5] * spatial_scale; + T p2_y = offset_bottom_rois[6] * spatial_scale; + T p3_x = offset_bottom_rois[7] * spatial_scale; + T p3_y = offset_bottom_rois[8] * spatial_scale; + T p4_x = offset_bottom_rois[15] * spatial_scale; + T p4_y = offset_bottom_rois[16] * spatial_scale; + T p5_x = offset_bottom_rois[13] * spatial_scale; + T p5_y = offset_bottom_rois[14] * spatial_scale; + T p6_x = offset_bottom_rois[11] * spatial_scale; + T p6_y = offset_bottom_rois[12] * spatial_scale; + T p7_x = offset_bottom_rois[9] * spatial_scale; + T p7_y = offset_bottom_rois[10] * spatial_scale; + + // compute the coords + const T u = pw / static_cast(pooled_width); + const T v = ph / static_cast(pooled_height); + const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); + const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); + const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); + const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); + const T x_center = x1 * v + x0 * (1. - v) - offset; + const T y_center = y1 * v + y0 * (1. - v) - offset; + + T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); + T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + // When the grid is empty, output zeros == 0/1, instead of NaN. + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = y_center - (T)0.5 * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = x_center - (T)0.5 * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, + index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +template +__global__ void bezier_align_backward_cuda_kernel( + const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int sampling_ratio, bool aligned, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + // beziers have size Nx(1+8*2) = Nx17 + const T *offset_bottom_rois = bottom_rois + n * 17; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T p0_x = offset_bottom_rois[1] * spatial_scale; + T p0_y = offset_bottom_rois[2] * spatial_scale; + T p1_x = offset_bottom_rois[3] * spatial_scale; + T p1_y = offset_bottom_rois[4] * spatial_scale; + T p2_x = offset_bottom_rois[5] * spatial_scale; + T p2_y = offset_bottom_rois[6] * spatial_scale; + T p3_x = offset_bottom_rois[7] * spatial_scale; + T p3_y = offset_bottom_rois[8] * spatial_scale; + T p4_x = offset_bottom_rois[15] * spatial_scale; + T p4_y = offset_bottom_rois[16] * spatial_scale; + T p5_x = offset_bottom_rois[13] * spatial_scale; + T p5_y = offset_bottom_rois[14] * spatial_scale; + T p6_x = offset_bottom_rois[11] * spatial_scale; + T p6_y = offset_bottom_rois[12] * spatial_scale; + T p7_x = offset_bottom_rois[9] * spatial_scale; + T p7_y = offset_bottom_rois[10] * spatial_scale; + + // compute the coords + const T u = pw / static_cast(pooled_width); + const T v = ph / static_cast(pooled_height); + const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); + const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); + const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); + const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); + const T x_center = x1 * v + x0 * (1. - v) - offset; + const T y_center = y1 * v + y0 * (1. - v) - offset; + + T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); + T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T *offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = y_center - (T)0.5 * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = x_center - (T)0.5 * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, + static_cast(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, + static_cast(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, + static_cast(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, + static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // BezierAlignBackward + +#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1d2a2197b45ef5c82412c4b75d7819a7e27674f6 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh @@ -0,0 +1,200 @@ +// Copyright (c) OpenMMLab. All rights reserved +// modified from +// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu. +// the main difference: (1) use `argmax_idx` for fast computing of gradient +// during the backward. (2) `wh` is directly computed by `boxes`, rather than +// passing it as argument to forward or backward functions. 
+ +#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH +#define BORDER_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 }; + +/*** Forward ***/ +template +__global__ void border_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* boxes, T* output, + int* argmax_idx, const int channels, const int box_size, const int height, + const int width, const int pool_size) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (batch_idx, c_idx, box_idx) is an element paralleled for computing + // output, and `extreme_idx` is in range [0,3] + int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx; + const T *offset_box, *offset_input, *offset_box_x; + T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y, + val, maxval; + + extreme_idx = threadIdx.y; + // shape (N, C, box_size, 4) for output + batch_idx = index / channels / box_size; + // shape (N, box_size, 4) for boxes + box_idx = index % box_size + batch_idx * box_size; + c_idx = (index / box_size) % channels; + + offset_box = boxes + box_idx * 4; + box_width = *(offset_box + 2) - *offset_box; + box_height = *(offset_box + 3) - *(offset_box + 1); + offset_output = output + index * 4 + extreme_idx; + offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; + // shape (N, 4C, h, w) for input. + // [0,C) for top feature, [C,2C) for left feature, + // [2C,3C) for bottom feature, [3C,4C) for right feature + offset_input = + input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * + height * width; + + // extreme_idx in [0,1] -> offset_box_x indexed at x1 + // extreme_idx in [2,3] -> offset_box_x indexed at x2 + offset_box_x = offset_box + extreme_idx / 2 * 2; + + // (x1,y1) or (x2,y2) for (x,y) + x = *offset_box_x; + y = *(offset_box_x + 1); + + switch (extreme_idx) { + // top + case BorderMode::Top: + stride = box_width / pool_size; + x_stride = stride; + y_stride = 0; + break; + // left + case BorderMode::Left: + stride = box_height / pool_size; + x_stride = 0; + y_stride = stride; + break; + // bottom + case BorderMode::Bottom: + stride = box_width / pool_size; + x_stride = -stride; + y_stride = 0; + break; + // right + case BorderMode::Right: + stride = box_height / pool_size; + x_stride = 0; + y_stride = -stride; + break; + } + + // initialize maxval and maxidx with the start position (e.g. 
(x1,y1) or + // (x2,y2)) + maxval = bilinear_interpolate(offset_input, height, width, y, x, index); + maxidx = 0; + + // do max_pool along the border + for (int i = 1; i <= pool_size; i++) { + x += x_stride; + y += y_stride; + val = bilinear_interpolate(offset_input, height, width, y, x, index); + if (val > maxval) { + maxval = val; + maxidx = i; + } + } + + // update output and argmax_idx + *offset_output = maxval; + *offset_argmax_idx = maxidx; + } +} + +/*** Backward ***/ +template +__global__ void border_align_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* boxes, + const int* argmax_idx, T* grad_input, const int channels, + const int box_size, const int height, const int width, + const int pool_size) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (batch_idx, c_idx, box_idx) is an element paralleled for computing + // output, and `extreme_idx` is in range [0,3] + int batch_idx, c_idx, box_idx, extreme_idx; + const int* offset_argmax_idx; + const T *offset_grad_output, *offset_box, *offset_box_x; + T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x, + y; + + extreme_idx = threadIdx.y; + batch_idx = index / channels / box_size; + box_idx = index % box_size + batch_idx * box_size; + c_idx = (index / box_size) % channels; + + offset_box = boxes + box_idx * 4; + box_width = *(offset_box + 2) - *offset_box; + box_height = *(offset_box + 3) - *(offset_box + 1); + offset_grad_output = grad_output + index * 4 + extreme_idx; + offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; + // [0,C) for top feature grad, [C,2C) for left feature grad, + // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad + offset_grad_input = grad_input + (batch_idx * channels * 4 + + extreme_idx * channels + c_idx) * + height * width; + + // extreme_idx in [0,1] -> offset_box_x indexed at x1 + // extreme_idx in [2,3] -> offset_box_x indexed at x2 + offset_box_x = offset_box + extreme_idx / 2 * 2; + + switch (extreme_idx) { + // top + case BorderMode::Top: + stride = box_width / pool_size; + x_stride = stride; + y_stride = 0; + break; + // left + case BorderMode::Left: + stride = box_height / pool_size; + x_stride = 0; + y_stride = stride; + break; + // bottom + case BorderMode::Bottom: + stride = box_width / pool_size; + x_stride = -stride; + y_stride = 0; + break; + // right + case BorderMode::Right: + stride = box_height / pool_size; + x_stride = 0; + y_stride = -stride; + break; + } + + // get position (x,y) which has maximum value during forward + x = *offset_box_x; + y = *(offset_box_x + 1); + x += x_stride * (T)(*offset_argmax_idx); + y += y_stride * (T)(*offset_argmax_idx); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, + x_high, y_low, y_high, index); + + // update grad_output + atomicAdd(offset_grad_input + y_low * width + x_low, + *offset_grad_output * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + *offset_grad_output * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + *offset_grad_output * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + *offset_grad_output * w4); + } +} + +#endif // BORDER_ALIGN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..cf8ad5e1a324de3a11c8fc8af28a8d559a661ed6 --- /dev/null +++ 
b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh @@ -0,0 +1,91 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#ifndef BOX_IOU_QUADRI_CUDA_CUH +#define BOX_IOU_QUADRI_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +template +__global__ void box_iou_quadri_cuda_kernel( + const int n_boxes1, const int n_boxes2, const T* dev_boxes1, + const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, n_boxes1) { + int b1 = index; + int b2 = index; + + int base1 = b1 * 8; + + float block_boxes1[8]; + float block_boxes2[8]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + block_boxes1[5] = dev_boxes1[base1 + 5]; + block_boxes1[6] = dev_boxes1[base1 + 6]; + block_boxes1[7] = dev_boxes1[base1 + 7]; + + int base2 = b2 * 8; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + block_boxes2[5] = dev_boxes2[base2 + 5]; + block_boxes2[6] = dev_boxes2[base2 + 6]; + block_boxes2[7] = dev_boxes2[base2 + 7]; + + dev_ious[index] = + single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); + } + } else { + CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { + int b1 = index / n_boxes2; + int b2 = index % n_boxes2; + + int base1 = b1 * 8; + + float block_boxes1[8]; + float block_boxes2[8]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + block_boxes1[5] = dev_boxes1[base1 + 5]; + block_boxes1[6] = dev_boxes1[base1 + 6]; + block_boxes1[7] = dev_boxes1[base1 + 7]; + + int base2 = b2 * 8; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + block_boxes2[5] = dev_boxes2[base2 + 5]; + block_boxes2[6] = dev_boxes2[base2 + 6]; + block_boxes2[7] = dev_boxes2[base2 + 7]; + + dev_ious[index] = + single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); + } + } +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..abd47cd85437804310886de057b5a839a49481b2 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh @@ -0,0 +1,81 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu +#ifndef BOX_IOU_ROTATED_CUDA_CUH +#define BOX_IOU_ROTATED_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +template +__global__ void box_iou_rotated_cuda_kernel( + const int n_boxes1, const int n_boxes2, const T* dev_boxes1, + const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, n_boxes1) { + int b1 = index; + int b2 = index; + + int base1 = b1 * 5; + + float block_boxes1[5]; + float block_boxes2[5]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + + int base2 = b2 * 5; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + + dev_ious[index] = + single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); + } + } else { + CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { + int b1 = index / n_boxes2; + int b2 = index % n_boxes2; + + int base1 = b1 * 5; + + float block_boxes1[5]; + float block_boxes2[5]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + + int base2 = b2 * 5; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + + dev_ious[index] = + single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); + } + } +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..311900fcd303483dea815a1eb996a7eb33fdc55b --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh @@ -0,0 +1,335 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef CARAFE_CUDA_KERNEL_CUH +#define CARAFE_CUDA_KERNEL_CUH + +#include + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#ifdef MMCV_WITH_HIP +#define WARP_SIZE 64 +#else +#define WARP_SIZE 32 +#endif +#define THREADS_PER_PIXEL 32 +#define MAX_SHARED_MEMORY 49152 +#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 +#define MAXIMIZE_KERNEL_SIZE true +#define kTileDim 32 +#define kBlockRows 8 +#define FULL_MASK 0xffffffff + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +__device__ inline int Loc2Index(const int n, const int c, const int h, + const int w, const int channel_num, + const int height, const int width) { + int index = w + (h + (c + n * channel_num) * height) * width; + return index; +} +#ifndef MMCV_WITH_HIP +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} +#endif +template +__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) +#ifdef MMCV_WITH_HIP + val += __shfl_down(val, offset); +#else + val += __shfl_down_sync(FULL_MASK, val, offset); +#endif + return val; +} + +template <> +__device__ __forceinline__ phalf warpReduceSum(phalf val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) +#ifdef MMCV_WITH_HIP + // Using PyTorch's macro for half support + __PHALF(val) += WARP_SHFL_DOWN(val, offset); +#else + __PHALF(val) += + __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); +#endif + return val; +} + +// Splits the original matrix into submatrices with size 32 * 32. +// Each block transposes one submatrix by loading it into shared memory. 
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTranspose2DCUDAKernel(const int N, const int H, + const int W, const int dh, + const int dw, + const scalar_t *__restrict__ X, + scalar_t *__restrict__ Y) { + __shared__ scalar_t tile[kTileDim][kTileDim + 1]; + const int n = blockIdx.x / (dh * dw); + const int k = blockIdx.x % (dh * dw); + const int r = k / dw; + const int c = k % dw; + const int offset = n * H * W; + int x = c * kTileDim + threadIdx.x; + int y = r * kTileDim + threadIdx.y; + if (x < W) { + for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { + tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; + } + } + __syncthreads(); + x = r * kTileDim + threadIdx.x; + y = c * kTileDim + threadIdx.y; + if (x < H) { + for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { + Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; + } + } +} +template +__global__ void CARAFEForward( + const int num_kernels, const scalar_t *__restrict__ bottom_data, + const scalar_t *__restrict__ bottom_masks, const int kernel_size, + const int group_size, const int scale_factor, const int channels, + const int down_height, const int down_width, const int height, + const int width, const int mask_channels, scalar_t *__restrict__ top_data) { +#if MAXIMIZE_KERNEL_SIZE + __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; +#else + __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; +#endif + + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + index = index / THREADS_PER_PIXEL; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int down_pw = pw / scale_factor; + const int down_ph = ph / scale_factor; + + const int start_w = down_pw - (kernel_size - 1) / 2; + const int end_w = down_pw + (kernel_size - 1) / 2 + 1; + const int start_h = down_ph - (kernel_size - 1) / 2; + const int end_h = down_ph + (kernel_size - 1) / 2 + 1; + for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { + int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels); + shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; + } + __syncthreads(); + + const int channels_per_group = ceilf(channels / (float)group_size); +#pragma unroll + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + int mask_group = c / channels_per_group; + scalar_t output_val = 0; +#pragma unroll + for (int iy = start_h; iy < end_h; iy++) { +#pragma unroll + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = + Loc2Index(n, iy, ix, c, down_height, down_width, channels); + + output_val += bottom_data[feat_index] * + shared_mask[mask_c * WARP_SIZE + pixel_id]; + } + } + + int top_index = Loc2Index(n, ph, pw, c, height, width, channels); + top_data[top_index] = output_val; + } +} + +template +__global__ void CARAFEBackward_Feature( + const int num_kernels, const scalar_t *__restrict__ top_diff, + const scalar_t *__restrict__ bottom_masks, const int kernel_size, + const int 
group_size, const int scale_factor, const int channels, + const int down_height, const int down_width, const int height, + const int width, const int mask_channels, + scalar_t *__restrict__ bottom_diff) { +#if MAXIMIZE_KERNEL_SIZE + __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; +#else + __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; +#endif + + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + + const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + // (n, c, ph, pw) is an element in the bottom_data + index = index / THREADS_PER_PIXEL; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int start_w = pw - (kernel_size - 1) * scale_factor / 2; + const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1; + const int start_h = ph - (kernel_size - 1) * scale_factor / 2; + const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1; + for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { + const int mask_w = (c % kernel_size) * scale_factor; + const int mask_h = (c / kernel_size % kernel_size) * scale_factor; + const int mask_x = start_w + mask_w; + const int mask_y = start_h + mask_h; + if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) { + shared_mask[c * WARP_SIZE + pixel_id] = 0; + continue; + } + const int mask_group = c / (kernel_size * kernel_size); + const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1; + int mask_index = + Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width); + shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; + } + __syncthreads(); + const int channels_per_group = ceilf(channels / (float)group_size); +#pragma unroll + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + int mask_group = c / channels_per_group; + int top_index = Loc2Index(n, ph, pw, c, height, width, channels); + scalar_t output_val = 0; +#pragma unroll + for (int iy = start_h; iy < end_h; iy += scale_factor) { +#pragma unroll + for (int ix = start_w; ix < end_w; ix += scale_factor) { + if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) { + continue; + } + int mask_iy = + (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor; + int mask_ix = + (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = Loc2Index(n, iy, ix, c, height, width, channels); + output_val += + shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index]; + } + } + bottom_diff[top_index] = output_val; + } +} + +template +__global__ void FeatureSum(const int num_kernels, + const scalar_t *__restrict__ input_data, + const int scale_factor, const int channels, + const int height, const int width, + scalar_t *__restrict__ output_data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + index = index / THREADS_PER_PIXEL; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + scalar_t output_val = 0; + for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) { + for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) { + int input_id = Loc2Index(n, iy, 
ix, c, height * scale_factor, + width * scale_factor, channels); + output_val += input_data[input_id]; + } + } + const int output_id = Loc2Index(n, ph, pw, c, height, width, channels); + output_data[output_id] = output_val; + } +} + +template +__global__ void CARAFEBackward_Mask(const int num_kernels, + const scalar_t *__restrict__ top_diff, + const scalar_t *__restrict__ bottom_data, + const int kernel_size, const int group_size, + const int scale_factor, const int channels, + const int down_height, const int down_width, + const int height, const int width, + const int mask_channels, + scalar_t *__restrict__ mask_diff) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + + const int lane_id = index % WARP_SIZE; + index = index / WARP_SIZE; + const int mask_c = index % mask_channels; + // (n, c, ph, pw) is an element in the bottom_data + index = index / mask_channels; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int down_pw = pw / scale_factor; + const int down_ph = ph / scale_factor; + + const int mask_group = mask_c / (kernel_size * kernel_size); + const int mask_loc = mask_c % (kernel_size * kernel_size); + + const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2; + const int offset_y = + mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2; + + const int down_x = down_pw + offset_x; + const int down_y = down_ph + offset_y; + + scalar_t output_val = 0; + + if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 && + down_x <= down_width - 1) { + const int channels_per_mask = ceilf(channels / (float)group_size); + const int start = channels_per_mask * mask_group; + const int end = min(channels_per_mask * (mask_group + 1), channels); + for (int c = start + lane_id; c < end; c += WARP_SIZE) { + int bottom_id = + Loc2Index(n, down_y, down_x, c, down_height, down_width, channels); + int top_id = Loc2Index(n, ph, pw, c, height, width, channels); + output_val += top_diff[top_id] * bottom_data[bottom_id]; + } + } +#ifdef MMCV_WITH_HIP + __syncthreads(); +#else + __syncwarp(); +#endif + output_val = warpReduceSum(output_val); + if (lane_id == 0) { + const int mask_id = + Loc2Index(n, ph, pw, mask_c, height, width, mask_channels); + mask_diff[mask_id] = output_val; + } +} + +#endif // CARAFE_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..48230c632f223b736aa72a9d5fd682c97b3aa93a --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh @@ -0,0 +1,111 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH +#define CARAFE_NAIVE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +__device__ inline int Loc2Index(const int n, const int c, const int h, + const int w, const int channel_num, + const int height, const int width) { + int index = w + (h + (c + n * channel_num) * height) * width; + return index; +} + +template +__global__ void carafe_naive_forward_cuda_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size, + const int group_size, const int scale_factor, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the bottom_data + int pw = index % width; + int ph = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + int mask_channels = kernel_size * kernel_size * group_size; + int mask_group = c / (channels / group_size); + + int down_pw = pw / scale_factor; + int down_ph = ph / scale_factor; + int down_width = width / scale_factor; + int down_height = height / scale_factor; + int start_w = down_pw - (kernel_size - 1) / 2; + int end_w = down_pw + (kernel_size - 1) / 2 + 1; + int start_h = down_ph - (kernel_size - 1) / 2; + int end_h = down_ph + (kernel_size - 1) / 2 + 1; + + scalar_t output_val = 0; + for (int iy = start_h; iy < end_h; iy++) { + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = + Loc2Index(n, c, iy, ix, channels, down_height, down_width); + int mask_index = + Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); + output_val += bottom_data[feat_index] * bottom_masks[mask_index]; + } + } + top_data[index] = output_val; + } +} + +template +__global__ void carafe_naive_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data, + const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff, + const int kernel_size, const int group_size, const int scale_factor, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the bottom_data + int pw = index % width; + int ph = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + int mask_channels = kernel_size * kernel_size * group_size; + int mask_group = c / (channels / group_size); + + int down_pw = pw / scale_factor; + int down_ph = ph / scale_factor; + int down_width = width / scale_factor; + int down_height = height / scale_factor; + int start_w = down_pw - (kernel_size - 1) / 2; + int end_w = down_pw + (kernel_size - 1) / 2 + 1; + int start_h = down_ph - (kernel_size - 1) / 2; + int end_h = down_ph + (kernel_size - 1) / 2 + 1; + + for (int iy = start_h; iy < end_h; iy++) { + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + 
mask_ix; + int feat_index = + Loc2Index(n, c, iy, ix, channels, down_height, down_width); + int mask_index = + Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); + atomicAdd(bottom_diff + feat_index, + bottom_masks[mask_index] * top_diff[index]); + atomicAdd(mask_diff + mask_index, + bottom_data[feat_index] * top_diff[index]); + } + } + } +} + +#endif // CARAFE_NAIVE_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..89feea4a546a5093967f26393ca6be3b9fe6ae05 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh @@ -0,0 +1,101 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu +#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH +#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 + +template +__global__ void chamfer_distance_forward_cuda_kernel(int b, int n, + const scalar_t* xyz, int m, + const scalar_t* xyz2, + scalar_t* result, + int* result_i) { + __shared__ scalar_t buf[MAX_SHARED_SCALAR_T]; + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) { + int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2; + for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) { + buf[j] = xyz2[(i * m + k2) * 2 + j]; + } + __syncthreads(); + for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { + scalar_t x1 = xyz[(i * n + j) * 2 + 0]; + scalar_t y1 = xyz[(i * n + j) * 2 + 1]; + int best_i = 0; + scalar_t best = 1e10; + int end_ka = end_k & (~2); + if (end_ka == THREADS_PER_BLOCK) { + for (int k = 0; k < THREADS_PER_BLOCK; k += 4) { +#pragma unroll + for (int j = 0; j < 4; ++j) { + scalar_t x2 = buf[(k + j) * 2] - x1; + scalar_t y2 = buf[(k + j) * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (d < best) { + best = d; + best_i = k + k2 + j; + } + } + } + } else { + for (int k = 0; k < end_ka; k += 4) { +#pragma unroll + for (int j = 0; j < 4; ++j) { + scalar_t x2 = buf[(k + j) * 2] - x1; + scalar_t y2 = buf[(k + j) * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (d < best) { + best = d; + best_i = k + k2 + j; + } + } + } + } + for (int k = end_ka; k < end_k; k++) { + scalar_t x2 = buf[k * 2 + 0] - x1; + scalar_t y2 = buf[k * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (k == 0 || d < best) { + best = d; + best_i = k + k2; + } + } + if (k2 == 0 || result[(i * n + j)] > best) { + result[(i * n + j)] = best; + result_i[(i * n + j)] = best_i; + } + } + __syncthreads(); + } + } +} + +template +__global__ void chamfer_distance_backward_cuda_kernel( + int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2, + const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1, + scalar_t* grad_xyz2) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { + scalar_t x1 = xyz1[(i * n + j) * 2 + 0]; + scalar_t y1 = xyz1[(i * n + j) * 2 + 1]; + int j2 = idx1[i * n + j]; + scalar_t x2 = xyz2[(i * m + j2) * 2 + 0]; + scalar_t y2 = xyz2[(i * m + j2) * 2 + 1]; + scalar_t g = grad_dist1[i * n + j] * 2; + atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2)); + 
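+      // For the squared distance d = (x1 - x2)^2 + (y1 - y2)^2 the chain rule
+      // gives dd/dx1 = 2 * (x1 - x2), which is why g above carries the factor
+      // 2 * grad_dist1; the matching y-component and the negated scatter into
+      // grad_xyz2 follow. atomicAdd is used because several points of xyz1 may
+      // share the same nearest neighbour j2, so their grad_xyz2 updates collide.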
atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2)); + atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2))); + atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2))); + } + } +} +#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/external/cv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b12aa9a26a2cc162fd89f68ccc97e17749090a41 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp @@ -0,0 +1,120 @@ +#ifndef COMMON_CUDA_HELPER +#define COMMON_CUDA_HELPER + +#include + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) \ + for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ + j += blockDim.y * gridDim.y) + +#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \ + for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \ + for (size_t j = blockIdx.y; j < (m); j += gridDim.y) + +#define THREADS_PER_BLOCK 512 + +inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) { + int optimal_block_num = (N + num_threads - 1) / num_threads; + int max_block_num = 4096; + return min(optimal_block_num, max_block_num); +} + +template +__device__ T bilinear_interpolate(const T* input, const int height, + const int width, T y, T x, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = input[y_low * width + x_low]; + T v2 = input[y_low * width + x_high]; + T v3 = input[y_high * width + x_low]; + T v4 = input[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__device__ void bilinear_interpolate_gradient( + const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, + int& x_low, int& x_high, int& y_low, int& y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} +#endif // COMMON_CUDA_HELPER diff --git a/external/cv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2af96f7963ec347486ced942a5ef7cc4f187db8b --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh @@ -0,0 +1,831 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef CONVEX_IOU_CUDA_KERNEL_CUH +#define CONVEX_IOU_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 100 +#define NMAX 512 +__device__ const double EPS = 1E-8; + +__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); } + +struct Point { + double x, y; + __device__ Point() {} + __device__ Point(double x, double y) : x(x), y(y) {} +}; + +__device__ inline bool point_same(Point& a, Point& b) { + return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0; +} + +__device__ inline void swap1(Point* a, Point* b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} + +__device__ inline void reverse1(Point* a, const int n) { + for (int i = 0; i < (n - 1) / 2.0; i++) { + Point* j = &(a[i]); + Point* k = &(a[n - 1 - i]); + swap1(j, k); + } +} + +__device__ inline double cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline double dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline double area(Point* ps, int n) { + ps[n] = ps[0]; + double res = 0; + for (int i = 0; i < n; i++) { + res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; + } + return res / 2.0; +} +__device__ inline double polygon_area_grad(Point* ps, int n, + int* polygon_to_pred_index, + int n_pred, double* grad_C) { + ps[n] = ps[0]; + double partion_grad[4 * 30 + 2]; + double res = 0; + for (int i = 0; i < n; i++) { + res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; + partion_grad[i * 4 + 2] = ps[i + 1].y; + partion_grad[i * 4 + 3] = -ps[i + 1].x; + if (i != n - 1) { + partion_grad[i * 4 + 4] = -ps[i].y; + partion_grad[i * 4 + 5] = ps[i].x; + } else { + partion_grad[0] = -ps[i].y; + partion_grad[1] = ps[i].x; + } + } + for (int i = 0; i < n; i++) { + for (int j = 0; j < n_pred; j++) { + if (i == polygon_to_pred_index[j]) { + grad_C[2 * polygon_to_pred_index[j + n_pred]] = + (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2; + break; + } + } + for (int j = 0; j < n_pred; j++) { + if (i == polygon_to_pred_index[j]) { + grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] = + (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2; + break; + } + } + } + + return res / 2.0; +} + +__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p, + double* cut_grad, int m, int n, int i) { + double s1, s2; + double s2_s1_2; + double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd; + double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd; + s1 = cross(a, b, c); + s2 = cross(a, b, d); + + ds1_dxc = -(b.y - a.y); + ds1_dyc = b.x - a.x; + ds2_dxd = ds1_dxc; + 
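+  // s1 and s2 are the cross products (b - a) x (c - a) and (b - a) x (d - a);
+  // each depends on its third point only through the fixed direction (b - a),
+  // so the derivative of s2 w.r.t. d.x equals that of s1 w.r.t. c.x (same for y).
+  // The intersection of segment cd with line ab is p = (c * s2 - d * s1) / (s2 - s1);
+  // the dxp_* / dyp_* terms below are its quotient-rule derivatives, with
+  // s2_s1_2 caching the squared denominator (s2 - s1)^2.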
ds2_dyd = ds1_dyc; + s2_s1_2 = (s2 - s1) * (s2 - s1); + + if (sig(s1) == 0 && sig(s2) == 0) return 2; + if (sig(s2 - s1) == 0) return 0; + + dxp_dxc = + ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) / + (s2_s1_2); + dxp_dyc = + ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) / + (s2_s1_2); + dxp_dxd = + ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) / + (s2_s1_2); + dxp_dyd = + ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) / + (s2_s1_2); + + dyp_dxc = + ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) / + (s2_s1_2); + dyp_dyc = + ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) / + (s2_s1_2); + dyp_dxd = + ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) / + (s2_s1_2); + dyp_dyd = + ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) / + (s2_s1_2); + + p.x = (c.x * s2 - d.x * s1) / (s2 - s1); + p.y = (c.y * s2 - d.y * s1) / (s2 - s1); + if (i == n - 1) { + cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; + cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; + cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; + cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; + cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd; + cut_grad[4 * n * m + 1] = dyp_dxd; + cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd; + cut_grad[4 * n * m + 3] = dyp_dyd; + } else { + cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; + cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; + cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; + cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; + cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd; + cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd; + cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd; + cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd; + } + + return 1; +} +__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b, + double* cut_grad) { + Point pp[MAXN]; + double ccur_grad[MAXN] = {}; + int m = 0; + p[n] = p[0]; + int k = n; + for (int i = 0; i < n; i++) { + if (sig(cross(a, b, p[i])) > 0) { + pp[m] = p[i]; + ccur_grad[4 * n * m + 4 * i] = 1.0; + ccur_grad[4 * n * m + 4 * i + 3] = 1.0; + m++; + } + if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { + lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i); + m++; + } + } + + n = 0; + for (int i = 0; i < m; i++) { + if (!i || !(point_same(pp[i], pp[i - 1]))) { + p[n] = pp[i]; + for (int j = 0; j < 4 * k; j++) { + cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j]; + } + n++; + } + } + + while (n > 1 && point_same(p[n - 1], p[0])) n--; +} + +__device__ inline double intersectArea(Point a, Point b, Point c, Point d, + double* grad_AB, int order, + int convex_n) { + Point o(0, 0); + int res_flag = 0; + int s1 = sig(cross(o, a, b)); + int s2 = sig(cross(o, c, d)); + if (s1 == 0 || s2 == 0) return 0.0; + if (s1 == -1) { + Point* i = &a; + Point* j = &b; + swap1(i, j); + res_flag = 1; + } + if (s2 == -1) { + Point* i = &c; + Point* j = &d; + swap1(i, j); + } + Point p[10] = {o, a, b}; + int n = 3, n0 = 3, n1, n2, n3; + double cut_grad1[MAXN] = {}; + double cut_grad2[MAXN] = {}; + double cut_grad3[MAXN] = {}; + double p1_p_grad[10][10] = {}; + double p2_p1_grad[10][10] = {}; + double p3_p2_grad[10][10] = {}; + + double p3_p1_grad[10][10] = {}; + double p3_p_grad[10][10] = {}; + + // 1 + polygon_cut(p, n, o, c, cut_grad1); + n1 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n0; j++) { + if (!(j % 2)) { + 
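+        // cut_grad1 stores, per cut-polygon vertex, a group of four derivatives
+        // for each input vertex: dx/dx_in, dy/dx_in, dx/dy_in, dy/dy_in. Even
+        // offsets j are derivatives of an output x-coordinate, odd offsets (else
+        // branch) of an output y-coordinate; the loop unpacks the flat buffer into
+        // the (2 * n1) x (2 * n0) Jacobian p1_p_grad of the cut polygon w.r.t.
+        // the initial triangle {o, a, b}.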
p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j]; + } else { + p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j]; + } + } + } + + // 2 + polygon_cut(p, n, c, d, cut_grad2); + n2 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n1; j++) { + if (!(j % 2)) { + p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j]; + } else { + p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j]; + } + } + } + // 3 + polygon_cut(p, n, d, o, cut_grad3); + n3 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n2; j++) { + if (!(j % 2)) { + p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j]; + } else { + p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j]; + } + } + } + + // mul + // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1) + for (int i = 0; i < 2 * n3; i++) { + for (int j = 0; j < 2 * n1; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n2; m++) { + sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j]; + } + p3_p1_grad[i][j] = sum; + } + } + + // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0) + for (int i = 0; i < 2 * n3; i++) { + for (int j = 0; j < 2 * n0; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n1; m++) { + sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j]; + } + p3_p_grad[i][j] = sum; + } + } + + // calculate S_grad + int polygon_index_box_index[20]; + double grad_polygon[20]; + double S_grad[6]; + + for (int i = 0; i < n3; i++) { + polygon_index_box_index[i] = i; + polygon_index_box_index[i + n3] = i; + } + + double res = + polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon); + + if (s1 * s2 == -1) { + for (int j = 0; j < 2 * 3; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n3; m++) { + sum = sum - grad_polygon[m] * p3_p_grad[m][j]; + } + S_grad[j] = sum; + } + + if (order != convex_n - 1) { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[2 * order + 2] += S_grad[2]; + grad_AB[2 * order + 3] += S_grad[3]; + + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[2 * order + 2] += S_grad[4]; + grad_AB[2 * order + 3] += S_grad[5]; + } + } else { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[0] += S_grad[2]; + grad_AB[1] += S_grad[3]; + + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[0] += S_grad[4]; + grad_AB[1] += S_grad[5]; + } + } + res = -res; + } else { + for (int j = 0; j < 2 * 3; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n3; m++) { + sum = sum + grad_polygon[m] * p3_p_grad[m][j]; + } + S_grad[j] = sum; + } + + if (order != convex_n - 1) { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[2 * order + 2] += S_grad[2]; + grad_AB[2 * order + 3] += S_grad[3]; + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[2 * order + 2] += S_grad[4]; + grad_AB[2 * order + 3] += S_grad[5]; + } + } else { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[0] += S_grad[2]; + grad_AB[1] += S_grad[3]; + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[0] += S_grad[4]; + grad_AB[1] += S_grad[5]; + } + } + } + return res; +} + +__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2, + double* grad_AB) { + if (area(ps1, n1) < 0) reverse1(ps1, n1); + if (area(ps2, n2) < 0) reverse1(ps2, n2); + ps1[n1] = ps1[0]; + ps2[n2] = 
ps2[0]; + double res = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + res += + intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1); + } + } + return res; +} + +__device__ inline void Jarvis(Point* in_poly, int& n_poly) { + Point p_max, p_k; + int max_index, k_index; + int Stack[NMAX] = {}, top1, top2; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point* j = &(in_poly[0]); + Point* k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]]; + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]]; + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2, + int n2, double* grad_C) { + Point polygon[MAXN]; + int n = n1 + n2, n_poly = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n - n1; j++) { + if (point_same(ps1[i], ps2[j])) { + for (int k = j; k < n - n1 - 1; k++) { + ps2[k] = ps2[k + 1]; + } + n2--; + break; + } + } + } + n_poly = n1 + n2; + for (int i = 0; i < n_poly; i++) { + if (i < n1) { + polygon[i] = ps1[i]; + } else { + polygon[i] = ps2[i - n1]; + } + } + + Jarvis(polygon, n_poly); + + int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1}; + int n_pred = 0; + for (int i = 0; i < n_poly; i++) { + for (int j = 0; j < n1; j++) { + if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) { + polygon_to_pred_index[n_pred] = i; + polygon_to_pred_index[n_pred + n1] = j; + n_pred += 1; + break; + } + } + } + if (n_pred == 0) { + double polygon_area = fabs(area(polygon, n_poly)); + for (int i = 0; i < 18; i++) { + grad_C[i] = 0.0; + } + return polygon_area; + } else { + double polygon_area = + polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C); + if (polygon_area < 0) { + for (int i = 0; i < 18; i++) { + grad_C[i] = -grad_C[i]; + } + } + return fabs(polygon_area); + } +} + +// convex_find and get the polygon_index_box_index +__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly, + int* points_to_convex_ind) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; 
i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point* j = &(in_poly[0]); + Point* k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; + for (int i = 0; i < n_poly; i++) { + for (int j = 0; j < n_input; j++) { + if (point_same(in_poly[i], input_poly[j])) { + points_to_convex_ind[i] = j; + break; + } + } + } +} + +template +__device__ inline float devrIoU(T const* const p, T const* const q, + T* point_grad, const int idx) { + Point ps1[MAXN], ps2[MAXN]; + + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = (double)p[i * 2]; + convex[i].y = (double)p[i * 2 + 1]; + } + int n_convex = 9; + int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; + Jarvis_and_index(convex, n_convex, points_to_convex_ind); + + int n1 = n_convex; + int n2 = 4; + + for (int i = 0; i < n1; i++) { + ps1[i].x = (double)convex[i].x; + ps1[i].y = (double)convex[i].y; + } + + for (int i = 0; i < n2; i++) { + ps2[i].x = (double)q[i * 2]; + ps2[i].y = (double)q[i * 2 + 1]; + } + + int polygon_index_box_index[18]; + for (int i = 0; i < n1; i++) { + polygon_index_box_index[i] = i; + polygon_index_box_index[i + n1] = i; + } + + double grad_A[18] = {}; + double grad_AB[18] = {}; + double grad_C[18] = {}; + + double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB); + double S_pred = + polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A); + if (S_pred < 0) { + for (int i = 0; i < n_convex * 2; i++) { + grad_A[i] = -grad_A[i]; + } + } + double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; + + double iou = inter_area / union_area; + double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C); + + // printf("%d:live\n", idx); + double rot_giou = iou - (polygon_area - union_area) / polygon_area; + + float grad_point_temp[18] = {}; + + for (int i = 0; i < n_convex; i++) { + int grad_point = 
points_to_convex_ind[i]; + grad_point_temp[2 * grad_point] = + (float)((union_area + inter_area) / (union_area * union_area) * + grad_AB[2 * i] - + iou / union_area * grad_A[2 * i] - + 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) - + (union_area) / polygon_area / polygon_area * grad_C[2 * i]); + grad_point_temp[2 * grad_point + 1] = + (float)((union_area + inter_area) / (union_area * union_area) * + grad_AB[2 * i + 1] - + iou / union_area * grad_A[2 * i + 1] - + 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) - + (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]); + } + + for (int i = 0; i < 9; i++) { + point_grad[2 * i] = grad_point_temp[2 * i]; + point_grad[2 * i + 1] = grad_point_temp[2 * i + 1]; + } + return (float)rot_giou; +} + +template +__global__ void convex_giou_cuda_kernel(const int ex_n_boxes, + const int gt_n_boxes, const T* ex_boxes, + const T* gt_boxes, T* point_grad) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T* cur_box = ex_boxes + index * 18; + const T* cur_gt_box = gt_boxes + index * 8; + T* cur_grad = point_grad + index * 19; + T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x); + cur_grad[18] = giou; + } +} + +__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) { + double s1, s2; + s1 = cross(a, b, c); + s2 = cross(a, b, d); + if (sig(s1) == 0 && sig(s2) == 0) return 2; + if (sig(s2 - s1) == 0) return 0; + p.x = (c.x * s2 - d.x * s1) / (s2 - s1); + p.y = (c.y * s2 - d.y * s1) / (s2 - s1); + return 1; +} + +__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) { + Point pp[MAXN]; + int m = 0; + p[n] = p[0]; + for (int i = 0; i < n; i++) { + if (sig(cross(a, b, p[i])) > 0) { + pp[m] = p[i]; + m++; + } + if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { + lineCross(a, b, p[i], p[i + 1], pp[m]); + m++; + } + } + n = 0; + for (int i = 0; i < m; i++) { + if (!i || !(point_same(pp[i], pp[i - 1]))) { + p[n] = pp[i]; + n++; + } + } + + while (n > 1 && point_same(p[n - 1], p[0])) n--; +} + +__device__ inline double intersectArea(Point a, Point b, Point c, Point d) { + Point o(0, 0); + int s1 = sig(cross(o, a, b)); + int s2 = sig(cross(o, c, d)); + if (s1 == 0 || s2 == 0) return 0.0; + if (s1 == -1) { + Point* i = &a; + Point* j = &b; + swap1(i, j); + } + if (s2 == -1) { + Point* i = &c; + Point* j = &d; + swap1(i, j); + } + Point p[10] = {o, a, b}; + int n = 3; + + polygon_cut(p, n, o, c); + polygon_cut(p, n, c, d); + polygon_cut(p, n, d, o); + double res = area(p, n); + if (s1 * s2 == -1) res = -res; + return res; +} +__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, + int n2) { + if (area(ps1, n1) < 0) reverse1(ps1, n1); + if (area(ps2, n2) < 0) reverse1(ps2, n2); + ps1[n1] = ps1[0]; + ps2[n2] = ps2[0]; + double res = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]); + } + } + return res; +} + +template +__device__ inline float devrIoU(T const* const p, T const* const q) { + Point ps1[MAXN], ps2[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = (double)p[i * 2]; + convex[i].y = (double)p[i * 2 + 1]; + } + int n_convex = 9; + int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; + Jarvis_and_index(convex, n_convex, points_to_convex_ind); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = (double)convex[i].x; + ps1[i].y = (double)convex[i].y; + } + int n2 = 4; + for (int i = 0; i < n2; i++) { + ps2[i].x = (double)q[i 
* 2]; + ps2[i].y = (double)q[i * 2 + 1]; + } + double inter_area = intersectAreaO(ps1, n1, ps2, n2); + double S_pred = area(ps1, n1); + double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; + double iou = inter_area / union_area; + return (float)iou; +} + +template +__global__ void convex_iou_cuda_kernel(const int ex_n_boxes, + const int gt_n_boxes, const T* ex_boxes, + const T* gt_boxes, T* iou) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T* cur_box = ex_boxes + index * 18; + for (int i = 0; i < gt_n_boxes; i++) { + iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8); + } + } +} +#endif // CONVEX_IOU_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f910561ec309cd50fd6d4da131ab36cdf3ca963a --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh @@ -0,0 +1,231 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu +// Original licence: Under MIT License + +#ifndef CORRELATION_CUDA +#define CORRELATION_CUDA + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#include +#include +// Using is recommended in the official documentation in +// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. +// However, we use for compatibility with CUDA 9.0 +// Read https://github.com/pytorch/extension-cpp/issues/35 for more details. +#include + +#include +#include + +using namespace torch; + +#define TensorAcc4R PackedTensorAccessor32 +#define TensorAcc5R PackedTensorAccessor32 +#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W) + +#define WARP_SIZE 32 +#define FULL_MASK 0xffffffff + +template +__global__ void correlation_forward_cuda_kernel( + const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output, + int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, + int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW, + int oH, int oW) { + const int iH = rInput1.size(1); + const int iW = rInput1.size(2); + const int C = rInput1.size(3); + + const int n = blockIdx.x; + const int h = blockIdx.y * blockDim.y + threadIdx.y; + const int w = blockIdx.z * blockDim.z + threadIdx.z; + + if (h >= oH || w >= oW) return; + + const int thread = threadIdx.x; + + const int start_i = -padH + h * dH; + const int start_j = -padW + w * dW; + + const int patchRadH = dilation_patchH * (patchH - 1) / 2; + const int patchRadW = dilation_patchW * (patchW - 1) / 2; + + for (int ph = 0; ph < patchH; ++ph) { + int ph_dilated = ph * dilation_patchH - patchRadH; + for (int pw = 0; pw < patchW; ++pw) { + int pw_dilated = pw * dilation_patchW - patchRadW; + scalar_t prod_sum = 0.0f; + for (int i = 0; i < kH; ++i) { + int i1 = start_i + i * dilationH; + int i2 = i1 + ph_dilated; + if (WITHIN_BOUNDS(i1, i2, iH, iH)) { + for (int j = 0; j < kW; ++j) { + int j1 = start_j + j * dilationW; + int j2 = j1 + pw_dilated; + if (WITHIN_BOUNDS(j1, j2, iW, iW)) { + for (int c = thread; c < C; c += WARP_SIZE) { + scalar_t v1 = rInput1[n][i1][j1][c]; + scalar_t v2 = rInput2[n][i2][j2][c]; + prod_sum += v1 * v2; + } + } + } + } + } + // accumulate + for (int offset = 16; offset > 0; offset /= 2) +#ifdef MMCV_WITH_HIP + prod_sum += 
__shfl_down(float(prod_sum), offset); +#else + prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset); +#endif + if (thread == 0) { + output[n][ph][pw][h][w] = prod_sum; + } + } + } +} + +template +__global__ void correlation_backward_cuda_kernel_input1( + const TensorAcc5R grad_output, const TensorAcc4R input2, + TensorAcc4R grad_input1, const int kH, const int kW, const int patchH, + const int patchW, const int padH, const int padW, const int dilationH, + const int dilationW, const int dilation_patchH, const int dilation_patchW, + const int dH, const int dW) { + const int iH = input2.size(1); + const int iW = input2.size(2); + const int C = input2.size(3); + + const int H = grad_output.size(3); + const int W = grad_output.size(4); + + const int patchRadH = (patchH - 1) / 2; + const int patchRadW = (patchW - 1) / 2; + + const int n = blockIdx.x; + const int h = blockIdx.y; + const int w = blockIdx.z; + + const int h_2 = h + padH; + const int w_2 = w + padW; + const int min_h = h_2 - kH * dilationH; + const int min_w = w_2 - kW * dilationW; + + extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; + scalar_t *grad_cache = reinterpret_cast(grad_cache_char); + for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { + const int ph = i / patchW; + const int pw = i % patchW; + int i1 = h + dilation_patchH * (ph - patchRadH); + int j1 = w + dilation_patchW * (pw - patchRadW); + + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t grad_val = 0.0f; + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if (WITHIN_BOUNDS(i2, j2, H, W)) { + grad_val += grad_output[n][ph][pw][i2][j2]; + } + } + } + grad_cache[i] = grad_val; + } + } + __syncthreads(); + + for (int c = threadIdx.x; c < C; c += blockDim.x) { + scalar_t grad_input_val = 0.0f; + for (int ph = 0; ph < patchH; ++ph) { + int i1 = h + dilation_patchH * (ph - patchRadH); + for (int pw = 0; pw < patchW; ++pw) { + int j1 = w + dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw]; + } + } + } + grad_input1[n][c][h][w] = grad_input_val; + } +} + +template +__global__ void correlation_backward_cuda_kernel_input2( + const TensorAcc5R grad_output, const TensorAcc4R input1, + TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW) { + const int iH = input1.size(1); + const int iW = input1.size(2); + const int C = input1.size(3); + + const int patchRadH = (patchH - 1) / 2; + const int patchRadW = (patchW - 1) / 2; + + const int H = grad_output.size(3); + const int W = grad_output.size(4); + + const int dilatedKH = kH * dilationH; + const int dilatedKW = kW * dilationW; + + const int n = blockIdx.x; + const int h = blockIdx.y; + const int w = blockIdx.z; + + extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; + scalar_t *grad_cache = reinterpret_cast(grad_cache_char); + for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { + const int ph = i / patchW; + const int pw = i % patchW; + int i1 = h - dilation_patchH * (ph - patchRadH); + int j1 = w - dilation_patchW * (pw - patchRadW); + + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t grad_val = 0.0f; + + const int h_2 = i1 + padH; + const int w_2 = j1 + padW; + 
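+      // Gradient w.r.t. input2: (h, w) is the input2 pixel being differentiated
+      // and (i1, j1) its paired input1 location for patch displacement (ph, pw).
+      // The loop below visits every output position (i2, j2) whose dilated
+      // kH x kW window (stride dH/dW, padding padH/padW) covered (i1, j1) and
+      // accumulates the corresponding grad_output into shared memory; after the
+      // barrier the cached sums are combined with input1 values channel by channel.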
const int min_h = h_2 - dilatedKH; + const int min_w = w_2 - dilatedKW; + + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if (WITHIN_BOUNDS(i2, j2, H, W)) { + grad_val += grad_output[n][ph][pw][i2][j2]; + } + } + } + grad_cache[i] = grad_val; + } + } + __syncthreads(); + + for (int c = threadIdx.x; c < C; c += blockDim.x) { + scalar_t grad_input_val = 0.0f; + for (int ph = 0; ph < patchH; ++ph) { + int i1 = h - dilation_patchH * (ph - patchRadH); + for (int pw = 0; pw < patchW; ++pw) { + int j1 = w - dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw]; + } + } + } + grad_input2[n][c][h][w] = grad_input_val; + } +} +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6b4d1bbd85bad1b87ee5d6b8a3cd3b29e3cbc411 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh @@ -0,0 +1,367 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer + ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer + ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#ifndef DEFORM_CONV_CUDA_KERNEL_CUH +#define DEFORM_CONV_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +template +__device__ T deformable_im2col_bilinear(const T *input, const int data_width, + const int height, const int width, T h, + T w) { + if (h <= -1 || height <= h || w <= -1 || width <= w) { + return 0; + } + + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height, + const int width, const T *im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 
0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel( + const int n, const T *data_im, const T *data_offset, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, + h_im, w_im); + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const T *data_col, const T *data_offset, const int channels, + const int height, 
const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int offset_channels, const int deformable_group, const int height_col, + const int width_col, T *grad_offset) { + CUDA_1D_KERNEL_LOOP(index, n) { + T val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) 
* 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + const T weight = get_coordinate_weight(inv_h, inv_w, height, width, + data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +#endif // DEFORM_CONV_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..86c4bc66dd2fb289340a4fb1714edb5db1e798c4 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh @@ -0,0 +1,186 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH +#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void deform_roi_pool_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, const T* offset, + T* output, const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, const T gamma, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_rois[1] * spatial_scale - 0.5; + T roi_start_h = offset_rois[2] * spatial_scale - 0.5; + T roi_end_w = offset_rois[3] * spatial_scale - 0.5; + T roi_end_h = offset_rois[4] * spatial_scale - 0.5; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? 
sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // Compute roi offset + if (offset != NULL) { + const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw; + T offset_roi_w = gamma * roi_width * offset_cur_w[0]; + T offset_roi_h = + gamma * roi_height * offset_cur_w[pooled_width * pooled_height]; + roi_start_w += offset_roi_w; + roi_start_h += offset_roi_h; + } + + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = bilinear_interpolate(offset_input, height, width, y, x, index); + output_val += val; + } + } + output[index] = output_val / count; + } +} + +template +__global__ void deform_roi_pool_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* input, const T* rois, + const T* offset, T* grad_input, T* grad_offset, const int pooled_height, + const int pooled_width, const T spatial_scale, const int sampling_ratio, + const T gamma, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + const T* offset_input = + input + ((roi_batch_ind * channels + c) * height * width); + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_rois[1] * spatial_scale - 0.5; + T roi_start_h = offset_rois[2] * spatial_scale - 0.5; + T roi_end_w = offset_rois[3] * spatial_scale - 0.5; + T roi_end_h = offset_rois[4] * spatial_scale - 0.5; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // Compute roi offset + if (offset != NULL) { + const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw; + T offset_roi_w = gamma * roi_width * offset_cur_w[0]; + T offset_roi_h = + gamma * roi_height * offset_cur_w[pooled_width * pooled_height]; + roi_start_w += offset_roi_w; + roi_start_h += offset_roi_h; + } + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + const T grad_output_this_bin = grad_output[index] / count; + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4); + if (offset != NULL) { + T input_00 = offset_input[y_low * width + x_low]; + T input_10 = offset_input[y_low * width + x_high]; + T input_01 = offset_input[y_high * width + x_low]; + T input_11 = offset_input[y_high * width + x_high]; + T ogx = gamma * roi_width * grad_output_this_bin * + (input_11 * (y - y_low) + input_10 * (y_high - y) + + input_01 * (y_low - y) + input_00 * (y - y_high)); + T ogy = gamma * roi_height * grad_output_this_bin * + (input_11 * (x - x_low) + input_01 * (x_high - x) + + input_10 * (x_low - x) + input_00 * (x - x_high)); + atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw, + ogx); + atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + + pooled_width * pooled_height + ph * pooled_width + pw, + ogy); + } + } + } + } + } +} + +#endif // DEFORM_ROI_POOL_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..053977a3011692b22a5dce6050fcfec4797f092c --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh @@ -0,0 +1,137 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Adapted from +// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_NUM_VERT_IDX 9 +#define INTERSECTION_OFFSET 8 +#define EPSILON 1e-8 + +inline int opt_n_thread(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); +} + +/* +compare normalized vertices (vertices around (0,0)) +if vertex1 < vertex2 return true. 
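+e.g. (1, 0.1) < (0, 1) < (-1, 0.1) < (0, -1) under the ordering described below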
+order: minimum at x-aixs, become larger in anti-clockwise direction +*/ +__device__ bool compare_vertices(float x1, float y1, float x2, float y2) { + if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) + return false; // if equal, return false + + if (y1 > 0 && y2 < 0) return true; + if (y1 < 0 && y2 > 0) return false; + + float n1 = x1 * x1 + y1 * y1 + EPSILON; + float n2 = x2 * x2 + y2 * y2 + EPSILON; + float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; + + if (y1 > 0 && y2 > 0) { + if (diff > EPSILON) + return true; + else + return false; + } + if (y1 < 0 && y2 < 0) { + if (diff < EPSILON) + return true; + else + return false; + } + return false; +} + +__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ vertices, + const bool *__restrict__ mask, const int *__restrict__ num_valid, + int *__restrict__ idx) { + int batch_idx = blockIdx.x; + vertices += batch_idx * n * m * 2; + mask += batch_idx * n * m; + num_valid += batch_idx * n; + idx += batch_idx * n * MAX_NUM_VERT_IDX; + + int index = threadIdx.x; // index of polygon + int stride = blockDim.x; + for (int i = index; i < n; i += stride) { + int pad; // index of arbitrary invalid intersection point (not box corner!) + for (int j = INTERSECTION_OFFSET; j < m; ++j) { + if (!mask[i * m + j]) { + pad = j; + break; + } + } + if (num_valid[i] < 3) { + // not enough vertices, take an invalid intersection point + // (zero padding) + for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } else { + // sort the valid vertices + // note the number of valid vertices is known + // note: check that num_valid[i] < MAX_NUM_VERT_IDX + for (int j = 0; j < num_valid[i]; ++j) { + // initialize with a "big" value + float x_min = 1; + float y_min = -EPSILON; + int i_take = 0; + int i2; + float x2, y2; + if (j != 0) { + i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; + x2 = vertices[i * m * 2 + i2 * 2 + 0]; + y2 = vertices[i * m * 2 + i2 * 2 + 1]; + } + for (int k = 0; k < m; ++k) { + float x = vertices[i * m * 2 + k * 2 + 0]; + float y = vertices[i * m * 2 + k * 2 + 1]; + if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { + if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { + x_min = x; + y_min = y; + i_take = k; + } + } + } + idx[i * MAX_NUM_VERT_IDX + j] = i_take; + } + // duplicate the first idx + idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; + + // pad zeros + for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + + // for corner case: the two boxes are exactly the same. 
+ // in this case, idx would have duplicate elements, which makes the + // shoelace formula broken because of the definition, the duplicate + // elements only appear in the first 8 positions (they are "corners in + // box", not "intersection of edges") + if (num_valid[i] == 8) { + int counter = 0; + for (int j = 0; j < 4; ++j) { + int check = idx[i * MAX_NUM_VERT_IDX + j]; + for (int k = 4; k < INTERSECTION_OFFSET; ++k) { + if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; + } + } + if (counter == 4) { + idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; + for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } + } + + // TODO: still might need to cover some other corner cases :( + } + } +} diff --git a/external/cv/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d3801a02c1c8f44874fb84fa884cc23bee25c331 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh @@ -0,0 +1,152 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH +#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, + int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +template +__global__ void furthest_point_sampling_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, 3) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + // if (mag <= 1e-3) + // continue; + + float d = + (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? 
d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + +#pragma unroll + for (int block_size_thres = 1024; block_size_thres >= 2; + block_size_thres >>= 1) { + const int tid_thres = block_size_thres / 2; + if (block_size >= block_size_thres && tid < tid_thres) { + __update(dists, dists_i, tid, tid + tid_thres); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +// Modified from +// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu +template +__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, N) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * n; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + // float x1 = dataset[old * 3 + 0]; + // float y1 = dataset[old * 3 + 1]; + // float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + // float x2, y2, z2; + // x2 = dataset[k * 3 + 0]; + // y2 = dataset[k * 3 + 1]; + // z2 = dataset[k * 3 + 2]; + + // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * + // (z2 - z1); + float d = dataset[old * n + k]; + + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + +#pragma unroll + for (int block_size_thres = 1024; block_size_thres >= 2; + block_size_thres >>= 1) { + const int tid_thres = block_size_thres / 2; + if (block_size >= block_size_thres && tid < tid_thres) { + __update(dists, dists_i, tid, tid + tid_thres); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +#endif // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6d932434cba245833e661b8c7e140601940bc35b --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef GATHER_POINTS_CUDA_KERNEL_CUH +#define GATHER_POINTS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define TOTAL_THREADS 1024 + +template +__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, + const T *points, + const int *__restrict__ idx, + T *out) { + // points: (B, C, N) + // idx: (B, M) + // output: + // out: (B, C, M) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b || c_idx >= c) return; + + out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + points += bs_idx * c * n + c_idx * n; + out[0] = points[idx[0]]; + } +} + +template +__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, + const T *grad_out, + const int *__restrict__ idx, + T *grad_points) { + // grad_out: (B, C, M) + // idx: (B, M) + // output: + // grad_points: (B, C, N) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b || c_idx >= c) return; + + grad_out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + grad_points += bs_idx * c * n + c_idx * n; + + atomicAdd(grad_points + idx[0], grad_out[0]); + } +} + +#endif // GATHER_POINTS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..dfad66fc16d8759f614d7f36fa961673976b1d95 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh @@ -0,0 +1,65 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu +#ifndef GROUP_POINTS_CUDA_KERNEL_CUH +#define GROUP_POINTS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void group_points_forward_cuda_kernel(int b, int c, int n, + int npoints, int nsample, + const T *points, + const int *__restrict__ idx, + T *out) { + // points: (B, C, N) + // idx: (B, npoints, nsample) + // output: + // out: (B, C, npoints, nsample) + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { + if (bs_idx >= b || c_idx >= c) return; + + int pt_idx = index / nsample; + int sample_idx = index % nsample; + + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int in_idx = bs_idx * c * n + c_idx * n + idx[0]; + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + + out[out_idx] = points[in_idx]; + } +} + +template +__global__ void group_points_backward_cuda_kernel(int b, int c, int n, + int npoints, int nsample, + const T *grad_out, + const int *__restrict__ idx, + T *grad_points) { + // grad_out: (B, C, npoints, nsample) + // idx: (B, npoints, nsample) + // output: + // grad_points: (B, C, N) + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c) return; + + int sample_idx = index % nsample; + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + + atomicAdd(grad_points 
+ bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); + } +} + +#endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9ebdcad15eee05a9f412ef34eb12d3553874a4dc --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh @@ -0,0 +1,367 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef IOU3D_CUDA_KERNEL_CUH +#define IOU3D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +const int THREADS_PER_BLOCK_IOU3D = 16; +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +__device__ const float EPS = 1e-8; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y) { x = _x, y = _y; } + + __device__ void set(float _x, float _y) { + x = _x; + y = _y; + } + + __device__ Point operator+(const Point &b) const { + return Point(x + b.x, y + b.y); + } + + __device__ Point operator-(const Point &b) const { + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b) { + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, + const Point &p0) { + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, + const Point &q1, const Point &q2) { + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && + min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && + min(q1.y, q2.y) <= max(p1.y, p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p) { + // params: box (7) [x, y, z, dx, dy, dz, heading] + const float MARGIN = 1e-2; + + float center_x = box[0], center_y = box[1]; + // rotate the point in the opposite direction of box + float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + MARGIN && + fabs(rot_y) < box[4] / 2 + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, + const Point &q1, const Point &q0, + Point &ans_point) { + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if (fabs(s5 - s1) > EPS) { + ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } else { + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans_point.x = (b0 * c1 - b1 * c0) / D; + ans_point.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, + const float angle_cos, + const float angle_sin, Point &p) { + float new_x = + (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; + float new_y = + (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + 
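+  // write back the rotated coordinates: p is rotated in place about `center`
+  // by the angle whose cosine/sine were passed in (used below to orient the
+  // BEV box corners by their heading)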
p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, + const Point ¢er) { + return atan2(a.y - center.y, a.x - center.x) > + atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, const float *box_b) { + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + + float a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, + a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; + + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++) { + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + flag = intersection(box_a_corners[i + 1], box_a_corners[i], + box_b_corners[j + 1], box_b_corners[j], + cross_points[cnt]); + if (flag) { + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++) { + if (check_in_box2d(box_a, box_b_corners[k])) { + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])) { + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++) { + for (int i = 0; i < cnt - j - 1; i++) { + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++) { + area += cross(cross_points[k] - cross_points[0], + cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b) { + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + float sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( + const int num_a, const float *boxes_a, const int num_b, + const float *boxes_b, float *ans_overlap) { + // params boxes_a: (N, 7) [x, y, z, 
dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + + const float *cur_box_a = boxes_a + a_idx * 7; + const float *cur_box_b = boxes_b + b_idx * 7; + float cur_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = cur_overlap; + } +} + +__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +__device__ inline float iou_normal(float const *const a, float const *const b) { + // params: a: [x, y, z, dx, dy, dz, heading] + // params: b: [x, y, z, dx, dy, dz, heading] + + float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), + right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); + float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), + bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = a[3] * a[4]; + float Sb = b[3] * b[4]; + return interS / fmaxf(Sa + Sb - interS, EPS); +} + +__global__ void iou3d_nms3d_normal_forward_cuda_kernel( + const int boxes_num, const float nms_overlap_thresh, const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + // if (row_start > col_start) 
return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif // IOU3D_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3cf52bb90eb27d02b28c52069c760c8a38f83f08 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh @@ -0,0 +1,92 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap +#ifndef KNN_CUDA_KERNEL_CUH +#define KNN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +inline __device__ void swap_float(float *x, float *y) { + float tmp = *x; + *x = *y; + *y = tmp; +} + +inline __device__ void swap_int(int *x, int *y) { + int tmp = *x; + *x = *y; + *y = tmp; +} + +__device__ void reheap(float *dist, int *idx, int k) { + int root = 0; + int child = root * 2 + 1; + while (child < k) { + if (child + 1 < k && dist[child + 1] > dist[child]) child++; + if (dist[root] > dist[child]) return; + swap_float(&dist[root], &dist[child]); + swap_int(&idx[root], &idx[child]); + root = child; + child = root * 2 + 1; + } +} + +__device__ void heap_sort(float *dist, int *idx, int k) { + int i; + for (i = k - 1; i > 0; i--) { + swap_float(&dist[0], &dist[i]); + swap_int(&idx[0], &idx[i]); + reheap(dist, idx, i); + } +} + +// input: xyz (b, n, 3) new_xyz (b, m, 3) +// output: idx (b, m, nsample) dist2 (b, m, nsample) +template +__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, + const T *xyz, const T *new_xyz, + int *__restrict__ idx, T *dist2) { + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + dist2 += bs_idx * m * nsample + pt_idx * nsample; + + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; + + float best_dist[100]; + int best_idx[100]; + for (int i = 0; i < nsample; i++) { + best_dist[i] = 1e10; + best_idx[i] = 0; + } + for (int i = 0; i < n; i++) { + T x = xyz[i * 3 + 0]; + T y = xyz[i * 3 + 1]; + T z = xyz[i * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]) { + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); + } + } + heap_sort(best_dist, best_idx, nsample); + for (int i = 0; i < nsample; i++) { + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; + } + } +} + +#endif // KNN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1a0bd040e823eaaa79f96e525f961a8b8fbeafb5 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh @@ -0,0 +1,62 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH +#define MASKED_CONV2D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int64_t *mask_h_idx, + const int64_t *mask_w_idx, + const int mask_cnt, scalar_t *data_col) { + // mask_cnt * channels + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_col = mask_h_idx[m_index]; + const int w_col = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col - pad_h; + const int w_offset = w_col - pad_w; + scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; + for (int i = 0; i < kernel_h; ++i) { + int h_im = h_offset + i; + for (int j = 0; j < kernel_w; ++j) { + int w_im = w_offset + j; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + *data_col_ptr = + (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; + } else { + *data_col_ptr = 0.0; + } + data_col_ptr += mask_cnt; + } + } + } +} + +template +__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, + const int height, const int width, + const int channels, + const int64_t *mask_h_idx, + const int64_t *mask_w_idx, + const int mask_cnt, scalar_t *data_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_im = mask_h_idx[m_index]; + const int w_im = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + // compute the start and end of the output + data_im[(c_im * height + h_im) * width + w_im] = data_col[index]; + } +} + +#endif // MASKED_CONV2D_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..df56e743669c3426f6abb113e4209d0cc60f2baf --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh @@ -0,0 +1,300 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH +#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 20 +__device__ const float PI = 3.1415926; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(float x, float y) : x(x), y(y) {} +}; + +__device__ inline void swap1(Point *a, Point *b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} +__device__ inline float cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline float dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { + float convex_points[2][MAXN]; + for (int j = 0; j < n_points; j++) { + convex_points[0][j] = ps[j].x; + } + for (int j = 0; j < n_points; j++) { + convex_points[1][j] = ps[j].y; + } + + Point edges[MAXN]; + float edges_angles[MAXN]; + float unique_angles[MAXN]; + int n_edges = n_points - 1; + int n_unique = 0; + int unique_flag = 0; + + for (int i = 0; i < n_edges; i++) { + edges[i].x = ps[i + 1].x - ps[i].x; + edges[i].y = ps[i + 1].y - ps[i].y; + } + for (int i = 0; i < n_edges; i++) { + edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); + if (edges_angles[i] >= 0) { + edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); + } else { + edges_angles[i] = + edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); + } + } + unique_angles[0] = edges_angles[0]; + n_unique += 1; + for (int i = 1; i < n_edges; i++) { + for (int j = 0; j < n_unique; j++) { + if (edges_angles[i] == unique_angles[j]) { + unique_flag += 1; + } + } + if (unique_flag == 0) { + unique_angles[n_unique] = edges_angles[i]; + n_unique += 1; + unique_flag = 0; + } else { + unique_flag = 0; + } + } + + float minarea = 1e12; + for (int i = 0; i < n_unique; i++) { + float R[2][2]; + float rot_points[2][MAXN]; + R[0][0] = cos(unique_angles[i]); + R[0][1] = sin(unique_angles[i]); + R[1][0] = -sin(unique_angles[i]); + R[1][1] = cos(unique_angles[i]); + // R x Points + for (int m = 0; m < 2; m++) { + for (int n = 0; n < n_points; n++) { + float sum = 0.0; + for (int k = 0; k < 2; k++) { + sum = sum + R[m][k] * convex_points[k][n]; + } + rot_points[m][n] = sum; + } + } + + // xmin; + float xmin, ymin, xmax, ymax; + xmin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] < xmin) { + xmin = rot_points[0][j]; + } + } + } + // ymin + ymin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] < ymin) { + ymin = rot_points[1][j]; + } + } + } + // xmax + xmax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] > xmax) { + xmax = rot_points[0][j]; + } + } + } + // ymax + ymax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] > ymax) { + ymax = rot_points[1][j]; + } + } + } + float area = (xmax - xmin) * (ymax - ymin); + if (area < minarea) { + minarea = area; + minbox[0] = unique_angles[i]; + minbox[1] = xmin; + minbox[2] = 
ymin; + minbox[3] = xmax; + minbox[4] = ymax; + } + } +} + +// convex_find +__device__ inline void Jarvis(Point *in_poly, int &n_poly) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + // float sign; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point *j = &(in_poly[0]); + Point *k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +template +__device__ inline void Findminbox(T const *const p, T *minpoints) { + Point ps1[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = p[i * 2]; + convex[i].y = p[i * 2 + 1]; + } + int n_convex = 9; + Jarvis(convex, n_convex); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = convex[i].x; + ps1[i].y = convex[i].y; + } + ps1[n1].x = convex[0].x; + ps1[n1].y = convex[0].y; + + float minbbox[5] = {0}; + minBoundingRect(ps1, n1 + 1, minbbox); + float angle = minbbox[0]; + float xmin = minbbox[1]; + float ymin = minbbox[2]; + float xmax = minbbox[3]; + float ymax = minbbox[4]; + float R[2][2]; + + R[0][0] = cos(angle); + R[0][1] = sin(angle); + R[1][0] = -sin(angle); + R[1][1] = cos(angle); + + minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; + minpoints[1] = xmax * R[0][1] + ymin * R[1][1]; + minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; + minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; + minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; + minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; + minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; + minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; +} + +template +__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes, + const T *ex_boxes, T *minbox) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T *cur_box = ex_boxes + index * 18; + T *cur_min_box = minbox + index * 8; + Findminbox(cur_box, cur_min_box); + } +} + +#endif // 
MIN_AREA_POLYGONS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ca0e91a25246569bb7de04649ab4f5afe233670c --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh @@ -0,0 +1,399 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer + ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer + ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
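+ * In the modulated variant implemented here, each bilinearly sampled value is
+ * additionally scaled by a learned mask read from data_mask (see the
+ * val * mask step in modulated_deformable_im2col_gpu_kernel below).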
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH +#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +template +__device__ T dmcn_im2col_bilinear(const T *input, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w, + const int height, const int width, + const T *im_data, const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + 
im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel( + const int n, const T *data_im, const T *data_offset, const T *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const T *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, + w_im); + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel( + const int n, const T *data_col, const T *data_offset, const T *data_mask, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + 
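+  // Backward scatter for the input gradient: each thread owns one element of
+  // the column gradient buffer (one input channel and kernel tap (i, j) at one
+  // output location) and adds its mask-scaled contribution into grad_im with
+  // atomicAdd, weighted by the bilinear-interpolation gradient around the
+  // fractional sampling position.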
CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const T *data_mask, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, T *grad_offset, T *grad_mask) { + CUDA_1D_KERNEL_LOOP(index, n) { + T val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + 
deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + else + mval += data_col_ptr[col_pos] * + dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + const T weight = dmcn_get_coordinate_weight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + +#endif // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..12225ffdb3b1691ad9edabcd1663109f67ef1a6f --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh @@ -0,0 +1,801 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ +#ifndef DEFORM_ATTN_CUDA_KERNEL +#define DEFORM_ATTN_CUDA_KERNEL + +#include "common_cuda_helper.hpp" +#include "pytorch_cuda_helper.hpp" + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ void ms_deform_attn_col2im_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); 
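+      // v1 is the top-left corner sample: its bilinear weight is w1 = hh * hw,
+      // so the gradient w.r.t. the sampling location accumulates -hw * v1 (height)
+      // and -hh * v1 (width), while the value gradient is scattered back with
+      // atomicAdd because several sampling points may hit the same input cell.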
+ } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void ms_deform_attn_col2im_bilinear_gm( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); 
+} + +template +__global__ void ms_deformable_im2col_gpu_kernel( + const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, const int batch_size, + const int spatial_size, const int num_heads, const int channels, + const int num_levels, const int num_query, const int num_point, + scalar_t *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = + data_value + + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, + spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + const int qid_stride = num_heads * channels; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + 
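+    // For every (level, point) pair each thread writes its partial gradients
+    // into per-thread shared-memory slots; thread 0 then sums those slots
+    // serially and stores the block result. The kernel variants further down
+    // replace this serial sum with a tree reduction over the same buffers.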
for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[_tid]; + sid += 2; + } + + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 1) = _grad_h; + *grad_attn_weight_out = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < 
num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const 
int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[_tid]; + sid += 2; + } + + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 1) = _grad_h; + *grad_attn_weight_out = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const 
int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int 
data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + 
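+    // the divisions below peel the head, query and batch indices off the
+    // flattened 1-D thread index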
_temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc_out, grad_attn_weight_out); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} +#endif // DEFORM_ATTN_CUDA_KERNEL diff --git a/external/cv/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..281d9f0b409f54260a81a79ad96ab09fde9580ce --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh @@ -0,0 +1,117 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef NMS_CUDA_KERNEL_CUH +#define NMS_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +int const threadsPerBlock = sizeof(unsigned long long int) * 8; + +__device__ inline bool devIoU(float const *const a, float const *const b, + const int offset, const float threshold) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left + offset, 0.f), + height = fmaxf(bottom - top + offset, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); + float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); + return interS > threshold * (Sa + Sb - interS); +} + +__global__ static void nms_cuda(const int n_boxes, const float iou_threshold, + const int offset, const float *dev_boxes, + unsigned long long *dev_mask) { + int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + const int tid = threadIdx.x; + + if (row_start > col_start) return; + + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + } + __syncthreads(); + + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + tid; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; + } + } +} + +__global__ static void gather_keep_from_mask(bool *keep, + const unsigned long long *dev_mask, + const int n_boxes) { + const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + const int tid = threadIdx.x; + + // mark the bboxes which have been removed. + extern __shared__ unsigned long long removed[]; + + // initialize removed. + for (int i = tid; i < col_blocks; i += blockDim.x) { + removed[i] = 0; + } + __syncthreads(); + + for (int nblock = 0; nblock < col_blocks; ++nblock) { + auto removed_val = removed[nblock]; + __syncthreads(); + const int i_offset = nblock * threadsPerBlock; +#pragma unroll + for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { + const int i = i_offset + inblock; + if (i >= n_boxes) break; + // select a candidate, check if it should kept. + if (!(removed_val & (1ULL << inblock))) { + if (tid == 0) { + // mark the output. + keep[i] = true; + } + auto p = dev_mask + i * col_blocks; + // remove all bboxes which overlap the candidate. 
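+        // every thread ORs the candidate's 64-bit suppression mask into the
+        // shared `removed` bitmap so that later boxes in the scan are skipped.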
+ for (int j = tid; j < col_blocks; j += blockDim.x) { + if (j >= nblock) removed[j] |= p[j]; + } + __syncthreads(); + removed_val = removed[nblock]; + } + } + } +} + +#endif // NMS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..bba3b8258f6b8798b9d1a651bfda29c48bb5376a --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh @@ -0,0 +1,141 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#ifndef NMS_QUADRI_CUDA_CUH +#define NMS_QUADRI_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +__host__ __device__ inline int divideUP(const int x, const int y) { + return (((x) + (y)-1) / (y)); +} + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_quadri_cuda_kernel(const int n_boxes, + const float iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask, + const int multi_label) { + if (multi_label == 1) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 8 values + // (x1, y1, ..., x4, y4) here. + __shared__ T block_boxes[threadsPerBlock * 8]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 8 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0]; + block_boxes[threadIdx.x * 8 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1]; + block_boxes[threadIdx.x * 8 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2]; + block_boxes[threadIdx.x * 8 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3]; + block_boxes[threadIdx.x * 8 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4]; + block_boxes[threadIdx.x * 8 + 5] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5]; + block_boxes[threadIdx.x * 8 + 6] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6]; + block_boxes[threadIdx.x * 8 + 7] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 9; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_quadri function from + // box_iou_rotated_utils.h + if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } else { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * 
threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 8 values + // (x1, y1, , ..., x4, y4) here. + __shared__ T block_boxes[threadsPerBlock * 8]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 8 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0]; + block_boxes[threadIdx.x * 8 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1]; + block_boxes[threadIdx.x * 8 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2]; + block_boxes[threadIdx.x * 8 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3]; + block_boxes[threadIdx.x * 8 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4]; + block_boxes[threadIdx.x * 8 + 5] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5]; + block_boxes[threadIdx.x * 8 + 6] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6]; + block_boxes[threadIdx.x * 8 + 7] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 8; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_quadri function from + // box_iou_rotated_utils.h + if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..747327afb83900177dd4721f1b0ba99153f658d7 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh @@ -0,0 +1,133 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +#ifndef NMS_ROTATED_CUDA_CUH +#define NMS_ROTATED_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +__host__ __device__ inline int divideUP(const int x, const int y) { + return (((x) + (y)-1) / (y)); +} + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_rotated_cuda_kernel(const int n_boxes, + const float iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask, + const int multi_label) { + // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel + + if (multi_label == 1) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. + __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 6; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from + // box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } else { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. 
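+    // In this branch the boxes carry no extra label column, so each box is
+    // read with a stride of 5 floats instead of the 6 used in the
+    // multi-label branch above.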
+ __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from + // box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh b/external/cv/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7918a57452bbde9dc7c249b0c3dd2774aa1961bf --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019, SenseTime. + */ + +#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ +#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ + +#ifndef __CUDACC__ +#error cudawarpfunction.cuh should only be included by .cu files +#endif +#include + +#include + +#ifdef PARROTS_USE_HALF +#include +#endif +#ifdef __CUDA_ARCH__ +#define CUDA_INTRINSIC_FUNC(Expr) Expr +#else +#define CUDA_INTRINSIC_FUNC(Expr) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +#ifdef PARROTS_USE_HALF + +#if CUDA_VERSION < 9000 + +__device__ inline float16 __shfl(float16 var, int srcLane, int width) { + CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width);); +} + +__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) { + CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width);); +} + +__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) { + CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width);); +} + +__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) { + CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width);); +} + +#else // CUDA_VERSION >= 9000 + +__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width); + return r;); +} + +__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var, + unsigned delta, int width = warpSize) { + CUDA_INTRINSIC_FUNC( + float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;); +} + +__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var, + unsigned delta, + int width = warpSize) { + CUDA_INTRINSIC_FUNC( + float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;); +} + +__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 
var, + int laneMask, int width) { + CUDA_INTRINSIC_FUNC(float16 r; + r.y = __shfl_xor_sync(mask, var.y, laneMask, width); + return r;); +} + +#endif // CUDA_VERSION < 9000 + +#endif // PARROTS_USE_HALF + +// warp shuffle interface with a dummy mask +#if CUDA_VERSION < 9000 + +template +__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width);); +} + +template +__device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width);); +} + +template +__device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width);); +} + +template +__device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width);); +} + +#endif // CUDA_VERSION < 9000 + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +#endif // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ diff --git a/external/cv/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..342362079a5ce3dde6d19532b3014872f4373330 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh @@ -0,0 +1,95 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH +#define POINT_IN_BOXES_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, + // cz in the bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / + 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > z_size / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); + return in_flag; +} + +template +__global__ void points_in_boxes_part_forward_cuda_kernel( + int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box DO NOT overlaps params pts: + // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: + // (B, npoints), default -1 + + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (bs_idx >= batch_size) return; + + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num + pt_idx; + + T local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; k++) { + cur_in_flag = check_pt_in_box3d(pts, 
boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[0] = k; + break; + } + } + } +} + +template +__global__ void points_in_boxes_all_forward_cuda_kernel( + int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box DO NOT overlaps params pts: + // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: + // (B, npoints), default -1 + + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (bs_idx >= batch_size) return; + + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; + + T local_x = 0, local_y = 0; + for (int k = 0; k < boxes_num; k++) { + const int cur_in_flag = + check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[k] = 1; + } + } + } +} + +#endif // POINT_IN_BOXES_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a0769d75a29ce8d7eac00931d6f51caa292b2693 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh @@ -0,0 +1,79 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH +#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +struct point { + float x, y; +}; + +template +__global__ void points_in_polygons_forward_cuda_kernel( + const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2, + const int rows, const int cols, scalar_t *inside_flag) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int row = index / cols; + int col = index % cols; + + const scalar_t *offset_vertex1 = vertex1 + row * 2; + const scalar_t *offset_vertex2 = vertex2 + col * 8; + + point point_[1]; + point polygon[4]; + + point_[0].x = offset_vertex1[0]; + point_[0].y = offset_vertex1[1]; + + polygon[0].x = offset_vertex2[0]; + polygon[0].y = offset_vertex2[1]; + polygon[1].x = offset_vertex2[2]; + polygon[1].y = offset_vertex2[3]; + polygon[2].x = offset_vertex2[4]; + polygon[2].y = offset_vertex2[5]; + polygon[3].x = offset_vertex2[6]; + polygon[3].y = offset_vertex2[7]; + + int nCross = 0; + int i, j; + float sx, sy, tx, ty, px, py, x; + for (i = 0, j = 3; i < 4; j = i, i++) { + sx = polygon[i].x; + sy = polygon[i].y; + tx = polygon[j].x; + ty = polygon[j].y; + + px = point_[0].x; + py = point_[0].y; + + if (py < min(sy, ty)) continue; + if (py > max(sy, ty)) continue; + + if ((sx == px && sy == py) || (tx == px && ty == py)) { + break; + } else { + if ((sy < py && ty >= py) || (sy >= py && ty < py)) { + x = sx + (py - sy) * (tx - sx) / (ty - sy); + if (x == px) { + break; + } + if (x > px) { + nCross++; + } + } + } + } + if (nCross % 2 == 1) { + inside_flag[index] = 1.0; + } else { + inside_flag[index] = 0.0; + } + return; + } +} + +#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e2f5a11b8dd6058f8d2fd288fc943dc235b39c37 --- /dev/null +++ 
b/external/cv/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh @@ -0,0 +1,381 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu +// Distributed under terms of the MIT license. +#ifndef PRROI_POOL_CUDA_KERNEL_CUH +#define PRROI_POOL_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data, + const int h, + const int w, + const int height, + const int width) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + T retVal = overflow ? 0.0f : data[h * width + w]; + return retVal; +} + +template +__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) { + return (1.0f - abs(dh)) * (1.0f - abs(dw)); +} + +template +__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t, + T c1, T c2) { + return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1; +} + +template +__device__ static T PrRoIPoolingInterpolation(const T *data, const T h, + const T w, const int height, + const int width) { + T retVal = 0.0f; + int h1 = floorf(h); + int w1 = floorf(w); + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h) + 1; + w1 = floorf(w); + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h); + w1 = floorf(w) + 1; + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h) + 1; + w1 = floorf(w) + 1; + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + return retVal; +} + +template +__device__ static T PrRoIPoolingMatCalculation(const T *this_data, + const int s_h, const int s_w, + const int e_h, const int e_w, + const T y0, const T x0, + const T y1, const T x1, + const int h0, const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + T sum_out = 0; + + alpha = x0 - T(s_w); + beta = y0 - T(s_h); + lim_alpha = x1 - T(s_w); + lim_beta = y1 - T(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; + + alpha = x0 - T(s_w); + beta = T(e_h) - y1; + lim_alpha = x1 - T(s_w); + lim_beta = T(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; + + return sum_out; +} + +template +__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff, + const int h, const int w, + const int height, + const int width, + const T coeff) { + 
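+  // Scatter top_diff * coeff back to a single input cell, skipping
+  // out-of-bounds coordinates; atomicAdd is used because different bins
+  // (and overlapping RoIs) may touch the same input location.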
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff); +} + +template +__device__ static void PrRoIPoolingMatDistributeDiff( + T *diff, const T top_diff, const int s_h, const int s_w, const int e_h, + const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, + const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + + alpha = x0 - T(s_w); + beta = y0 - T(s_h); + lim_alpha = x1 - T(s_w); + lim_beta = y1 - T(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp); + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); + + alpha = x0 - T(s_w); + beta = T(e_h) - y1; + lim_alpha = x1 - T(s_w); + lim_beta = T(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); +} + +template +__global__ void prroi_pool_forward_cuda_kernel( + const int nthreads, const T *input, const T *rois, T *output, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T *offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + T roi_x1 = offset_rois[1] * spatial_scale; + T roi_y1 = offset_rois[2] * spatial_scale; + T roi_x2 = offset_rois[3] * spatial_scale; + T roi_y2 = offset_rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, ((T)0.0)); + T roi_height = max(roi_y2 - roi_y1, ((T)0.0)); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T *this_data = + input + (roi_batch_ind * channels + c) * height * width; + T *this_out = output + index; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + if (bin_size == 0) { + *this_out = 0; + continue; + } + + T sum_out = 0; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) + for (int bin_y = start_y; bin_y < end_y; ++bin_y) + sum_out += PrRoIPoolingMatCalculation( + this_data, bin_y, bin_x, bin_y + 1, bin_x + 1, + max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), + min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, + width); + *this_out = sum_out / 
bin_size; + } +} + +template +__global__ void prroi_pool_backward_cuda_kernel( + const int nthreads, const T *grad_output, const T *rois, T *grad_input, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + auto rois_cur = rois + n * 5; + + int roi_batch_ind = rois_cur[0]; + T roi_x1 = rois_cur[1] * spatial_scale; + T roi_y1 = rois_cur[2] * spatial_scale; + T roi_x2 = rois_cur[3] * spatial_scale; + T roi_y2 = rois_cur[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, (T)0); + T roi_height = max(roi_y2 - roi_y1, (T)0); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T *this_out_grad = grad_output + index; + T *this_data_grad = + grad_input + (roi_batch_ind * channels + c) * height * width; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + + T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) + for (int bin_y = start_y; bin_y < end_y; ++bin_y) + PrRoIPoolingMatDistributeDiff( + this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1, + max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), + min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, + width); + } +} + +template +__global__ void prroi_pool_coor_backward_cuda_kernel( + const int nthreads, const T *output, const T *grad_output, const T *input, + const T *rois, T *grad_rois, const int pooled_height, + const int pooled_width, const T spatial_scale, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + auto rois_cur = rois + n * 5; + + int roi_batch_ind = rois_cur[0]; + T roi_x1 = rois_cur[1] * spatial_scale; + T roi_y1 = rois_cur[2] * spatial_scale; + T roi_x2 = rois_cur[3] * spatial_scale; + T roi_y2 = rois_cur[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, (T)0); + T roi_height = max(roi_y2 - roi_y1, (T)0); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T output_grad_val = grad_output[index]; + const T *this_input_data = + input + (roi_batch_ind * channels + c) * height * width; + const T output_val = output[index]; + T *this_rois_grad = grad_rois + n * 5; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + + T sum_out = bin_size == T(0) ? 
T(0) : output_grad_val / bin_size; + + // WARNING: to be discussed + if (sum_out == 0) continue; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0; + for (int bin_y = start_y; bin_y < end_y; ++bin_y) { + grad_x1_y += PrRoIPoolingSingleCoorIntegral( + max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, + PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1, + height, width), + PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1, + height, width)); + + grad_x2_y += PrRoIPoolingSingleCoorIntegral( + max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, + PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2, + height, width), + PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2, + height, width)); + } + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) { + grad_x_y1 += PrRoIPoolingSingleCoorIntegral( + max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, + PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x), + height, width), + PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1), + height, width)); + + grad_x_y2 += PrRoIPoolingSingleCoorIntegral( + max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, + PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x), + height, width), + PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1), + height, width)); + } + + T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val; + T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val; + T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val; + T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val; + + partial_x1 = partial_x1 / bin_size * spatial_scale; + partial_x2 = partial_x2 / bin_size * spatial_scale; + partial_y1 = partial_y1 / bin_size * spatial_scale; + partial_y2 = partial_y2 / bin_size * spatial_scale; + + // (index, x1, y1, x2, y2) + this_rois_grad[0] = 0; + atomicAdd(this_rois_grad + 1, + (partial_x1 * (1.0f - T(pw) / pooled_width) + + partial_x2 * (1.0f - T(pw + 1) / pooled_width)) * + output_grad_val); + atomicAdd(this_rois_grad + 2, + (partial_y1 * (1.0f - T(ph) / pooled_height) + + partial_y2 * (1.0f - T(ph + 1) / pooled_height)) * + output_grad_val); + atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width + + partial_x1 * T(pw) / pooled_width) * + output_grad_val); + atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height + + partial_y1 * T(ph) / pooled_height) * + output_grad_val); + } +} + +#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5d946686bdd5fdfbf8a27f6d040e15861202f471 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh @@ -0,0 +1,141 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef PSAMASK_CUDA_KERNEL_CUH +#define PSAMASK_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +// CUDA: grid stride looping +#ifndef CUDA_KERNEL_LOOP +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) +#endif + +template +__global__ void psamask_collect_forward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* mask_data, T* buffer_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + buffer_data[(n * h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)) * + h_feature * w_feature + + h * w_feature + w] = mask_data + [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * + w_feature + + w]; + } + } + } +} + +template +__global__ void psamask_distribute_forward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* mask_data, T* buffer_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + buffer_data[(n * h_feature * w_feature + h * w_feature + w) * + h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)] = mask_data + [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * + w_feature + + w]; + } + } + } +} + +template +__global__ void psamask_collect_backward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* buffer_diff, T* mask_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - 
half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + + h) * + w_feature + + w] = buffer_diff[(n * h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)) * + h_feature * w_feature + + h * w_feature + w]; + } + } + } +} + +template +__global__ void psamask_distribute_backward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* buffer_diff, T* mask_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + + h) * + w_feature + + w] = + buffer_diff[(n * h_feature * w_feature + h * w_feature + w) * + h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)]; + } + } + } +} + +#endif // PSAMASK_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4383d9e82cce97362f53cf799b8dfa30c7b4cd02 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh @@ -0,0 +1,242 @@ +// Modified from +// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu +#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH +#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS + +/*** Forward ***/ +template +__global__ void riroi_align_rotated_forward_cuda_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int num_samples, const bool clockwise, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int num_orientations, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t 
roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + + const scalar_t *offset_bottom_data_plus = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + scalar_t val_plus = bilinear_interpolate( + offset_bottom_data_plus, height, width, y, x, index); + output_val += r_var * val + l_var * val_plus; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void riroi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int num_samples, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int num_orientations, + scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + scalar_t *offset_bottom_diff_plus = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + int top_offset = + (n * channels * num_orientations + c * num_orientations + o) * + pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use 
roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); + + atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, + g1 * l_var); + atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, + g2 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, + g3 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, + g4 * l_var); + + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RiRoIAlignBackward + +#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4541462afd6bd77ee794badd7d84bdd6c91b2c43 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh @@ -0,0 +1,212 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ROI_ALIGN_CUDA_KERNEL_CUH +#define ROI_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +/*** Forward ***/ +template +__global__ void roi_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, + T* argmax_x, const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not using rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? 
sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + if (pool_mode == 0) { + // We do max pooling inside a bin + T maxval = -FLT_MAX; + T maxidx_y = -1.f, maxidx_x = -1.f; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = + bilinear_interpolate(offset_input, height, width, y, x, index); + if (val > maxval) { + maxval = val; + maxidx_y = y; + maxidx_x = x; + } + } + } + output[index] = maxval; + argmax_y[index] = maxidx_y; + argmax_x[index] = maxidx_x; + } else if (pool_mode == 1) { + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = + bilinear_interpolate(offset_input, height, width, y, x, index); + output_val += val; + } + } + output[index] = output_val / count; + } + } +} + +/*** Backward ***/ +template +__global__ void roi_align_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* rois, const T* argmax_y, + const T* argmax_x, T* grad_input, const int pooled_height, + const int pooled_width, const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T grad_output_this_bin = grad_output[index]; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + if (pool_mode == 0) { + T y = argmax_y[index], x = argmax_x[index]; + if (y != -1.f) { + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4); + } + } + } else if (pool_mode == 1) { + // Do not using rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1 / count); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2 / count); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3 / count); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4 / count); + } + } + } + } + } +} + +#endif // ROI_ALIGN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8274dc50c709630c4ee456efd543aa1265049b41 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh @@ -0,0 +1,202 @@ +// Modified from +// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH +#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +/*** Forward ***/ +template +__global__ void roi_align_rotated_forward_cuda_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sampling_ratio, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void roi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +#endif // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3d7eae66b99b7812b92d9fc8bad237cbcbd59436 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ROI_POOL_CUDA_KERNEL_CUH +#define ROI_POOL_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void roi_pool_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, int* argmax, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + // calculate the roi region on feature maps + T roi_x1 = offset_rois[1] * spatial_scale; + T roi_y1 = offset_rois[2] * spatial_scale; + T roi_x2 = (offset_rois[3] + 1) * spatial_scale; + T roi_y2 = (offset_rois[4] + 1) * spatial_scale; + + // force malformed rois to be 1x1 + T roi_w = roi_x2 - roi_x1; + T roi_h = roi_y2 - roi_y1; + if (roi_w <= 0 || roi_h <= 0) continue; + + T bin_size_w = roi_w / static_cast(pooled_width); + T bin_size_h = roi_h / static_cast(pooled_height); + + // the corresponding bin region + int bin_x1 = floorf(static_cast(pw) * bin_size_w + roi_x1); + int bin_y1 = floorf(static_cast(ph) * bin_size_h + roi_y1); + int bin_x2 = ceilf(static_cast(pw + 1) * bin_size_w + roi_x1); + int bin_y2 = ceilf(static_cast(ph + 1) * bin_size_h + roi_y1); + + // add roi offsets and clip to input boundaries + bin_x1 = min(max(bin_x1, 0), width); + bin_y1 = min(max(bin_y1, 0), height); + bin_x2 = min(max(bin_x2, 0), width); + 
bin_y2 = min(max(bin_y2, 0), height); + bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + // Define an empty pooling region to be zero + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + T max_val = is_empty ? 0 : -FLT_MAX; + int max_idx = -1; + for (int h = bin_y1; h < bin_y2; ++h) { + for (int w = bin_x1; w < bin_x2; ++w) { + int offset = h * width + w; + if (offset_input[offset] > max_val) { + max_val = offset_input[offset]; + max_idx = offset; + } + } + } + output[index] = max_val; + if (argmax != NULL) argmax[index] = max_idx; + } +} + +template +__global__ void roi_pool_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* rois, const int* argmax, + T* grad_input, const int pooled_height, const int pooled_width, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c) is an element in the pooled output + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int roi_batch_ind = rois[n * 5]; + T* grad_input_offset = + grad_input + ((roi_batch_ind * channels + c) * height * width); + int argmax_index = argmax[index]; + + if (argmax_index != -1) { + atomicAdd(grad_input_offset + argmax_index, grad_output[index]); + } + } +} + +#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..fc0aacf1435f8715fae92de535bf01bac07ac39a --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh @@ -0,0 +1,260 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ROIAWARE_POOL3D_CUDA_KERNEL_CUH +#define ROIAWARE_POOL3D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, + // cz in the bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / + 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > z_size / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); + return in_flag; +} + +template +__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, + int out_x, int out_y, int out_z, + const T *rois, const T *pts, + int *pts_mask) { + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, + // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, + // y_idxs, z_idxs) by binary bit + int box_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num) return; + + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; + + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); + + pts_mask[0] = -1; + if (cur_in_flag > 0) { + T local_z = pts[2] - rois[2]; + T x_size = rois[3], y_size = rois[4], z_size = rois[5]; + + T x_res = x_size / out_x; + T y_res = y_size / out_y; + T z_res = z_size / out_z; + + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int y_idx = int((local_y + y_size / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); + + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); + + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; + + pts_mask[0] = idx_encoding; + } + } +} + +template +__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, + int max_pts_each_voxel, int out_x, + int out_y, int out_z, + const int *pts_mask, + T *pts_idx_of_voxels) { + // params pts_mask: (N, npoints) 0 or 1 + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) { + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = 
pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; + } + } + } + } +} + +template +__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const T *pts_feature, + const int *pts_idx_of_voxels, + T *pooled_features, int *argmax) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > + max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; + } + } + + if (argmax_idx != -1) { + pooled_features[0] = max_val; + } + argmax[0] = argmax_idx; + } +} + +template +__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const T *pts_feature, + const int *pts_idx_of_voxels, + T *pooled_features) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + float sum_val = 0; + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + } + + if (total_pts > 0) { + pooled_features[0] = sum_val / total_pts; + } + } +} + +template +__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, + int out_x, int out_y, int out_z, + const int *argmax, + const T *grad_out, T *grad_in) { + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + 
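+  // The launch is expected to use a 3-D grid with gridDim.z == boxes_num and
+  // gridDim.y == channels, so each block handles one (box, channel) pair while the
+  // loop below strides over the out_x * out_y * out_z voxels of that box.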
CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + if (argmax[0] == -1) return; + + atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); + } +} + +template +__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, + int out_x, int out_y, int out_z, + int max_pts_each_voxel, + const int *pts_idx_of_voxels, + const T *grad_out, T *grad_in) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int total_pts = pts_idx_of_voxels[0]; + float cur_grad = 1 / fmaxf(float(total_pts), 1.0); + for (int k = 1; k <= total_pts; k++) { + atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, + grad_out[0] * cur_grad); + } + } +} + +#endif // ROIAWARE_POOL3D_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..545f6ffa09d4a6cae49f1f1e68c191c1fd54de68 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh @@ -0,0 +1,134 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ROIPOINT_POOL3D_CUDA_KERNEL_CUH +#define ROIPOINT_POOL3D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the + // bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6]; + cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > dz / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) & + (local_y > -dy / 2.0) & (local_y < dy / 2.0); + return in_flag; +} + +template +__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, + const T *xyz, const T *boxes3d, + int *pts_assign) { + // params xyz: (B, N, 3) + // params boxes3d: (B, M, 7) + // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means + // background points + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num || bs_idx >= batch_size) return; + + int assign_idx = + bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; + + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, + local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; + } +} + +__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, + int sampled_pts_num, const int *pts_assign, + int *pts_idx, int *pooled_empty_flag) { + // params xyz: (B, N, 3) + // params pts_feature: (B, N, C) + // params pts_assign: (B, N) + // params pts_idx: (B, M, 512) + // params pooled_empty_flag: (B, M) + CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) { + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++) { + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + + boxes_idx]) { + if (cnt < sampled_pts_num) { + pts_idx[bs_idx * boxes_num * sampled_pts_num + + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } else + break; + } + } + + if (cnt == 0) { + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } else if (cnt < sampled_pts_num) { + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++) { + int duplicate_idx = k % cnt; + int base_offset = + bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + } + } + } +} + +template +__global__ void roipoint_pool3d_forward( + int batch_size, int pts_num, int boxes_num, int feature_in_len, + int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature, + T *pooled_features, int *pooled_empty_flag) { + // params xyz: (B, N, 3) + // params pts_idx: (B, M, 512) + // params pts_feature: (B, N, C) + // params pooled_features: (B, M, 512, 3+C) + // params pooled_empty_flag: (B, M) 
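+  // Each sampled point writes 3 + C values into pooled_features: its xyz coordinates
+  // first, then its feature vector, giving the (B, M, 512, 3+C) layout described above;
+  // boxes flagged in pooled_empty_flag are skipped entirely.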
+ int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) { + if (box_idx >= boxes_num || bs_idx >= batch_size) return; + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return; + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + + box_idx * sampled_pts_num + sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = + xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = + bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + memcpy(pooled_features + dst_feature_offset + 3, + pts_feature + src_feature_offset, feature_in_len * sizeof(T)); + } +} + +#endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ffcc658ccb1f5e3059c0428159bc2e80fbeee3d4 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh @@ -0,0 +1,129 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu +#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH +#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void rotated_feature_align_forward_kernel( + const int nthreads, const int points, const scalar_t* bottom_data, + const scalar_t* best_bboxes, const scalar_t spatial_scale, + const int channels, const int height, const int width, scalar_t* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + const scalar_t* bbox_offset = + best_bboxes + ((n * height + h) * width + w) * 5; + scalar_t roi_y = bbox_offset[0] * spatial_scale; + scalar_t roi_x = bbox_offset[1] * spatial_scale; + + scalar_t px[5] = {roi_x, 0, 0, 0, 0}; + scalar_t py[5] = {roi_y, 0, 0, 0, 0}; + + if (points > 1) { + scalar_t roi_w = bbox_offset[2] * spatial_scale; + scalar_t roi_h = bbox_offset[3] * spatial_scale; + scalar_t roi_a = bbox_offset[4]; + + scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; + scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); + scalar_t wx = cosa * w_2, wy = sina * w_2; + scalar_t hx = -sina * h_2, hy = cosa * h_2; + + px[1] = roi_x + wx + hx; + py[1] = roi_y + wy + hy; + px[2] = roi_x - wx + hx; + py[2] = roi_y - wy + hy; + px[3] = roi_x - wx - hx; + py[3] = roi_y - wy - hy; + px[4] = roi_x + wx - hx; + py[4] = roi_y + wy - hy; + } + + const scalar_t* offset_bottom_data = + bottom_data + (n * channels + c) * height * width; + + scalar_t output_val = bottom_data[index]; + for (int i = 0; i < points; i++) { + output_val += bilinear_interpolate(offset_bottom_data, height, + width, py[i], px[i], i); + } + top_data[index] = output_val; + } +} + +template +__global__ void rotated_feature_align_backward_kernel( + const int nthreads, const int points, const scalar_t* top_diff, + const scalar_t* best_bboxes, const scalar_t spatial_scale, + const int channels, const int height, const int width, + scalar_t* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + 
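// The loop body below first decodes the flat index into an NCHW offset
// (w, h, c, n), mirroring the forward kernel above. The incoming gradient is
// added to the identity path (bottom_diff at the same index) and then
// scattered to the four bilinear neighbours of every sampled point with
// atomicAdd, using the weights from bilinear_interpolate_gradient; atomics
// are needed because different output locations may touch the same input
// pixel.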
int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + const scalar_t* bbox_offset = + best_bboxes + ((n * height + h) * width + w) * 5; + scalar_t roi_y = bbox_offset[0] * spatial_scale; + scalar_t roi_x = bbox_offset[1] * spatial_scale; + + scalar_t px[5] = {roi_x, 0, 0, 0, 0}; + scalar_t py[5] = {roi_y, 0, 0, 0, 0}; + + if (points > 1) { + scalar_t roi_w = bbox_offset[2] * spatial_scale; + scalar_t roi_h = bbox_offset[3] * spatial_scale; + scalar_t roi_a = bbox_offset[4]; + + scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; + scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); + scalar_t wx = cosa * w_2, wy = sina * w_2; + scalar_t hx = -sina * h_2, hy = cosa * h_2; + + px[1] = roi_x + wx + hx; + py[1] = roi_y + wy + hy; + px[2] = roi_x - wx + hx; + py[2] = roi_y - wy + hy; + px[3] = roi_x - wx - hx; + py[3] = roi_y - wy - hy; + px[4] = roi_x + wx - hx; + py[4] = roi_y + wy - hy; + } + + scalar_t* offset_bottom_diff = + bottom_diff + (n * channels + c) * height * width; + scalar_t value_top_diff = top_diff[index]; + + atomicAdd(bottom_diff + index, value_top_diff); + for (int i = 0; i < points; i++) { + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, py[i], px[i], w1, + w2, w3, w4, x_low, x_high, y_low, + y_high, i); + scalar_t g1 = value_top_diff * w1; + scalar_t g2 = value_top_diff * w2; + scalar_t g3 = value_top_diff * w3; + scalar_t g4 = value_top_diff * w4; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } + } + } +} +#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..af5b9f67b12060ae5dfa52738dba52c8fe674105 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh @@ -0,0 +1,187 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef SCATTER_POINTS_CUDA_KERNEL_CUH +#define SCATTER_POINTS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; +int const maxGridDim = 50000; + +__device__ __forceinline__ static void reduceMax(float *address, float val) { + int *address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, + __float_as_int(fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old || __int_as_float(old) < val); +} + +__device__ __forceinline__ static void reduceMax(double *address, double val) { + unsigned long long *address_as_ull = + reinterpret_cast(address); + unsigned long long old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, + __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); + } while (assumed != old || __longlong_as_double(old) < val); +} + +// get rid of meaningless warnings when compiling host code +#ifdef MMCV_WITH_HIP +__device__ __forceinline__ static void reduceAdd(float *address, float val) { + atomicAdd(address, val); +} +__device__ __forceinline__ static void reduceAdd(double *address, double val) { + atomicAdd(address, val); +} +#else +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ static void reduceAdd(float *address, float val) { +#if (__CUDA_ARCH__ < 200) +#ifdef _MSC_VER +#pragma message( \ + "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32") +#else +#warning \ + "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32" +#endif + int *address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, + __float_as_int(val + __int_as_float(assumed))); + } while (assumed != old); +#else + atomicAdd(address, val); +#endif +} + +__device__ __forceinline__ static void reduceAdd(double *address, double val) { +#if (__CUDA_ARCH__ < 600) +#ifdef _MSC_VER +#pragma message( \ + "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64") +#else +#warning \ + "compute capability lower than 6.x. 
fall back to use CAS version of atomicAdd for float64" +#endif + unsigned long long *address_as_ull = + reinterpret_cast(address); + unsigned long long old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); +#else + atomicAdd(address, val); +#endif +} +#endif // __CUDA_ARCH__ +#endif // MMCV_WITH_HIP + +template +__global__ void feats_reduce_kernel( + const T *feats, const int32_t *coors_map, + T *reduced_feats, // shall be 0 at initialization + const int num_input, const int num_feats, const reduce_t reduce_type) { + CUDA_1D_KERNEL_LOOP(x, num_input) { + int32_t reduce_to = coors_map[x]; + if (reduce_to == -1) continue; + + const T *feats_offset = feats + x * num_feats; + T *reduced_feats_offset = reduced_feats + reduce_to * num_feats; + if (reduce_type == reduce_t::MAX) { + for (int i = 0; i < num_feats; i++) { + reduceMax(&reduced_feats_offset[i], feats_offset[i]); + } + } else { + for (int i = 0; i < num_feats; i++) { + reduceAdd(&reduced_feats_offset[i], feats_offset[i]); + } + } + } +} + +template +__global__ void add_reduce_traceback_grad_kernel( + T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map, + const int32_t *reduce_count, const int num_input, const int num_feats, + const reduce_t reduce_type) { + CUDA_1D_KERNEL_LOOP(x, num_input) { + int32_t reduce_to = coors_map[x]; + if (reduce_to == -1) { + continue; + } + + const int input_offset = x * num_feats; + T *grad_feats_offset = grad_feats + input_offset; + const int reduced_offset = reduce_to * num_feats; + const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; + + if (reduce_type == reduce_t::SUM) { + for (int i = 0; i < num_feats; i++) { + grad_feats_offset[i] = grad_reduced_feats_offset[i]; + } + } else if (reduce_type == reduce_t::MEAN) { + for (int i = 0; i < num_feats; i++) { + grad_feats_offset[i] = grad_reduced_feats_offset[i] / + static_cast(reduce_count[reduce_to]); + } + } + } +} + +template +__global__ void max_reduce_traceback_scatter_idx_kernel( + const T *feats, const T *reduced_feats, int32_t *reduce_from, + const int32_t *coors_map, const int num_input, const int num_feats) { + CUDA_1D_KERNEL_LOOP(x, num_input) { + int32_t reduce_to = coors_map[x]; + + const int input_offset = x * num_feats; + const T *feats_offset = feats + input_offset; + + if (reduce_to == -1) { + continue; + } + + const int reduced_offset = reduce_to * num_feats; + const T *reduced_feats_offset = reduced_feats + reduced_offset; + int32_t *reduce_from_offset = reduce_from + reduced_offset; + + for (int i = 0; i < num_feats; i++) { + if (feats_offset[i] == reduced_feats_offset[i]) { + atomicMin(&reduce_from_offset[i], static_cast(x)); + } + } + } +} + +template +__global__ void max_reduce_scatter_grad_kernel(T *grad_feats, + const T *grad_reduced_feats, + const int32_t *reduce_from, + const int num_reduced, + const int num_feats) { + CUDA_1D_KERNEL_LOOP(x, num_reduced) { + const int reduced_offset = x * num_feats; + const int32_t *scatter_to_offset = reduce_from + reduced_offset; + const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; + + for (int i = 0; i < num_feats; i++) { + grad_feats[scatter_to_offset[i] * num_feats + i] = + grad_reduced_feats_offset[i]; + } + } +} + +#endif // SCATTER_POINTS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh 
b/external/cv/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1eb5f8fcccbaafdb62972652e3979803c0acd1ca --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh @@ -0,0 +1,71 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH +#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void sigmoid_focal_loss_forward_cuda_kernel( + const int nthreads, const T* input, const int64_t* target, const T* weight, + T* output, const T gamma, const T alpha, const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / num_classes; + int c = index % num_classes; + + int64_t t = target[n]; + T flag_p = (t == c); + T flag_n = (t != c); + + // p = sigmoid(x) = 1. / 1. + expf(-x) + T p = (T)1. / ((T)1. + expf(-input[index])); + + // (1 - p)**gamma * log(p) + T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN)); + // p**gamma * log(1 - p) + T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN)); + + output[index] = (T)0.; + output[index] += -flag_p * alpha * term_p; + output[index] += -flag_n * ((T)1. - alpha) * term_n; + if (weight != NULL) { + output[index] *= weight[t]; + } + } +} + +template +__global__ void sigmoid_focal_loss_backward_cuda_kernel( + const int nthreads, const T* input, const int64_t* target, const T* weight, + T* grad_input, const T gamma, const T alpha, const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / num_classes; + int c = index % num_classes; + + int64_t t = target[n]; + T flag_p = (t == c); + T flag_n = (t != c); + + // p = sigmoid(x) = 1. / 1. + expf(-x) + T p = (T)1. / ((T)1. + exp(-input[index])); + + // (1 - p)**gamma * (1 - p - gamma*p*log(p)) + T term_p = pow(((T)1. - p), gamma) * + ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN)))); + // p**gamma * (gamma * (1 - p) * log(1 - p) - p) + T term_n = pow(p, gamma) * + (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p); + + grad_input[index] = (T)0.; + grad_input[index] += -flag_p * alpha * term_p; + grad_input[index] += -flag_n * ((T)1. - alpha) * term_n; + if (weight != NULL) { + grad_input[index] *= weight[t]; + } + } +} + +#endif // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..631b2c6175412a9503f6c385ee6597d9527d754f --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh @@ -0,0 +1,72 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH +#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void softmax_focal_loss_forward_cuda_kernel( + const int nthreads, const T* softmax, const int64_t* target, + const T* weight, T* output, const T gamma, const T alpha, + const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int64_t label = target[index]; + T pred = softmax[index * num_classes + label]; + + if (label >= 0) { + output[index] = + -alpha * pow((T)1. 
- pred, gamma) * log(max(pred, (T)FLT_MIN)); + } else { + output[index] = 0; + } + if (weight != NULL) { + output[index] *= weight[label]; + } + } +} + +template +__global__ void softmax_focal_loss_backward_cuda1_kernel( + const int nthreads, const T* softmax, const int64_t* target, + const T* weight, T* buff, const T gamma, const T alpha, + const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int64_t label = target[index]; + T pred = softmax[index * num_classes + label]; + + if (label >= 0) { + buff[index] = alpha * (-pow((T)1. - pred, gamma) + + gamma * pow((T)1. - pred, gamma - 1) * pred * + log(max(pred, (T)FLT_MIN))); + } else { + buff[index] = 0; + } + if (weight != NULL) { + buff[index] *= weight[label]; + } + } +} + +template +__global__ void softmax_focal_loss_backward_cuda2_kernel( + const int nthreads, const T* softmax, const int64_t* target, const T* buff, + T* grad_input, const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / num_classes; + int c = index % num_classes; + int64_t label = target[n]; + + if (label >= 0) { + T flag = (label == c ? (T)1. : (T)0.); + grad_input[index] = buff[n] * (flag - softmax[index]); + } else { + grad_input[index] = 0; + } + } +} + +#endif // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/spconv/indice.cuh b/external/cv/mmcv/ops/csrc/common/cuda/spconv/indice.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5ef0009a10f8effeb447e398cff5103b400056de --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/spconv/indice.cuh @@ -0,0 +1,236 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
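// Context for the kernels in this header: prepareIndicePairsKernel enumerates,
// for each active input site, the output sites it can reach through every
// kernel offset (via getValidOutPos, defined elsewhere) and appends the
// (input_idx, output_idx) pair to a per-offset list, using atomicAdd on
// indiceNum as the per-offset counter. The fragment below is only a
// simplified 1-D, host-side analogue of that enumeration, included to
// illustrate the index arithmetic; the function name is invented for this
// sketch and it is excluded from compilation.
#if 0  // illustrative sketch only
#include <utility>
#include <vector>

// For a 1-D sparse convolution, list the (kernel_tap, output_pos) pairs an
// active input at position x contributes to: output o sees x through tap i
// exactly when x == o * stride - padding + i * dilation and 0 <= o < out_size.
inline std::vector<std::pair<int, int>> valid_out_pos_1d(int x, int kernel_size,
                                                         int stride, int padding,
                                                         int dilation,
                                                         int out_size) {
  std::vector<std::pair<int, int>> pairs;
  for (int i = 0; i < kernel_size; ++i) {
    int numerator = x + padding - i * dilation;
    if (numerator < 0 || numerator % stride != 0) continue;
    int o = numerator / stride;
    if (o < out_size) pairs.emplace_back(i, o);
  }
  return pairs;
}
#endif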
+ +#ifndef INDICE_CU_H_ +#define INDICE_CU_H_ +#include +#include + +#include + +template +__global__ void prepareIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void prepareDeConvIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void assignGridAndIndiceOutKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numAct, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, int batchSize) { + Index index; + auto indicesOutPtr = indicesOut.data(); + for (int ix : tv::KernelLoopX(numAct)) { + index = indicePairUnique[ix]; + gridsOut[index] = ix; + index = 
tv::rowArrayIdxInv( + index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); + indicesOut[ix * (NDim + 1)] = index % batchSize; + } +} + +template +__global__ void assignIndicePairsKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numActIn, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape) { + Index index; + int kernelVolume = indicePairs.dim(0); + for (int ix : tv::KernelLoopX(numActIn)) { + for (int i = 0; i < kernelVolume; ++i) { + index = indicePairs(i, 1, ix); + if (index > -1) { + indicePairs(i, 1, ix) = gridsOut[index]; + } + } + } +} + +template +__global__ void prepareSubMGridKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, + outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + gridsOut[index] = ix; + } +} + +template +__global__ void getSubMIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (int i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + if (gridsOut[index] > -1) { + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 1, oldNum) = gridsOut[index]; + indicePairs(offset, 0, oldNum) = ix; + } + } + } +} + +template +__global__ void resetGridKernel(const Index *indicePairUnique, + tv::TensorView gridsOut, + int numAct) { + for (int ix : tv::KernelLoopX(numAct)) { + gridsOut[indicePairUnique[ix]] = -1; + } +} + +template +__global__ void resetGridSubMKernel( + const Index *indices, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape, int numAct) { + int outSpatialShapeReg[NDim]; + for (int i = 0; i < NDim; ++i) { + outSpatialShapeReg[i] = outSpatialShape[i]; + } + Index spatialVolume = 1; + auto indsPtr = indices; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index; + for (int ix : tv::KernelLoopX(numAct)) { + indsPtr = indices + ix * (NDim + 1); + index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); + gridsOut[index + spatialVolume * indsPtr[0]] = -1; + } +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh b/external/cv/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e3ec68b937b0507e3a119d63a49ad79e8f48eec7 --- 
/dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh @@ -0,0 +1,160 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef REORDERING_CU_H_ +#define REORDERING_CU_H_ +#include + +template +__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, + int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) + buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = + features[inds[ilp] + iy]; + } + } + } +} + +template +__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) + reinterpret_cast( + buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = + reinterpret_cast(features)[inds[ilp] + iy]; + } + } + } +} + +template +__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, + int numPlanes) { + int ILPStrideY[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; + features += blockIdx.x * NumTLP; + buffer += blockIdx.x * NumTLP; + + for (int iy : tv::KernelLoopY(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + reinterpret_cast( + buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] = + reinterpret_cast( + features)[indices[iy + ILPStrideY[ilp]] * numPlanes + + threadIdx.x]; + } + } +} + +template +__global__ void scatterAddGenericKernel(scalar_t *outFeatures, + const scalar_t *buffer, + const Index *indices, int size, + int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) { + 
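// The accumulate below adds one row of `buffer` into the output feature row
// selected by indices[ix]; the ILP unrolling above lets a single thread
// handle NumILP such rows at different grid strides. The addition is
// non-atomic, which assumes every entry of `indices` is unique within one
// launch; that holds for the per-kernel-offset indice lists built by the
// spconv indice kernels, where a fixed offset pairs each output site with at
// most one active input.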
outFeatures[inds[ilp] + iy] += + buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]; + } + } + } + } +} + +template +__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures, + const scalar_t *buffer, + const Index *indices, int size, + int numPlanes) { + int ILPStrideY[NumILP]; + constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t); +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; + outFeatures += blockIdx.x * NumTLP; + buffer += blockIdx.x * NumTLP; + scalar_t buf[vecloadFactor]; + scalar_t buf2[vecloadFactor]; + Index idx; + for (int iy : tv::KernelLoopY(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x; + reinterpret_cast(buf)[0] = + reinterpret_cast(outFeatures)[idx]; + reinterpret_cast(buf2)[0] = reinterpret_cast( + buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]; +#pragma unroll + for (int i = 0; i < vecloadFactor; i++) { + buf[i] += buf2[i]; + } + reinterpret_cast(outFeatures)[idx] = + reinterpret_cast(buf)[0]; + } + } +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..06caefa18d47be11b6cb8770ceb8951479add902 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh @@ -0,0 +1,68 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu +#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH +#define STACK_BALL_QUERY_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void stack_ball_query_forward_cuda_kernel( + int B, int M, float radius, int nsample, const T *new_xyz, + const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt, + int *idx) { + // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features + // :param xyz_batch_cnt: (batch_size), [N1, N2, ...] + // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query + // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...] 
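// (The "stacked" layout concatenates all samples along the first axis:
//  with xyz_batch_cnt = [N1, N2], rows [0, N1) of xyz belong to sample 0 and
//  rows [N1, N1 + N2) to sample 1; new_xyz is stacked the same way according
//  to new_xyz_batch_cnt. The loop below first finds the sample a query point
//  belongs to, then offsets xyz by that sample's starting row.)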
+ // output: + // idx: (M, nsample) + const T *cur_xyz = xyz; + int *cur_idx = idx; + CUDA_1D_KERNEL_LOOP(pt_idx, M) { + int bs_idx = 0; + for (int pt_cnt = 0; bs_idx < B; bs_idx++) { + pt_cnt += new_xyz_batch_cnt[bs_idx]; + if (pt_idx < pt_cnt) break; + } + + int xyz_batch_start_idx = 0; + for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k]; + + const T *new_xyz_p = new_xyz + pt_idx * 3; + cur_xyz += xyz_batch_start_idx * 3; + cur_idx += pt_idx * nsample; + + float radius2 = radius * radius; + T new_x = new_xyz_p[0]; + T new_y = new_xyz_p[1]; + T new_z = new_xyz_p[2]; + int n = xyz_batch_cnt[bs_idx]; + + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = cur_xyz[k * 3 + 0]; + T y = cur_xyz[k * 3 + 1]; + T z = cur_xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < radius2) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + cur_idx[l] = k; + } + } + cur_idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; + } + } + if (cnt == 0) cur_idx[0] = -1; + } +} + +#endif // STACK_BALL_QUERY_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4ef3663d05bcd9146e15dd93bb979734538919cb --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh @@ -0,0 +1,97 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu +#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH +#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include +template +__global__ void stack_group_points_forward_cuda_kernel( + int b, int c, int m, int nsample, const T *features, + const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, + T *out) { + // :param features: (N1 + N2 ..., C) tensor of features to group + // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the + // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor + // containing the indices of features to group with :param idx_batch_cnt: + // (batch_size) [M1 + M2 ...] 
tensor containing the indices of features to + // group with :return: + // output: (M1 + M2, C, nsample) tensor + CUDA_1D_KERNEL_LOOP(index, m * c * nsample) { + const T *cur_features = features; + const int *cur_idx = idx; + int sample_idx = index % nsample; + int c_idx = (index / nsample) % c; + int pt_idx = (index / nsample / c); + + if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; + int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; + for (int k = 1; k < b; k++) { + if (pt_idx < pt_cnt) break; + pt_cnt += idx_batch_cnt[k]; + bs_idx = k; + } + + int features_batch_start_idx = 0; + int features_batch_end_idx = features_batch_cnt[0]; + for (int k = 0; k < bs_idx; k++) { + features_batch_start_idx += features_batch_cnt[k]; + features_batch_end_idx = + features_batch_start_idx + features_batch_cnt[k + 1]; + } + cur_features += features_batch_start_idx * c; + + cur_idx += pt_idx * nsample + sample_idx; + int in_idx = cur_idx[0] * c + c_idx; + int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx; + if (in_idx < features_batch_end_idx * c) { + out[out_idx] = cur_features[in_idx]; + } + } +} + +template +__global__ void stack_group_points_backward_cuda_kernel( + int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx, + const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) { + // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the + // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing + // the indices of features to group with :param idx_batch_cnt: (batch_size) + // [M1 + M2 ...] tensor containing the indices of features to group with + // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the + // indices of features to group with :return: + // grad_features: (N1 + N2 ..., C) gradient of the features + CUDA_1D_KERNEL_LOOP(index, m * c * nsample) { + const T *cur_grad_out = grad_out; + const int *cur_idx = idx; + T *cur_grad_features = grad_features; + int sample_idx = index % nsample; + int c_idx = (index / nsample) % c; + int pt_idx = (index / nsample / c); + + if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; + + int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; + for (int k = 1; k < b; k++) { + if (pt_idx < pt_cnt) break; + pt_cnt += idx_batch_cnt[k]; + bs_idx = k; + } + + int features_batch_start_idx = 0; + for (int k = 0; k < bs_idx; k++) + features_batch_start_idx += features_batch_cnt[k]; + + cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx; + cur_idx += pt_idx * nsample + sample_idx; + cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx; + + atomicAdd(cur_grad_features, cur_grad_out[0]); + } +} + +#endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4ec6a466886832d38c72da6e3a3574e72d53cec8 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh @@ -0,0 +1,331 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef SYNCBN_CUDA_KERNEL_CUH +#define SYNCBN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean, + int num, int channels, + int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer[tid] += input[index]; + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + mean[c] = buffer[0] / total; + } +} + +template <> +__global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input, + float *mean, int num, + int channels, int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer[tid] += static_cast(input[index]); + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + mean[c] = buffer[0] / total; + } +} + +template +__global__ void sync_bn_forward_var_cuda_kernel(const T *input, + const float *mean, float *var, + int num, int channels, + int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + float td = input[index] - mean[c]; + buffer[tid] += td * td; + } + __syncthreads(); + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + var[c] = buffer[0] / total; + } +} + +template <> +__global__ void sync_bn_forward_var_cuda_kernel(const phalf *input, + const float *mean, float *var, + int num, int channels, + int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + float td = static_cast(input[index]) - mean[c]; + buffer[tid] += td * td; + } + __syncthreads(); + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + var[c] = buffer[0] / total; + } +} + +template +__global__ void sync_bn_forward_output_cuda_kernel( + const T *input, const float *mean, const float *var, float *running_mean, + float *running_var, const float *weight, const float *bias, float *norm, + float *std, T *output, int num, int channels, int spatial, float eps, + float momentum, int group_size) { + int tid = threadIdx.x; + int c = blockIdx.x; + float mean_value = mean[c]; + float std_value = sqrt(var[c] + eps); + + if (weight != nullptr) { + float weight_value = weight[c]; + float bias_value = bias[c]; + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int 
index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + norm[index] = (input[index] - mean_value) / std_value; + output[index] = norm[index] * weight_value + bias_value; + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = + (input[index] - mean_value) / std_value * weight_value + bias_value; + } + } + } else { + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = norm[index] = (input[index] - mean_value) / std_value; + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = (input[index] - mean_value) / std_value; + } + } + } + if (tid == 0) { + if (std != nullptr) std[c] = std_value; + if (running_mean != nullptr) { + running_mean[c] = + momentum * mean_value + (1 - momentum) * running_mean[c]; + int count = num * spatial * group_size; + float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c]; + running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; + } + } +} + +template <> +__global__ void sync_bn_forward_output_cuda_kernel( + const phalf *input, const float *mean, const float *var, + float *running_mean, float *running_var, const float *weight, + const float *bias, float *norm, float *std, phalf *output, int num, + int channels, int spatial, float eps, float momentum, int group_size) { + int tid = threadIdx.x; + int c = blockIdx.x; + float mean_value = mean[c]; + float std_value = sqrt(var[c] + eps); + if (weight != nullptr) { + float weight_value = weight[c]; + float bias_value = bias[c]; + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + norm[index] = + (static_cast(input[index]) - mean_value) / std_value; + output[index] = + static_cast(norm[index] * weight_value + bias_value); + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = + static_cast((static_cast(input[index]) - mean_value) / + std_value * weight_value + + bias_value); + } + } + } else { + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + norm[index] = + (static_cast(input[index]) - mean_value) / std_value; + output[index] = static_cast(norm[index]); + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = static_cast( + (static_cast(input[index]) - mean_value) / std_value); + } + } + } + if (tid == 0) { + if (std != nullptr) std[c] = std_value; + if (running_mean != nullptr) { + running_mean[c] = + momentum * mean_value + (1 - momentum) * running_mean[c]; + int count = num * spatial * group_size; + float var_unbias = count > 1 ? 
var[c] * count / (count - 1) : var[c]; + running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; + } + } +} + +template +__global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output, + const float *norm, + float *grad_weight, + float *grad_bias, int num, + int channels, int spatial) { + __shared__ float buffer1[THREADS_PER_BLOCK]; + __shared__ float buffer2[THREADS_PER_BLOCK]; + + int tid = threadIdx.x; + int c = blockIdx.x; + buffer1[tid] = buffer2[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer1[tid] += grad_output[index] * norm[index]; + buffer2[tid] += grad_output[index]; + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer1[tid] += buffer1[tid + s]; + buffer2[tid] += buffer2[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + grad_weight[c] = buffer1[0]; + grad_bias[c] = buffer2[0]; + } +} + +template <> +__global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output, + const float *norm, + float *grad_weight, + float *grad_bias, int num, + int channels, int spatial) { + __shared__ float buffer1[THREADS_PER_BLOCK]; + __shared__ float buffer2[THREADS_PER_BLOCK]; + + int tid = threadIdx.x; + int c = blockIdx.x; + buffer1[tid] = buffer2[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer1[tid] += static_cast(grad_output[index]) * norm[index]; + buffer2[tid] += static_cast(grad_output[index]); + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer1[tid] += buffer1[tid + s]; + buffer2[tid] += buffer2[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + grad_weight[c] = buffer1[0]; + grad_bias[c] = buffer2[0]; + } +} + +template +__global__ void sync_bn_backward_data_cuda_kernel( + int output_size, const T *grad_output, const float *weight, + const float *grad_weight, const float *grad_bias, const float *norm, + const float *std, T *grad_input, int num, int channels, int spatial) { + int factor = num * spatial; + CUDA_1D_KERNEL_LOOP(index, output_size) { + int c = (index / spatial) % channels; + grad_input[index] = + weight[c] * + (grad_output[index] - + (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / + std[c]; + } +} + +template <> +__global__ void sync_bn_backward_data_cuda_kernel( + int output_size, const phalf *grad_output, const float *weight, + const float *grad_weight, const float *grad_bias, const float *norm, + const float *std, phalf *grad_input, int num, int channels, int spatial) { + int factor = num * spatial; + CUDA_1D_KERNEL_LOOP(index, output_size) { + int c = (index / spatial) % channels; + grad_input[index] = static_cast( + weight[c] * + (static_cast(grad_output[index]) - + (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / + std[c]); + } +} + +#endif // SYNCBN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..971b496e589d2210131351305cbaf0ed1a027cb1 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef THREE_INTERPOLATE_CUDA_KERNEL_CUH +#define THREE_INTERPOLATE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void three_interpolate_forward_cuda_kernel( + int b, int c, int m, int n, const T *points, const int *__restrict__ idx, + const T *weight, T *out) { + // points: (B, C, M) + // idx: (B, N, 3) + // weight: (B, N, 3) + // output: + // out: (B, C, N) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b || c_idx >= c) return; + + weight += bs_idx * n * 3 + pt_idx * 3; + points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + out += bs_idx * c * n + c_idx * n; + + out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + + weight[2] * points[idx[2]]; + } +} + +template +__global__ void three_interpolate_backward_cuda_kernel( + int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx, + const T *weight, T *grad_points) { + // grad_out: (B, C, N) + // weight: (B, N, 3) + // output: + // grad_points: (B, C, M) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b || c_idx >= c) return; + + grad_out += bs_idx * c * n + c_idx * n + pt_idx; + weight += bs_idx * n * 3 + pt_idx * 3; + grad_points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + + atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); + atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); + atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); + } +} + +#endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..15434121b94033afb2fcb9945a83db15b92262d4 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh @@ -0,0 +1,67 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef THREE_NN_CUDA_KERNEL_CUH +#define THREE_NN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void three_nn_forward_cuda_kernel(int b, int n, int m, + const T *unknown, const T *known, + T *dist2, int *__restrict__ idx) { + // unknown: (B, N, 3) + // known: (B, M, 3) + // output: + // dist2: (B, N, 3) + // idx: (B, N, 3) + + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b) return; + + unknown += bs_idx * n * 3 + pt_idx * 3; + known += bs_idx * m * 3; + dist2 += bs_idx * n * 3 + pt_idx * 3; + idx += bs_idx * n * 3 + pt_idx * 3; + + T ux = unknown[0]; + T uy = unknown[1]; + T uz = unknown[2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + T x = known[k * 3 + 0]; + T y = known[k * 3 + 1]; + T z = known[k * 3 + 2]; + T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } + } + dist2[0] = best1; + dist2[1] = best2; + dist2[2] = best3; + idx[0] = besti1; + idx[1] = besti2; + idx[2] = besti3; + } +} + +#endif // THREE_NN_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4d1159a515f4de2666c25ba4bd5e4f2cbbca1e10 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef TIN_SHIFT_CUDA_KERNEL_CUH +#define TIN_SHIFT_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void tin_shift_forward_cuda_kernel( + const int nthreads, const T* input, const int* shift, T* output, + const int batch_size, const int channels, const int t_size, + const int hw_size, const int group_size, const int group_channel) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int hw_index = index % hw_size; + const int j = (index / hw_size) % channels; + + const int n_index = (index / hw_size / channels) % batch_size; + int group_id = j / group_channel; + int t_shift = shift[n_index * group_size + group_id]; + int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; + for (int i = 0; i < t_size; i++) { + int now_t = i + t_shift; + int data_id = i * hw_size * channels + offset; + if (now_t < 0 || now_t >= t_size) { + continue; + } + int out_id = now_t * hw_size * channels + offset; + output[out_id] = input[data_id]; + } + } +} + +template +__global__ void tin_shift_backward_cuda_kernel( + const int nthreads, const T* input, const int* shift, T* output, + const int batch_size, const int channels, const int t_size, + const int hw_size, const int group_size, const int group_channel) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int hw_index = index % hw_size; + const int j = (index / hw_size) % channels; + + const int n_index = (index / hw_size / channels) % batch_size; + int group_id = j / group_channel; + int t_shift = shift[n_index * group_size + group_id]; + int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; + for (int i = 0; i < t_size; i++) { + int now_t = i + t_shift; + int data_id = i * hw_size * channels + offset; + if (now_t < 0 || now_t >= t_size) { + continue; + } + int out_id = now_t * hw_size * channels + offset; + output[out_id] = input[data_id]; + } + } +} + +#endif // TIN_SHIFT_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..021b488d8d716c9e8132173bf04491d42b7b6fa2 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh @@ -0,0 +1,216 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
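// Overview of the kernels in this header: dynamic_voxelize_kernel maps every
// point to an integer voxel coordinate (or -1 when it falls outside the
// range), point_to_voxelidx_kernel and determin_voxel_num assign each point a
// slot subject to max_points / max_voxels, and assign_point_to_voxel /
// assign_voxel_coors scatter point features and voxel coordinates into the
// dense outputs. The fragment below is a minimal host-side sketch of the
// per-point coordinate computation only, assuming the same argument meanings
// as dynamic_voxelize_kernel; it is illustrative, excluded from compilation,
// and differs from the kernel in that the kernel marks invalid points by
// writing -1 into coors instead of returning a flag.
#if 0  // illustrative sketch only
#include <cmath>

// Returns false (leaving coor untouched) if the point is outside the grid.
// Note the (z, y, x) output order, matching the kernel below.
inline bool voxel_coor_ref(const float* point, int* coor, float voxel_x,
                           float voxel_y, float voxel_z, float x_min,
                           float y_min, float z_min, int grid_x, int grid_y,
                           int grid_z) {
  int c_x = static_cast<int>(std::floor((point[0] - x_min) / voxel_x));
  int c_y = static_cast<int>(std::floor((point[1] - y_min) / voxel_y));
  int c_z = static_cast<int>(std::floor((point[2] - z_min) / voxel_z));
  if (c_x < 0 || c_x >= grid_x || c_y < 0 || c_y >= grid_y || c_z < 0 ||
      c_z >= grid_z)
    return false;
  coor[0] = c_z;
  coor[1] = c_y;
  coor[2] = c_x;
  return true;
}
#endif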
+#ifndef VOXELIZATION_CUDA_KERNEL_CUH +#define VOXELIZATION_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; + +template +__global__ void dynamic_voxelize_kernel( + const T* points, T_int* coors, const float voxel_x, const float voxel_y, + const float voxel_z, const float coors_x_min, const float coors_y_min, + const float coors_z_min, const float coors_x_max, const float coors_y_max, + const float coors_z_max, const int grid_x, const int grid_y, + const int grid_z, const int num_points, const int num_features, + const int NDim) { + // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, num_points) { + // To save some computation + auto points_offset = points + index * num_features; + auto coors_offset = coors + index * NDim; + int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x); + if (c_x < 0 || c_x >= grid_x) { + coors_offset[0] = -1; + continue; + } + + int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y); + if (c_y < 0 || c_y >= grid_y) { + coors_offset[0] = -1; + coors_offset[1] = -1; + continue; + } + + int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z); + if (c_z < 0 || c_z >= grid_z) { + coors_offset[0] = -1; + coors_offset[1] = -1; + coors_offset[2] = -1; + } else { + coors_offset[0] = c_z; + coors_offset[1] = c_y; + coors_offset[2] = c_x; + } + } +} + +template +__global__ void assign_point_to_voxel(const int nthreads, const T* points, + T_int* point_to_voxelidx, + T_int* coor_to_voxelidx, T* voxels, + const int max_points, + const int num_features, + const int num_points, const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; + int index = thread_idx / num_features; + + int num = point_to_voxelidx[index]; + int voxelidx = coor_to_voxelidx[index]; + if (num > -1 && voxelidx > -1) { + auto voxels_offset = + voxels + voxelidx * max_points * num_features + num * num_features; + + int k = thread_idx % num_features; + voxels_offset[k] = points[thread_idx]; + } + } +} + +template +__global__ void assign_voxel_coors(const int nthreads, T_int* coor, + T_int* point_to_voxelidx, + T_int* coor_to_voxelidx, T_int* voxel_coors, + const int num_points, const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; + // if (index >= num_points) return; + int index = thread_idx / NDim; + int num = point_to_voxelidx[index]; + int voxelidx = coor_to_voxelidx[index]; + if (num == 0 && voxelidx > -1) { + auto coors_offset = voxel_coors + voxelidx * NDim; + int k = thread_idx % NDim; + coors_offset[k] = coor[thread_idx]; + } + } +} + +template +__global__ void point_to_voxelidx_kernel(const T_int* coor, + T_int* point_to_voxelidx, + T_int* point_to_pointidx, + const int max_points, + const int max_voxels, + const int num_points, const int NDim) { + CUDA_1D_KERNEL_LOOP(index, num_points) { + auto coor_offset = coor + index * NDim; + // skip invalid points + if (coor_offset[0] == -1) continue; + + int num = 0; + int coor_x = coor_offset[0]; + int coor_y = coor_offset[1]; + int coor_z = coor_offset[2]; + // only calculate the coors before this coor[index] + for (int i = 0; i < index; ++i) { + auto prev_coor = coor + i * NDim; + if (prev_coor[0] == -1) continue; + + // Find all previous points that have the same coors + // if find the same coor, record it + if 
((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) && + (prev_coor[2] == coor_z)) { + num++; + if (num == 1) { + // point to the same coor that first show up + point_to_pointidx[index] = i; + } else if (num >= max_points) { + // out of boundary + break; + } + } + } + if (num == 0) { + point_to_pointidx[index] = index; + } + if (num < max_points) { + point_to_voxelidx[index] = num; + } + } +} + +template +__global__ void determin_voxel_num( + // const T_int* coor, + T_int* num_points_per_voxel, T_int* point_to_voxelidx, + T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num, + const int max_points, const int max_voxels, const int num_points) { + // only calculate the coors before this coor[index] + for (int i = 0; i < num_points; ++i) { + int point_pos_in_voxel = point_to_voxelidx[i]; + // record voxel + if (point_pos_in_voxel == -1) { + // out of max_points or invalid point + continue; + } else if (point_pos_in_voxel == 0) { + // record new voxel + int voxelidx = voxel_num[0]; + if (voxel_num[0] >= max_voxels) continue; + voxel_num[0] += 1; + coor_to_voxelidx[i] = voxelidx; + num_points_per_voxel[voxelidx] = 1; + } else { + int point_idx = point_to_pointidx[i]; + int voxelidx = coor_to_voxelidx[point_idx]; + if (voxelidx != -1) { + coor_to_voxelidx[i] = voxelidx; + num_points_per_voxel[voxelidx] += 1; + } + } + } +} + +__global__ void nondeterministic_get_assign_pos( + const int nthreads, const int32_t* coors_map, int32_t* pts_id, + int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + int coors_idx = coors_map[thread_idx]; + if (coors_idx > -1) { + int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1); + pts_id[thread_idx] = coors_pts_pos; + if (coors_pts_pos == 0) { + coors_order[coors_idx] = atomicAdd(coors_count, 1); + } + } + } +} + +template +__global__ void nondeterministic_assign_point_voxel( + const int nthreads, const T* points, const int32_t* coors_map, + const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count, + const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count, + const int max_voxels, const int max_points, const int num_features, + const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + int coors_idx = coors_map[thread_idx]; + int coors_pts_pos = pts_id[thread_idx]; + if (coors_idx > -1 && coors_pts_pos < max_points) { + int coors_pos = coors_order[coors_idx]; + if (coors_pos < max_voxels) { + auto voxels_offset = + voxels + (coors_pos * max_points + coors_pts_pos) * num_features; + auto points_offset = points + thread_idx * num_features; + for (int k = 0; k < num_features; k++) { + voxels_offset[k] = points_offset[k]; + } + if (coors_pts_pos == 0) { + pts_count[coors_pos] = min(reduce_count[coors_idx], max_points); + auto coors_offset = coors + coors_pos * NDim; + auto coors_in_offset = coors_in + coors_idx * NDim; + for (int k = 0; k < NDim; k++) { + coors_offset[k] = coors_in_offset[k]; + } + } + } + } + } +} + +#endif // VOXELIZATION_CUDA_KERNEL_CUH diff --git a/external/cv/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/external/cv/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..852737224183c1852f1394903e1106219d9ad40e --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp @@ -0,0 +1,256 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef COMMON_MLU_HELPER_HPP_ +#define COMMON_MLU_HELPER_HPP_ + +#define NFU_ALIGN_SIZE 128 // Byte +#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc + +#ifdef __BANG_ARCH__ +#define MAX_NRAM_SIZE \ + (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#define MAX_SRAM_SIZE \ + (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#else +#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value +#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value +#endif + +#ifndef PAD_UP +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) +#endif + +#ifndef PAD_DOWN +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) +#endif + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +template +__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} + +/*! + * @brief Converts int32 to float32 data type. + * + * @param[out] dst + * Pointer to NRAM that stores int32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
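+ *
+ * Usage sketch (illustrative only; the NRAM buffer names below are
+ * hypothetical and must already be allocated, with the two workspaces
+ * sized as described above):
+ *   // rewrite the int32 values held in idx_nram as float32 into val_nram
+ *   convertInt2Float(val_nram, val_workspace_nram, idx_nram,
+ *                    idx_workspace_nram, count);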
+ */ +__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_int2float((float *)dst, (int32_t *)src, src_count, 0); +#else + // get sign bit + const float move_23bit = 8388608.0; + // 0x80000000 = 1,000000000,0000000000000000000000000000 + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + // get 1 or 0 from sign bit + // judg is Odd + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * sizeof(float), + NFU_ALIGN_SIZE); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000001); + __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // minus xor, positive num invariant + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); + // convert int32 to float32 + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x7fffff); + __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x4b000000); + __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __bang_sub_scalar(dst, dst, move_23bit, src_count); + // add one + __bang_add(dst, dst, dst_addition, src_count); + // set sign for float32 + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * 4, 128); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); +#endif // __BANG_ARCH__ >= 300 +} + +/*! + * @brief Converts float32 to int32 data type with to_zero round mode. + * + * @param[out] dst + * Pointer to NRAM that stores float32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores int32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
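+ *
+ * Usage sketch (illustrative only; buffer names are hypothetical):
+ *   // truncate the float32 values in val_nram toward zero into idx_nram
+ *   convertFloat2Int(idx_nram, val_workspace_nram, val_nram,
+ *                    workspace128_nram, count);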
+ */ +__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0); +#else + // sign ===> src_addition + // dst=-1.0 : when src[i] is a negative number + // dst=+1.0 : when src[i] is a positive number + const int floatDchar = sizeof(float) / sizeof(char); + __bang_active_sign((float *)dst, src, src_count); + // dst_addition = abs(src) + __bang_mul(dst_addition, src, (float *)dst, src_count); + // if dst_addition < 1.0 , then src_addition + 1, to fix add error. + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 1.0f); + __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); + // set negative flag -1.0 = 0xbf80000 + __bang_cycle_eq( + (float *)dst, (float *)dst, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] + __bang_active_abs(dst_addition, src, src_count); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 8388608.0f); + // mask shift move 23 + __bang_cycle_add_tz( + dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit + // two`s complement for negatibe + // dst=1.0 , when src <-1.0 + // dst=0.0 , when src >=-1.0 + __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); + // to fix max value + // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, + // means max value. + __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, + src_count * floatDchar); + // get low 23bit + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); + // mask low 23bit is 1 + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * floatDchar, + NFU_ALIGN_SIZE / sizeof(char)); + // set 9 high bit ===> dst + // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 + // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 + __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // src or dst_addition + __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, + src_count * floatDchar); + __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, + src_count * floatDchar); +#endif // __BANG_ARCH__ >= 300 +} + +/*! + * @brief Converts float32 to half data type, + * the rounding mode on MLU200 is rd, on MLU300 is rn. + * + * @param[out] dst + * Pointer to NRAM that stores half type data. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in] src_count + * The count of elements in src. + */ +__mlu_func__ inline void convertFloat2half(half *dst, float *src, + int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2half_rn(dst, src, src_count); +#else + __bang_float2half_rd(dst, src, src_count); +#endif +} + +/*! + * @brief recursiveSumPool. + * @param[in,out] dst + * Pointer to NRAM that stores the input and output data. 
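+ * On return the first low_dim elements of dst hold the sums reduced over
+ * the high_dim dimension; the reduction is performed in place.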
+ * @param[in] low_dim + * Which is the number of low dim. + * @param[in] high_dim + * Which is the number of high dim. + * @param[in] kernel_limit + * Which is the high_dim of sumpool per time. + ******************************************************************************/ +template +__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim, + int kernel_limit) { + for (; high_dim > 1;) { + int repeat_s = high_dim / kernel_limit; + int remain_s = high_dim % kernel_limit; + + if (remain_s) { + __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1, + 1); + } + if (repeat_s) { + __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0), + (T *)dst + remain_s * low_dim, low_dim, + kernel_limit * repeat_s, 1, kernel_limit, 1, 1, + kernel_limit); + } + high_dim = repeat_s + (bool)remain_s; + } + return; +} + +#endif // COMMON_MLU_HELPER_HPP_ diff --git a/external/cv/mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu b/external/cv/mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu new file mode 100755 index 0000000000000000000000000000000000000000..1356a799ac3ba5d36de9df25a0cdd0a706506e75 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu @@ -0,0 +1,181 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +template +__mlu_func__ void MLUUnion1MaskedIm2colForward( + const T *feature, const int height, const int width, const int channels, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, + T *data_col) { + for (int index = taskId; index < mask_cnt; index += taskDim) { + const int h_col = mask_h_idx[index]; + const int w_col = mask_w_idx[index]; + const int h_offset = h_col - pad_h; + const int w_offset = w_col - pad_w; + int h_start = h_offset; + int h_end = h_offset + kernel_h - 1; + int w_start = w_offset; + int w_end = w_start + kernel_w - 1; + if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) { + continue; + } else { + int h_start_valid = max(0, h_start); + int h_end_valid = min(height - 1, h_end); + int w_start_valid = max(0, w_start); + int w_end_valid = min(width - 1, w_end); + __memcpy( + data_col + index * kernel_h * kernel_w * channels + + ((h_start_valid - h_start) * kernel_w + + (w_start_valid - w_start)) * + channels, + feature + h_start_valid * width * channels + w_start_valid * channels, + (w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM, + kernel_w * channels * sizeof(T), width * channels * sizeof(T), + h_end_valid - h_start_valid); + } + } +} + +template +__mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height, + const int width, + const int channels, + const int32_t *mask_h_idx, + const int32_t *mask_w_idx, + const int mask_cnt, T *im) { + const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); + if (channels <= channels_max_num_nram) { + const int deal_num = channels_max_num_nram / channels; + int mask_per_core = mask_cnt / taskDim; + const int mask_remain = mask_cnt % taskDim; + mask_per_core += taskId < mask_remain ? 1 : 0; + int index_start = taskId < mask_remain + ? 
taskId * mask_per_core + : taskId * mask_per_core + mask_remain; + int loop = mask_per_core / deal_num; + int remain_num = mask_per_core % deal_num; + T *nram_col = (T *)nram_buffer; + for (int index = 0; index < loop; ++index) { + int cur_index = index_start + index * deal_num; + __memcpy(nram_col, col + cur_index * channels, + deal_num * channels * sizeof(T), GDRAM2NRAM); + for (int i = 0; i < deal_num; ++i) { + int mask_index = cur_index + i; + const int h_im = mask_h_idx[mask_index]; + const int w_im = mask_w_idx[mask_index]; + // if(h_im>=height || w_im>=width) continue; + __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, + channels * sizeof(T), NRAM2GDRAM); + } + } + if (remain_num > 0) { + int cur_index = index_start + loop * deal_num; + __memcpy(nram_col, col + cur_index * channels, + remain_num * channels * sizeof(T), GDRAM2NRAM); + for (int i = 0; i < remain_num; ++i) { + int mask_index = cur_index + i; + const int h_im = mask_h_idx[mask_index]; + const int w_im = mask_w_idx[mask_index]; + // if(h_im>=height || w_im>=width) continue; + __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, + channels * sizeof(T), NRAM2GDRAM); + } + } + } else { + for (int index = taskId; index < mask_cnt; index += taskDim) { + const int m_index = index % mask_cnt; + const int h_im = mask_h_idx[m_index]; + const int w_im = mask_w_idx[m_index]; + // if(h_im>=height || w_im>=width) continue; + __memcpy(im + (h_im * width + w_im) * channels, col + index * channels, + channels * sizeof(T), GDRAM2GDRAM); + } + } +} + +__mlu_global__ void MLUKernelMaskedIm2colForward( + const void *feature, const int height, const int width, const int channels, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, + void *data_col, const cnrtDataType_t data_dtype) { + if (coreId == 0x80) { + return; + } + + switch (data_dtype) { + case CNRT_FLOAT16: { + MLUUnion1MaskedIm2colForward((half *)feature, height, width, channels, + kernel_h, kernel_w, pad_h, pad_w, + (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, + mask_cnt, (half *)data_col); + }; break; + case CNRT_FLOAT32: { + MLUUnion1MaskedIm2colForward((float *)feature, height, width, channels, + kernel_h, kernel_w, pad_h, pad_w, + (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, + mask_cnt, (float *)data_col); + }; break; + default: { + break; + } + } +} + +__mlu_global__ void MLUKernelMaskedCol2imForward( + const void *col, const int height, const int width, const int channels, + const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, + void *im, const cnrtDataType_t data_dtype) { + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + MLUUnion1MaskedCol2imForward((half *)col, height, width, channels, + (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, + mask_cnt, (half *)im); + }; break; + case CNRT_FLOAT32: { + MLUUnion1MaskedCol2imForward((float *)col, height, width, channels, + (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, + mask_cnt, (float *)im); + }; break; + default: { + break; + } + } +} + +void KernelMaskedIm2colForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + cnrtDataType_t k_dtype, const void *im_ptr, const int height, + const int width, const int channels, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const void *mask_h_idx_ptr, + const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) { + MLUKernelMaskedIm2colForward<<>>( + 
im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, + mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype); +} + +void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t k_dtype, + const void *col_ptr, const int height, + const int width, const int channels, + const void *mask_h_idx_ptr, + const void *mask_w_idx_ptr, const int mask_cnt, + void *im_ptr) { + MLUKernelMaskedCol2imForward<<>>( + col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr, + mask_cnt, im_ptr, k_dtype); +} diff --git a/external/cv/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu b/external/cv/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..3a6d2d3ba61c2ba87ae9b1fb301c412fea93195c --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu @@ -0,0 +1,747 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define ALIGN_SIZE 64 +#define PIPELINE_COMMON_NUM 2 +#define PIPELINE_PINGPONG_NUM 10 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height, + int width, int channels, int p_height, + int p_width, T spatial_scale, int *bin_x1, + int *bin_y1, int *bin_x2, int *bin_y2, + int *bin_wdim, int *bin_hdim, int *bin_dims, + T **input_base, bool *is_empty) { + int pw = bin_i % p_width; + int ph = (bin_i / p_width) % p_height; + int roi_n = bin_i / p_width / p_height; + + /*roi*/ + const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,} + int batch_index = (int)roi_info[0]; + int roi_x1 = round(roi_info[1] * spatial_scale); + int roi_y1 = round(roi_info[2] * spatial_scale); + int roi_x2 = round(roi_info[3] * spatial_scale); + int roi_y2 = round(roi_info[4] * spatial_scale); + int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1; + int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1; + + /*bin*/ + T bin_w = (T)roi_w / (T)p_width; + T bin_h = (T)roi_h / (T)p_height; + + *bin_x1 = (int)floor((T)pw * bin_w) + roi_x1; + *bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0; + *bin_x1 = *bin_x1 < width ? *bin_x1 : width; + + *bin_y1 = (int)floor((T)ph * bin_h) + roi_y1; + *bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0; + *bin_y1 = *bin_y1 < height ? *bin_y1 : height; + + *bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1; + *bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0; + *bin_x2 = *bin_x2 < width ? *bin_x2 : width; + + *bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1; + *bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0; + *bin_y2 = *bin_y2 < height ? 
*bin_y2 : height; + + *input_base = input_v + batch_index * height * width * channels; + *bin_wdim = *bin_x2 - *bin_x1; + *bin_hdim = *bin_y2 - *bin_y1; + *bin_dims = (*bin_hdim) * (*bin_wdim); + *is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1); +} + +template +__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, + int channels, int height, int width, + int p_height, int p_width, int rois_num, + T spatial_scale, T *output_v, int *argmax) { + /* + * NRAM partition + * |---------------------------------------------------| + * | ping | + * |---------------------------------------------------| + * | pong | + * |---------------------------------------------------| + * | out | + * |---------------------------------------------------| + * | argmax | + * |---------------------------------------------------| + * | a | + * |---------------------------------------------------| + * | b | + * |---------------------------------------------------| + */ + uint32_t is_half = sizeof(T) == sizeof(half) ? true : false; + uint32_t t_size = sizeof(T); + uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float); + uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half); + + uint32_t channels_align = PAD_UP(channels, float_div); + uint32_t nram_limit = PAD_DOWN( + (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div); + + // nram PING/PONG, output, argamx, a, b + float *nram_ping = (float *)nram_buffer; + float *nram_pong = (float *)nram_buffer + nram_limit; + float *nram_out = (float *)nram_buffer + 2 * nram_limit; + float *nram_argmax = nram_out + channels_align; + float *nram_a = nram_out + 2 * channels_align; + float *nram_b = nram_out + 3 * channels_align; + + uint32_t c_bins_num = rois_num * p_height * p_width; + uint32_t task_bins = c_bins_num / taskDim; + uint32_t rem_bins = c_bins_num % taskDim; + if (taskId < rem_bins) { + task_bins += 1; + } + int bin_first = + (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId); + int bins_loop = bin_first + task_bins; + + T *input_base = NULL; + T *output_base = output_v + bin_first * channels; + int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL; + int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims; + int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims; + bool is_empty = false; + bool pong_is_empty = false; + bool is_first_bin = true; + uint32_t src_offset = 0; + uint32_t dst_offset = 0; + uint32_t nram_offset = 0; + uint32_t half_offset = + is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0; + float *nram_tmp = NULL; + + uint32_t c_slice = 0; + uint32_t c_slice_align = 0; + uint32_t pongc_slice = 0; + uint32_t pongc_slice_align = 0; + for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) { + getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels, + p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1, + &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims, + &input_base, &is_empty); + uint32_t c_rem = channels; + c_slice = nram_limit / bin_dims / float_div * float_div; + + if (is_first_bin && !is_empty) { + c_slice = c_slice > c_rem ? 
c_rem : c_slice; + c_slice_align = PAD_UP(c_slice, float_div); + for (int h = bin_y1; h < bin_y2; h++) { + src_offset = (h * width + bin_x1) * channels; + nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset; + if (c_slice_align == channels) { + __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, + bin_wdim * c_slice * t_size, GDRAM2NRAM); + } else { + __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, + c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size, + channels * t_size, bin_wdim - 1); + } + } + } + uint32_t c_offset = 0; + while (c_rem > 0) { + c_slice = c_slice > c_rem ? c_rem : c_slice; + c_slice_align = PAD_UP(c_slice, float_div); + + /*__memcpy_async*/ + if (c_rem - c_slice > 0 && !is_empty) { + pongc_slice = c_rem - c_slice > c_slice ? c_slice : c_rem - c_slice; + pongc_slice_align = PAD_UP(pongc_slice, float_div); + for (int h = bin_y1; h < bin_y2; h++) { + src_offset = (h * width + bin_x1) * channels + c_offset; + nram_offset = + (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset; + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset + c_slice, + pongc_slice * t_size, GDRAM2NRAM, + pongc_slice_align * t_size, channels * t_size, + bin_wdim - 1); + } + } else if (bin_i + 1 < bins_loop) { + getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width, + channels, p_height, p_width, (T)spatial_scale, &pbin_x1, + &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim, + &pbin_dims, &input_base, &pong_is_empty); + pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div); + pongc_slice = pongc_slice > channels ? channels : pongc_slice; + pongc_slice_align = PAD_UP(pongc_slice, float_div); + if (!pong_is_empty) { + for (int h = pbin_y1; h < pbin_y2; h++) { + src_offset = (h * width + pbin_x1) * channels; + nram_offset = + (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset; + if (pongc_slice_align == channels) { + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset, + pbin_wdim * pongc_slice * t_size, GDRAM2NRAM); + } else { + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset, pongc_slice * t_size, + GDRAM2NRAM, pongc_slice_align * t_size, + channels * t_size, pbin_wdim - 1); + } + } + } + } + + if (is_empty) { + __bang_write_value((T *)nram_out, c_slice_align, (T)0); + __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, + c_slice * t_size, NRAM2GDRAM); + if (NULL != argmax) { + __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); + __memcpy((int32_t *)argmax_base + dst_offset + c_offset, + (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); + } + } else { + if (is_half) { + uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div); + __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset, + bin_align64); + } + __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align, + bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1); + if (is_half) { + uint32_t c_align64 = PAD_UP(c_slice_align, half_div); + __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64); + } + __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, + c_slice * t_size, NRAM2GDRAM); + if (NULL != argmax) { + /*compute max_index*/ + __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping, + c_slice_align, bin_hdim, bin_wdim, bin_hdim, + bin_wdim, 1, 1); + convertInt2Float((float *)nram_argmax, (float *)nram_a, + (int32_t *)nram_out, (float *)nram_b, c_slice_align); + + /*compute 
input_h*/ + for (int i = 0; i < c_slice; i++) { + nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); + } + __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1, + c_slice_align); + __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width, + c_slice_align); + + /*compute input_w*/ + __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim, + c_slice_align); + __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, + c_slice_align); + __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1, + c_slice_align); + __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, + c_slice_align); + convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, + (float *)nram_out, (float *)nram_b, c_slice_align); + __memcpy((int32_t *)argmax_base + dst_offset + c_offset, + (int32_t *)nram_argmax, c_slice * sizeof(int32_t), + NRAM2GDRAM); + } + } + nram_tmp = nram_ping; + nram_ping = nram_pong; + nram_pong = nram_tmp; + c_offset += c_slice; + c_rem -= c_slice; + __asm__ volatile("sync;"); + } + dst_offset += channels; + is_first_bin = false; + } +} + +__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, + const void *input_data, + const void *input_rois, int batch, + int channels, int height, int width, + int pooled_height, int pooled_width, + int rois_num, float spatial_scale, + void *output_data, int *argmax) { + switch (data_type) { + case CNRT_FLOAT16: { + MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels, + height, width, pooled_height, pooled_width, rois_num, + (half)spatial_scale, (half *)output_data, argmax); + }; break; + case CNRT_FLOAT32: { + MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, + channels, height, width, pooled_height, pooled_width, + rois_num, (float)spatial_scale, (float *)output_data, + argmax); + }; break; + default: { break; } + } +} +} // namespace forward + +namespace backward { +// Convert index of argmax from global grads_image to local bin in RoI. Vector +// operations do not support int type, so conversion from int to float is +// performed here. +__mlu_func__ void convertIndex( + int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1, + int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int, + int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w, + int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w, + float *nram_atomic_add, float *nram_grads_image, int width, int height, + int wstart, int hstart, int w_compute, int h_compute, int align_c, + int channels, int loop_flag, int loop_id, int true_limit) { + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); + + // This step uses scalar division, because the above vector division causes + // rounding accuracy problem. + for (int i = 0; i < channels; ++i) { + *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width; + } + + // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width' + // operation. 
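+ // The stored argmax is a flat offset h_im * width + w_im into the full
+ // feature map. The steps below recover h_im (integer division) and w_im
+ // (remainder), shift them by hstart/wstart into bin-local coordinates,
+ // and re-linearize them as h * w_compute + w for __bang_maxpool_bp.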
+ convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1, + (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2, + align_c); + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2, + align_c); + + // Perform 'temp_result - hstart' operation + __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, + align_c); + + // Perform 'temp_result1 - temp_result2 * width' operation + __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, + align_c); + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); + __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, + (float *)nram_argmax_fp_w, align_c); + + // Perform 'temp_result - wstart' operation + __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, + wstart, align_c); + + // Perform 'temp_result = h * w_compute + w' operation + __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + w_compute, align_c); + __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (float *)nram_argmax_fp_w, align_c); + + if (loop_flag == 1) { + __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (loop_id * true_limit), align_c); + } + convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, + (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, + align_c); +} + +template +__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, + const int32_t *argmax, T *grads_image, + int channels, int height, int width, + int pooled_height, int pooled_width, + int rois_num, const T spatial_scale, + int high_precision) { + // Calculate the number of rois processed by each core + int bin_num = rois_num * pooled_height * pooled_width; + int loop = + (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim); + int tid = taskId * loop; + if (bin_num % taskDim != 0) { + if (tid >= bin_num) { + return; + } else { + // last part is (bin_num - tid). + loop = bin_num - tid < loop ? bin_num - tid : loop; + } + } + int align_c = PAD_UP(channels, ALIGN_SIZE); + // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM. + int data_size = + PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c - + (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) / + 2), + ALIGN_SIZE); + int hw_limit = data_size / align_c; + float *nram_grads = (float *)nram_buffer; + for (int idx = tid; idx < tid + loop; ++idx) { + // (n, ph, pw) is a C in the pooled output + int pw = idx % pooled_width; + int ph = (idx / pooled_width) % pooled_height; + int n = idx / pooled_width / pooled_height; + + const T *offset_rois = (const T *)(rois + n * 5); + int roi_batch_ind = int(offset_rois[0]); + // Calculate the roi region on feature maps + int roi_start_w = round(offset_rois[1] * spatial_scale); + int roi_start_h = round(offset_rois[2] * spatial_scale); + int roi_end_w = round(offset_rois[3] * spatial_scale); + int roi_end_h = round(offset_rois[4] * spatial_scale); + // Force malformed rois to 1x1 + int roi_width = + roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1; + int roi_height = + roi_end_h - roi_start_h + 1 > 1 ? 
roi_end_h - roi_start_h + 1 : 1; + T bin_size_h = (T)roi_height / (T)pooled_height; + T bin_size_w = (T)roi_width / (T)pooled_width; + + // The corresponding bin region + int hstart = int(floor((T)ph * bin_size_h)); + int wstart = int(floor((T)pw * bin_size_w)); + int hend = int(ceil((T)(ph + 1) * bin_size_h)); + int wend = int(ceil((T)(pw + 1) * bin_size_w)); + + // Add roi offsets and clip to input boundaries, min(max(A, B), C); + hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0; + hstart = hstart < height ? hstart : height; + hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0; + hend = hend < height ? hend : height; + wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0; + wstart = wstart < width ? wstart : width; + wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0; + wend = wend < width ? wend : width; + + bool is_empty = (hend <= hstart) || (wend <= wstart); + if (!is_empty) { + int h_compute = hend - hstart; + int w_compute = wend - wstart; + int true_limit = + hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute; + int loop_int = (h_compute * w_compute) / true_limit; + int rem = (h_compute * w_compute) % true_limit; + int32_t *nram_argmax = (int32_t *)nram_grads + align_c; + int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c; + int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; + int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; + int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; + int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; + int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; + int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; + int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; + float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c; + float *nram_grads_image = (float *)nram_atomic_add + align_c; + if (true_limit == h_compute * w_compute) { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |---------------------------------------------------| + * | argmax_temp | + * |---------------------------------------------------| + * | atomic_add | + * |---------------------------------------------------| + * | grads_image | + * |---------------------------------------------------| + */ + + // Load the data from GDRAM to NRAM. + __memcpy( + (T *)nram_grads + align_c * high_precision, + (const T *)grads + + (n * pooled_height * pooled_width + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + align_c * high_precision, + align_c); + } + + __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(int32_t), GDRAM2NRAM); + + // Perform pooling operation on NRAM. 
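+ // convertIndex rewrites the global argmax offsets as bin-local positions;
+ // __bang_maxpool_bp then routes each channel's incoming gradient to that
+ // position inside the h_compute x w_compute window in nram_grads_image.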
+ convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart, + hstart, w_compute, h_compute, align_c, channels, 0, 0, 0); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, align_c, h_compute, + w_compute, h_compute, w_compute, h_compute, + w_compute); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, + h_compute * w_compute * align_c); + } + + // Store the result on NRAM back to GDRAM. + for (int hc = 0; hc < h_compute; ++hc) { + for (int wc = 0; wc < w_compute; ++wc) { + T *dst = (T *)nram_atomic_add; + int grad_image_offset = (roi_batch_ind * height * width + + (hc + hstart) * width + wc + wstart) * + channels; + T *src1 = (T *)grads_image + grad_image_offset; + int nram_grads_image_offset = (hc * w_compute + wc) * align_c; + T *src2 = (T *)nram_grads_image + nram_grads_image_offset; + __bang_atomic_add(dst, src1, src2, channels); + } + } + } else if (true_limit > 0) { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |--------------------ping_pong----------------------| + * | argmax_temp | argmax_temp | + * |------------------------|--------------------------| + * | atomic_add | atomic_add | + * |------------------------|--------------------------| + * | grads_image | grads_image | + * |---------------------------------------------------| + */ + + // Load the data from GDRAM to NRAM. + __memcpy( + (T *)nram_grads + align_c * high_precision, + (const T *)grads + + (n * pooled_height * pooled_width + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + align_c * high_precision, + align_c); + } + __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(int32_t), GDRAM2NRAM); + + int ping_pong = 0; + int ping_pong_offset = + (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2; + for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { + int size = (loop_id == loop_int) ? rem : true_limit; + if (size == 0) { + break; + } + // Perform pooling operation on NRAM. + nram_argmax_fp = + (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset; + nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; + nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; + nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; + nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; + nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; + nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; + nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; + nram_atomic_add = (float *)nram_argmax_fp_w + align_c; + nram_grads_image = (float *)nram_atomic_add + align_c; + int loop_id_1 = loop_id; + int size_1 = ((loop_id_1) == loop_int) ? 
rem : true_limit; + if (size_1 == 0) { + break; + } + convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart, + hstart, w_compute, h_compute, align_c, channels, 1, + loop_id_1, true_limit); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, align_c, size_1, 1, + size_1, 1, size_1, 1); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, size_1 * align_c); + } + + // Store the result on NRAM back to GDRAM. + for (int index_size = 0; index_size < size; ++index_size) { + int h = (loop_id * true_limit + index_size) / w_compute; + int w = (loop_id * true_limit + index_size) % w_compute; + T *dst = (T *)nram_atomic_add; + T *grads_image_n = + (T *)grads_image + roi_batch_ind * height * width * channels; + T *src1 = (T *)grads_image_n + + ((h + hstart) * width + (w + wstart)) * channels; + T *src2 = (T *)nram_grads_image + index_size * align_c; + __bang_atomic_add(dst, src1, src2, channels); + } + ping_pong = 1 - ping_pong; + } + } else { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |--------------------ping_pong----------------------| + * | argmax_temp | argmax_temp | + * |------------------------|--------------------------| + * | atomic_add | atomic_add | + * |------------------------|--------------------------| + * | grads_image | grads_image | + * |---------------------------------------------------| + */ + + int c_limit = + PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) / + (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2), + ALIGN_SIZE); + int loop_int = channels / c_limit; + int rem = channels % c_limit; + int ping_pong = 0; + int ping_pong_offset = + (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2; + for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { + int size = (loop_id == loop_int) ? rem : c_limit; + if (size == 0) { + break; + } + nram_argmax_fp = + (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset; + nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit; + nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit; + nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit; + nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit; + nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit; + nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit; + nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit; + nram_atomic_add = (float *)nram_argmax_fp_w + c_limit; + nram_grads_image = (float *)nram_atomic_add + c_limit; + + // This pipeline loads the data from GDRAM to NRAM. 
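+ // Channels are processed in chunks of c_limit; each chunk alternates
+ // between the two ping-pong halves of NRAM laid out in the diagram above.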
+ __memcpy((T *)nram_grads + c_limit * high_precision, + (const T *)grads + + n * pooled_height * pooled_width * channels + + ph * pooled_width * channels + pw * channels + + loop_id * c_limit, + size * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + c_limit * high_precision, + c_limit); + } + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + + n * pooled_height * pooled_width * channels + + ph * pooled_width * channels + pw * channels + + loop_id * c_limit, + size * sizeof(int32_t), GDRAM2NRAM); + + for (int hc = 0; hc < h_compute; ++hc) { + for (int wc = 0; wc < w_compute; ++wc) { + // This pipeline performs pooling operation on NRAM. + convertIndex( + nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart + wc, + hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1, + 1, 1); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, c_limit); + } + // This pipeline stores the result on NRAM back to GDRAM. + T *dst = (T *)nram_atomic_add; + T *grads_image_n = + (T *)grads_image + roi_batch_ind * height * width * channels; + T *src1 = (T *)grads_image_n + + ((hc + hstart) * width + (wc + wstart)) * channels + + loop_id * c_limit; + T *src2 = (T *)nram_grads_image; + __bang_atomic_add(dst, src1, src2, size); + } + } + ping_pong = 1 - ping_pong; + } + } + } + } +} + +__mlu_global__ void MLUKernelRoiPoolBackward( + const void *grads, const void *rois, const int *argmax, void *grads_image, + int rois_num, int pooled_height, int pooled_width, int channels, int no, + int height, int width, const float spatial_scale, + const cnrtDataType_t k_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (k_dtype) { + case CNRT_FLOAT16: { + // Using the float type '__bang_max_pool_bp' instruction to increase the + // bit width. 
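+ // high_precision == 1: half gradients are staged at an align_c offset in
+ // NRAM, widened to float for __bang_maxpool_bp, and narrowed back with
+ // __bang_float2half_rd before the atomic add into grads_image.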
+ const int high_precision = 1; + MLUUnion1Roipool((const half *)rois, (const half *)grads, + (const int32_t *)argmax, (half *)grads_image, channels, + height, width, pooled_height, pooled_width, rois_num, + (const half)spatial_scale, high_precision); + }; break; + case CNRT_FLOAT32: { + const int high_precision = 0; + MLUUnion1Roipool((const float *)rois, (const float *)grads, + (const int32_t *)argmax, (float *)grads_image, channels, + height, width, pooled_height, pooled_width, rois_num, + (const float)spatial_scale, high_precision); + }; break; + default: { break; } + } +} +} // namespace backward + +void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t data_type, + const void *input_data, const void *input_rois, + const int batch, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const int rois_num, + const float spatial_scale, void *output_data, + int *argmax) { + forward::MLUKernelRoiPool<<>>( + data_type, input_data, input_rois, batch, channels, height, width, + pooled_height, pooled_width, rois_num, spatial_scale, output_data, + argmax); +} + +void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t k_dtype, + const void *grad_output_ptr, const void *rois_ptr, + const int *argmax_ptr, void *grad_input_ptr, + const int box_num, const int pooled_height, + const int pooled_width, const int channels, + const int batch, const int height, const int width, + const float spatial_scale) { + backward::MLUKernelRoiPoolBackward<<>>( + grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num, + pooled_height, pooled_width, channels, batch, height, width, + spatial_scale, k_dtype); +} diff --git a/external/cv/mmcv/ops/csrc/common/mps/MPSDevice.h b/external/cv/mmcv/ops/csrc/common/mps/MPSDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..e1d9d49618d7aea6a30b42630350c5a7b77ea0ac --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mps/MPSDevice.h @@ -0,0 +1,64 @@ +// Copyright © 2022 Apple Inc. + +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h + +#pragma once +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +typedef id MTLDevice_t; +#else +typedef void* MTLDevice; +typedef void* MTLDevice_t; +#endif + +using namespace std; + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. 
+ */ + MTLDevice_t device() { return _mtl_device; } + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +TORCH_API bool is_available(); + +TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace mps +} // namespace at diff --git a/external/cv/mmcv/ops/csrc/common/mps/MPSLibrary.h b/external/cv/mmcv/ops/csrc/common/mps/MPSLibrary.h new file mode 100644 index 0000000000000000000000000000000000000000..41c33fba8cbdd43cc5b3285603c11c6f9eee617b --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mps/MPSLibrary.h @@ -0,0 +1,61 @@ +#ifndef _MPS_LIBRARY_H_ +#define _MPS_LIBRARY_H_ + +#include +#include + +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLComputePipelineState_t; +typedef id MTLLibrary_t; +#else +typedef void* MTLComputePipelineState; +typedef void* MTLComputePipelineState_t; +typedef void* MTLLibrary; +typedef void* MTLLibrary_t; +#endif + +class MPSLibrary { + public: + // disable constructor for singleton + static MPSLibrary* createFromUrl(const std::string& library_url); + static MPSLibrary* createFromSource(const std::string& source); + ~MPSLibrary(); + + MTLLibrary_t library() { return _library; } + + MTLComputePipelineState_t getComputePipelineState( + const std::string& function_name); + + private: + MTLLibrary_t _library; + std::unordered_map _pso_map; +}; + +class MPSLibraryManager { + public: + // disable constructor for singleton + MPSLibraryManager(const MPSLibraryManager&) = delete; + MPSLibraryManager& operator=(const MPSLibraryManager&) = delete; + MPSLibraryManager(MPSLibraryManager&&) = delete; + MPSLibraryManager& operator=(MPSLibraryManager&&) = delete; + + static MPSLibraryManager* getInstance(); + + bool hasLibrary(const std::string& name); + + MPSLibrary* getLibrary(const std::string& library_url); + + MPSLibrary* createLibraryFromSouce(const std::string& name, + const std::string& sources); + + ~MPSLibraryManager(); + + private: + MPSLibraryManager(); + std::unordered_map> _library_map; +}; +#endif diff --git a/external/cv/mmcv/ops/csrc/common/mps/MPSLibrary.mm b/external/cv/mmcv/ops/csrc/common/mps/MPSLibrary.mm new file mode 100644 index 0000000000000000000000000000000000000000..99addc7e28222f890e0b65660bb97711b6b52305 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mps/MPSLibrary.mm @@ -0,0 +1,107 @@ +#include "MPSLibrary.h" +#include "MPSDevice.h" + +static std::unique_ptr mps_library_manager=nullptr; + +MPSLibraryManager* MPSLibraryManager::getInstance() { + if(!mps_library_manager) + mps_library_manager = std::unique_ptr(new MPSLibraryManager()); + return mps_library_manager.get(); +} + +MPSLibraryManager::~MPSLibraryManager() {} + +MPSLibraryManager::MPSLibraryManager() {} + +bool MPSLibraryManager::hasLibrary(const std::string& name) { + return _library_map.find(name) != _library_map.end(); +} + +MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) { + if (_library_map.find(library_url) != _library_map.end()) { + return _library_map[library_url].get(); + } + _library_map.emplace(std::make_pair( + library_url, std::unique_ptr(MPSLibrary::createFromUrl(library_url)))); + return _library_map[library_url].get(); +} + +MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name, + const std::string& source) { + NSString* ns_name = [NSString stringWithCString:name.c_str()]; + if (_library_map.find(name) != _library_map.end()) { + NSLog(@"Library %@ already exist.", ns_name); + return nullptr; + } + + 
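+ // Compile the Metal source once and cache the library under `name`, so a
+ // later hasLibrary(name) lookup can reuse it.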
_library_map.emplace( + std::make_pair(name, std::unique_ptr(MPSLibrary::createFromSource(source)))); + return _library_map[name].get(); +} + +MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* utl_str = [NSString stringWithCString:library_url.c_str()]; + NSURL* metal_url = [NSURL fileURLWithPath:utl_str]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* code_str = [NSString stringWithCString:sources.c_str()]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str + options:nil + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary::~MPSLibrary() { + [_library release]; + _library = nil; +} + +MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) { + if (_pso_map.find(function_name) != _pso_map.end()) { + return _pso_map[function_name]; + } + + MTLComputePipelineState_t pso; + @autoreleasepool { + NSError* error = nil; + + // create function + NSString* function_name_str = [NSString stringWithCString:function_name.c_str()]; + id func = [_library newFunctionWithName:function_name_str]; + if (func == nil) { + NSLog(@"Failed to created pipeline state object, error %@.", error); + exit(1); + } + // create pipeline + pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func + error:&error]; + _pso_map.emplace(std::make_pair(function_name, pso)); + } + return _pso_map[function_name]; +} diff --git a/external/cv/mmcv/ops/csrc/common/mps/MPSStream.h b/external/cv/mmcv/ops/csrc/common/mps/MPSStream.h new file mode 100644 index 0000000000000000000000000000000000000000..54cd388494c8bbac636db44dd5c8afd1915357c6 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mps/MPSStream.h @@ -0,0 +1,132 @@ +// Copyright © 2022 Apple Inc. + +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h + +#pragma once + +#include +#include + +#include +#include +#include +#include "MPSDevice.h" + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +class TORCH_API MPSStream { + public: + enum Unchecked { UNCHECKED }; + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. 
+ explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MTLCommandBuffer_t commandBuffer(); + void commit(bool flush); + void commitAndWait(); + void synchronize(); + + void flush(); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device]; } + + /// Explicit conversion to Stream. + Stream unwrap() const { return _stream; } + + private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MTLCommandBuffer_t _commandBuffer = nil; + void _flush(bool commitAndWait) const; + + dispatch_queue_t _serialQueue = nullptr; +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl { + public: + /** + * Gets single instance of the MPSStream. + */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +struct TORCH_API MPSEvent { + MPSEvent(); + // MPSEvent(id device); + + ~MPSEvent(); + MTLSharedEvent_t event() const { return _event; } + + void recordEvent(MPSStream* stream); + void waitForEvent(MPSStream* queue); // waits on the cpu + bool queryEvent(); + uint64_t getCurrentValue() { return _currentValue; } + void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } + + private: + bool _isRecorded = false; + uint64_t _currentValue = 0; + MTLSharedEvent_t _event; +}; + +typedef MPSEvent* mpsEvent_t; + +} // namespace mps +} // namespace at diff --git a/external/cv/mmcv/ops/csrc/common/mps/MPSUtils.h b/external/cv/mmcv/ops/csrc/common/mps/MPSUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..2a4ce6d7978d566e88dd22ee4f9722df914ff0de --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/mps/MPSUtils.h @@ -0,0 +1,51 @@ +#ifndef _MPS_UTILS_H_ +#define _MPS_UTILS_H_ +#include +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLBuffer_t; +typedef id MTLComputeCommandEncoder_t; +#else +typedef void* MTLBuffer; +typedef void* MTLBuffer_t; +typedef void* MTLComputeCommandEncoder; +typedef void* MTLComputeCommandEncoder_t; +#endif + +// utils +static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) { + return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data()); +} + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t); + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index]; +} + +template , at::Tensor>::value, bool>> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBytes:&t length:sizeof(t) atIndex:index]; +} + +inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {} + +template +void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, 
Args&&... args) { + setMTLArg(encoder, index, std::forward(t)); + setMTLArgsImpl(encoder, index + 1, std::forward(args)...); +} + +template +void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) { + [encoder setComputePipelineState:pso]; + setMTLArgsImpl(encoder, 0, std::forward(args)...); +} +#endif diff --git a/external/cv/mmcv/ops/csrc/common/parrots_cpp_helper.hpp b/external/cv/mmcv/ops/csrc/common/parrots_cpp_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..72701890dd727db911a1c0ce4d6790c1b531348d --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/parrots_cpp_helper.hpp @@ -0,0 +1,40 @@ +#ifndef PARROTS_CPP_HELPER +#define PARROTS_CPP_HELPER +#include +#include +#include +#include +#include + +using namespace parrots; + +#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \ + case prim_type: { \ + using scalar_t = type; \ + return __VA_ARGS__(); \ + } + +#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +#endif // PARROTS_CPP_HELPER diff --git a/external/cv/mmcv/ops/csrc/common/parrots_cuda_helper.hpp b/external/cv/mmcv/ops/csrc/common/parrots_cuda_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..539009c3f91b46ea58a3a64f0875d799e8bd0b65 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/parrots_cuda_helper.hpp @@ -0,0 +1,111 @@ +#ifndef PARROTS_CUDA_HELPER +#define PARROTS_CUDA_HELPER + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common_cuda_helper.hpp" +#include "parrots_cudawarpfunction.cuh" + +using namespace parrots; +using phalf = float16; + +#define __PHALF(x) (x.y) + +#define PARROTS_CUDA_CHECK(exp) \ + do { \ + cudaError_t err = exp; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "cudaCheckError() failed : %s\n", \ + cudaGetErrorString(err)); \ + exit(-1); \ + } \ + } while (0) + +#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \ + case prim_type: { \ + using scalar_t = type; \ + return __VA_ARGS__(); \ + } + +#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +/** atomicAdd **/ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + if (val == 0.0) return __longlong_as_double(old); + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#endif + +static __inline__ __device__ float16 atomicAdd(float16* address, float16 val) { + unsigned int* aligned = + (unsigned int*)((size_t)address - ((size_t)address & 2)); + unsigned int old = *aligned; + unsigned int assumed; + unsigned short old_as_us; + do { + assumed = old; + old_as_us = + (unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff); + +#if __CUDACC_VER_MAJOR__ >= 9 + float16 tmp; + tmp.x = old_as_us; + float16 sum = tmp + val; + unsigned short sum_as_us = sum.x; +// half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us)) +// + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum); +#else + unsigned short sum_as_us = + __float2half_rn(__half2float(old_as_us) + (float)(val)); +#endif + + unsigned int sum_as_ui = (size_t)address & 2 + ? (sum_as_us << 16) | (old & 0xffff) + : (old & 0xffff0000) | sum_as_us; + old = atomicCAS(aligned, assumed, sum_as_ui); + } while (assumed != old); + //__half_raw raw = {old_as_us}; + // return float16(raw); + return *reinterpret_cast(&old_as_us); +} +#endif // PARROTS_CUDA_HELPER diff --git a/external/cv/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/external/cv/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f68e8740561ef833c09e1ba9f999922f5d04bce5 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -0,0 +1,27 @@ +#ifndef PYTORCH_CPP_HELPER +#define PYTORCH_CPP_HELPER +#include + +#include + +using namespace at; + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_MLU(x) \ + TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") +#define CHECK_CPU(x) \ + TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_CUDA_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_MLU_INPUT(x) \ + CHECK_MLU(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_CPU_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_CONTIGUOUS(x) + +#endif // PYTORCH_CPP_HELPER diff --git a/external/cv/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp b/external/cv/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..52e512695a403abe2688f9bffeece633a02f189a --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp @@ -0,0 +1,20 @@ +#ifndef PYTORCH_CUDA_HELPER +#define PYTORCH_CUDA_HELPER + +#include +#include +#include + +#include +#include + +#include "common_cuda_helper.hpp" + +using at::Half; +using at::Tensor; +using phalf = at::Half; + +#define __PHALF(x) (x) 
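+// DIVUP(m, n) below is integer ceiling division: for m >= 0 and n > 0 it
+// evaluates to ceil(m / n). A typical use is sizing a CUDA launch grid; the
+// sketch below is illustrative only, and `num_elements` / `some_kernel` are
+// placeholder names:
+//   const int threads = 512;
+//   const int blocks = DIVUP(num_elements, threads);
+//   some_kernel<<<blocks, threads>>>(...);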
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#endif // PYTORCH_CUDA_HELPER diff --git a/external/cv/mmcv/ops/csrc/common/pytorch_device_registry.hpp b/external/cv/mmcv/ops/csrc/common/pytorch_device_registry.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a32b7270c3521f960394af7d18cbbd03ba50df1 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/pytorch_device_registry.hpp @@ -0,0 +1,141 @@ +#ifndef PYTORCH_DEVICE_REGISTRY_H +#define PYTORCH_DEVICE_REGISTRY_H + +// Using is recommended in the official documentation in +// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. +// However, we use for compatibility with CUDA 9.0 +// Read https://github.com/pytorch/extension-cpp/issues/35 for more details. +#include + +#include +#include +#include +#include + +inline std::string GetDeviceStr(const at::Device& device) { + std::string str = DeviceTypeName(device.type(), true); + if (device.has_index()) { + str.push_back(':'); + str.append(std::to_string(device.index())); + } + return str; +} + +// Registry +template +class DeviceRegistry; + +template +class DeviceRegistry { + public: + using FunctionType = Ret (*)(Args...); + static const int MAX_DEVICE_TYPES = + int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); + + void Register(at::DeviceType device, FunctionType function) { + funcs_[int8_t(device)] = function; + } + + FunctionType Find(at::DeviceType device) const { + return funcs_[int8_t(device)]; + } + + static DeviceRegistry& instance() { + static DeviceRegistry inst; + return inst; + } + + private: + DeviceRegistry() { + for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) { + funcs_[i] = nullptr; + } + }; + FunctionType funcs_[MAX_DEVICE_TYPES]; +}; + +// get device of first tensor param + +template , at::Tensor>::value, + bool> = true> +at::Device GetFirstTensorDevice(T&& t, Args&&... args) { + return std::forward(t).device(); +} +template , at::Tensor>::value, + bool> = true> +at::Device GetFirstTensorDevice(T&& t, Args&&... args) { + return GetFirstTensorDevice(std::forward(args)...); +} + +// check device consistency + +inline std::pair CheckDeviceConsistency( + const at::Device& device, int index) { + return {index, device}; +} + +template , at::Tensor>::value, + bool> = true> +std::pair CheckDeviceConsistency(const at::Device& device, + int index, T&& t, + Args&&... args); + +template , at::Tensor>::value, + bool> = true> +std::pair CheckDeviceConsistency(const at::Device& device, + int index, T&& t, + Args&&... args) { + auto new_device = std::forward(t).device(); + if (new_device.type() != device.type() || + new_device.index() != device.index()) { + return {index, new_device}; + } + return CheckDeviceConsistency(device, index + 1, std::forward(args)...); +} + +template < + typename T, typename... Args, + std::enable_if_t, at::Tensor>::value, bool>> +std::pair CheckDeviceConsistency(const at::Device& device, + int index, T&& t, + Args&&... args) { + return CheckDeviceConsistency(device, index + 1, std::forward(args)...); +} + +// dispatch + +template +auto Dispatch(const R& registry, const char* name, Args&&... 
args) {
+  auto device = GetFirstTensorDevice(std::forward<Args>(args)...);
+  auto inconsist =
+      CheckDeviceConsistency(device, 0, std::forward<Args>(args)...);
+  TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ",
+              inconsist.first,
+              ", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(),
+              " vs ", GetDeviceStr(device).c_str(), "\n")
+  auto f_ptr = registry.Find(device.type());
+  TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ",
+              GetDeviceStr(device).c_str(), " not found.\n")
+  return f_ptr(std::forward<Args>(args)...);
+}
+
+// helper macro
+
+#define DEVICE_REGISTRY(key) DeviceRegistry<decltype(&(key)), key>::instance()
+
+#define REGISTER_DEVICE_IMPL(key, device, value)           \
+  struct key##_##device##_registerer {                     \
+    key##_##device##_registerer() {                        \
+      DEVICE_REGISTRY(key).Register(at::k##device, value); \
+    }                                                      \
+  };                                                       \
+  static key##_##device##_registerer _##key##_##device##_registerer;
+
+#define DISPATCH_DEVICE_IMPL(key, ...) \
+  Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__)
+
+#endif  // PYTORCH_DEVICE_REGISTRY
diff --git a/external/cv/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp b/external/cv/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e49572ca841211e2960192f1e0955b54819086cc
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (C) 2021 Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/ +#ifndef PYTORCH_MLU_HELPER_HPP_ +#define PYTORCH_MLU_HELPER_HPP_ + +#ifdef MMCV_WITH_MLU +#include "aten.h" + +#define NFU_ALIGN_SIZE 128 + +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) + +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) + +#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +inline int32_t getJobLimitCapability() { + CNcontext drv_ctx; + TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails"); + CNctxConfigParam ctx_conf_param; + TORCH_CHECK( + CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT, + &ctx_conf_param), + "cnGetCtxConfigParam fails."); + return (int32_t)ctx_conf_param.unionLimit; +} + +inline int32_t getCoreNumOfJobLimitCapability() { + switch (getJobLimitCapability()) { + default: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * + getJobLimitCapability(); + case CN_KERNEL_CLASS_BLOCK: + return 1; + case CN_KERNEL_CLASS_UNION: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + case CN_KERNEL_CLASS_UNION2: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2; + case CN_KERNEL_CLASS_UNION4: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4; + case CN_KERNEL_CLASS_UNION8: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8; + case CN_KERNEL_CLASS_UNION16: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16; + } +} + +#endif // MMCV_WITH_MLU + +#endif // PYTORCH_MLU_HELPER_HPP_ diff --git a/external/cv/mmcv/ops/csrc/common/pytorch_npu_helper.hpp b/external/cv/mmcv/ops/csrc/common/pytorch_npu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..073d6b38c345ed480542c2dd68d9fc256a4665ae --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/pytorch_npu_helper.hpp @@ -0,0 +1,47 @@ +/****************************************************************************** + * Copyright (c) 2022 Huawei Technologies Co., Ltd + * All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ + +#ifndef PYTORCH_NPU_HELPER_HPP_ +#define PYTORCH_NPU_HELPER_HPP_ + +#include +#include +#include + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +#define NPU_NAME_SPACE at_npu::native + +#ifdef MMCV_WITH_XLA +#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value) +#else +#define REGISTER_NPU_IMPL(key, value) \ + REGISTER_DEVICE_IMPL(key, PrivateUse1, value) +#endif + +#ifdef MMCV_WITH_XLA +#define CHECK_NPU(x) \ + TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor") +#else +#define CHECK_NPU(x) \ + TORCH_CHECK(x.device().type() == at::kPrivateUse1, #x \ + " must be a NPU " \ + "tensor") + +#endif +#endif // PYTORCH_NPU_HELPER_HPP_ diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h new file mode 100644 index 0000000000000000000000000000000000000000..f23ff4482324c51012865c42f2a5f9e59d54848a --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h @@ -0,0 +1,70 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PARAMS_GRID_H_ +#define PARAMS_GRID_H_ +#include +#include + +namespace detail { +template +int getTotalSize(std::vector arg) { + return arg.size(); +} + +template +int getTotalSize(std::vector arg, std::vector... args) { + return arg.size() * getTotalSize(args...); +} + +template +int getSize(std::vector arg) { + return arg.size(); +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg) { + std::get(src) = arg[counter[Idx]]; +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg, + std::vector &... args) { + std::get(src) = arg[counter[Idx]]; + assigner(src, counter, args...); +} +} // namespace detail + +template +std::vector> paramsGrid(std::vector... args) { + int length = detail::getTotalSize(args...); + std::vector sizes = {detail::getSize(args)...}; + int size = sizes.size(); + + std::vector> params(length); + std::vector counter(size); + for (int i = 0; i < length; ++i) { + detail::assigner<0>(params[i], counter, args...); + counter[size - 1] += 1; + for (int c = size - 1; c >= 0; --c) { + if (counter[c] == sizes[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return params; +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/prettyprint.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/prettyprint.h new file mode 100644 index 0000000000000000000000000000000000000000..0a6bdc3361dc1ada31fdebef87989672c9aeb51c --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/prettyprint.h @@ -0,0 +1,493 @@ +// Copyright Louis Delacroix 2010 - 2014. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// A pretty printing library for C++ +// +// Usage: +// Include this header, and operator<< will "just work". + +#ifndef H_PRETTY_PRINT +#define H_PRETTY_PRINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pretty_print { +namespace detail { +// SFINAE type trait to detect whether T::const_iterator exists. + +struct sfinae_base { + using yes = char; + using no = yes[2]; +}; + +template +struct has_const_iterator : private sfinae_base { + private: + template + static yes &test(typename C::const_iterator *); + template + static no &test(...); + + public: + static const bool value = sizeof(test(nullptr)) == sizeof(yes); + using type = T; +}; + +template +struct has_begin_end : private sfinae_base { + private: + template + static yes & + f(typename std::enable_if< + std::is_same(&C::begin)), + typename C::const_iterator (C::*)() const>::value>::type *); + + template + static no &f(...); + + template + static yes &g(typename std::enable_if< + std::is_same(&C::end)), + typename C::const_iterator (C::*)() const>::value, + void>::type *); + + template + static no &g(...); + + public: + static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); + static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); +}; + +} // namespace detail + +// Holds the delimiter values for a specific character type + +template +struct delimiters_values { + using char_type = TChar; + const char_type *prefix; + const char_type *delimiter; + const char_type *postfix; +}; + +// Defines the delimiter values for a specific container and character type + +template +struct delimiters { + using type = delimiters_values; + static const type values; +}; + +// Functor to print containers. You can use this directly if you want +// to specify a non-default delimiters type. The printing logic can +// be customized by specializing the nested template. 
+ +template , + typename TDelimiters = delimiters> +struct print_container_helper { + using delimiters_type = TDelimiters; + using ostream_type = std::basic_ostream; + + template + struct printer { + static void print_body(const U &c, ostream_type &stream) { + using std::begin; + using std::end; + + auto it = begin(c); + const auto the_end = end(c); + + if (it != the_end) { + for (;;) { + stream << *it; + + if (++it == the_end) break; + + if (delimiters_type::values.delimiter != NULL) + stream << delimiters_type::values.delimiter; + } + } + } + }; + + print_container_helper(const T &container) : container_(container) {} + + inline void operator()(ostream_type &stream) const { + if (delimiters_type::values.prefix != NULL) + stream << delimiters_type::values.prefix; + + printer::print_body(container_, stream); + + if (delimiters_type::values.postfix != NULL) + stream << delimiters_type::values.postfix; + } + + private: + const T &container_; +}; + +// Specialization for pairs + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + + static void print_body(const std::pair &c, ostream_type &stream) { + stream << c.first; + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + stream << c.second; + } +}; + +// Specialization for tuples + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + using element_type = std::tuple; + + template + struct Int {}; + + static void print_body(const element_type &c, ostream_type &stream) { + tuple_print(c, stream, Int<0>()); + } + + static void tuple_print(const element_type &, ostream_type &, + Int) {} + + static void tuple_print( + const element_type &c, ostream_type &stream, + typename std::conditional, + std::nullptr_t>::type) { + stream << std::get<0>(c); + tuple_print(c, stream, Int<1>()); + } + + template + static void tuple_print(const element_type &c, ostream_type &stream, Int) { + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + + stream << std::get(c); + + tuple_print(c, stream, Int()); + } +}; + +// Prints a print_container_helper to the specified stream. 
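+// For illustration, the helper can also be constructed directly; with the
+// default delimiters a vector prints as "[1, 2, 3]":
+//   std::vector<int> v{1, 2, 3};
+//   std::cout << pretty_print::print_container_helper<std::vector<int>>(v);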
+ +template +inline std::basic_ostream &operator<<( + std::basic_ostream &stream, + const print_container_helper &helper) { + helper(stream); + return stream; +} + +// Basic is_container template; specialize to derive from std::true_type for all +// desired container types + +template +struct is_container + : public std::integral_constant::value && + detail::has_begin_end::beg_value && + detail::has_begin_end::end_value> {}; + +template +struct is_container : std::true_type {}; + +template +struct is_container : std::false_type {}; + +template +struct is_container> : std::true_type {}; + +template +struct is_container> : std::true_type {}; + +template +struct is_container> : std::true_type {}; + +// Default delimiters + +template +struct delimiters { + static const delimiters_values values; +}; +template +const delimiters_values delimiters::values = {"[", ", ", "]"}; +template +struct delimiters { + static const delimiters_values values; +}; +template +const delimiters_values delimiters::values = {L"[", L", ", + L"]"}; + +// Delimiters for (multi)set and unordered_(multi)set + +template +struct delimiters<::std::set, char> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::set, char>::values = {"{", ", ", + "}"}; + +template +struct delimiters<::std::set, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::set, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::multiset, char> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::multiset, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::multiset, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::multiset, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::unordered_set, char> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_set, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::unordered_set, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_set, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::unordered_multiset, + char> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_multiset, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::unordered_multiset, + wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::unordered_multiset, + wchar_t>::values = {L"{", L", ", L"}"}; + +// Delimiters for pair and tuple + +template +struct delimiters, char> { + static const delimiters_values values; +}; +template +const delimiters_values delimiters, char>::values = { + "(", ", ", ")"}; +template +struct delimiters<::std::pair, wchar_t> { + static const delimiters_values values; +}; +template +const delimiters_values + delimiters<::std::pair, wchar_t>::values = {L"(", L", ", L")"}; + +template +struct delimiters, char> { + static const delimiters_values values; +}; +template +const delimiters_values delimiters, char>::values = { + "(", ", ", ")"}; +template +struct delimiters<::std::tuple, wchar_t> { + static const delimiters_values values; +}; +template +const delimiters_values + delimiters<::std::tuple, 
wchar_t>::values = {L"(", L", ", L")"}; + +// Type-erasing helper class for easy use of custom delimiters. +// Requires TCharTraits = std::char_traits and TChar = char or wchar_t, +// and MyDelims needs to be defined for TChar. Usage: "cout << +// pretty_print::custom_delims(x)". + +struct custom_delims_base { + virtual ~custom_delims_base() {} + virtual std::ostream &stream(::std::ostream &) = 0; + virtual std::wostream &stream(::std::wostream &) = 0; +}; + +template +struct custom_delims_wrapper : custom_delims_base { + custom_delims_wrapper(const T &t_) : t(t_) {} + + std::ostream &stream(std::ostream &s) { + return s << print_container_helper, Delims>( + t); + } + + std::wostream &stream(std::wostream &s) { + return s << print_container_helper, + Delims>(t); + } + + private: + const T &t; +}; + +template +struct custom_delims { + template + custom_delims(const Container &c) + : base(new custom_delims_wrapper(c)) {} + + std::unique_ptr base; +}; + +template +inline std::basic_ostream &operator<<( + std::basic_ostream &s, const custom_delims &p) { + return p.base->stream(s); +} + +// A wrapper for a C-style array given as pointer-plus-size. +// Usage: std::cout << pretty_print_array(arr, n) << std::endl; + +template +struct array_wrapper_n { + typedef const T *const_iterator; + typedef T value_type; + + array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} + inline const_iterator begin() const { return _array; } + inline const_iterator end() const { return _array + _n; } + + private: + const T *const _array; + size_t _n; +}; + +// A wrapper for hash-table based containers that offer local iterators to each +// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket +// 5 of container m.) + +template +struct bucket_print_wrapper { + typedef typename T::const_local_iterator const_iterator; + typedef typename T::size_type size_type; + + const_iterator begin() const { return m_map.cbegin(n); } + + const_iterator end() const { return m_map.cend(n); } + + bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} + + private: + const T &m_map; + const size_type n; +}; + +} // namespace pretty_print + +// Global accessor functions for the convenience wrappers + +template +inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, + size_t n) { + return pretty_print::array_wrapper_n(a, n); +} + +template +pretty_print::bucket_print_wrapper bucket_print(const T &m, + typename T::size_type n) { + return pretty_print::bucket_print_wrapper(m, n); +} + +// Main magic entry point: An overload snuck into namespace std. +// Can we do better? + +namespace std { +// Prints a container to the stream using default delimiters + +template +inline typename enable_if<::pretty_print::is_container::value, + basic_ostream &>::type +operator<<(basic_ostream &stream, const T &container) { + return stream + << ::pretty_print::print_container_helper( + container); +} +} // namespace std + +#endif // H_PRETTY_PRINT diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..026e35b1a6b52ec74fee27fbccd2dfda5ef845ce --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h @@ -0,0 +1,60 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace py = pybind11; + +template +std::vector array2Vector(TPyObject arr) { + py::array arr_np = arr; + size_t size = arr.attr("size").template cast(); + py::array_t arr_cc = arr_np; + std::vector data(arr_cc.data(), arr_cc.data() + size); + return data; +} + +template +std::vector arrayT2Vector(py::array_t arr) { + std::vector data(arr.data(), arr.data() + arr.size()); + return data; +} + +template +tv::TensorView array2TensorView(TPyObject arr) { + py::array arr_np = arr; + py::array_t arr_cc = arr_np; + tv::Shape shape; + for (int i = 0; i < arr_cc.ndim(); ++i) { + shape.push_back(arr_cc.shape(i)); + } + return tv::TensorView(arr_cc.mutable_data(), shape); +} +template +tv::TensorView arrayT2TensorView(py::array_t arr) { + tv::Shape shape; + for (int i = 0; i < arr.ndim(); ++i) { + shape.push_back(arr.shape(i)); + } + return tv::TensorView(arr.mutable_data(), shape); +} diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h new file mode 100644 index 0000000000000000000000000000000000000000..def6fe5e125a4e8c7e38f889887a6af80557f219 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h @@ -0,0 +1,295 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
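+// Overview: getValidOutPos / getValidOutPosTranspose enumerate, for one active
+// input coordinate, the output coordinates reachable under the given kernel
+// size, stride, padding and dilation, writing NDim spatial coordinates plus a
+// flattened kernel offset per candidate into `out` and returning how many
+// candidates are valid. The getIndicePairs* helpers below use them to build
+// the input/output index pairs used by the sparse convolution implementation.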
+ +#ifndef SPCONV_GEOMETRY_H_ +#define SPCONV_GEOMETRY_H_ + +#include + +#include +#include + +template +TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, + const Index *kernelSize, + const Index *stride, const Index *padding, + const Index *dilation, + const Index *outSpatialShape, Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + + stride[i] + padding[i]) / + stride[i]; + uppers[i] = (input_pos[i] + padding[i]) / stride[i]; + } + +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } + +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; + m *= kernelSize[j]; + } + + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +TV_HOST_DEVICE Index getValidOutPosTranspose( + const Index *input_pos, const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, const Index *outSpatialShape, + Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + lowers[i] = input_pos[i] * stride[i] - padding[i]; + uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; + } +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + } + offset += m * (val - lowers[j]) / dilation[j]; + m *= kernelSize[j]; + } + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +Index getIndicePairsConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + // indicesOut: num_active * kernelVolume * (NDim + 1) + Index numAct = 0; + auto numActIn 
= indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index *validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + for (int j = 0; j < numActIn; ++j) { + batchIdx = indicesIn(j, 0); + numValidPoints = getValidOutPos( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * batchIdx; + if (gridsOut[index] == -1) { + for (unsigned k = 1; k < NDim + 1; ++k) { + indicesOut(numAct, k) = pointPtr[k - 1]; + } + indicesOut(numAct, 0) = batchIdx; + gridsOut[index] = numAct++; + } + // indicePairs: [K, 2, L] + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + return numAct; +} + +template +Index getIndicePairsDeConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + Index numAct = 0; + auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index *validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + for (int j = 0; j < numActIn; ++j) { + batchIdx = indicesIn(j, 0); + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * batchIdx; + if (gridsOut[index] == -1) { + for (unsigned k = 1; k < NDim + 1; ++k) { + indicesOut(numAct, k) = pointPtr[k - 1]; + } + indicesOut(numAct, 0) = batchIdx; + gridsOut[index] = numAct++; + } + // indicePairs: [K, 2, L] + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + return numAct; +} + +template +Index getIndicePairsSubM(tv::TensorView indicesIn, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *const kernelSize, + const Index *const stride, const Index *const padding, + const Index *dilation, + const Index *const outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + // Index validPoints[kernelVolume * (NDim + 1)]; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index *validPoints = validPoints_.data(); + 
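+  // Submanifold case: the output sparsity pattern equals the input pattern.
+  // The first loop below hashes every active input coordinate into gridsOut;
+  // the second loop enumerates each input's kernel neighbourhood and records
+  // an (input, output) index pair only where the neighbour is itself an
+  // active site (gridsOut[index] > -1).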
Index *pointPtr = nullptr; + Index index = 0; + for (int j = 0; j < numActIn; ++j) { + index = tv::rowArrayIdx(indicesIn.data() + j * (NDim + 1) + 1, + outSpatialShape) + + spatialVolume * indicesIn(j, 0); + gridsOut[index] = j; + } + for (int j = 0; j < numActIn; ++j) { + numValidPoints = getValidOutPos( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * indicesIn(j, 0); + if (gridsOut[index] > -1) { + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + } + return numActIn; +} + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h new file mode 100644 index 0000000000000000000000000000000000000000..96ce34e3b456f0c999002bd53b8b1a6ab082edae --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h @@ -0,0 +1,78 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_ +#define SPARSE_CONV_INDICE_FUNCTOR_H_ +#include + +namespace functor { +template +struct CreateConvIndicePairFunctorP1 { + Index operator()(const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, + bool transpose); +}; + +template +struct CreateConvIndicePairFunctorP2 { + Index operator()(const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, + bool transpose, bool resetGrid = false); +}; + +template +struct CreateConvIndicePairFunctor { + Index operator()(const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, + bool transpose, bool resetGrid = false); +}; + +template +struct CreateSubMIndicePairFunctor { + Index operator()(const Device& d, tv::TensorView indicesIn, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, + bool transpose, bool 
resetGrid = false); +}; +} // namespace functor + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h new file mode 100644 index 0000000000000000000000000000000000000000..78f32edd4db70724d38826809672aa461a6d065e --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h @@ -0,0 +1,37 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SPARSE_MAXPOOL_FUNCTOR_H_ +#define SPARSE_MAXPOOL_FUNCTOR_H_ +#include + +namespace functor { +template +struct SparseMaxPoolForwardFunctor { + void operator()(const Device& d, tv::TensorView outFeatures, + tv::TensorView inFeatures, + tv::TensorView indices, int size); +}; + +template +struct SparseMaxPoolBackwardFunctor { + void operator()(const Device& d, tv::TensorView outFeatures, + tv::TensorView inFeatures, + tv::TensorView fout, + tv::TensorView fin, + tv::TensorView indices, int size); +}; +} // namespace functor + +#endif diff --git a/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..8262b30efb5e127d7e079ebdde0693c671fb96d6 --- /dev/null +++ b/external/cv/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h @@ -0,0 +1,50 @@ +#ifndef MP_HELPER_H_ +#define MP_HELPER_H_ +#include +#include + +template +struct mp_list {}; + +template +using mp_list_c = mp_list...>; + +namespace detail { + +template +constexpr F mp_for_each_impl(mp_list, F &&f) { + return std::initializer_list{(f(T()), 0)...}, std::forward(f); +} + +template +constexpr F mp_for_each_impl(mp_list<>, F &&f) { + return std::forward(f); +} + +} // namespace detail + +namespace detail { + +template class B> +struct mp_rename_impl { + // An error "no type named 'type'" here means that the first argument to + // mp_rename is not a list +}; + +template