diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..3bcbe277a8218bc25b31b3d27b42cc659c27b23e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,38 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d1d9a561b76aaa8c22a6f446fb505150d5a305ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/NOTES.md b/NOTES.md
new file mode 100644
index 0000000000000000000000000000000000000000..47351a4dd132b031ef7e8f4693d1bf130de74390
--- /dev/null
+++ b/NOTES.md
@@ -0,0 +1,11 @@
+## Build wheels for mmengine, mmcv, and mmdet
+```
+cd ./external/engine
+python setup.py bdist_wheel
+
+cd ../cv
+MMCV_WITH_OPS=1 python setup.py bdist_wheel
+
+cd ../det
+python setup.py bdist_wheel
+```
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0211f40f12d6e730786bd34036869b44bd61454c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+---
+title: Sapiens Pose
+emoji: 📊
+colorFrom: pink
+colorTo: yellow
+sdk: gradio
+sdk_version: 4.42.0
+app_file: app.py
+pinned: false
+license: cc-by-nc-4.0
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..30cbf3cd8306056525ce204c459df24c8bceb9ba
--- /dev/null
+++ b/app.py
@@ -0,0 +1,453 @@
+import os
+from typing import List
+import spaces
+import gradio as gr
+import numpy as np
+import torch
+import json
+import tempfile
+import torch.nn.functional as F
+from torchvision import transforms
+from PIL import Image
+import cv2
+from gradio.themes.utils import sizes
+from classes_and_palettes import (
+ COCO_KPTS_COLORS,
+ COCO_WHOLEBODY_KPTS_COLORS,
+ GOLIATH_KPTS_COLORS,
+ GOLIATH_SKELETON_INFO,
+ GOLIATH_KEYPOINTS
+)
+
+import sys
+import subprocess
+import importlib.util
+
+def is_package_installed(package_name):
+ return importlib.util.find_spec(package_name) is not None
+
+def find_wheel(package_path):
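+    # Look for a previously built wheel in <package_path>/dist; returns None if none has been built.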
+ dist_dir = os.path.join(package_path, "dist")
+ if os.path.exists(dist_dir):
+ wheel_files = [f for f in os.listdir(dist_dir) if f.endswith('.whl')]
+ if wheel_files:
+ return os.path.join(dist_dir, wheel_files[0])
+ return None
+
+def install_from_wheel(package_name, package_path):
+ wheel_file = find_wheel(package_path)
+ if wheel_file:
+ print(f"Installing {package_name} from wheel: {wheel_file}")
+ subprocess.check_call([sys.executable, "-m", "pip", "install", wheel_file])
+ else:
+ print(f"{package_name} wheel not found in {package_path}. Please build it first.")
+ sys.exit(1)
+
+def install_local_packages():
+ packages = [
+ ("mmengine", "./external/engine"),
+ ("mmcv", "./external/cv"),
+ ("mmdet", "./external/det")
+ ]
+
+ for package_name, package_path in packages:
+ if not is_package_installed(package_name):
+ print(f"Installing {package_name}...")
+ install_from_wheel(package_name, package_path)
+ else:
+ print(f"{package_name} is already installed.")
+
+# Install the local mm-lab wheels at startup, before detector_utils imports mmcv/mmdet below
+install_local_packages()
+
+from detector_utils import (
+ adapt_mmdet_pipeline,
+ init_detector,
+ process_images_detector,
+ )
+
+class Config:
+ ASSETS_DIR = os.path.join(os.path.dirname(__file__), 'assets')
+ CHECKPOINTS_DIR = os.path.join(ASSETS_DIR, "checkpoints")
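+    # TorchScript (.pt2) exports of the Sapiens pose models, loaded with torch.jit.load below.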
+ CHECKPOINTS = {
+ "0.3b": "sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2",
+ "1b": "sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2",
+ }
+ DETECTION_CHECKPOINT = os.path.join(CHECKPOINTS_DIR, 'rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth')
+ DETECTION_CONFIG = os.path.join(ASSETS_DIR, 'rtmdet_m_640-8xb32_coco-person_no_nms.py')
+
+class ModelManager:
+ @staticmethod
+ def load_model(checkpoint_name: str):
+ if checkpoint_name is None:
+ return None
+ checkpoint_path = os.path.join(Config.CHECKPOINTS_DIR, checkpoint_name)
+ model = torch.jit.load(checkpoint_path)
+ model.eval()
+ model.to("cuda")
+ return model
+
+ @staticmethod
+ @torch.inference_mode()
+ def run_model(model, input_tensor):
+ return model(input_tensor)
+
+class ImageProcessor:
+ def __init__(self):
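+        # Model input is 1024x768; normalization uses ImageNet-style mean/std given in 0-255 (hence /255).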
+ self.transform = transforms.Compose([
+ transforms.Resize((1024, 768)),
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[123.5/255, 116.5/255, 103.5/255],
+ std=[58.5/255, 57.0/255, 57.5/255])
+ ])
+ self.detector = init_detector(
+ Config.DETECTION_CONFIG, Config.DETECTION_CHECKPOINT, device='cpu'
+ )
+ self.detector.cfg = adapt_mmdet_pipeline(self.detector.cfg)
+
+ def detect_persons(self, image: Image.Image):
+        # Convert the PIL image to a numpy array and add a batch dimension
+ image = np.array(image)
+ image = np.expand_dims(image, axis=0)
+
+ # Perform person detection
+ bboxes_batch = process_images_detector(
+ image,
+ self.detector
+ )
+ bboxes = self.get_person_bboxes(bboxes_batch[0]) # Get bboxes for the first (and only) image
+
+ return bboxes
+
+ def get_person_bboxes(self, bboxes_batch, score_thr=0.3):
+ person_bboxes = []
+ for bbox in bboxes_batch:
+ if len(bbox) == 5: # [x1, y1, x2, y2, score]
+ if bbox[4] > score_thr:
+ person_bboxes.append(bbox)
+ elif len(bbox) == 4: # [x1, y1, x2, y2]
+                person_bboxes.append(list(bbox) + [1.0])  # add a default score of 1.0 (list() avoids numpy element-wise addition)
+ return person_bboxes
+
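+    # spaces.GPU requests a GPU only for the duration of this call (Hugging Face ZeroGPU Spaces).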
+ @spaces.GPU
+ @torch.inference_mode()
+ def estimate_pose(self, image: Image.Image, bboxes: List[List[float]], model_name: str, kpt_threshold: float):
+ pose_model = ModelManager.load_model(Config.CHECKPOINTS[model_name])
+
+ result_image = image.copy()
+ all_keypoints = [] # List to store keypoints for all persons
+
+ for bbox in bboxes:
+ cropped_img = self.crop_image(result_image, bbox)
+ input_tensor = self.transform(cropped_img).unsqueeze(0).to("cuda")
+ heatmaps = ModelManager.run_model(pose_model, input_tensor)
+ keypoints = self.heatmaps_to_keypoints(heatmaps[0].cpu().numpy())
+ all_keypoints.append(keypoints) # Collect keypoints
+ result_image = self.draw_keypoints(result_image, keypoints, bbox, kpt_threshold)
+
+ return result_image, all_keypoints
+
+ def process_image(self, image: Image.Image, model_name: str, kpt_threshold: str):
+ bboxes = self.detect_persons(image)
+ result_image, keypoints = self.estimate_pose(image, bboxes, model_name, float(kpt_threshold))
+ return result_image, keypoints
+
+ def crop_image(self, image, bbox):
+ if len(bbox) == 4:
+ x1, y1, x2, y2 = map(int, bbox)
+ elif len(bbox) >= 5:
+ x1, y1, x2, y2, _ = map(int, bbox[:5])
+ else:
+ raise ValueError(f"Unexpected bbox format: {bbox}")
+
+ crop = image.crop((x1, y1, x2, y2))
+ return crop
+
+ @staticmethod
+ def heatmaps_to_keypoints(heatmaps):
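+        # One heatmap per keypoint: the argmax location gives (x, y) in heatmap
+        # coordinates and the peak value is used as the confidence score.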
+ num_joints = heatmaps.shape[0] # Should be 308
+ keypoints = {}
+ for i, name in enumerate(GOLIATH_KEYPOINTS):
+ if i < num_joints:
+ heatmap = heatmaps[i]
+ y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
+ conf = heatmap[y, x]
+ keypoints[name] = (float(x), float(y), float(conf))
+ return keypoints
+
+ @staticmethod
+ def draw_keypoints(image, keypoints, bbox, kpt_threshold):
+ image = np.array(image)
+
+ # Handle both 4 and 5-element bounding boxes
+ if len(bbox) == 4:
+ x1, y1, x2, y2 = map(int, bbox)
+ elif len(bbox) >= 5:
+ x1, y1, x2, y2, _ = map(int, bbox[:5])
+ else:
+ raise ValueError(f"Unexpected bbox format: {bbox}")
+
+ # Calculate adaptive radius and thickness based on bounding box size
+ bbox_width = x2 - x1
+ bbox_height = y2 - y1
+ bbox_size = np.sqrt(bbox_width * bbox_height)
+
+ radius = max(1, int(bbox_size * 0.006)) # minimum 1 pixel
+ thickness = max(1, int(bbox_size * 0.006)) # minimum 1 pixel
+ bbox_thickness = max(1, thickness//4)
+
+ cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), bbox_thickness)
+
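+        # Heatmap peaks live on a 256x192 (HxW) grid; scale them into the bbox frame before drawing.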
+ # Draw keypoints
+ for i, (name, (x, y, conf)) in enumerate(keypoints.items()):
+ if conf > kpt_threshold and i < len(GOLIATH_KPTS_COLORS):
+ x_coord = int(x * bbox_width / 192) + x1
+ y_coord = int(y * bbox_height / 256) + y1
+ color = GOLIATH_KPTS_COLORS[i]
+ cv2.circle(image, (x_coord, y_coord), radius, color, -1)
+
+ # Draw skeleton
+ for _, link_info in GOLIATH_SKELETON_INFO.items():
+ pt1_name, pt2_name = link_info['link']
+ color = link_info['color']
+
+ if pt1_name in keypoints and pt2_name in keypoints:
+ pt1 = keypoints[pt1_name]
+ pt2 = keypoints[pt2_name]
+ if pt1[2] > kpt_threshold and pt2[2] > kpt_threshold:
+ x1_coord = int(pt1[0] * bbox_width / 192) + x1
+ y1_coord = int(pt1[1] * bbox_height / 256) + y1
+ x2_coord = int(pt2[0] * bbox_width / 192) + x1
+ y2_coord = int(pt2[1] * bbox_height / 256) + y1
+ cv2.line(image, (x1_coord, y1_coord), (x2_coord, y2_coord), color, thickness=thickness)
+
+ return Image.fromarray(image)
+
+class GradioInterface:
+ def __init__(self):
+ self.image_processor = ImageProcessor()
+
+ def create_interface(self):
+ app_styles = """
+
+ """
+
+ header_html = f"""
+
+
+ {app_styles}
+
+ """
+
+ js_func = """
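+    // Force Gradio's dark theme by setting the __theme query parameter and reloading.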
+ function refresh() {
+ const url = new URL(window.location);
+ if (url.searchParams.get('__theme') !== 'dark') {
+ url.searchParams.set('__theme', 'dark');
+ window.location.href = url.href;
+ }
+ }
+ """
+
+ def process_image(image, model_name, kpt_threshold):
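+            # Run detection + pose, then dump the keypoints to a temporary JSON file for the gr.File output.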
+ result_image, keypoints = self.image_processor.process_image(image, model_name, kpt_threshold)
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w') as json_file:
+ json.dump(keypoints, json_file)
+ json_file_path = json_file.name
+ return result_image, json_file_path
+
+ with gr.Blocks(js=js_func, theme=gr.themes.Default()) as demo:
+ gr.HTML(header_html)
+ with gr.Row(elem_classes="content-container"):
+ with gr.Column():
+ input_image = gr.Image(label="Input Image", type="pil", format="png", elem_classes="image-preview")
+ with gr.Row():
+ model_name = gr.Dropdown(
+ label="Model Size",
+ choices=list(Config.CHECKPOINTS.keys()),
+ value="1b",
+ )
+ kpt_threshold = gr.Dropdown(
+ label="Min Keypoint Confidence",
+ choices=["0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9"],
+ value="0.3",
+ )
+ example_model = gr.Examples(
+ inputs=input_image,
+ examples_per_page=14,
+ examples=[
+ os.path.join(Config.ASSETS_DIR, "images", img)
+ for img in os.listdir(os.path.join(Config.ASSETS_DIR, "images"))
+ ],
+ )
+ with gr.Column():
+ result_image = gr.Image(label="Pose-308 Result", type="pil", elem_classes="image-preview")
+ json_output = gr.File(label="Pose-308 Output (.json)")
+ run_button = gr.Button("Run")
+
+ run_button.click(
+ fn=process_image,
+ inputs=[input_image, model_name, kpt_threshold],
+ outputs=[result_image, json_output],
+ )
+
+ return demo
+
+def main():
+ if torch.cuda.is_available():
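+        # Enable TF32 on Ampere+ GPUs for faster matmuls/convolutions at slightly reduced precision.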
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+ interface = GradioInterface()
+ demo = interface.create_interface()
+ demo.launch(share=False)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/assets/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth b/assets/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth
new file mode 100644
index 0000000000000000000000000000000000000000..35c573c1f7ea4c44ee9af0917710a21c3c38434a
--- /dev/null
+++ b/assets/checkpoints/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b66b27072c6a3cd4f093882df440921987076131fb78a7df7b1cf92d67f41509
+size 99149914
diff --git a/assets/checkpoints/sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2 b/assets/checkpoints/sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2
new file mode 100644
index 0000000000000000000000000000000000000000..7e320f245c82df4f06f26b4e8908f5e9090af4b5
--- /dev/null
+++ b/assets/checkpoints/sapiens_0.3b_goliath_best_goliath_AP_575_torchscript.pt2
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21cf7e3e723720d847bee6d3b321bfcdb33268c9f1418d7552552264ae0a5a9b
+size 1319579523
diff --git a/assets/checkpoints/sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2 b/assets/checkpoints/sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2
new file mode 100644
index 0000000000000000000000000000000000000000..3ea09bc91c4b6c449451fce529e91260af5ef198
--- /dev/null
+++ b/assets/checkpoints/sapiens_1b_goliath_best_goliath_AP_640_torchscript.pt2
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6218c6be17697157f9e65ee34054a94ab8ca0f637380fa5748c18e04814976e
+size 4677162331
diff --git a/assets/images/68204.png b/assets/images/68204.png
new file mode 100644
index 0000000000000000000000000000000000000000..6584b288fafd94166c2877b7e43a3f387016a434
--- /dev/null
+++ b/assets/images/68204.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b0268cb801ed164864a4b5f6d131e0ac5cc2fbd149a6467d5d0c97da47122c2
+size 4285020
diff --git a/assets/images/68210.png b/assets/images/68210.png
new file mode 100644
index 0000000000000000000000000000000000000000..a0c34954cd7483f373026408b083c8195d165489
--- /dev/null
+++ b/assets/images/68210.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbe5f80498af4ebd1ff09ae4184f37c20ba981e53bd554c3cc78d39ae0ee7fd7
+size 3933143
diff --git a/assets/images/68658.png b/assets/images/68658.png
new file mode 100644
index 0000000000000000000000000000000000000000..24dd9477f8cdb5d92d96db34a8932a0d24da334e
--- /dev/null
+++ b/assets/images/68658.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61a68b619bd17235e683324f2826ce0693322e45ab8c86f1c057851ecb333ac7
+size 5096267
diff --git a/assets/images/68666.png b/assets/images/68666.png
new file mode 100644
index 0000000000000000000000000000000000000000..95e7ae11dc90d22afc15fa3b41cbfc60ac4cda91
--- /dev/null
+++ b/assets/images/68666.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea3047e6c2ccb485fdb3966aa2325e803cbf49c27c0bff00287b44bc16f18914
+size 4562681
diff --git a/assets/images/68691.png b/assets/images/68691.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c688716c962b891073e1feea115a7838f72fcba
--- /dev/null
+++ b/assets/images/68691.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fae39e4055c1b297af7068cdddfeeba8d685363281b839d8c5afac1980204b57
+size 3736765
diff --git a/assets/images/68956.png b/assets/images/68956.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8a83b85cdb8d999f65677278a28deaa08352a57
--- /dev/null
+++ b/assets/images/68956.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee1f27082b10999d0fa848121ecb06cda3386b1a864b9aa0f59ae78261f8908
+size 4147008
diff --git a/assets/images/pexels-amresh444-17315601.png b/assets/images/pexels-amresh444-17315601.png
new file mode 100644
index 0000000000000000000000000000000000000000..8453dc7bb9885c43733d798c46c877779cc8ba15
--- /dev/null
+++ b/assets/images/pexels-amresh444-17315601.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e17ee1b229147e4b52e8348a6ef426bc9e9a2f90738e776e15b26b325abb9b3
+size 3503065
diff --git a/assets/images/pexels-gabby-k-6311686.png b/assets/images/pexels-gabby-k-6311686.png
new file mode 100644
index 0000000000000000000000000000000000000000..9add365bb9485f5085155dbbbab2232a0b533449
--- /dev/null
+++ b/assets/images/pexels-gabby-k-6311686.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f10eded3fb05ab04b963f7b9fd2e183d8d4e81b20569b1c6b0653549639421f
+size 3651731
diff --git a/assets/images/pexels-julia-m-cameron-4145040.png b/assets/images/pexels-julia-m-cameron-4145040.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff67ab842aabe6ebb6c0d5c8630c2c081c7a40ba
--- /dev/null
+++ b/assets/images/pexels-julia-m-cameron-4145040.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:459cf0280667b028ffbca16aa11188780d7a0205c0defec02916ff3cbaeecb72
+size 2924608
diff --git a/assets/images/pexels-marcus-aurelius-6787357.png b/assets/images/pexels-marcus-aurelius-6787357.png
new file mode 100644
index 0000000000000000000000000000000000000000..c48247aeacbdb3e0e81b2a0dd1376bf26b687817
--- /dev/null
+++ b/assets/images/pexels-marcus-aurelius-6787357.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d35452f76492125eaf7d5783aa9fd6b0d5990ebe0579fe9dfd58a9d634f4955
+size 3297473
diff --git a/assets/images/pexels-mo-saeed-3616599-5409085.png b/assets/images/pexels-mo-saeed-3616599-5409085.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac7017af6d97524a95a6e43940f95ce10193cc9f
--- /dev/null
+++ b/assets/images/pexels-mo-saeed-3616599-5409085.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c1ca7afd6c2a654e94ef59d5fb56fca4f3cde5fb5216f6b218c34a7b8c143dc
+size 3125143
diff --git a/assets/images/pexels-riedelmax-27355495.png b/assets/images/pexels-riedelmax-27355495.png
new file mode 100644
index 0000000000000000000000000000000000000000..20a059e38001957319449e3c1892b3a9bae0ab94
--- /dev/null
+++ b/assets/images/pexels-riedelmax-27355495.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4141d2f5f718f162ea1f6710c06b28b5cb51fd69598fde35948f8f3491228164
+size 3732680
diff --git a/assets/images/pexels-sergeymakashin-5368660.png b/assets/images/pexels-sergeymakashin-5368660.png
new file mode 100644
index 0000000000000000000000000000000000000000..5b0d1554db9cdcaec20efba4c0628a0ab55867f4
--- /dev/null
+++ b/assets/images/pexels-sergeymakashin-5368660.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af8f5a8f26dd102d87d94c1be36ec903791fe8e6d951c68ebb9ebcfc6d7397bb
+size 4075879
diff --git a/assets/images/pexels-vinicius-wiesehofer-289347-4219918.png b/assets/images/pexels-vinicius-wiesehofer-289347-4219918.png
new file mode 100644
index 0000000000000000000000000000000000000000..95aa28be407ccc9b63706bdfb961f99a67319dad
--- /dev/null
+++ b/assets/images/pexels-vinicius-wiesehofer-289347-4219918.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6eef5eee15b81fe65ea95627e9a46040b9889466689b3c1ca6ed273e02fe84f
+size 3627053
diff --git a/assets/rtmdet_m_640-8xb32_coco-person_no_nms.py b/assets/rtmdet_m_640-8xb32_coco-person_no_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..59b2b465444f5162e3bd859f63608146ba9437a4
--- /dev/null
+++ b/assets/rtmdet_m_640-8xb32_coco-person_no_nms.py
@@ -0,0 +1,20 @@
+_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py'
+
+checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa
+
+model = dict(
+ backbone=dict(
+ init_cfg=dict(
+ type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
+ bbox_head=dict(num_classes=1),
+ test_cfg=dict(
+ nms_pre=1000,
+ min_bbox_size=0,
+ score_thr=0.05,
+ nms=None,
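+        # nms=None disables in-head NMS (the "no_nms" in this config's name);
+        # boxes are expected to be filtered downstream (see detector_utils.nms).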
+ max_per_img=100))
+
+train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', ))))
+
+val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', ))))
+test_dataloader = val_dataloader
\ No newline at end of file
diff --git a/build_wheel.py b/build_wheel.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d4520ce8c84e4d0923db97c9743ad2f3e187483
--- /dev/null
+++ b/build_wheel.py
@@ -0,0 +1,26 @@
+import os
+import subprocess
+import sys
+
+def build_wheel(package_path):
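+    # Build inside the package directory; the finally-block restores the original cwd even if the build fails.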
+ current_dir = os.getcwd()
+ os.chdir(package_path)
+ try:
+ subprocess.check_call([sys.executable, "setup.py", "bdist_wheel"])
+ finally:
+ os.chdir(current_dir)
+
+def main():
+ packages = [
+ "./external/engine",
+ "./external/cv",
+ "./external/det"
+ ]
+
+ for package in packages:
+ print(f"Building wheel for {package}...")
+ build_wheel(package)
+ print(f"Wheel built for {package}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/classes_and_palettes.py b/classes_and_palettes.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff650bb7e0f738f4b453590554c9f995feb8235c
--- /dev/null
+++ b/classes_and_palettes.py
@@ -0,0 +1,1024 @@
+COCO_KPTS_COLORS = [
+ [51, 153, 255], # 0: nose
+ [51, 153, 255], # 1: left_eye
+ [51, 153, 255], # 2: right_eye
+ [51, 153, 255], # 3: left_ear
+ [51, 153, 255], # 4: right_ear
+ [0, 255, 0], # 5: left_shoulder
+ [255, 128, 0], # 6: right_shoulder
+ [0, 255, 0], # 7: left_elbow
+ [255, 128, 0], # 8: right_elbow
+ [0, 255, 0], # 9: left_wrist
+ [255, 128, 0], # 10: right_wrist
+ [0, 255, 0], # 11: left_hip
+ [255, 128, 0], # 12: right_hip
+ [0, 255, 0], # 13: left_knee
+ [255, 128, 0], # 14: right_knee
+ [0, 255, 0], # 15: left_ankle
+ [255, 128, 0], # 16: right_ankle
+]
+
+COCO_WHOLEBODY_KPTS_COLORS = [
+ [51, 153, 255], # 0: nose
+ [51, 153, 255], # 1: left_eye
+ [51, 153, 255], # 2: right_eye
+ [51, 153, 255], # 3: left_ear
+ [51, 153, 255], # 4: right_ear
+ [0, 255, 0], # 5: left_shoulder
+ [255, 128, 0], # 6: right_shoulder
+ [0, 255, 0], # 7: left_elbow
+ [255, 128, 0], # 8: right_elbow
+ [0, 255, 0], # 9: left_wrist
+ [255, 128, 0], # 10: right_wrist
+ [0, 255, 0], # 11: left_hip
+ [255, 128, 0], # 12: right_hip
+ [0, 255, 0], # 13: left_knee
+ [255, 128, 0], # 14: right_knee
+ [0, 255, 0], # 15: left_ankle
+ [255, 128, 0], # 16: right_ankle
+ [255, 128, 0], # 17: left_big_toe
+ [255, 128, 0], # 18: left_small_toe
+ [255, 128, 0], # 19: left_heel
+ [255, 128, 0], # 20: right_big_toe
+ [255, 128, 0], # 21: right_small_toe
+ [255, 128, 0], # 22: right_heel
+ [255, 255, 255], # 23: face-0
+ [255, 255, 255], # 24: face-1
+ [255, 255, 255], # 25: face-2
+ [255, 255, 255], # 26: face-3
+ [255, 255, 255], # 27: face-4
+ [255, 255, 255], # 28: face-5
+ [255, 255, 255], # 29: face-6
+ [255, 255, 255], # 30: face-7
+ [255, 255, 255], # 31: face-8
+ [255, 255, 255], # 32: face-9
+ [255, 255, 255], # 33: face-10
+ [255, 255, 255], # 34: face-11
+ [255, 255, 255], # 35: face-12
+ [255, 255, 255], # 36: face-13
+ [255, 255, 255], # 37: face-14
+ [255, 255, 255], # 38: face-15
+ [255, 255, 255], # 39: face-16
+ [255, 255, 255], # 40: face-17
+ [255, 255, 255], # 41: face-18
+ [255, 255, 255], # 42: face-19
+ [255, 255, 255], # 43: face-20
+ [255, 255, 255], # 44: face-21
+ [255, 255, 255], # 45: face-22
+ [255, 255, 255], # 46: face-23
+ [255, 255, 255], # 47: face-24
+ [255, 255, 255], # 48: face-25
+ [255, 255, 255], # 49: face-26
+ [255, 255, 255], # 50: face-27
+ [255, 255, 255], # 51: face-28
+ [255, 255, 255], # 52: face-29
+ [255, 255, 255], # 53: face-30
+ [255, 255, 255], # 54: face-31
+ [255, 255, 255], # 55: face-32
+ [255, 255, 255], # 56: face-33
+ [255, 255, 255], # 57: face-34
+ [255, 255, 255], # 58: face-35
+ [255, 255, 255], # 59: face-36
+ [255, 255, 255], # 60: face-37
+ [255, 255, 255], # 61: face-38
+ [255, 255, 255], # 62: face-39
+ [255, 255, 255], # 63: face-40
+ [255, 255, 255], # 64: face-41
+ [255, 255, 255], # 65: face-42
+ [255, 255, 255], # 66: face-43
+ [255, 255, 255], # 67: face-44
+ [255, 255, 255], # 68: face-45
+ [255, 255, 255], # 69: face-46
+ [255, 255, 255], # 70: face-47
+ [255, 255, 255], # 71: face-48
+ [255, 255, 255], # 72: face-49
+ [255, 255, 255], # 73: face-50
+ [255, 255, 255], # 74: face-51
+ [255, 255, 255], # 75: face-52
+ [255, 255, 255], # 76: face-53
+ [255, 255, 255], # 77: face-54
+ [255, 255, 255], # 78: face-55
+ [255, 255, 255], # 79: face-56
+ [255, 255, 255], # 80: face-57
+ [255, 255, 255], # 81: face-58
+ [255, 255, 255], # 82: face-59
+ [255, 255, 255], # 83: face-60
+ [255, 255, 255], # 84: face-61
+ [255, 255, 255], # 85: face-62
+ [255, 255, 255], # 86: face-63
+ [255, 255, 255], # 87: face-64
+ [255, 255, 255], # 88: face-65
+ [255, 255, 255], # 89: face-66
+ [255, 255, 255], # 90: face-67
+ [255, 255, 255], # 91: left_hand_root
+ [255, 128, 0], # 92: left_thumb1
+ [255, 128, 0], # 93: left_thumb2
+ [255, 128, 0], # 94: left_thumb3
+ [255, 128, 0], # 95: left_thumb4
+ [255, 153, 255], # 96: left_forefinger1
+ [255, 153, 255], # 97: left_forefinger2
+ [255, 153, 255], # 98: left_forefinger3
+ [255, 153, 255], # 99: left_forefinger4
+ [102, 178, 255], # 100: left_middle_finger1
+ [102, 178, 255], # 101: left_middle_finger2
+ [102, 178, 255], # 102: left_middle_finger3
+ [102, 178, 255], # 103: left_middle_finger4
+ [255, 51, 51], # 104: left_ring_finger1
+ [255, 51, 51], # 105: left_ring_finger2
+ [255, 51, 51], # 106: left_ring_finger3
+ [255, 51, 51], # 107: left_ring_finger4
+ [0, 255, 0], # 108: left_pinky_finger1
+ [0, 255, 0], # 109: left_pinky_finger2
+ [0, 255, 0], # 110: left_pinky_finger3
+ [0, 255, 0], # 111: left_pinky_finger4
+ [255, 255, 255], # 112: right_hand_root
+ [255, 128, 0], # 113: right_thumb1
+ [255, 128, 0], # 114: right_thumb2
+ [255, 128, 0], # 115: right_thumb3
+ [255, 128, 0], # 116: right_thumb4
+ [255, 153, 255], # 117: right_forefinger1
+ [255, 153, 255], # 118: right_forefinger2
+ [255, 153, 255], # 119: right_forefinger3
+ [255, 153, 255], # 120: right_forefinger4
+ [102, 178, 255], # 121: right_middle_finger1
+ [102, 178, 255], # 122: right_middle_finger2
+ [102, 178, 255], # 123: right_middle_finger3
+ [102, 178, 255], # 124: right_middle_finger4
+ [255, 51, 51], # 125: right_ring_finger1
+ [255, 51, 51], # 126: right_ring_finger2
+ [255, 51, 51], # 127: right_ring_finger3
+ [255, 51, 51], # 128: right_ring_finger4
+ [0, 255, 0], # 129: right_pinky_finger1
+ [0, 255, 0], # 130: right_pinky_finger2
+ [0, 255, 0], # 131: right_pinky_finger3
+ [0, 255, 0], # 132: right_pinky_finger4
+]
+
+
+GOLIATH_KPTS_COLORS = [
+ [51, 153, 255], # 0: nose
+ [51, 153, 255], # 1: left_eye
+ [51, 153, 255], # 2: right_eye
+ [51, 153, 255], # 3: left_ear
+ [51, 153, 255], # 4: right_ear
+ [51, 153, 255], # 5: left_shoulder
+ [51, 153, 255], # 6: right_shoulder
+ [51, 153, 255], # 7: left_elbow
+ [51, 153, 255], # 8: right_elbow
+ [51, 153, 255], # 9: left_hip
+ [51, 153, 255], # 10: right_hip
+ [51, 153, 255], # 11: left_knee
+ [51, 153, 255], # 12: right_knee
+ [51, 153, 255], # 13: left_ankle
+ [51, 153, 255], # 14: right_ankle
+ [51, 153, 255], # 15: left_big_toe
+ [51, 153, 255], # 16: left_small_toe
+ [51, 153, 255], # 17: left_heel
+ [51, 153, 255], # 18: right_big_toe
+ [51, 153, 255], # 19: right_small_toe
+ [51, 153, 255], # 20: right_heel
+ [51, 153, 255], # 21: right_thumb4
+ [51, 153, 255], # 22: right_thumb3
+ [51, 153, 255], # 23: right_thumb2
+ [51, 153, 255], # 24: right_thumb_third_joint
+ [51, 153, 255], # 25: right_forefinger4
+ [51, 153, 255], # 26: right_forefinger3
+ [51, 153, 255], # 27: right_forefinger2
+ [51, 153, 255], # 28: right_forefinger_third_joint
+ [51, 153, 255], # 29: right_middle_finger4
+ [51, 153, 255], # 30: right_middle_finger3
+ [51, 153, 255], # 31: right_middle_finger2
+ [51, 153, 255], # 32: right_middle_finger_third_joint
+ [51, 153, 255], # 33: right_ring_finger4
+ [51, 153, 255], # 34: right_ring_finger3
+ [51, 153, 255], # 35: right_ring_finger2
+ [51, 153, 255], # 36: right_ring_finger_third_joint
+ [51, 153, 255], # 37: right_pinky_finger4
+ [51, 153, 255], # 38: right_pinky_finger3
+ [51, 153, 255], # 39: right_pinky_finger2
+ [51, 153, 255], # 40: right_pinky_finger_third_joint
+ [51, 153, 255], # 41: right_wrist
+ [51, 153, 255], # 42: left_thumb4
+ [51, 153, 255], # 43: left_thumb3
+ [51, 153, 255], # 44: left_thumb2
+ [51, 153, 255], # 45: left_thumb_third_joint
+ [51, 153, 255], # 46: left_forefinger4
+ [51, 153, 255], # 47: left_forefinger3
+ [51, 153, 255], # 48: left_forefinger2
+ [51, 153, 255], # 49: left_forefinger_third_joint
+ [51, 153, 255], # 50: left_middle_finger4
+ [51, 153, 255], # 51: left_middle_finger3
+ [51, 153, 255], # 52: left_middle_finger2
+ [51, 153, 255], # 53: left_middle_finger_third_joint
+ [51, 153, 255], # 54: left_ring_finger4
+ [51, 153, 255], # 55: left_ring_finger3
+ [51, 153, 255], # 56: left_ring_finger2
+ [51, 153, 255], # 57: left_ring_finger_third_joint
+ [51, 153, 255], # 58: left_pinky_finger4
+ [51, 153, 255], # 59: left_pinky_finger3
+ [51, 153, 255], # 60: left_pinky_finger2
+ [51, 153, 255], # 61: left_pinky_finger_third_joint
+ [51, 153, 255], # 62: left_wrist
+ [51, 153, 255], # 63: left_olecranon
+ [51, 153, 255], # 64: right_olecranon
+ [51, 153, 255], # 65: left_cubital_fossa
+ [51, 153, 255], # 66: right_cubital_fossa
+ [51, 153, 255], # 67: left_acromion
+ [51, 153, 255], # 68: right_acromion
+ [51, 153, 255], # 69: neck
+ [255, 255, 255], # 70: center_of_glabella
+ [255, 255, 255], # 71: center_of_nose_root
+ [255, 255, 255], # 72: tip_of_nose_bridge
+ [255, 255, 255], # 73: midpoint_1_of_nose_bridge
+ [255, 255, 255], # 74: midpoint_2_of_nose_bridge
+ [255, 255, 255], # 75: midpoint_3_of_nose_bridge
+ [255, 255, 255], # 76: center_of_labiomental_groove
+ [255, 255, 255], # 77: tip_of_chin
+ [255, 255, 255], # 78: upper_startpoint_of_r_eyebrow
+ [255, 255, 255], # 79: lower_startpoint_of_r_eyebrow
+ [255, 255, 255], # 80: end_of_r_eyebrow
+ [255, 255, 255], # 81: upper_midpoint_1_of_r_eyebrow
+ [255, 255, 255], # 82: lower_midpoint_1_of_r_eyebrow
+ [255, 255, 255], # 83: upper_midpoint_2_of_r_eyebrow
+ [255, 255, 255], # 84: upper_midpoint_3_of_r_eyebrow
+ [255, 255, 255], # 85: lower_midpoint_2_of_r_eyebrow
+ [255, 255, 255], # 86: lower_midpoint_3_of_r_eyebrow
+ [255, 255, 255], # 87: upper_startpoint_of_l_eyebrow
+ [255, 255, 255], # 88: lower_startpoint_of_l_eyebrow
+ [255, 255, 255], # 89: end_of_l_eyebrow
+ [255, 255, 255], # 90: upper_midpoint_1_of_l_eyebrow
+ [255, 255, 255], # 91: lower_midpoint_1_of_l_eyebrow
+ [255, 255, 255], # 92: upper_midpoint_2_of_l_eyebrow
+ [255, 255, 255], # 93: upper_midpoint_3_of_l_eyebrow
+ [255, 255, 255], # 94: lower_midpoint_2_of_l_eyebrow
+ [255, 255, 255], # 95: lower_midpoint_3_of_l_eyebrow
+ [192, 64, 128], # 96: l_inner_end_of_upper_lash_line
+ [192, 64, 128], # 97: l_outer_end_of_upper_lash_line
+ [192, 64, 128], # 98: l_centerpoint_of_upper_lash_line
+ [192, 64, 128], # 99: l_midpoint_2_of_upper_lash_line
+ [192, 64, 128], # 100: l_midpoint_1_of_upper_lash_line
+ [192, 64, 128], # 101: l_midpoint_6_of_upper_lash_line
+ [192, 64, 128], # 102: l_midpoint_5_of_upper_lash_line
+ [192, 64, 128], # 103: l_midpoint_4_of_upper_lash_line
+ [192, 64, 128], # 104: l_midpoint_3_of_upper_lash_line
+ [192, 64, 128], # 105: l_outer_end_of_upper_eyelid_line
+ [192, 64, 128], # 106: l_midpoint_6_of_upper_eyelid_line
+ [192, 64, 128], # 107: l_midpoint_2_of_upper_eyelid_line
+ [192, 64, 128], # 108: l_midpoint_5_of_upper_eyelid_line
+ [192, 64, 128], # 109: l_centerpoint_of_upper_eyelid_line
+ [192, 64, 128], # 110: l_midpoint_4_of_upper_eyelid_line
+ [192, 64, 128], # 111: l_midpoint_1_of_upper_eyelid_line
+ [192, 64, 128], # 112: l_midpoint_3_of_upper_eyelid_line
+ [192, 64, 128], # 113: l_midpoint_6_of_upper_crease_line
+ [192, 64, 128], # 114: l_midpoint_2_of_upper_crease_line
+ [192, 64, 128], # 115: l_midpoint_5_of_upper_crease_line
+ [192, 64, 128], # 116: l_centerpoint_of_upper_crease_line
+ [192, 64, 128], # 117: l_midpoint_4_of_upper_crease_line
+ [192, 64, 128], # 118: l_midpoint_1_of_upper_crease_line
+ [192, 64, 128], # 119: l_midpoint_3_of_upper_crease_line
+ [64, 32, 192], # 120: r_inner_end_of_upper_lash_line
+ [64, 32, 192], # 121: r_outer_end_of_upper_lash_line
+ [64, 32, 192], # 122: r_centerpoint_of_upper_lash_line
+ [64, 32, 192], # 123: r_midpoint_1_of_upper_lash_line
+ [64, 32, 192], # 124: r_midpoint_2_of_upper_lash_line
+ [64, 32, 192], # 125: r_midpoint_3_of_upper_lash_line
+ [64, 32, 192], # 126: r_midpoint_4_of_upper_lash_line
+ [64, 32, 192], # 127: r_midpoint_5_of_upper_lash_line
+ [64, 32, 192], # 128: r_midpoint_6_of_upper_lash_line
+ [64, 32, 192], # 129: r_outer_end_of_upper_eyelid_line
+ [64, 32, 192], # 130: r_midpoint_3_of_upper_eyelid_line
+ [64, 32, 192], # 131: r_midpoint_1_of_upper_eyelid_line
+ [64, 32, 192], # 132: r_midpoint_4_of_upper_eyelid_line
+ [64, 32, 192], # 133: r_centerpoint_of_upper_eyelid_line
+ [64, 32, 192], # 134: r_midpoint_5_of_upper_eyelid_line
+ [64, 32, 192], # 135: r_midpoint_2_of_upper_eyelid_line
+ [64, 32, 192], # 136: r_midpoint_6_of_upper_eyelid_line
+ [64, 32, 192], # 137: r_midpoint_3_of_upper_crease_line
+ [64, 32, 192], # 138: r_midpoint_1_of_upper_crease_line
+ [64, 32, 192], # 139: r_midpoint_4_of_upper_crease_line
+ [64, 32, 192], # 140: r_centerpoint_of_upper_crease_line
+ [64, 32, 192], # 141: r_midpoint_5_of_upper_crease_line
+ [64, 32, 192], # 142: r_midpoint_2_of_upper_crease_line
+ [64, 32, 192], # 143: r_midpoint_6_of_upper_crease_line
+ [64, 192, 128], # 144: l_inner_end_of_lower_lash_line
+ [64, 192, 128], # 145: l_outer_end_of_lower_lash_line
+ [64, 192, 128], # 146: l_centerpoint_of_lower_lash_line
+ [64, 192, 128], # 147: l_midpoint_2_of_lower_lash_line
+ [64, 192, 128], # 148: l_midpoint_1_of_lower_lash_line
+ [64, 192, 128], # 149: l_midpoint_6_of_lower_lash_line
+ [64, 192, 128], # 150: l_midpoint_5_of_lower_lash_line
+ [64, 192, 128], # 151: l_midpoint_4_of_lower_lash_line
+ [64, 192, 128], # 152: l_midpoint_3_of_lower_lash_line
+ [64, 192, 128], # 153: l_outer_end_of_lower_eyelid_line
+ [64, 192, 128], # 154: l_midpoint_6_of_lower_eyelid_line
+ [64, 192, 128], # 155: l_midpoint_2_of_lower_eyelid_line
+ [64, 192, 128], # 156: l_midpoint_5_of_lower_eyelid_line
+ [64, 192, 128], # 157: l_centerpoint_of_lower_eyelid_line
+ [64, 192, 128], # 158: l_midpoint_4_of_lower_eyelid_line
+ [64, 192, 128], # 159: l_midpoint_1_of_lower_eyelid_line
+ [64, 192, 128], # 160: l_midpoint_3_of_lower_eyelid_line
+ [64, 192, 32], # 161: r_inner_end_of_lower_lash_line
+ [64, 192, 32], # 162: r_outer_end_of_lower_lash_line
+ [64, 192, 32], # 163: r_centerpoint_of_lower_lash_line
+ [64, 192, 32], # 164: r_midpoint_1_of_lower_lash_line
+ [64, 192, 32], # 165: r_midpoint_2_of_lower_lash_line
+ [64, 192, 32], # 166: r_midpoint_3_of_lower_lash_line
+ [64, 192, 32], # 167: r_midpoint_4_of_lower_lash_line
+ [64, 192, 32], # 168: r_midpoint_5_of_lower_lash_line
+ [64, 192, 32], # 169: r_midpoint_6_of_lower_lash_line
+ [64, 192, 32], # 170: r_outer_end_of_lower_eyelid_line
+ [64, 192, 32], # 171: r_midpoint_3_of_lower_eyelid_line
+ [64, 192, 32], # 172: r_midpoint_1_of_lower_eyelid_line
+ [64, 192, 32], # 173: r_midpoint_4_of_lower_eyelid_line
+ [64, 192, 32], # 174: r_centerpoint_of_lower_eyelid_line
+ [64, 192, 32], # 175: r_midpoint_5_of_lower_eyelid_line
+ [64, 192, 32], # 176: r_midpoint_2_of_lower_eyelid_line
+ [64, 192, 32], # 177: r_midpoint_6_of_lower_eyelid_line
+ [0, 192, 0], # 178: tip_of_nose
+ [0, 192, 0], # 179: bottom_center_of_nose
+ [0, 192, 0], # 180: r_outer_corner_of_nose
+ [0, 192, 0], # 181: l_outer_corner_of_nose
+ [0, 192, 0], # 182: inner_corner_of_r_nostril
+ [0, 192, 0], # 183: outer_corner_of_r_nostril
+ [0, 192, 0], # 184: upper_corner_of_r_nostril
+ [0, 192, 0], # 185: inner_corner_of_l_nostril
+ [0, 192, 0], # 186: outer_corner_of_l_nostril
+ [0, 192, 0], # 187: upper_corner_of_l_nostril
+ [192, 0, 0], # 188: r_outer_corner_of_mouth
+ [192, 0, 0], # 189: l_outer_corner_of_mouth
+ [192, 0, 0], # 190: center_of_cupid_bow
+ [192, 0, 0], # 191: center_of_lower_outer_lip
+ [192, 0, 0], # 192: midpoint_1_of_upper_outer_lip
+ [192, 0, 0], # 193: midpoint_2_of_upper_outer_lip
+ [192, 0, 0], # 194: midpoint_1_of_lower_outer_lip
+ [192, 0, 0], # 195: midpoint_2_of_lower_outer_lip
+ [192, 0, 0], # 196: midpoint_3_of_upper_outer_lip
+ [192, 0, 0], # 197: midpoint_4_of_upper_outer_lip
+ [192, 0, 0], # 198: midpoint_5_of_upper_outer_lip
+ [192, 0, 0], # 199: midpoint_6_of_upper_outer_lip
+ [192, 0, 0], # 200: midpoint_3_of_lower_outer_lip
+ [192, 0, 0], # 201: midpoint_4_of_lower_outer_lip
+ [192, 0, 0], # 202: midpoint_5_of_lower_outer_lip
+ [192, 0, 0], # 203: midpoint_6_of_lower_outer_lip
+ [0, 192, 192], # 204: r_inner_corner_of_mouth
+ [0, 192, 192], # 205: l_inner_corner_of_mouth
+ [0, 192, 192], # 206: center_of_upper_inner_lip
+ [0, 192, 192], # 207: center_of_lower_inner_lip
+ [0, 192, 192], # 208: midpoint_1_of_upper_inner_lip
+ [0, 192, 192], # 209: midpoint_2_of_upper_inner_lip
+ [0, 192, 192], # 210: midpoint_1_of_lower_inner_lip
+ [0, 192, 192], # 211: midpoint_2_of_lower_inner_lip
+ [0, 192, 192], # 212: midpoint_3_of_upper_inner_lip
+ [0, 192, 192], # 213: midpoint_4_of_upper_inner_lip
+ [0, 192, 192], # 214: midpoint_5_of_upper_inner_lip
+ [0, 192, 192], # 215: midpoint_6_of_upper_inner_lip
+ [0, 192, 192], # 216: midpoint_3_of_lower_inner_lip
+ [0, 192, 192], # 217: midpoint_4_of_lower_inner_lip
+ [0, 192, 192], # 218: midpoint_5_of_lower_inner_lip
+    [0, 192, 192],  # 219: midpoint_6_of_lower_inner_lip (teeth keypoints 220-255 removed)
+ [200, 200, 0], # 256: l_top_end_of_inferior_crus
+ [200, 200, 0], # 257: l_top_end_of_superior_crus
+ [200, 200, 0], # 258: l_start_of_antihelix
+ [200, 200, 0], # 259: l_end_of_antihelix
+ [200, 200, 0], # 260: l_midpoint_1_of_antihelix
+ [200, 200, 0], # 261: l_midpoint_1_of_inferior_crus
+ [200, 200, 0], # 262: l_midpoint_2_of_antihelix
+ [200, 200, 0], # 263: l_midpoint_3_of_antihelix
+ [200, 200, 0], # 264: l_point_1_of_inner_helix
+ [200, 200, 0], # 265: l_point_2_of_inner_helix
+ [200, 200, 0], # 266: l_point_3_of_inner_helix
+ [200, 200, 0], # 267: l_point_4_of_inner_helix
+ [200, 200, 0], # 268: l_point_5_of_inner_helix
+ [200, 200, 0], # 269: l_point_6_of_inner_helix
+ [200, 200, 0], # 270: l_point_7_of_inner_helix
+ [200, 200, 0], # 271: l_highest_point_of_antitragus
+ [200, 200, 0], # 272: l_bottom_point_of_tragus
+ [200, 200, 0], # 273: l_protruding_point_of_tragus
+ [200, 200, 0], # 274: l_top_point_of_tragus
+ [200, 200, 0], # 275: l_start_point_of_crus_of_helix
+ [200, 200, 0], # 276: l_deepest_point_of_concha
+ [200, 200, 0], # 277: l_tip_of_ear_lobe
+ [200, 200, 0], # 278: l_midpoint_between_22_15
+ [200, 200, 0], # 279: l_bottom_connecting_point_of_ear_lobe
+ [200, 200, 0], # 280: l_top_connecting_point_of_helix
+ [200, 200, 0], # 281: l_point_8_of_inner_helix
+ [0, 200, 200], # 282: r_top_end_of_inferior_crus
+ [0, 200, 200], # 283: r_top_end_of_superior_crus
+ [0, 200, 200], # 284: r_start_of_antihelix
+ [0, 200, 200], # 285: r_end_of_antihelix
+ [0, 200, 200], # 286: r_midpoint_1_of_antihelix
+ [0, 200, 200], # 287: r_midpoint_1_of_inferior_crus
+ [0, 200, 200], # 288: r_midpoint_2_of_antihelix
+ [0, 200, 200], # 289: r_midpoint_3_of_antihelix
+ [0, 200, 200], # 290: r_point_1_of_inner_helix
+ [0, 200, 200], # 291: r_point_8_of_inner_helix
+ [0, 200, 200], # 292: r_point_3_of_inner_helix
+ [0, 200, 200], # 293: r_point_4_of_inner_helix
+ [0, 200, 200], # 294: r_point_5_of_inner_helix
+ [0, 200, 200], # 295: r_point_6_of_inner_helix
+ [0, 200, 200], # 296: r_point_7_of_inner_helix
+ [0, 200, 200], # 297: r_highest_point_of_antitragus
+ [0, 200, 200], # 298: r_bottom_point_of_tragus
+ [0, 200, 200], # 299: r_protruding_point_of_tragus
+ [0, 200, 200], # 300: r_top_point_of_tragus
+ [0, 200, 200], # 301: r_start_point_of_crus_of_helix
+ [0, 200, 200], # 302: r_deepest_point_of_concha
+ [0, 200, 200], # 303: r_tip_of_ear_lobe
+ [0, 200, 200], # 304: r_midpoint_between_22_15
+ [0, 200, 200], # 305: r_bottom_connecting_point_of_ear_lobe
+ [0, 200, 200], # 306: r_top_connecting_point_of_helix
+ [0, 200, 200], # 307: r_point_2_of_inner_helix
+ [128, 192, 64], # 308: l_center_of_iris
+ [128, 192, 64], # 309: l_border_of_iris_3
+ [128, 192, 64], # 310: l_border_of_iris_midpoint_1
+ [128, 192, 64], # 311: l_border_of_iris_12
+ [128, 192, 64], # 312: l_border_of_iris_midpoint_4
+ [128, 192, 64], # 313: l_border_of_iris_9
+ [128, 192, 64], # 314: l_border_of_iris_midpoint_3
+ [128, 192, 64], # 315: l_border_of_iris_6
+ [128, 192, 64], # 316: l_border_of_iris_midpoint_2
+ [192, 32, 64], # 317: r_center_of_iris
+ [192, 32, 64], # 318: r_border_of_iris_3
+ [192, 32, 64], # 319: r_border_of_iris_midpoint_1
+ [192, 32, 64], # 320: r_border_of_iris_12
+ [192, 32, 64], # 321: r_border_of_iris_midpoint_4
+ [192, 32, 64], # 322: r_border_of_iris_9
+ [192, 32, 64], # 323: r_border_of_iris_midpoint_3
+ [192, 32, 64], # 324: r_border_of_iris_6
+ [192, 32, 64], # 325: r_border_of_iris_midpoint_2
+ [192, 128, 64], # 326: l_center_of_pupil
+ [192, 128, 64], # 327: l_border_of_pupil_3
+ [192, 128, 64], # 328: l_border_of_pupil_midpoint_1
+ [192, 128, 64], # 329: l_border_of_pupil_12
+ [192, 128, 64], # 330: l_border_of_pupil_midpoint_4
+ [192, 128, 64], # 331: l_border_of_pupil_9
+ [192, 128, 64], # 332: l_border_of_pupil_midpoint_3
+ [192, 128, 64], # 333: l_border_of_pupil_6
+ [192, 128, 64], # 334: l_border_of_pupil_midpoint_2
+ [32, 192, 192], # 335: r_center_of_pupil
+ [32, 192, 192], # 336: r_border_of_pupil_3
+ [32, 192, 192], # 337: r_border_of_pupil_midpoint_1
+ [32, 192, 192], # 338: r_border_of_pupil_12
+ [32, 192, 192], # 339: r_border_of_pupil_midpoint_4
+ [32, 192, 192], # 340: r_border_of_pupil_9
+ [32, 192, 192], # 341: r_border_of_pupil_midpoint_3
+ [32, 192, 192], # 342: r_border_of_pupil_6
+ [32, 192, 192], # 343: r_border_of_pupil_midpoint_2
+]
+
+GOLIATH_KEYPOINTS = [
+ "nose",
+ "left_eye",
+ "right_eye",
+ "left_ear",
+ "right_ear",
+ "left_shoulder",
+ "right_shoulder",
+ "left_elbow",
+ "right_elbow",
+ "left_hip",
+ "right_hip",
+ "left_knee",
+ "right_knee",
+ "left_ankle",
+ "right_ankle",
+ "left_big_toe",
+ "left_small_toe",
+ "left_heel",
+ "right_big_toe",
+ "right_small_toe",
+ "right_heel",
+ "right_thumb4",
+ "right_thumb3",
+ "right_thumb2",
+ "right_thumb_third_joint",
+ "right_forefinger4",
+ "right_forefinger3",
+ "right_forefinger2",
+ "right_forefinger_third_joint",
+ "right_middle_finger4",
+ "right_middle_finger3",
+ "right_middle_finger2",
+ "right_middle_finger_third_joint",
+ "right_ring_finger4",
+ "right_ring_finger3",
+ "right_ring_finger2",
+ "right_ring_finger_third_joint",
+ "right_pinky_finger4",
+ "right_pinky_finger3",
+ "right_pinky_finger2",
+ "right_pinky_finger_third_joint",
+ "right_wrist",
+ "left_thumb4",
+ "left_thumb3",
+ "left_thumb2",
+ "left_thumb_third_joint",
+ "left_forefinger4",
+ "left_forefinger3",
+ "left_forefinger2",
+ "left_forefinger_third_joint",
+ "left_middle_finger4",
+ "left_middle_finger3",
+ "left_middle_finger2",
+ "left_middle_finger_third_joint",
+ "left_ring_finger4",
+ "left_ring_finger3",
+ "left_ring_finger2",
+ "left_ring_finger_third_joint",
+ "left_pinky_finger4",
+ "left_pinky_finger3",
+ "left_pinky_finger2",
+ "left_pinky_finger_third_joint",
+ "left_wrist",
+ "left_olecranon",
+ "right_olecranon",
+ "left_cubital_fossa",
+ "right_cubital_fossa",
+ "left_acromion",
+ "right_acromion",
+ "neck",
+ "center_of_glabella",
+ "center_of_nose_root",
+ "tip_of_nose_bridge",
+ "midpoint_1_of_nose_bridge",
+ "midpoint_2_of_nose_bridge",
+ "midpoint_3_of_nose_bridge",
+ "center_of_labiomental_groove",
+ "tip_of_chin",
+ "upper_startpoint_of_r_eyebrow",
+ "lower_startpoint_of_r_eyebrow",
+ "end_of_r_eyebrow",
+ "upper_midpoint_1_of_r_eyebrow",
+ "lower_midpoint_1_of_r_eyebrow",
+ "upper_midpoint_2_of_r_eyebrow",
+ "upper_midpoint_3_of_r_eyebrow",
+ "lower_midpoint_2_of_r_eyebrow",
+ "lower_midpoint_3_of_r_eyebrow",
+ "upper_startpoint_of_l_eyebrow",
+ "lower_startpoint_of_l_eyebrow",
+ "end_of_l_eyebrow",
+ "upper_midpoint_1_of_l_eyebrow",
+ "lower_midpoint_1_of_l_eyebrow",
+ "upper_midpoint_2_of_l_eyebrow",
+ "upper_midpoint_3_of_l_eyebrow",
+ "lower_midpoint_2_of_l_eyebrow",
+ "lower_midpoint_3_of_l_eyebrow",
+ "l_inner_end_of_upper_lash_line",
+ "l_outer_end_of_upper_lash_line",
+ "l_centerpoint_of_upper_lash_line",
+ "l_midpoint_2_of_upper_lash_line",
+ "l_midpoint_1_of_upper_lash_line",
+ "l_midpoint_6_of_upper_lash_line",
+ "l_midpoint_5_of_upper_lash_line",
+ "l_midpoint_4_of_upper_lash_line",
+ "l_midpoint_3_of_upper_lash_line",
+ "l_outer_end_of_upper_eyelid_line",
+ "l_midpoint_6_of_upper_eyelid_line",
+ "l_midpoint_2_of_upper_eyelid_line",
+ "l_midpoint_5_of_upper_eyelid_line",
+ "l_centerpoint_of_upper_eyelid_line",
+ "l_midpoint_4_of_upper_eyelid_line",
+ "l_midpoint_1_of_upper_eyelid_line",
+ "l_midpoint_3_of_upper_eyelid_line",
+ "l_midpoint_6_of_upper_crease_line",
+ "l_midpoint_2_of_upper_crease_line",
+ "l_midpoint_5_of_upper_crease_line",
+ "l_centerpoint_of_upper_crease_line",
+ "l_midpoint_4_of_upper_crease_line",
+ "l_midpoint_1_of_upper_crease_line",
+ "l_midpoint_3_of_upper_crease_line",
+ "r_inner_end_of_upper_lash_line",
+ "r_outer_end_of_upper_lash_line",
+ "r_centerpoint_of_upper_lash_line",
+ "r_midpoint_1_of_upper_lash_line",
+ "r_midpoint_2_of_upper_lash_line",
+ "r_midpoint_3_of_upper_lash_line",
+ "r_midpoint_4_of_upper_lash_line",
+ "r_midpoint_5_of_upper_lash_line",
+ "r_midpoint_6_of_upper_lash_line",
+ "r_outer_end_of_upper_eyelid_line",
+ "r_midpoint_3_of_upper_eyelid_line",
+ "r_midpoint_1_of_upper_eyelid_line",
+ "r_midpoint_4_of_upper_eyelid_line",
+ "r_centerpoint_of_upper_eyelid_line",
+ "r_midpoint_5_of_upper_eyelid_line",
+ "r_midpoint_2_of_upper_eyelid_line",
+ "r_midpoint_6_of_upper_eyelid_line",
+ "r_midpoint_3_of_upper_crease_line",
+ "r_midpoint_1_of_upper_crease_line",
+ "r_midpoint_4_of_upper_crease_line",
+ "r_centerpoint_of_upper_crease_line",
+ "r_midpoint_5_of_upper_crease_line",
+ "r_midpoint_2_of_upper_crease_line",
+ "r_midpoint_6_of_upper_crease_line",
+ "l_inner_end_of_lower_lash_line",
+ "l_outer_end_of_lower_lash_line",
+ "l_centerpoint_of_lower_lash_line",
+ "l_midpoint_2_of_lower_lash_line",
+ "l_midpoint_1_of_lower_lash_line",
+ "l_midpoint_6_of_lower_lash_line",
+ "l_midpoint_5_of_lower_lash_line",
+ "l_midpoint_4_of_lower_lash_line",
+ "l_midpoint_3_of_lower_lash_line",
+ "l_outer_end_of_lower_eyelid_line",
+ "l_midpoint_6_of_lower_eyelid_line",
+ "l_midpoint_2_of_lower_eyelid_line",
+ "l_midpoint_5_of_lower_eyelid_line",
+ "l_centerpoint_of_lower_eyelid_line",
+ "l_midpoint_4_of_lower_eyelid_line",
+ "l_midpoint_1_of_lower_eyelid_line",
+ "l_midpoint_3_of_lower_eyelid_line",
+ "r_inner_end_of_lower_lash_line",
+ "r_outer_end_of_lower_lash_line",
+ "r_centerpoint_of_lower_lash_line",
+ "r_midpoint_1_of_lower_lash_line",
+ "r_midpoint_2_of_lower_lash_line",
+ "r_midpoint_3_of_lower_lash_line",
+ "r_midpoint_4_of_lower_lash_line",
+ "r_midpoint_5_of_lower_lash_line",
+ "r_midpoint_6_of_lower_lash_line",
+ "r_outer_end_of_lower_eyelid_line",
+ "r_midpoint_3_of_lower_eyelid_line",
+ "r_midpoint_1_of_lower_eyelid_line",
+ "r_midpoint_4_of_lower_eyelid_line",
+ "r_centerpoint_of_lower_eyelid_line",
+ "r_midpoint_5_of_lower_eyelid_line",
+ "r_midpoint_2_of_lower_eyelid_line",
+ "r_midpoint_6_of_lower_eyelid_line",
+ "tip_of_nose",
+ "bottom_center_of_nose",
+ "r_outer_corner_of_nose",
+ "l_outer_corner_of_nose",
+ "inner_corner_of_r_nostril",
+ "outer_corner_of_r_nostril",
+ "upper_corner_of_r_nostril",
+ "inner_corner_of_l_nostril",
+ "outer_corner_of_l_nostril",
+ "upper_corner_of_l_nostril",
+ "r_outer_corner_of_mouth",
+ "l_outer_corner_of_mouth",
+ "center_of_cupid_bow",
+ "center_of_lower_outer_lip",
+ "midpoint_1_of_upper_outer_lip",
+ "midpoint_2_of_upper_outer_lip",
+ "midpoint_1_of_lower_outer_lip",
+ "midpoint_2_of_lower_outer_lip",
+ "midpoint_3_of_upper_outer_lip",
+ "midpoint_4_of_upper_outer_lip",
+ "midpoint_5_of_upper_outer_lip",
+ "midpoint_6_of_upper_outer_lip",
+ "midpoint_3_of_lower_outer_lip",
+ "midpoint_4_of_lower_outer_lip",
+ "midpoint_5_of_lower_outer_lip",
+ "midpoint_6_of_lower_outer_lip",
+ "r_inner_corner_of_mouth",
+ "l_inner_corner_of_mouth",
+ "center_of_upper_inner_lip",
+ "center_of_lower_inner_lip",
+ "midpoint_1_of_upper_inner_lip",
+ "midpoint_2_of_upper_inner_lip",
+ "midpoint_1_of_lower_inner_lip",
+ "midpoint_2_of_lower_inner_lip",
+ "midpoint_3_of_upper_inner_lip",
+ "midpoint_4_of_upper_inner_lip",
+ "midpoint_5_of_upper_inner_lip",
+ "midpoint_6_of_upper_inner_lip",
+ "midpoint_3_of_lower_inner_lip",
+ "midpoint_4_of_lower_inner_lip",
+ "midpoint_5_of_lower_inner_lip",
+ "midpoint_6_of_lower_inner_lip",
+ "l_top_end_of_inferior_crus",
+ "l_top_end_of_superior_crus",
+ "l_start_of_antihelix",
+ "l_end_of_antihelix",
+ "l_midpoint_1_of_antihelix",
+ "l_midpoint_1_of_inferior_crus",
+ "l_midpoint_2_of_antihelix",
+ "l_midpoint_3_of_antihelix",
+ "l_point_1_of_inner_helix",
+ "l_point_2_of_inner_helix",
+ "l_point_3_of_inner_helix",
+ "l_point_4_of_inner_helix",
+ "l_point_5_of_inner_helix",
+ "l_point_6_of_inner_helix",
+ "l_point_7_of_inner_helix",
+ "l_highest_point_of_antitragus",
+ "l_bottom_point_of_tragus",
+ "l_protruding_point_of_tragus",
+ "l_top_point_of_tragus",
+ "l_start_point_of_crus_of_helix",
+ "l_deepest_point_of_concha",
+ "l_tip_of_ear_lobe",
+ "l_midpoint_between_22_15",
+ "l_bottom_connecting_point_of_ear_lobe",
+ "l_top_connecting_point_of_helix",
+ "l_point_8_of_inner_helix",
+ "r_top_end_of_inferior_crus",
+ "r_top_end_of_superior_crus",
+ "r_start_of_antihelix",
+ "r_end_of_antihelix",
+ "r_midpoint_1_of_antihelix",
+ "r_midpoint_1_of_inferior_crus",
+ "r_midpoint_2_of_antihelix",
+ "r_midpoint_3_of_antihelix",
+ "r_point_1_of_inner_helix",
+ "r_point_8_of_inner_helix",
+ "r_point_3_of_inner_helix",
+ "r_point_4_of_inner_helix",
+ "r_point_5_of_inner_helix",
+ "r_point_6_of_inner_helix",
+ "r_point_7_of_inner_helix",
+ "r_highest_point_of_antitragus",
+ "r_bottom_point_of_tragus",
+ "r_protruding_point_of_tragus",
+ "r_top_point_of_tragus",
+ "r_start_point_of_crus_of_helix",
+ "r_deepest_point_of_concha",
+ "r_tip_of_ear_lobe",
+ "r_midpoint_between_22_15",
+ "r_bottom_connecting_point_of_ear_lobe",
+ "r_top_connecting_point_of_helix",
+ "r_point_2_of_inner_helix",
+ "l_center_of_iris",
+ "l_border_of_iris_3",
+ "l_border_of_iris_midpoint_1",
+ "l_border_of_iris_12",
+ "l_border_of_iris_midpoint_4",
+ "l_border_of_iris_9",
+ "l_border_of_iris_midpoint_3",
+ "l_border_of_iris_6",
+ "l_border_of_iris_midpoint_2",
+ "r_center_of_iris",
+ "r_border_of_iris_3",
+ "r_border_of_iris_midpoint_1",
+ "r_border_of_iris_12",
+ "r_border_of_iris_midpoint_4",
+ "r_border_of_iris_9",
+ "r_border_of_iris_midpoint_3",
+ "r_border_of_iris_6",
+ "r_border_of_iris_midpoint_2",
+ "l_center_of_pupil",
+ "l_border_of_pupil_3",
+ "l_border_of_pupil_midpoint_1",
+ "l_border_of_pupil_12",
+ "l_border_of_pupil_midpoint_4",
+ "l_border_of_pupil_9",
+ "l_border_of_pupil_midpoint_3",
+ "l_border_of_pupil_6",
+ "l_border_of_pupil_midpoint_2",
+ "r_center_of_pupil",
+ "r_border_of_pupil_3",
+ "r_border_of_pupil_midpoint_1",
+ "r_border_of_pupil_12",
+ "r_border_of_pupil_midpoint_4",
+ "r_border_of_pupil_9",
+ "r_border_of_pupil_midpoint_3",
+ "r_border_of_pupil_6",
+ "r_border_of_pupil_midpoint_2"
+]
+
+GOLIATH_SKELETON_INFO = {
+ 0:
+ dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]),
+ 1:
+ dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]),
+ 2:
+ dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]),
+ 3:
+ dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]),
+ 4:
+ dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]),
+ 5:
+ dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]),
+ 6:
+ dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]),
+ 7:
+ dict(
+ link=('left_shoulder', 'right_shoulder'),
+ id=7,
+ color=[51, 153, 255]),
+ 8:
+ dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]),
+ 9:
+ dict(
+ link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]),
+ 10:
+ dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]),
+ 11:
+ dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
+ 12:
+ dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]),
+ 13:
+ dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]),
+ 14:
+ dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]),
+ 15:
+ dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]),
+ 16:
+ dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]),
+ 17:
+ dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]),
+ 18:
+ dict(
+ link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]),
+ 19:
+ dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]),
+ 20:
+ dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]),
+ 21:
+ dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]),
+ 22:
+ dict(
+ link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]),
+ 23:
+ dict(
+ link=('right_ankle', 'right_small_toe'),
+ id=23,
+ color=[255, 128, 0]),
+ 24:
+ dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]),
+    25:
+    dict(
+        link=('left_wrist', 'left_thumb_third_joint'),
+        id=25,
+        color=[255, 128, 0]),
+ 26:
+ dict(link=('left_thumb_third_joint', 'left_thumb2'), id=26, color=[255, 128, 0]),
+ 27:
+ dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]),
+ 28:
+ dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]),
+ 29:
+ dict(
+ link=('left_wrist', 'left_forefinger_third_joint'),
+ id=29,
+ color=[255, 153, 255]),
+ 30:
+ dict(
+ link=('left_forefinger_third_joint', 'left_forefinger2'),
+ id=30,
+ color=[255, 153, 255]),
+ 31:
+ dict(
+ link=('left_forefinger2', 'left_forefinger3'),
+ id=31,
+ color=[255, 153, 255]),
+ 32:
+ dict(
+ link=('left_forefinger3', 'left_forefinger4'),
+ id=32,
+ color=[255, 153, 255]),
+ 33:
+ dict(
+ link=('left_wrist', 'left_middle_finger_third_joint'),
+ id=33,
+ color=[102, 178, 255]),
+ 34:
+ dict(
+ link=('left_middle_finger_third_joint', 'left_middle_finger2'),
+ id=34,
+ color=[102, 178, 255]),
+ 35:
+ dict(
+ link=('left_middle_finger2', 'left_middle_finger3'),
+ id=35,
+ color=[102, 178, 255]),
+ 36:
+ dict(
+ link=('left_middle_finger3', 'left_middle_finger4'),
+ id=36,
+ color=[102, 178, 255]),
+ 37:
+ dict(
+ link=('left_wrist', 'left_ring_finger_third_joint'),
+ id=37,
+ color=[255, 51, 51]),
+ 38:
+ dict(
+ link=('left_ring_finger_third_joint', 'left_ring_finger2'),
+ id=38,
+ color=[255, 51, 51]),
+ 39:
+ dict(
+ link=('left_ring_finger2', 'left_ring_finger3'),
+ id=39,
+ color=[255, 51, 51]),
+ 40:
+ dict(
+ link=('left_ring_finger3', 'left_ring_finger4'),
+ id=40,
+ color=[255, 51, 51]),
+ 41:
+ dict(
+ link=('left_wrist', 'left_pinky_finger_third_joint'),
+ id=41,
+ color=[0, 255, 0]),
+ 42:
+ dict(
+ link=('left_pinky_finger_third_joint', 'left_pinky_finger2'),
+ id=42,
+ color=[0, 255, 0]),
+ 43:
+ dict(
+ link=('left_pinky_finger2', 'left_pinky_finger3'),
+ id=43,
+ color=[0, 255, 0]),
+ 44:
+ dict(
+ link=('left_pinky_finger3', 'left_pinky_finger4'),
+ id=44,
+ color=[0, 255, 0]),
+ 45:
+ dict(
+ link=('right_wrist', 'right_thumb_third_joint'),
+ id=45,
+ color=[255, 128, 0]),
+ 46:
+ dict(
+ link=('right_thumb_third_joint', 'right_thumb2'), id=46, color=[255, 128, 0]),
+ 47:
+ dict(
+ link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]),
+ 48:
+ dict(
+ link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]),
+ 49:
+ dict(
+ link=('right_wrist', 'right_forefinger_third_joint'),
+ id=49,
+ color=[255, 153, 255]),
+ 50:
+ dict(
+ link=('right_forefinger_third_joint', 'right_forefinger2'),
+ id=50,
+ color=[255, 153, 255]),
+ 51:
+ dict(
+ link=('right_forefinger2', 'right_forefinger3'),
+ id=51,
+ color=[255, 153, 255]),
+ 52:
+ dict(
+ link=('right_forefinger3', 'right_forefinger4'),
+ id=52,
+ color=[255, 153, 255]),
+ 53:
+ dict(
+ link=('right_wrist', 'right_middle_finger_third_joint'),
+ id=53,
+ color=[102, 178, 255]),
+ 54:
+ dict(
+ link=('right_middle_finger_third_joint', 'right_middle_finger2'),
+ id=54,
+ color=[102, 178, 255]),
+ 55:
+ dict(
+ link=('right_middle_finger2', 'right_middle_finger3'),
+ id=55,
+ color=[102, 178, 255]),
+ 56:
+ dict(
+ link=('right_middle_finger3', 'right_middle_finger4'),
+ id=56,
+ color=[102, 178, 255]),
+ 57:
+ dict(
+ link=('right_wrist', 'right_ring_finger_third_joint'),
+ id=57,
+ color=[255, 51, 51]),
+ 58:
+ dict(
+ link=('right_ring_finger_third_joint', 'right_ring_finger2'),
+ id=58,
+ color=[255, 51, 51]),
+ 59:
+ dict(
+ link=('right_ring_finger2', 'right_ring_finger3'),
+ id=59,
+ color=[255, 51, 51]),
+ 60:
+ dict(
+ link=('right_ring_finger3', 'right_ring_finger4'),
+ id=60,
+ color=[255, 51, 51]),
+ 61:
+ dict(
+ link=('right_wrist', 'right_pinky_finger_third_joint'),
+ id=61,
+ color=[0, 255, 0]),
+ 62:
+ dict(
+ link=('right_pinky_finger_third_joint', 'right_pinky_finger2'),
+ id=62,
+ color=[0, 255, 0]),
+ 63:
+ dict(
+ link=('right_pinky_finger2', 'right_pinky_finger3'),
+ id=63,
+ color=[0, 255, 0]),
+ 64:
+ dict(
+ link=('right_pinky_finger3', 'right_pinky_finger4'),
+ id=64,
+ color=[0, 255, 0])
+ }
\ No newline at end of file
diff --git a/detector_utils.py b/detector_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..adb8f2a42ed53cc516e506fb3a852d528118d4de
--- /dev/null
+++ b/detector_utils.py
@@ -0,0 +1,196 @@
+from typing import List, Optional, Sequence, Union
+
+import torch
+import cv2
+import numpy as np
+from mmcv.ops import RoIPool
+from mmengine.dataset import Compose, pseudo_collate
+from mmengine.device import get_device
+from mmengine.registry import init_default_scope
+from mmdet.apis import init_detector
+from mmdet.structures import DetDataSample, SampleList
+from mmdet.utils import get_test_pipeline_cfg
+
+
+ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
+
+def nms(dets: np.ndarray, thr: float):
+ """Greedily select boxes with high confidence and overlap <= thr.
+ Args:
+ dets (np.ndarray): [[x1, y1, x2, y2, score]].
+ thr (float): IoU threshold; candidates with overlap <= thr are retained.
+ Returns:
+ list: Indexes to keep.
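+
+ Example::
+
+ >>> dets = np.array([[0., 0., 10., 10., 0.9],
+ ... [1., 1., 11., 11., 0.8],
+ ... [50., 50., 60., 60., 0.7]])
+ >>> keep = nms(dets, thr=0.5)  # box 1 overlaps box 0 -> keep == [0, 2]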
+ """
+ if len(dets) == 0:
+ return []
+
+ x1 = dets[:, 0]
+ y1 = dets[:, 1]
+ x2 = dets[:, 2]
+ y2 = dets[:, 3]
+ scores = dets[:, 4]
+
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while len(order) > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= thr)[0]
+ order = order[inds + 1]
+
+ return keep
+
+def adapt_mmdet_pipeline(cfg):
+ """Converts pipeline types in MMDetection's test dataloader to use the
+ 'mmdet' namespace.
+
+ Args:
+ cfg (ConfigDict): Configuration dictionary for MMDetection.
+
+ Returns:
+ ConfigDict: Configuration dictionary with updated pipeline types.
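+
+ Example::
+
+ >>> # a pipeline entry such as dict(type='Resize', ...) becomes
+ >>> # dict(type='mmdet.Resize', ...) after adaptation
+ >>> cfg = adapt_mmdet_pipeline(cfg)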
+ """
+ # use lazy import to avoid hard dependence on mmdet
+ from mmdet.datasets import transforms
+
+ if 'test_dataloader' not in cfg:
+ return cfg
+
+ pipeline = cfg.test_dataloader.dataset.pipeline
+ for trans in pipeline:
+ if trans['type'] in dir(transforms):
+ trans['type'] = 'mmdet.' + trans['type']
+
+ return cfg
+
+
+def inference_detector(
+ model: torch.nn.Module,
+ imgs: ImagesType,
+ test_pipeline: Optional[Compose] = None,
+ text_prompt: Optional[str] = None,
+ custom_entities: bool = False,
+) -> Union[DetDataSample, SampleList]:
+ """Inference image(s) with the detector.
+
+ Args:
+ model (nn.Module): The loaded detector.
+ imgs (str, ndarray, Sequence[str/ndarray]):
+ Either image files or loaded images.
+ test_pipeline (:obj:`Compose`): Test pipeline.
+ text_prompt (str, optional): Text prompt forwarded to
+ text-conditioned detectors. Defaults to None.
+ custom_entities (bool): Forwarded to the pipeline together with
+ ``text_prompt``. Defaults to False.
+
+ Returns:
+ :obj:`DetDataSample` or list[:obj:`DetDataSample`]:
+ If imgs is a list or tuple, the same length list type results
+ will be returned, otherwise return the detection results directly.
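+
+ Example (a minimal sketch; the config and checkpoint paths are
+ placeholders)::
+
+ >>> model = init_detector('cfg.py', 'ckpt.pth', device='cuda:0')
+ >>> result = inference_detector(model, 'demo.jpg')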
+ """
+ if isinstance(imgs, torch.Tensor):
+ if imgs.is_cuda:
+ imgs = imgs.cpu()
+
+ # Remove batch dimension and transpose
+ imgs = imgs.squeeze(0).permute(1, 2, 0).numpy()
+
+ # Ensure the data type is appropriate (uint8 for most image processing functions)
+ imgs = (imgs * 255).astype(np.uint8)
+
+ if isinstance(imgs, (list, tuple)) or (isinstance(imgs, np.ndarray) and len(imgs.shape) == 4):
+ is_batch = True
+ else:
+ imgs = [imgs]
+ is_batch = False
+
+ cfg = model.cfg
+
+ if test_pipeline is None:
+ cfg = cfg.copy()
+ test_pipeline = get_test_pipeline_cfg(cfg)
+ if isinstance(imgs[0], np.ndarray):
+ # Calling this method across libraries will result
+ # in module unregistered error if not prefixed with mmdet.
+ test_pipeline[0].type = "mmdet.LoadImageFromNDArray"
+
+ test_pipeline = Compose(test_pipeline)
+
+ if model.data_preprocessor.device.type == "cpu":
+ for m in model.modules():
+ assert not isinstance(
+ m, RoIPool
+ ), "CPU inference with RoIPool is not supported currently."
+
+ result_list = []
+ for i, img in enumerate(imgs):
+ # prepare data
+ if isinstance(img, np.ndarray):
+ # TODO: remove img_id.
+ data_ = dict(img=img, img_id=0)
+ else:
+ # TODO: remove img_id.
+ data_ = dict(img_path=img, img_id=0)
+
+ if text_prompt:
+ data_["text"] = text_prompt
+ data_["custom_entities"] = custom_entities
+
+ # build the data pipeline
+ data_ = test_pipeline(data_)
+
+ data_["inputs"] = [data_["inputs"]]
+ data_["data_samples"] = [data_["data_samples"]]
+
+ # forward the model
+ with torch.no_grad(), torch.autocast(device_type=get_device(), dtype=torch.bfloat16):
+ results = model.test_step(data_)[0]
+
+ result_list.append(results)
+
+ if not is_batch:
+ return result_list[0]
+ else:
+ return result_list
+
+
+def process_one_image_bbox(pred_instance, det_cat_id, bbox_thr, nms_thr):
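+ """Filter one image's detections by category id and score threshold,
+ then apply NMS and return the kept boxes as an (N, 4) array."""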
+ bboxes = np.concatenate(
+ (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1
+ )
+ bboxes = bboxes[
+ np.logical_and(
+ pred_instance.labels == det_cat_id,
+ pred_instance.scores > bbox_thr,
+ )
+ ]
+ bboxes = bboxes[nms(bboxes, nms_thr), :4]
+ return bboxes
+
+
+def process_images_detector(imgs, detector):
+ """Visualize predicted keypoints (and heatmaps) of one image."""
+ # predict bbox
+ det_results = inference_detector(detector, imgs)
+ pred_instances = list(
+ map(lambda det_result: det_result.pred_instances.numpy(), det_results)
+ )
+ bboxes_batch = list(
+ map(
+ lambda pred_instance: process_one_image_bbox(
+ pred_instance, 0, 0.3, 0.3  # det_cat_id=0, bbox_thr=0.3, nms_thr=0.3
+ ),
+ pred_instances,
+ )
+ )
+
+ return bboxes_batch
diff --git a/external/cv/.gitignore b/external/cv/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a787c67d6eec893ae8f4e07b6123a5170f2593fe
--- /dev/null
+++ b/external/cv/.gitignore
@@ -0,0 +1,125 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# PyTorch checkpoint
+*.pth
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+#dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+mlu-ops/
+mlu-ops.*
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/en/_build/
+docs/en/api/generated/
+docs/zh_cn/_build/
+docs/zh_cn/api/generated/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# editors and IDEs
+.idea/
+.vscode/
+
+# custom
+.DS_Store
+
+# datasets and logs and checkpoints
+data/
+work_dir/
+
+src/
diff --git a/external/cv/MANIFEST.in b/external/cv/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..622635caa1ec01f78d95c684b87658df87c63b38
--- /dev/null
+++ b/external/cv/MANIFEST.in
@@ -0,0 +1,6 @@
+include requirements/runtime.txt
+include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp
+include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp
+include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp
+include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm
+recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm
diff --git a/external/cv/dist/sapiens_cv-1.0.0-cp310-cp310-linux_x86_64.whl b/external/cv/dist/sapiens_cv-1.0.0-cp310-cp310-linux_x86_64.whl
new file mode 100644
index 0000000000000000000000000000000000000000..eecbde2ca8eee264c31762fc1ee936277a76ca12
--- /dev/null
+++ b/external/cv/dist/sapiens_cv-1.0.0-cp310-cp310-linux_x86_64.whl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:746f2be13eefdfe43a59d9c415e03a4b0b922e6ce487b76a572a376ae76c9300
+size 30006791
diff --git a/external/cv/mmcv/__init__.py b/external/cv/mmcv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e7bb7ac46d9bc1fe6c1dd5b6f74044776df805a
--- /dev/null
+++ b/external/cv/mmcv/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# flake8: noqa
+from .arraymisc import *
+from .image import *
+from .transforms import *
+from .version import *
+from .video import *
+from .visualization import *
+
+# The following modules are not imported to this level, so mmcv may be used
+# without PyTorch.
+# - op
+# - utils
diff --git a/external/cv/mmcv/arraymisc/__init__.py b/external/cv/mmcv/arraymisc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad965788fdc37610d7c56a7d0c3c9bbeed3bc98
--- /dev/null
+++ b/external/cv/mmcv/arraymisc/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .quantization import dequantize, quantize
+
+__all__ = ['quantize', 'dequantize']
diff --git a/external/cv/mmcv/arraymisc/quantization.py b/external/cv/mmcv/arraymisc/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..06fc55c930754eb43f7e26fb5401afc6106cbb09
--- /dev/null
+++ b/external/cv/mmcv/arraymisc/quantization.py
@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Union
+
+import numpy as np
+
+
+def quantize(arr: np.ndarray,
+ min_val: Union[int, float],
+ max_val: Union[int, float],
+ levels: int,
+ dtype=np.int64) -> np.ndarray:
+ """Quantize an array of (-inf, inf) to [0, levels-1].
+
+ Args:
+ arr (ndarray): Input array.
+ min_val (int or float): Minimum value to be clipped.
+ max_val (int or float): Maximum value to be clipped.
+ levels (int): Quantization levels.
+ dtype (np.type): The type of the quantized array.
+
+ Returns:
+ np.ndarray: Quantized array.
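+
+ Example::
+
+ >>> arr = np.array([-1.0, 0.0, 0.5, 1.0, 2.0])
+ >>> quantize(arr, min_val=0.0, max_val=1.0, levels=4)
+ array([0, 0, 2, 3, 3])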
+ """
+ if not (isinstance(levels, int) and levels > 1):
+ raise ValueError(
+ f'levels must be an integer greater than 1, but got {levels}')
+ if min_val >= max_val:
+ raise ValueError(
+ f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+ arr = np.clip(arr, min_val, max_val) - min_val
+ quantized_arr = np.minimum(
+ np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)
+
+ return quantized_arr
+
+
+def dequantize(arr: np.ndarray,
+ min_val: Union[int, float],
+ max_val: Union[int, float],
+ levels: int,
+ dtype=np.float64) -> np.ndarray:
+ """Dequantize an array.
+
+ Args:
+ arr (ndarray): Input array.
+ min_val (int or float): Minimum value to be clipped.
+ max_val (int or float): Maximum value to be clipped.
+ levels (int): Quantization levels.
+ dtype (np.type): The type of the dequantized array.
+
+ Returns:
+ np.ndarray: Dequantized array.
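+
+ Example::
+
+ >>> dequantize(np.array([0, 1, 2, 3]), min_val=0.0, max_val=1.0, levels=4)
+ array([0.125, 0.375, 0.625, 0.875])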
+ """
+ if not (isinstance(levels, int) and levels > 1):
+ raise ValueError(
+ f'levels must be an integer greater than 1, but got {levels}')
+ if min_val >= max_val:
+ raise ValueError(
+ f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+ dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -
+ min_val) / levels + min_val
+
+ return dequantized_arr
diff --git a/external/cv/mmcv/cnn/__init__.py b/external/cv/mmcv/cnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b24b7783a5f4c3523dd9465eadbd2646dea994a
--- /dev/null
+++ b/external/cv/mmcv/cnn/__init__.py
@@ -0,0 +1,33 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .alexnet import AlexNet
+# yapf: disable
+from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,
+ ConvTranspose2d, ConvTranspose3d, ConvWS2d,
+ DepthwiseSeparableConvModule, GeneralizedAttention,
+ HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,
+ NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,
+ build_activation_layer, build_conv_layer,
+ build_norm_layer, build_padding_layer, build_plugin_layer,
+ build_upsample_layer, conv_ws_2d, is_norm)
+# yapf: enable
+from .resnet import ResNet, make_res_layer
+from .rfsearch import Conv2dRFSearchOp, RFSearchHook
+from .utils import fuse_conv_bn, get_model_complexity_info
+from .vgg import VGG, make_vgg_layer
+
+__all__ = [
+ 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',
+ 'ConvModule', 'build_activation_layer', 'build_conv_layer',
+ 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
+ 'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d',
+ 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention',
+ 'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',
+ 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d',
+ 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn',
+ 'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook'
+]
diff --git a/external/cv/mmcv/cnn/alexnet.py b/external/cv/mmcv/cnn/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..caf385e2c1af131bf7cf4cc481eea285d803fd6b
--- /dev/null
+++ b/external/cv/mmcv/cnn/alexnet.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from mmengine.runner import load_checkpoint
+
+
+class AlexNet(nn.Module):
+ """AlexNet backbone.
+
+ Args:
+ num_classes (int): Number of classes for classification. A value
+ <= 0 means the classification head is omitted. Default: -1.
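+
+ Example::
+
+ >>> model = AlexNet(num_classes=10)
+ >>> x = torch.rand(1, 3, 224, 224)
+ >>> model(x).shape
+ torch.Size([1, 10])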
+ """
+
+ def __init__(self, num_classes: int = -1):
+ super().__init__()
+ self.num_classes = num_classes
+ self.features = nn.Sequential(
+ nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ nn.Conv2d(64, 192, kernel_size=5, padding=2),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ nn.Conv2d(192, 384, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(384, 256, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(256, 256, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ )
+ if self.num_classes > 0:
+ self.classifier = nn.Sequential(
+ nn.Dropout(),
+ nn.Linear(256 * 6 * 6, 4096),
+ nn.ReLU(inplace=True),
+ nn.Dropout(),
+ nn.Linear(4096, 4096),
+ nn.ReLU(inplace=True),
+ nn.Linear(4096, num_classes),
+ )
+
+ def init_weights(self, pretrained: Optional[str] = None) -> None:
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ # use default initializer
+ pass
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+
+ x = self.features(x)
+ if self.num_classes > 0:
+ x = x.view(x.size(0), 256 * 6 * 6)
+ x = self.classifier(x)
+
+ return x
diff --git a/external/cv/mmcv/cnn/bricks/__init__.py b/external/cv/mmcv/cnn/bricks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e607bb4e65199d9f537709a4f59ff50c0f44ebfe
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .activation import build_activation_layer
+from .context_block import ContextBlock
+from .conv import build_conv_layer
+from .conv2d_adaptive_padding import Conv2dAdaptivePadding
+from .conv_module import ConvModule
+from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
+from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
+from .drop import Dropout, DropPath
+from .generalized_attention import GeneralizedAttention
+from .hsigmoid import HSigmoid
+from .hswish import HSwish
+from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
+from .norm import build_norm_layer, is_norm
+from .padding import build_padding_layer
+from .plugin import build_plugin_layer
+from .scale import LayerScale, Scale
+from .swish import Swish
+from .upsample import build_upsample_layer
+from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
+ Linear, MaxPool2d, MaxPool3d)
+
+__all__ = [
+ 'ConvModule', 'build_activation_layer', 'build_conv_layer',
+ 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
+ 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
+ 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
+ 'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d',
+ 'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding',
+ 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d',
+ 'Conv3d', 'Dropout', 'DropPath', 'LayerScale'
+]
diff --git a/external/cv/mmcv/cnn/bricks/activation.py b/external/cv/mmcv/cnn/bricks/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..397c541eb0036b4f9068ee590a9aa426bcdddad3
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/activation.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.registry import MODELS
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+
+for module in [
+ nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
+ nn.Sigmoid, nn.Tanh
+]:
+ MODELS.register_module(module=module)
+
+if digit_version(torch.__version__) >= digit_version('1.7.0'):
+ MODELS.register_module(module=nn.SiLU, name='SiLU')
+else:
+
+ class SiLU(nn.Module):
+ """Sigmoid Weighted Liner Unit."""
+
+ def __init__(self, inplace=False):
+ super().__init__()
+ self.inplace = inplace
+
+ def forward(self, inputs) -> torch.Tensor:
+ if self.inplace:
+ return inputs.mul_(torch.sigmoid(inputs))
+ else:
+ return inputs * torch.sigmoid(inputs)
+
+ MODELS.register_module(module=SiLU, name='SiLU')
+
+
+@MODELS.register_module(name='Clip')
+@MODELS.register_module()
+class Clamp(nn.Module):
+ """Clamp activation layer.
+
+ This activation function clamps feature map values to the range
+ :math:`[min, max]`. More details can be found in ``torch.clamp()``.
+
+ Args:
+ min (Number, optional): Lower bound of the range to be clamped to.
+ Default: -1.
+ max (Number, optional): Upper bound of the range to be clamped to.
+ Default: 1.
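+
+ Example::
+
+ >>> m = Clamp(min=-1., max=1.)
+ >>> m(torch.tensor([-2., 0., 2.]))
+ tensor([-1.,  0.,  1.])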
+ """
+
+ def __init__(self, min: float = -1., max: float = 1.):
+ super().__init__()
+ self.min = min
+ self.max = max
+
+ def forward(self, x) -> torch.Tensor:
+ """Forward function.
+
+ Args:
+ x (torch.Tensor): The input tensor.
+
+ Returns:
+ torch.Tensor: Clamped tensor.
+ """
+ return torch.clamp(x, min=self.min, max=self.max)
+
+
+class GELU(nn.Module):
+ r"""Applies the Gaussian Error Linear Units function:
+
+ .. math::
+ \text{GELU}(x) = x * \Phi(x)
+ where :math:`\Phi(x)` is the Cumulative Distribution Function for
+ Gaussian Distribution.
+
+ Shape:
+ - Input: :math:`(N, *)` where `*` means, any number of additional
+ dimensions
+ - Output: :math:`(N, *)`, same shape as the input
+
+ .. image:: scripts/activation_images/GELU.png
+
+ Examples::
+
+ >>> m = nn.GELU()
+ >>> input = torch.randn(2)
+ >>> output = m(input)
+ """
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ return F.gelu(input)
+
+
+if (TORCH_VERSION == 'parrots'
+ or digit_version(TORCH_VERSION) < digit_version('1.4')):
+ MODELS.register_module(module=GELU)
+else:
+ MODELS.register_module(module=nn.GELU)
+
+
+def build_activation_layer(cfg: Dict) -> nn.Module:
+ """Build activation layer.
+
+ Args:
+ cfg (dict): The activation layer config, which should contain:
+
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate an activation layer.
+
+ Returns:
+ nn.Module: Created activation layer.
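+
+ Example::
+
+ >>> act = build_activation_layer(dict(type='ReLU', inplace=True))
+ >>> isinstance(act, nn.ReLU)
+ True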
+ """
+ return MODELS.build(cfg)
diff --git a/external/cv/mmcv/cnn/bricks/context_block.py b/external/cv/mmcv/cnn/bricks/context_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..364b8c3dfbb3f80689e538a586eb656a9f7e77ba
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/context_block.py
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Union
+
+import torch
+from mmengine.model import constant_init, kaiming_init
+from mmengine.registry import MODELS
+from torch import nn
+
+
+def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None:
+ if isinstance(m, nn.Sequential):
+ constant_init(m[-1], val=0)
+ else:
+ constant_init(m, val=0)
+
+
+@MODELS.register_module()
+class ContextBlock(nn.Module):
+ """ContextBlock module in GCNet.
+
+ See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
+ (https://arxiv.org/abs/1904.11492) for details.
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ ratio (float): Ratio of channels of the transform bottleneck.
+ pooling_type (str): Pooling method for context modeling.
+ Options are 'att' and 'avg', standing for attention pooling and
+ average pooling respectively. Default: 'att'.
+ fusion_types (Sequence[str]): Fusion methods for feature fusion.
+ Options are 'channel_add' and 'channel_mul', standing for
+ channelwise addition and multiplication respectively.
+ Default: ('channel_add', ).
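+
+ Example::
+
+ >>> block = ContextBlock(in_channels=16, ratio=0.25)
+ >>> x = torch.rand(1, 16, 32, 32)
+ >>> block(x).shape  # the input shape is preserved
+ torch.Size([1, 16, 32, 32])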
+ """
+
+ _abbr_ = 'context_block'
+
+ def __init__(self,
+ in_channels: int,
+ ratio: float,
+ pooling_type: str = 'att',
+ fusion_types: tuple = ('channel_add', )):
+ super().__init__()
+ assert pooling_type in ['avg', 'att']
+ assert isinstance(fusion_types, (list, tuple))
+ valid_fusion_types = ['channel_add', 'channel_mul']
+ assert all([f in valid_fusion_types for f in fusion_types])
+ assert len(fusion_types) > 0, 'at least one fusion should be used'
+ self.in_channels = in_channels
+ self.ratio = ratio
+ self.planes = int(in_channels * ratio)
+ self.pooling_type = pooling_type
+ self.fusion_types = fusion_types
+ if pooling_type == 'att':
+ self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
+ self.softmax = nn.Softmax(dim=2)
+ else:
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ if 'channel_add' in fusion_types:
+ self.channel_add_conv = nn.Sequential(
+ nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
+ nn.LayerNorm([self.planes, 1, 1]),
+ nn.ReLU(inplace=True), # yapf: disable
+ nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
+ else:
+ self.channel_add_conv = None
+ if 'channel_mul' in fusion_types:
+ self.channel_mul_conv = nn.Sequential(
+ nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
+ nn.LayerNorm([self.planes, 1, 1]),
+ nn.ReLU(inplace=True), # yapf: disable
+ nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
+ else:
+ self.channel_mul_conv = None
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ if self.pooling_type == 'att':
+ kaiming_init(self.conv_mask, mode='fan_in')
+ self.conv_mask.inited = True
+
+ if self.channel_add_conv is not None:
+ last_zero_init(self.channel_add_conv)
+ if self.channel_mul_conv is not None:
+ last_zero_init(self.channel_mul_conv)
+
+ def spatial_pool(self, x: torch.Tensor) -> torch.Tensor:
+ batch, channel, height, width = x.size()
+ if self.pooling_type == 'att':
+ input_x = x
+ # [N, C, H * W]
+ input_x = input_x.view(batch, channel, height * width)
+ # [N, 1, C, H * W]
+ input_x = input_x.unsqueeze(1)
+ # [N, 1, H, W]
+ context_mask = self.conv_mask(x)
+ # [N, 1, H * W]
+ context_mask = context_mask.view(batch, 1, height * width)
+ # [N, 1, H * W]
+ context_mask = self.softmax(context_mask)
+ # [N, 1, H * W, 1]
+ context_mask = context_mask.unsqueeze(-1)
+ # [N, 1, C, 1]
+ context = torch.matmul(input_x, context_mask)
+ # [N, C, 1, 1]
+ context = context.view(batch, channel, 1, 1)
+ else:
+ # [N, C, 1, 1]
+ context = self.avg_pool(x)
+
+ return context
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # [N, C, 1, 1]
+ context = self.spatial_pool(x)
+
+ out = x
+ if self.channel_mul_conv is not None:
+ # [N, C, 1, 1]
+ channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
+ out = out * channel_mul_term
+ if self.channel_add_conv is not None:
+ # [N, C, 1, 1]
+ channel_add_term = self.channel_add_conv(context)
+ out = out + channel_add_term
+
+ return out
diff --git a/external/cv/mmcv/cnn/bricks/conv.py b/external/cv/mmcv/cnn/bricks/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f6da2ece9d77137e02f825633b50c992ff6df5b
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/conv.py
@@ -0,0 +1,56 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Dict, Optional
+
+from mmengine.registry import MODELS
+from torch import nn
+
+MODELS.register_module('Conv1d', module=nn.Conv1d)
+MODELS.register_module('Conv2d', module=nn.Conv2d)
+MODELS.register_module('Conv3d', module=nn.Conv3d)
+MODELS.register_module('Conv', module=nn.Conv2d)
+
+
+def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:
+ """Build convolution layer.
+
+ Args:
+ cfg (None or dict): The conv layer config, which should contain:
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a conv layer.
+ args (argument list): Arguments passed to the `__init__`
+ method of the corresponding conv layer.
+ kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+ method of the corresponding conv layer.
+
+ Returns:
+ nn.Module: Created conv layer.
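+
+ Example::
+
+ >>> conv = build_conv_layer(dict(type='Conv2d'), 3, 8, kernel_size=3)
+ >>> isinstance(conv, nn.Conv2d)
+ True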
+ """
+ if cfg is None:
+ cfg_ = dict(type='Conv2d')
+ else:
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if inspect.isclass(layer_type):
+ return layer_type(*args, **kwargs, **cfg_) # type: ignore
+ # Switch registry to the target scope. If `conv_layer` cannot be found
+ # in the registry, fallback to search `conv_layer` in the
+ # mmengine.MODELS.
+ with MODELS.switch_scope_and_registry(None) as registry:
+ conv_layer = registry.get(layer_type)
+ if conv_layer is None:
+ raise KeyError(f'Cannot find {layer_type} in registry under scope '
+ f'name {registry.scope}')
+ layer = conv_layer(*args, **kwargs, **cfg_)
+
+ return layer
diff --git a/external/cv/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/external/cv/mmcv/cnn/bricks/conv2d_adaptive_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6376f4ca558ed721633ec6063ef1349cafdcba4
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/conv2d_adaptive_padding.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Tuple, Union
+
+import torch
+from mmengine.registry import MODELS
+from torch import nn
+from torch.nn import functional as F
+
+
+@MODELS.register_module()
+class Conv2dAdaptivePadding(nn.Conv2d):
+ """Implementation of 2D convolution in tensorflow with `padding` as "same",
+ which applies padding to input (if needed) so that input image gets fully
+ covered by filter and stride you specified. For stride 1, this will ensure
+ that output image size is same as input. For stride of 2, output dimensions
+ will be half, for example.
+
+ Args:
+ in_channels (int): Number of channels in the input image
+ out_channels (int): Number of channels produced by the convolution
+ kernel_size (int or tuple): Size of the convolving kernel
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
+ padding (int or tuple, optional): Zero-padding added to both sides of
+ the input. Default: 0
+ dilation (int or tuple, optional): Spacing between kernel elements.
+ Default: 1
+ groups (int, optional): Number of blocked connections from input
+ channels to output channels. Default: 1
+ bias (bool, optional): If ``True``, adds a learnable bias to the
+ output. Default: ``True``
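+
+ Example::
+
+ >>> conv = Conv2dAdaptivePadding(3, 8, kernel_size=3, stride=2)
+ >>> x = torch.rand(1, 3, 15, 15)
+ >>> conv(x).shape  # spatial size is ceil(15 / 2) == 8
+ torch.Size([1, 8, 8, 8])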
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: bool = True):
+ super().__init__(in_channels, out_channels, kernel_size, stride, 0,
+ dilation, groups, bias)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ img_h, img_w = x.size()[-2:]
+ kernel_h, kernel_w = self.weight.size()[-2:]
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(img_h / stride_h)
+ output_w = math.ceil(img_w / stride_w)
+ pad_h = (
+ max((output_h - 1) * self.stride[0] +
+ (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
+ pad_w = (
+ max((output_w - 1) * self.stride[1] +
+ (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
+ if pad_h > 0 or pad_w > 0:
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
+ ])
+ return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups)
diff --git a/external/cv/mmcv/cnn/bricks/conv_module.py b/external/cv/mmcv/cnn/bricks/conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb2f05361cf2578f4742d1379dc675474816e134
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/conv_module.py
@@ -0,0 +1,343 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from functools import partial
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmengine.model import constant_init, kaiming_init
+from mmengine.registry import MODELS
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm
+
+from .activation import build_activation_layer
+from .conv import build_conv_layer
+from .norm import build_norm_layer
+from .padding import build_padding_layer
+
+
+def efficient_conv_bn_eval_forward(bn: _BatchNorm,
+ conv: nn.modules.conv._ConvNd,
+ x: torch.Tensor):
+ """
+ Implementation based on https://arxiv.org/abs/2305.11624,
+ "Tune-Mode ConvBN Blocks For Efficient Transfer Learning".
+ It leverages the associative law between convolution and affine
+ transforms, i.e., normalize(weight conv feature) =
+ (normalize weight) conv feature. It applies whenever the BN of a ConvBN
+ block runs in eval mode (during validation, and optionally during
+ training), and reduces both memory and computation cost.
+
+ Args:
+ bn (_BatchNorm): a BatchNorm module.
+ conv (nn._ConvNd): a conv module.
+ x (torch.Tensor): Input feature map.
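+
+ Concretely (a sketch of the algebra): for each output channel the fused
+ forward computes ``conv(x, w * s, beta + s * (b - mu))`` with
+ ``s = gamma / sqrt(running_var + eps)``, where ``gamma``/``beta`` are the
+ BN affine parameters and ``mu``/``running_var`` its running statistics;
+ in eval mode this equals ``bn(conv(x, w, b))``.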
+ """
+ # These lines of code are designed to deal with various cases
+ # like bn without affine transform, and conv without bias
+ weight_on_the_fly = conv.weight
+ if conv.bias is not None:
+ bias_on_the_fly = conv.bias
+ else:
+ bias_on_the_fly = torch.zeros_like(bn.running_var)
+
+ if bn.weight is not None:
+ bn_weight = bn.weight
+ else:
+ bn_weight = torch.ones_like(bn.running_var)
+
+ if bn.bias is not None:
+ bn_bias = bn.bias
+ else:
+ bn_bias = torch.zeros_like(bn.running_var)
+
+ # shape of [C_out, 1, 1, 1] in Conv2d
+ weight_coeff = torch.rsqrt(bn.running_var +
+ bn.eps).reshape([-1] + [1] *
+ (len(conv.weight.shape) - 1))
+ # shape of [C_out, 1, 1, 1] in Conv2d
+ coeff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff
+
+ # shape of [C_out, C_in, k, k] in Conv2d
+ weight_on_the_fly = weight_on_the_fly * coeff_on_the_fly
+ # shape of [C_out] in Conv2d
+ bias_on_the_fly = bn_bias + coeff_on_the_fly.flatten() *\
+ (bias_on_the_fly - bn.running_mean)
+
+ return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly)
+
+
+@MODELS.register_module()
+class ConvModule(nn.Module):
+ """A conv block that bundles conv/norm/activation layers.
+
+ This block simplifies the usage of convolution layers, which are commonly
+ used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
+ It is based upon three build methods: `build_conv_layer()`,
+ `build_norm_layer()` and `build_activation_layer()`.
+
+ Besides, we add some additional features in this module.
+ 1. Automatically set `bias` of the conv layer.
+ 2. Spectral norm is supported.
+ 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
+ supports zero and circular padding, and we add "reflect" padding mode.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ Same as that in ``nn._ConvNd``.
+ out_channels (int): Number of channels produced by the convolution.
+ Same as that in ``nn._ConvNd``.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ Same as that in ``nn._ConvNd``.
+ stride (int | tuple[int]): Stride of the convolution.
+ Same as that in ``nn._ConvNd``.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input. Same as that in ``nn._ConvNd``.
+ dilation (int | tuple[int]): Spacing between kernel elements.
+ Same as that in ``nn._ConvNd``.
+ groups (int): Number of blocked connections from input channels to
+ output channels. Same as that in ``nn._ConvNd``.
+ bias (bool | str): If specified as `auto`, it will be decided by the
+ norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
+ False. Default: "auto".
+ conv_cfg (dict): Config dict for convolution layer. Default: None,
+ which means using conv2d.
+ norm_cfg (dict): Config dict for normalization layer. Default: None.
+ act_cfg (dict): Config dict for activation layer.
+ Default: dict(type='ReLU').
+ inplace (bool): Whether to use inplace mode for activation.
+ Default: True.
+ with_spectral_norm (bool): Whether to use spectral norm in the conv
+ module. Default: False.
+ padding_mode (str): If the `padding_mode` has not been supported by
+ current `Conv2d` in PyTorch, we will use our own padding layer
+ instead. Currently, we support ['zeros', 'circular'] with official
+ implementation and ['reflect'] with our own implementation.
+ Default: 'zeros'.
+ order (tuple[str]): The order of conv/norm/activation layers. It is a
+ sequence of "conv", "norm" and "act". Common examples are
+ ("conv", "norm", "act") and ("act", "conv", "norm").
+ Default: ('conv', 'norm', 'act').
+ efficient_conv_bn_eval (bool): Whether to use the efficient fused
+ conv-bn forward when the following bn runs in eval mode (during
+ either training or testing), as proposed in
+ https://arxiv.org/abs/2305.11624. Default: `False`.
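+
+ Example (assumes the ``BN`` norm layer is registered, as in stock
+ mmcv)::
+
+ >>> conv = ConvModule(
+ ... 3, 8, 3, padding=1, norm_cfg=dict(type='BN'),
+ ... act_cfg=dict(type='ReLU'))
+ >>> x = torch.rand(1, 3, 32, 32)
+ >>> conv(x).shape
+ torch.Size([1, 8, 32, 32])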
+ """
+
+ _abbr_ = 'conv_block'
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: Union[bool, str] = 'auto',
+ conv_cfg: Optional[Dict] = None,
+ norm_cfg: Optional[Dict] = None,
+ act_cfg: Optional[Dict] = dict(type='ReLU'),
+ inplace: bool = True,
+ with_spectral_norm: bool = False,
+ padding_mode: str = 'zeros',
+ order: tuple = ('conv', 'norm', 'act'),
+ efficient_conv_bn_eval: bool = False):
+ super().__init__()
+ assert conv_cfg is None or isinstance(conv_cfg, dict)
+ assert norm_cfg is None or isinstance(norm_cfg, dict)
+ assert act_cfg is None or isinstance(act_cfg, dict)
+ official_padding_mode = ['zeros', 'circular']
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.act_cfg = act_cfg
+ self.inplace = inplace
+ self.with_spectral_norm = with_spectral_norm
+ self.with_explicit_padding = padding_mode not in official_padding_mode
+ self.order = order
+ assert isinstance(self.order, tuple) and len(self.order) == 3
+ assert set(order) == {'conv', 'norm', 'act'}
+
+ self.with_norm = norm_cfg is not None
+ self.with_activation = act_cfg is not None
+ # if the conv layer is before a norm layer, bias is unnecessary.
+ if bias == 'auto':
+ bias = not self.with_norm
+ self.with_bias = bias
+
+ if self.with_explicit_padding:
+ pad_cfg = dict(type=padding_mode)
+ self.padding_layer = build_padding_layer(pad_cfg, padding)
+
+ # reset padding to 0 for conv module
+ conv_padding = 0 if self.with_explicit_padding else padding
+ # build convolution layer
+ self.conv = build_conv_layer(
+ conv_cfg,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=conv_padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ # export the attributes of self.conv to a higher level for convenience
+ self.in_channels = self.conv.in_channels
+ self.out_channels = self.conv.out_channels
+ self.kernel_size = self.conv.kernel_size
+ self.stride = self.conv.stride
+ self.padding = padding
+ self.dilation = self.conv.dilation
+ self.transposed = self.conv.transposed
+ self.output_padding = self.conv.output_padding
+ self.groups = self.conv.groups
+
+ if self.with_spectral_norm:
+ self.conv = nn.utils.spectral_norm(self.conv)
+
+ # build normalization layers
+ if self.with_norm:
+ # norm layer is after conv layer
+ if order.index('norm') > order.index('conv'):
+ norm_channels = out_channels
+ else:
+ norm_channels = in_channels
+ self.norm_name, norm = build_norm_layer(
+ norm_cfg, norm_channels) # type: ignore
+ self.add_module(self.norm_name, norm)
+ if self.with_bias:
+ if isinstance(norm, (_BatchNorm, _InstanceNorm)):
+ warnings.warn(
+ 'Unnecessary conv bias before batch/instance norm')
+ else:
+ self.norm_name = None # type: ignore
+
+ self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
+
+ # build activation layer
+ if self.with_activation:
+ act_cfg_ = act_cfg.copy() # type: ignore
+ # nn.Tanh has no 'inplace' argument
+ if act_cfg_['type'] not in [
+ 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU'
+ ]:
+ act_cfg_.setdefault('inplace', inplace)
+ self.activate = build_activation_layer(act_cfg_)
+
+ # Use msra init by default
+ self.init_weights()
+
+ @property
+ def norm(self):
+ if self.norm_name:
+ return getattr(self, self.norm_name)
+ else:
+ return None
+
+ def init_weights(self):
+ # 1. It is mainly for customized conv layers with their own
+ # initialization manners by calling their own ``init_weights()``,
+ # and we do not want ConvModule to override the initialization.
+ # 2. For customized conv layers without their own initialization
+ # manners (that is, they don't have their own ``init_weights()``)
+ # and PyTorch's conv layers, they will be initialized by
+ # this method with default ``kaiming_init``.
+ # Note: For PyTorch's conv layers, they will be overwritten by our
+ # initialization implementation using default ``kaiming_init``.
+ if not hasattr(self.conv, 'init_weights'):
+ if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
+ nonlinearity = 'leaky_relu'
+ a = self.act_cfg.get('negative_slope', 0.01)
+ else:
+ nonlinearity = 'relu'
+ a = 0
+ kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
+ if self.with_norm:
+ constant_init(self.norm, 1, bias=0)
+
+ def forward(self,
+ x: torch.Tensor,
+ activate: bool = True,
+ norm: bool = True) -> torch.Tensor:
+ layer_index = 0
+ while layer_index < len(self.order):
+ layer = self.order[layer_index]
+ if layer == 'conv':
+ if self.with_explicit_padding:
+ x = self.padding_layer(x)
+ # if the next operation is norm and we have a norm layer in
+ # eval mode and we have enabled `efficient_conv_bn_eval` for
+ # the conv operator, then activate the optimized forward and
+ # skip the next norm operator since it has been fused
+ if layer_index + 1 < len(self.order) and \
+ self.order[layer_index + 1] == 'norm' and norm and \
+ self.with_norm and not self.norm.training and \
+ self.efficient_conv_bn_eval_forward is not None:
+ self.conv.forward = partial(
+ self.efficient_conv_bn_eval_forward, self.norm,
+ self.conv)
+ layer_index += 1
+ x = self.conv(x)
+ del self.conv.forward
+ else:
+ x = self.conv(x)
+ elif layer == 'norm' and norm and self.with_norm:
+ x = self.norm(x)
+ elif layer == 'act' and activate and self.with_activation:
+ x = self.activate(x)
+ layer_index += 1
+ return x
+
+ def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):
+ # efficient_conv_bn_eval works for conv + bn
+ # with `track_running_stats` option
+ if efficient_conv_bn_eval and self.norm \
+ and isinstance(self.norm, _BatchNorm) \
+ and self.norm.track_running_stats:
+ self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward # noqa: E501
+ else:
+ self.efficient_conv_bn_eval_forward = None # type: ignore
+
+ @staticmethod
+ def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,
+ bn: torch.nn.modules.batchnorm._BatchNorm,
+ efficient_conv_bn_eval=True) -> 'ConvModule':
+ """Create a ConvModule from a conv and a bn module."""
+ self = ConvModule.__new__(ConvModule)
+ super(ConvModule, self).__init__()
+
+ self.conv_cfg = None
+ self.norm_cfg = None
+ self.act_cfg = None
+ self.inplace = False
+ self.with_spectral_norm = False
+ self.with_explicit_padding = False
+ self.order = ('conv', 'norm', 'act')
+
+ self.with_norm = True
+ self.with_activation = False
+ self.with_bias = conv.bias is not None
+
+ # build convolution layer
+ self.conv = conv
+ # export the attributes of self.conv to a higher level for convenience
+ self.in_channels = self.conv.in_channels
+ self.out_channels = self.conv.out_channels
+ self.kernel_size = self.conv.kernel_size
+ self.stride = self.conv.stride
+ self.padding = self.conv.padding
+ self.dilation = self.conv.dilation
+ self.transposed = self.conv.transposed
+ self.output_padding = self.conv.output_padding
+ self.groups = self.conv.groups
+
+ # build normalization layers
+ self.norm_name, norm = 'bn', bn
+ self.add_module(self.norm_name, norm)
+
+ self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
+
+ return self
diff --git a/external/cv/mmcv/cnn/bricks/conv_ws.py b/external/cv/mmcv/cnn/bricks/conv_ws.py
new file mode 100644
index 0000000000000000000000000000000000000000..901cd8002a95bcbb12f9d2723fd34a341017ae91
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/conv_ws.py
@@ -0,0 +1,158 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.registry import MODELS
+
+
+def conv_ws_2d(input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ eps: float = 1e-5) -> torch.Tensor:
+ c_in = weight.size(0)
+ weight_flat = weight.view(c_in, -1)
+ mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
+ std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
+ weight = (weight - mean) / (std + eps)
+ return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
+
+
+@MODELS.register_module('ConvWS')
+class ConvWS2d(nn.Conv2d):
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: bool = True,
+ eps: float = 1e-5):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ self.eps = eps
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups, self.eps)
+
+
+@MODELS.register_module(name='ConvAWS')
+class ConvAWS2d(nn.Conv2d):
+ """AWS (Adaptive Weight Standardization)
+
+ This is a variant of Weight Standardization
+ (https://arxiv.org/pdf/1903.10520.pdf)
+ It is used in DetectoRS to avoid NaN
+ (https://arxiv.org/pdf/2006.02334.pdf)
+
+ Args:
+ in_channels (int): Number of channels in the input image
+ out_channels (int): Number of channels produced by the convolution
+ kernel_size (int or tuple): Size of the conv kernel
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
+ padding (int or tuple, optional): Zero-padding added to both sides of
+ the input. Default: 0
+ dilation (int or tuple, optional): Spacing between kernel elements.
+ Default: 1
+ groups (int, optional): Number of blocked connections from input
+ channels to output channels. Default: 1
+ bias (bool, optional): If set to True, adds a learnable bias to the
+ output. Default: True.
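+
+ Example::
+
+ >>> conv = ConvAWS2d(3, 8, kernel_size=3, padding=1)
+ >>> x = torch.rand(1, 3, 16, 16)
+ >>> conv(x).shape
+ torch.Size([1, 8, 16, 16])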
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: bool = True):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ self.register_buffer('weight_gamma',
+ torch.ones(self.out_channels, 1, 1, 1))
+ self.register_buffer('weight_beta',
+ torch.zeros(self.out_channels, 1, 1, 1))
+
+ def _get_weight(self, weight: torch.Tensor) -> torch.Tensor:
+ weight_flat = weight.view(weight.size(0), -1)
+ mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+ std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+ weight = (weight - mean) / std
+ weight = self.weight_gamma * weight + self.weight_beta
+ return weight
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ weight = self._get_weight(self.weight)
+ return F.conv2d(x, weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups)
+
+ def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
+ local_metadata: Dict, strict: bool,
+ missing_keys: List[str],
+ unexpected_keys: List[str],
+ error_msgs: List[str]) -> None:
+ """Override default load function.
+
+ AWS overrides the function _load_from_state_dict to recover
+ weight_gamma and weight_beta if they are missing. If weight_gamma and
+ weight_beta are found in the checkpoint, this function will return
+ after super()._load_from_state_dict. Otherwise, it will compute the
+ mean and std of the pretrained weights and store them in weight_beta
+ and weight_gamma.
+ """
+
+ self.weight_gamma.data.fill_(-1)
+ local_missing_keys: List = []
+ super()._load_from_state_dict(state_dict, prefix, local_metadata,
+ strict, local_missing_keys,
+ unexpected_keys, error_msgs)
+ if self.weight_gamma.data.mean() > 0:
+ for k in local_missing_keys:
+ missing_keys.append(k)
+ return
+ weight = self.weight.data
+ weight_flat = weight.view(weight.size(0), -1)
+ mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+ std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+ self.weight_beta.data.copy_(mean)
+ self.weight_gamma.data.copy_(std)
+ missing_gamma_beta = [
+ k for k in local_missing_keys
+ if k.endswith('weight_gamma') or k.endswith('weight_beta')
+ ]
+ for k in missing_gamma_beta:
+ local_missing_keys.remove(k)
+ for k in local_missing_keys:
+ missing_keys.append(k)
diff --git a/external/cv/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/external/cv/mmcv/cnn/bricks/depthwise_separable_conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..d66c8951c32f9c40f50ceb3d394aaf0fd4252150
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/depthwise_separable_conv_module.py
@@ -0,0 +1,104 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from .conv_module import ConvModule
+
+
+class DepthwiseSeparableConvModule(nn.Module):
+ """Depthwise separable convolution module.
+
+ See https://arxiv.org/pdf/1704.04861.pdf for details.
+
+ This module replaces the single conv block of a ConvModule with two
+ conv blocks: a depthwise conv block and a pointwise conv block. The
+ depthwise conv block contains depthwise-conv/norm/activation layers, and
+ the pointwise conv block contains pointwise-conv/norm/activation layers.
+ Note that the depthwise conv block will also contain the norm/activation
+ layers when `norm_cfg` and `act_cfg` are specified.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ Same as that in ``nn._ConvNd``.
+ out_channels (int): Number of channels produced by the convolution.
+ Same as that in ``nn._ConvNd``.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ Same as that in ``nn._ConvNd``.
+ stride (int | tuple[int]): Stride of the convolution.
+ Same as that in ``nn._ConvNd``. Default: 1.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input. Same as that in ``nn._ConvNd``. Default: 0.
+ dilation (int | tuple[int]): Spacing between kernel elements.
+ Same as that in ``nn._ConvNd``. Default: 1.
+ norm_cfg (dict): Default norm config for both depthwise ConvModule and
+ pointwise ConvModule. Default: None.
+ act_cfg (dict): Default activation config for both depthwise ConvModule
+ and pointwise ConvModule. Default: dict(type='ReLU').
+ dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
+ 'default', it will be the same as `norm_cfg`. Default: 'default'.
+ dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
+ 'default', it will be the same as `act_cfg`. Default: 'default'.
+ pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
+ 'default', it will be the same as `norm_cfg`. Default: 'default'.
+ pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
+ 'default', it will be the same as `act_cfg`. Default: 'default'.
+ kwargs (optional): Other shared arguments for depthwise and pointwise
+ ConvModule. See ConvModule for reference.
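+
+ Example::
+
+ >>> conv = DepthwiseSeparableConvModule(3, 16, 3, padding=1)
+ >>> x = torch.rand(1, 3, 32, 32)
+ >>> conv(x).shape
+ torch.Size([1, 16, 32, 32])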
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ norm_cfg: Optional[Dict] = None,
+ act_cfg: Dict = dict(type='ReLU'),
+ dw_norm_cfg: Union[Dict, str] = 'default',
+ dw_act_cfg: Union[Dict, str] = 'default',
+ pw_norm_cfg: Union[Dict, str] = 'default',
+ pw_act_cfg: Union[Dict, str] = 'default',
+ **kwargs):
+ super().__init__()
+ assert 'groups' not in kwargs, 'groups should not be specified'
+
+ # if norm/activation config of depthwise/pointwise ConvModule is not
+ # specified, use default config.
+ dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501
+ dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
+ pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501
+ pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
+
+ # depthwise convolution
+ self.depthwise_conv = ConvModule(
+ in_channels,
+ in_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=in_channels,
+ norm_cfg=dw_norm_cfg, # type: ignore
+ act_cfg=dw_act_cfg, # type: ignore
+ **kwargs)
+
+ self.pointwise_conv = ConvModule(
+ in_channels,
+ out_channels,
+ 1,
+ norm_cfg=pw_norm_cfg, # type: ignore
+ act_cfg=pw_act_cfg, # type: ignore
+ **kwargs)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.depthwise_conv(x)
+ x = self.pointwise_conv(x)
+ return x
diff --git a/external/cv/mmcv/cnn/bricks/drop.py b/external/cv/mmcv/cnn/bricks/drop.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e2853a17994f62c9e4db1e2704673c24b3cf670
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/drop.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn as nn
+from mmengine.registry import MODELS
+
+
+def drop_path(x: torch.Tensor,
+ drop_prob: float = 0.,
+ training: bool = False) -> torch.Tensor:
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ We follow the implementation
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
+ """
+ if not training:
+ return x
+ keep_prob = 1 - drop_prob
+ # handle tensors with different dimensions, not just 4D tensors.
+ shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+ random_tensor = keep_prob + torch.rand(
+ shape, dtype=x.dtype, device=x.device)
+ output = x.div(keep_prob) * random_tensor.floor()
+ return output
+
+
+@MODELS.register_module()
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ We follow the implementation
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
+
+ Args:
+ drop_prob (float): Probability of the path to be zeroed. Default: 0.1
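+
+ Example::
+
+ >>> m = DropPath(drop_prob=0.2).eval()  # identity when not training
+ >>> x = torch.rand(2, 16)
+ >>> torch.equal(m(x), x)
+ True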
+ """
+
+ def __init__(self, drop_prob: float = 0.1):
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return drop_path(x, self.drop_prob, self.training)
+
+
+@MODELS.register_module()
+class Dropout(nn.Dropout):
+ """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
+ ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
+ ``DropPath``
+
+ Args:
+ drop_prob (float): Probability of the elements to be
+ zeroed. Default: 0.5.
+ inplace (bool): Do the operation inplace or not. Default: False.
+ """
+
+ def __init__(self, drop_prob: float = 0.5, inplace: bool = False):
+ super().__init__(p=drop_prob, inplace=inplace)
+
+
+def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:
+ """Builder for drop out layers."""
+ return MODELS.build(cfg, default_args=default_args)
diff --git a/external/cv/mmcv/cnn/bricks/generalized_attention.py b/external/cv/mmcv/cnn/bricks/generalized_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e1b466afe31f577723316214b39df6114882195
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/generalized_attention.py
@@ -0,0 +1,416 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import kaiming_init
+from mmengine.registry import MODELS
+
+
+@MODELS.register_module()
+class GeneralizedAttention(nn.Module):
+ """GeneralizedAttention module.
+
+ See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
+ (https://arxiv.org/abs/1904.05873) for details.
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ spatial_range (int): The spatial range. -1 indicates no spatial range
+ constraint. Default: -1.
+ num_heads (int): The head number of empirical_attention module.
+ Default: 9.
+ position_embedding_dim (int): The position embedding dimension.
+ Default: -1.
+ position_magnitude (int): A multiplier acting on coord difference.
+ Default: 1.
+ kv_stride (int): The feature stride acting on key/value feature map.
+ Default: 2.
+ q_stride (int): The feature stride acting on query feature map.
+ Default: 1.
+ attention_type (str): A binary indicator string for indicating which
+ items in generalized empirical_attention module are used.
+ Default: '1111'.
+
+ - '1000' indicates 'query and key content' (appr - appr) item,
+ - '0100' indicates 'query content and relative position'
+ (appr - position) item,
+ - '0010' indicates 'key content only' (bias - appr) item,
+ - '0001' indicates 'relative position only' (bias - position) item.
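+
+    Example (a minimal, illustrative sketch; sizes are arbitrary):
+        >>> import torch
+        >>> attn = GeneralizedAttention(in_channels=256, num_heads=8)
+        >>> x = torch.rand(1, 256, 20, 20)
+        >>> out = attn(x)  # residual output, same shape as the input
+        >>> out.shape == x.shape
+        True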
+ """
+
+ _abbr_ = 'gen_attention_block'
+
+ def __init__(self,
+ in_channels: int,
+ spatial_range: int = -1,
+ num_heads: int = 9,
+ position_embedding_dim: int = -1,
+ position_magnitude: int = 1,
+ kv_stride: int = 2,
+ q_stride: int = 1,
+ attention_type: str = '1111'):
+
+ super().__init__()
+
+ # hard range means local range for non-local operation
+ self.position_embedding_dim = (
+ position_embedding_dim
+ if position_embedding_dim > 0 else in_channels)
+
+ self.position_magnitude = position_magnitude
+ self.num_heads = num_heads
+ self.in_channels = in_channels
+ self.spatial_range = spatial_range
+ self.kv_stride = kv_stride
+ self.q_stride = q_stride
+ self.attention_type = [bool(int(_)) for _ in attention_type]
+ self.qk_embed_dim = in_channels // num_heads
+ out_c = self.qk_embed_dim * num_heads
+
+ if self.attention_type[0] or self.attention_type[1]:
+ self.query_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_c,
+ kernel_size=1,
+ bias=False)
+ self.query_conv.kaiming_init = True
+
+ if self.attention_type[0] or self.attention_type[2]:
+ self.key_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_c,
+ kernel_size=1,
+ bias=False)
+ self.key_conv.kaiming_init = True
+
+ self.v_dim = in_channels // num_heads
+ self.value_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=self.v_dim * num_heads,
+ kernel_size=1,
+ bias=False)
+ self.value_conv.kaiming_init = True
+
+ if self.attention_type[1] or self.attention_type[3]:
+ self.appr_geom_fc_x = nn.Linear(
+ self.position_embedding_dim // 2, out_c, bias=False)
+ self.appr_geom_fc_x.kaiming_init = True
+
+ self.appr_geom_fc_y = nn.Linear(
+ self.position_embedding_dim // 2, out_c, bias=False)
+ self.appr_geom_fc_y.kaiming_init = True
+
+ if self.attention_type[2]:
+ stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
+ appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
+ self.appr_bias = nn.Parameter(appr_bias_value)
+
+ if self.attention_type[3]:
+ stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
+ geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
+ self.geom_bias = nn.Parameter(geom_bias_value)
+
+ self.proj_conv = nn.Conv2d(
+ in_channels=self.v_dim * num_heads,
+ out_channels=in_channels,
+ kernel_size=1,
+ bias=True)
+ self.proj_conv.kaiming_init = True
+ self.gamma = nn.Parameter(torch.zeros(1))
+
+ if self.spatial_range >= 0:
+ # only works when non local is after 3*3 conv
+ if in_channels == 256:
+ max_len = 84
+ elif in_channels == 512:
+ max_len = 42
+
+ max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
+ local_constraint_map = np.ones(
+ (max_len, max_len, max_len_kv, max_len_kv), dtype=int)
+ for iy in range(max_len):
+ for ix in range(max_len):
+ local_constraint_map[
+ iy, ix,
+ max((iy - self.spatial_range) //
+ self.kv_stride, 0):min((iy + self.spatial_range +
+ 1) // self.kv_stride +
+ 1, max_len),
+ max((ix - self.spatial_range) //
+ self.kv_stride, 0):min((ix + self.spatial_range +
+ 1) // self.kv_stride +
+ 1, max_len)] = 0
+
+ self.local_constraint_map = nn.Parameter(
+ torch.from_numpy(local_constraint_map).byte(),
+ requires_grad=False)
+
+ if self.q_stride > 1:
+ self.q_downsample = nn.AvgPool2d(
+ kernel_size=1, stride=self.q_stride)
+ else:
+ self.q_downsample = None
+
+ if self.kv_stride > 1:
+ self.kv_downsample = nn.AvgPool2d(
+ kernel_size=1, stride=self.kv_stride)
+ else:
+ self.kv_downsample = None
+
+ self.init_weights()
+
+ def get_position_embedding(self,
+ h,
+ w,
+ h_kv,
+ w_kv,
+ q_stride,
+ kv_stride,
+ device,
+ dtype,
+ feat_dim,
+ wave_length=1000):
+ # the default type of Tensor is float32, leading to type mismatch
+ # in fp16 mode. Cast it to support fp16 mode.
+ h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
+ h_idxs = h_idxs.view((h, 1)) * q_stride
+
+ w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
+ w_idxs = w_idxs.view((w, 1)) * q_stride
+
+ h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
+ device=device, dtype=dtype)
+ h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
+
+ w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
+ device=device, dtype=dtype)
+ w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
+
+ # (h, h_kv, 1)
+ h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
+ h_diff *= self.position_magnitude
+
+ # (w, w_kv, 1)
+ w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
+ w_diff *= self.position_magnitude
+
+ feat_range = torch.arange(0, feat_dim / 4).to(
+ device=device, dtype=dtype)
+
+ dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
+ dim_mat = dim_mat**((4. / feat_dim) * feat_range)
+ dim_mat = dim_mat.view((1, 1, -1))
+
+ embedding_x = torch.cat(
+ ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
+
+ embedding_y = torch.cat(
+ ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
+
+ return embedding_x, embedding_y
+
+ def forward(self, x_input: torch.Tensor) -> torch.Tensor:
+ num_heads = self.num_heads
+
+ # use empirical_attention
+ if self.q_downsample is not None:
+ x_q = self.q_downsample(x_input)
+ else:
+ x_q = x_input
+ n, _, h, w = x_q.shape
+
+ if self.kv_downsample is not None:
+ x_kv = self.kv_downsample(x_input)
+ else:
+ x_kv = x_input
+ _, _, h_kv, w_kv = x_kv.shape
+
+ if self.attention_type[0] or self.attention_type[1]:
+ proj_query = self.query_conv(x_q).view(
+ (n, num_heads, self.qk_embed_dim, h * w))
+ proj_query = proj_query.permute(0, 1, 3, 2)
+
+ if self.attention_type[0] or self.attention_type[2]:
+ proj_key = self.key_conv(x_kv).view(
+ (n, num_heads, self.qk_embed_dim, h_kv * w_kv))
+
+ if self.attention_type[1] or self.attention_type[3]:
+ position_embed_x, position_embed_y = self.get_position_embedding(
+ h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
+ x_input.device, x_input.dtype, self.position_embedding_dim)
+ # (n, num_heads, w, w_kv, dim)
+ position_feat_x = self.appr_geom_fc_x(position_embed_x).\
+ view(1, w, w_kv, num_heads, self.qk_embed_dim).\
+ permute(0, 3, 1, 2, 4).\
+ repeat(n, 1, 1, 1, 1)
+
+ # (n, num_heads, h, h_kv, dim)
+ position_feat_y = self.appr_geom_fc_y(position_embed_y).\
+ view(1, h, h_kv, num_heads, self.qk_embed_dim).\
+ permute(0, 3, 1, 2, 4).\
+ repeat(n, 1, 1, 1, 1)
+
+ position_feat_x /= math.sqrt(2)
+ position_feat_y /= math.sqrt(2)
+
+ # accelerate for saliency only
+ if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim).\
+ repeat(n, 1, 1, 1)
+
+ energy = torch.matmul(appr_bias, proj_key).\
+ view(n, num_heads, 1, h_kv * w_kv)
+
+ h = 1
+ w = 1
+ else:
+            # (n, num_heads, h*w, h_kv*w_kv), query before key
+ if not self.attention_type[0]:
+ energy = torch.zeros(
+ n,
+ num_heads,
+ h,
+ w,
+ h_kv,
+ w_kv,
+ dtype=x_input.dtype,
+ device=x_input.device)
+
+ # attention_type[0]: appr - appr
+ # attention_type[1]: appr - position
+ # attention_type[2]: bias - appr
+ # attention_type[3]: bias - position
+ if self.attention_type[0] or self.attention_type[2]:
+ if self.attention_type[0] and self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim)
+ energy = torch.matmul(proj_query + appr_bias, proj_key).\
+ view(n, num_heads, h, w, h_kv, w_kv)
+
+ elif self.attention_type[0]:
+ energy = torch.matmul(proj_query, proj_key).\
+ view(n, num_heads, h, w, h_kv, w_kv)
+
+ elif self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim).\
+ repeat(n, 1, 1, 1)
+
+ energy += torch.matmul(appr_bias, proj_key).\
+ view(n, num_heads, 1, 1, h_kv, w_kv)
+
+ if self.attention_type[1] or self.attention_type[3]:
+ if self.attention_type[1] and self.attention_type[3]:
+ geom_bias = self.geom_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim)
+
+ proj_query_reshape = (proj_query + geom_bias).\
+ view(n, num_heads, h, w, self.qk_embed_dim)
+
+ energy_x = torch.matmul(
+ proj_query_reshape.permute(0, 1, 3, 2, 4),
+ position_feat_x.permute(0, 1, 2, 4, 3))
+ energy_x = energy_x.\
+ permute(0, 1, 3, 2, 4).unsqueeze(4)
+
+ energy_y = torch.matmul(
+ proj_query_reshape,
+ position_feat_y.permute(0, 1, 2, 4, 3))
+ energy_y = energy_y.unsqueeze(5)
+
+ energy += energy_x + energy_y
+
+ elif self.attention_type[1]:
+ proj_query_reshape = proj_query.\
+ view(n, num_heads, h, w, self.qk_embed_dim)
+ proj_query_reshape = proj_query_reshape.\
+ permute(0, 1, 3, 2, 4)
+ position_feat_x_reshape = position_feat_x.\
+ permute(0, 1, 2, 4, 3)
+ position_feat_y_reshape = position_feat_y.\
+ permute(0, 1, 2, 4, 3)
+
+ energy_x = torch.matmul(proj_query_reshape,
+ position_feat_x_reshape)
+ energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
+
+ energy_y = torch.matmul(proj_query_reshape,
+ position_feat_y_reshape)
+ energy_y = energy_y.unsqueeze(5)
+
+ energy += energy_x + energy_y
+
+ elif self.attention_type[3]:
+ geom_bias = self.geom_bias.\
+ view(1, num_heads, self.qk_embed_dim, 1).\
+ repeat(n, 1, 1, 1)
+
+ position_feat_x_reshape = position_feat_x.\
+ view(n, num_heads, w * w_kv, self.qk_embed_dim)
+
+ position_feat_y_reshape = position_feat_y.\
+ view(n, num_heads, h * h_kv, self.qk_embed_dim)
+
+ energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
+ energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
+
+ energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
+ energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
+
+ energy += energy_x + energy_y
+
+ energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
+
+ if self.spatial_range >= 0:
+ cur_local_constraint_map = \
+ self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
+ contiguous().\
+ view(1, 1, h*w, h_kv*w_kv)
+
+ energy = energy.masked_fill_(cur_local_constraint_map,
+ float('-inf'))
+
+ attention = F.softmax(energy, 3)
+
+ proj_value = self.value_conv(x_kv)
+ proj_value_reshape = proj_value.\
+ view((n, num_heads, self.v_dim, h_kv * w_kv)).\
+ permute(0, 1, 3, 2)
+
+ out = torch.matmul(attention, proj_value_reshape).\
+ permute(0, 1, 3, 2).\
+ contiguous().\
+ view(n, self.v_dim * self.num_heads, h, w)
+
+ out = self.proj_conv(out)
+
+ # output is downsampled, upsample back to input size
+ if self.q_downsample is not None:
+ out = F.interpolate(
+ out,
+ size=x_input.shape[2:],
+ mode='bilinear',
+ align_corners=False)
+
+ out = self.gamma * out + x_input
+ return out
+
+ def init_weights(self):
+ for m in self.modules():
+ if hasattr(m, 'kaiming_init') and m.kaiming_init:
+ kaiming_init(
+ m,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ bias=0,
+ distribution='uniform',
+ a=1)
diff --git a/external/cv/mmcv/cnn/bricks/hsigmoid.py b/external/cv/mmcv/cnn/bricks/hsigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c371cd460a7ca9801dd0b3bfb7968c9e92978f3
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/hsigmoid.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+
+import torch
+import torch.nn as nn
+from mmengine.registry import MODELS
+
+
+@MODELS.register_module()
+class HSigmoid(nn.Module):
+ """Hard Sigmoid Module. Apply the hard sigmoid function:
+ Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
+ Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1)
+
+ Note:
+ In MMCV v1.4.4, we modified the default value of args to align with
+ PyTorch official.
+
+ Args:
+ bias (float): Bias of the input feature map. Default: 3.0.
+ divisor (float): Divisor of the input feature map. Default: 6.0.
+ min_value (float): Lower bound value. Default: 0.0.
+ max_value (float): Upper bound value. Default: 1.0.
+
+ Returns:
+ Tensor: The output tensor.
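+
+    Example (a minimal sketch of the default variant):
+        >>> import torch
+        >>> act = HSigmoid()
+        >>> y = act(torch.tensor([-3., 0., 3.]))  # -> [0.0, 0.5, 1.0]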
+ """
+
+ def __init__(self,
+ bias: float = 3.0,
+ divisor: float = 6.0,
+ min_value: float = 0.0,
+ max_value: float = 1.0):
+ super().__init__()
+ warnings.warn(
+ 'In MMCV v1.4.4, we modified the default value of args to align '
+ 'with PyTorch official. Previous Implementation: '
+ 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). '
+ 'Current Implementation: '
+ 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).')
+ self.bias = bias
+ self.divisor = divisor
+ assert self.divisor != 0
+ self.min_value = min_value
+ self.max_value = max_value
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = (x + self.bias) / self.divisor
+
+ return x.clamp_(self.min_value, self.max_value)
diff --git a/external/cv/mmcv/cnn/bricks/hswish.py b/external/cv/mmcv/cnn/bricks/hswish.py
new file mode 100644
index 0000000000000000000000000000000000000000..d06d7bcb2aee2f1b8a275e83175bf5edd8a24acb
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/hswish.py
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from mmengine.registry import MODELS
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+
+
+class HSwish(nn.Module):
+ """Hard Swish Module.
+
+ This module applies the hard swish function:
+
+ .. math::
+ Hswish(x) = x * ReLU6(x + 3) / 6
+
+ Args:
+ inplace (bool): can optionally do the operation in-place.
+ Default: False.
+
+ Returns:
+ Tensor: The output tensor.
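+
+    Example (a minimal sketch):
+        >>> import torch
+        >>> act = HSwish()
+        >>> y = act(torch.tensor([-3., 0., 3.]))  # -> [0.0, 0.0, 3.0]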
+ """
+
+ def __init__(self, inplace: bool = False):
+ super().__init__()
+ self.act = nn.ReLU6(inplace)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * self.act(x + 3) / 6
+
+
+if (TORCH_VERSION == 'parrots'
+ or digit_version(TORCH_VERSION) < digit_version('1.7')):
+ # Hardswish is not supported when PyTorch version < 1.6.
+ # And Hardswish in PyTorch 1.6 does not support inplace.
+ MODELS.register_module(module=HSwish)
+else:
+ MODELS.register_module(module=nn.Hardswish, name='HSwish')
diff --git a/external/cv/mmcv/cnn/bricks/non_local.py b/external/cv/mmcv/cnn/bricks/non_local.py
new file mode 100644
index 0000000000000000000000000000000000000000..4828f3097bfa1f7ff230a8a60f26c97ff08935d3
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/non_local.py
@@ -0,0 +1,313 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from abc import ABCMeta
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+from mmengine.model import constant_init, normal_init
+from mmengine.registry import MODELS
+
+from .conv_module import ConvModule
+
+
+class _NonLocalNd(nn.Module, metaclass=ABCMeta):
+ """Basic Non-local module.
+
+ This module is proposed in
+ "Non-local Neural Networks"
+ Paper reference: https://arxiv.org/abs/1711.07971
+ Code reference: https://github.com/AlexHex7/Non-local_pytorch
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ reduction (int): Channel reduction ratio. Default: 2.
+ use_scale (bool): Whether to scale pairwise_weight by
+ `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
+ Default: True.
+ conv_cfg (None | dict): The config dict for convolution layers.
+ If not specified, it will use `nn.Conv2d` for convolution layers.
+ Default: None.
+ norm_cfg (None | dict): The config dict for normalization layers.
+ Default: None. (This parameter is only applicable to conv_out.)
+ mode (str): Options are `gaussian`, `concatenation`,
+ `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
+ """
+
+ def __init__(self,
+ in_channels: int,
+ reduction: int = 2,
+ use_scale: bool = True,
+ conv_cfg: Optional[Dict] = None,
+ norm_cfg: Optional[Dict] = None,
+ mode: str = 'embedded_gaussian',
+ **kwargs):
+ super().__init__()
+ self.in_channels = in_channels
+ self.reduction = reduction
+ self.use_scale = use_scale
+ self.inter_channels = max(in_channels // reduction, 1)
+ self.mode = mode
+
+ if mode not in [
+ 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
+ ]:
+ raise ValueError("Mode should be in 'gaussian', 'concatenation', "
+ f"'embedded_gaussian' or 'dot_product', but got "
+ f'{mode} instead.')
+
+        # g, theta, phi default to plain `nn.ConvNd` layers; ConvModule is
+        # used here so the conv config can be customized if needed.
+ self.g = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None) # type: ignore
+ self.conv_out = ConvModule(
+ self.inter_channels,
+ self.in_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=None)
+
+ if self.mode != 'gaussian':
+ self.theta = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+ self.phi = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+
+ if self.mode == 'concatenation':
+ self.concat_project = ConvModule(
+ self.inter_channels * 2,
+ 1,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False,
+ act_cfg=dict(type='ReLU'))
+
+ self.init_weights(**kwargs)
+
+ def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None:
+ if self.mode != 'gaussian':
+ for m in [self.g, self.theta, self.phi]:
+ normal_init(m.conv, std=std)
+ else:
+ normal_init(self.g.conv, std=std)
+ if zeros_init:
+ if self.conv_out.norm_cfg is None:
+ constant_init(self.conv_out.conv, 0)
+ else:
+ constant_init(self.conv_out.norm, 0)
+ else:
+ if self.conv_out.norm_cfg is None:
+ normal_init(self.conv_out.conv, std=std)
+ else:
+ normal_init(self.conv_out.norm, std=std)
+
+ def gaussian(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ pairwise_weight = pairwise_weight.softmax(dim=-1)
+ return pairwise_weight
+
+ def embedded_gaussian(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ if self.use_scale:
+ # theta_x.shape[-1] is `self.inter_channels`
+ pairwise_weight /= theta_x.shape[-1]**0.5
+ pairwise_weight = pairwise_weight.softmax(dim=-1)
+ return pairwise_weight
+
+ def dot_product(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ pairwise_weight /= pairwise_weight.shape[-1]
+ return pairwise_weight
+
+ def concatenation(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ h = theta_x.size(2)
+ w = phi_x.size(3)
+ theta_x = theta_x.repeat(1, 1, 1, w)
+ phi_x = phi_x.repeat(1, 1, h, 1)
+
+ concat_feature = torch.cat([theta_x, phi_x], dim=1)
+ pairwise_weight = self.concat_project(concat_feature)
+ n, _, h, w = pairwise_weight.size()
+ pairwise_weight = pairwise_weight.view(n, h, w)
+ pairwise_weight /= pairwise_weight.shape[-1]
+
+ return pairwise_weight
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Assume `reduction = 1`, then `inter_channels = C`;
+        # in `mode="gaussian"`, theta/phi use the full C channels directly.
+
+ # NonLocal1d x: [N, C, H]
+ # NonLocal2d x: [N, C, H, W]
+ # NonLocal3d x: [N, C, T, H, W]
+ n = x.size(0)
+
+ # NonLocal1d g_x: [N, H, C]
+ # NonLocal2d g_x: [N, HxW, C]
+ # NonLocal3d g_x: [N, TxHxW, C]
+ g_x = self.g(x).view(n, self.inter_channels, -1)
+ g_x = g_x.permute(0, 2, 1)
+
+ # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
+ # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
+ # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
+ if self.mode == 'gaussian':
+ theta_x = x.view(n, self.in_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ if self.sub_sample:
+ phi_x = self.phi(x).view(n, self.in_channels, -1)
+ else:
+ phi_x = x.view(n, self.in_channels, -1)
+ elif self.mode == 'concatenation':
+ theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
+ phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
+ else:
+ theta_x = self.theta(x).view(n, self.inter_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ phi_x = self.phi(x).view(n, self.inter_channels, -1)
+
+ pairwise_func = getattr(self, self.mode)
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = pairwise_func(theta_x, phi_x)
+
+ # NonLocal1d y: [N, H, C]
+ # NonLocal2d y: [N, HxW, C]
+ # NonLocal3d y: [N, TxHxW, C]
+ y = torch.matmul(pairwise_weight, g_x)
+ # NonLocal1d y: [N, C, H]
+ # NonLocal2d y: [N, C, H, W]
+ # NonLocal3d y: [N, C, T, H, W]
+ y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
+ *x.size()[2:])
+
+ output = x + self.conv_out(y)
+
+ return output
+
+
+class NonLocal1d(_NonLocalNd):
+ """1D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+ function (Note that the `sub_sample` is applied on spatial only).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv1d').
+ """
+
+ def __init__(self,
+ in_channels: int,
+ sub_sample: bool = False,
+ conv_cfg: Dict = dict(type='Conv1d'),
+ **kwargs):
+ super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
+
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool1d(kernel_size=2)
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
+
+
+@MODELS.register_module()
+class NonLocal2d(_NonLocalNd):
+ """2D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+ function (Note that the `sub_sample` is applied on spatial only).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv2d').
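+
+    Example (a minimal, illustrative sketch; sizes are arbitrary):
+        >>> import torch
+        >>> block = NonLocal2d(in_channels=32, sub_sample=True)
+        >>> x = torch.rand(2, 32, 14, 14)
+        >>> out = block(x)  # residual non-local output, same shape as input
+        >>> out.shape == x.shape
+        True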
+ """
+
+ _abbr_ = 'nonlocal_block'
+
+ def __init__(self,
+ in_channels: int,
+ sub_sample: bool = False,
+ conv_cfg: Dict = dict(type='Conv2d'),
+ **kwargs):
+ super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
+
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
+
+
+class NonLocal3d(_NonLocalNd):
+ """3D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+ function (Note that the `sub_sample` is applied on spatial only).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv3d').
+ """
+
+ def __init__(self,
+ in_channels: int,
+ sub_sample: bool = False,
+ conv_cfg: Dict = dict(type='Conv3d'),
+ **kwargs):
+ super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
diff --git a/external/cv/mmcv/cnn/bricks/norm.py b/external/cv/mmcv/cnn/bricks/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a67f3ffa7cebe9cd485a57da9a6fca43ef6d3540
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/norm.py
@@ -0,0 +1,161 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Dict, Tuple, Union
+
+import torch.nn as nn
+from mmengine.registry import MODELS
+from mmengine.utils import is_tuple_of
+from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm, _BatchNorm,
+ _InstanceNorm)
+
+MODELS.register_module('BN', module=nn.BatchNorm2d)
+MODELS.register_module('BN1d', module=nn.BatchNorm1d)
+MODELS.register_module('BN2d', module=nn.BatchNorm2d)
+MODELS.register_module('BN3d', module=nn.BatchNorm3d)
+MODELS.register_module('SyncBN', module=SyncBatchNorm)
+MODELS.register_module('GN', module=nn.GroupNorm)
+MODELS.register_module('LN', module=nn.LayerNorm)
+MODELS.register_module('IN', module=nn.InstanceNorm2d)
+MODELS.register_module('IN1d', module=nn.InstanceNorm1d)
+MODELS.register_module('IN2d', module=nn.InstanceNorm2d)
+MODELS.register_module('IN3d', module=nn.InstanceNorm3d)
+
+
+def infer_abbr(class_type):
+ """Infer abbreviation from the class name.
+
+ When we build a norm layer with `build_norm_layer()`, we want to preserve
+    the norm type in variable names, e.g., self.bn1, self.gn. This method will
+ infer the abbreviation to map class types to abbreviations.
+
+ Rule 1: If the class has the property "_abbr_", return the property.
+ Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
+ InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
+ "in" respectively.
+ Rule 3: If the class name contains "batch", "group", "layer" or "instance",
+ the abbreviation of this layer will be "bn", "gn", "ln" and "in"
+ respectively.
+    Rule 4: Otherwise, the abbreviation falls back to "norm_layer".
+
+ Args:
+ class_type (type): The norm layer type.
+
+ Returns:
+ str: The inferred abbreviation.
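+
+    Example (behavior follows from the rules above):
+        >>> import torch.nn as nn
+        >>> infer_abbr(nn.BatchNorm2d)
+        'bn'
+        >>> infer_abbr(nn.GroupNorm)
+        'gn'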
+ """
+ if not inspect.isclass(class_type):
+ raise TypeError(
+ f'class_type must be a type, but got {type(class_type)}')
+ if hasattr(class_type, '_abbr_'):
+ return class_type._abbr_
+ if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN
+ return 'in'
+ elif issubclass(class_type, _BatchNorm):
+ return 'bn'
+ elif issubclass(class_type, nn.GroupNorm):
+ return 'gn'
+ elif issubclass(class_type, nn.LayerNorm):
+ return 'ln'
+ else:
+ class_name = class_type.__name__.lower()
+ if 'batch' in class_name:
+ return 'bn'
+ elif 'group' in class_name:
+ return 'gn'
+ elif 'layer' in class_name:
+ return 'ln'
+ elif 'instance' in class_name:
+ return 'in'
+ else:
+ return 'norm_layer'
+
+
+def build_norm_layer(cfg: Dict,
+ num_features: int,
+ postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:
+ """Build normalization layer.
+
+ Args:
+ cfg (dict): The norm layer config, which should contain:
+
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a norm layer.
+        - requires_grad (bool, optional): Whether the layer's parameters
+          require gradient (False freezes them). Default: True.
+ num_features (int): Number of input channels.
+ postfix (int | str): The postfix to be appended into norm abbreviation
+ to create named layer.
+
+ Returns:
+ tuple[str, nn.Module]: The first element is the layer name consisting
+ of abbreviation and postfix, e.g., bn1, gn. The second element is the
+ created norm layer.
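+
+    Example (a minimal sketch):
+        >>> name, layer = build_norm_layer(dict(type='BN'), 64)
+        >>> name  # abbreviation plus the (empty) default postfix
+        'bn'
+        >>> # `layer` is an nn.BatchNorm2d(64) with requires_grad enabled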
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+
+ if inspect.isclass(layer_type):
+ norm_layer = layer_type
+ else:
+ # Switch registry to the target scope. If `norm_layer` cannot be found
+ # in the registry, fallback to search `norm_layer` in the
+ # mmengine.MODELS.
+ with MODELS.switch_scope_and_registry(None) as registry:
+ norm_layer = registry.get(layer_type)
+ if norm_layer is None:
+            raise KeyError(f'Cannot find {layer_type} in registry under '
+ f'scope name {registry.scope}')
+ abbr = infer_abbr(norm_layer)
+
+ assert isinstance(postfix, (int, str))
+ name = abbr + str(postfix)
+
+ requires_grad = cfg_.pop('requires_grad', True)
+ cfg_.setdefault('eps', 1e-5)
+ if norm_layer is not nn.GroupNorm:
+ layer = norm_layer(num_features, **cfg_)
+ if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
+ layer._specify_ddp_gpu_num(1)
+ else:
+ assert 'num_groups' in cfg_
+ layer = norm_layer(num_channels=num_features, **cfg_)
+
+ for param in layer.parameters():
+ param.requires_grad = requires_grad
+
+ return name, layer
+
+
+def is_norm(layer: nn.Module,
+ exclude: Union[type, tuple, None] = None) -> bool:
+ """Check if a layer is a normalization layer.
+
+ Args:
+ layer (nn.Module): The layer to be checked.
+ exclude (type | tuple[type]): Types to be excluded.
+
+ Returns:
+ bool: Whether the layer is a norm layer.
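+
+    Example:
+        >>> import torch.nn as nn
+        >>> is_norm(nn.BatchNorm2d(4))
+        True
+        >>> is_norm(nn.Conv2d(1, 1, 1))
+        False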
+ """
+ if exclude is not None:
+ if not isinstance(exclude, tuple):
+ exclude = (exclude, )
+ if not is_tuple_of(exclude, type):
+ raise TypeError(
+ f'"exclude" must be either None or type or a tuple of types, '
+ f'but got {type(exclude)}: {exclude}')
+
+ if exclude and isinstance(layer, exclude):
+ return False
+
+ all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
+ return isinstance(layer, all_norm_bases)
diff --git a/external/cv/mmcv/cnn/bricks/padding.py b/external/cv/mmcv/cnn/bricks/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..77218481cca68272252582265788830cebe36b40
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/padding.py
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Dict
+
+import torch.nn as nn
+from mmengine.registry import MODELS
+
+MODELS.register_module('zero', module=nn.ZeroPad2d)
+MODELS.register_module('reflect', module=nn.ReflectionPad2d)
+MODELS.register_module('replicate', module=nn.ReplicationPad2d)
+
+
+def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
+ """Build padding layer.
+
+ Args:
+ cfg (dict): The padding layer config, which should contain:
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a padding layer.
+
+ Returns:
+ nn.Module: Created padding layer.
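+
+    Example (a minimal sketch):
+        >>> pad = build_padding_layer(dict(type='reflect'), 2)
+        >>> # equivalent to nn.ReflectionPad2d(2)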
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+
+ cfg_ = cfg.copy()
+ padding_type = cfg_.pop('type')
+ if inspect.isclass(padding_type):
+ return padding_type(*args, **kwargs, **cfg_)
+ # Switch registry to the target scope. If `padding_layer` cannot be found
+ # in the registry, fallback to search `padding_layer` in the
+ # mmengine.MODELS.
+ with MODELS.switch_scope_and_registry(None) as registry:
+ padding_layer = registry.get(padding_type)
+ if padding_layer is None:
+        raise KeyError(f'Cannot find {padding_type} in registry under scope '
+ f'name {registry.scope}')
+ layer = padding_layer(*args, **kwargs, **cfg_)
+
+ return layer
diff --git a/external/cv/mmcv/cnn/bricks/plugin.py b/external/cv/mmcv/cnn/bricks/plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..682b4877dc628b727c9d1090aa4a9066daa8eedf
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/plugin.py
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+import platform
+from typing import Dict, Tuple, Union
+
+import torch.nn as nn
+from mmengine.registry import MODELS
+
+if platform.system() == 'Windows':
+ import regex as re # type: ignore
+else:
+ import re # type: ignore
+
+
+def infer_abbr(class_type: type) -> str:
+ """Infer abbreviation from the class name.
+
+ This method will infer the abbreviation to map class types to
+ abbreviations.
+
+ Rule 1: If the class has the property "abbr", return the property.
+ Rule 2: Otherwise, the abbreviation falls back to snake case of class
+ name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.
+
+ Args:
+ class_type (type): The norm layer type.
+
+ Returns:
+ str: The inferred abbreviation.
+ """
+
+ def camel2snack(word):
+ """Convert camel case word into snack case.
+
+ Modified from `inflection lib
+ `_.
+
+ Example::
+
+ >>> camel2snack("FancyBlock")
+ 'fancy_block'
+ """
+
+ word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
+ word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
+ word = word.replace('-', '_')
+ return word.lower()
+
+ if not inspect.isclass(class_type):
+ raise TypeError(
+ f'class_type must be a type, but got {type(class_type)}')
+ if hasattr(class_type, '_abbr_'):
+ return class_type._abbr_ # type: ignore
+ else:
+ return camel2snack(class_type.__name__)
+
+
+def build_plugin_layer(cfg: Dict,
+ postfix: Union[int, str] = '',
+ **kwargs) -> Tuple[str, nn.Module]:
+ """Build plugin layer.
+
+ Args:
+ cfg (dict): cfg should contain:
+
+ - type (str): identify plugin layer type.
+ - layer args: args needed to instantiate a plugin layer.
+ postfix (int, str): appended into norm abbreviation to
+ create named layer. Default: ''.
+
+ Returns:
+ tuple[str, nn.Module]: The first one is the concatenation of
+ abbreviation and postfix. The second is the created plugin layer.
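+
+    Example (a minimal sketch, assuming ``NonLocal2d`` from this package is
+    registered in MODELS):
+        >>> name, layer = build_plugin_layer(
+        ...     dict(type='NonLocal2d', in_channels=16))
+        >>> name  # NonLocal2d defines ``_abbr_ = 'nonlocal_block'``
+        'nonlocal_block'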
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if inspect.isclass(layer_type):
+ plugin_layer = layer_type
+ else:
+ # Switch registry to the target scope. If `plugin_layer` cannot be
+ # found in the registry, fallback to search `plugin_layer` in the
+ # mmengine.MODELS.
+ with MODELS.switch_scope_and_registry(None) as registry:
+ plugin_layer = registry.get(layer_type)
+ if plugin_layer is None:
+ raise KeyError(
+                f'Cannot find {layer_type} in registry under scope '
+ f'name {registry.scope}')
+ abbr = infer_abbr(plugin_layer)
+
+ assert isinstance(postfix, (int, str))
+ name = abbr + str(postfix)
+
+ layer = plugin_layer(**kwargs, **cfg_)
+
+ return name, layer
diff --git a/external/cv/mmcv/cnn/bricks/scale.py b/external/cv/mmcv/cnn/bricks/scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..5facada835b892969e2c8bbe072b0fa6985b8872
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/scale.py
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+class Scale(nn.Module):
+ """A learnable scale parameter.
+
+ This layer scales the input by a learnable factor. It multiplies a
+ learnable scale parameter of shape (1,) with input of any shape.
+
+ Args:
+ scale (float): Initial value of scale factor. Default: 1.0
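+
+    Example (a minimal sketch):
+        >>> import torch
+        >>> scale = Scale(2.0)
+        >>> y = scale(torch.ones(3))  # -> [2., 2., 2.], learnable factor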
+ """
+
+ def __init__(self, scale: float = 1.0):
+ super().__init__()
+ self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * self.scale
+
+
+class LayerScale(nn.Module):
+ """LayerScale layer.
+
+ Args:
+ dim (int): Dimension of input features.
+ inplace (bool): Whether performs operation in-place.
+ Default: `False`.
+        data_format (str): The input data format, could be 'channels_last'
+            or 'channels_first', representing (B, N, C) and
+            (B, C, H, W) format data respectively. Default: 'channels_last'.
+        scale (float): Initial value of scale factor. Default: 1e-5.
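+
+    Example (a minimal sketch):
+        >>> import torch
+        >>> ls = LayerScale(dim=64)
+        >>> y = ls(torch.rand(2, 196, 64))  # channels_last: scales last dim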
+ """
+
+ def __init__(self,
+ dim: int,
+ inplace: bool = False,
+ data_format: str = 'channels_last',
+ scale: float = 1e-5):
+ super().__init__()
+ assert data_format in ('channels_last', 'channels_first'), \
+ "'data_format' could only be channels_last or channels_first."
+ self.inplace = inplace
+ self.data_format = data_format
+ self.weight = nn.Parameter(torch.ones(dim) * scale)
+
+ def forward(self, x) -> torch.Tensor:
+ if self.data_format == 'channels_first':
+ shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2))))
+ else:
+ shape = tuple((*(1 for _ in range(x.dim() - 1)), -1))
+ if self.inplace:
+ return x.mul_(self.weight.view(*shape))
+ else:
+ return x * self.weight.view(*shape)
diff --git a/external/cv/mmcv/cnn/bricks/swish.py b/external/cv/mmcv/cnn/bricks/swish.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e5d33ad7612b53a548b582f97c7e4f6f4b4acc6
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/swish.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from mmengine.registry import MODELS
+
+
+@MODELS.register_module()
+class Swish(nn.Module):
+ """Swish Module.
+
+ This module applies the swish function:
+
+ .. math::
+ Swish(x) = x * Sigmoid(x)
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * torch.sigmoid(x)
diff --git a/external/cv/mmcv/cnn/bricks/transformer.py b/external/cv/mmcv/cnn/bricks/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..997fb6a5b74e139f7129891431256386cc43b09a
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/transformer.py
@@ -0,0 +1,956 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import math
+import warnings
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, ModuleList, Sequential
+from mmengine.registry import MODELS
+from mmengine.utils import deprecated_api_warning, to_2tuple
+
+from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,
+ build_norm_layer)
+from .drop import build_dropout
+from .scale import LayerScale
+
+# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
+try:
+ from mmcv.ops.multi_scale_deform_attn import \
+ MultiScaleDeformableAttention # noqa F401
+ warnings.warn(
+ ImportWarning(
+ '``MultiScaleDeformableAttention`` has been moved to '
+ '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501
+ '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501
+ 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501
+ ))
+
+except ImportError:
+    warnings.warn('Failed to import ``MultiScaleDeformableAttention`` from '
+ '``mmcv.ops.multi_scale_deform_attn``, '
+ 'You should install ``mmcv`` rather than ``mmcv-lite`` '
+ 'if you need this module. ')
+
+
+def build_positional_encoding(cfg, default_args=None):
+ """Builder for Position Encoding."""
+ return MODELS.build(cfg, default_args=default_args)
+
+
+def build_attention(cfg, default_args=None):
+ """Builder for attention."""
+ return MODELS.build(cfg, default_args=default_args)
+
+
+def build_feedforward_network(cfg, default_args=None):
+ """Builder for feed-forward network (FFN)."""
+ return MODELS.build(cfg, default_args=default_args)
+
+
+def build_transformer_layer(cfg, default_args=None):
+ """Builder for transformer layer."""
+ return MODELS.build(cfg, default_args=default_args)
+
+
+def build_transformer_layer_sequence(cfg, default_args=None):
+ """Builder for transformer encoder and transformer decoder."""
+ return MODELS.build(cfg, default_args=default_args)
+
+
+class AdaptivePadding(nn.Module):
+ """Applies padding adaptively to the input.
+
+    This module can make the input fully covered by the filter
+    you specify. It supports two modes, "same" and "corner". The
+    "same" mode is the same as the "SAME" padding mode in TensorFlow,
+    padding zeros around the input. The "corner" mode pads zeros
+    to the bottom right.
+
+ Args:
+ kernel_size (int | tuple): Size of the kernel. Default: 1.
+ stride (int | tuple): Stride of the filter. Default: 1.
+ dilation (int | tuple): Spacing between kernel elements.
+ Default: 1.
+ padding (str): Support "same" and "corner", "corner" mode
+ would pad zero to bottom right, and "same" mode would
+ pad zero around input. Default: "corner".
+
+ Example:
+ >>> kernel_size = 16
+ >>> stride = 16
+ >>> dilation = 1
+ >>> input = torch.rand(1, 1, 15, 17)
+ >>> adap_pad = AdaptivePadding(
+ >>> kernel_size=kernel_size,
+ >>> stride=stride,
+ >>> dilation=dilation,
+ >>> padding="corner")
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ >>> input = torch.rand(1, 1, 16, 17)
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ """
+
+ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+ super().__init__()
+ assert padding in ('same', 'corner')
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ self.padding = padding
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.dilation = dilation
+
+ def get_pad_shape(self, input_shape):
+ """Calculate the padding size of input.
+
+ Args:
+ input_shape (:obj:`torch.Size`): arrange as (H, W).
+
+ Returns:
+ Tuple[int]: The padding size along the
+ original H and W directions
+ """
+ input_h, input_w = input_shape
+ kernel_h, kernel_w = self.kernel_size
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(input_h / stride_h)
+ output_w = math.ceil(input_w / stride_w)
+ pad_h = max((output_h - 1) * stride_h +
+ (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+ pad_w = max((output_w - 1) * stride_w +
+ (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+ return pad_h, pad_w
+
+ def forward(self, x):
+ """Add padding to `x`
+
+ Args:
+ x (Tensor): Input tensor has shape (B, C, H, W).
+
+ Returns:
+ Tensor: The tensor with adaptive padding
+ """
+ pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+ if pad_h > 0 or pad_w > 0:
+ if self.padding == 'corner':
+ x = F.pad(x, [0, pad_w, 0, pad_h])
+ elif self.padding == 'same':
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+ pad_h - pad_h // 2
+ ])
+ return x
+
+
+class PatchEmbed(BaseModule):
+ """Image to Patch Embedding.
+
+ We use a conv layer to implement PatchEmbed.
+
+ Args:
+ in_channels (int): The num of input channels. Default: 3
+ embed_dims (int): The dimensions of embedding. Default: 768
+ conv_type (str): The type of convolution
+ to generate patch embedding. Default: "Conv2d".
+ kernel_size (int): The kernel_size of embedding conv. Default: 16.
+ stride (int): The slide stride of embedding conv.
+ Default: 16.
+ padding (int | tuple | string): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int): The dilation rate of embedding conv. Default: 1.
+ bias (bool): Bias of embed conv. Default: True.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: None.
+        input_size (int | tuple | None): The size of input, which will be
+            used to calculate the out size (exposed as ``init_out_size``).
+            Default: None.
+ init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+ Default: None.
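+
+    Example (a minimal sketch with the ViT-like defaults):
+        >>> import torch
+        >>> patch_embed = PatchEmbed(in_channels=3, embed_dims=768)
+        >>> x, out_size = patch_embed(torch.rand(1, 3, 224, 224))
+        >>> x.shape, out_size  # 224 / 16 = 14 patches per side
+        (torch.Size([1, 196, 768]), (14, 14))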
+ """
+
+ def __init__(self,
+ in_channels=3,
+ embed_dims=768,
+ conv_type='Conv2d',
+ kernel_size=16,
+ stride=16,
+ padding='corner',
+ dilation=1,
+ bias=True,
+ norm_cfg=None,
+ input_size=None,
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+
+ self.embed_dims = embed_dims
+ if stride is None:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adaptive_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of conv
+ padding = 0
+ else:
+ self.adaptive_padding = None
+ padding = to_2tuple(padding)
+
+ self.projection = build_conv_layer(
+ dict(type=conv_type),
+ in_channels=in_channels,
+ out_channels=embed_dims,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+ else:
+ self.norm = None
+
+ if input_size:
+ input_size = to_2tuple(input_size)
+ # `init_out_size` would be used outside to
+ # calculate the num_patches
+ # e.g. when `use_abs_pos_embed` outside
+ self.init_input_size = input_size
+ if self.adaptive_padding:
+ pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
+ input_h, input_w = input_size
+ input_h = input_h + pad_h
+ input_w = input_w + pad_w
+ input_size = (input_h, input_w)
+
+ # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+ h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+ (kernel_size[0] - 1) - 1) // stride[0] + 1
+ w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+ (kernel_size[1] - 1) - 1) // stride[1] + 1
+ self.init_out_size = (h_out, w_out)
+ else:
+ self.init_input_size = None
+ self.init_out_size = None
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (out_h, out_w).
+ """
+
+ if self.adaptive_padding:
+ x = self.adaptive_padding(x)
+
+ x = self.projection(x)
+ out_size = (x.shape[2], x.shape[3])
+ x = x.flatten(2).transpose(1, 2)
+ if self.norm is not None:
+ x = self.norm(x)
+ return x, out_size
+
+
+class PatchMerging(BaseModule):
+ """Merge patch feature map.
+
+    This layer groups the feature map by kernel_size, and applies norm and
+    linear layers to the grouped feature map (used in Swin Transformer).
+ Our implementation uses `nn.Unfold` to
+ merge patches, which is about 25% faster than the original
+ implementation. However, we need to modify pretrained
+ models for compatibility.
+
+ Args:
+        in_channels (int): The num of input channels.
+ out_channels (int): The num of output channels.
+ kernel_size (int | tuple, optional): the kernel size in the unfold
+ layer. Defaults to 2.
+ stride (int | tuple, optional): the stride of the sliding blocks in the
+ unfold layer. Default: None. (Would be set as `kernel_size`)
+ padding (int | tuple | string ): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int | tuple, optional): dilation parameter in the unfold
+ layer. Default: 1.
+ bias (bool, optional): Whether to add bias in linear layer or not.
+ Defaults: False.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (dict, optional): The extra config for initialization.
+ Default: None.
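+
+    Example (a minimal, Swin-like sketch):
+        >>> import torch
+        >>> merge = PatchMerging(in_channels=96, out_channels=192)
+        >>> x = torch.rand(1, 56 * 56, 96)
+        >>> out, out_size = merge(x, (56, 56))
+        >>> out.shape, out_size
+        (torch.Size([1, 784, 192]), (28, 28))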
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=2,
+ stride=None,
+ padding='corner',
+ dilation=1,
+ bias=False,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+        if not stride:
+            # stride defaults to kernel_size, as documented above
+            stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adaptive_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of unfold
+ padding = 0
+ else:
+ self.adaptive_padding = None
+
+ padding = to_2tuple(padding)
+ self.sampler = nn.Unfold(
+ kernel_size=kernel_size,
+ dilation=dilation,
+ padding=padding,
+ stride=stride)
+
+ sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+ else:
+ self.norm = None
+
+ self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+ def forward(self, x, input_size):
+ """
+ Args:
+ x (Tensor): Has shape (B, H*W, C_in).
+ input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
+ Default: None.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (Merged_H, Merged_W).
+ """
+ B, L, C = x.shape
+        assert isinstance(input_size, Sequence), \
+            f'Expect input_size to be a `Sequence`, but got {input_size}'
+
+ H, W = input_size
+ assert L == H * W, 'input feature has wrong size'
+
+ x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
+
+ if self.adaptive_padding:
+ x = self.adaptive_padding(x)
+ H, W = x.shape[-2:]
+
+ # Use nn.Unfold to merge patch. About 25% faster than original method,
+ # but need to modify pretrained model for compatibility
+        # if kernel_size=2 and stride=2, x should have shape (B, 4*C, H/2*W/2)
+ x = self.sampler(x)
+
+ out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+ (self.sampler.kernel_size[0] - 1) -
+ 1) // self.sampler.stride[0] + 1
+ out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+ (self.sampler.kernel_size[1] - 1) -
+ 1) // self.sampler.stride[1] + 1
+
+ output_size = (out_h, out_w)
+ x = x.transpose(1, 2) # B, H/2*W/2, 4*C
+ x = self.norm(x) if self.norm else x
+ x = self.reduction(x)
+ return x, output_size
+
+
+@MODELS.register_module()
+class MultiheadAttention(BaseModule):
+ """A wrapper for ``torch.nn.MultiheadAttention``.
+
+ This module implements MultiheadAttention with identity connection,
+ and positional encoding is also passed as input.
+
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ attn_drop (float): A Dropout layer on attn_output_weights.
+ Default: 0.0.
+ proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+ Default: 0.0.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ batch_first (bool): When it is True, Key, Query and Value are shape of
+ (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+            Default to False.
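+
+    Example (a minimal, illustrative sketch):
+        >>> import torch
+        >>> attn = MultiheadAttention(embed_dims=256, num_heads=8,
+        ...                           batch_first=True)
+        >>> query = torch.rand(2, 100, 256)
+        >>> out = attn(query)  # self-attention plus identity connection
+        >>> out.shape
+        torch.Size([2, 100, 256])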
+ """
+
+ def __init__(self,
+ embed_dims,
+ num_heads,
+ attn_drop=0.,
+ proj_drop=0.,
+ dropout_layer=dict(type='Dropout', drop_prob=0.),
+ init_cfg=None,
+ batch_first=False,
+ **kwargs):
+ super().__init__(init_cfg)
+ if 'dropout' in kwargs:
+ warnings.warn(
+ 'The arguments `dropout` in MultiheadAttention '
+ 'has been deprecated, now you can separately '
+ 'set `attn_drop`(float), proj_drop(float), '
+ 'and `dropout_layer`(dict) ', DeprecationWarning)
+ attn_drop = kwargs['dropout']
+ dropout_layer['drop_prob'] = kwargs.pop('dropout')
+
+ self.embed_dims = embed_dims
+ self.num_heads = num_heads
+ self.batch_first = batch_first
+
+ self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
+ **kwargs)
+
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else nn.Identity()
+
+ # @deprecated_api_warning({'residual': 'identity'},
+ # cls_name='MultiheadAttention')
+ def forward(self,
+ query,
+ key=None,
+ value=None,
+ identity=None,
+ query_pos=None,
+ key_pos=None,
+ attn_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `MultiheadAttention`.
+
+ **kwargs allow passing a more general data flow when combining
+ with other operations in `transformerlayer`.
+
+ Args:
+ query (Tensor): The input query with shape [num_queries, bs,
+ embed_dims] if self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+ key (Tensor): The key tensor with shape [num_keys, bs,
+ embed_dims] if self.batch_first is False, else
+ [bs, num_keys, embed_dims] .
+ If None, the ``query`` will be used. Defaults to None.
+ value (Tensor): The value tensor with same shape as `key`.
+ Same in `nn.MultiheadAttention.forward`. Defaults to None.
+ If None, the `key` will be used.
+ identity (Tensor): This tensor, with the same shape as x,
+ will be used for the identity link.
+ If None, `x` will be used. Defaults to None.
+ query_pos (Tensor): The positional encoding for query, with
+ the same shape as `x`. If not None, it will
+ be added to `x` before forward function. Defaults to None.
+ key_pos (Tensor): The positional encoding for `key`, with the
+ same shape as `key`. Defaults to None. If not None, it will
+ be added to `key` before forward function. If None, and
+ `query_pos` has the same shape as `key`, then `query_pos`
+ will be used for `key_pos`. Defaults to None.
+ attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+ num_keys]. Same in `nn.MultiheadAttention.forward`.
+ Defaults to None.
+ key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+ Defaults to None.
+
+ Returns:
+ Tensor: forwarded results with shape
+ [num_queries, bs, embed_dims]
+ if self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+ """
+
+ if key is None:
+ key = query
+ if value is None:
+ value = key
+ if identity is None:
+ identity = query
+ if key_pos is None:
+ if query_pos is not None:
+ # use query_pos if key_pos is not available
+ if query_pos.shape == key.shape:
+ key_pos = query_pos
+ else:
+                    warnings.warn(f'position encoding of key is '
+ f'missing in {self.__class__.__name__}.')
+ if query_pos is not None:
+ query = query + query_pos
+ if key_pos is not None:
+ key = key + key_pos
+
+ # Because the dataflow('key', 'query', 'value') of
+ # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+ # embed_dims), We should adjust the shape of dataflow from
+ # batch_first (batch, num_query, embed_dims) to num_query_first
+ # (num_query ,batch, embed_dims), and recover ``attn_output``
+ # from num_query_first to batch_first.
+ if self.batch_first:
+ query = query.transpose(0, 1)
+ key = key.transpose(0, 1)
+ value = value.transpose(0, 1)
+
+ out = self.attn(
+ query=query,
+ key=key,
+ value=value,
+ attn_mask=attn_mask,
+ key_padding_mask=key_padding_mask)[0]
+
+ if self.batch_first:
+ out = out.transpose(0, 1)
+
+ return identity + self.dropout_layer(self.proj_drop(out))
+
+
+@MODELS.register_module()
+class FFN(BaseModule):
+ """Implements feed-forward networks (FFNs) with identity connection.
+
+ Args:
+ embed_dims (int): The feature dimension. Same as
+ `MultiheadAttention`. Defaults: 256.
+ feedforward_channels (int): The hidden dimension of FFNs.
+ Defaults: 1024.
+ num_fcs (int, optional): The number of fully-connected layers in
+ FFNs. Default: 2.
+ act_cfg (dict, optional): The activation config for FFNs.
+ Default: dict(type='ReLU')
+ ffn_drop (float, optional): Probability of an element to be
+ zeroed in FFN. Default 0.0.
+ add_identity (bool, optional): Whether to add the
+ identity connection. Default: `True`.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+        layer_scale_init_value (float): Initial value of scale factor in
+            LayerScale. Default: 0.0 (LayerScale disabled).
+ """
+
+ # @deprecated_api_warning(
+ # {
+ # 'dropout': 'ffn_drop',
+ # 'add_residual': 'add_identity'
+ # },
+ # cls_name='FFN')
+ def __init__(self,
+ embed_dims=256,
+ feedforward_channels=1024,
+ num_fcs=2,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ffn_drop=0.,
+ dropout_layer=None,
+ add_identity=True,
+ init_cfg=None,
+ layer_scale_init_value=0.):
+ super().__init__(init_cfg)
+        assert num_fcs >= 2, 'num_fcs should be no less ' \
+            f'than 2, but got {num_fcs}.'
+ self.embed_dims = embed_dims
+ self.feedforward_channels = feedforward_channels
+ self.num_fcs = num_fcs
+
+ layers = []
+ in_channels = embed_dims
+ for _ in range(num_fcs - 1):
+ layers.append(
+ Sequential(
+ Linear(in_channels, feedforward_channels),
+ build_activation_layer(act_cfg), nn.Dropout(ffn_drop)))
+ in_channels = feedforward_channels
+ layers.append(Linear(feedforward_channels, embed_dims))
+ layers.append(nn.Dropout(ffn_drop))
+ self.layers = Sequential(*layers)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else torch.nn.Identity()
+ self.add_identity = add_identity
+
+ if layer_scale_init_value > 0:
+ self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value)
+ else:
+ self.gamma2 = nn.Identity()
+
+ # @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
+ def forward(self, x, identity=None):
+ """Forward function for `FFN`.
+
+ The function would add x to the output tensor if residue is None.
+ """
+ out = self.layers(x)
+ out = self.gamma2(out)
+ if not self.add_identity:
+ return self.dropout_layer(out)
+ if identity is None:
+ identity = x
+ return identity + self.dropout_layer(out)
+
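+# A minimal FFN usage sketch (illustrative): two stacked Linear layers with
+# ReLU in between, a final Linear back to `embed_dims`, and a residual add.
+#
+# >>> ffn = FFN(embed_dims=256, feedforward_channels=1024)
+# >>> x = torch.randn(2, 100, 256)
+# >>> ffn(x).shape  # identity defaults to x
+# torch.Size([2, 100, 256])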
+
+@MODELS.register_module()
+class BaseTransformerLayer(BaseModule):
+ """Base `TransformerLayer` for vision transformer.
+
+    It can be built from `mmcv.ConfigDict` and supports flexible
+    customization, for example, using any number of `FFN` or `LN` modules,
+    and using different kinds of `attention` by specifying a list of
+    `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports
+    `prenorm` when you specify `norm` as the first element of
+    `operation_order`. More details about `prenorm`: `On Layer Normalization
+    in the Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
+
+ Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
+            Configs for `self_attention` or `cross_attention` modules.
+            The order of the configs in the list should be consistent with
+            the corresponding attentions in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config. Default: None.
+        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
+            Configs for FFN. The order of the configs in the list should be
+            consistent with the corresponding FFNs in operation_order.
+            If it is a dict, all of the FFN modules in operation_order
+            will be built with this config.
+        operation_order (tuple[str]): The execution order of operations
+            in the transformer, such as ('self_attn', 'norm', 'ffn', 'norm').
+            Supports `prenorm` when you specify the first element as `norm`.
+            Default: None.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+        batch_first (bool): Whether Key, Query and Value have the shape
+            (batch, n, embed_dim) rather than (n, batch, embed_dim).
+            Default: False.
+ """
+
+ def __init__(self,
+ attn_cfgs=None,
+ ffn_cfgs=dict(
+ type='FFN',
+ embed_dims=256,
+ feedforward_channels=1024,
+ num_fcs=2,
+ ffn_drop=0.,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ),
+ operation_order=None,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None,
+ batch_first=False,
+ **kwargs):
+
+ deprecated_args = dict(
+ feedforward_channels='feedforward_channels',
+ ffn_dropout='ffn_drop',
+ ffn_num_fcs='num_fcs')
+ for ori_name, new_name in deprecated_args.items():
+ if ori_name in kwargs:
+                warnings.warn(
+                    f'The argument `{ori_name}` in BaseTransformerLayer '
+                    f'has been deprecated; you should now set `{new_name}` '
+                    f'and other FFN related arguments '
+                    f'in a dict named `ffn_cfgs`. ', DeprecationWarning)
+ ffn_cfgs[new_name] = kwargs[ori_name]
+
+ super().__init__(init_cfg)
+
+ self.batch_first = batch_first
+
+ assert set(operation_order) & {
+ 'self_attn', 'norm', 'ffn', 'cross_attn'} == \
+ set(operation_order), f'The operation_order of' \
+ f' {self.__class__.__name__} should ' \
+ f'contains all four operation type ' \
+ f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
+
+ num_attn = operation_order.count('self_attn') + operation_order.count(
+ 'cross_attn')
+ if isinstance(attn_cfgs, dict):
+ attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
+ else:
+            assert num_attn == len(attn_cfgs), f'The length ' \
+                f'of attn_cfgs {len(attn_cfgs)} is ' \
+                f'not consistent with the number of attentions ' \
+                f'{num_attn} in operation_order {operation_order}.'
+
+ self.num_attn = num_attn
+ self.operation_order = operation_order
+ self.norm_cfg = norm_cfg
+ self.pre_norm = operation_order[0] == 'norm'
+ self.attentions = ModuleList()
+
+ index = 0
+ for operation_name in operation_order:
+ if operation_name in ['self_attn', 'cross_attn']:
+ if 'batch_first' in attn_cfgs[index]:
+ assert self.batch_first == attn_cfgs[index]['batch_first']
+ else:
+ attn_cfgs[index]['batch_first'] = self.batch_first
+ attention = build_attention(attn_cfgs[index])
+ # Some custom attentions used as `self_attn`
+ # or `cross_attn` can have different behavior.
+ attention.operation_name = operation_name
+ self.attentions.append(attention)
+ index += 1
+
+ self.embed_dims = self.attentions[0].embed_dims
+
+ self.ffns = ModuleList()
+ num_ffns = operation_order.count('ffn')
+ if isinstance(ffn_cfgs, dict):
+ ffn_cfgs = ConfigDict(ffn_cfgs)
+ if isinstance(ffn_cfgs, dict):
+ ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
+ assert len(ffn_cfgs) == num_ffns
+ for ffn_index in range(num_ffns):
+ if 'embed_dims' not in ffn_cfgs[ffn_index]:
+ ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
+ else:
+ assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
+ self.ffns.append(
+ build_feedforward_network(ffn_cfgs[ffn_index],
+ dict(type='FFN')))
+
+ self.norms = ModuleList()
+ num_norms = operation_order.count('norm')
+ for _ in range(num_norms):
+ self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
+
+ def forward(self,
+ query,
+ key=None,
+ value=None,
+ query_pos=None,
+ key_pos=None,
+ attn_masks=None,
+ query_key_padding_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `TransformerDecoderLayer`.
+
+ **kwargs contains some specific arguments of attentions.
+
+ Args:
+            query (Tensor): The input query with shape
+                [num_queries, bs, embed_dims] if
+                self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims].
+ value (Tensor): The value tensor with same shape as `key`.
+ query_pos (Tensor): The positional encoding for `query`.
+ Default: None.
+ key_pos (Tensor): The positional encoding for `key`.
+ Default: None.
+            attn_masks (List[Tensor] | None): 2D Tensors used in the
+                calculation of the corresponding attentions. The length of
+                the list should equal the number of `attention` entries in
+                `operation_order`. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in `self_attn` layers.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor for `key`, with
+                shape [bs, num_keys]. Default: None.
+
+ Returns:
+ Tensor: forwarded results with shape [num_queries, bs, embed_dims].
+ """
+
+ norm_index = 0
+ attn_index = 0
+ ffn_index = 0
+ identity = query
+ if attn_masks is None:
+ attn_masks = [None for _ in range(self.num_attn)]
+ elif isinstance(attn_masks, torch.Tensor):
+ attn_masks = [
+ copy.deepcopy(attn_masks) for _ in range(self.num_attn)
+ ]
+            warnings.warn(f'Using the same attn_mask in all attentions in '
+                          f'{self.__class__.__name__}.')
+ else:
+ assert len(attn_masks) == self.num_attn, f'The length of ' \
+ f'attn_masks {len(attn_masks)} must be equal ' \
+ f'to the number of attention in ' \
+ f'operation_order {self.num_attn}'
+
+ for layer in self.operation_order:
+ if layer == 'self_attn':
+ temp_key = temp_value = query
+ query = self.attentions[attn_index](
+ query,
+ temp_key,
+ temp_value,
+ identity if self.pre_norm else None,
+ query_pos=query_pos,
+ key_pos=query_pos,
+ attn_mask=attn_masks[attn_index],
+ key_padding_mask=query_key_padding_mask,
+ **kwargs)
+ attn_index += 1
+ identity = query
+
+ elif layer == 'norm':
+ query = self.norms[norm_index](query)
+ norm_index += 1
+
+ elif layer == 'cross_attn':
+ query = self.attentions[attn_index](
+ query,
+ key,
+ value,
+ identity if self.pre_norm else None,
+ query_pos=query_pos,
+ key_pos=key_pos,
+ attn_mask=attn_masks[attn_index],
+ key_padding_mask=key_padding_mask,
+ **kwargs)
+ attn_index += 1
+ identity = query
+
+ elif layer == 'ffn':
+ query = self.ffns[ffn_index](
+ query, identity if self.pre_norm else None)
+ ffn_index += 1
+
+ return query
+
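+# Config sketch for a standard post-norm decoder layer (values illustrative):
+# each 'self_attn'/'cross_attn' consumes the next entry of `attn_cfgs`, and
+# every 'norm' gets its own LayerNorm built from `norm_cfg`.
+#
+# >>> layer = BaseTransformerLayer(
+# ...     attn_cfgs=dict(type='MultiheadAttention', embed_dims=256,
+# ...                    num_heads=8),
+# ...     operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+# ...                      'ffn', 'norm'))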
+
+@MODELS.register_module()
+class TransformerLayerSequence(BaseModule):
+ """Base class for TransformerEncoder and TransformerDecoder in vision
+ transformer.
+
+    As the base class of Encoder and Decoder in vision transformers, it
+    supports customization such as specifying different kinds of
+    `transformer_layer` in `transformer_coder`.
+
+ Args:
+        transformerlayers (list[obj:`mmcv.ConfigDict`] |
+            obj:`mmcv.ConfigDict`): Config of the transformer layers
+            in TransformerCoder. If it is an obj:`mmcv.ConfigDict`,
+            it will be repeated `num_layers` times to form a
+            list[`mmcv.ConfigDict`]. Default: None.
+        num_layers (int): The number of `TransformerLayer`. Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+ """
+
+ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
+ super().__init__(init_cfg)
+ if isinstance(transformerlayers, dict):
+ transformerlayers = [
+ copy.deepcopy(transformerlayers) for _ in range(num_layers)
+ ]
+ else:
+ assert isinstance(transformerlayers, list) and \
+ len(transformerlayers) == num_layers
+ self.num_layers = num_layers
+ self.layers = ModuleList()
+ for i in range(num_layers):
+ self.layers.append(build_transformer_layer(transformerlayers[i]))
+ self.embed_dims = self.layers[0].embed_dims
+ self.pre_norm = self.layers[0].pre_norm
+
+ def forward(self,
+ query,
+ key,
+ value,
+ query_pos=None,
+ key_pos=None,
+ attn_masks=None,
+ query_key_padding_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `TransformerCoder`.
+
+ Args:
+ query (Tensor): Input query with shape
+ `(num_queries, bs, embed_dims)`.
+ key (Tensor): The key tensor with shape
+ `(num_keys, bs, embed_dims)`.
+ value (Tensor): The value tensor with shape
+ `(num_keys, bs, embed_dims)`.
+ query_pos (Tensor): The positional encoding for `query`.
+ Default: None.
+ key_pos (Tensor): The positional encoding for `key`.
+ Default: None.
+ attn_masks (List[Tensor], optional): Each element is 2D Tensor
+ which is used in calculation of corresponding attention in
+ operation_order. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in self-attention.
+                Default: None.
+            key_padding_mask (Tensor): ByteTensor for `key`, with
+                shape [bs, num_keys]. Default: None.
+
+ Returns:
+ Tensor: results with shape [num_queries, bs, embed_dims].
+ """
+ for layer in self.layers:
+ query = layer(
+ query,
+ key,
+ value,
+ query_pos=query_pos,
+ key_pos=key_pos,
+ attn_masks=attn_masks,
+ query_key_padding_mask=query_key_padding_mask,
+ key_padding_mask=key_padding_mask,
+ **kwargs)
+ return query
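+
+
+# Stacking sketch (illustrative): a single dict config is deep-copied
+# `num_layers` times, so every layer shares the architecture but has its
+# own weights.
+#
+# >>> encoder = TransformerLayerSequence(
+# ...     transformerlayers=dict(
+# ...         type='BaseTransformerLayer',
+# ...         attn_cfgs=dict(type='MultiheadAttention', embed_dims=256,
+# ...                        num_heads=8),
+# ...         operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+# ...     num_layers=6)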
diff --git a/external/cv/mmcv/cnn/bricks/upsample.py b/external/cv/mmcv/cnn/bricks/upsample.py
new file mode 100644
index 0000000000000000000000000000000000000000..e73fc73c1832f97da9ff5b917436e619aa954150
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/upsample.py
@@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import xavier_init
+from mmengine.registry import MODELS
+
+MODELS.register_module('nearest', module=nn.Upsample)
+MODELS.register_module('bilinear', module=nn.Upsample)
+
+
+@MODELS.register_module(name='pixel_shuffle')
+class PixelShufflePack(nn.Module):
+ """Pixel Shuffle upsample layer.
+
+ This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
+ achieve a simple upsampling with pixel shuffle.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ scale_factor (int): Upsample ratio.
+ upsample_kernel (int): Kernel size of the conv layer to expand the
+ channels.
+ """
+
+ def __init__(self, in_channels: int, out_channels: int, scale_factor: int,
+ upsample_kernel: int):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.scale_factor = scale_factor
+ self.upsample_kernel = upsample_kernel
+ self.upsample_conv = nn.Conv2d(
+ self.in_channels,
+ self.out_channels * scale_factor * scale_factor,
+ self.upsample_kernel,
+ padding=(self.upsample_kernel - 1) // 2)
+ self.init_weights()
+
+ def init_weights(self):
+ xavier_init(self.upsample_conv, distribution='uniform')
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.upsample_conv(x)
+ x = F.pixel_shuffle(x, self.scale_factor)
+ return x
+
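+# Shape sketch (illustrative): the conv expands channels by
+# scale_factor ** 2 and F.pixel_shuffle trades them for spatial resolution.
+#
+# >>> up = PixelShufflePack(64, 32, scale_factor=2, upsample_kernel=3)
+# >>> up(torch.randn(1, 64, 16, 16)).shape
+# torch.Size([1, 32, 32, 32])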
+
+def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
+ """Build upsample layer.
+
+ Args:
+ cfg (dict): The upsample layer config, which should contain:
+
+ - type (str): Layer type.
+ - scale_factor (int): Upsample ratio, which is not applicable to
+ deconv.
+            - layer args: Args needed to instantiate an upsample layer.
+        args (argument list): Arguments passed to the ``__init__``
+            method of the corresponding upsample layer.
+        kwargs (keyword arguments): Keyword arguments passed to the
+            ``__init__`` method of the corresponding upsample layer.
+
+ Returns:
+ nn.Module: Created upsample layer.
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
+ if 'type' not in cfg:
+ raise KeyError(
+ f'the cfg dict must contain the key "type", but got {cfg}')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+
+ if inspect.isclass(layer_type):
+ upsample = layer_type
+ # Switch registry to the target scope. If `upsample` cannot be found
+ # in the registry, fallback to search `upsample` in the
+ # mmengine.MODELS.
+ else:
+ with MODELS.switch_scope_and_registry(None) as registry:
+ upsample = registry.get(layer_type)
+ if upsample is None:
+            raise KeyError(f'Cannot find {layer_type} in registry under '
+                           f'scope name {registry.scope}')
+ if upsample is nn.Upsample:
+ cfg_['mode'] = layer_type
+ layer = upsample(*args, **kwargs, **cfg_)
+ return layer
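+
+
+# Example configs (illustrative): for 'nearest'/'bilinear' the type string is
+# reused as the interpolation `mode` of nn.Upsample.
+#
+# >>> build_upsample_layer(dict(type='nearest', scale_factor=2))
+# >>> build_upsample_layer(dict(type='pixel_shuffle', in_channels=64,
+# ...                           out_channels=32, scale_factor=2,
+# ...                           upsample_kernel=3))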
diff --git a/external/cv/mmcv/cnn/bricks/wrappers.py b/external/cv/mmcv/cnn/bricks/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c4502b1b232d637edc7ff9f2f7b3ed51a082db
--- /dev/null
+++ b/external/cv/mmcv/cnn/bricks/wrappers.py
@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501
+
+Wrap some nn modules to support empty tensor input. Currently, these wrappers
+are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
+heads are trained on only positive RoIs.
+"""
+import math
+
+import torch
+import torch.nn as nn
+from mmengine.registry import MODELS
+from torch.nn.modules.utils import _pair, _triple
+
+if torch.__version__ == 'parrots':
+ TORCH_VERSION = torch.__version__
+else:
+ # torch.__version__ could be 1.3.1+cu92, we only need the first two
+ # for comparison
+ TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])
+
+
+def obsolete_torch_version(torch_version, version_threshold) -> bool:
+ return torch_version == 'parrots' or torch_version <= version_threshold
+
+
+class NewEmptyTensorOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor:
+ ctx.shape = x.shape
+ return x.new_empty(new_shape)
+
+ @staticmethod
+ def backward(ctx, grad: torch.Tensor) -> tuple:
+ shape = ctx.shape
+ return NewEmptyTensorOp.apply(grad, shape), None
+
+
+@MODELS.register_module('Conv', force=True)
+class Conv2d(nn.Conv2d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
+ self.padding, self.stride, self.dilation):
+ o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
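+# Worked example of the empty-input shape formula
+# o = (i + 2*p - (d*(k - 1) + 1)) // s + 1: a 3x3 conv with stride 2,
+# padding 1 and dilation 1 on a (0, 3, 32, 32) input yields
+# (0, out_channels, 16, 16), matching the shape a non-empty input of the
+# same spatial size would produce.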
+
+@MODELS.register_module('Conv3d', force=True)
+class Conv3d(nn.Conv3d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
+ self.padding, self.stride, self.dilation):
+ o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@MODELS.register_module()
+@MODELS.register_module('deconv')
+class ConvTranspose2d(nn.ConvTranspose2d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
+ self.padding, self.stride,
+ self.dilation, self.output_padding):
+ out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@MODELS.register_module()
+@MODELS.register_module('deconv3d')
+class ConvTranspose3d(nn.ConvTranspose3d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
+ self.padding, self.stride,
+ self.dilation, self.output_padding):
+ out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+class MaxPool2d(nn.MaxPool2d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # PyTorch 1.9 does not support empty tensor inference yet
+ if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:
+ out_shape = list(x.shape[:2])
+ for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
+ _pair(self.padding), _pair(self.stride),
+ _pair(self.dilation)):
+ o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
+ o = math.ceil(o) if self.ceil_mode else math.floor(o)
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ return empty
+
+ return super().forward(x)
+
+
+class MaxPool3d(nn.MaxPool3d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # PyTorch 1.9 does not support empty tensor inference yet
+ if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:
+ out_shape = list(x.shape[:2])
+ for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
+ _triple(self.padding),
+ _triple(self.stride),
+ _triple(self.dilation)):
+ o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
+ o = math.ceil(o) if self.ceil_mode else math.floor(o)
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ return empty
+
+ return super().forward(x)
+
+
+class Linear(torch.nn.Linear):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Empty tensor forward of the Linear layer is supported in PyTorch 1.6
+ if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0:
+ out_shape = [x.shape[0], self.out_features]
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
diff --git a/external/cv/mmcv/cnn/resnet.py b/external/cv/mmcv/cnn/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2d2990b44381c8fb3c4b416356f452f1fb278a6
--- /dev/null
+++ b/external/cv/mmcv/cnn/resnet.py
@@ -0,0 +1,326 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Optional, Sequence, Tuple, Union
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmengine.model import constant_init, kaiming_init
+from mmengine.runner import load_checkpoint
+from torch import Tensor
+
+
+def conv3x3(in_planes: int,
+ out_planes: int,
+ stride: int = 1,
+ dilation: int = 1):
+ """3x3 convolution with padding."""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self,
+ inplanes: int,
+ planes: int,
+ stride: int = 1,
+ dilation: int = 1,
+ downsample: Optional[nn.Module] = None,
+ style: str = 'pytorch',
+ with_cp: bool = False):
+ super().__init__()
+ assert style in ['pytorch', 'caffe']
+ self.conv1 = conv3x3(inplanes, planes, stride, dilation)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ assert not with_cp
+
+ def forward(self, x: Tensor) -> Tensor:
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self,
+ inplanes: int,
+ planes: int,
+ stride: int = 1,
+ dilation: int = 1,
+ downsample: Optional[nn.Module] = None,
+ style: str = 'pytorch',
+ with_cp: bool = False):
+ """Bottleneck block.
+
+ If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+ it is "caffe", the stride-two layer is the first 1x1 conv layer.
+ """
+ super().__init__()
+ assert style in ['pytorch', 'caffe']
+ if style == 'pytorch':
+ conv1_stride = 1
+ conv2_stride = stride
+ else:
+ conv1_stride = stride
+ conv2_stride = 1
+ self.conv1 = nn.Conv2d(
+ inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
+ self.conv2 = nn.Conv2d(
+ planes,
+ planes,
+ kernel_size=3,
+ stride=conv2_stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(
+ planes, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ self.with_cp = with_cp
+
+ def forward(self, x: Tensor) -> Tensor:
+
+ def _inner_forward(x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+
+ return out
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ out = self.relu(out)
+
+ return out
+
+
+def make_res_layer(block: nn.Module,
+ inplanes: int,
+ planes: int,
+ blocks: int,
+ stride: int = 1,
+ dilation: int = 1,
+ style: str = 'pytorch',
+ with_cp: bool = False) -> nn.Module:
+ downsample = None
+ if stride != 1 or inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(
+ inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(
+ block(
+ inplanes,
+ planes,
+ stride,
+ dilation,
+ downsample,
+ style=style,
+ with_cp=with_cp))
+ inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(
+ block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))
+
+ return nn.Sequential(*layers)
+
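+# Usage sketch (illustrative): the first block of a stage may downsample via
+# a strided 1x1 conv on the shortcut; the remaining blocks keep stride 1.
+#
+# >>> layer2 = make_res_layer(Bottleneck, 256, 128, blocks=4, stride=2)
+# >>> layer2(torch.randn(1, 256, 56, 56)).shape
+# torch.Size([1, 512, 28, 28])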
+
+class ResNet(nn.Module):
+ """ResNet backbone.
+
+ Args:
+ depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+ num_stages (int): Resnet stages, normally 4.
+ strides (Sequence[int]): Strides of the first block of each stage.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+ layer is the 3x3 conv layer, otherwise the stride-two layer is
+ the first 1x1 conv layer.
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+ not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
+ running stats (mean and var).
+ bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed.
+ """
+
+ arch_settings = {
+ 18: (BasicBlock, (2, 2, 2, 2)),
+ 34: (BasicBlock, (3, 4, 6, 3)),
+ 50: (Bottleneck, (3, 4, 6, 3)),
+ 101: (Bottleneck, (3, 4, 23, 3)),
+ 152: (Bottleneck, (3, 8, 36, 3))
+ }
+
+ def __init__(self,
+ depth: int,
+ num_stages: int = 4,
+ strides: Sequence[int] = (1, 2, 2, 2),
+ dilations: Sequence[int] = (1, 1, 1, 1),
+ out_indices: Sequence[int] = (0, 1, 2, 3),
+ style: str = 'pytorch',
+ frozen_stages: int = -1,
+ bn_eval: bool = True,
+ bn_frozen: bool = False,
+ with_cp: bool = False):
+ super().__init__()
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for resnet')
+ assert num_stages >= 1 and num_stages <= 4
+ block, stage_blocks = self.arch_settings[depth]
+ stage_blocks = stage_blocks[:num_stages] # type: ignore
+ assert len(strides) == len(dilations) == num_stages
+ assert max(out_indices) < num_stages
+
+ self.out_indices = out_indices
+ self.style = style
+ self.frozen_stages = frozen_stages
+ self.bn_eval = bn_eval
+ self.bn_frozen = bn_frozen
+ self.with_cp = with_cp
+
+ self.inplanes: int = 64
+ self.conv1 = nn.Conv2d(
+ 3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+ self.res_layers = []
+ for i, num_blocks in enumerate(stage_blocks):
+ stride = strides[i]
+ dilation = dilations[i]
+ planes = 64 * 2**i
+ res_layer = make_res_layer(
+ block,
+ self.inplanes,
+ planes,
+ num_blocks,
+ stride=stride,
+ dilation=dilation,
+ style=self.style,
+ with_cp=with_cp)
+ self.inplanes = planes * block.expansion # type: ignore
+ layer_name = f'layer{i + 1}'
+ self.add_module(layer_name, res_layer)
+ self.res_layers.append(layer_name)
+
+ self.feat_dim = block.expansion * 64 * 2**( # type: ignore
+ len(stage_blocks) - 1)
+
+ def init_weights(self, pretrained: Optional[str] = None) -> None:
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ kaiming_init(m)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+ outs = []
+ for i, layer_name in enumerate(self.res_layers):
+ res_layer = getattr(self, layer_name)
+ x = res_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ if len(outs) == 1:
+ return outs[0]
+ else:
+ return tuple(outs)
+
+ def train(self, mode: bool = True) -> None:
+ super().train(mode)
+ if self.bn_eval:
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ if self.bn_frozen:
+ for params in m.parameters():
+ params.requires_grad = False
+ if mode and self.frozen_stages >= 0:
+ for param in self.conv1.parameters():
+ param.requires_grad = False
+ for param in self.bn1.parameters():
+ param.requires_grad = False
+ self.bn1.eval()
+ self.bn1.weight.requires_grad = False
+ self.bn1.bias.requires_grad = False
+ for i in range(1, self.frozen_stages + 1):
+ mod = getattr(self, f'layer{i}')
+ mod.eval()
+ for param in mod.parameters():
+ param.requires_grad = False
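+
+
+# Usage sketch (illustrative): a standard ResNet-50 returning the four stage
+# feature maps with strides 4, 8, 16 and 32 relative to the input.
+#
+# >>> backbone = ResNet(depth=50, out_indices=(0, 1, 2, 3))
+# >>> backbone.init_weights()
+# >>> feats = backbone(torch.randn(1, 3, 224, 224))
+# >>> [f.shape[1] for f in feats]
+# [256, 512, 1024, 2048]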
diff --git a/external/cv/mmcv/cnn/rfsearch/__init__.py b/external/cv/mmcv/cnn/rfsearch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4526227ca748bb50edc8c3a254ce54b0cc06a2dc
--- /dev/null
+++ b/external/cv/mmcv/cnn/rfsearch/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp
+from .search import RFSearchHook
+
+__all__ = ['BaseConvRFSearchOp', 'Conv2dRFSearchOp', 'RFSearchHook']
diff --git a/external/cv/mmcv/cnn/rfsearch/operator.py b/external/cv/mmcv/cnn/rfsearch/operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..50222f5742c312e98d5470c4d0d6877f2f8fc310
--- /dev/null
+++ b/external/cv/mmcv/cnn/rfsearch/operator.py
@@ -0,0 +1,174 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmengine.logging import print_log
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from .utils import expand_rates, get_single_padding
+
+
+class BaseConvRFSearchOp(BaseModule):
+ """Based class of ConvRFSearchOp.
+
+ Args:
+ op_layer (nn.Module): pytorch module, e,g, Conv2d
+ global_config (dict): config dict.
+ """
+
+ def __init__(self, op_layer: nn.Module, global_config: dict):
+ super().__init__()
+ self.op_layer = op_layer
+ self.global_config = global_config
+
+ def normlize(self, weights: nn.Parameter) -> nn.Parameter:
+ """Normalize weights.
+
+ Args:
+ weights (nn.Parameter): Weights to be normalized.
+
+ Returns:
+            nn.Parameter: Normalized weights.
+ """
+ abs_weights = torch.abs(weights)
+ normalized_weights = abs_weights / torch.sum(abs_weights)
+ return normalized_weights
+
+
+class Conv2dRFSearchOp(BaseConvRFSearchOp):
+ """Enable Conv2d with receptive field searching ability.
+
+ Args:
+        op_layer (nn.Module): pytorch module, e.g., Conv2d
+        global_config (dict): config dict, which must include
+            the following keys:
+
+ - "init_alphas": The value for initializing weights of each branch.
+ - "num_branches": The controller of the size of
+ search space (the number of branches).
+ - "exp_rate": The controller of the sparsity of search space.
+ - "mmin": The minimum dilation rate.
+ - "mmax": The maximum dilation rate.
+
+ Extra keys may exist, but are used by RFSearchHook, e.g., "step",
+ "max_step", "search_interval", and "skip_layer".
+ verbose (bool): Determines whether to print rf-next
+ related logging messages.
+ Defaults to True.
+ """
+
+ def __init__(self,
+ op_layer: nn.Module,
+ global_config: dict,
+ verbose: bool = True):
+ super().__init__(op_layer, global_config)
+ assert global_config is not None, 'global_config is None'
+ self.num_branches = global_config['num_branches']
+ assert self.num_branches in [2, 3]
+ self.verbose = verbose
+ init_dilation = op_layer.dilation
+ self.dilation_rates = expand_rates(init_dilation, global_config)
+ if self.op_layer.kernel_size[
+ 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0:
+ self.dilation_rates = [(op_layer.dilation[0], r[1])
+ for r in self.dilation_rates]
+ if self.op_layer.kernel_size[
+ 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0:
+ self.dilation_rates = [(r[0], op_layer.dilation[1])
+ for r in self.dilation_rates]
+
+ self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches))
+ if self.verbose:
+ print_log(f'Expand as {self.dilation_rates}', 'current')
+ nn.init.constant_(self.branch_weights, global_config['init_alphas'])
+
+ def forward(self, input: Tensor) -> Tensor:
+ norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)])
+ if len(self.dilation_rates) == 1:
+ outputs = [
+ nn.functional.conv2d(
+ input,
+ weight=self.op_layer.weight,
+ bias=self.op_layer.bias,
+ stride=self.op_layer.stride,
+ padding=self.get_padding(self.dilation_rates[0]),
+ dilation=self.dilation_rates[0],
+ groups=self.op_layer.groups,
+ )
+ ]
+ else:
+ outputs = [
+ nn.functional.conv2d(
+ input,
+ weight=self.op_layer.weight,
+ bias=self.op_layer.bias,
+ stride=self.op_layer.stride,
+ padding=self.get_padding(r),
+ dilation=r,
+ groups=self.op_layer.groups,
+ ) * norm_w[i] for i, r in enumerate(self.dilation_rates)
+ ]
+ output = outputs[0]
+ for i in range(1, len(self.dilation_rates)):
+ output += outputs[i]
+ return output
+
+ def estimate_rates(self) -> None:
+ """Estimate new dilation rate based on trained branch_weights."""
+ norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)])
+ if self.verbose:
+ print_log(
+ 'Estimate dilation {} with weight {}.'.format(
+ self.dilation_rates,
+ norm_w.detach().cpu().numpy().tolist()), 'current')
+
+ sum0, sum1, w_sum = 0, 0, 0
+ for i in range(len(self.dilation_rates)):
+ sum0 += norm_w[i].item() * self.dilation_rates[i][0]
+ sum1 += norm_w[i].item() * self.dilation_rates[i][1]
+ w_sum += norm_w[i].item()
+ estimated = [
+ np.clip(
+ int(round(sum0 / w_sum)), self.global_config['mmin'],
+ self.global_config['mmax']).item(),
+ np.clip(
+ int(round(sum1 / w_sum)), self.global_config['mmin'],
+ self.global_config['mmax']).item()
+ ]
+ self.op_layer.dilation = tuple(estimated)
+ self.op_layer.padding = self.get_padding(self.op_layer.dilation)
+ self.dilation_rates = [tuple(estimated)]
+ if self.verbose:
+ print_log(f'Estimate as {tuple(estimated)}', 'current')
+
+ def expand_rates(self) -> None:
+ """Expand dilation rate."""
+ dilation = self.op_layer.dilation
+ dilation_rates = expand_rates(dilation, self.global_config)
+ if self.op_layer.kernel_size[
+ 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0:
+ dilation_rates = [(dilation[0], r[1]) for r in dilation_rates]
+ if self.op_layer.kernel_size[
+ 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0:
+ dilation_rates = [(r[0], dilation[1]) for r in dilation_rates]
+
+ self.dilation_rates = copy.deepcopy(dilation_rates)
+ if self.verbose:
+ print_log(f'Expand as {self.dilation_rates}', 'current')
+ nn.init.constant_(self.branch_weights,
+ self.global_config['init_alphas'])
+
+ def get_padding(self, dilation) -> tuple:
+ padding = (get_single_padding(self.op_layer.kernel_size[0],
+ self.op_layer.stride[0], dilation[0]),
+ get_single_padding(self.op_layer.kernel_size[1],
+ self.op_layer.stride[1], dilation[1]))
+ return padding
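+
+
+# Wrapping sketch (config keys required by Conv2dRFSearchOp; the values are
+# illustrative assumptions):
+#
+# >>> cfg = dict(init_alphas=0.01, num_branches=3, exp_rate=0.5,
+# ...            mmin=1, mmax=24)
+# >>> op = Conv2dRFSearchOp(nn.Conv2d(64, 64, 3, padding=1), cfg)
+# >>> y = op(torch.randn(1, 64, 32, 32))  # weighted sum of dilated branches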
diff --git a/external/cv/mmcv/cnn/rfsearch/search.py b/external/cv/mmcv/cnn/rfsearch/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4f77ca9b4d8cb44e26b0b76cb5d74f6a172fbc5
--- /dev/null
+++ b/external/cv/mmcv/cnn/rfsearch/search.py
@@ -0,0 +1,244 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Dict, Optional
+
+import mmengine
+import torch # noqa
+import torch.nn as nn
+from mmengine.hooks import Hook
+from mmengine.logging import print_log
+from mmengine.registry import HOOKS
+
+from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp # noqa
+from .utils import get_single_padding, write_to_json
+
+
+@HOOKS.register_module()
+class RFSearchHook(Hook):
+ """Rcecptive field search via dilation rates.
+
+    Please refer to `RF-Next: Efficient Receptive Field
+    Search for Convolutional Neural Networks` for more details.
+
+
+ Args:
+ mode (str, optional): It can be set to the following types:
+ 'search', 'fixed_single_branch', or 'fixed_multi_branch'.
+ Defaults to 'search'.
+ config (Dict, optional): config dict of search.
+ By default this config contains "search",
+ and config["search"] must include:
+
+ - "step": recording the current searching step.
+ - "max_step": The maximum number of searching steps
+ to update the structures.
+ - "search_interval": The interval (epoch/iteration)
+ between two updates.
+ - "exp_rate": The controller of the sparsity of search space.
+ - "init_alphas": The value for initializing weights of each branch.
+ - "mmin": The minimum dilation rate.
+ - "mmax": The maximum dilation rate.
+ - "num_branches": The controller of the size of
+ search space (the number of branches).
+ - "skip_layer": The modules in skip_layer will be ignored
+ during the receptive field search.
+ rfstructure_file (str, optional): Path to load searched receptive
+ fields of the model. Defaults to None.
+        by_epoch (bool, optional): Determines whether to perform the step
+            by epoch or by iteration. If set to True, it will step by epoch.
+            Otherwise, by iteration. Defaults to True.
+ verbose (bool): Determines whether to print rf-next related logging
+ messages. Defaults to True.
+ """
+
+ def __init__(self,
+ mode: str = 'search',
+ config: Dict = {},
+ rfstructure_file: Optional[str] = None,
+ by_epoch: bool = True,
+ verbose: bool = True):
+ assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch']
+ assert config is not None
+ self.config = config
+ self.config['structure'] = {}
+ self.verbose = verbose
+ if rfstructure_file is not None:
+ rfstructure = mmengine.load(rfstructure_file)['structure']
+ self.config['structure'] = rfstructure
+ self.mode = mode
+ self.num_branches = self.config['search']['num_branches']
+ self.by_epoch = by_epoch
+
+ def init_model(self, model: nn.Module):
+ """init model with search ability.
+
+ Args:
+ model (nn.Module): pytorch model
+
+ Raises:
+ NotImplementedError: only support three modes:
+ search/fixed_single_branch/fixed_multi_branch
+ """
+ if self.verbose:
+ print_log('RFSearch init begin.', 'current')
+ if self.mode == 'search':
+ if self.config['structure']:
+ self.set_model(model, search_op='Conv2d')
+ self.wrap_model(model, search_op='Conv2d')
+ elif self.mode == 'fixed_single_branch':
+ self.set_model(model, search_op='Conv2d')
+ elif self.mode == 'fixed_multi_branch':
+ self.set_model(model, search_op='Conv2d')
+ self.wrap_model(model, search_op='Conv2d')
+ else:
+ raise NotImplementedError
+ if self.verbose:
+ print_log('RFSearch init end.', 'current')
+
+ def after_train_epoch(self, runner):
+ """Performs a dilation searching step after one training epoch."""
+ if self.by_epoch and self.mode == 'search':
+ self.step(runner.model, runner.work_dir)
+
+ def after_train_iter(self, runner, batch_idx, data_batch, outputs):
+ """Performs a dilation searching step after one training iteration."""
+ if not self.by_epoch and self.mode == 'search':
+ self.step(runner.model, runner.work_dir)
+
+ def step(self, model: nn.Module, work_dir: str) -> None:
+ """Performs a dilation searching step.
+
+ Args:
+ model (nn.Module): pytorch model
+ work_dir (str): Directory to save the searching results.
+ """
+ self.config['search']['step'] += 1
+ if (self.config['search']['step']
+ ) % self.config['search']['search_interval'] == 0 and (self.config[
+ 'search']['step']) < self.config['search']['max_step']:
+ self.estimate_and_expand(model)
+ for name, module in model.named_modules():
+ if isinstance(module, BaseConvRFSearchOp):
+ self.config['structure'][name] = module.op_layer.dilation
+
+ write_to_json(
+ self.config,
+ os.path.join(
+ work_dir,
+ 'local_search_config_step%d.json' %
+ self.config['search']['step'],
+ ),
+ )
+
+ def estimate_and_expand(self, model: nn.Module) -> None:
+ """estimate and search for RFConvOp.
+
+ Args:
+ model (nn.Module): pytorch model
+ """
+ for module in model.modules():
+ if isinstance(module, BaseConvRFSearchOp):
+ module.estimate_rates()
+ module.expand_rates()
+
+ def wrap_model(self,
+ model: nn.Module,
+ search_op: str = 'Conv2d',
+ prefix: str = '') -> None:
+ """wrap model to support searchable conv op.
+
+ Args:
+ model (nn.Module): pytorch model
+ search_op (str): The module that uses RF search.
+ Defaults to 'Conv2d'.
+ init_rates (int, optional): Set to other initial dilation rates.
+ Defaults to None.
+ prefix (str): Prefix for function recursion. Defaults to ''.
+ """
+ op = 'torch.nn.' + search_op
+ for name, module in model.named_children():
+ if prefix == '':
+ fullname = 'module.' + name
+ else:
+ fullname = prefix + '.' + name
+ if self.config['search']['skip_layer'] is not None:
+ if any(layer in fullname
+ for layer in self.config['search']['skip_layer']):
+ continue
+ if isinstance(module, eval(op)):
+ if 1 < module.kernel_size[0] and \
+ 0 != module.kernel_size[0] % 2 or \
+ 1 < module.kernel_size[1] and \
+ 0 != module.kernel_size[1] % 2:
+ moduleWrap = eval(search_op + 'RFSearchOp')(
+ module, self.config['search'], self.verbose)
+ moduleWrap = moduleWrap.to(module.weight.device)
+ if self.verbose:
+ print_log(
+ 'Wrap model %s to %s.' %
+ (str(module), str(moduleWrap)), 'current')
+ setattr(model, name, moduleWrap)
+ elif not isinstance(module, BaseConvRFSearchOp):
+ self.wrap_model(module, search_op, fullname)
+
+ def set_model(self,
+ model: nn.Module,
+ search_op: str = 'Conv2d',
+ init_rates: Optional[int] = None,
+ prefix: str = '') -> None:
+ """set model based on config.
+
+ Args:
+ model (nn.Module): pytorch model
+ config (Dict): config file
+ search_op (str): The module that uses RF search.
+ Defaults to 'Conv2d'.
+ init_rates (int, optional): Set to other initial dilation rates.
+ Defaults to None.
+ prefix (str): Prefix for function recursion. Defaults to ''.
+ """
+ op = 'torch.nn.' + search_op
+ for name, module in model.named_children():
+ if prefix == '':
+ fullname = 'module.' + name
+ else:
+ fullname = prefix + '.' + name
+ if self.config['search']['skip_layer'] is not None:
+ if any(layer in fullname
+ for layer in self.config['search']['skip_layer']):
+ continue
+ if isinstance(module, eval(op)):
+ if 1 < module.kernel_size[0] and \
+ 0 != module.kernel_size[0] % 2 or \
+ 1 < module.kernel_size[1] and \
+ 0 != module.kernel_size[1] % 2:
+ if isinstance(self.config['structure'][fullname], int):
+ self.config['structure'][fullname] = [
+ self.config['structure'][fullname],
+ self.config['structure'][fullname]
+ ]
+ module.dilation = (
+ self.config['structure'][fullname][0],
+ self.config['structure'][fullname][1],
+ )
+ module.padding = (
+ get_single_padding(
+ module.kernel_size[0], module.stride[0],
+ self.config['structure'][fullname][0]),
+ get_single_padding(
+ module.kernel_size[1], module.stride[1],
+ self.config['structure'][fullname][1]))
+ setattr(model, name, module)
+ if self.verbose:
+ print_log(
+ 'Set module %s dilation as: [%d %d]' %
+ (fullname, module.dilation[0], module.dilation[1]),
+ 'current')
+ elif not isinstance(module, BaseConvRFSearchOp):
+ self.set_model(module, search_op, init_rates, fullname)
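+
+
+# Hook config sketch (keys taken from the docstring above; the values and
+# the `model` variable are illustrative assumptions):
+#
+# >>> hook = RFSearchHook(
+# ...     mode='search',
+# ...     config=dict(search=dict(step=0, max_step=12, search_interval=1,
+# ...                             exp_rate=0.5, init_alphas=0.01, mmin=1,
+# ...                             mmax=24, num_branches=2, skip_layer=None)))
+# >>> hook.init_model(model)  # wraps eligible Conv2d layers in-place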
diff --git a/external/cv/mmcv/cnn/rfsearch/utils.py b/external/cv/mmcv/cnn/rfsearch/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..67d301a4784a2a11110676011acc02f2937317de
--- /dev/null
+++ b/external/cv/mmcv/cnn/rfsearch/utils.py
@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import mmengine
+import numpy as np
+
+
+def write_to_json(config: dict, filename: str):
+ """save config to json file.
+
+ Args:
+ config (dict): Config to be saved.
+ filename (str): Path to save config.
+ """
+
+ with open(filename, 'w', encoding='utf-8') as f:
+ mmengine.dump(config, f, file_format='json')
+
+
+def expand_rates(dilation: tuple, config: dict) -> list:
+    """Expand the dilation rate according to config.
+
+    Args:
+        dilation (tuple): The current dilation rate of the conv layer.
+        config (dict): config dict
+
+    Returns:
+        list: list of expanded dilation rates
+    """
+ exp_rate = config['exp_rate']
+
+ large_rates = []
+ small_rates = []
+ for _ in range(config['num_branches'] // 2):
+ large_rates.append(
+ tuple([
+ np.clip(
+ int(round((1 + exp_rate) * dilation[0])), config['mmin'],
+ config['mmax']).item(),
+ np.clip(
+ int(round((1 + exp_rate) * dilation[1])), config['mmin'],
+ config['mmax']).item()
+ ]))
+ small_rates.append(
+ tuple([
+ np.clip(
+ int(round((1 - exp_rate) * dilation[0])), config['mmin'],
+ config['mmax']).item(),
+ np.clip(
+ int(round((1 - exp_rate) * dilation[1])), config['mmin'],
+ config['mmax']).item()
+ ]))
+
+ small_rates.reverse()
+
+ if config['num_branches'] % 2 == 0:
+ rate_list = small_rates + large_rates
+ else:
+ rate_list = small_rates + [dilation] + large_rates
+
+ unique_rate_list = list(set(rate_list))
+ unique_rate_list.sort(key=rate_list.index)
+ return unique_rate_list
+
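+# Worked example (num_branches=3, exp_rate=0.5, mmin=1, mmax=24): a dilation
+# of (2, 2) expands to [(1, 1), (2, 2), (3, 3)], i.e. a smaller rate, the
+# current rate and a larger rate, clipped to [mmin, mmax] and de-duplicated.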
+
+def get_single_padding(kernel_size: int,
+ stride: int = 1,
+ dilation: int = 1) -> int:
+ padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
+ return padding
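+
+
+# Example: kernel_size=3, stride=1, dilation=2 gives
+# ((1 - 1) + 2 * (3 - 1)) // 2 = 2, which preserves the spatial size of the
+# input for a stride-1 conv.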
diff --git a/external/cv/mmcv/cnn/utils/__init__.py b/external/cv/mmcv/cnn/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0822f2a0c5f12efdbcf78aa52c14e334a07c56fb
--- /dev/null
+++ b/external/cv/mmcv/cnn/utils/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .flops_counter import get_model_complexity_info
+from .fuse_conv_bn import fuse_conv_bn
+
+__all__ = ['get_model_complexity_info', 'fuse_conv_bn']
diff --git a/external/cv/mmcv/cnn/utils/flops_counter.py b/external/cv/mmcv/cnn/utils/flops_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f18607485bdb7a26065f765debdeb143865da974
--- /dev/null
+++ b/external/cv/mmcv/cnn/utils/flops_counter.py
@@ -0,0 +1,610 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Modified from flops-counter.pytorch by Vladislav Sovrasov
+# original repo: https://github.com/sovrasov/flops-counter.pytorch
+
+# MIT License
+
+# Copyright (c) 2018 Vladislav Sovrasov
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import sys
+import warnings
+from functools import partial
+from typing import Any, Callable, Dict, Optional, TextIO, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear,
+ MaxPool2d, MaxPool3d)
+
+
+def get_model_complexity_info(model: nn.Module,
+ input_shape: tuple,
+ print_per_layer_stat: bool = True,
+ as_strings: bool = True,
+ input_constructor: Optional[Callable] = None,
+ flush: bool = False,
+ ost: TextIO = sys.stdout) -> tuple:
+ """Get complexity information of a model.
+
+ This method can calculate FLOPs and parameter counts of a model with
+ corresponding input shape. It can also print complexity information for
+ each layer in a model.
+
+ Supported layers are listed as below:
+ - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``.
+ - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``,
+ ``nn.LeakyReLU``, ``nn.ReLU6``.
+ - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``,
+ ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``,
+ ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``,
+ ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``,
+ ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``.
+ - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``,
+ ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``,
+ ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``.
+ - Linear: ``nn.Linear``.
+ - Deconvolution: ``nn.ConvTranspose2d``.
+ - Upsample: ``nn.Upsample``.
+
+ Args:
+ model (nn.Module): The model for complexity calculation.
+ input_shape (tuple): Input shape used for calculation.
+ print_per_layer_stat (bool): Whether to print complexity information
+ for each layer in a model. Default: True.
+ as_strings (bool): Output FLOPs and params counts in a string form.
+ Default: True.
+        input_constructor (None | callable): If specified, it takes a callable
+            method that generates input. Otherwise, it will generate a random
+            tensor with input shape to calculate FLOPs. Default: None.
+ flush (bool): same as that in :func:`print`. Default: False.
+ ost (stream): same as ``file`` param in :func:`print`.
+ Default: sys.stdout.
+
+ Returns:
+        tuple[float | str]: If ``as_strings`` is set to True, it will return
+            FLOPs and parameter counts in a string format. Otherwise, it will
+            return them in a float number format.
+ """
+ assert type(input_shape) is tuple
+ assert len(input_shape) >= 1
+ assert isinstance(model, nn.Module)
+ flops_model = add_flops_counting_methods(model)
+ flops_model.eval()
+ flops_model.start_flops_count()
+ if input_constructor:
+ input = input_constructor(input_shape)
+ _ = flops_model(**input)
+ else:
+ try:
+ batch = torch.ones(()).new_empty(
+ (1, *input_shape),
+ dtype=next(flops_model.parameters()).dtype,
+ device=next(flops_model.parameters()).device)
+ except StopIteration:
+            # Avoid StopIteration for models which have no parameters,
+            # like `nn.ReLU()`, `nn.AvgPool2d`, etc.
+ batch = torch.ones(()).new_empty((1, *input_shape))
+
+ _ = flops_model(batch)
+
+ flops_count, params_count = flops_model.compute_average_flops_cost()
+ if print_per_layer_stat:
+ print_model_with_flops(
+ flops_model, flops_count, params_count, ost=ost, flush=flush)
+ flops_model.stop_flops_count()
+
+ if as_strings:
+ return flops_to_string(flops_count), params_to_string(params_count)
+
+ return flops_count, params_count
+
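+# Usage sketch (illustrative):
+#
+# >>> flops_str, params_str = get_model_complexity_info(
+# ...     nn.Conv2d(3, 8, 3), (3, 224, 224), print_per_layer_stat=False)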
+
+def flops_to_string(flops: float,
+ units: Optional[str] = 'GFLOPs',
+ precision: int = 2) -> str:
+ """Convert FLOPs number into a string.
+
+    Note that here we count one multiply-add as one FLOP.
+
+ Args:
+ flops (float): FLOPs number to be converted.
+ units (str | None): Converted FLOPs units. Options are None, 'GFLOPs',
+ 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically
+ choose the most suitable unit for FLOPs. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted FLOPs number with units.
+
+ Examples:
+ >>> flops_to_string(1e9)
+ '1.0 GFLOPs'
+ >>> flops_to_string(2e5, 'MFLOPs')
+ '0.2 MFLOPs'
+ >>> flops_to_string(3e-9, None)
+ '3e-09 FLOPs'
+ """
+ if units is None:
+ if flops // 10**9 > 0:
+ return str(round(flops / 10.**9, precision)) + ' GFLOPs'
+ elif flops // 10**6 > 0:
+ return str(round(flops / 10.**6, precision)) + ' MFLOPs'
+ elif flops // 10**3 > 0:
+ return str(round(flops / 10.**3, precision)) + ' KFLOPs'
+ else:
+ return str(flops) + ' FLOPs'
+ else:
+ if units == 'GFLOPs':
+ return str(round(flops / 10.**9, precision)) + ' ' + units
+ elif units == 'MFLOPs':
+ return str(round(flops / 10.**6, precision)) + ' ' + units
+ elif units == 'KFLOPs':
+ return str(round(flops / 10.**3, precision)) + ' ' + units
+ else:
+ return str(flops) + ' FLOPs'
+
+
+def params_to_string(num_params: float,
+ units: Optional[str] = None,
+ precision: int = 2) -> str:
+ """Convert parameter number into a string.
+
+ Args:
+ num_params (float): Parameter number to be converted.
+        units (str | None): Converted units. Options are None, 'M',
+            'K' and ''. If set to None, it will automatically choose the most
+            suitable unit for the parameter number. Default: None.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted parameter number with units.
+
+ Examples:
+ >>> params_to_string(1e9)
+ '1000.0 M'
+ >>> params_to_string(2e5)
+ '200.0 k'
+ >>> params_to_string(3e-9)
+ '3e-09'
+ """
+ if units is None:
+ if num_params // 10**6 > 0:
+ return str(round(num_params / 10**6, precision)) + ' M'
+ elif num_params // 10**3:
+ return str(round(num_params / 10**3, precision)) + ' k'
+ else:
+ return str(num_params)
+ else:
+ if units == 'M':
+ return str(round(num_params / 10.**6, precision)) + ' ' + units
+ elif units == 'K':
+ return str(round(num_params / 10.**3, precision)) + ' ' + units
+ else:
+ return str(num_params)
+
+
+def print_model_with_flops(model: nn.Module,
+ total_flops: float,
+ total_params: float,
+ units: Optional[str] = 'GFLOPs',
+ precision: int = 3,
+ ost: TextIO = sys.stdout,
+ flush: bool = False) -> None:
+ """Print a model with FLOPs for each layer.
+
+ Args:
+ model (nn.Module): The model to be printed.
+ total_flops (float): Total FLOPs of the model.
+ total_params (float): Total parameter counts of the model.
+ units (str | None): Converted FLOPs units. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 3.
+ ost (stream): same as `file` param in :func:`print`.
+ Default: sys.stdout.
+ flush (bool): same as that in :func:`print`. Default: False.
+
+ Example:
+ >>> class ExampleModel(nn.Module):
+
+ >>> def __init__(self):
+ >>> super().__init__()
+ >>> self.conv1 = nn.Conv2d(3, 8, 3)
+ >>> self.conv2 = nn.Conv2d(8, 256, 3)
+ >>> self.conv3 = nn.Conv2d(256, 8, 3)
+ >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+ >>> self.flatten = nn.Flatten()
+ >>> self.fc = nn.Linear(8, 1)
+
+ >>> def forward(self, x):
+ >>> x = self.conv1(x)
+ >>> x = self.conv2(x)
+ >>> x = self.conv3(x)
+ >>> x = self.avg_pool(x)
+ >>> x = self.flatten(x)
+ >>> x = self.fc(x)
+ >>> return x
+
+ >>> model = ExampleModel()
+ >>> x = (3, 16, 16)
+ to print the complexity information state for each layer, you can use
+ >>> get_model_complexity_info(model, x)
+ or directly use
+ >>> print_model_with_flops(model, 4579784.0, 37361)
+ ExampleModel(
+ 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,
+ (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501
+ (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))
+ (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))
+ (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))
+ (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )
+ (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)
+ )
+ """
+
+ def accumulate_params(self):
+ if is_supported_instance(self):
+ return self.__params__
+ else:
+ total = 0  # avoid shadowing the built-in ``sum``
+ for m in self.children():
+ total += m.accumulate_params()
+ return total
+
+ def accumulate_flops(self):
+ if is_supported_instance(self):
+ return self.__flops__ / model.__batch_counter__
+ else:
+ total = 0
+ for m in self.children():
+ total += m.accumulate_flops()
+ return total
+
+ def flops_repr(self):
+ accumulated_num_params = self.accumulate_params()
+ accumulated_flops_cost = self.accumulate_flops()
+ return ', '.join([
+ params_to_string(
+ accumulated_num_params, units='M', precision=precision),
+ f'{accumulated_num_params / total_params:.3%} Params',
+ flops_to_string(
+ accumulated_flops_cost, units=units, precision=precision),
+ f'{accumulated_flops_cost / total_flops:.3%} FLOPs',
+ self.original_extra_repr()
+ ])
+
+ def add_extra_repr(m):
+ m.accumulate_flops = accumulate_flops.__get__(m)
+ m.accumulate_params = accumulate_params.__get__(m)
+ flops_extra_repr = flops_repr.__get__(m)
+ if m.extra_repr != flops_extra_repr:
+ m.original_extra_repr = m.extra_repr
+ m.extra_repr = flops_extra_repr
+ assert m.extra_repr != m.original_extra_repr
+
+ def del_extra_repr(m):
+ if hasattr(m, 'original_extra_repr'):
+ m.extra_repr = m.original_extra_repr
+ del m.original_extra_repr
+ if hasattr(m, 'accumulate_flops'):
+ del m.accumulate_flops
+
+ model.apply(add_extra_repr)
+ print(model, file=ost, flush=flush)
+ model.apply(del_extra_repr)
+
+
+def get_model_parameters_number(model: nn.Module) -> float:
+ """Calculate parameter number of a model.
+
+ Args:
+ model (nn.module): The model for parameter number calculation.
+
+ Returns:
+ float: Parameter number of the model.
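+
+ Examples:
+ >>> # illustrative: nn.Linear(10, 2) has 10 * 2 weights + 2 biases
+ >>> get_model_parameters_number(nn.Linear(10, 2))
+ 22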
+ """
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return num_params
+
+
+def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module:
+ # adding additional methods to the existing module object,
+ # this is done this way so that each function has access to self object
+ net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501
+ net_main_module)
+ net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501
+ net_main_module)
+ net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501
+ net_main_module)
+ net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501
+ net_main_module)
+
+ net_main_module.reset_flops_count()
+
+ return net_main_module
+
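+# Illustrative workflow (a sketch): the methods patched in above are
+# typically used as follows.
+#
+#   model = add_flops_counting_methods(model)
+#   model.start_flops_count()
+#   model(torch.ones(1, 3, 224, 224))  # one dummy forward pass
+#   flops, params = model.compute_average_flops_cost()
+#   model.stop_flops_count()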
+
+def compute_average_flops_cost(self) -> Tuple[float, float]:
+ """Compute average FLOPs cost.
+
+ A method to compute average FLOPs cost, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+
+ Returns:
+ tuple[float, float]: Mean FLOPs consumption per image and the
+ total parameter count of the model.
+ """
+ batches_count = self.__batch_counter__
+ flops_sum = 0
+ for module in self.modules():
+ if is_supported_instance(module):
+ flops_sum += module.__flops__
+ params_sum = get_model_parameters_number(self)
+ return flops_sum / batches_count, params_sum
+
+
+def start_flops_count(self) -> None:
+ """Activate the computation of mean flops consumption per image.
+
+ A method to activate the computation of mean flops consumption per
+ image, which will be available after ``add_flops_counting_methods()``
+ is called on a desired net object. It should be called before running
+ the network.
+ """
+ add_batch_counter_hook_function(self)
+
+ def add_flops_counter_hook_function(module: nn.Module) -> None:
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ return
+ handle = module.register_forward_hook(
+ get_modules_mapping()[type(module)])
+ module.__flops_handle__ = handle
+
+ self.apply(add_flops_counter_hook_function)
+
+
+def stop_flops_count(self) -> None:
+ """Stop computing the mean flops consumption per image.
+
+ A method to stop computing the mean flops consumption per image, which will
+ be available after ``add_flops_counting_methods()`` is called on a desired
+ net object. It can be called to pause the computation whenever.
+ """
+ remove_batch_counter_hook_function(self)
+ self.apply(remove_flops_counter_hook_function)
+
+
+def reset_flops_count(self) -> None:
+ """Reset statistics computed so far.
+
+ A method to reset computed statistics, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+ """
+ add_batch_counter_variables_or_reset(self)
+ self.apply(add_flops_counter_variable_or_reset)
+
+
+# ---- Internal functions
+def empty_flops_counter_hook(module: nn.Module, input: tuple,
+ output: Any) -> None:
+ module.__flops__ += 0
+
+
+def upsample_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ output_size = output[0]
+ batch_size = output_size.shape[0]
+ output_elements_count = batch_size
+ for val in output_size.shape[1:]:
+ output_elements_count *= val
+ module.__flops__ += int(output_elements_count)
+
+
+def relu_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ active_elements_count = output.numel()
+ module.__flops__ += int(active_elements_count)
+
+
+def linear_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ output_last_dim = output.shape[
+ -1] # pytorch checks dimensions, so here we don't care much
+ module.__flops__ += int(np.prod(input[0].shape) * output_last_dim)
+
+
+def pool_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ module.__flops__ += int(np.prod(input[0].shape))
+
+
+def norm_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ batch_flops = np.prod(input[0].shape)
+ if (getattr(module, 'affine', False)
+ or getattr(module, 'elementwise_affine', False)):
+ batch_flops *= 2
+ module.__flops__ += int(batch_flops)
+
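+# Worked example (illustrative): for nn.BatchNorm2d(8) on a 1x8x14x14
+# input, batch_flops = 1 * 8 * 14 * 14 = 1568; the affine
+# scale-and-shift doubles this to 3136.
+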
+
+def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ # Can have multiple inputs, getting the first one
+ batch_size = input[0].shape[0]
+ input_height, input_width = input[0].shape[2:]
+
+ kernel_height, kernel_width = conv_module.kernel_size
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = (
+ kernel_height * kernel_width * in_channels * filters_per_channel)
+
+ active_elements_count = batch_size * input_height * input_width
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+ bias_flops = 0
+ if conv_module.bias is not None:
+ output_height, output_width = output.shape[2:]
+ bias_flops = out_channels * batch_size * output_height * output_width
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
+
+
+def conv_flops_counter_hook(conv_module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ # Can have multiple inputs, getting the first one
+ batch_size = input[0].shape[0]
+ output_dims = list(output.shape[2:])
+
+ kernel_dims = list(conv_module.kernel_size)
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = int(
+ np.prod(kernel_dims)) * in_channels * filters_per_channel
+
+ active_elements_count = batch_size * int(np.prod(output_dims))
+
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+
+ bias_flops = 0
+
+ if conv_module.bias is not None:
+
+ bias_flops = out_channels * active_elements_count
+
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
+
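+# Worked example (illustrative): for nn.Conv2d(3, 8, 3) on a 1x3x16x16
+# input the output is 1x8x14x14, so
+#   conv_per_position_flops = 3 * 3 * 3 * 8 = 216
+#   active_elements_count   = 1 * 14 * 14   = 196
+#   bias_flops              = 8 * 196       = 1568
+#   overall_flops           = 216 * 196 + 1568 = 43904
+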
+
+def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None:
+ batch_size = 1
+ if len(input) > 0:
+ # Can have multiple inputs, getting the first one
+ batch_size = len(input[0])
+ else:
+ warnings.warn('No positional inputs found for a module, '
+ 'assuming batch size is 1.')
+ module.__batch_counter__ += batch_size
+
+
+def add_batch_counter_variables_or_reset(module: nn.Module) -> None:
+
+ module.__batch_counter__ = 0
+
+
+def add_batch_counter_hook_function(module: nn.Module) -> None:
+ if hasattr(module, '__batch_counter_handle__'):
+ return
+
+ handle = module.register_forward_hook(batch_counter_hook)
+ module.__batch_counter_handle__ = handle
+
+
+def remove_batch_counter_hook_function(module: nn.Module) -> None:
+ if hasattr(module, '__batch_counter_handle__'):
+ module.__batch_counter_handle__.remove()
+ del module.__batch_counter_handle__
+
+
+def add_flops_counter_variable_or_reset(module: nn.Module) -> None:
+ if is_supported_instance(module):
+ if hasattr(module, '__flops__') or hasattr(module, '__params__'):
+ warnings.warn('variables __flops__ or __params__ are already '
+ 'defined for the module ' + type(module).__name__ +
+ '. ptflops can affect your code!')
+ module.__flops__ = 0
+ module.__params__ = get_model_parameters_number(module)
+
+
+def is_supported_instance(module: nn.Module) -> bool:
+ return type(module) in get_modules_mapping()
+
+
+def remove_flops_counter_hook_function(module: nn.Module) -> None:
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ module.__flops_handle__.remove()
+ del module.__flops_handle__
+
+
+def get_modules_mapping() -> Dict:
+ return {
+ # convolutions
+ nn.Conv1d: conv_flops_counter_hook,
+ nn.Conv2d: conv_flops_counter_hook,
+ Conv2d: conv_flops_counter_hook,
+ nn.Conv3d: conv_flops_counter_hook,
+ Conv3d: conv_flops_counter_hook,
+ # activations
+ nn.ReLU: relu_flops_counter_hook,
+ nn.PReLU: relu_flops_counter_hook,
+ nn.ELU: relu_flops_counter_hook,
+ nn.LeakyReLU: relu_flops_counter_hook,
+ nn.ReLU6: relu_flops_counter_hook,
+ # poolings
+ nn.MaxPool1d: pool_flops_counter_hook,
+ nn.AvgPool1d: pool_flops_counter_hook,
+ nn.AvgPool2d: pool_flops_counter_hook,
+ nn.MaxPool2d: pool_flops_counter_hook,
+ MaxPool2d: pool_flops_counter_hook,
+ nn.MaxPool3d: pool_flops_counter_hook,
+ MaxPool3d: pool_flops_counter_hook,
+ nn.AvgPool3d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool1d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool1d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool2d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool2d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool3d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool3d: pool_flops_counter_hook,
+ # normalizations
+ nn.BatchNorm1d: norm_flops_counter_hook,
+ nn.BatchNorm2d: norm_flops_counter_hook,
+ nn.BatchNorm3d: norm_flops_counter_hook,
+ nn.GroupNorm: norm_flops_counter_hook,
+ nn.InstanceNorm1d: norm_flops_counter_hook,
+ nn.InstanceNorm2d: norm_flops_counter_hook,
+ nn.InstanceNorm3d: norm_flops_counter_hook,
+ nn.LayerNorm: norm_flops_counter_hook,
+ # FC
+ nn.Linear: linear_flops_counter_hook,
+ Linear: linear_flops_counter_hook,
+ # Upscale
+ nn.Upsample: upsample_flops_counter_hook,
+ # Deconvolution
+ nn.ConvTranspose2d: deconv_flops_counter_hook,
+ ConvTranspose2d: deconv_flops_counter_hook,
+ }
diff --git a/external/cv/mmcv/cnn/utils/fuse_conv_bn.py b/external/cv/mmcv/cnn/utils/fuse_conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..07aab3bd21bf0d695272bc9faa55ddf91861e206
--- /dev/null
+++ b/external/cv/mmcv/cnn/utils/fuse_conv_bn.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module:
+ """Fuse conv and bn into one module.
+
+ Args:
+ conv (nn.Module): Conv to be fused.
+ bn (nn.Module): BN to be fused.
+
+ Returns:
+ nn.Module: Fused module.
+ """
+ conv_w = conv.weight
+ conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+ bn.running_mean)
+
+ factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+ conv.weight = nn.Parameter(conv_w *
+ factor.reshape([conv.out_channels, 1, 1, 1]))
+ conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+ return conv
+
+
+def fuse_conv_bn(module: nn.Module) -> nn.Module:
+ """Recursively fuse conv and bn in a module.
+
+ During inference, the functionality of batch norm layers is turned off:
+ only the per-channel mean and variance are used, which makes it possible
+ to fuse BN into the preceding conv layers to save computation and
+ simplify the network structure.
+
+ Args:
+ module (nn.Module): Module to be fused.
+
+ Returns:
+ nn.Module: Fused module.
+ """
+ last_conv = None
+ last_conv_name = None
+
+ for name, child in module.named_children():
+ if isinstance(child,
+ (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)):
+ if last_conv is None: # only fuse BN that is after Conv
+ continue
+ fused_conv = _fuse_conv_bn(last_conv, child)
+ module._modules[last_conv_name] = fused_conv
+ # To reduce changes, set BN as Identity instead of deleting it.
+ module._modules[name] = nn.Identity()
+ last_conv = None
+ elif isinstance(child, nn.Conv2d):
+ last_conv = child
+ last_conv_name = name
+ else:
+ fuse_conv_bn(child)
+ return module
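+
+
+# Illustrative usage (a sketch): fusing should leave eval-mode outputs
+# numerically unchanged for a Conv2d -> BatchNorm2d pair.
+#
+#   model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8)).eval()
+#   x = torch.randn(1, 3, 16, 16)
+#   y_ref = model(x)
+#   fused = fuse_conv_bn(model)
+#   assert torch.allclose(y_ref, fused(x), atol=1e-5)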
diff --git a/external/cv/mmcv/cnn/vgg.py b/external/cv/mmcv/cnn/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..013ff815de3c6f8a29207e752fd8a1e7ac14abc2
--- /dev/null
+++ b/external/cv/mmcv/cnn/vgg.py
@@ -0,0 +1,181 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch.nn as nn
+from mmengine.model import constant_init, kaiming_init, normal_init
+from mmengine.runner import load_checkpoint
+from torch import Tensor
+
+
+def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module:
+ """3x3 convolution with padding."""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation)
+
+
+def make_vgg_layer(inplanes: int,
+ planes: int,
+ num_blocks: int,
+ dilation: int = 1,
+ with_bn: bool = False,
+ ceil_mode: bool = False) -> List[nn.Module]:
+ layers = []
+ for _ in range(num_blocks):
+ layers.append(conv3x3(inplanes, planes, dilation))
+ if with_bn:
+ layers.append(nn.BatchNorm2d(planes))
+ layers.append(nn.ReLU(inplace=True))
+ inplanes = planes
+ layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))
+
+ return layers
+
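+# For example (illustrative), make_vgg_layer(3, 64, 2) returns
+# [conv3x3, ReLU, conv3x3, ReLU, MaxPool2d], i.e. num_blocks * 2 + 1
+# modules, which matches the `num_blocks * (2 + with_bn) + 1`
+# bookkeeping in VGG.__init__ below.
+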
+
+class VGG(nn.Module):
+ """VGG backbone.
+
+ Args:
+ depth (int): Depth of vgg, from {11, 13, 16, 19}.
+ with_bn (bool): Use BatchNorm or not.
+ num_classes (int): Number of classes for classification.
+ num_stages (int): VGG stages, normally 5.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+ not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
+ running stats (mean and var).
+ bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+ """
+
+ arch_settings = {
+ 11: (1, 1, 2, 2, 2),
+ 13: (2, 2, 2, 2, 2),
+ 16: (2, 2, 3, 3, 3),
+ 19: (2, 2, 4, 4, 4)
+ }
+
+ def __init__(self,
+ depth: int,
+ with_bn: bool = False,
+ num_classes: int = -1,
+ num_stages: int = 5,
+ dilations: Sequence[int] = (1, 1, 1, 1, 1),
+ out_indices: Sequence[int] = (0, 1, 2, 3, 4),
+ frozen_stages: int = -1,
+ bn_eval: bool = True,
+ bn_frozen: bool = False,
+ ceil_mode: bool = False,
+ with_last_pool: bool = True):
+ super().__init__()
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for vgg')
+ assert num_stages >= 1 and num_stages <= 5
+ stage_blocks = self.arch_settings[depth]
+ self.stage_blocks = stage_blocks[:num_stages]
+ assert len(dilations) == num_stages
+ assert max(out_indices) <= num_stages
+
+ self.num_classes = num_classes
+ self.out_indices = out_indices
+ self.frozen_stages = frozen_stages
+ self.bn_eval = bn_eval
+ self.bn_frozen = bn_frozen
+
+ self.inplanes = 3
+ start_idx = 0
+ vgg_layers = []
+ self.range_sub_modules = []
+ for i, num_blocks in enumerate(self.stage_blocks):
+ num_modules = num_blocks * (2 + with_bn) + 1
+ end_idx = start_idx + num_modules
+ dilation = dilations[i]
+ planes = 64 * 2**i if i < 4 else 512
+ vgg_layer = make_vgg_layer(
+ self.inplanes,
+ planes,
+ num_blocks,
+ dilation=dilation,
+ with_bn=with_bn,
+ ceil_mode=ceil_mode)
+ vgg_layers.extend(vgg_layer)
+ self.inplanes = planes
+ self.range_sub_modules.append([start_idx, end_idx])
+ start_idx = end_idx
+ if not with_last_pool:
+ vgg_layers.pop(-1)
+ self.range_sub_modules[-1][1] -= 1
+ self.module_name = 'features'
+ self.add_module(self.module_name, nn.Sequential(*vgg_layers))
+
+ if self.num_classes > 0:
+ self.classifier = nn.Sequential(
+ nn.Linear(512 * 7 * 7, 4096),
+ nn.ReLU(True),
+ nn.Dropout(),
+ nn.Linear(4096, 4096),
+ nn.ReLU(True),
+ nn.Dropout(),
+ nn.Linear(4096, num_classes),
+ )
+
+ def init_weights(self, pretrained: Optional[str] = None) -> None:
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ kaiming_init(m)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ elif isinstance(m, nn.Linear):
+ normal_init(m, std=0.01)
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]:
+ outs = []
+ vgg_layers = getattr(self, self.module_name)
+ for i in range(len(self.stage_blocks)):
+ for j in range(*self.range_sub_modules[i]):
+ vgg_layer = vgg_layers[j]
+ x = vgg_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ if self.num_classes > 0:
+ x = x.view(x.size(0), -1)
+ x = self.classifier(x)
+ outs.append(x)
+ if len(outs) == 1:
+ return outs[0]
+ else:
+ return tuple(outs)
+
+ def train(self, mode: bool = True) -> None:
+ super().train(mode)
+ if self.bn_eval:
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ if self.bn_frozen:
+ for params in m.parameters():
+ params.requires_grad = False
+ vgg_layers = getattr(self, self.module_name)
+ if mode and self.frozen_stages >= 0:
+ for i in range(self.frozen_stages):
+ for j in range(*self.range_sub_modules[i]):
+ mod = vgg_layers[j]
+ mod.eval()
+ for param in mod.parameters():
+ param.requires_grad = False
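+
+
+# Illustrative usage (a sketch; the checkpoint path is hypothetical):
+#
+#   model = VGG(depth=16, out_indices=(3, 4))
+#   model.init_weights()  # or model.init_weights('path/to/vgg16.pth')
+#   feats = model(torch.randn(1, 3, 224, 224))  # tuple of two feature maps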
diff --git a/external/cv/mmcv/image/__init__.py b/external/cv/mmcv/image/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f946caa2750b1da93e6b5b8086f954912e89c06e
--- /dev/null
+++ b/external/cv/mmcv/image/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr,
+ gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert,
+ rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb)
+from .geometric import (cutout, imcrop, imflip, imflip_, impad,
+ impad_to_multiple, imrescale, imresize, imresize_like,
+ imresize_to_multiple, imrotate, imshear, imtranslate,
+ rescale_size)
+from .io import imfrombytes, imread, imwrite, supported_backends, use_backend
+from .misc import tensor2imgs
+from .photometric import (adjust_brightness, adjust_color, adjust_contrast,
+ adjust_hue, adjust_lighting, adjust_sharpness,
+ auto_contrast, clahe, imdenormalize, imequalize,
+ iminvert, imnormalize, imnormalize_, lut_transform,
+ posterize, solarize)
+
+__all__ = [
+ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb',
+ 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale',
+ 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size',
+ 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate',
+ 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend',
+ 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize',
+ 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr',
+ 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize',
+ 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe',
+ 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting',
+ 'adjust_hue'
+]
diff --git a/external/cv/mmcv/image/colorspace.py b/external/cv/mmcv/image/colorspace.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ee1d2aab48d95782115245580876937e49cc6c0
--- /dev/null
+++ b/external/cv/mmcv/image/colorspace.py
@@ -0,0 +1,314 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Union
+
+import cv2
+import numpy as np
+
+
+def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray:
+ """Convert an image from the src colorspace to dst colorspace.
+
+ Args:
+ img (ndarray): The input image.
+ src (str): The source colorspace, e.g., 'rgb', 'hsv'.
+ dst (str): The destination colorspace, e.g., 'rgb', 'hsv'.
+
+ Returns:
+ ndarray: The converted image.
+ """
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+
+def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:
+ """Convert a BGR image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:
+ """Convert a RGB image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def gray2bgr(img: np.ndarray) -> np.ndarray:
+ """Convert a grayscale image to BGR image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted BGR image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+ return out_img
+
+
+def gray2rgb(img: np.ndarray) -> np.ndarray:
+ """Convert a grayscale image to RGB image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted RGB image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ return out_img
+
+
+def _convert_input_type_range(img: np.ndarray) -> np.ndarray:
+ """Convert the type and range of the input image.
+
+ It converts the input image to np.float32 type and range of [0, 1].
+ It is mainly used for pre-processing the input image in colorspace
+ conversion functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with type of np.float32 and range of
+ [0, 1].
+ """
+ img_type = img.dtype
+ img = img.astype(np.float32)
+ if img_type == np.float32:
+ pass
+ elif img_type == np.uint8:
+ img /= 255.
+ else:
+ raise TypeError('The img type should be np.float32 or np.uint8, '
+ f'but got {img_type}')
+ return img
+
+
+def _convert_output_type_range(
+ img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray:
+ """Convert the type and range of the image according to dst_type.
+
+ It converts the image to desired type and range. If `dst_type` is np.uint8,
+ images will be converted to np.uint8 type with range [0, 255]. If
+ `dst_type` is np.float32, it converts the image to np.float32 type with
+ range [0, 1].
+ It is mainly used for post-processing images in colorspace conversion
+ functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The image to be converted with np.float32 type and
+ range [0, 255].
+ dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it
+ converts the image to np.uint8 type with range [0, 255]. If
+ dst_type is np.float32, it converts the image to np.float32 type
+ with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with desired type and range.
+ """
+ if dst_type not in (np.uint8, np.float32):
+ raise TypeError('The dst_type should be np.float32 or np.uint8, '
+ f'but got {dst_type}')
+ if dst_type == np.uint8:
+ img = img.round()
+ else:
+ img /= 255.
+ return img.astype(dst_type)
+
+
+def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:
+ """Convert a RGB image to YCbCr image.
+
+ This function produces the same results as Matlab's `rgb2ycbcr` function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
+ [24.966, 112.0, -18.214]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
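+# Worked example (illustrative): for a float32 white pixel [1., 1., 1.],
+# y_only gives 65.481 + 128.553 + 24.966 + 16 = 235.0, which the output
+# conversion maps back to the float range as 235 / 255 ~= 0.922.
+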
+
+def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:
+ """Convert a BGR image to YCbCr image.
+
+ The bgr version of rgb2ycbcr.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2rgb(img: np.ndarray) -> np.ndarray:
+ """Convert a YCbCr image to RGB image.
+
+ This function produces the same results as Matlab's ycbcr2rgb function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted RGB image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0, -0.00153632, 0.00791071],
+ [0.00625893, -0.00318811, 0]]) * 255.0 + [
+ -222.921, 135.576, -276.836
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2bgr(img: np.ndarray) -> np.ndarray:
+ """Convert a YCbCr image to BGR image.
+
+ The bgr version of ycbcr2rgb.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted BGR image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0.00791071, -0.00153632, 0],
+ [0, -0.00318811, 0.00625893]]) * 255.0 + [
+ -276.836, 135.576, -222.921
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def convert_color_factory(src: str, dst: str) -> Callable:
+
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+
+ def convert_color(img: np.ndarray) -> np.ndarray:
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+ convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()}
+ image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted {dst.upper()} image.
+ """
+
+ return convert_color
+
+
+bgr2rgb = convert_color_factory('bgr', 'rgb')
+
+rgb2bgr = convert_color_factory('rgb', 'bgr')
+
+bgr2hsv = convert_color_factory('bgr', 'hsv')
+
+hsv2bgr = convert_color_factory('hsv', 'bgr')
+
+bgr2hls = convert_color_factory('bgr', 'hls')
+
+hls2bgr = convert_color_factory('hls', 'bgr')
diff --git a/external/cv/mmcv/image/geometric.py b/external/cv/mmcv/image/geometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fd1029e42148d4093b553bafa52616f186c55fa
--- /dev/null
+++ b/external/cv/mmcv/image/geometric.py
@@ -0,0 +1,793 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numbers
+from typing import List, Optional, Tuple, Union, no_type_check
+
+import cv2
+import numpy as np
+from mmengine.utils import to_2tuple
+
+from .io import imread_backend
+
+try:
+ from PIL import Image
+except ImportError:
+ Image = None
+
+
+def _scale_size(
+ size: Tuple[int, int],
+ scale: Union[float, int, Tuple[float, float], Tuple[int, int]],
+) -> Tuple[int, int]:
+ """Rescale a size by a ratio.
+
+ Args:
+ size (tuple[int]): (w, h).
+ scale (float | int | tuple(float) | tuple(int)): Scaling factor.
+
+ Returns:
+ tuple[int]: scaled size.
+ """
+ if isinstance(scale, (float, int)):
+ scale = (scale, scale)
+ w, h = size
+ return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
+
+
+cv2_interp_codes = {
+ 'nearest': cv2.INTER_NEAREST,
+ 'bilinear': cv2.INTER_LINEAR,
+ 'bicubic': cv2.INTER_CUBIC,
+ 'area': cv2.INTER_AREA,
+ 'lanczos': cv2.INTER_LANCZOS4
+}
+
+cv2_border_modes = {
+ 'constant': cv2.BORDER_CONSTANT,
+ 'replicate': cv2.BORDER_REPLICATE,
+ 'reflect': cv2.BORDER_REFLECT,
+ 'wrap': cv2.BORDER_WRAP,
+ 'reflect_101': cv2.BORDER_REFLECT_101,
+ 'transparent': cv2.BORDER_TRANSPARENT,
+ 'isolated': cv2.BORDER_ISOLATED
+}
+
+# Pillow >=v9.1.0 uses a slightly different naming scheme for filters.
+# Set pillow_interp_codes according to the naming scheme used.
+if Image is not None:
+ if hasattr(Image, 'Resampling'):
+ pillow_interp_codes = {
+ 'nearest': Image.Resampling.NEAREST,
+ 'bilinear': Image.Resampling.BILINEAR,
+ 'bicubic': Image.Resampling.BICUBIC,
+ 'box': Image.Resampling.BOX,
+ 'lanczos': Image.Resampling.LANCZOS,
+ 'hamming': Image.Resampling.HAMMING
+ }
+ else:
+ pillow_interp_codes = {
+ 'nearest': Image.NEAREST,
+ 'bilinear': Image.BILINEAR,
+ 'bicubic': Image.BICUBIC,
+ 'box': Image.BOX,
+ 'lanczos': Image.LANCZOS,
+ 'hamming': Image.HAMMING
+ }
+
+
+def imresize(
+ img: np.ndarray,
+ size: Tuple[int, int],
+ return_scale: bool = False,
+ interpolation: str = 'bilinear',
+ out: Optional[np.ndarray] = None,
+ backend: Optional[str] = None
+) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:
+ """Resize image to a given size.
+
+ Args:
+ img (ndarray): The input image.
+ size (tuple[int]): Target size (w, h).
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported for resize.'
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+ assert img.dtype == np.uint8, 'Pillow backend only supports uint8 type'
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+ resized_img = np.array(pil_image)
+ else:
+ resized_img = cv2.resize(
+ img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
+ if not return_scale:
+ return resized_img
+ else:
+ w_scale = size[0] / w
+ h_scale = size[1] / h
+ return resized_img, w_scale, h_scale
+
+
+@no_type_check
+def imresize_to_multiple(
+ img: np.ndarray,
+ divisor: Union[int, Tuple[int, int]],
+ size: Union[int, Tuple[int, int], None] = None,
+ scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int],
+ None] = None,
+ keep_ratio: bool = False,
+ return_scale: bool = False,
+ interpolation: str = 'bilinear',
+ out: Optional[np.ndarray] = None,
+ backend: Optional[str] = None
+) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:
+ """Resize image according to a given size or scale factor and then rounds
+ up the the resized or rescaled image size to the nearest value that can be
+ divided by the divisor.
+
+ Args:
+ img (ndarray): The input image.
+ divisor (int | tuple): Resized image size will be a multiple of
+ divisor. If divisor is a tuple, divisor should be
+ (w_divisor, h_divisor).
+ size (None | int | tuple[int]): Target size (w, h). Default: None.
+ scale_factor (None | float | int | tuple[float] | tuple[int]):
+ Multiplier for spatial size. Should match input size if it is a
+ tuple and the 2D style is (w_scale_factor, h_scale_factor).
+ Default: None.
+ keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+ image. Default: False.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if size is not None and scale_factor is not None:
+ raise ValueError('only one of size or scale_factor should be defined')
+ elif size is None and scale_factor is None:
+ raise ValueError('one of size or scale_factor should be defined')
+ elif size is not None:
+ size = to_2tuple(size)
+ if keep_ratio:
+ size = rescale_size((w, h), size, return_scale=False)
+ else:
+ size = _scale_size((w, h), scale_factor)
+
+ divisor = to_2tuple(divisor)
+ size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor))
+ resized_img, w_scale, h_scale = imresize(
+ img,
+ size,
+ return_scale=True,
+ interpolation=interpolation,
+ out=out,
+ backend=backend)
+ if return_scale:
+ return resized_img, w_scale, h_scale
+ else:
+ return resized_img
+
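+# For example (illustrative): imresize_to_multiple(img, 32, size=(100, 50))
+# first targets (w, h) = (100, 50), then rounds each side up to a
+# multiple of 32, giving an output of (128, 64).
+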
+
+def imresize_like(
+ img: np.ndarray,
+ dst_img: np.ndarray,
+ return_scale: bool = False,
+ interpolation: str = 'bilinear',
+ backend: Optional[str] = None
+) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:
+ """Resize image to the same size of a given image.
+
+ Args:
+ img (ndarray): The input image.
+ dst_img (ndarray): The target image.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = dst_img.shape[:2]
+ return imresize(img, (w, h), return_scale, interpolation, backend=backend)
+
+
+def rescale_size(old_size: tuple,
+ scale: Union[float, int, Tuple[int, int]],
+ return_scale: bool = False) -> tuple:
+ """Calculate the new size to be rescaled to.
+
+ Args:
+ old_size (tuple[int]): The old size (w, h) of image.
+ scale (float | int | tuple[int]): The scaling factor or maximum size.
+ If it is a float number or an integer, then the image will be
+ rescaled by this factor, else if it is a tuple of 2 integers, then
+ the image will be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image size.
+
+ Returns:
+ tuple[int]: The new rescaled image size.
+ """
+ w, h = old_size
+ if isinstance(scale, (float, int)):
+ if scale <= 0:
+ raise ValueError(f'Invalid scale {scale}, must be positive.')
+ scale_factor = scale
+ elif isinstance(scale, tuple):
+ max_long_edge = max(scale)
+ max_short_edge = min(scale)
+ scale_factor = min(max_long_edge / max(h, w),
+ max_short_edge / min(h, w))
+ else:
+ raise TypeError(
+ f'Scale must be a number or tuple of int, but got {type(scale)}')
+
+ new_size = _scale_size((w, h), scale_factor)
+
+ if return_scale:
+ return new_size, scale_factor
+ else:
+ return new_size
+
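+# For example (illustrative): rescale_size((1280, 720), (1333, 800),
+# return_scale=True) keeps the aspect ratio and returns ((1333, 750),
+# 1.0414...), since min(1333 / 1280, 800 / 720) ~= 1.0414.
+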
+
+def imrescale(
+ img: np.ndarray,
+ scale: Union[float, int, Tuple[int, int]],
+ return_scale: bool = False,
+ interpolation: str = 'bilinear',
+ backend: Optional[str] = None
+) -> Union[np.ndarray, Tuple[np.ndarray, float]]:
+ """Resize image while keeping the aspect ratio.
+
+ Args:
+ img (ndarray): The input image.
+ scale (float | int | tuple[int]): The scaling factor or maximum size.
+ If it is a float number or an integer, then the image will be
+ rescaled by this factor, else if it is a tuple of 2 integers, then
+ the image will be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The rescaled image.
+ """
+ h, w = img.shape[:2]
+ new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+ rescaled_img = imresize(
+ img, new_size, interpolation=interpolation, backend=backend)
+ if return_scale:
+ return rescaled_img, scale_factor
+ else:
+ return rescaled_img
+
+
+def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray:
+ """Flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image.
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return np.flip(img, axis=1)
+ elif direction == 'vertical':
+ return np.flip(img, axis=0)
+ else:
+ return np.flip(img, axis=(0, 1))
+
+
+def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray:
+ """Inplace flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image (inplace).
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return cv2.flip(img, 1, img)
+ elif direction == 'vertical':
+ return cv2.flip(img, 0, img)
+ else:
+ return cv2.flip(img, -1, img)
+
+
+def imrotate(img: np.ndarray,
+ angle: float,
+ center: Optional[Tuple[float, float]] = None,
+ scale: float = 1.0,
+ border_value: int = 0,
+ interpolation: str = 'bilinear',
+ auto_bound: bool = False,
+ border_mode: str = 'constant') -> np.ndarray:
+ """Rotate an image.
+
+ Args:
+ img (np.ndarray): Image to be rotated.
+ angle (float): Rotation angle in degrees, positive values mean
+ clockwise rotation.
+ center (tuple[float], optional): Center point (w, h) of the rotation in
+ the source image. If not specified, the center of the image will be
+ used.
+ scale (float): Isotropic scale factor.
+ border_value (int): Border value used in case of a constant border.
+ Defaults to 0.
+ interpolation (str): Same as :func:`resize`.
+ auto_bound (bool): Whether to adjust the image size to cover the whole
+ rotated image.
+ border_mode (str): Pixel extrapolation method. Defaults to 'constant'.
+
+ Returns:
+ np.ndarray: The rotated image.
+ """
+ if center is not None and auto_bound:
+ raise ValueError('`auto_bound` conflicts with `center`')
+ h, w = img.shape[:2]
+ if center is None:
+ center = ((w - 1) * 0.5, (h - 1) * 0.5)
+ assert isinstance(center, tuple)
+
+ matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+ if auto_bound:
+ cos = np.abs(matrix[0, 0])
+ sin = np.abs(matrix[0, 1])
+ new_w = h * sin + w * cos
+ new_h = h * cos + w * sin
+ matrix[0, 2] += (new_w - w) * 0.5
+ matrix[1, 2] += (new_h - h) * 0.5
+ w = int(np.round(new_w))
+ h = int(np.round(new_h))
+ rotated = cv2.warpAffine(
+ img,
+ matrix, (w, h),
+ flags=cv2_interp_codes[interpolation],
+ borderMode=cv2_border_modes[border_mode],
+ borderValue=border_value)
+ return rotated
+
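+# For example (illustrative): rotate 30 degrees clockwise about the image
+# center, enlarging the canvas so no corners are clipped.
+#
+#   rotated = imrotate(img, 30, auto_bound=True)
+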
+
+def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray:
+ """Clip bboxes to fit the image shape.
+
+ Args:
+ bboxes (ndarray): Shape (..., 4*k)
+ img_shape (tuple[int]): (height, width) of the image.
+
+ Returns:
+ ndarray: Clipped bboxes.
+ """
+ assert bboxes.shape[-1] % 4 == 0
+ cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
+ cmin[0::2] = img_shape[1] - 1
+ cmin[1::2] = img_shape[0] - 1
+ clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)
+ return clipped_bboxes
+
+
+def bbox_scaling(bboxes: np.ndarray,
+ scale: float,
+ clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray:
+ """Scaling bboxes w.r.t the box center.
+
+ Args:
+ bboxes (ndarray): Shape(..., 4).
+ scale (float): Scaling factor.
+ clip_shape (tuple[int], optional): If specified, bboxes that exceed the
+ boundary will be clipped according to the given shape (h, w).
+
+ Returns:
+ ndarray: Scaled bboxes.
+ """
+ if float(scale) == 1.0:
+ scaled_bboxes = bboxes.copy()
+ else:
+ w = bboxes[..., 2] - bboxes[..., 0] + 1
+ h = bboxes[..., 3] - bboxes[..., 1] + 1
+ dw = (w * (scale - 1)) * 0.5
+ dh = (h * (scale - 1)) * 0.5
+ scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)
+ if clip_shape is not None:
+ return bbox_clip(scaled_bboxes, clip_shape)
+ else:
+ return scaled_bboxes
+
+
+def imcrop(
+ img: np.ndarray,
+ bboxes: np.ndarray,
+ scale: float = 1.0,
+ pad_fill: Union[float, list, None] = None
+) -> Union[np.ndarray, List[np.ndarray]]:
+ """Crop image patches.
+
+ 3 steps: scale the bboxes -> clip bboxes -> crop and pad.
+
+ Args:
+ img (ndarray): Image to be cropped.
+ bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
+ scale (float, optional): Scale ratio of bboxes, the default value
+ 1.0 means no scaling.
+ pad_fill (Number | list[Number]): Value to be filled for padding.
+ Default: None, which means no padding.
+
+ Returns:
+ list[ndarray] | ndarray: The cropped image patches.
+ """
+ chn = 1 if img.ndim == 2 else img.shape[2]
+ if pad_fill is not None:
+ if isinstance(pad_fill, (int, float)):
+ pad_fill = [pad_fill for _ in range(chn)]
+ assert len(pad_fill) == chn
+
+ _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes
+ scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)
+ clipped_bbox = bbox_clip(scaled_bboxes, img.shape)
+
+ patches = []
+ for i in range(clipped_bbox.shape[0]):
+ x1, y1, x2, y2 = tuple(clipped_bbox[i, :])
+ if pad_fill is None:
+ patch = img[y1:y2 + 1, x1:x2 + 1, ...]
+ else:
+ _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])
+ patch_h = _y2 - _y1 + 1
+ patch_w = _x2 - _x1 + 1
+ if chn == 1:
+ patch_shape = (patch_h, patch_w)
+ else:
+ patch_shape = (patch_h, patch_w, chn) # type: ignore
+ patch = np.array(
+ pad_fill, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ x_start = 0 if _x1 >= 0 else -_x1
+ y_start = 0 if _y1 >= 0 else -_y1
+ w = x2 - x1 + 1
+ h = y2 - y1 + 1
+ patch[y_start:y_start + h, x_start:x_start + w,
+ ...] = img[y1:y1 + h, x1:x1 + w, ...]
+ patches.append(patch)
+
+ if bboxes.ndim == 1:
+ return patches[0]
+ else:
+ return patches
+
+
+def impad(img: np.ndarray,
+ *,
+ shape: Optional[Tuple[int, int]] = None,
+ padding: Union[int, tuple, None] = None,
+ pad_val: Union[float, List] = 0,
+ padding_mode: str = 'constant') -> np.ndarray:
+ """Pad the given image to a certain shape or pad on all sides with
+ specified padding mode and padding value.
+
+ Args:
+ img (ndarray): Image to be padded.
+ shape (tuple[int]): Expected padding shape (h, w). Default: None.
+ padding (int or tuple[int]): Padding on each border. If a single int is
+ provided this is used to pad all borders. If tuple of length 2 is
+ provided this is the padding on left/right and top/bottom
+ respectively. If a tuple of length 4 is provided this is the
+ padding for the left, top, right and bottom borders respectively.
+ Default: None. Note that `shape` and `padding` can not be both
+ set.
+ pad_val (Number | Sequence[Number]): Values to be filled in padding
+ areas when padding_mode is 'constant'. Default: 0.
+ padding_mode (str): Type of padding. Should be: constant, edge,
+ reflect or symmetric. Default: constant.
+
+ - constant: pads with a constant value, this value is specified
+ with pad_val.
+ - edge: pads with the last value at the edge of the image.
+ - reflect: pads with reflection of image without repeating the last
+ value on the edge. For example, padding [1, 2, 3, 4] with 2
+ elements on both sides in reflect mode will result in
+ [3, 2, 1, 2, 3, 4, 3, 2].
+ - symmetric: pads with reflection of image repeating the last value
+ on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+ both sides in symmetric mode will result in
+ [2, 1, 1, 2, 3, 4, 4, 3]
+
+ Returns:
+ ndarray: The padded image.
+ """
+
+ assert (shape is not None) ^ (padding is not None)
+ if shape is not None:
+ width = max(shape[1] - img.shape[1], 0)
+ height = max(shape[0] - img.shape[0], 0)
+ padding = (0, 0, width, height)
+
+ # check pad_val
+ if isinstance(pad_val, tuple):
+ assert len(pad_val) == img.shape[-1]
+ elif not isinstance(pad_val, numbers.Number):
+ raise TypeError('pad_val must be an int or a tuple. '
+ f'But received {type(pad_val)}')
+
+ # check padding
+ if isinstance(padding, tuple) and len(padding) in [2, 4]:
+ if len(padding) == 2:
+ padding = (padding[0], padding[1], padding[0], padding[1])
+ elif isinstance(padding, numbers.Number):
+ padding = (padding, padding, padding, padding)
+ else:
+ raise ValueError('Padding must be an int or a 2- or 4-element '
+ f'tuple. But received {padding}')
+
+ # check padding mode
+ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+ border_type = {
+ 'constant': cv2.BORDER_CONSTANT,
+ 'edge': cv2.BORDER_REPLICATE,
+ 'reflect': cv2.BORDER_REFLECT_101,
+ 'symmetric': cv2.BORDER_REFLECT
+ }
+ img = cv2.copyMakeBorder(
+ img,
+ padding[1],
+ padding[3],
+ padding[0],
+ padding[2],
+ border_type[padding_mode],
+ value=pad_val)
+
+ return img
+
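+# For example (illustrative): given a 10x12 (h x w) image,
+# impad(img, shape=(16, 16)) pads 6 rows at the bottom and 4 columns on
+# the right, while impad(img, padding=2, padding_mode='edge') replicates
+# a 2-pixel border on all four sides.
+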
+
+def impad_to_multiple(img: np.ndarray,
+ divisor: int,
+ pad_val: Union[float, List] = 0) -> np.ndarray:
+ """Pad an image to ensure each edge to be multiple to some number.
+
+ Args:
+ img (ndarray): Image to be padded.
+ divisor (int): Padded image edges will be a multiple of divisor.
+ pad_val (Number | Sequence[Number]): Same as :func:`impad`.
+
+ Returns:
+ ndarray: The padded image.
+ """
+ pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
+ pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
+ return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)
+
+
+def cutout(img: np.ndarray,
+ shape: Union[int, Tuple[int, int]],
+ pad_val: Union[int, float, tuple] = 0) -> np.ndarray:
+ """Randomly cut out a rectangle from the original img.
+
+ Args:
+ img (ndarray): Image to be cutout.
+ shape (int | tuple[int]): Expected cutout shape (h, w). If given as
+ an int, the value will be used for both h and w.
+ pad_val (int | float | tuple[int | float]): Values to be filled in the
+ cut area. Defaults to 0.
+
+ Returns:
+ ndarray: The cutout image.
+ """
+
+ channels = 1 if img.ndim == 2 else img.shape[2]
+ if isinstance(shape, int):
+ cut_h, cut_w = shape, shape
+ else:
+ assert isinstance(shape, tuple) and len(shape) == 2, \
+ f'shape must be an int or a tuple with length 2, but got type ' \
+ f'{type(shape)} instead.'
+ cut_h, cut_w = shape
+ if isinstance(pad_val, (int, float)):
+ pad_val = tuple([pad_val] * channels)
+ elif isinstance(pad_val, tuple):
+ assert len(pad_val) == channels, \
+ 'Expected the number of elements in pad_val to equal the ' \
+ 'channels of the input image. Found {} vs {}'.format(
+ len(pad_val), channels)
+ else:
+ raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')
+
+ img_h, img_w = img.shape[:2]
+ y0 = np.random.uniform(img_h)
+ x0 = np.random.uniform(img_w)
+
+ y1 = int(max(0, y0 - cut_h / 2.))
+ x1 = int(max(0, x0 - cut_w / 2.))
+ y2 = min(img_h, y1 + cut_h)
+ x2 = min(img_w, x1 + cut_w)
+
+ if img.ndim == 2:
+ patch_shape = (y2 - y1, x2 - x1)
+ else:
+ patch_shape = (y2 - y1, x2 - x1, channels) # type: ignore
+
+ img_cutout = img.copy()
+ patch = np.array(
+ pad_val, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ img_cutout[y1:y2, x1:x2, ...] = patch
+
+ return img_cutout
+
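+# For example (illustrative): cut a 20x20 square out of a random position
+# and fill it with mid-gray.
+#
+#   out = cutout(img, 20, pad_val=127)
+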
+
+def _get_shear_matrix(magnitude: Union[int, float],
+ direction: str = 'horizontal') -> np.ndarray:
+ """Generate the shear matrix for transformation.
+
+ Args:
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The flip direction, either "horizontal"
+ or "vertical".
+
+ Returns:
+ ndarray: The shear matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])
+ elif direction == 'vertical':
+ shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])
+ return shear_matrix
+
+
+def imshear(img: np.ndarray,
+ magnitude: Union[int, float],
+ direction: str = 'horizontal',
+ border_value: Union[int, Tuple[int, int]] = 0,
+ interpolation: str = 'bilinear') -> np.ndarray:
+ """Shear an image.
+
+ Args:
+ img (ndarray): Image to be sheared with format (h, w)
+ or (h, w, c).
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The flip direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The sheared image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels) # type: ignore
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+ 'Expected the number of elements in border_value to equal ' \
+ 'the channels of the input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`')
+ shear_matrix = _get_shear_matrix(magnitude, direction)
+ sheared = cv2.warpAffine(
+ img,
+ shear_matrix,
+ (width, height),
+ # Note: if `border_value` has more than 3 elements (e.g. when
+ # shearing masks with more than 3 channels), `cv2.warpAffine`
+ # raises a TypeError, so we simply slice the first 3 values.
+ borderValue=border_value[:3], # type: ignore
+ flags=cv2_interp_codes[interpolation])
+ return sheared
+
+
+def _get_translate_matrix(offset: Union[int, float],
+ direction: str = 'horizontal') -> np.ndarray:
+ """Generate the translate matrix.
+
+ Args:
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either
+ "horizontal" or "vertical".
+
+ Returns:
+ ndarray: The translate matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
+ elif direction == 'vertical':
+ translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
+ return translate_matrix
+
+
+def imtranslate(img: np.ndarray,
+ offset: Union[int, float],
+ direction: str = 'horizontal',
+ border_value: Union[int, tuple] = 0,
+ interpolation: str = 'bilinear') -> np.ndarray:
+ """Translate an image.
+
+ Args:
+ img (ndarray): Image to be translated with format
+ (h, w) or (h, w, c).
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The translated image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+ 'Expected the number of elements in border_value to equal ' \
+ 'the channels of the input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`.')
+ translate_matrix = _get_translate_matrix(offset, direction)
+ translated = cv2.warpAffine(
+ img,
+ translate_matrix,
+ (width, height),
+ # Note: if `border_value` has more than 3 elements (e.g. when
+ # translating masks with more than 3 channels), `cv2.warpAffine`
+ # raises a TypeError, so we simply slice the first 3 values.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return translated
diff --git a/external/cv/mmcv/image/io.py b/external/cv/mmcv/image/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..a69eb5c6940b96d6a839572100f3caf8f3375587
--- /dev/null
+++ b/external/cv/mmcv/image/io.py
@@ -0,0 +1,369 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import io
+import os.path as osp
+import warnings
+from pathlib import Path
+from typing import Optional, Union
+
+import cv2
+import mmengine.fileio as fileio
+import numpy as np
+from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
+ IMREAD_UNCHANGED)
+from mmengine.utils import is_filepath, is_str
+
+try:
+ from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
+except ImportError:
+ TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None
+
+try:
+ from PIL import Image, ImageOps
+except ImportError:
+ Image = None
+
+try:
+ import tifffile
+except ImportError:
+ tifffile = None
+
+jpeg = None
+supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']
+
+imread_flags = {
+ 'color': IMREAD_COLOR,
+ 'grayscale': IMREAD_GRAYSCALE,
+ 'unchanged': IMREAD_UNCHANGED,
+ 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
+ 'grayscale_ignore_orientation':
+ IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
+}
+
+imread_backend = 'cv2'
+
+
+def use_backend(backend: str) -> None:
+ """Select a backend for image decoding.
+
+ Args:
+ backend (str): The image decoding backend type. Options are `cv2`,
+ `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)
+            and `tifffile`. `turbojpeg` is faster but only supports the
+            `.jpeg` file format.
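+
+    Examples:
+        >>> # Illustrative sketch; each call assumes the corresponding
+        >>> # package (e.g. Pillow) is installed.
+        >>> use_backend('pillow')
+        >>> use_backend('cv2')  # restore the default backend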
+ """
+ assert backend in supported_backends
+ global imread_backend
+ imread_backend = backend
+ if imread_backend == 'turbojpeg':
+ if TurboJPEG is None:
+ raise ImportError('`PyTurboJPEG` is not installed')
+ global jpeg
+ if jpeg is None:
+ jpeg = TurboJPEG()
+ elif imread_backend == 'pillow':
+ if Image is None:
+ raise ImportError('`Pillow` is not installed')
+ elif imread_backend == 'tifffile':
+ if tifffile is None:
+ raise ImportError('`tifffile` is not installed')
+
+
+def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'):
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'color':
+ if channel_order == 'bgr':
+ return TJPF_BGR
+ elif channel_order == 'rgb':
+ return TJCS_RGB
+ elif flag == 'grayscale':
+ return TJPF_GRAY
+ else:
+ raise ValueError('flag must be "color" or "grayscale"')
+
+
+def _pillow2array(img,
+ flag: str = 'color',
+ channel_order: str = 'bgr') -> np.ndarray:
+ """Convert a pillow image to numpy array.
+
+ Args:
+ img (:obj:`PIL.Image.Image`): The image loaded using PIL
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are 'color', 'grayscale' and 'unchanged'.
+            Defaults to 'color'.
+        channel_order (str): The channel order of the output image array,
+            candidates are 'bgr' and 'rgb'. Defaults to 'bgr'.
+
+ Returns:
+ np.ndarray: The converted numpy array
+ """
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'unchanged':
+ array = np.array(img)
+ if array.ndim >= 3 and array.shape[2] >= 3: # color image
+ array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
+ else:
+ # Handle exif orientation tag
+ if flag in ['color', 'grayscale']:
+ img = ImageOps.exif_transpose(img)
+ # If the image mode is not 'RGB', convert it to 'RGB' first.
+ if img.mode != 'RGB':
+ if img.mode != 'LA':
+ # Most formats except 'LA' can be directly converted to RGB
+ img = img.convert('RGB')
+ else:
+ # When the mode is 'LA', the default conversion will fill in
+ # the canvas with black, which sometimes shadows black objects
+ # in the foreground.
+ #
+                # Therefore, a fixed color (124, 117, 104) is used for
+                # the canvas.
+ img_rgba = img.convert('RGBA')
+ img = Image.new('RGB', img_rgba.size, (124, 117, 104))
+ img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
+ if flag in ['color', 'color_ignore_orientation']:
+ array = np.array(img)
+ if channel_order != 'rgb':
+ array = array[:, :, ::-1] # RGB to BGR
+ elif flag in ['grayscale', 'grayscale_ignore_orientation']:
+ img = img.convert('L')
+ array = np.array(img)
+ else:
+ raise ValueError(
+ 'flag must be "color", "grayscale", "unchanged", '
+ f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
+ f' but got {flag}')
+ return array
+
+
+def imread(img_or_path: Union[np.ndarray, str, Path],
+ flag: str = 'color',
+ channel_order: str = 'bgr',
+ backend: Optional[str] = None,
+ file_client_args: Optional[dict] = None,
+ *,
+ backend_args: Optional[dict] = None) -> np.ndarray:
+ """Read an image.
+
+ Args:
+ img_or_path (ndarray or str or Path): Either a numpy array or str or
+ pathlib.Path. If it is a numpy array (loaded image), then
+ it will be returned as is.
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are `color`, `grayscale`, `unchanged`,
+ `color_ignore_orientation` and `grayscale_ignore_orientation`.
+ By default, `cv2` and `pillow` backend would rotate the image
+ according to its EXIF info unless called with `unchanged` or
+ `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
+ always ignore image's EXIF info regardless of the flag.
+ The `turbojpeg` backend only supports `color` and `grayscale`.
+ channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
+ If backend is None, the global imread_backend specified by
+ ``mmcv.use_backend()`` will be used. Default: None.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmengine.fileio.FileClient` for details.
+            Default: None. It will be deprecated in the future. Please use
+ ``backend_args`` instead.
+ Deprecated in version 2.0.0rc4.
+ backend_args (dict, optional): Instantiates the corresponding file
+ backend. It may contain `backend` key to specify the file
+ backend. If it contains, the file backend corresponding to this
+ value will be used and initialized with the remaining values,
+ otherwise the corresponding file backend will be selected
+ based on the prefix of the file path. Defaults to None.
+ New in version 2.0.0rc4.
+
+ Returns:
+ ndarray: Loaded image array.
+
+ Examples:
+ >>> import mmcv
+ >>> img_path = '/path/to/img.jpg'
+ >>> img = mmcv.imread(img_path)
+ >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb',
+ ... backend='cv2')
+ >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr',
+ ... backend='pillow')
+ >>> s3_img_path = 's3://bucket/img.jpg'
+ >>> # infer the file backend by the prefix s3
+ >>> img = mmcv.imread(s3_img_path)
+ >>> # manually set the file backend petrel
+ >>> img = mmcv.imread(s3_img_path, backend_args={
+ ... 'backend': 'petrel'})
+ >>> http_img_path = 'http://path/to/img.jpg'
+ >>> img = mmcv.imread(http_img_path)
+ >>> img = mmcv.imread(http_img_path, backend_args={
+ ... 'backend': 'http'})
+ """
+ if file_client_args is not None:
+ warnings.warn(
+ '"file_client_args" will be deprecated in future. '
+ 'Please use "backend_args" instead', DeprecationWarning)
+ if backend_args is not None:
+ raise ValueError(
+ '"file_client_args" and "backend_args" cannot be set at the '
+ 'same time.')
+
+ if isinstance(img_or_path, Path):
+ img_or_path = str(img_or_path)
+
+ if isinstance(img_or_path, np.ndarray):
+ return img_or_path
+ elif is_str(img_or_path):
+ if file_client_args is not None:
+ file_client = fileio.FileClient.infer_client(
+ file_client_args, img_or_path)
+ img_bytes = file_client.get(img_or_path)
+ else:
+ img_bytes = fileio.get(img_or_path, backend_args=backend_args)
+ return imfrombytes(img_bytes, flag, channel_order, backend)
+ else:
+ raise TypeError('"img" must be a numpy array or a str or '
+ 'a pathlib.Path object')
+
+
+def imfrombytes(content: bytes,
+ flag: str = 'color',
+ channel_order: str = 'bgr',
+ backend: Optional[str] = None) -> np.ndarray:
+ """Read an image from bytes.
+
+ Args:
+ content (bytes): Image bytes got from files or other streams.
+ flag (str): Same as :func:`imread`.
+ channel_order (str): The channel order of the output, candidates
+            are 'bgr' and 'rgb'. Defaults to 'bgr'.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is
+ None, the global imread_backend specified by ``mmcv.use_backend()``
+ will be used. Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+
+ Examples:
+ >>> img_path = '/path/to/img.jpg'
+ >>> with open(img_path, 'rb') as f:
+ >>> img_buff = f.read()
+ >>> img = mmcv.imfrombytes(img_buff)
+ >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb')
+ >>> img = mmcv.imfrombytes(img_buff, backend='pillow')
+ >>> img = mmcv.imfrombytes(img_buff, backend='cv2')
+ """
+
+ if backend is None:
+ backend = imread_backend
+ if backend not in supported_backends:
+ raise ValueError(
+ f'backend: {backend} is not supported. Supported '
+ "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
+ if backend == 'turbojpeg':
+ img = jpeg.decode( # type: ignore
+ content, _jpegflag(flag, channel_order))
+ if img.shape[-1] == 1:
+ img = img[:, :, 0]
+ return img
+ elif backend == 'pillow':
+ with io.BytesIO(content) as buff:
+ img = Image.open(buff)
+ img = _pillow2array(img, flag, channel_order)
+ return img
+ elif backend == 'tifffile':
+ with io.BytesIO(content) as buff:
+ img = tifffile.imread(buff)
+ return img
+ else:
+ img_np = np.frombuffer(content, np.uint8)
+ flag = imread_flags[flag] if is_str(flag) else flag
+ img = cv2.imdecode(img_np, flag)
+ if flag == IMREAD_COLOR and channel_order == 'rgb':
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+ return img
+
+
+def imwrite(img: np.ndarray,
+ file_path: str,
+ params: Optional[list] = None,
+ auto_mkdir: Optional[bool] = None,
+ file_client_args: Optional[dict] = None,
+ *,
+ backend_args: Optional[dict] = None) -> bool:
+ """Write image to file.
+
+ Warning:
+        The parameter `auto_mkdir` will be deprecated in the future and
+        every file client will make directories automatically.
+
+ Args:
+ img (ndarray): Image array to be written.
+ file_path (str): Image file path.
+ params (None or list): Same as opencv :func:`imwrite` interface.
+ auto_mkdir (bool): If the parent folder of `file_path` does not exist,
+ whether to create it automatically. It will be deprecated.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmengine.fileio.FileClient` for details.
+            Default: None. It will be deprecated in the future. Please use
+ ``backend_args`` instead.
+ Deprecated in version 2.0.0rc4.
+ backend_args (dict, optional): Instantiates the corresponding file
+ backend. It may contain `backend` key to specify the file
+ backend. If it contains, the file backend corresponding to this
+ value will be used and initialized with the remaining values,
+ otherwise the corresponding file backend will be selected
+ based on the prefix of the file path. Defaults to None.
+ New in version 2.0.0rc4.
+
+ Returns:
+ bool: Successful or not.
+
+ Examples:
+ >>> # write to hard disk client
+ >>> ret = mmcv.imwrite(img, '/path/to/img.jpg')
+ >>> # infer the file backend by the prefix s3
+ >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg')
+ >>> # manually set the file backend petrel
+ >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={
+ ... 'backend': 'petrel'})
+ """
+ if file_client_args is not None:
+ warnings.warn(
+ '"file_client_args" will be deprecated in future. '
+ 'Please use "backend_args" instead', DeprecationWarning)
+ if backend_args is not None:
+ raise ValueError(
+ '"file_client_args" and "backend_args" cannot be set at the '
+ 'same time.')
+
+ assert is_filepath(file_path)
+ file_path = str(file_path)
+ if auto_mkdir is not None:
+ warnings.warn(
+            'The parameter `auto_mkdir` will be deprecated in the future '
+            'and every file client will make directories automatically.')
+
+ img_ext = osp.splitext(file_path)[-1]
+ # Encode image according to image suffix.
+ # For example, if image path is '/path/your/img.jpg', the encode
+ # format is '.jpg'.
+ flag, img_buff = cv2.imencode(img_ext, img, params)
+
+ if file_client_args is not None:
+ file_client = fileio.FileClient.infer_client(file_client_args,
+ file_path)
+ file_client.put(img_buff.tobytes(), file_path)
+ else:
+ fileio.put(img_buff.tobytes(), file_path, backend_args=backend_args)
+
+ return flag
diff --git a/external/cv/mmcv/image/misc.py b/external/cv/mmcv/image/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c8d5bfb16028ed03cc60be344c5202390c29f83
--- /dev/null
+++ b/external/cv/mmcv/image/misc.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+
+import numpy as np
+
+import mmcv
+
+try:
+ import torch
+except ImportError:
+ torch = None
+
+
+def tensor2imgs(tensor,
+ mean: Optional[tuple] = None,
+ std: Optional[tuple] = None,
+ to_rgb: bool = True) -> list:
+ """Convert tensor to 3-channel images or 1-channel gray images.
+
+ Args:
+ tensor (torch.Tensor): Tensor that contains multiple images, shape (
+ N, C, H, W). :math:`C` can be either 3 or 1.
+        mean (tuple[float], optional): Mean of images. If None,
+            (0, 0, 0) will be used for 3-channel tensors and (0, )
+            for 1-channel tensors. Defaults to None.
+        std (tuple[float], optional): Standard deviation of images. If None,
+            (1, 1, 1) will be used for 3-channel tensors and (1, )
+            for 1-channel tensors. Defaults to None.
+ to_rgb (bool, optional): Whether the tensor was converted to RGB
+ format in the first place. If so, convert it back to BGR.
+ For the tensor with 1 channel, it must be False. Defaults to True.
+
+ Returns:
+ list[np.ndarray]: A list that contains multiple images.
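+
+    Examples:
+        >>> # Illustrative sketch with identity mean/std.
+        >>> import torch
+        >>> tensor = torch.rand(2, 3, 8, 8)
+        >>> imgs = tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1))
+        >>> len(imgs), imgs[0].shape
+        (2, (8, 8, 3))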
+ """
+
+ if torch is None:
+ raise RuntimeError('pytorch is not installed')
+ assert torch.is_tensor(tensor) and tensor.ndim == 4
+ channels = tensor.size(1)
+ assert channels in [1, 3]
+ if mean is None:
+ mean = (0, ) * channels
+ if std is None:
+ std = (1, ) * channels
+ assert (channels == len(mean) == len(std) == 3) or \
+ (channels == len(mean) == len(std) == 1 and not to_rgb)
+
+ num_imgs = tensor.size(0)
+ mean = np.array(mean, dtype=np.float32)
+ std = np.array(std, dtype=np.float32)
+ imgs = []
+ for img_id in range(num_imgs):
+ img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
+ img = mmcv.imdenormalize(
+ img, mean, std, to_bgr=to_rgb).astype(np.uint8)
+ imgs.append(np.ascontiguousarray(img))
+ return imgs
diff --git a/external/cv/mmcv/image/photometric.py b/external/cv/mmcv/image/photometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b8c231f1ef403f60918f8bdd0e980e4c9e3dcec
--- /dev/null
+++ b/external/cv/mmcv/image/photometric.py
@@ -0,0 +1,566 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from typing import Optional
+
+import cv2
+import numpy as np
+from mmengine.utils import is_tuple_of
+from PIL import Image, ImageEnhance
+
+from .colorspace import bgr2gray, gray2bgr
+from .io import imread_backend
+
+
+def imnormalize(img, mean, std, to_rgb=True):
+ """Normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalize.
+ std (ndarray): The std to be used for normalize.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
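+
+    Examples:
+        >>> # Illustrative sketch; the mean/std values below are the
+        >>> # commonly used ImageNet statistics, not a requirement.
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (4, 4, 3)).astype(np.float32)
+        >>> mean = np.array([123.675, 116.28, 103.53])
+        >>> std = np.array([58.395, 57.12, 57.375])
+        >>> out = imnormalize(img, mean, std)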
+ """
+ img = img.copy().astype(np.float32)
+ return imnormalize_(img, mean, std, to_rgb)
+
+
+def imnormalize_(img, mean, std, to_rgb=True):
+ """Inplace normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalize.
+ std (ndarray): The std to be used for normalize.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ # cv2 inplace normalization does not accept uint8
+ assert img.dtype != np.uint8
+ mean = np.float64(mean.reshape(1, -1))
+ stdinv = 1 / np.float64(std.reshape(1, -1))
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+ cv2.subtract(img, mean, img) # inplace
+ cv2.multiply(img, stdinv, img) # inplace
+ return img
+
+
+def imdenormalize(img, mean, std, to_bgr=True):
+    """Denormalize an image with mean and std; the inverse of
+    :func:`imnormalize`.
+
+    Args:
+        img (ndarray): Normalized image (must not be uint8).
+        mean (ndarray): The mean used during normalization.
+        std (ndarray): The std used during normalization.
+        to_bgr (bool): Whether to convert the result back to bgr.
+
+    Returns:
+        ndarray: The denormalized image.
+    """
+    assert img.dtype != np.uint8
+ mean = mean.reshape(1, -1).astype(np.float64)
+ std = std.reshape(1, -1).astype(np.float64)
+ img = cv2.multiply(img, std) # make a copy
+ cv2.add(img, mean, img) # inplace
+ if to_bgr:
+ cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace
+ return img
+
+
+def iminvert(img):
+ """Invert (negate) an image.
+
+ Args:
+ img (ndarray): Image to be inverted.
+
+ Returns:
+ ndarray: The inverted image.
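+
+    Examples:
+        >>> import numpy as np
+        >>> img = np.array([[0, 128, 255]], dtype=np.uint8)
+        >>> iminvert(img)
+        array([[255, 127,   0]], dtype=uint8)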
+ """
+ return np.full_like(img, 255) - img
+
+
+def solarize(img, thr=128):
+ """Solarize an image (invert all pixel values above a threshold)
+
+ Args:
+ img (ndarray): Image to be solarized.
+ thr (int): Threshold for solarizing (0 - 255).
+
+ Returns:
+ ndarray: The solarized image.
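+
+    Examples:
+        >>> # Pixels at or above the threshold are inverted.
+        >>> import numpy as np
+        >>> img = np.array([[100, 200]], dtype=np.uint8)
+        >>> solarize(img)
+        array([[100,  55]], dtype=uint8)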
+ """
+ img = np.where(img < thr, img, 255 - img)
+ return img
+
+
+def posterize(img, bits):
+ """Posterize an image (reduce the number of bits for each color channel)
+
+ Args:
+ img (ndarray): Image to be posterized.
+ bits (int): Number of bits (1 to 8) to use for posterizing.
+
+ Returns:
+ ndarray: The posterized image.
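+
+    Examples:
+        >>> # Keeping 2 bits quantizes values to multiples of 64.
+        >>> import numpy as np
+        >>> img = np.array([[91, 180]], dtype=np.uint8)
+        >>> posterize(img, 2)
+        array([[ 64, 128]], dtype=uint8)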
+ """
+ shift = 8 - bits
+ img = np.left_shift(np.right_shift(img, shift), shift)
+ return img
+
+
+def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None):
+ r"""It blends the source image and its gray image:
+
+ .. math::
+ output = img * alpha + gray\_img * beta + gamma
+
+ Args:
+ img (ndarray): The input source image.
+ alpha (int | float): Weight for the source image. Default 1.
+ beta (int | float): Weight for the converted gray image.
+ If None, it's assigned the value (1 - `alpha`).
+ gamma (int | float): Scalar added to each sum.
+ Same as :func:`cv2.addWeighted`. Default 0.
+ backend (str | None): The image processing backend type. Options are
+ `cv2`, `pillow`, `None`. If backend is None, the global
+ ``imread_backend`` specified by ``mmcv.use_backend()`` will be
+ used. Defaults to None.
+
+ Returns:
+ ndarray: Colored image which has the same size and dtype as input.
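+
+    Examples:
+        >>> # Illustrative sketch: alpha=0.5 blends halfway toward gray.
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (4, 4, 3)).astype(np.uint8)
+        >>> colored = adjust_color(img, alpha=0.5)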
+ """
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported.'
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+        assert img.dtype == np.uint8, \
+            'Pillow backend only supports uint8 type'
+        warnings.warn("Only 'alpha' is used for the pillow backend.")
+        # Image.fromarray expects RGB rather than BGR by default.
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB')
+ enhancer = ImageEnhance.Color(pil_image)
+ pil_image = enhancer.enhance(alpha)
+ return np.array(pil_image, dtype=img.dtype)[..., ::-1]
+ else:
+ gray_img = bgr2gray(img)
+ gray_img = np.tile(gray_img[..., None], [1, 1, 3])
+ if beta is None:
+ beta = 1 - alpha
+ colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)
+        if colored_img.dtype != np.uint8:
+ # Note when the dtype of `img` is not the default `np.uint8`
+ # (e.g. np.float32), the value in `colored_img` got from cv2
+ # is not guaranteed to be in range [0, 255], so here clip
+ # is needed.
+ colored_img = np.clip(colored_img, 0, 255)
+ return colored_img.astype(img.dtype)
+
+
+def imequalize(img):
+ """Equalize the image histogram.
+
+ This function applies a non-linear mapping to the input image,
+ in order to create a uniform distribution of grayscale values
+ in the output image.
+
+ Args:
+ img (ndarray): Image to be equalized.
+
+ Returns:
+ ndarray: The equalized image.
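+
+    Examples:
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (16, 16, 3)).astype(np.uint8)
+        >>> out = imequalize(img)
+        >>> out.shape
+        (16, 16, 3)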
+ """
+
+ def _scale_channel(im, c):
+ """Scale the data in the corresponding channel."""
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # For computing the step, filter out the nonzeros.
+ nonzero_histo = histo[histo > 0]
+ step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
+ if not step:
+ lut = np.array(range(256))
+ else:
+ # Compute the cumulative sum, shifted by step // 2
+ # and then normalized by step.
+ lut = (np.cumsum(histo) + (step // 2)) // step
+ # Shift lut, prepending with 0.
+ lut = np.concatenate([[0], lut[:-1]], 0)
+ # handle potential integer overflow
+ lut[lut > 255] = 255
+ # If step is zero, return the original image.
+ # Otherwise, index from lut.
+ return np.where(np.equal(step, 0), im, lut[im])
+
+ # Scales each channel independently and then stacks
+ # the result.
+ s1 = _scale_channel(img, 0)
+ s2 = _scale_channel(img, 1)
+ s3 = _scale_channel(img, 2)
+ equalized_img = np.stack([s1, s2, s3], axis=-1)
+ return equalized_img.astype(img.dtype)
+
+
+def adjust_brightness(img, factor=1., backend=None):
+ """Adjust image brightness.
+
+ This function controls the brightness of an image. An
+ enhancement factor of 0.0 gives a black image.
+ A factor of 1.0 gives the original image. This function
+ blends the source image and the degenerated black image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be brightened.
+ factor (float): A value controls the enhancement.
+ Factor 1.0 returns the original image, lower
+ factors mean less color (brightness, contrast,
+ etc), and higher values more. Default 1.
+ backend (str | None): The image processing backend type. Options are
+ `cv2`, `pillow`, `None`. If backend is None, the global
+ ``imread_backend`` specified by ``mmcv.use_backend()`` will be
+ used. Defaults to None.
+
+ Returns:
+ ndarray: The brightened image.
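+
+    Examples:
+        >>> # factor=0 yields a black image, factor=1 the original.
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (4, 4, 3)).astype(np.uint8)
+        >>> dark = adjust_brightness(img, 0.5)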
+ """
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported.'
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+        assert img.dtype == np.uint8, \
+            'Pillow backend only supports uint8 type'
+        # Image.fromarray expects RGB rather than BGR by default.
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB')
+ enhancer = ImageEnhance.Brightness(pil_image)
+ pil_image = enhancer.enhance(factor)
+ return np.array(pil_image, dtype=img.dtype)[..., ::-1]
+ else:
+ degenerated = np.zeros_like(img)
+ # Note manually convert the dtype to np.float32, to
+ # achieve as close results as PIL.ImageEnhance.Brightness.
+ # Set beta=1-factor, and gamma=0
+ brightened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ brightened_img = np.clip(brightened_img, 0, 255)
+ return brightened_img.astype(img.dtype)
+
+
+def adjust_contrast(img, factor=1., backend=None):
+ """Adjust image contrast.
+
+ This function controls the contrast of an image. An
+ enhancement factor of 0.0 gives a solid grey
+ image. A factor of 1.0 gives the original image. It
+ blends the source image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+ backend (str | None): The image processing backend type. Options are
+ `cv2`, `pillow`, `None`. If backend is None, the global
+ ``imread_backend`` specified by ``mmcv.use_backend()`` will be
+ used. Defaults to None.
+
+ Returns:
+ ndarray: The contrasted image.
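+
+    Examples:
+        >>> # factor=0 yields a solid mean-grey image.
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (4, 4, 3)).astype(np.uint8)
+        >>> grey = adjust_contrast(img, 0.)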
+ """
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported.'
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+        assert img.dtype == np.uint8, \
+            'Pillow backend only supports uint8 type'
+        # Image.fromarray expects RGB rather than BGR by default.
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB')
+ enhancer = ImageEnhance.Contrast(pil_image)
+ pil_image = enhancer.enhance(factor)
+ return np.array(pil_image, dtype=img.dtype)[..., ::-1]
+ else:
+ gray_img = bgr2gray(img)
+ hist = np.histogram(gray_img, 256, (0, 255))[0]
+ mean = round(np.sum(gray_img) / np.sum(hist))
+ degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
+ degenerated = gray2bgr(degenerated)
+ contrasted_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ contrasted_img = np.clip(contrasted_img, 0, 255)
+ return contrasted_img.astype(img.dtype)
+
+
+def auto_contrast(img, cutoff=0):
+ """Auto adjust image contrast.
+
+    This function maximizes (normalizes) image contrast by first removing cutoff
+ percent of the lightest and darkest pixels from the histogram and remapping
+ the image so that the darkest pixel becomes black (0), and the lightest
+ becomes white (255).
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ cutoff (int | float | tuple): The cutoff percent of the lightest and
+ darkest pixels to be removed. If given as tuple, it shall be
+ (low, high). Otherwise, the single value will be used for both.
+ Defaults to 0.
+
+ Returns:
+ ndarray: The contrasted image.
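+
+    Examples:
+        >>> # Clip 1% of the darkest and lightest pixels, then remap.
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (16, 16, 3)).astype(np.uint8)
+        >>> out = auto_contrast(img, cutoff=1)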
+ """
+
+ def _auto_contrast_channel(im, c, cutoff):
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # Remove cut-off percent pixels from histo
+ histo_sum = np.cumsum(histo)
+ cut_low = histo_sum[-1] * cutoff[0] // 100
+ cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
+ histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
+ histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)
+
+ # Compute mapping
+ low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]
+ # If all the values have been cut off, return the origin img
+ if low >= high:
+ return im
+ scale = 255.0 / (high - low)
+ offset = -low * scale
+ lut = np.array(range(256))
+ lut = lut * scale + offset
+ lut = np.clip(lut, 0, 255)
+ return lut[im]
+
+ if isinstance(cutoff, (int, float)):
+ cutoff = (cutoff, cutoff)
+ else:
+ assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
+ f'float or tuple, but got {type(cutoff)} instead.'
+ # Auto adjusts contrast for each channel independently and then stacks
+ # the result.
+ s1 = _auto_contrast_channel(img, 0, cutoff)
+ s2 = _auto_contrast_channel(img, 1, cutoff)
+ s3 = _auto_contrast_channel(img, 2, cutoff)
+ contrasted_img = np.stack([s1, s2, s3], axis=-1)
+ return contrasted_img.astype(img.dtype)
+
+
+def adjust_sharpness(img, factor=1., kernel=None):
+ """Adjust image sharpness.
+
+ This function controls the sharpness of an image. An
+ enhancement factor of 0.0 gives a blurred image. A
+ factor of 1.0 gives the original image. And a factor
+ of 2.0 gives a sharpened image. It blends the source
+ image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be sharpened. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+ kernel (np.ndarray, optional): Filter kernel to be applied on the img
+ to obtain the degenerated img. Defaults to None.
+
+ Note:
+ No value sanity check is enforced on the kernel set by users. So with
+ an inappropriate kernel, the ``adjust_sharpness`` may fail to perform
+ the function its name indicates but end up performing whatever
+ transform determined by the kernel.
+
+ Returns:
+ ndarray: The sharpened image.
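+
+    Examples:
+        >>> # factor=0 gives the smoothed image, factor=1 the original.
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (8, 8, 3)).astype(np.uint8)
+        >>> blurred = adjust_sharpness(img, 0.)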
+ """
+
+ if kernel is None:
+ # adopted from PIL.ImageFilter.SMOOTH
+ kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13
+ assert isinstance(kernel, np.ndarray), \
+ f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
+ assert kernel.ndim == 2, \
+ f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'
+
+ degenerated = cv2.filter2D(img, -1, kernel)
+ sharpened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ sharpened_img = np.clip(sharpened_img, 0, 255)
+ return sharpened_img.astype(img.dtype)
+
+
+def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
+ """AlexNet-style PCA jitter.
+
+    This data augmentation is proposed in *ImageNet Classification with
+    Deep Convolutional Neural Networks* (the AlexNet paper).
+
+ Args:
+ img (ndarray): Image to be adjusted lighting. BGR order.
+        eigval (ndarray): the eigenvalues of the covariance matrix of pixel
+            values.
+        eigvec (ndarray): the eigenvectors of the covariance matrix of pixel
+            values.
+ alphastd (float): The standard deviation for distribution of alpha.
+ Defaults to 0.1
+ to_rgb (bool): Whether to convert img to rgb.
+
+ Returns:
+ ndarray: The adjusted image.
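+
+    Examples:
+        >>> # Illustrative sketch; the eigenvalues/eigenvectors below are
+        >>> # the commonly used ImageNet PCA statistics (an assumption,
+        >>> # not a requirement of this function).
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (8, 8, 3)).astype(np.float32)
+        >>> eigval = np.array([0.2175, 0.0188, 0.0045])
+        >>> eigvec = np.array([[-0.5675,  0.7192,  0.4009],
+        ...                    [-0.5808, -0.0045, -0.8140],
+        ...                    [-0.5836, -0.6948,  0.4203]])
+        >>> out = adjust_lighting(img, eigval, eigvec)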
+ """
+ assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
+ f'eigval and eigvec should both be of type np.ndarray, got ' \
+ f'{type(eigval)} and {type(eigvec)} instead.'
+
+ assert eigval.ndim == 1 and eigvec.ndim == 2
+ assert eigvec.shape == (3, eigval.shape[0])
+ n_eigval = eigval.shape[0]
+ assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
+ f'got {type(alphastd)} instead.'
+
+ img = img.copy().astype(np.float32)
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+
+ alpha = np.random.normal(0, alphastd, n_eigval)
+ alter = eigvec \
+ * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \
+ * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))
+ alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)
+ img_adjusted = img + alter
+ return img_adjusted
+
+
+def lut_transform(img, lut_table):
+ """Transform array by look-up table.
+
+ The function lut_transform fills the output array with values from the
+ look-up table. Indices of the entries are taken from the input array.
+
+ Args:
+ img (ndarray): Image to be transformed.
+ lut_table (ndarray): look-up table of 256 elements; in case of
+ multi-channel input array, the table should either have a single
+ channel (in this case the same table is used for all channels) or
+ the same number of channels as in the input array.
+
+ Returns:
+ ndarray: The transformed image.
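+
+    Examples:
+        >>> # A reversed table maps each value v to 255 - v.
+        >>> import numpy as np
+        >>> lut = np.arange(255, -1, -1, dtype=np.uint8)
+        >>> img = np.array([[0, 128, 255]], dtype=np.uint8)
+        >>> lut_transform(img, lut)
+        array([[255, 127,   0]], dtype=uint8)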
+ """
+ assert isinstance(img, np.ndarray)
+ assert 0 <= np.min(img) and np.max(img) <= 255
+ assert isinstance(lut_table, np.ndarray)
+ assert lut_table.shape == (256, )
+
+ return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)
+
+
+def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
+ """Use CLAHE method to process the image.
+
+ See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
+ Graphics Gems, 1994:474-485.` for more information.
+
+ Args:
+ img (ndarray): Image to be processed.
+ clip_limit (float): Threshold for contrast limiting. Default: 40.0.
+ tile_grid_size (tuple[int]): Size of grid for histogram equalization.
+ Input image will be divided into equally sized rectangular tiles.
+ It defines the number of tiles in row and column. Default: (8, 8).
+
+ Returns:
+ ndarray: The processed image.
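+
+    Examples:
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (32, 32)).astype(np.uint8)
+        >>> out = clahe(img)
+        >>> out.shape
+        (32, 32)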
+ """
+ assert isinstance(img, np.ndarray)
+ assert img.ndim == 2
+ assert isinstance(clip_limit, (float, int))
+ assert is_tuple_of(tile_grid_size, int)
+ assert len(tile_grid_size) == 2
+
+ clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
+ return clahe.apply(np.array(img, dtype=np.uint8))
+
+
+def adjust_hue(img: np.ndarray,
+ hue_factor: float,
+ backend: Optional[str] = None) -> np.ndarray:
+ """Adjust hue of an image.
+
+ The image hue is adjusted by converting the image to HSV and cyclically
+ shifting the intensities in the hue channel (H). The image is then
+ converted back to original image mode.
+
+ `hue_factor` is the amount of shift in H channel and must be in the
+ interval `[-0.5, 0.5]`.
+
+ Modified from
+ https://github.com/pytorch/vision/blob/main/torchvision/
+ transforms/functional.py
+
+ Args:
+ img (ndarray): Image to be adjusted.
+ hue_factor (float): How much to shift the hue channel. Should be in
+ [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
+ HSV space in positive and negative direction respectively.
+ 0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+ with complementary colors while 0 gives the original image.
+ backend (str | None): The image processing backend type. Options are
+ `cv2`, `pillow`, `None`. If backend is None, the global
+ ``imread_backend`` specified by ``mmcv.use_backend()`` will be
+ used. Defaults to None.
+
+ Returns:
+ ndarray: Hue adjusted image.
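+
+    Examples:
+        >>> import numpy as np
+        >>> img = np.random.randint(0, 256, (4, 4, 3)).astype(np.uint8)
+        >>> adjust_hue(img, 0.25).shape  # quarter-turn hue shift
+        (4, 4, 3)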
+ """
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported.'
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if not (-0.5 <= hue_factor <= 0.5):
+ raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].')
+ if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})):
+ raise TypeError('img should be ndarray with dim=[2 or 3].')
+
+ if backend == 'pillow':
+        assert img.dtype == np.uint8, \
+            'Pillow backend only supports uint8 type'
+        # Image.fromarray expects RGB rather than BGR by default.
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB')
+ input_mode = pil_image.mode
+ if input_mode in {'L', '1', 'I', 'F'}:
+ return pil_image
+
+ h, s, v = pil_image.convert('HSV').split()
+
+ np_h = np.array(h, dtype=np.uint8)
+        # uint8 addition takes care of wrap-around across boundaries
+ with np.errstate(over='ignore'):
+ np_h += np.uint8(hue_factor * 255)
+ h = Image.fromarray(np_h, 'L')
+
+ pil_image = Image.merge('HSV', (h, s, v)).convert(input_mode)
+ return np.array(pil_image, dtype=img.dtype)[..., ::-1]
+ else:
+ dtype = img.dtype
+ img = img.astype(np.uint8)
+ hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
+ h, s, v = cv2.split(hsv_img)
+ h = h.astype(np.uint8)
+        # uint8 addition takes care of wrap-around across boundaries
+ with np.errstate(over='ignore'):
+ h += np.uint8(hue_factor * 255)
+ hsv_img = cv2.merge([h, s, v])
+ return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)
diff --git a/external/cv/mmcv/ops/__init__.py b/external/cv/mmcv/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a772151cff3454c03da2659d62f621151d03f88
--- /dev/null
+++ b/external/cv/mmcv/ops/__init__.py
@@ -0,0 +1,123 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from mmcv.utils import IS_MLU_AVAILABLE
+from .active_rotated_filter import active_rotated_filter
+from .assign_score_withk import assign_score_withk
+from .ball_query import ball_query
+from .bbox import bbox_overlaps
+from .bezier_align import BezierAlign, bezier_align
+from .bias_act import bias_act
+from .border_align import BorderAlign, border_align
+from .box_iou_quadri import box_iou_quadri
+from .box_iou_rotated import box_iou_rotated
+from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
+from .cc_attention import CrissCrossAttention
+from .chamfer_distance import chamfer_distance
+from .contour_expand import contour_expand
+from .conv2d_gradfix import conv2d, conv_transpose2d
+from .convex_iou import convex_giou, convex_iou
+from .corner_pool import CornerPool
+from .correlation import Correlation
+from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
+from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
+ ModulatedDeformRoIPoolPack, deform_roi_pool)
+from .deprecated_wrappers import Conv2d_deprecated as Conv2d
+from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
+from .deprecated_wrappers import Linear_deprecated as Linear
+from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
+from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d
+from .filtered_lrelu import filtered_lrelu
+from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
+ sigmoid_focal_loss, softmax_focal_loss)
+from .furthest_point_sample import (furthest_point_sample,
+ furthest_point_sample_with_dist)
+from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu
+from .gather_points import gather_points
+from .group_points import GroupAll, QueryAndGroup, grouping_operation
+from .info import get_compiler_version, get_compiling_cuda_version
+from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d,
+ nms3d_normal, nms_bev, nms_normal_bev)
+from .knn import knn
+from .masked_conv import MaskedConv2d, masked_conv2d
+from .min_area_polygons import min_area_polygons
+from .modulated_deform_conv import (ModulatedDeformConv2d,
+ ModulatedDeformConv2dPack,
+ modulated_deform_conv2d)
+from .multi_scale_deform_attn import MultiScaleDeformableAttention
+from .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms
+from .pixel_group import pixel_group
+from .point_sample import (SimpleRoIAlign, point_sample,
+ rel_roi_point_to_rel_img_point)
+from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
+ points_in_boxes_part)
+from .points_in_polygons import points_in_polygons
+from .points_sampler import PointsSampler
+from .prroi_pool import PrRoIPool, prroi_pool
+from .psa_mask import PSAMask
+from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
+from .roi_align import RoIAlign, roi_align
+from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
+from .roi_pool import RoIPool, roi_pool
+from .roiaware_pool3d import RoIAwarePool3d
+from .roipoint_pool3d import RoIPointPool3d
+from .rotated_feature_align import rotated_feature_align
+from .saconv import SAConv2d
+from .scatter_points import DynamicScatter, dynamic_scatter
+from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
+ SparseConvTranspose3d, SparseInverseConv2d,
+ SparseInverseConv3d, SubMConv2d, SubMConv3d)
+from .sparse_modules import SparseModule, SparseSequential
+from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d
+from .sparse_structure import SparseConvTensor, scatter_nd
+from .sync_bn import SyncBatchNorm
+from .three_interpolate import three_interpolate
+from .three_nn import three_nn
+from .tin_shift import TINShift, tin_shift
+from .upfirdn2d import filter2d, upfirdn2d, upsample2d
+from .voxelize import Voxelization, voxelization
+
+__all__ = [
+ 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
+ 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',
+ 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',
+ 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',
+ 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',
+ 'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d',
+ 'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
+ 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
+ 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',
+ 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
+ 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
+ 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
+ 'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated',
+ 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU',
+ 'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated',
+ 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated',
+ 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation',
+ 'contour_expand', 'three_nn', 'three_interpolate',
+ 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align',
+ 'gather_points', 'furthest_point_sample', 'nms_quadri',
+ 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
+ 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev',
+ 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization',
+ 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d',
+ 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d',
+ 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d',
+ 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d',
+ 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
+ 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
+ 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
+ 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
+ 'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d',
+ 'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align'
+]
+
+if IS_MLU_AVAILABLE:
+ from .deform_conv import DeformConv2dPack_MLU # noqa:F401
+ from .modulated_deform_conv import \
+ ModulatedDeformConv2dPack_MLU # noqa:F401
+ __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU'])
diff --git a/external/cv/mmcv/ops/active_rotated_filter.py b/external/cv/mmcv/ops/active_rotated_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..43e46ae50457892d2a6a8933649304e4b53a8a72
--- /dev/null
+++ b/external/cv/mmcv/ops/active_rotated_filter.py
@@ -0,0 +1,69 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext',
+ ['active_rotated_filter_forward', 'active_rotated_filter_backward'])
+
+
+class ActiveRotatedFilterFunction(Function):
+ """Encoding the orientation information and generating orientation-
+ sensitive features.
+
+    The details are described in the paper *Align Deep Features for
+    Oriented Object Detection*.
+ """
+
+ @staticmethod
+ def forward(ctx, input: torch.Tensor,
+ indices: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ input (torch.Tensor): Input features with shape
+ [num_output_planes, num_input_planes, num_orientations, H, W].
+ indices (torch.Tensor): Indices with shape
+ [num_orientations, H, W, num_rotations].
+
+ Returns:
+ torch.Tensor: Refined features with shape [num_output_planes *
+ num_rotations, num_input_planes * num_orientations, H, W].
+ """
+ ctx.save_for_backward(input, indices)
+ op, ip, o, h, w = input.size()
+ o, h, w, r = indices.size()
+ output = input.new_zeros((op * r, ip * o, h, w))
+ ext_module.active_rotated_filter_forward(input, indices, output)
+
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:
+ """
+ Args:
+            grad_out (torch.Tensor): The gradient of output features
+ with shape [num_output_planes * num_rotations,
+ num_input_planes * num_orientations, H, W].
+
+ Returns:
+ torch.Tensor: The gradient of input features with shape
+ [num_output_planes, num_input_planes, num_orientations, H, W].
+ """
+ input, indices = ctx.saved_tensors
+ grad_in = torch.zeros_like(input)
+ ext_module.active_rotated_filter_backward(grad_out, indices, grad_in)
+ return grad_in, None
+
+
+active_rotated_filter = ActiveRotatedFilterFunction.apply
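+
+# Illustrative usage sketch (not executed here): assumes the compiled
+# `_ext` op and a CUDA device; shapes follow the forward() docstring.
+#   input = torch.randn(32, 16, 8, 3, 3).cuda()
+#   indices = torch.randint(0, 8, (8, 3, 3, 4), dtype=torch.int).cuda()
+#   out = active_rotated_filter(input, indices)  # (128, 128, 3, 3)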
diff --git a/external/cv/mmcv/ops/assign_score_withk.py b/external/cv/mmcv/ops/assign_score_withk.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f18f5ec9918fd14e60d3efa6666c01ccb7a868c
--- /dev/null
+++ b/external/cv/mmcv/ops/assign_score_withk.py
@@ -0,0 +1,137 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
+
+
+class AssignScoreWithK(Function):
+ r"""Perform weighted sum to generate output features according to scores.
+    Modified from *PAConv*.
+
+    This is a memory-efficient CUDA implementation of the assign_scores
+    operation, which first transforms all point features with the weight
+    bank, then assembles neighbor features with ``knn_idx`` and performs a
+    weighted sum using ``scores``.
+
+    See the paper's appendix Sec. D for more detailed descriptions.
+
+ Note:
+ This implementation assumes using ``neighbor`` kernel input, which is
+ (point_features - center_features, point_features).
+ See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
+ pointnet2/paconv.py#L128 for more details.
+ """
+
+ @staticmethod
+ def forward(ctx,
+ scores: torch.Tensor,
+ point_features: torch.Tensor,
+ center_features: torch.Tensor,
+ knn_idx: torch.Tensor,
+ aggregate: str = 'sum') -> torch.Tensor:
+ """
+ Args:
+ scores (torch.Tensor): (B, npoint, K, M), predicted scores to
+ aggregate weight matrices in the weight bank.
+ ``npoint`` is the number of sampled centers.
+ ``K`` is the number of queried neighbors.
+ ``M`` is the number of weight matrices in the weight bank.
+ point_features (torch.Tensor): (B, N, M, out_dim)
+ Pre-computed point features to be aggregated.
+ center_features (torch.Tensor): (B, N, M, out_dim)
+ Pre-computed center features to be aggregated.
+ knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
+ We assume the first idx in each row is the idx of the center.
+ aggregate (str, optional): Aggregation method.
+ Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.
+
+ Returns:
+ torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
+ """
+ agg = {'sum': 0, 'avg': 1, 'max': 2}
+
+ B, N, M, out_dim = point_features.size()
+ _, npoint, K, _ = scores.size()
+
+ output = point_features.new_zeros((B, out_dim, npoint, K))
+ ext_module.assign_score_withk_forward(
+ point_features.contiguous(),
+ center_features.contiguous(),
+ scores.contiguous(),
+ knn_idx.contiguous(),
+ output,
+ B=B,
+ N0=N,
+ N1=npoint,
+ M=M,
+ K=K,
+ O=out_dim,
+ aggregate=agg[aggregate])
+
+ ctx.save_for_backward(output, point_features, center_features, scores,
+ knn_idx)
+ ctx.agg = agg[aggregate]
+
+ return output
+
+ @staticmethod
+ def backward(
+ ctx, grad_out: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:
+ """
+ Args:
+ grad_out (torch.Tensor): (B, out_dim, npoint, K)
+
+ Returns:
+ tuple[torch.Tensor]: A tuple contains five elements. The first one
+ is the gradient of ``scores`` whose shape is (B, npoint, K, M). The
+ second is the gradient of ``point_features`` whose shape is
+ (B, N, M, out_dim). The third is the gradient of
+ ``center_features`` with the shape of (B, N, M, out_dim). The last
+ two are ``None``.
+ """
+ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors
+
+ agg = ctx.agg
+
+ B, N, M, out_dim = point_features.size()
+ _, npoint, K, _ = scores.size()
+
+ grad_point_features = point_features.new_zeros(point_features.shape)
+ grad_center_features = center_features.new_zeros(center_features.shape)
+ grad_scores = scores.new_zeros(scores.shape)
+
+ ext_module.assign_score_withk_backward(
+ grad_out.contiguous(),
+ point_features.contiguous(),
+ center_features.contiguous(),
+ scores.contiguous(),
+ knn_idx.contiguous(),
+ grad_point_features,
+ grad_center_features,
+ grad_scores,
+ B=B,
+ N0=N,
+ N1=npoint,
+ M=M,
+ K=K,
+ O=out_dim,
+ aggregate=agg)
+
+ return grad_scores, grad_point_features, \
+ grad_center_features, None, None
+
+
+assign_score_withk = AssignScoreWithK.apply
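+
+# Illustrative usage sketch (not executed here): assumes the compiled
+# `_ext` op and a CUDA device; shapes follow the forward() docstring.
+#   scores = torch.rand(2, 64, 16, 8).cuda()          # (B, npoint, K, M)
+#   point_feats = torch.rand(2, 1024, 8, 32).cuda()   # (B, N, M, out_dim)
+#   center_feats = torch.rand(2, 1024, 8, 32).cuda()  # (B, N, M, out_dim)
+#   knn_idx = torch.randint(0, 1024, (2, 64, 16)).cuda()
+#   out = assign_score_withk(scores, point_feats, center_feats, knn_idx)
+#   # out: (2, 32, 64, 16), i.e. (B, out_dim, npoint, K)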
diff --git a/external/cv/mmcv/ops/ball_query.py b/external/cv/mmcv/ops/ball_query.py
new file mode 100644
index 0000000000000000000000000000000000000000..f26c36b82f6eed6b6c61306eeba61c4f91c609b4
--- /dev/null
+++ b/external/cv/mmcv/ops/ball_query.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple
+
+import torch
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['ball_query_forward', 'stack_ball_query_forward'])
+
+
+class BallQuery(Function):
+ """Find nearby points in spherical space."""
+
+ @staticmethod
+ def forward(
+ ctx,
+ min_radius: float,
+ max_radius: float,
+ sample_num: int,
+ xyz: torch.Tensor,
+ center_xyz: torch.Tensor,
+ xyz_batch_cnt: Optional[torch.Tensor] = None,
+ center_xyz_batch_cnt: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """
+ Args:
+ min_radius (float): minimum radius of the balls.
+ max_radius (float): maximum radius of the balls.
+ sample_num (int): maximum number of features in the balls.
+            xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features,
+                or stacked input (N1 + N2 ..., 3).
+            center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
+                query, or stacked input (M1 + M2 ..., 3).
+            xyz_batch_cnt (batch_size): Stacked input xyz coordinate counts
+                per batch, like (N1, N2, ...). Defaults to None.
+                New in version 1.7.0.
+            center_xyz_batch_cnt (batch_size): Stacked center coordinate
+                counts per batch, like (M1, M2, ...). Defaults to None.
+                New in version 1.7.0.
+
+ Returns:
+ torch.Tensor: (B, npoint, nsample) tensor with the indices of the
+ features that form the query balls.
+ """
+ assert center_xyz.is_contiguous()
+ assert xyz.is_contiguous()
+ assert min_radius < max_radius
+ if xyz_batch_cnt is not None and center_xyz_batch_cnt is not None:
+ assert xyz_batch_cnt.dtype == torch.int
+ assert center_xyz_batch_cnt.dtype == torch.int
+ idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num),
+ dtype=torch.int32)
+ ext_module.stack_ball_query_forward(
+ center_xyz,
+ center_xyz_batch_cnt,
+ xyz,
+ xyz_batch_cnt,
+ idx,
+ max_radius=max_radius,
+ nsample=sample_num,
+ )
+ else:
+ B, N, _ = xyz.size()
+ npoint = center_xyz.size(1)
+ idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int32)
+ ext_module.ball_query_forward(
+ center_xyz,
+ xyz,
+ idx,
+ b=B,
+ n=N,
+ m=npoint,
+ min_radius=min_radius,
+ max_radius=max_radius,
+ nsample=sample_num)
+ if torch.__version__ != 'parrots':
+ ctx.mark_non_differentiable(idx)
+ return idx
+
+ @staticmethod
+ def backward(ctx, a=None) -> Tuple[None, None, None, None]:
+ return None, None, None, None
+
+
+ball_query = BallQuery.apply
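+
+# Illustrative usage sketch (not executed here): assumes the compiled
+# `_ext` op and a CUDA device.
+#   xyz = torch.rand(2, 1024, 3).cuda()      # (B, N, 3) point coordinates
+#   centers = torch.rand(2, 128, 3).cuda()   # (B, npoint, 3) ball centers
+#   idx = ball_query(0.0, 0.8, 16, xyz, centers)  # (2, 128, 16), int32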
diff --git a/external/cv/mmcv/ops/bbox.py b/external/cv/mmcv/ops/bbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6ea5b96bebec84f9e6167278f60a9add3178156
--- /dev/null
+++ b/external/cv/mmcv/ops/bbox.py
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
+
+
+def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False,
+ offset: int = 0) -> torch.Tensor:
+ assert mode in ['iou', 'iof']
+
+ if aligned:
+ lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2]
+ rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2]
+
+ wh = (rb - lt + offset).clamp(min=0) # [rows, 2]
+ overlap = wh[:, 0] * wh[:, 1]
+ area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+ bboxes1[:, 3] - bboxes1[:, 1] + offset)
+
+ if mode == 'iou':
+ area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+ bboxes2[:, 3] - bboxes2[:, 1] + offset)
+ ious = overlap / (area1 + area2 - overlap)
+ else:
+ ious = overlap / area1
+ else:
+ lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2]
+ rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2]
+
+ wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2]
+ overlap = wh[:, :, 0] * wh[:, :, 1]
+ area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+ bboxes1[:, 3] - bboxes1[:, 1] + offset)
+
+ if mode == 'iou':
+ area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+ bboxes2[:, 3] - bboxes2[:, 1] + offset)
+ ious = overlap / (area1[:, None] + area2 - overlap)
+ else:
+ ious = overlap / (area1[:, None])
+
+ return ious
+
+
+def bbox_overlaps(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False,
+ offset: int = 0) -> torch.Tensor:
+ """Calculate overlap between two set of bboxes.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ Args:
+        bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
+            empty.
+        bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
+            empty. If aligned is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union) or "iof" (intersection
+            over foreground).
+
+ Returns:
+        torch.Tensor: The ious between the boxes. If ``aligned`` is
+ ``False``, the shape of ious is (m, n) else (m, 1).
+
+ Example:
+ >>> bboxes1 = torch.FloatTensor([
+ >>> [0, 0, 10, 10],
+ >>> [10, 10, 20, 20],
+ >>> [32, 32, 38, 42],
+ >>> ])
+ >>> bboxes2 = torch.FloatTensor([
+ >>> [0, 0, 10, 20],
+ >>> [0, 10, 10, 19],
+ >>> [10, 10, 20, 20],
+ >>> ])
+ >>> bbox_overlaps(bboxes1, bboxes2)
+ tensor([[0.5000, 0.0000, 0.0000],
+ [0.0000, 0.0000, 1.0000],
+ [0.0000, 0.0000, 0.0000]])
+
+ Example:
+ >>> empty = torch.FloatTensor([])
+ >>> nonempty = torch.FloatTensor([
+ >>> [0, 0, 10, 9],
+ >>> ])
+ >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+ >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+ >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+ """
+
+ mode_dict = {'iou': 0, 'iof': 1}
+ assert mode in mode_dict.keys()
+ mode_flag = mode_dict[mode]
+ # Either the boxes are empty or the length of boxes' last dimension is 4
+ assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+ assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+ assert offset == 1 or offset == 0
+
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+
+ if aligned:
+ assert rows == cols
+ ious = bboxes1.new_zeros(rows)
+ else:
+ ious = bboxes1.new_zeros((rows, cols))
+
+ if rows * cols == 0:
+ return ious
+
+ if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots':
+ return _bbox_overlaps_cpu(
+ bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)
+
+ ext_module.bbox_overlaps(
+ bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
+
+ return ious
diff --git a/external/cv/mmcv/ops/bezier_align.py b/external/cv/mmcv/ops/bezier_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..e53d0fd2bcc53525e1482d167d6dffb831b2941a
--- /dev/null
+++ b/external/cv/mmcv/ops/bezier_align.py
@@ -0,0 +1,142 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['bezier_align_forward', 'bezier_align_backward'])
+
+
+class BezierAlignFunction(Function):
+
+ @staticmethod
+ def forward(ctx,
+ input: torch.Tensor,
+ beziers: torch.Tensor,
+ output_size: Union[int, Tuple[int, int]],
+ spatial_scale: Union[int, float] = 1.0,
+ sampling_ratio: int = 0,
+ aligned: bool = True) -> torch.Tensor:
+ ctx.output_size = _pair(output_size)
+ ctx.spatial_scale = spatial_scale
+ ctx.input_shape = input.size()
+ ctx.sampling_ratio = sampling_ratio
+ ctx.aligned = aligned
+
+ assert beziers.size(1) == 17
+ output_shape = (beziers.size(0), input.size(1), ctx.output_size[0],
+ ctx.output_size[1])
+ output = input.new_zeros(output_shape)
+ ext_module.bezier_align_forward(
+ input,
+ beziers,
+ output,
+ aligned_height=ctx.output_size[0],
+ aligned_width=ctx.output_size[1],
+ spatial_scale=ctx.spatial_scale,
+ sampling_ratio=ctx.sampling_ratio,
+ aligned=ctx.aligned)
+
+ ctx.save_for_backward(beziers)
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_output: torch.Tensor):
+ beziers = ctx.saved_tensors[0]
+ grad_input = grad_output.new_zeros(ctx.input_shape)
+ grad_output = grad_output.contiguous()
+ ext_module.bezier_align_backward(
+ grad_output,
+ beziers,
+ grad_input,
+ aligned_height=ctx.output_size[0],
+ aligned_width=ctx.output_size[1],
+ spatial_scale=ctx.spatial_scale,
+ sampling_ratio=ctx.sampling_ratio,
+ aligned=ctx.aligned)
+ return grad_input, None, None, None, None, None
+
+
+bezier_align = BezierAlignFunction.apply
+
+
+class BezierAlign(nn.Module):
+ """Bezier align pooling layer.
+
+ Args:
+ output_size (tuple): h, w
+ spatial_scale (float): scale the input boxes by this number
+        sampling_ratio (int): number of input samples to take for each
+            output sample. 0 to take samples densely for current models.
+        aligned (bool): if False, use the legacy implementation in
+            MMDetection. If True, align the results more precisely.
+
+ Note:
+ The implementation of BezierAlign is modified from
+ https://github.com/aim-uofa/AdelaiDet
+
+ The meaning of aligned=True:
+
+ Given a continuous coordinate c, its two neighboring pixel
+ indices (in our pixel model) are computed by floor(c - 0.5) and
+ ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
+ indices [0] and [1] (which are sampled from the underlying signal
+ at continuous coordinates 0.5 and 1.5). But the original roi_align
+ (aligned=False) does not subtract the 0.5 when computing
+ neighboring pixel indices and therefore it uses pixels with a
+ slightly incorrect alignment (relative to our pixel model) when
+ performing bilinear interpolation.
+
+ With `aligned=True`,
+ we first appropriately scale the ROI and then shift it by -0.5
+ prior to calling roi_align. This produces the correct neighbors;
+
+        The difference does not affect the model's
+        performance if ROIAlign is used together with conv layers.
+ """
+
+ def __init__(
+ self,
+ output_size: Tuple,
+ spatial_scale: Union[int, float],
+ sampling_ratio: int,
+ aligned: bool = True,
+ ) -> None:
+ super().__init__()
+
+ self.output_size = _pair(output_size)
+ self.spatial_scale = float(spatial_scale)
+ self.sampling_ratio = int(sampling_ratio)
+ self.aligned = aligned
+
+ def forward(self, input: torch.Tensor,
+ beziers: torch.Tensor) -> torch.Tensor:
+ """BezierAlign forward.
+
+ Args:
+            input (Tensor): input features.
+ beziers (Tensor): beziers for align.
+ """
+ return bezier_align(input, beziers, self.output_size,
+ self.spatial_scale, self.sampling_ratio,
+ self.aligned)
+
+ def __repr__(self):
+ s = self.__class__.__name__
+        s += f'(output_size={self.output_size}, '
+        s += f'spatial_scale={self.spatial_scale}, '
+        s += f'sampling_ratio={self.sampling_ratio}, '
+        s += f'aligned={self.aligned})'
+ return s
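+
+
+# Example (illustrative sketch; assumes the compiled `_ext` extension is
+# available and, following the AdelaiDet convention, that each bezier row
+# packs a batch index followed by 16 control-point coordinates):
+#
+#     >>> layer = BezierAlign((7, 7), spatial_scale=1.0, sampling_ratio=0)
+#     >>> feats = torch.randn(1, 8, 32, 32)
+#     >>> beziers = torch.zeros(4, 17)  # 4 regions, all on image 0
+#     >>> layer(feats, beziers).shape
+#     torch.Size([4, 8, 7, 7])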
diff --git a/external/cv/mmcv/ops/bias_act.py b/external/cv/mmcv/ops/bias_act.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ed5f3d4c60a8c11a0662d278f1cf8bab7519159
--- /dev/null
+++ b/external/cv/mmcv/ops/bias_act.py
@@ -0,0 +1,381 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Modified from
+# https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py
+
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa
+"""Custom PyTorch ops for efficient bias and activation."""
+
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['bias_act'])
+
+
+class EasyDict(dict):
+ """Convenience class that behaves like a dict but allows access with the
+ attribute syntax."""
+
+ def __getattr__(self, name: str) -> Any:
+ try:
+ return self[name]
+ except KeyError:
+ raise AttributeError(name)
+
+ def __setattr__(self, name: str, value: Any) -> None:
+ self[name] = value
+
+ def __delattr__(self, name: str) -> None:
+ del self[name]
+
+
+activation_funcs = {
+ 'linear':
+ EasyDict(
+ func=lambda x, **_: x,
+ def_alpha=0,
+ def_gain=1,
+ cuda_idx=1,
+ ref='',
+ has_2nd_grad=False),
+ 'relu':
+ EasyDict(
+ func=lambda x, **_: torch.nn.functional.relu(x),
+ def_alpha=0,
+ def_gain=np.sqrt(2),
+ cuda_idx=2,
+ ref='y',
+ has_2nd_grad=False),
+ 'lrelu':
+ EasyDict(
+ func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),
+ def_alpha=0.2,
+ def_gain=np.sqrt(2),
+ cuda_idx=3,
+ ref='y',
+ has_2nd_grad=False),
+ 'tanh':
+ EasyDict(
+ func=lambda x, **_: torch.tanh(x),
+ def_alpha=0,
+ def_gain=1,
+ cuda_idx=4,
+ ref='y',
+ has_2nd_grad=True),
+ 'sigmoid':
+ EasyDict(
+ func=lambda x, **_: torch.sigmoid(x),
+ def_alpha=0,
+ def_gain=1,
+ cuda_idx=5,
+ ref='y',
+ has_2nd_grad=True),
+ 'elu':
+ EasyDict(
+ func=lambda x, **_: torch.nn.functional.elu(x),
+ def_alpha=0,
+ def_gain=1,
+ cuda_idx=6,
+ ref='y',
+ has_2nd_grad=True),
+ 'selu':
+ EasyDict(
+ func=lambda x, **_: torch.nn.functional.selu(x),
+ def_alpha=0,
+ def_gain=1,
+ cuda_idx=7,
+ ref='y',
+ has_2nd_grad=True),
+ 'softplus':
+ EasyDict(
+ func=lambda x, **_: torch.nn.functional.softplus(x),
+ def_alpha=0,
+ def_gain=1,
+ cuda_idx=8,
+ ref='y',
+ has_2nd_grad=True),
+ 'swish':
+ EasyDict(
+ func=lambda x, **_: torch.sigmoid(x) * x,
+ def_alpha=0,
+ def_gain=np.sqrt(2),
+ cuda_idx=9,
+ ref='x',
+ has_2nd_grad=True),
+}
+
+_null_tensor = torch.empty([0])
+
+
+def bias_act(input: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ dim: int = 1,
+ act: str = 'linear',
+ alpha: Optional[Union[float, int]] = None,
+ gain: Optional[float] = None,
+ clamp: Optional[float] = None,
+ use_custom_op: bool = True):
+ r"""Fused bias and activation function.
+
+ Adds `bias` to activation tensor `input`, and evaluates activation
+ function `act`, and scales the result by `gain`. Each of the steps is
+ optional.
+
+ In most cases, the fused op is considerably more efficient than performing
+ the same calculation using standard PyTorch ops. It supports first and
+ second order gradients, but not third order gradients.
+
+ Args:
+ input (torch.Tensor): Input activation tensor. Can be of any shape.
+ bias (torch.Tensor): Bias vector, or `None` to disable.
+ Must be a 1D tensor of the same type as `input`. The shape must
+ be known, and it must match the dimension of `input` corresponding
+ to `dim`. Defaults to None.
+        dim (int): The dimension in `input` corresponding to the elements of
+            `bias`. The value of `dim` is ignored if `bias` is not specified.
+            Defaults to 1.
+ act (str): Name of the activation function to evaluate, or `"linear"`
+ to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
+ "swish", etc. See `activation_funcs` for a full list. `None` is not
+ allowed. Defaults to `linear`.
+ alpha (float or int): Shape parameter for the activation
+ function, or `None` to use the default. Defaults to None.
+ gain (float): Scaling factor for the output tensor, or `None`
+ to use default. See `activation_funcs` for the default scaling of
+ each activation function. If unsure, consider specifying 1.
+ Defaults to None.
+ clamp (float): Clamp the output values to `[-clamp, +clamp]`,
+ or `None` to disable the clamping (default). Defaults to None.
+ use_custom_op (bool): Whether to use customized op.
+ Defaults to True.
+
+ Returns:
+ torch.Tensor: Tensor of the same shape and datatype as `input`.
+ """
+ assert isinstance(input, torch.Tensor)
+ if use_custom_op and input.is_cuda:
+ return _bias_act_cuda(
+ dim=dim, act=act, alpha=alpha, gain=gain,
+ clamp=clamp).apply(input, bias)
+ return _bias_act_ref(
+ input=input,
+ bias=bias,
+ dim=dim,
+ act=act,
+ alpha=alpha,
+ gain=gain,
+ clamp=clamp)
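+
+
+# Example (illustrative sketch): on CPU the call below takes the reference
+# path, so the fused op can be checked against plain PyTorch ops. For
+# 'lrelu' the default alpha is 0.2 and the default gain is sqrt(2):
+#
+#     >>> x = torch.randn(2, 8, 4, 4)
+#     >>> b = torch.randn(8)
+#     >>> y = bias_act(x, b, dim=1, act='lrelu')
+#     >>> ref = torch.nn.functional.leaky_relu(
+#     ...     x + b.reshape(1, -1, 1, 1), 0.2) * np.sqrt(2)
+#     >>> torch.allclose(y, ref)
+#     True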
+
+
+def _bias_act_ref(input: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ dim: int = 1,
+ act: str = 'linear',
+ alpha: Optional[Union[float, int]] = None,
+ gain: Optional[float] = None,
+ clamp: Optional[float] = None):
+ """Slow reference implementation of `bias_act()` using standard PyTorch
+ ops.
+
+ Adds `bias` to activation tensor `input`, and evaluates activation
+ function `act`, and scales the result by `gain`. Each of the steps is
+ optional.
+
+ In most cases, the fused op is considerably more efficient than performing
+ the same calculation using standard PyTorch ops. It supports first and
+ second order gradients, but not third order gradients.
+
+ Args:
+ input (torch.Tensor): Input activation tensor. Can be of any shape.
+ bias (torch.Tensor): Bias vector, or `None` to disable.
+ Must be a 1D tensor of the same type as `input`. The shape must
+ be known, and it must match the dimension of `input` corresponding
+ to `dim`. Defaults to None.
+        dim (int): The dimension in `input` corresponding to the elements of
+            `bias`. The value of `dim` is ignored if `bias` is not specified.
+            Defaults to 1.
+ act (str): Name of the activation function to evaluate, or `"linear"`
+ to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
+ "swish", etc. See `activation_funcs` for a full list. `None` is not
+ allowed. Defaults to `linear`.
+ alpha (float or int): Shape parameter for the activation
+ function, or `None` to use the default. Defaults to None.
+ gain (float): Scaling factor for the output tensor, or `None`
+ to use default. See `activation_funcs` for the default scaling of
+ each activation function. If unsure, consider specifying 1.
+ Defaults to None.
+ clamp (float): Clamp the output values to
+ `[-clamp, +clamp]`, or `None` to disable the clamping (default).
+ Defaults to None.
+
+ Returns:
+ torch.Tensor: Tensor of the same shape and datatype as `input`.
+ """
+ assert isinstance(input, torch.Tensor)
+ assert clamp is None or clamp >= 0
+ spec = activation_funcs[act]
+ alpha = float(alpha if alpha is not None else spec.def_alpha)
+ gain = float(gain if gain is not None else spec.def_gain)
+ clamp = float(clamp if clamp is not None else -1)
+
+ # Add bias.
+ if bias is not None:
+ assert isinstance(bias, torch.Tensor) and bias.ndim == 1
+ assert 0 <= dim < input.ndim
+ assert bias.shape[0] == input.shape[dim]
+ input = input + bias.reshape(
+ [-1 if i == dim else 1 for i in range(input.ndim)])
+
+ # Evaluate activation function.
+ alpha = float(alpha)
+ output = spec.func(input, alpha=alpha)
+
+ # Scale by gain.
+ gain = float(gain)
+ if gain != 1:
+ output = output * gain
+
+ # Clamp.
+ if clamp >= 0:
+ # pylint: disable=invalid-unary-operand-type
+ output = output.clamp(-clamp, clamp)
+ return output
+
+
+_bias_act_cuda_cache: Dict = dict()
+
+
+def _bias_act_cuda(dim: int = 1,
+ act: str = 'linear',
+ alpha: Optional[Union[float, int]] = None,
+ gain: Optional[float] = None,
+ clamp: Optional[float] = None):
+ """"Fast CUDA implementation of `bias_act()` using custom ops.
+
+ Args:
+ dim (int): The dimension in `x` corresponding to the elements of `b`.
+ The value of `dim` is ignored if `b` is not specified.
+ Defaults to 1.
+ act (str): Name of the activation function to evaluate, or `"linear"`
+ to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
+ "swish", etc. See `activation_funcs` for a full list. `None` is not
+ allowed. Defaults to `linear`.
+ alpha (float | int): Shape parameter for the activation
+ function, or `None` to use the default. Defaults to None.
+ gain (float): Scaling factor for the output tensor, or `None`
+ to use default. See `activation_funcs` for the default scaling of
+ each activation function. If unsure, consider specifying 1.
+ Defaults to None.
+ clamp (float): Clamp the output values to `[-clamp, +clamp]`,
+ or `None` to disable the clamping (default). Defaults to None.
+
+ Returns:
+ torch.Tensor: Tensor of the same shape and datatype as `x`.
+ """
+ # Parse arguments.
+ assert clamp is None or clamp >= 0
+ spec = activation_funcs[act]
+ alpha = float(alpha if alpha is not None else spec.def_alpha)
+ gain = float(gain if gain is not None else spec.def_gain)
+ clamp = float(clamp if clamp is not None else -1)
+
+ # Lookup from cache.
+ key = (dim, act, alpha, gain, clamp)
+ if key in _bias_act_cuda_cache:
+ return _bias_act_cuda_cache[key]
+
+ # Forward op.
+ class BiasActCuda(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, b): # pylint: disable=arguments-differ
+ ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(
+ 1) == 1 else torch.contiguous_format
+ x = x.contiguous(memory_format=ctx.memory_format)
+ b = b.contiguous() if b is not None else _null_tensor.to(x.device)
+ y = x
+ if act != 'linear' or gain != 1 or clamp >= 0 or (
+ b is not _null_tensor.to(x.device)):
+ y = ext_module.bias_act(x, b, _null_tensor.to(x.device),
+ _null_tensor.to(x.device),
+ _null_tensor.to(x.device), 0, dim,
+ spec.cuda_idx, alpha, gain, clamp)
+ ctx.save_for_backward(
+ x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to(
+ x.device), b if 'x' in spec.ref or spec.has_2nd_grad else
+ _null_tensor.to(x.device),
+ y if 'y' in spec.ref else _null_tensor.to(x.device))
+ return y
+
+ @staticmethod
+ def backward(ctx, dy): # pylint: disable=arguments-differ
+ dy = dy.contiguous(memory_format=ctx.memory_format)
+ x, b, y = ctx.saved_tensors
+ dx = None
+ db = None
+
+ if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+ dx = dy
+ if act != 'linear' or gain != 1 or clamp >= 0:
+ dx = BiasActCudaGrad.apply(dy, x, b, y)
+
+ if ctx.needs_input_grad[1]:
+ db = dx.sum([i for i in range(dx.ndim) if i != dim])
+
+ return dx, db
+
+ # Backward op.
+ class BiasActCudaGrad(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ
+ ctx.memory_format = torch.channels_last if dy.ndim > 2 and (
+ dy.stride(1) == 1) else torch.contiguous_format
+ dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1,
+ dim, spec.cuda_idx, alpha, gain, clamp)
+ ctx.save_for_backward(
+ dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b,
+ y)
+ return dx
+
+ @staticmethod
+ def backward(ctx, d_dx): # pylint: disable=arguments-differ
+ d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
+ dy, x, b, y = ctx.saved_tensors
+ d_dy = None
+ d_x = None
+ d_b = None
+ d_y = None
+
+ if ctx.needs_input_grad[0]:
+ d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)
+
+ if spec.has_2nd_grad and (ctx.needs_input_grad[1]
+ or ctx.needs_input_grad[2]):
+ d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim,
+ spec.cuda_idx, alpha, gain, clamp)
+
+ if spec.has_2nd_grad and ctx.needs_input_grad[2]:
+ d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])
+
+ return d_dy, d_x, d_b, d_y
+
+ # Add to cache.
+ _bias_act_cuda_cache[key] = BiasActCuda
+ return BiasActCuda
diff --git a/external/cv/mmcv/ops/border_align.py b/external/cv/mmcv/ops/border_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..27b2dcf45b69126cabd02df4a95191af33d70333
--- /dev/null
+++ b/external/cv/mmcv/ops/border_align.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# modified from
+# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['border_align_forward', 'border_align_backward'])
+
+
+class BorderAlignFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input, boxes, pool_size):
+ return g.op(
+ 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
+
+ @staticmethod
+ def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,
+ pool_size: int) -> torch.Tensor:
+ ctx.pool_size = pool_size
+ ctx.input_shape = input.size()
+
+ assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]'
+ assert boxes.size(2) == 4, \
+ 'the last dimension of boxes must be (x1, y1, x2, y2)'
+ assert input.size(1) % 4 == 0, \
+ 'the channel for input feature must be divisible by factor 4'
+
+ # [B, C//4, H*W, 4]
+ output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
+ output = input.new_zeros(output_shape)
+ # `argmax_idx` only used for backward
+ argmax_idx = input.new_zeros(output_shape).to(torch.int)
+
+ ext_module.border_align_forward(
+ input, boxes, output, argmax_idx, pool_size=ctx.pool_size)
+
+ ctx.save_for_backward(boxes, argmax_idx)
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx,
+ grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
+ boxes, argmax_idx = ctx.saved_tensors
+ grad_input = grad_output.new_zeros(ctx.input_shape)
+ # complex head architecture may cause grad_output uncontiguous
+ grad_output = grad_output.contiguous()
+ ext_module.border_align_backward(
+ grad_output,
+ boxes,
+ argmax_idx,
+ grad_input,
+ pool_size=ctx.pool_size)
+ return grad_input, None, None
+
+
+border_align = BorderAlignFunction.apply
+
+
+class BorderAlign(nn.Module):
+ r"""Border align pooling layer.
+
+ Applies border_align over the input feature based on predicted bboxes.
+ The details were described in the paper
+    `BorderDet: Border Feature for Dense Object Detection
+    <https://arxiv.org/abs/2007.11056>`_.
+
+ For each border line (e.g. top, left, bottom or right) of each box,
+ border_align does the following:
+
+    1. uniformly samples ``pool_size`` + 1 positions on this line, including
+       the start and end points.
+    2. the corresponding features on these points are computed by bilinear
+       interpolation.
+    3. max pooling over all the ``pool_size`` + 1 positions is used to
+       compute the pooled feature.
+
+ Args:
+ pool_size (int): number of positions sampled over the boxes' borders
+ (e.g. top, bottom, left, right).
+ """
+
+ def __init__(self, pool_size: int):
+ super().__init__()
+ self.pool_size = pool_size
+
+ def forward(self, input: torch.Tensor,
+ boxes: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
+ [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom,
+ right features respectively.
+ boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
+
+ Returns:
+ torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
+ (top,left,bottom,right) for the last dimension.
+ """
+ return border_align(input, boxes, self.pool_size)
+
+ def __repr__(self):
+ s = self.__class__.__name__
+ s += f'(pool_size={self.pool_size})'
+ return s
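+
+
+# Example (illustrative sketch; assumes the compiled CUDA extension is
+# available). With a 4C-channel input, the output keeps C channels and
+# gains a trailing dimension for the four borders:
+#
+#     >>> pool = BorderAlign(pool_size=10)
+#     >>> feats = torch.rand(2, 16, 8, 8).cuda()   # 4C = 16, so C = 4
+#     >>> boxes = torch.rand(2, 64, 4).cuda() * 8  # one box per location
+#     >>> pool(feats, boxes).shape
+#     torch.Size([2, 4, 64, 4])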
diff --git a/external/cv/mmcv/ops/box_iou_quadri.py b/external/cv/mmcv/ops/box_iou_quadri.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac70c5bb9968397b07c208aa4431180561bc457a
--- /dev/null
+++ b/external/cv/mmcv/ops/box_iou_quadri.py
@@ -0,0 +1,54 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri'])
+
+
+def box_iou_quadri(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False) -> torch.Tensor:
+ """Return intersection-over-union (Jaccard index) of boxes.
+
+ Both sets of boxes are expected to be in
+ (x1, y1, ..., x4, y4) format.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8),
+ indicating (x1, y1, ..., x4, y4) for each row.
+ bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8),
+ indicating (x1, y1, ..., x4, y4) for each row.
+        mode (str): "iou" (intersection over union) or "iof" (intersection
+            over foreground).
+        aligned (bool): If ``aligned`` is ``True``, ious are computed between
+            each aligned pair of bboxes1 and bboxes2. Defaults to False.
+
+    Returns:
+        torch.Tensor: Return the ious between boxes. If ``aligned`` is
+        ``False``, the shape of ious is (N, M) else (N,).
+ """
+ assert mode in ['iou', 'iof']
+ mode_dict = {'iou': 0, 'iof': 1}
+ mode_flag = mode_dict[mode]
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+ if aligned:
+ ious = bboxes1.new_zeros(rows)
+ else:
+ ious = bboxes1.new_zeros(rows * cols)
+ bboxes1 = bboxes1.contiguous()
+ bboxes2 = bboxes2.contiguous()
+ ext_module.box_iou_quadri(
+ bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
+ if not aligned:
+ ious = ious.view(rows, cols)
+ return ious
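+
+
+# Example (illustrative sketch; assumes the compiled `_ext` extension is
+# available). A unit square compared with itself has IoU 1, and with a
+# far-away copy IoU 0:
+#
+#     >>> sq = torch.tensor([[0., 0., 1., 0., 1., 1., 0., 1.]])
+#     >>> box_iou_quadri(sq, torch.cat([sq, sq + 10]))
+#     tensor([[1., 0.]])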
diff --git a/external/cv/mmcv/ops/box_iou_rotated.py b/external/cv/mmcv/ops/box_iou_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..664551350370b30b29169403cb8ef151b318078b
--- /dev/null
+++ b/external/cv/mmcv/ops/box_iou_rotated.py
@@ -0,0 +1,161 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
+
+
+def box_iou_rotated(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False,
+ clockwise: bool = True) -> torch.Tensor:
+ """Return intersection-over-union (Jaccard index) of boxes.
+
+ Both sets of boxes are expected to be in
+ (x_center, y_center, width, height, angle) format.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ .. note::
+ The operator assumes:
+
+ 1) The positive direction along x axis is left -> right.
+
+ 2) The positive direction along y axis is top -> down.
+
+        3) The w border is parallel to the x axis when angle = 0.
+
+ However, there are 2 opposite definitions of the positive angular
+ direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
+ both definitions and uses CW by default.
+
+ Please set ``clockwise=False`` if you are using the CCW definition.
+
+ The coordinate system when ``clockwise`` is ``True`` (default)
+
+ .. code-block:: none
+
+ 0-------------------> x (0 rad)
+ | A-------------B
+ | | |
+ | | box h
+ | | angle=0 |
+ | D------w------C
+ v
+ y (pi/2 rad)
+
+ In such coordination system the rotation matrix is
+
+ .. math::
+ \\begin{pmatrix}
+ \\cos\\alpha & -\\sin\\alpha \\\\
+ \\sin\\alpha & \\cos\\alpha
+ \\end{pmatrix}
+
+ The coordinates of the corner point A can be calculated as:
+
+ .. math::
+ P_A=
+ \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
+ =
+ \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
+ \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
+ \\sin\\alpha & \\cos\\alpha\\end{pmatrix}
+ \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
+ =
+ \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
+ \\\\
+ y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
+
+
+ The coordinate system when ``clockwise`` is ``False``
+
+ .. code-block:: none
+
+ 0-------------------> x (0 rad)
+ | A-------------B
+ | | |
+ | | box h
+ | | angle=0 |
+ | D------w------C
+ v
+ y (-pi/2 rad)
+
+ In such coordination system the rotation matrix is
+
+ .. math::
+ \\begin{pmatrix}
+ \\cos\\alpha & \\sin\\alpha \\\\
+ -\\sin\\alpha & \\cos\\alpha
+ \\end{pmatrix}
+
+ The coordinates of the corner point A can be calculated as:
+
+ .. math::
+ P_A=
+ \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
+ =
+ \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
+ \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
+ -\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
+ \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
+ =
+ \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
+ \\\\
+ y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
+
+    Args:
+        bboxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
+            indicating (x, y, w, h, theta) for each row. Note that theta is
+            in radian.
+        bboxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
+            indicating (x, y, w, h, theta) for each row. Note that theta is
+            in radian.
+        mode (str): "iou" (intersection over union) or "iof" (intersection
+            over foreground).
+        aligned (bool): If ``aligned`` is ``True``, ious are computed between
+            each aligned pair of bboxes1 and bboxes2. Defaults to False.
+        clockwise (bool): flag indicating whether the positive angular
+            orientation is clockwise. Defaults to True.
+            `New in version 1.4.3.`
+
+ Returns:
+        torch.Tensor: Return the ious between boxes. If ``aligned`` is
+        ``False``, the shape of ious is (N, M) else (N,).
+ """
+ assert mode in ['iou', 'iof']
+ mode_dict = {'iou': 0, 'iof': 1}
+ mode_flag = mode_dict[mode]
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+ if aligned:
+ ious = bboxes1.new_zeros(rows)
+ else:
+ if bboxes1.device.type == 'mlu':
+ ious = bboxes1.new_zeros([rows, cols])
+ else:
+ ious = bboxes1.new_zeros(rows * cols)
+ if not clockwise:
+ flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
+ flip_mat[-1] = -1
+ bboxes1 = bboxes1 * flip_mat
+ bboxes2 = bboxes2 * flip_mat
+ if bboxes1.device.type == 'npu':
+ scale_mat = bboxes1.new_ones(bboxes1.shape[-1])
+ scale_mat[-1] = 1.0 / 0.01745329252
+ bboxes1 = bboxes1 * scale_mat
+ bboxes2 = bboxes2 * scale_mat
+ bboxes1 = bboxes1.contiguous()
+ bboxes2 = bboxes2.contiguous()
+ ext_module.box_iou_rotated(
+ bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
+ if not aligned:
+ ious = ious.view(rows, cols)
+ return ious
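+
+
+# Example (illustrative sketch; assumes the compiled `_ext` extension is
+# available). Two axis-aligned unit boxes whose centers are 0.5 apart
+# share an area of 0.5, so IoU = 0.5 / 1.5 = 1/3:
+#
+#     >>> b1 = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]])
+#     >>> b2 = torch.tensor([[1.0, 0.5, 1.0, 1.0, 0.0]])
+#     >>> box_iou_rotated(b1, b2)
+#     tensor([[0.3333]])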
diff --git a/external/cv/mmcv/ops/carafe.py b/external/cv/mmcv/ops/carafe.py
new file mode 100644
index 0000000000000000000000000000000000000000..5562c8dc2c272f7f1b42f6e99dbd4aa9e1a1516e
--- /dev/null
+++ b/external/cv/mmcv/ops/carafe.py
@@ -0,0 +1,305 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import normal_init, xavier_init
+from mmengine.registry import MODELS
+from torch import Tensor
+from torch.autograd import Function
+from torch.nn.modules.module import Module
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', [
+ 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
+ 'carafe_backward'
+])
+
+
+class CARAFENaiveFunction(Function):
+
+ @staticmethod
+ def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ return g.op(
+ 'mmcv::MMCVCARAFENaive',
+ features,
+ masks,
+ kernel_size_i=kernel_size,
+ group_size_i=group_size,
+ scale_factor_f=scale_factor)
+
+ @staticmethod
+ def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ assert scale_factor >= 1
+ assert masks.size(1) == kernel_size * kernel_size * group_size
+ assert masks.size(-1) == features.size(-1) * scale_factor
+ assert masks.size(-2) == features.size(-2) * scale_factor
+ assert features.size(1) % group_size == 0
+ assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
+ ctx.kernel_size = kernel_size
+ ctx.group_size = group_size
+ ctx.scale_factor = scale_factor
+ ctx.feature_size = features.size()
+ ctx.mask_size = masks.size()
+
+ n, c, h, w = features.size()
+ output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
+ ext_module.carafe_naive_forward(
+ features,
+ masks,
+ output,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ if features.requires_grad or masks.requires_grad or \
+ torch.__version__ == 'parrots':
+ ctx.save_for_backward(features, masks)
+ return output
+
+ @staticmethod
+ def backward(
+ ctx,
+ grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
+ assert grad_output.is_cuda
+
+ features, masks = ctx.saved_tensors
+ kernel_size = ctx.kernel_size
+ group_size = ctx.group_size
+ scale_factor = ctx.scale_factor
+
+ grad_input = torch.zeros_like(features)
+ grad_masks = torch.zeros_like(masks)
+ ext_module.carafe_naive_backward(
+ grad_output.contiguous(),
+ features,
+ masks,
+ grad_input,
+ grad_masks,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ return grad_input, grad_masks, None, None, None
+
+
+carafe_naive = CARAFENaiveFunction.apply
+
+
+class CARAFENaive(Module):
+
+ def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
+ super().__init__()
+
+ assert isinstance(kernel_size, int) and isinstance(
+ group_size, int) and isinstance(scale_factor, int)
+ self.kernel_size = kernel_size
+ self.group_size = group_size
+ self.scale_factor = scale_factor
+
+ def forward(self, features: Tensor, masks: Tensor) -> Tensor:
+ return carafe_naive(features, masks, self.kernel_size, self.group_size,
+ self.scale_factor)
+
+
+class CARAFEFunction(Function):
+
+ @staticmethod
+ def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ return g.op(
+ 'mmcv::MMCVCARAFE',
+ features,
+ masks,
+ kernel_size_i=kernel_size,
+ group_size_i=group_size,
+ scale_factor_f=scale_factor)
+
+ @staticmethod
+ def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ assert scale_factor >= 1
+ assert masks.size(1) == kernel_size * kernel_size * group_size
+ assert masks.size(-1) == features.size(-1) * scale_factor
+ assert masks.size(-2) == features.size(-2) * scale_factor
+ assert features.size(1) % group_size == 0
+ assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
+ ctx.kernel_size = kernel_size
+ ctx.group_size = group_size
+ ctx.scale_factor = scale_factor
+ ctx.feature_size = features.size()
+ ctx.mask_size = masks.size()
+
+ n, c, h, w = features.size()
+ output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
+ routput = features.new_zeros(output.size(), requires_grad=False)
+ rfeatures = features.new_zeros(features.size(), requires_grad=False)
+ rmasks = masks.new_zeros(masks.size(), requires_grad=False)
+ ext_module.carafe_forward(
+ features,
+ masks,
+ rfeatures,
+ routput,
+ rmasks,
+ output,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ if features.requires_grad or masks.requires_grad or \
+ torch.__version__ == 'parrots':
+ ctx.save_for_backward(features, masks, rfeatures)
+ return output
+
+ @staticmethod
+ def backward(
+ ctx,
+ grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
+ features, masks, rfeatures = ctx.saved_tensors
+ kernel_size = ctx.kernel_size
+ group_size = ctx.group_size
+ scale_factor = ctx.scale_factor
+
+ rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
+ rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
+ rgrad_input = torch.zeros_like(features, requires_grad=False)
+ rgrad_masks = torch.zeros_like(masks, requires_grad=False)
+ grad_input = torch.zeros_like(features, requires_grad=False)
+ grad_masks = torch.zeros_like(masks, requires_grad=False)
+ ext_module.carafe_backward(
+ grad_output.contiguous(),
+ rfeatures,
+ masks,
+ rgrad_output,
+ rgrad_input_hs,
+ rgrad_input,
+ rgrad_masks,
+ grad_input,
+ grad_masks,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+ return grad_input, grad_masks, None, None, None
+
+
+carafe = CARAFEFunction.apply
+
+
+class CARAFE(Module):
+ """ CARAFE: Content-Aware ReAssembly of FEatures
+
+ Please refer to `CARAFE: Content-Aware ReAssembly of FEatures
+ `_ for more details.
+
+ Args:
+ kernel_size (int): reassemble kernel size
+ group_size (int): reassemble group size
+ scale_factor (int): upsample ratio
+
+ Returns:
+ upsampled feature map
+ """
+
+ def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
+ super().__init__()
+
+ assert isinstance(kernel_size, int) and isinstance(
+ group_size, int) and isinstance(scale_factor, int)
+ self.kernel_size = kernel_size
+ self.group_size = group_size
+ self.scale_factor = scale_factor
+
+ def forward(self, features: Tensor, masks: Tensor) -> Tensor:
+ return carafe(features, masks, self.kernel_size, self.group_size,
+ self.scale_factor)
+
+
+@MODELS.register_module(name='carafe')
+class CARAFEPack(nn.Module):
+ """A unified package of CARAFE upsampler that contains: 1) channel
+ compressor 2) content encoder 3) CARAFE op.
+
+ Official implementation of ICCV 2019 paper
+    `CARAFE: Content-Aware ReAssembly of FEatures
+    <https://arxiv.org/abs/1905.02188>`_.
+
+ Args:
+ channels (int): input feature channels
+ scale_factor (int): upsample ratio
+ up_kernel (int): kernel size of CARAFE op
+ up_group (int): group size of CARAFE op
+ encoder_kernel (int): kernel size of content encoder
+ encoder_dilation (int): dilation of content encoder
+ compressed_channels (int): output channels of channels compressor
+
+ Returns:
+ upsampled feature map
+ """
+
+ def __init__(self,
+ channels: int,
+ scale_factor: int,
+ up_kernel: int = 5,
+ up_group: int = 1,
+ encoder_kernel: int = 3,
+ encoder_dilation: int = 1,
+ compressed_channels: int = 64):
+ super().__init__()
+ self.channels = channels
+ self.scale_factor = scale_factor
+ self.up_kernel = up_kernel
+ self.up_group = up_group
+ self.encoder_kernel = encoder_kernel
+ self.encoder_dilation = encoder_dilation
+ self.compressed_channels = compressed_channels
+ self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
+ 1)
+ self.content_encoder = nn.Conv2d(
+ self.compressed_channels,
+ self.up_kernel * self.up_kernel * self.up_group *
+ self.scale_factor * self.scale_factor,
+ self.encoder_kernel,
+ padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
+ dilation=self.encoder_dilation,
+ groups=1)
+ self.init_weights()
+
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ xavier_init(m, distribution='uniform')
+ normal_init(self.content_encoder, std=0.001)
+
+ def kernel_normalizer(self, mask: Tensor) -> Tensor:
+ mask = F.pixel_shuffle(mask, self.scale_factor)
+ n, mask_c, h, w = mask.size()
+        # use float division explicitly,
+        # to avoid inconsistency while exporting to onnx
+ mask_channel = int(mask_c / float(self.up_kernel**2))
+ mask = mask.view(n, mask_channel, -1, h, w)
+
+ mask = F.softmax(mask, dim=2, dtype=mask.dtype)
+ mask = mask.view(n, mask_c, h, w).contiguous()
+
+ return mask
+
+ def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:
+ x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
+ return x
+
+ def forward(self, x: Tensor) -> Tensor:
+ compressed_x = self.channel_compressor(x)
+ mask = self.content_encoder(compressed_x)
+ mask = self.kernel_normalizer(mask)
+
+ x = self.feature_reassemble(x, mask)
+ return x
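+
+
+# Example (illustrative sketch; the CARAFE op is a CUDA extension, so the
+# module and tensors are moved to the GPU here):
+#
+#     >>> upsampler = CARAFEPack(channels=64, scale_factor=2).cuda()
+#     >>> x = torch.randn(1, 64, 16, 16).cuda()
+#     >>> upsampler(x).shape
+#     torch.Size([1, 64, 32, 32])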
diff --git a/external/cv/mmcv/ops/cc_attention.py b/external/cv/mmcv/ops/cc_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..06637969f86b9efb1697d78ea7260a76063e0d53
--- /dev/null
+++ b/external/cv/mmcv/ops/cc_attention.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.registry import MODELS
+
+from mmcv.cnn import Scale
+
+
+def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
+ """Returns a diagonal matrix of size [n, n].
+
+    The diagonal entries are all "-inf". This is to avoid counting the
+    overlapping element in criss-cross attention twice.
+ """
+ return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
+
+
+@MODELS.register_module()
+class CrissCrossAttention(nn.Module):
+ """Criss-Cross Attention Module.
+
+ .. note::
+        Before v1.3.13, we used a CUDA op. Since v1.3.13, we have switched
+        to a pure PyTorch implementation with equivalent results. For more
+        details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.
+
+ Speed comparison for one forward pass
+
+ - Input size: [2,512,97,97]
+ - Device: 1 NVIDIA GeForce RTX 2080 Ti
+
+ +-----------------------+---------------+------------+---------------+
+ | |PyTorch version|CUDA version|Relative speed |
+ +=======================+===============+============+===============+
+ |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x |
+ +-----------------------+---------------+------------+---------------+
+    |without torch.no_grad()|0.00562803 s   |0.0301349 s |5.4x           |
+ +-----------------------+---------------+------------+---------------+
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ """
+
+ def __init__(self, in_channels: int) -> None:
+ super().__init__()
+ self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
+ self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
+ self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
+ self.gamma = Scale(0.)
+ self.in_channels = in_channels
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """forward function of Criss-Cross Attention.
+
+ Args:
+ x (torch.Tensor): Input feature with the shape of
+ (batch_size, in_channels, height, width).
+
+ Returns:
+ torch.Tensor: Output of the layer, with the shape of
+ (batch_size, in_channels, height, width)
+ """
+ B, C, H, W = x.size()
+ query = self.query_conv(x)
+ key = self.key_conv(x)
+ value = self.value_conv(x)
+ energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG(
+ H, query.device)
+ energy_H = energy_H.transpose(1, 2)
+ energy_W = torch.einsum('bchw,bchj->bhwj', query, key)
+ attn = F.softmax(
+ torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)]
+ out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H])
+ out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:])
+
+ out = self.gamma(out) + x
+ out = out.contiguous()
+
+ return out
+
+ def __repr__(self) -> str:
+ s = self.__class__.__name__
+ s += f'(in_channels={self.in_channels})'
+ return s
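+
+
+# Example (illustrative sketch, pure PyTorch). The module preserves the
+# input shape; in CCNet it is typically applied twice (the "recurrence")
+# so that every position can attend to the whole image:
+#
+#     >>> cca = CrissCrossAttention(in_channels=64)
+#     >>> x = torch.randn(2, 64, 7, 7)
+#     >>> cca(cca(x)).shape
+#     torch.Size([2, 64, 7, 7])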
diff --git a/external/cv/mmcv/ops/chamfer_distance.py b/external/cv/mmcv/ops/chamfer_distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d5f179d0a6238fab03f692e97d4325f67aeefc4
--- /dev/null
+++ b/external/cv/mmcv/ops/chamfer_distance.py
@@ -0,0 +1,98 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Sequence, Tuple
+
+import torch
+from torch import Tensor
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
+
+
+class ChamferDistanceFunction(Function):
+ """This is an implementation of the 2D Chamfer Distance.
+
+    It has been used in the paper `Oriented RepPoints for Aerial Object
+    Detection (CVPR 2022) <https://arxiv.org/abs/2105.11111>`_.
+ """
+
+ @staticmethod
+ def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
+ """
+ Args:
+            xyz1 (Tensor): Point set with shape (B, N, 2).
+            xyz2 (Tensor): Point set with shape (B, M, 2).
+
+        Returns:
+            Sequence[Tensor]:
+
+            - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
+                shape (B, N).
+            - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
+                shape (B, M).
+            - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
+                with shape (B, N), which is used to compute the gradient.
+            - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
+                with shape (B, M), which is used to compute the gradient.
+ """
+ batch_size, n, _ = xyz1.size()
+ _, m, _ = xyz2.size()
+ device = xyz1.device
+ xyz1 = xyz1.contiguous()
+ xyz2 = xyz2.contiguous()
+
+ dist1 = torch.zeros(batch_size, n).to(device)
+ dist2 = torch.zeros(batch_size, m).to(device)
+ idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
+ idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)
+
+ ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
+ idx2)
+ ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
+ return dist1, dist2, idx1, idx2
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx,
+ grad_dist1: Tensor,
+ grad_dist2: Tensor,
+ grad_idx1=None,
+ grad_idx2=None) -> Tuple[Tensor, Tensor]:
+ """
+
+ Args:
+            grad_dist1 (Tensor): Gradient of chamfer distance
+                (xyz1 to xyz2) with shape (B, N).
+            grad_dist2 (Tensor): Gradient of chamfer distance
+                (xyz2 to xyz1) with shape (B, M).
+
+ Returns:
+ Tuple[Tensor, Tensor]:
+
+            - grad_xyz1 (Tensor): Gradient of the point set with shape \
+                (B, N, 2).
+            - grad_xyz2 (Tensor): Gradient of the point set with shape \
+                (B, M, 2).
+ """
+ xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
+ device = grad_dist1.device
+ grad_dist1 = grad_dist1.contiguous()
+ grad_dist2 = grad_dist2.contiguous()
+ grad_xyz1 = torch.zeros(xyz1.size()).to(device)
+ grad_xyz2 = torch.zeros(xyz2.size()).to(device)
+
+ ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,
+ grad_dist1, grad_dist2, grad_xyz1,
+ grad_xyz2)
+ return grad_xyz1, grad_xyz2
+
+
+chamfer_distance = ChamferDistanceFunction.apply
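+
+
+# Illustrative cross-check (an assumption, not part of the op): if the
+# extension returns squared L2 distances, `dist1` should match a
+# brute-force version built on `torch.cdist`:
+#
+#     >>> xyz1 = torch.rand(1, 5, 2).cuda()
+#     >>> xyz2 = torch.rand(1, 7, 2).cuda()
+#     >>> dist1, dist2, idx1, idx2 = chamfer_distance(xyz1, xyz2)
+#     >>> d = torch.cdist(xyz1, xyz2) ** 2  # (1, 5, 7) squared distances
+#     >>> torch.allclose(dist1, d.min(dim=2).values)
+#     True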
diff --git a/external/cv/mmcv/ops/contour_expand.py b/external/cv/mmcv/ops/contour_expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f2ff0e2ceebab2c5479683572ffc0be4438d28c
--- /dev/null
+++ b/external/cv/mmcv/ops/contour_expand.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Union
+
+import numpy as np
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
+
+
+def contour_expand(kernel_mask: Union[np.ndarray, torch.Tensor],
+                   internal_kernel_label: Union[np.ndarray, torch.Tensor],
+                   min_kernel_area: int, kernel_num: int) -> list:
+ """Expand kernel contours so that foreground pixels are assigned into
+ instances.
+
+ Args:
+        kernel_mask (np.ndarray or torch.Tensor): The instance kernel mask
+            with size hxw.
+        internal_kernel_label (np.ndarray or torch.Tensor): The instance
+            internal kernel label with size hxw.
+ min_kernel_area (int): The minimum kernel area.
+ kernel_num (int): The instance kernel number.
+
+ Returns:
+ list: The instance index map with size hxw.
+ """
+ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
+ assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
+ assert isinstance(min_kernel_area, int)
+ assert isinstance(kernel_num, int)
+
+ if isinstance(kernel_mask, np.ndarray):
+ kernel_mask = torch.from_numpy(kernel_mask)
+ if isinstance(internal_kernel_label, np.ndarray):
+ internal_kernel_label = torch.from_numpy(internal_kernel_label)
+
+ if torch.__version__ == 'parrots':
+ if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
+ label = []
+ else:
+ label = ext_module.contour_expand(
+ kernel_mask,
+ internal_kernel_label,
+ min_kernel_area=min_kernel_area,
+ kernel_num=kernel_num)
+ label = label.tolist() # type: ignore
+ else:
+ label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
+ min_kernel_area, kernel_num)
+ return label
diff --git a/external/cv/mmcv/ops/conv2d_gradfix.py b/external/cv/mmcv/ops/conv2d_gradfix.py
new file mode 100644
index 0000000000000000000000000000000000000000..74536014ca62d05832de715c2e0c1a82c659b6ab
--- /dev/null
+++ b/external/cv/mmcv/ops/conv2d_gradfix.py
@@ -0,0 +1,344 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa
+"""Custom replacement for `torch.nn.functional.conv2d` that supports
+arbitrarily high order gradients with zero performance penalty."""
+
+import contextlib
+import warnings
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch
+
+enabled = True
+weight_gradients_disabled = False
+
+
+@contextlib.contextmanager
+def no_weight_gradients(disable=True):
+ global weight_gradients_disabled
+ old = weight_gradients_disabled
+ if disable:
+ weight_gradients_disabled = True
+ yield
+ weight_gradients_disabled = old
+
+
+def conv2d(input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ stride: Union[int, Tuple[int, ...]] = 1,
+ padding: Union[int, Tuple[int, ...]] = 0,
+ dilation: Union[int, Tuple[int, ...]] = 1,
+ groups: int = 1):
+ flag = True
+ if digit_version(torch.__version__) >= digit_version('1.10.0'):
+        warnings.warn('Since '
+                      'aten::cudnn_convolution_backward_weight is '
+                      f'not supported in torch=={torch.__version__}, '
+                      'falling back to `torch.nn.functional.conv2d`')
+ flag = False
+ if _should_use_custom_op(input) and flag:
+ return _conv2d_gradfix(
+ transpose=False,
+ weight_shape=weight.shape,
+ stride=stride,
+ padding=padding,
+ output_padding=0,
+ dilation=dilation,
+ groups=groups).apply(input, weight, bias)
+ return torch.nn.functional.conv2d(
+ input=input,
+ weight=weight,
+ bias=bias,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups)
+
+
+def conv_transpose2d(input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ stride: Union[int, Tuple[int, ...]] = 1,
+ padding: Union[int, Tuple[int, ...]] = 0,
+ output_padding: Union[int, Tuple[int, ...]] = 0,
+ groups: int = 1,
+ dilation: Union[int, Tuple[int, ...]] = 1):
+ if _should_use_custom_op(input):
+ return _conv2d_gradfix(
+ transpose=True,
+ weight_shape=weight.shape,
+ stride=stride,
+ padding=padding,
+ output_padding=output_padding,
+ groups=groups,
+ dilation=dilation).apply(input, weight, bias)
+ return torch.nn.functional.conv_transpose2d(
+ input=input,
+ weight=weight,
+ bias=bias,
+ stride=stride,
+ padding=padding,
+ output_padding=output_padding,
+ groups=groups,
+ dilation=dilation)
+
+
+def _should_use_custom_op(input):
+ assert isinstance(input, torch.Tensor)
+ if (not enabled) or (not torch.backends.cudnn.enabled):
+ return False
+ if input.device.type != 'cuda':
+ return False
+ return True
+
+
+def _to_tuple(x, ndim):
+ xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim
+ assert len(xs) == ndim
+ assert all(isinstance(x, int) for x in xs)
+ return xs
+
+
+_conv2d_gradfix_cache: Dict = dict()
+_null_tensor = torch.empty([0])
+
+
+def _conv2d_gradfix(
+ transpose: bool,
+ weight_shape: Tuple[int, ...],
+ stride: Union[int, Tuple[int, ...]],
+ padding: Union[int, Tuple[int, ...]],
+ output_padding: Union[int, Tuple[int, ...]],
+ dilation: Union[int, Tuple[int, ...]],
+ groups: int,
+):
+ # Parse arguments.
+ ndim = 2
+ weight_shape = tuple(weight_shape)
+ stride = _to_tuple(stride, ndim)
+ padding = _to_tuple(padding, ndim)
+ output_padding = _to_tuple(output_padding, ndim)
+ dilation = _to_tuple(dilation, ndim)
+
+ # Lookup from cache.
+ key = (transpose, weight_shape, stride, padding, output_padding, dilation,
+ groups)
+ if key in _conv2d_gradfix_cache:
+ return _conv2d_gradfix_cache[key]
+
+ # Validate arguments.
+
+ assert groups >= 1
+ assert len(weight_shape) == ndim + 2
+ assert all(stride[i] >= 1 for i in range(ndim)) # type: ignore
+ assert all(padding[i] >= 0 for i in range(ndim)) # type: ignore
+ assert all(dilation[i] >= 0 for i in range(ndim)) # type: ignore
+ if not transpose:
+ assert all(output_padding[i] == 0 for i in range(ndim)) # type: ignore
+ else: # transpose
+ for i in range(ndim):
+ assert 0 <= output_padding[i] < max( # type: ignore
+ stride[i], # type: ignore
+ dilation[i]) # type: ignore
+
+ # Helpers.
+ common_kwargs = dict(
+ stride=stride, padding=padding, dilation=dilation, groups=groups)
+
+ def calc_output_padding(input_shape, output_shape):
+ if transpose:
+ return [0, 0]
+ return [
+ input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] -
+ (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1)
+ for i in range(ndim)
+ ]
+
+ # Forward & backward.
+ class Conv2d(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, input, weight, bias):
+ assert weight.shape == weight_shape
+ ctx.save_for_backward(
+ input if weight.requires_grad else _null_tensor,
+ weight if input.requires_grad else _null_tensor,
+ )
+ ctx.input_shape = input.shape
+
+ # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
+ if weight_shape[2:] == stride == dilation == (
+ 1, 1) and padding == (
+ 0, 0) and torch.cuda.get_device_capability(
+ input.device) < (8, 0):
+ a = weight.reshape(groups, weight_shape[0] // groups,
+ weight_shape[1])
+ b = input.reshape(input.shape[0], groups,
+ input.shape[1] // groups, -1)
+ c = (a.transpose(1, 2) if transpose else a) @ b.permute(
+ 1, 2, 0, 3).flatten(2)
+ c = c.reshape(-1, input.shape[0],
+ *input.shape[2:]).transpose(0, 1)
+ c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(
+ 2).unsqueeze(3)
+ return c.contiguous(
+ memory_format=(torch.channels_last if input.stride(1) ==
+ 1 else torch.contiguous_format))
+
+ # General case => cuDNN.
+ if transpose:
+ return torch.nn.functional.conv_transpose2d(
+ input=input,
+ weight=weight,
+ bias=bias,
+ output_padding=output_padding,
+ **common_kwargs)
+ return torch.nn.functional.conv2d(
+ input=input, weight=weight, bias=bias, **common_kwargs)
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ input, weight = ctx.saved_tensors
+ input_shape = ctx.input_shape
+ grad_input = None
+ grad_weight = None
+ grad_bias = None
+
+ if ctx.needs_input_grad[0]:
+ p = calc_output_padding(
+ input_shape=input_shape, output_shape=grad_output.shape)
+ op = _conv2d_gradfix(
+ transpose=(not transpose),
+ weight_shape=weight_shape,
+ output_padding=p,
+ **common_kwargs)
+ grad_input = op.apply(grad_output, weight, None)
+ assert grad_input.shape == input_shape
+
+ if ctx.needs_input_grad[1] and not weight_gradients_disabled:
+ grad_weight = Conv2dGradWeight.apply(grad_output, input)
+ assert grad_weight.shape == weight_shape
+
+ if ctx.needs_input_grad[2]:
+ grad_bias = grad_output.sum([0, 2, 3])
+
+ return grad_input, grad_weight, grad_bias
+
+ # Gradient with respect to the weights.
+ class Conv2dGradWeight(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, grad_output, input):
+ ctx.save_for_backward(
+ grad_output if input.requires_grad else _null_tensor,
+ input if grad_output.requires_grad else _null_tensor,
+ )
+ ctx.grad_output_shape = grad_output.shape
+ ctx.input_shape = input.shape
+
+ # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
+ if weight_shape[2:] == stride == dilation == (
+ 1, 1) and padding == (0, 0):
+ a = grad_output.reshape(grad_output.shape[0], groups,
+ grad_output.shape[1] // groups,
+ -1).permute(1, 2, 0, 3).flatten(2)
+ b = input.reshape(input.shape[0], groups,
+ input.shape[1] // groups,
+ -1).permute(1, 2, 0, 3).flatten(2)
+ c = (b @ a.transpose(1, 2) if transpose else
+ a @ b.transpose(1, 2)).reshape(weight_shape)
+ return c.contiguous(
+ memory_format=(torch.channels_last if input.stride(1) ==
+ 1 else torch.contiguous_format))
+
+ # PyTorch consolidated convolution backward API in PR:
+ # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501
+ # Enhance the code referring to the discussion:
+ # https://github.com/pytorch/pytorch/issues/74437
+ if digit_version(torch.__version__) >= digit_version('1.11.0'):
+ empty_weight = torch.tensor(
+ 0.0, dtype=input.dtype,
+ device=input.device).expand(weight_shape)
+ output_padding = calc_output_padding(input.shape,
+ grad_output.shape)
+ return torch.ops.aten.convolution_backward(
+ grad_output,
+ input,
+ empty_weight,
+ None,
+ stride=stride,
+ dilation=dilation,
+ transposed=transpose,
+ padding=padding,
+ groups=groups,
+ output_padding=output_padding,
+ output_mask=[0, 1, 0])[1]
+ else:
+ if is_rocm_pytorch():
+ name = 'aten::miopen_convolution_transpose_backward_weight'
+ if not transpose:
+ name = 'aten::miopen_convolution_backward_weight'
+ flags = [
+ torch.backends.cudnn.benchmark,
+ torch.backends.cudnn.deterministic
+ ]
+ else:
+ # General case => cuDNN.
+ name = ('aten::cudnn_convolution_transpose_backward_weight'
+ if transpose else
+ 'aten::cudnn_convolution_backward_weight')
+ flags = [
+ torch.backends.cudnn.benchmark,
+ torch.backends.cudnn.deterministic,
+ torch.backends.cudnn.allow_tf32
+ ]
+ return torch._C._jit_get_operation(name)(weight_shape,
+ grad_output, input,
+ padding, stride,
+ dilation, groups,
+ *flags)
+
+ @staticmethod
+ def backward(ctx, grad2_grad_weight):
+ grad_output, input = ctx.saved_tensors
+ grad_output_shape = ctx.grad_output_shape
+ input_shape = ctx.input_shape
+ grad2_grad_output = None
+ grad2_input = None
+
+ if ctx.needs_input_grad[0]:
+ grad2_grad_output = Conv2d.apply(input, grad2_grad_weight,
+ None)
+ assert grad2_grad_output.shape == grad_output_shape
+
+ if ctx.needs_input_grad[1]:
+ p = calc_output_padding(
+ input_shape=input_shape, output_shape=grad_output_shape)
+ op = _conv2d_gradfix(
+ transpose=(not transpose),
+ weight_shape=weight_shape,
+ output_padding=p,
+ **common_kwargs)
+ grad2_input = op.apply(grad_output, grad2_grad_weight, None)
+ assert grad2_input.shape == input_shape
+
+ return grad2_grad_output, grad2_input
+
+ _conv2d_gradfix_cache[key] = Conv2d
+ return Conv2d
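+
+
+# Example (illustrative sketch). The wrappers are drop-in replacements for
+# `torch.nn.functional.conv2d`; on CPU they simply fall back to it, and
+# `no_weight_gradients()` only takes effect in the custom CUDA op path,
+# where StyleGAN-style regularizers use it to skip the weight gradient:
+#
+#     >>> x = torch.randn(1, 3, 8, 8, requires_grad=True)
+#     >>> w = torch.randn(4, 3, 3, 3, requires_grad=True)
+#     >>> with no_weight_gradients():
+#     ...     y = conv2d(x, w, padding=1)
+#     >>> y.sum().backward()
+#     >>> x.grad.shape
+#     torch.Size([1, 3, 8, 8])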
diff --git a/external/cv/mmcv/ops/convex_iou.py b/external/cv/mmcv/ops/convex_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..395ec62f98d204c3509c49dc69984f313ddadef7
--- /dev/null
+++ b/external/cv/mmcv/ops/convex_iou.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
+
+
+def convex_giou(pointsets: torch.Tensor,
+ polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Return generalized intersection-over-union (Jaccard index) between point
+ sets and polygons.
+
+ Args:
+ pointsets (torch.Tensor): It has shape (N, 18),
+ indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
+ polygons (torch.Tensor): It has shape (N, 8),
+ indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
+
+ Returns:
+ tuple[torch.Tensor, torch.Tensor]: The first element is the gious
+ between point sets and polygons with the shape (N,). The second
+ element is the gradient of point sets with the shape (N, 18).
+ """
+ output = pointsets.new_zeros((pointsets.size(0), 19))
+ ext_module.convex_giou(pointsets, polygons, output)
+ convex_giou = output[:, -1]
+ points_grad = output[:, 0:-1]
+ return convex_giou, points_grad
+
+
+def convex_iou(pointsets: torch.Tensor,
+ polygons: torch.Tensor) -> torch.Tensor:
+ """Return intersection-over-union (Jaccard index) between point sets and
+ polygons.
+
+ Args:
+ pointsets (torch.Tensor): It has shape (N, 18),
+ indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
+ polygons (torch.Tensor): It has shape (K, 8),
+ indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
+
+ Returns:
+ torch.Tensor: Return the ious between point sets and polygons with the
+ shape (N, K).
+ """
+ N, K = pointsets.size(0), polygons.size(0)
+ ious = pointsets.new_zeros((N, K))
+ ext_module.convex_iou(pointsets, polygons, ious)
+ return ious
diff --git a/external/cv/mmcv/ops/corner_pool.py b/external/cv/mmcv/ops/corner_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..669a399393f906429bb83ac9de92f08b93ada5a7
--- /dev/null
+++ b/external/cv/mmcv/ops/corner_pool.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from mmengine.utils import digit_version
+from torch import Tensor, nn
+
+_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
+
+
+def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
+ size = x.size(dim)
+ output = x.clone()
+
+ ind = 1
+ while ind < size:
+ if flip:
+ cur_start = 0
+ cur_len = size - ind
+ next_start = ind
+ next_len = size - ind
+ else:
+ cur_start = ind
+ cur_len = size - ind
+ next_start = 0
+ next_len = size - ind
+
+ # max_temp should be cloned for backward computation
+ max_temp = output.narrow(dim, cur_start, cur_len).clone()
+ cur_temp = output.narrow(dim, cur_start, cur_len)
+ next_temp = output.narrow(dim, next_start, next_len)
+
+ cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp)
+
+ ind = ind << 1
+
+ return output
+
+
+class CornerPool(nn.Module):
+ """Corner Pooling.
+
+ Corner Pooling is a new type of pooling layer that helps a
+ convolutional network better localize corners of bounding boxes.
+
+ Please refer to `CornerNet: Detecting Objects as Paired Keypoints
+    <https://arxiv.org/abs/1808.01244>`_ for more details.
+
+ Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
+
+ Args:
+ mode (str): Pooling orientation for the pooling layer
+
+ - 'bottom': Bottom Pooling
+ - 'left': Left Pooling
+ - 'right': Right Pooling
+ - 'top': Top Pooling
+
+ Returns:
+ Feature map after pooling.
+ """
+
+ cummax_dim_flip = {
+ 'bottom': (2, False),
+ 'left': (3, True),
+ 'right': (3, False),
+ 'top': (2, True),
+ }
+
+ def __init__(self, mode: str):
+ super().__init__()
+ assert mode in self.cummax_dim_flip
+ self.mode = mode
+
+ def forward(self, x: Tensor) -> Tensor:
+ if (torch.__version__ != 'parrots' and
+ digit_version(torch.__version__) >= digit_version('1.5.0')):
+ dim, flip = self.cummax_dim_flip[self.mode]
+ if flip:
+ x = x.flip(dim)
+ pool_tensor, _ = torch.cummax(x, dim=dim)
+ if flip:
+ pool_tensor = pool_tensor.flip(dim)
+ return pool_tensor
+ else:
+ dim, flip = self.cummax_dim_flip[self.mode]
+ return _corner_pool(x, dim, flip)
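On the `torch.cummax` path, each pooling mode is just a directional running maximum; a small sketch (hypothetical input) showing the equivalence for `'top'` pooling, which scans the height axis bottom-to-top:

```python
import torch
from mmcv.ops import CornerPool

x = torch.rand(2, 8, 16, 16)                   # (N, C, H, W)
out = CornerPool('top')(x)

# 'top' maps to dim=2 with flip=True: flip H, cummax, flip back
ref = torch.cummax(x.flip(2), dim=2)[0].flip(2)
assert torch.equal(out, ref)
```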
diff --git a/external/cv/mmcv/ops/correlation.py b/external/cv/mmcv/ops/correlation.py
new file mode 100644
index 0000000000000000000000000000000000000000..bebba1c5871da6ac12abb1b29305dcdc3f389c36
--- /dev/null
+++ b/external/cv/mmcv/ops/correlation.py
@@ -0,0 +1,205 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from torch import Tensor, nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['correlation_forward', 'correlation_backward'])
+
+
+class CorrelationFunction(Function):
+
+ @staticmethod
+ def forward(ctx,
+ input1: Tensor,
+ input2: Tensor,
+ kernel_size: int = 1,
+ max_displacement: int = 1,
+ stride: int = 1,
+ padding: int = 1,
+ dilation: int = 1,
+ dilation_patch: int = 1) -> Tensor:
+
+ ctx.save_for_backward(input1, input2)
+
+ kH, kW = ctx.kernel_size = _pair(kernel_size)
+ patch_size = max_displacement * 2 + 1
+ ctx.patch_size = patch_size
+ dH, dW = ctx.stride = _pair(stride)
+ padH, padW = ctx.padding = _pair(padding)
+ dilationH, dilationW = ctx.dilation = _pair(dilation)
+ dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(
+ dilation_patch)
+
+ output_size = CorrelationFunction._output_size(ctx, input1)
+
+ output = input1.new_zeros(output_size)
+
+ ext_module.correlation_forward(
+ input1,
+ input2,
+ output,
+ kH=kH,
+ kW=kW,
+ patchH=patch_size,
+ patchW=patch_size,
+ padH=padH,
+ padW=padW,
+ dilationH=dilationH,
+ dilationW=dilationW,
+ dilation_patchH=dilation_patchH,
+ dilation_patchW=dilation_patchW,
+ dH=dH,
+ dW=dW)
+
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(
+ ctx, grad_output: Tensor
+ ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
+ input1, input2 = ctx.saved_tensors
+
+ kH, kW = ctx.kernel_size
+ patch_size = ctx.patch_size
+ padH, padW = ctx.padding
+ dilationH, dilationW = ctx.dilation
+ dilation_patchH, dilation_patchW = ctx.dilation_patch
+ dH, dW = ctx.stride
+ grad_input1 = torch.zeros_like(input1)
+ grad_input2 = torch.zeros_like(input2)
+
+ ext_module.correlation_backward(
+ grad_output,
+ input1,
+ input2,
+ grad_input1,
+ grad_input2,
+ kH=kH,
+ kW=kW,
+ patchH=patch_size,
+ patchW=patch_size,
+ padH=padH,
+ padW=padW,
+ dilationH=dilationH,
+ dilationW=dilationW,
+ dilation_patchH=dilation_patchH,
+ dilation_patchW=dilation_patchW,
+ dH=dH,
+ dW=dW)
+ return grad_input1, grad_input2, None, None, None, None, None, None
+
+ @staticmethod
+ def _output_size(ctx, input1):
+ iH, iW = input1.size(2), input1.size(3)
+ batch_size = input1.size(0)
+ kH, kW = ctx.kernel_size
+ patch_size = ctx.patch_size
+ dH, dW = ctx.stride
+ padH, padW = ctx.padding
+ dilationH, dilationW = ctx.dilation
+ dilatedKH = (kH - 1) * dilationH + 1
+ dilatedKW = (kW - 1) * dilationW + 1
+
+ oH = int((iH + 2 * padH - dilatedKH) / dH + 1)
+ oW = int((iW + 2 * padW - dilatedKW) / dW + 1)
+
+ output_size = (batch_size, patch_size, patch_size, oH, oW)
+ return output_size
+
+
+class Correlation(nn.Module):
+ r"""Correlation operator
+
+ This correlation operator works for optical flow correlation computation.
+
+ There are two batched tensors with shape :math:`(N, C, H, W)`,
+ and the correlation output's shape is :math:`(N, max\_displacement \times
+    2 + 1, max\_displacement \times 2 + 1, H_{out}, W_{out})`
+
+ where
+
+ .. math::
+ H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding -
+ dilation \times (kernel\_size - 1) - 1}
+ {stride} + 1\right\rfloor
+
+ .. math::
+ W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation
+ \times (kernel\_size - 1) - 1}
+ {stride} + 1\right\rfloor
+
+ the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding
+ window convolution between input1 and shifted input2,
+
+ .. math::
+ Corr(N_i, dx, dy) =
+ \sum_{c=0}^{C-1}
+ input1(N_i, c) \star
+ \mathcal{S}(input2(N_i, c), dy, dx)
+
+    where :math:`\star` is the valid 2d sliding window convolution operator,
+    :math:`\mathcal{S}` means shifting the input features (zero-padded beyond
+    the margins), and :math:`dx, dy` are the shift distances, with
+    :math:`dx, dy \in [-max\_displacement \times dilation\_patch,
+    max\_displacement \times dilation\_patch]`.
+
+ Args:
+        kernel_size (int): The size of the sliding window, i.e. the local
+            neighborhood around each center point that is involved in the
+            correlation computation. Defaults to 1.
+        max_displacement (int): The radius for computing the correlation
+            volume; the actual search range can be dilated by
+            ``dilation_patch``. Defaults to 1.
+        stride (int): The stride of the sliding blocks in the input spatial
+            dimensions. Defaults to 1.
+        padding (int): Zero padding added to all four sides of input1.
+            Defaults to 0.
+        dilation (int): The spacing of the local neighborhood involved in
+            the correlation. Defaults to 1.
+        dilation_patch (int): The spacing between positions at which the
+            correlation is computed. Defaults to 1.
+ """
+
+ def __init__(self,
+ kernel_size: int = 1,
+ max_displacement: int = 1,
+ stride: int = 1,
+ padding: int = 0,
+ dilation: int = 1,
+ dilation_patch: int = 1) -> None:
+ super().__init__()
+ self.kernel_size = kernel_size
+ self.max_displacement = max_displacement
+ self.stride = stride
+ self.padding = padding
+ self.dilation = dilation
+ self.dilation_patch = dilation_patch
+
+ def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
+ return CorrelationFunction.apply(input1, input2, self.kernel_size,
+ self.max_displacement, self.stride,
+ self.padding, self.dilation,
+ self.dilation_patch)
+
+ def __repr__(self) -> str:
+ s = self.__class__.__name__
+ s += f'(kernel_size={self.kernel_size}, '
+ s += f'max_displacement={self.max_displacement}, '
+ s += f'stride={self.stride}, '
+ s += f'padding={self.padding}, '
+ s += f'dilation={self.dilation}, '
+ s += f'dilation_patch={self.dilation_patch})'
+ return s
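A shape-checking sketch for the module above (assuming the compiled `_ext` extension and a GPU). With `kernel_size=1`, `stride=1` and `padding=0`, the output keeps the input's spatial size, and both patch dimensions are `2 * max_displacement + 1`:

```python
import torch
from mmcv.ops import Correlation

corr = Correlation(kernel_size=1, max_displacement=4)
input1 = torch.randn(2, 16, 32, 32, device='cuda')
input2 = torch.randn(2, 16, 32, 32, device='cuda')
out = corr(input1, input2)
print(out.shape)  # torch.Size([2, 9, 9, 32, 32]) since 2 * 4 + 1 = 9
```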
diff --git a/external/cv/mmcv/ops/csrc/README.md b/external/cv/mmcv/ops/csrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8fcc6eb1a3260148aa7448470967684f8c9f0365
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/README.md
@@ -0,0 +1,162 @@
+# Code Structure of CUDA operators
+
+This folder contains all non-Python code for MMCV custom ops. Please follow the same architecture if you want to add new ops.
+
+## Directories Tree
+
+```folder
+.
+├── common
+│ ├── box_iou_rotated_utils.hpp
+│ ├── parrots_cpp_helper.hpp
+│ ├── parrots_cuda_helper.hpp
+│ ├── pytorch_cpp_helper.hpp
+│ ├── pytorch_cuda_helper.hpp
+│ ├── pytorch_device_registry.hpp
+│ ├── cuda
+│ │ ├── common_cuda_helper.hpp
+│ │ ├── parrots_cudawarpfunction.cuh
+│ │ ├── ...
+│ │ └── ops_cuda_kernel.cuh
+| ├── mps
+│ │ ├── MPSLibrary.h
+│ │ ├── ...
+│ │ └── MPSUtils.h
+| ├── mlu
+│ │ └── ...
+| └── utils
+│ │ └── ...
+├── parrots
+│ ├── ...
+│ ├── ops.cpp
+│ ├── ops_parrots.cpp
+│ └── ops_pytorch.h
+└── pytorch
+ ├── info.cpp
+ ├── pybind.cpp
+ ├── ...
+ ├── ops.cpp
+ ├── cuda
+ │ ├── ...
+ │ └── ops_cuda.cu
+ ├── cpu
+ │ ├── ...
+ │ └── ops.cpp
+ ├── mps
+ │ ├── ...
+ | └── op_mps.mm
+ └── mlu
+ ├── ...
+ └── op_mlu.cpp
+```
+
+## Components
+
+- `common`: This directory contains all tools and shared code.
+  - `cuda`: The CUDA kernels that can be shared by all backends. **HIP** kernels also live here since their syntax is similar.
+  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
+  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
+  - `utils`: The kernels and utils of spconv.
+- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
+- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementations and binding code are placed in this directory.
+  - `cuda`: This directory contains CUDA kernel launchers, which feed the memory pointers of tensors to the CUDA kernels in `common/cuda`. The launchers provide the C++ interface to the CUDA implementation of the corresponding custom ops.
+  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
+  - `mlu`: This directory contains the launchers of the MLU kernels.
+  - `mps`: MPS ops implementation and launchers.
+
+## How to add new PyTorch ops?
+
+1. (Optional) Add a shared kernel in `common` to support a special hardware platform.
+
+ ```c++
+ // src/common/cuda/new_ops_cuda_kernel.cuh
+
+ template <typename T>
+ __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
+ // forward here
+ }
+
+ ```
+
+ Add a CUDA kernel launcher in `pytorch/cuda`.
+
+ ```c++
+ // src/pytorch/cuda
+ #include <new_ops_cuda_kernel.cuh>
+
+ void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
+ // initialize
+ at::cuda::CUDAGuard device_guard(input.device());
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+ ...
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
+ new_ops_forward_cuda_kernel<scalar_t>
+     <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
+         input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
+ }));
+ AT_CUDA_CHECK(cudaGetLastError());
+ }
+ ```
+
+2. Register implementation for different devices.
+
+ ```c++
+ // src/pytorch/cuda/cudabind.cpp
+ ...
+
+ Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
+ // implement cuda forward here
+ // use `NewOpsForwardCUDAKernelLauncher` here
+ }
+ // declare interface here.
+ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
+ // register the implementation for given device (CUDA here).
+ REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
+ ```
+
+3. Add the ops implementation in the `pytorch` directory. Select the implementation according to the device type.
+
+ ```c++
+ // src/pytorch/new_ops.cpp
+ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
+ // dispatch the implementation according to the device type of input.
+ DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
+ }
+ ...
+
+ Tensor new_ops_forward(Tensor input, Tensor output, ...){
+ return new_ops_forward_impl(input, output, ...);
+ }
+ ```
+
+4. Bind the implementation in `pytorch/pybind.cpp`.
+
+ ```c++
+ // src/pytorch/pybind.cpp
+
+ ...
+
+ Tensor new_ops_forward(Tensor input, Tensor output, ...);
+
+ ...
+
+ // bind with pybind11
+ m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
+ py::arg("input"), py::arg("output"), ...);
+
+ ...
+
+ ```
+
+5. Build MMCV again. The new ops can then be used from Python:
+
+ ```python
+ from ..utils import ext_loader
+ ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
+
+ ...
+
+ ext_module.new_ops_forward(input, output, ...)
+
+ ```
diff --git a/external/cv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/external/cv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8453eaa8d3638394df8a0b169d8df01dfc27a11
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp
@@ -0,0 +1,426 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
+#pragma once
+#include <cassert>
+#include <cmath>
+
+#ifdef __CUDACC__
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace {
+
+template <typename T>
+struct RotatedBox {
+ T x_ctr, y_ctr, w, h, a;
+};
+
+template <typename T>
+struct Point {
+ T x, y;
+ HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+ HOST_DEVICE_INLINE Point operator+(const Point& p) const {
+ return Point(x + p.x, y + p.y);
+ }
+ HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
+ x += p.x;
+ y += p.y;
+ return *this;
+ }
+ HOST_DEVICE_INLINE Point operator-(const Point& p) const {
+ return Point(x - p.x, y - p.y);
+ }
+ HOST_DEVICE_INLINE Point operator*(const T coeff) const {
+ return Point(x * coeff, y * coeff);
+ }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.x + A.y * B.y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T cross_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.y - B.x * A.y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T>& box,
+                                             Point<T> (&pts)[4]) {
+ // M_PI / 180. == 0.01745329251
+ // double theta = box.a * 0.01745329251;
+ // MODIFIED
+ double theta = box.a;
+ T cosTheta2 = (T)cos(theta) * 0.5f;
+ T sinTheta2 = (T)sin(theta) * 0.5f;
+
+ // y: top --> down; x: left --> right
+ pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
+ pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
+ pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[2].x = 2 * box.x_ctr - pts[0].x;
+ pts[2].y = 2 * box.y_ctr - pts[0].y;
+ pts[3].x = 2 * box.x_ctr - pts[1].x;
+ pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
+                                               const Point<T> (&pts2)[4],
+                                               Point<T> (&intersections)[24]) {
+ // Line vector
+ // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+  Point<T> vec1[4], vec2[4];
+ for (int i = 0; i < 4; i++) {
+ vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+ vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+ }
+
+ // Line test - test all line combos for intersection
+ int num = 0; // number of intersections
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ // Solve for 2x2 Ax=b
+      T det = cross_2d<T>(vec2[j], vec1[i]);
+
+ // This takes care of parallel lines
+ if (fabs(det) <= 1e-14) {
+ continue;
+ }
+
+ auto vec12 = pts2[j] - pts1[i];
+
+      T t1 = cross_2d<T>(vec2[j], vec12) / det;
+      T t2 = cross_2d<T>(vec1[i], vec12) / det;
+
+ if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
+ intersections[num++] = pts1[i] + vec1[i] * t1;
+ }
+ }
+ }
+
+ // Check for vertices of rect1 inside rect2
+ {
+ const auto& AB = vec2[0];
+ const auto& DA = vec2[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ // assume ABCD is the rectangle, and P is the point to be judged
+ // P is inside ABCD iff. P's projection on AB lies within AB
+ // and P's projection on AD lies within AD
+
+ auto AP = pts1[i] - pts2[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA);
+
+ if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+ (APdotAD <= ADdotAD)) {
+ intersections[num++] = pts1[i];
+ }
+ }
+ }
+
+ // Reverse the check - check for vertices of rect2 inside rect1
+ {
+ const auto& AB = vec1[0];
+ const auto& DA = vec1[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ auto AP = pts2[i] - pts1[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA);
+
+ if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+ (APdotAD <= ADdotAD)) {
+ intersections[num++] = pts2[i];
+ }
+ }
+ }
+
+ return num;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
+                                          const int& num_in, Point<T> (&q)[24],
+ bool shift_to_zero = false) {
+ assert(num_in >= 2);
+
+ // Step 1:
+ // Find point with minimum y
+ // if more than 1 points have the same minimum y,
+ // pick the one with the minimum x.
+ int t = 0;
+ for (int i = 1; i < num_in; i++) {
+ if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+ t = i;
+ }
+ }
+ auto& start = p[t]; // starting point
+
+ // Step 2:
+  // Subtract starting point from every point (for sorting in the next step)
+ for (int i = 0; i < num_in; i++) {
+ q[i] = p[i] - start;
+ }
+
+ // Swap the starting point to position 0
+ auto tmp = q[0];
+ q[0] = q[t];
+ q[t] = tmp;
+
+ // Step 3:
+ // Sort point 1 ~ num_in according to their relative cross-product values
+ // (essentially sorting according to angles)
+ // If the angles are the same, sort according to their distance to origin
+ T dist[24];
+ for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+ }
+
+#ifdef __CUDACC__
+ // CUDA version
+ // In the future, we can potentially use thrust
+ // for sorting here to improve speed (though not guaranteed)
+ for (int i = 1; i < num_in - 1; i++) {
+ for (int j = i + 1; j < num_in; j++) {
+      T crossProduct = cross_2d<T>(q[i], q[j]);
+ if ((crossProduct < -1e-6) ||
+ (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+ auto q_tmp = q[i];
+ q[i] = q[j];
+ q[j] = q_tmp;
+ auto dist_tmp = dist[i];
+ dist[i] = dist[j];
+ dist[j] = dist_tmp;
+ }
+ }
+ }
+#else
+ // CPU version
+ std::sort(q + 1, q + num_in,
+            [](const Point<T>& A, const Point<T>& B) -> bool {
+              T temp = cross_2d<T>(A, B);
+              if (fabs(temp) < 1e-6) {
+                return dot_2d<T>(A, A) < dot_2d<T>(B, B);
+ } else {
+ return temp > 0;
+ }
+ });
+ // compute distance to origin after sort, since the points are now different.
+ for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+ }
+#endif
+
+ // Step 4:
+ // Make sure there are at least 2 points (that don't overlap with each other)
+ // in the stack
+ int k; // index of the non-overlapped second point
+ for (k = 1; k < num_in; k++) {
+ if (dist[k] > 1e-8) {
+ break;
+ }
+ }
+ if (k == num_in) {
+ // We reach the end, which means the convex hull is just one point
+ q[0] = p[t];
+ return 1;
+ }
+ q[1] = q[k];
+ int m = 2; // 2 points in the stack
+ // Step 5:
+ // Finally we can start the scanning process.
+ // When a non-convex relationship between the 3 points is found
+ // (either concave shape or duplicated points),
+ // we pop the previous point from the stack
+ // until the 3-point relationship is convex again, or
+ // until the stack only contains two points
+ for (int i = k + 1; i < num_in; i++) {
+    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
+ m--;
+ }
+ q[m++] = q[i];
+ }
+
+ // Step 6 (Optional):
+ // In general sense we need the original coordinates, so we
+ // need to shift the points back (reverting Step 2)
+ // But if we're only interested in getting the area/perimeter of the shape
+ // We can simply return.
+ if (!shift_to_zero) {
+ for (int i = 0; i < m; i++) {
+ q[i] += start;
+ }
+ }
+
+ return m;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) {
+ T area = 0;
+#pragma unroll
+ for (int i = 1; i < 3; i++) {
+    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
+ }
+
+ return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
+ if (m <= 2) {
+ return 0;
+ }
+
+ T area = 0;
+ for (int i = 1; i < m - 1; i++) {
+    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
+ }
+
+ return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
+                                                const RotatedBox<T>& box2) {
+ // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+ // from rotated_rect_intersection_pts
+  Point<T> intersectPts[24], orderedPts[24];
+
+  Point<T> pts1[4];
+  Point<T> pts2[4];
+  get_rotated_vertices<T>(box1, pts1);
+  get_rotated_vertices<T>(box2, pts2);
+
+  int num = get_intersection_points<T>(pts1, pts2, intersectPts);
+
+ if (num <= 2) {
+ return 0.0;
+ }
+
+ // Convex Hull to order the intersection points in clockwise order and find
+ // the contour area.
+  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
+  return polygon_area<T>(orderedPts, num_convex);
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4],
+                                               const Point<T> (&pts2)[4]) {
+ // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+ // from rotated_rect_intersection_pts
+  Point<T> intersectPts[24], orderedPts[24];
+
+  int num = get_intersection_points<T>(pts1, pts2, intersectPts);
+
+ if (num <= 2) {
+ return 0.0;
+ }
+
+ // Convex Hull to order the intersection points in clockwise order and find
+ // the contour area.
+  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
+  return polygon_area<T>(orderedPts, num_convex);
+}
+
+} // namespace
+
+template <typename T>
+HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
+ T const* const box2_raw,
+ const int mode_flag) {
+ // shift center to the middle point to achieve higher precision in result
+  RotatedBox<T> box1, box2;
+ auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
+ auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
+ box1.x_ctr = box1_raw[0] - center_shift_x;
+ box1.y_ctr = box1_raw[1] - center_shift_y;
+ box1.w = box1_raw[2];
+ box1.h = box1_raw[3];
+ box1.a = box1_raw[4];
+ box2.x_ctr = box2_raw[0] - center_shift_x;
+ box2.y_ctr = box2_raw[1] - center_shift_y;
+ box2.w = box2_raw[2];
+ box2.h = box2_raw[3];
+ box2.a = box2_raw[4];
+
+ const T area1 = box1.w * box1.h;
+ const T area2 = box2.w * box2.h;
+ if (area1 < 1e-14 || area2 < 1e-14) {
+ return 0.f;
+ }
+
+  const T intersection = rotated_boxes_intersection<T>(box1, box2);
+ T baseS = 1.0;
+ if (mode_flag == 0) {
+ baseS = (area1 + area2 - intersection);
+ } else if (mode_flag == 1) {
+ baseS = area1;
+ }
+ const T iou = intersection / baseS;
+ return iou;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw,
+ T const* const pts2_raw,
+ const int mode_flag) {
+ // shift center to the middle point to achieve higher precision in result
+  Point<T> pts1[4], pts2[4];
+
+ auto center_shift_x =
+ (pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] +
+ pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) /
+ 8.0;
+ auto center_shift_y =
+ (pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] +
+ pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) /
+ 8.0;
+ pts1[0].x = pts1_raw[0] - center_shift_x;
+ pts1[0].y = pts1_raw[1] - center_shift_y;
+ pts1[1].x = pts1_raw[2] - center_shift_x;
+ pts1[1].y = pts1_raw[3] - center_shift_y;
+ pts1[2].x = pts1_raw[4] - center_shift_x;
+ pts1[2].y = pts1_raw[5] - center_shift_y;
+ pts1[3].x = pts1_raw[6] - center_shift_x;
+ pts1[3].y = pts1_raw[7] - center_shift_y;
+ pts2[0].x = pts2_raw[0] - center_shift_x;
+ pts2[0].y = pts2_raw[1] - center_shift_y;
+ pts2[1].x = pts2_raw[2] - center_shift_x;
+ pts2[1].y = pts2_raw[3] - center_shift_y;
+ pts2[2].x = pts2_raw[4] - center_shift_x;
+ pts2[2].y = pts2_raw[5] - center_shift_y;
+ pts2[3].x = pts2_raw[6] - center_shift_x;
+ pts2[3].y = pts2_raw[7] - center_shift_y;
+
+ const T area1 = quadri_box_area(pts1);
+ const T area2 = quadri_box_area(pts2);
+ if (area1 < 1e-14 || area2 < 1e-14) {
+ return 0.f;
+ }
+
+  const T intersection = quadri_boxes_intersection<T>(pts1, pts2);
+ T baseS = 1.0;
+ if (mode_flag == 0) {
+ baseS = (area1 + area2 - intersection);
+ } else if (mode_flag == 1) {
+ baseS = area1;
+ }
+ const T iou = intersection / baseS;
+ return iou;
+}
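These helpers back `mmcv.ops.box_iou_rotated` on the Python side. A minimal behavioral sketch (boxes as `(x_ctr, y_ctr, w, h, angle)`; per the MODIFIED note in `get_rotated_vertices`, the angle is taken in radians here):

```python
import torch
from mmcv.ops import box_iou_rotated

boxes1 = torch.tensor([[10., 10., 4., 2., 0.]])
boxes2 = torch.tensor([[10., 10., 4., 2., 0.],        # identical box
                       [10., 10., 4., 2., 1.5708]])   # same box rotated ~90 deg
ious = box_iou_rotated(boxes1, boxes2)                # shape (1, 2)
# expected roughly [1.0, 0.3333]: the rotated pair overlaps in a 2x2 square,
# so IoU = 4 / (8 + 8 - 4) = 1/3
```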
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..36e41107ebd52d3cf5e9a71cffe6eddeed4f0765
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
@@ -0,0 +1,59 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
+#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
+#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename scalar_t>
+__global__ void active_rotated_filter_forward_cuda_kernel(
+ const int nthreads, const scalar_t* weight_data, const int* indices_data,
+ const int num_input_planes, const int num_output_planes,
+ const int num_orientations, const int num_rotations, const int nEntry,
+ scalar_t* output_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int l = index % nEntry;
+ int j = (index / nEntry) % num_input_planes;
+ int i = index / nEntry / num_input_planes;
+ int k;
+ scalar_t val = *(weight_data + index);
+ for (k = 0; k < num_rotations; k++) {
+ int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
+ scalar_t* target = output_data +
+ i * (num_rotations * num_input_planes * nEntry) +
+ k * (num_input_planes * nEntry) + j * (nEntry) + idx;
+ *target = val;
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void active_rotated_filter_backward_cuda_kernel(
+ const int nthreads, const scalar_t* gradWeight_data,
+ const int* indices_data, const int num_input_planes,
+ const int num_output_planes, const int num_orientations,
+ const int num_rotations, const int nEntry, scalar_t* weight_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int l = index % nEntry;
+ int j = (index / nEntry) % num_input_planes;
+ int i = index / nEntry / num_input_planes;
+ int k;
+ scalar_t* val = weight_data + index;
+ *val = 0;
+ scalar_t tmp = 0;
+ for (k = 0; k < num_rotations; k++) {
+ int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
+ scalar_t target =
+ *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
+ k * (num_input_planes * nEntry) + j * (nEntry) + idx);
+ tmp = tmp + target;
+ }
+ *val = tmp;
+ }
+}
+#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9f9250844b9ceeca0df0377640c3d28e3f61cecc
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
@@ -0,0 +1,116 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
+#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
+// output: fout(B,O,N)
+// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j)
+// i(k) = idx(b,i,k)
+// sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
+// avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
+// max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))
+
+template <typename T>
+__global__ void assign_score_withk_forward_cuda_kernel(
+ const int B, const int N0, const int N1, const int M, const int K,
+ const int O, const int aggregate, const T* points, const T* centers,
+ const T* scores, const int64_t* knn_idx, T* output) {
+ // ----- parallel loop for B, N1, K and O ---------
+ CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
+ // ------- loop for M ----------
+ const int b = (int)(i / (O * N1 * K));
+ const int o = (int)(i % (O * N1 * K) / (N1 * K));
+ const int n = (int)(i % (N1 * K) / K);
+ const int k = (int)(i % K);
+ const int cn = (int)knn_idx[b * K * N1 + n * K +
+ 0]; // The first neighbor is the center point
+ const int kn = (int)knn_idx[b * K * N1 + n * K + k];
+ if (kn >= N0 ||
+ kn < 0) { // if index overflows, it is out of the neighborhood range
+ return;
+ }
+ assert(b < B);
+ assert(kn < N0);
+ assert(cn < N0);
+ assert(o < O);
+ assert(n < N1);
+ const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
+ T val = output[out_idx];
+ for (int m = 0; m < M; m++) {
+ val += points[b * N0 * M * O + kn * M * O + m * O + o] *
+ scores[b * N1 * K * M + n * K * M + k * M + m] -
+ centers[b * N0 * M * O + cn * M * O + m * O + o] *
+ scores[b * N1 * K * M + n * K * M + k * M + m];
+ }
+ output[out_idx] = val;
+ }
+}
+
+template <typename T>
+__global__ void assign_score_withk_points_backward_cuda_kernel(
+ const int B, const int N0, const int N, const int M, const int K,
+ const int O, const int aggregate, const T* grad_out, const T* scores,
+ const int64_t* knn_idx, T* grad_points, T* grad_centers) {
+ // ----- parallel loop for B, M, O ---------
+ CUDA_1D_KERNEL_LOOP(i, B * M * O) {
+ int b = (int)(i / (M * O));
+ int m = (int)(i % (M * O) / O);
+ int o = (int)(i % O);
+
+ // ----- loop for N,K ---------
+ for (int n = 0; n < N; n++) {
+ for (int k = 0; k < K; k++) {
+ int kn = knn_idx[b * N * K + n * K + k];
+ int cn = knn_idx[b * N * K + n * K + 0];
+ if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
+ // neighborhood range
+ continue;
+ }
+ atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
+ scores[b * N * K * M + n * K * M + k * M + m] *
+ grad_out[b * O * N * K + o * N * K + n * K + k]);
+ atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
+ -scores[b * N * K * M + n * K * M + k * M + m] *
+ grad_out[b * O * N * K + o * N * K + n * K + k]);
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void assign_score_withk_scores_backward_cuda_kernel(
+ const int B, const int N0, const int N, const int M, const int K,
+ const int O, const int aggregate, const T* grad_out, const T* points,
+ const T* centers, const int64_t* knn_idx, T* grad_scores) {
+ // ----- parallel loop for B, N, K, M ---------
+ CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
+ const int b = (int)(i / (N * M * K));
+ const int n = (int)(i % (N * M * K) / M / K);
+ const int k = (int)(i % (M * K) / M);
+ const int m = (int)(i % M);
+ const int cn = knn_idx[b * N * K + n * K + 0];
+ const int kn = knn_idx[b * N * K + n * K + k];
+ if (kn >= N0 ||
+ kn < 0) { // if index overflows, it is out of the neighborhood range
+ return;
+ }
+
+ // -------------- loop for O ------------------------
+ const int out_idx = b * N * K * M + n * K * M + k * M + m;
+ T val = grad_scores[out_idx];
+ for (int o = 0; o < O; o++) {
+ val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
+ centers[b * N0 * M * O + cn * M * O + m * O + o]) *
+ grad_out[b * O * N * K + o * N * K + n * K + k];
+ }
+ grad_scores[out_idx] = val;
+ }
+}
+
+#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..632b5c4940b33a9d8d839fa3f3b92e7b6a2bd29e
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
@@ -0,0 +1,58 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
+#ifndef BALL_QUERY_CUDA_KERNEL_CUH
+#define BALL_QUERY_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
+ float min_radius,
+ float max_radius, int nsample,
+ const T* new_xyz, const T* xyz,
+ int* idx) {
+ // new_xyz: (B, M, 3)
+ // xyz: (B, N, 3)
+ // output:
+ // idx: (B, M, nsample)
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b) return;
+
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
+ xyz += bs_idx * n * 3;
+ idx += bs_idx * m * nsample + pt_idx * nsample;
+
+ float max_radius2 = max_radius * max_radius;
+ float min_radius2 = min_radius * min_radius;
+ T new_x = new_xyz[0];
+ T new_y = new_xyz[1];
+ T new_z = new_xyz[2];
+
+ int cnt = 0;
+ for (int k = 0; k < n; ++k) {
+ T x = xyz[k * 3 + 0];
+ T y = xyz[k * 3 + 1];
+ T z = xyz[k * 3 + 2];
+ T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+ (new_z - z) * (new_z - z);
+ if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
+ if (cnt == 0) {
+ for (int l = 0; l < nsample; ++l) {
+ idx[l] = k;
+ }
+ }
+ idx[cnt] = k;
+ ++cnt;
+ if (cnt >= nsample) break;
+ }
+ }
+ }
+}
+
+#endif // BALL_QUERY_CUDA_KERNEL_CUH
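Note that the kernel first fills the whole row with the first in-range neighbor, so `idx` is always fully populated even when fewer than `nsample` points fall inside the annulus. A hedged usage sketch of the Python wrapper (argument order as in `mmcv.ops.ball_query`; assumes a CUDA build):

```python
import torch
from mmcv.ops import ball_query

xyz = torch.rand(2, 256, 3, device='cuda')        # (B, N, 3) source points
center_xyz = torch.rand(2, 64, 3, device='cuda')  # (B, M, 3) query centers

# up to 16 neighbor indices per center with 0 <= dist < 0.2
idx = ball_query(0.0, 0.2, 16, xyz, center_xyz)   # (2, 64, 16)
```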
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..15bd91eca629895d3a99dde3fe6614036ca31dc9
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
@@ -0,0 +1,147 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
+#define BBOX_OVERLAPS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
+ T& y1, T& x2, T& y2) {
+ x1 = bbox[base];
+ y1 = bbox[base + 1];
+ x2 = bbox[base + 2];
+ y2 = bbox[base + 3];
+}
+
+template <>
+__device__ __forceinline__ void load_bbox<float>(const float* bbox,
+ const int base, float& x1,
+ float& y1, float& x2,
+ float& y2) {
+  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
+ x1 = bbox_offset.x;
+ y1 = bbox_offset.y;
+ x2 = bbox_offset.z;
+ y2 = bbox_offset.w;
+}
+
+template <typename T>
+__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
+ T* ious, const int num_bbox1,
+ const int num_bbox2, const int mode,
+ const bool aligned,
+ const int offset) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
+ const int b1 = index;
+ const int b2 = index;
+
+ const int base1 = b1 << 2; // b1 * 4
+ T b1_x1, b1_y1, b1_x2, b1_y2;
+      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
+ const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
+
+ const int base2 = b2 << 2; // b2 * 4
+ T b2_x1, b2_y1, b2_x2, b2_y2;
+      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
+ const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
+
+ const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
+ const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
+ const T width = fmaxf(right - left + offset, 0.f);
+ const T height = fmaxf(bottom - top + offset, 0.f);
+ const T interS = width * height;
+
+ const T baseS =
+ fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
+ ious[index] = interS / baseS;
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
+ const int b1 = index / num_bbox2;
+ const int b2 = index % num_bbox2;
+
+ const int base1 = b1 << 2; // b1 * 4
+ T b1_x1, b1_y1, b1_x2, b1_y2;
+      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
+ const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
+
+ const int base2 = b2 << 2; // b2 * 4
+ T b2_x1, b2_y1, b2_x2, b2_y2;
+      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
+ const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
+
+ const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
+ const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
+ const T width = fmaxf(right - left + offset, 0.f);
+ const T height = fmaxf(bottom - top + offset, 0.f);
+ const T interS = width * height;
+
+ const T baseS =
+ fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
+ ious[index] = interS / baseS;
+ }
+ }
+}
+
+#if __CUDA_ARCH__ >= 530
+__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
+ const __half x2, const __half y2,
+ const __half offset) {
+ const __half half_w = __hadd(__hsub(x2, x1), offset);
+ const __half half_h = __hadd(__hsub(y2, y1), offset);
+ return __hmul(half_w, half_h);
+}
+
+__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
+ return __hge(a, b) ? a : b;
+}
+
+__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
+ return __hle(a, b) ? a : b;
+}
+
+// fp16 won't provide much speedup when aligned==true. It is useful when
+// aligned==false, where it gives roughly a 40% speedup.
+__device__ void bbox_overlaps_cuda_kernel_half(
+ const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
+ const int num_bbox2, const int mode, const bool aligned, const int offset) {
+ const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
+ const __half h_offset = __int2half_rn(offset);
+ CUDA_1D_KERNEL_LOOP(index, num_output) {
+ const int b1 = aligned ? index : index / num_bbox2;
+ const int b2 = aligned ? index : index % num_bbox2;
+
+ const int base1 = b1 << 2;
+ __half b1_x1, b1_y1, b1_x2, b1_y2;
+ load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
+ const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);
+
+ const int base2 = b2 << 2;
+ __half b2_x1, b2_y1, b2_x2, b2_y2;
+ load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
+ const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);
+
+ const __half left = __half_max(b1_x1, b2_x1),
+ right = __half_min(b1_x2, b2_x2);
+ const __half top = __half_max(b1_y1, b2_y1),
+ bottom = __half_min(b1_y2, b2_y2);
+ const __half width =
+ __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
+ const __half height =
+ __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
+ const __half interS = __hmul(width, height);
+
+ const __half baseS = __half_max(
+ mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
+ h_offset);
+ ious[index] = __hdiv(interS, baseS);
+ }
+}
+#endif // __CUDA_ARCH__ >= 530
+
+#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
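The aligned/unaligned split above is exposed directly through `mmcv.ops.bbox_overlaps`; a small sketch with axis-aligned `(x1, y1, x2, y2)` boxes:

```python
import torch
from mmcv.ops import bbox_overlaps

bboxes1 = torch.tensor([[0., 0., 10., 10.],
                        [10., 10., 20., 20.]])
bboxes2 = torch.tensor([[0., 0., 10., 10.],
                        [5., 5., 15., 15.]])

print(bbox_overlaps(bboxes1, bboxes2, aligned=False))  # (2, 2), all pairs
print(bbox_overlaps(bboxes1, bboxes2, aligned=True))   # (2,), elementwise
```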
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..537610416e16aae8979d0843972e090d127b0d43
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh
@@ -0,0 +1,230 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Modified from
+// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu
+#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH
+#define BEZIER_ALIGN_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+template <typename T>
+__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,
+ const T u) {
+ return ((1. - u) * (1. - u) * (1. - u) * p0 +
+ 3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 +
+ u * u * u * p3);
+}
+
+template <typename T>
+__global__ void bezier_align_forward_cuda_kernel(
+ const int nthreads,
+ const T *bottom_data, // inputs
+ const T *bottom_rois, // bottom rois contains the bezier curve
+ T *top_data, // outputs
+ const int pooled_height, const int pooled_width, const T spatial_scale,
+ const int sampling_ratio, bool aligned, const int channels,
+ const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ // beziers have size Nx(1+8*2) = Nx17
+ const T *offset_bottom_rois = bottom_rois + n * 17;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T offset = aligned ? (T)0.5 : (T)0.0;
+
+ // TODO: avoid this by using parallel annotation, for good
+ T p0_x = offset_bottom_rois[1] * spatial_scale;
+ T p0_y = offset_bottom_rois[2] * spatial_scale;
+ T p1_x = offset_bottom_rois[3] * spatial_scale;
+ T p1_y = offset_bottom_rois[4] * spatial_scale;
+ T p2_x = offset_bottom_rois[5] * spatial_scale;
+ T p2_y = offset_bottom_rois[6] * spatial_scale;
+ T p3_x = offset_bottom_rois[7] * spatial_scale;
+ T p3_y = offset_bottom_rois[8] * spatial_scale;
+ T p4_x = offset_bottom_rois[15] * spatial_scale;
+ T p4_y = offset_bottom_rois[16] * spatial_scale;
+ T p5_x = offset_bottom_rois[13] * spatial_scale;
+ T p5_y = offset_bottom_rois[14] * spatial_scale;
+ T p6_x = offset_bottom_rois[11] * spatial_scale;
+ T p6_y = offset_bottom_rois[12] * spatial_scale;
+ T p7_x = offset_bottom_rois[9] * spatial_scale;
+ T p7_y = offset_bottom_rois[10] * spatial_scale;
+
+ // compute the coords
+    const T u = pw / static_cast<T>(pooled_width);
+    const T v = ph / static_cast<T>(pooled_height);
+ const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
+ const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
+ const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
+ const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
+ const T x_center = x1 * v + x0 * (1. - v) - offset;
+ const T y_center = y1 * v + y0 * (1. - v) - offset;
+
+ T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
+ T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
+ if (!aligned) { // for backward-compatibility only
+ roi_width = max(roi_width, (T)1.);
+ roi_height = max(roi_height, (T)1.);
+ }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T *offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ // When the grid is empty, output zeros == 0/1, instead of NaN.
+ const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+ {
+ const T y = y_center - (T)0.5 * bin_size_h +
+                static_cast<T>(iy + .5f) * bin_size_h /
+                    static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = x_center - (T)0.5 * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+
+ T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,
+ index);
+ output_val += val;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+}
+
+template <typename T>
+__global__ void bezier_align_backward_cuda_kernel(
+ const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,
+ const int pooled_height, const int pooled_width, const T spatial_scale,
+ const int sampling_ratio, bool aligned, const int channels,
+ const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ // beziers have size Nx(1+8*2) = Nx17
+ const T *offset_bottom_rois = bottom_rois + n * 17;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T offset = aligned ? (T)0.5 : (T)0.0;
+ T p0_x = offset_bottom_rois[1] * spatial_scale;
+ T p0_y = offset_bottom_rois[2] * spatial_scale;
+ T p1_x = offset_bottom_rois[3] * spatial_scale;
+ T p1_y = offset_bottom_rois[4] * spatial_scale;
+ T p2_x = offset_bottom_rois[5] * spatial_scale;
+ T p2_y = offset_bottom_rois[6] * spatial_scale;
+ T p3_x = offset_bottom_rois[7] * spatial_scale;
+ T p3_y = offset_bottom_rois[8] * spatial_scale;
+ T p4_x = offset_bottom_rois[15] * spatial_scale;
+ T p4_y = offset_bottom_rois[16] * spatial_scale;
+ T p5_x = offset_bottom_rois[13] * spatial_scale;
+ T p5_y = offset_bottom_rois[14] * spatial_scale;
+ T p6_x = offset_bottom_rois[11] * spatial_scale;
+ T p6_y = offset_bottom_rois[12] * spatial_scale;
+ T p7_x = offset_bottom_rois[9] * spatial_scale;
+ T p7_y = offset_bottom_rois[10] * spatial_scale;
+
+ // compute the coords
+    const T u = pw / static_cast<T>(pooled_width);
+    const T v = ph / static_cast<T>(pooled_height);
+ const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);
+ const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);
+ const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);
+ const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);
+ const T x_center = x1 * v + x0 * (1. - v) - offset;
+ const T y_center = y1 * v + y0 * (1. - v) - offset;
+
+ T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));
+ T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));
+ if (!aligned) { // for backward-compatibility only
+ roi_width = max(roi_width, (T)1.);
+ roi_height = max(roi_height, (T)1.);
+ }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T *offset_bottom_diff =
+ bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T *offset_top_diff = top_diff + top_offset;
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+ {
+ const T y = y_center - (T)0.5 * bin_size_h +
+                static_cast<T>(iy + .5f) * bin_size_h /
+                    static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T x = x_center - (T)0.5 * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high, index);
+
+ T g1 = top_diff_this_bin * w1 / count;
+ T g2 = top_diff_this_bin * w2 / count;
+ T g3 = top_diff_this_bin * w3 / count;
+ T g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(offset_bottom_diff + y_low * width + x_low,
+                    static_cast<T>(g1));
+ atomicAdd(offset_bottom_diff + y_low * width + x_high,
+                    static_cast<T>(g2));
+ atomicAdd(offset_bottom_diff + y_high * width + x_low,
+                    static_cast<T>(g3));
+ atomicAdd(offset_bottom_diff + y_high * width + x_high,
+                    static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+} // BezierAlignBackward
+
+#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH
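The sampling grid comes from two cubic Bézier borders evaluated at `u` and blended along `v`. A plain-Python mirror of the `bezier_curve` device function, for reference:

```python
def bezier_curve(p0, p1, p2, p3, u):
    """Cubic Bezier, identical to the device function above."""
    return ((1 - u) ** 3 * p0
            + 3 * u * (1 - u) ** 2 * p1
            + 3 * u ** 2 * (1 - u) * p2
            + u ** 3 * p3)

# the curve reproduces its endpoints at u = 0 and u = 1
assert bezier_curve(0., 1., 2., 3., 0.) == 0.
assert bezier_curve(0., 1., 2., 3., 1.) == 3.
```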
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1d2a2197b45ef5c82412c4b75d7819a7e27674f6
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
@@ -0,0 +1,200 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// modified from
+// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.
+// the main difference: (1) use `argmax_idx` for fast computing of gradient
+// during the backward. (2) `wh` is directly computed by `boxes`, rather than
+// passing it as argument to forward or backward functions.
+
+#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH
+#define BORDER_ALIGN_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };
+
+/*** Forward ***/
+template <typename T>
+__global__ void border_align_forward_cuda_kernel(
+ const int nthreads, const T* input, const T* boxes, T* output,
+ int* argmax_idx, const int channels, const int box_size, const int height,
+ const int width, const int pool_size) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (batch_idx, c_idx, box_idx) is an element paralleled for computing
+ // output, and `extreme_idx` is in range [0,3]
+ int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx;
+ const T *offset_box, *offset_input, *offset_box_x;
+ T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y,
+ val, maxval;
+
+ extreme_idx = threadIdx.y;
+ // shape (N, C, box_size, 4) for output
+ batch_idx = index / channels / box_size;
+ // shape (N, box_size, 4) for boxes
+ box_idx = index % box_size + batch_idx * box_size;
+ c_idx = (index / box_size) % channels;
+
+ offset_box = boxes + box_idx * 4;
+ box_width = *(offset_box + 2) - *offset_box;
+ box_height = *(offset_box + 3) - *(offset_box + 1);
+ offset_output = output + index * 4 + extreme_idx;
+ offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
+ // shape (N, 4C, h, w) for input.
+ // [0,C) for top feature, [C,2C) for left feature,
+ // [2C,3C) for bottom feature, [3C,4C) for right feature
+ offset_input =
+ input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) *
+ height * width;
+
+ // extreme_idx in [0,1] -> offset_box_x indexed at x1
+ // extreme_idx in [2,3] -> offset_box_x indexed at x2
+ offset_box_x = offset_box + extreme_idx / 2 * 2;
+
+ // (x1,y1) or (x2,y2) for (x,y)
+ x = *offset_box_x;
+ y = *(offset_box_x + 1);
+
+ switch (extreme_idx) {
+ // top
+ case BorderMode::Top:
+ stride = box_width / pool_size;
+ x_stride = stride;
+ y_stride = 0;
+ break;
+ // left
+ case BorderMode::Left:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = stride;
+ break;
+ // bottom
+ case BorderMode::Bottom:
+ stride = box_width / pool_size;
+ x_stride = -stride;
+ y_stride = 0;
+ break;
+ // right
+ case BorderMode::Right:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = -stride;
+ break;
+ }
+
+ // initialize maxval and maxidx with the start position (e.g. (x1,y1) or
+ // (x2,y2))
+ maxval = bilinear_interpolate(offset_input, height, width, y, x, index);
+ maxidx = 0;
+
+ // do max_pool along the border
+ for (int i = 1; i <= pool_size; i++) {
+ x += x_stride;
+ y += y_stride;
+ val = bilinear_interpolate(offset_input, height, width, y, x, index);
+ if (val > maxval) {
+ maxval = val;
+ maxidx = i;
+ }
+ }
+
+ // update output and argmax_idx
+ *offset_output = maxval;
+ *offset_argmax_idx = maxidx;
+ }
+}
+
+/*** Backward ***/
+template <typename T>
+__global__ void border_align_backward_cuda_kernel(
+ const int nthreads, const T* grad_output, const T* boxes,
+ const int* argmax_idx, T* grad_input, const int channels,
+ const int box_size, const int height, const int width,
+ const int pool_size) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (batch_idx, c_idx, box_idx) is an element paralleled for computing
+ // output, and `extreme_idx` is in range [0,3]
+ int batch_idx, c_idx, box_idx, extreme_idx;
+ const int* offset_argmax_idx;
+ const T *offset_grad_output, *offset_box, *offset_box_x;
+ T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,
+ y;
+
+ extreme_idx = threadIdx.y;
+ batch_idx = index / channels / box_size;
+ box_idx = index % box_size + batch_idx * box_size;
+ c_idx = (index / box_size) % channels;
+
+ offset_box = boxes + box_idx * 4;
+ box_width = *(offset_box + 2) - *offset_box;
+ box_height = *(offset_box + 3) - *(offset_box + 1);
+ offset_grad_output = grad_output + index * 4 + extreme_idx;
+ offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
+ // [0,C) for top feature grad, [C,2C) for left feature grad,
+ // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad
+ offset_grad_input = grad_input + (batch_idx * channels * 4 +
+ extreme_idx * channels + c_idx) *
+ height * width;
+
+ // extreme_idx in [0,1] -> offset_box_x indexed at x1
+ // extreme_idx in [2,3] -> offset_box_x indexed at x2
+ offset_box_x = offset_box + extreme_idx / 2 * 2;
+
+ switch (extreme_idx) {
+ // top
+ case BorderMode::Top:
+ stride = box_width / pool_size;
+ x_stride = stride;
+ y_stride = 0;
+ break;
+ // left
+ case BorderMode::Left:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = stride;
+ break;
+ // bottom
+ case BorderMode::Bottom:
+ stride = box_width / pool_size;
+ x_stride = -stride;
+ y_stride = 0;
+ break;
+ // right
+ case BorderMode::Right:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = -stride;
+ break;
+ }
+
+ // get position (x,y) which has maximum value during forward
+ x = *offset_box_x;
+ y = *(offset_box_x + 1);
+ x += x_stride * (T)(*offset_argmax_idx);
+ y += y_stride * (T)(*offset_argmax_idx);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,
+ x_high, y_low, y_high, index);
+
+ // update grad_output
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ *offset_grad_output * w1);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ *offset_grad_output * w2);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ *offset_grad_output * w3);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ *offset_grad_output * w4);
+ }
+}
+
+#endif // BORDER_ALIGN_CUDA_KERNEL_CUH
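Per the shape comments in the kernels, the input packs the four border features along the channel axis. A hedged sketch of the Python wrapper (`mmcv.ops.border_align`, assuming a CUDA build; output is `(N, C, num_boxes, 4)`):

```python
import torch
from mmcv.ops import border_align

feat = torch.rand(1, 32, 20, 20, device='cuda')          # 4 * C channels, C = 8
boxes = torch.tensor([[[2., 2., 12., 12.],
                       [0., 0., 19., 19.]]], device='cuda')  # (1, 2, 4) x1y1x2y2
out = border_align(feat, boxes, pool_size=10)            # (1, 8, 2, 4)
```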
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..cf8ad5e1a324de3a11c8fc8af28a8d559a661ed6
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh
@@ -0,0 +1,91 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#ifndef BOX_IOU_QUADRI_CUDA_CUH
+#define BOX_IOU_QUADRI_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+template <typename T>
+__global__ void box_iou_quadri_cuda_kernel(
+ const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
+ const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
+ int b1 = index;
+ int b2 = index;
+
+ int base1 = b1 * 8;
+
+ float block_boxes1[8];
+ float block_boxes2[8];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+ block_boxes1[5] = dev_boxes1[base1 + 5];
+ block_boxes1[6] = dev_boxes1[base1 + 6];
+ block_boxes1[7] = dev_boxes1[base1 + 7];
+
+ int base2 = b2 * 8;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+ block_boxes2[5] = dev_boxes2[base2 + 5];
+ block_boxes2[6] = dev_boxes2[base2 + 6];
+ block_boxes2[7] = dev_boxes2[base2 + 7];
+
+ dev_ious[index] =
+          single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
+ int b1 = index / n_boxes2;
+ int b2 = index % n_boxes2;
+
+ int base1 = b1 * 8;
+
+ float block_boxes1[8];
+ float block_boxes2[8];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+ block_boxes1[5] = dev_boxes1[base1 + 5];
+ block_boxes1[6] = dev_boxes1[base1 + 6];
+ block_boxes1[7] = dev_boxes1[base1 + 7];
+
+ int base2 = b2 * 8;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+ block_boxes2[5] = dev_boxes2[base2 + 5];
+ block_boxes2[6] = dev_boxes2[base2 + 6];
+ block_boxes2[7] = dev_boxes2[base2 + 7];
+
+ dev_ious[index] =
+          single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);
+ }
+ }
+}
+
+#endif
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..abd47cd85437804310886de057b5a839a49481b2
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
@@ -0,0 +1,81 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
+#ifndef BOX_IOU_ROTATED_CUDA_CUH
+#define BOX_IOU_ROTATED_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+ const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
+ const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
+ int b1 = index;
+ int b2 = index;
+
+ int base1 = b1 * 5;
+
+ float block_boxes1[5];
+ float block_boxes2[5];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+
+ int base2 = b2 * 5;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+
+ dev_ious[index] =
+          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
+ int b1 = index / n_boxes2;
+ int b2 = index % n_boxes2;
+
+ int base1 = b1 * 5;
+
+ float block_boxes1[5];
+ float block_boxes2[5];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+
+ int base2 = b2 * 5;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+
+ dev_ious[index] =
+          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);
+ }
+ }
+}
+
+#endif
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..311900fcd303483dea815a1eb996a7eb33fdc55b
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
@@ -0,0 +1,335 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CARAFE_CUDA_KERNEL_CUH
+#define CARAFE_CUDA_KERNEL_CUH
+
+#include <algorithm>
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#ifdef MMCV_WITH_HIP
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+#define THREADS_PER_PIXEL 32
+#define MAX_SHARED_MEMORY 49152
+#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
+#define MAXIMIZE_KERNEL_SIZE true
+#define kTileDim 32
+#define kBlockRows 8
+#define FULL_MASK 0xffffffff
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+ const int w, const int channel_num,
+ const int height, const int width) {
+ int index = w + (h + (c + n * channel_num) * height) * width;
+ return index;
+}
+#ifndef MMCV_WITH_HIP
+/* TODO: move this to a common place */
+template <typename scalar_t>
+__device__ inline scalar_t min(scalar_t a, scalar_t b) {
+ return a < b ? a : b;
+}
+
+template <typename scalar_t>
+__device__ inline scalar_t max(scalar_t a, scalar_t b) {
+ return a > b ? a : b;
+}
+#endif
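+// Warp-level tree reduction via shuffle-down: after the loop, lane 0 holds
+// the sum of `val` over all lanes of the warp.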
+template <typename scalar_t>
+__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
+ for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
+#ifdef MMCV_WITH_HIP
+ val += __shfl_down(val, offset);
+#else
+ val += __shfl_down_sync(FULL_MASK, val, offset);
+#endif
+ return val;
+}
+
+template <>
+__device__ __forceinline__ phalf warpReduceSum(phalf val) {
+ for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
+#ifdef MMCV_WITH_HIP
+ // Using PyTorch's macro for half support
+ __PHALF(val) += WARP_SHFL_DOWN(val, offset);
+#else
+ __PHALF(val) +=
+ __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);
+#endif
+ return val;
+}
+
+// Splits the original matrix into submatrices with size 32 * 32.
+// Each block transposes one submatrix by loading it into shared memory.
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
+template <typename scalar_t>
+__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,
+ const int W, const int dh,
+ const int dw,
+ const scalar_t *__restrict__ X,
+ scalar_t *__restrict__ Y) {
+ __shared__ scalar_t tile[kTileDim][kTileDim + 1];
+ const int n = blockIdx.x / (dh * dw);
+ const int k = blockIdx.x % (dh * dw);
+ const int r = k / dw;
+ const int c = k % dw;
+ const int offset = n * H * W;
+ int x = c * kTileDim + threadIdx.x;
+ int y = r * kTileDim + threadIdx.y;
+ if (x < W) {
+ for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
+ tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
+ }
+ }
+ __syncthreads();
+ x = r * kTileDim + threadIdx.x;
+ y = c * kTileDim + threadIdx.y;
+ if (x < H) {
+ for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
+ Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
+ }
+ }
+}
+template <typename scalar_t>
+__global__ void CARAFEForward(
+ const int num_kernels, const scalar_t *__restrict__ bottom_data,
+ const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int down_height, const int down_width, const int height,
+ const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
+#if MAXIMIZE_KERNEL_SIZE
+ __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+ __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+ const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int down_pw = pw / scale_factor;
+ const int down_ph = ph / scale_factor;
+
+ const int start_w = down_pw - (kernel_size - 1) / 2;
+ const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ const int start_h = down_ph - (kernel_size - 1) / 2;
+ const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
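+  // Threads are grouped THREADS_PER_PIXEL per output pixel; each group stages
+  // its pixel's reassembly mask in shared memory for the weighted sum below.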
+ for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+ int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
+ shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+ }
+ __syncthreads();
+
+ const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ int mask_group = c / channels_per_group;
+ scalar_t output_val = 0;
+#pragma unroll
+ for (int iy = start_h; iy < end_h; iy++) {
+#pragma unroll
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, iy, ix, c, down_height, down_width, channels);
+
+ output_val += bottom_data[feat_index] *
+ shared_mask[mask_c * WARP_SIZE + pixel_id];
+ }
+ }
+
+ int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+ top_data[top_index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void CARAFEBackward_Feature(
+ const int num_kernels, const scalar_t *__restrict__ top_diff,
+ const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int down_height, const int down_width, const int height,
+ const int width, const int mask_channels,
+ scalar_t *__restrict__ bottom_diff) {
+#if MAXIMIZE_KERNEL_SIZE
+ __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+ __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+
+ const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ // (n, c, ph, pw) is an element in the bottom_data
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
+ const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
+ const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
+ const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
+ for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+ const int mask_w = (c % kernel_size) * scale_factor;
+ const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
+ const int mask_x = start_w + mask_w;
+ const int mask_y = start_h + mask_h;
+ if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
+ shared_mask[c * WARP_SIZE + pixel_id] = 0;
+ continue;
+ }
+ const int mask_group = c / (kernel_size * kernel_size);
+ const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
+ int mask_index =
+ Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
+ shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+ }
+ __syncthreads();
+ const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ int mask_group = c / channels_per_group;
+ int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+ scalar_t output_val = 0;
+#pragma unroll
+ for (int iy = start_h; iy < end_h; iy += scale_factor) {
+#pragma unroll
+ for (int ix = start_w; ix < end_w; ix += scale_factor) {
+ if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
+ continue;
+ }
+ int mask_iy =
+ (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+ int mask_ix =
+ (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
+ output_val +=
+ shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
+ }
+ }
+ bottom_diff[top_index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void FeatureSum(const int num_kernels,
+ const scalar_t *__restrict__ input_data,
+ const int scale_factor, const int channels,
+ const int height, const int width,
+ scalar_t *__restrict__ output_data) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ scalar_t output_val = 0;
+ for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {
+ for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {
+ int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,
+ width * scale_factor, channels);
+ output_val += input_data[input_id];
+ }
+ }
+ const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);
+ output_data[output_id] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void CARAFEBackward_Mask(const int num_kernels,
+ const scalar_t *__restrict__ top_diff,
+ const scalar_t *__restrict__ bottom_data,
+ const int kernel_size, const int group_size,
+ const int scale_factor, const int channels,
+ const int down_height, const int down_width,
+ const int height, const int width,
+ const int mask_channels,
+ scalar_t *__restrict__ mask_diff) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+
+ const int lane_id = index % WARP_SIZE;
+ index = index / WARP_SIZE;
+ const int mask_c = index % mask_channels;
+ // (n, c, ph, pw) is an element in the bottom_data
+ index = index / mask_channels;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int down_pw = pw / scale_factor;
+ const int down_ph = ph / scale_factor;
+
+ const int mask_group = mask_c / (kernel_size * kernel_size);
+ const int mask_loc = mask_c % (kernel_size * kernel_size);
+
+ const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
+ const int offset_y =
+ mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;
+
+ const int down_x = down_pw + offset_x;
+ const int down_y = down_ph + offset_y;
+
+ scalar_t output_val = 0;
+
+ if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
+ down_x <= down_width - 1) {
+ const int channels_per_mask = ceilf(channels / (float)group_size);
+ const int start = channels_per_mask * mask_group;
+ const int end = min(channels_per_mask * (mask_group + 1), channels);
+ for (int c = start + lane_id; c < end; c += WARP_SIZE) {
+ int bottom_id =
+ Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
+ int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
+ output_val += top_diff[top_id] * bottom_data[bottom_id];
+ }
+ }
+#ifdef MMCV_WITH_HIP
+ __syncthreads();
+#else
+ __syncwarp();
+#endif
+ output_val = warpReduceSum(output_val);
+ if (lane_id == 0) {
+ const int mask_id =
+ Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
+ mask_diff[mask_id] = output_val;
+ }
+}
+
+#endif // CARAFE_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..48230c632f223b736aa72a9d5fd682c97b3aa93a
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
@@ -0,0 +1,111 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
+#define CARAFE_NAIVE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+ const int w, const int channel_num,
+ const int height, const int width) {
+ int index = w + (h + (c + n * channel_num) * height) * width;
+ return index;
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_forward_cuda_kernel(
+ const int nthreads, const scalar_t *bottom_data,
+ const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the bottom_data
+ int pw = index % width;
+ int ph = (index / width) % height;
+ int c = (index / width / height) % channels;
+ int n = index / width / height / channels;
+
+ int mask_channels = kernel_size * kernel_size * group_size;
+ int mask_group = c / (channels / group_size);
+
+ int down_pw = pw / scale_factor;
+ int down_ph = ph / scale_factor;
+ int down_width = width / scale_factor;
+ int down_height = height / scale_factor;
+ int start_w = down_pw - (kernel_size - 1) / 2;
+ int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ int start_h = down_ph - (kernel_size - 1) / 2;
+ int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+ scalar_t output_val = 0;
+ for (int iy = start_h; iy < end_h; iy++) {
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+ int mask_index =
+ Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+ output_val += bottom_data[feat_index] * bottom_masks[mask_index];
+ }
+ }
+ top_data[index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_backward_cuda_kernel(
+ const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
+ const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
+ const int kernel_size, const int group_size, const int scale_factor,
+ const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the bottom_data
+ int pw = index % width;
+ int ph = (index / width) % height;
+ int c = (index / width / height) % channels;
+ int n = index / width / height / channels;
+
+ int mask_channels = kernel_size * kernel_size * group_size;
+ int mask_group = c / (channels / group_size);
+
+ int down_pw = pw / scale_factor;
+ int down_ph = ph / scale_factor;
+ int down_width = width / scale_factor;
+ int down_height = height / scale_factor;
+ int start_w = down_pw - (kernel_size - 1) / 2;
+ int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ int start_h = down_ph - (kernel_size - 1) / 2;
+ int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+ for (int iy = start_h; iy < end_h; iy++) {
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+ int mask_index =
+ Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+ atomicAdd(bottom_diff + feat_index,
+ bottom_masks[mask_index] * top_diff[index]);
+ atomicAdd(mask_diff + mask_index,
+ bottom_data[feat_index] * top_diff[index]);
+ }
+ }
+ }
+}
+
+#endif // CARAFE_NAIVE_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..89feea4a546a5093967f26393ca6be3b9fe6ae05
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
@@ -0,0 +1,101 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
+#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
+#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
+
+template <typename scalar_t>
+__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
+ const scalar_t* xyz, int m,
+ const scalar_t* xyz2,
+ scalar_t* result,
+ int* result_i) {
+ __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
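+  // Brute-force nearest neighbor: xyz2 is streamed through shared memory in
+  // chunks of THREADS_PER_BLOCK 2-D points, and every thread keeps the best
+  // match found so far for its own point of xyz.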
+ for (int i = blockIdx.x; i < b; i += gridDim.x) {
+ for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
+ int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
+ for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
+ buf[j] = xyz2[(i * m + k2) * 2 + j];
+ }
+ __syncthreads();
+ for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
+ scalar_t x1 = xyz[(i * n + j) * 2 + 0];
+ scalar_t y1 = xyz[(i * n + j) * 2 + 1];
+ int best_i = 0;
+ scalar_t best = 1e10;
+ int end_ka = end_k & (~2);
+ if (end_ka == THREADS_PER_BLOCK) {
+ for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
+#pragma unroll
+ for (int j = 0; j < 4; ++j) {
+ scalar_t x2 = buf[(k + j) * 2] - x1;
+ scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
+ scalar_t d = x2 * x2 + y2 * y2;
+ if (d < best) {
+ best = d;
+ best_i = k + k2 + j;
+ }
+ }
+ }
+ } else {
+ for (int k = 0; k < end_ka; k += 4) {
+#pragma unroll
+ for (int j = 0; j < 4; ++j) {
+ scalar_t x2 = buf[(k + j) * 2] - x1;
+ scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
+ scalar_t d = x2 * x2 + y2 * y2;
+ if (d < best) {
+ best = d;
+ best_i = k + k2 + j;
+ }
+ }
+ }
+ }
+ for (int k = end_ka; k < end_k; k++) {
+ scalar_t x2 = buf[k * 2 + 0] - x1;
+ scalar_t y2 = buf[k * 2 + 1] - y1;
+ scalar_t d = x2 * x2 + y2 * y2;
+ if (k == 0 || d < best) {
+ best = d;
+ best_i = k + k2;
+ }
+ }
+ if (k2 == 0 || result[(i * n + j)] > best) {
+ result[(i * n + j)] = best;
+ result_i[(i * n + j)] = best_i;
+ }
+ }
+ __syncthreads();
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void chamfer_distance_backward_cuda_kernel(
+ int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
+ const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
+ scalar_t* grad_xyz2) {
+ for (int i = blockIdx.x; i < b; i += gridDim.x) {
+ for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
+ scalar_t x1 = xyz1[(i * n + j) * 2 + 0];
+ scalar_t y1 = xyz1[(i * n + j) * 2 + 1];
+ int j2 = idx1[i * n + j];
+ scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];
+ scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];
+ scalar_t g = grad_dist1[i * n + j] * 2;
+ atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));
+ atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));
+ atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));
+ atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));
+ }
+ }
+}
+#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/external/cv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b12aa9a26a2cc162fd89f68ccc97e17749090a41
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
@@ -0,0 +1,120 @@
+#ifndef COMMON_CUDA_HELPER
+#define COMMON_CUDA_HELPER
+
+#include <cuda.h>
+
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
+ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x) \
+ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+ j += blockDim.y * gridDim.y)
+
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
+ for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+ for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
+
+#define THREADS_PER_BLOCK 512
+
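+// Pick enough blocks to cover N threads, capped at 4096; kernels written with
+// CUDA_1D_KERNEL_LOOP grid-stride over whatever remains.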
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
+ int optimal_block_num = (N + num_threads - 1) / num_threads;
+ int max_block_num = 4096;
+ return min(optimal_block_num, max_block_num);
+}
+
+template <typename T>
+__device__ T bilinear_interpolate(const T* input, const int height,
+ const int width, T y, T x,
+ const int index /* index for debug only*/) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = input[y_low * width + x_low];
+ T v2 = input[y_low * width + x_high];
+ T v3 = input[y_high * width + x_low];
+ T v4 = input[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+ const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
+ int& x_low, int& x_high, int& y_low, int& y_high,
+ const int index /* index for debug only*/) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+#endif // COMMON_CUDA_HELPER
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2af96f7963ec347486ced942a5ef7cc4f187db8b
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
@@ -0,0 +1,831 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
+#define CONVEX_IOU_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAXN 100
+#define NMAX 512
+__device__ const double EPS = 1E-8;
+
+__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }
+
+struct Point {
+ double x, y;
+ __device__ Point() {}
+ __device__ Point(double x, double y) : x(x), y(y) {}
+};
+
+__device__ inline bool point_same(Point& a, Point& b) {
+ return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
+}
+
+__device__ inline void swap1(Point* a, Point* b) {
+ Point temp;
+ temp.x = a->x;
+ temp.y = a->y;
+
+ a->x = b->x;
+ a->y = b->y;
+
+ b->x = temp.x;
+ b->y = temp.y;
+}
+
+__device__ inline void reverse1(Point* a, const int n) {
+ for (int i = 0; i < (n - 1) / 2.0; i++) {
+ Point* j = &(a[i]);
+ Point* k = &(a[n - 1 - i]);
+ swap1(j, k);
+ }
+}
+
+__device__ inline double cross(Point o, Point a, Point b) {
+ return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
+}
+
+__device__ inline double dis(Point a, Point b) {
+ return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+}
+__device__ inline double area(Point* ps, int n) {
+ ps[n] = ps[0];
+ double res = 0;
+ for (int i = 0; i < n; i++) {
+ res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+ }
+ return res / 2.0;
+}
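+// Shoelace area of `ps`, plus its gradient with respect to the vertices that
+// came from the prediction (mapped through polygon_to_pred_index), written
+// into grad_C.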
+__device__ inline double polygon_area_grad(Point* ps, int n,
+ int* polygon_to_pred_index,
+ int n_pred, double* grad_C) {
+ ps[n] = ps[0];
+ double partion_grad[4 * 30 + 2];
+ double res = 0;
+ for (int i = 0; i < n; i++) {
+ res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+ partion_grad[i * 4 + 2] = ps[i + 1].y;
+ partion_grad[i * 4 + 3] = -ps[i + 1].x;
+ if (i != n - 1) {
+ partion_grad[i * 4 + 4] = -ps[i].y;
+ partion_grad[i * 4 + 5] = ps[i].x;
+ } else {
+ partion_grad[0] = -ps[i].y;
+ partion_grad[1] = ps[i].x;
+ }
+ }
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < n_pred; j++) {
+ if (i == polygon_to_pred_index[j]) {
+ grad_C[2 * polygon_to_pred_index[j + n_pred]] =
+ (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
+ break;
+ }
+ }
+ for (int j = 0; j < n_pred; j++) {
+ if (i == polygon_to_pred_index[j]) {
+ grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
+ (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
+ break;
+ }
+ }
+ }
+
+ return res / 2.0;
+}
+
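+// Intersect segment cd with line ab: returns 2 if cd lies on ab, 0 if they
+// are parallel, 1 otherwise; on success writes the intersection point to p
+// and its partial derivatives w.r.t. c and d into cut_grad.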
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
+ double* cut_grad, int m, int n, int i) {
+ double s1, s2;
+ double s2_s1_2;
+ double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
+ double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
+ s1 = cross(a, b, c);
+ s2 = cross(a, b, d);
+
+ ds1_dxc = -(b.y - a.y);
+ ds1_dyc = b.x - a.x;
+ ds2_dxd = ds1_dxc;
+ ds2_dyd = ds1_dyc;
+ s2_s1_2 = (s2 - s1) * (s2 - s1);
+
+ if (sig(s1) == 0 && sig(s2) == 0) return 2;
+ if (sig(s2 - s1) == 0) return 0;
+
+ dxp_dxc =
+ ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
+ (s2_s1_2);
+ dxp_dyc =
+ ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
+ (s2_s1_2);
+ dxp_dxd =
+ ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
+ (s2_s1_2);
+ dxp_dyd =
+ ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
+ (s2_s1_2);
+
+ dyp_dxc =
+ ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
+ (s2_s1_2);
+ dyp_dyc =
+ ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
+ (s2_s1_2);
+ dyp_dxd =
+ ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
+ (s2_s1_2);
+ dyp_dyd =
+ ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
+ (s2_s1_2);
+
+ p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+ p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+ if (i == n - 1) {
+ cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
+ cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+ cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd;
+ cut_grad[4 * n * m + 1] = dyp_dxd;
+ cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd;
+ cut_grad[4 * n * m + 3] = dyp_dyd;
+ } else {
+ cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
+ cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+ cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
+ }
+
+ return 1;
+}
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
+ double* cut_grad) {
+ Point pp[MAXN];
+ double ccur_grad[MAXN] = {};
+ int m = 0;
+ p[n] = p[0];
+ int k = n;
+ for (int i = 0; i < n; i++) {
+ if (sig(cross(a, b, p[i])) > 0) {
+ pp[m] = p[i];
+ ccur_grad[4 * n * m + 4 * i] = 1.0;
+ ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
+ m++;
+ }
+ if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+ lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
+ m++;
+ }
+ }
+
+ n = 0;
+ for (int i = 0; i < m; i++) {
+ if (!i || !(point_same(pp[i], pp[i - 1]))) {
+ p[n] = pp[i];
+ for (int j = 0; j < 4 * k; j++) {
+ cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
+ }
+ n++;
+ }
+ }
+
+ while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
+ double* grad_AB, int order,
+ int convex_n) {
+ Point o(0, 0);
+ int res_flag = 0;
+ int s1 = sig(cross(o, a, b));
+ int s2 = sig(cross(o, c, d));
+ if (s1 == 0 || s2 == 0) return 0.0;
+ if (s1 == -1) {
+ Point* i = &a;
+ Point* j = &b;
+ swap1(i, j);
+ res_flag = 1;
+ }
+ if (s2 == -1) {
+ Point* i = &c;
+ Point* j = &d;
+ swap1(i, j);
+ }
+ Point p[10] = {o, a, b};
+ int n = 3, n0 = 3, n1, n2, n3;
+ double cut_grad1[MAXN] = {};
+ double cut_grad2[MAXN] = {};
+ double cut_grad3[MAXN] = {};
+ double p1_p_grad[10][10] = {};
+ double p2_p1_grad[10][10] = {};
+ double p3_p2_grad[10][10] = {};
+
+ double p3_p1_grad[10][10] = {};
+ double p3_p_grad[10][10] = {};
+
+ // 1
+ polygon_cut(p, n, o, c, cut_grad1);
+ n1 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n0; j++) {
+ if (!(j % 2)) {
+ p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
+ } else {
+ p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
+ }
+ }
+ }
+
+ // 2
+ polygon_cut(p, n, c, d, cut_grad2);
+ n2 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n1; j++) {
+ if (!(j % 2)) {
+ p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
+ } else {
+ p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
+ }
+ }
+ }
+ // 3
+ polygon_cut(p, n, d, o, cut_grad3);
+ n3 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n2; j++) {
+ if (!(j % 2)) {
+ p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
+ } else {
+ p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
+ }
+ }
+ }
+
+ // mul
+ // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
+ for (int i = 0; i < 2 * n3; i++) {
+ for (int j = 0; j < 2 * n1; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n2; m++) {
+ sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
+ }
+ p3_p1_grad[i][j] = sum;
+ }
+ }
+
+ // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
+ for (int i = 0; i < 2 * n3; i++) {
+ for (int j = 0; j < 2 * n0; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n1; m++) {
+ sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
+ }
+ p3_p_grad[i][j] = sum;
+ }
+ }
+
+ // calculate S_grad
+ int polygon_index_box_index[20];
+ double grad_polygon[20];
+ double S_grad[6];
+
+ for (int i = 0; i < n3; i++) {
+ polygon_index_box_index[i] = i;
+ polygon_index_box_index[i + n3] = i;
+ }
+
+ double res =
+ polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);
+
+ if (s1 * s2 == -1) {
+ for (int j = 0; j < 2 * 3; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n3; m++) {
+ sum = sum - grad_polygon[m] * p3_p_grad[m][j];
+ }
+ S_grad[j] = sum;
+ }
+
+ if (order != convex_n - 1) {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[2 * order + 2] += S_grad[2];
+ grad_AB[2 * order + 3] += S_grad[3];
+
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[2 * order + 2] += S_grad[4];
+ grad_AB[2 * order + 3] += S_grad[5];
+ }
+ } else {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[0] += S_grad[2];
+ grad_AB[1] += S_grad[3];
+
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[0] += S_grad[4];
+ grad_AB[1] += S_grad[5];
+ }
+ }
+ res = -res;
+ } else {
+ for (int j = 0; j < 2 * 3; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n3; m++) {
+ sum = sum + grad_polygon[m] * p3_p_grad[m][j];
+ }
+ S_grad[j] = sum;
+ }
+
+ if (order != convex_n - 1) {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[2 * order + 2] += S_grad[2];
+ grad_AB[2 * order + 3] += S_grad[3];
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[2 * order + 2] += S_grad[4];
+ grad_AB[2 * order + 3] += S_grad[5];
+ }
+ } else {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[0] += S_grad[2];
+ grad_AB[1] += S_grad[3];
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[0] += S_grad[4];
+ grad_AB[1] += S_grad[5];
+ }
+ }
+ }
+ return res;
+}
+
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
+ double* grad_AB) {
+ if (area(ps1, n1) < 0) reverse1(ps1, n1);
+ if (area(ps2, n2) < 0) reverse1(ps2, n2);
+ ps1[n1] = ps1[0];
+ ps2[n2] = ps2[0];
+ double res = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n2; j++) {
+ res +=
+ intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
+ }
+ }
+ return res;
+}
+
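+// Gift wrapping (Jarvis march): move the bottom-most point to the front, wrap
+// the right chain up to the top-most point, then the left chain, and merge
+// both chains into the convex hull in place.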
+__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[NMAX] = {}, top1, top2;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point* j = &(in_poly[0]);
+ Point* k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+ for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+ for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+}
+
+__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
+ int n2, double* grad_C) {
+ Point polygon[MAXN];
+ int n = n1 + n2, n_poly = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n - n1; j++) {
+ if (point_same(ps1[i], ps2[j])) {
+ for (int k = j; k < n - n1 - 1; k++) {
+ ps2[k] = ps2[k + 1];
+ }
+ n2--;
+ break;
+ }
+ }
+ }
+ n_poly = n1 + n2;
+ for (int i = 0; i < n_poly; i++) {
+ if (i < n1) {
+ polygon[i] = ps1[i];
+ } else {
+ polygon[i] = ps2[i - n1];
+ }
+ }
+
+ Jarvis(polygon, n_poly);
+
+ int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ int n_pred = 0;
+ for (int i = 0; i < n_poly; i++) {
+ for (int j = 0; j < n1; j++) {
+ if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
+ polygon_to_pred_index[n_pred] = i;
+ polygon_to_pred_index[n_pred + n1] = j;
+ n_pred += 1;
+ break;
+ }
+ }
+ }
+ if (n_pred == 0) {
+ double polygon_area = fabs(area(polygon, n_poly));
+ for (int i = 0; i < 18; i++) {
+ grad_C[i] = 0.0;
+ }
+ return polygon_area;
+ } else {
+ double polygon_area =
+ polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
+ if (polygon_area < 0) {
+ for (int i = 0; i < 18; i++) {
+ grad_C[i] = -grad_C[i];
+ }
+ }
+ return fabs(polygon_area);
+ }
+}
+
+// convex_find and get the polygon_index_box_index
+__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
+ int* points_to_convex_ind) {
+ int n_input = n_poly;
+ Point input_poly[20];
+ for (int i = 0; i < n_input; i++) {
+ input_poly[i].x = in_poly[i].x;
+ input_poly[i].y = in_poly[i].y;
+ }
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[20], top1, top2;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point* j = &(in_poly[0]);
+ Point* k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+ for (int i = 0; i <= top1; i++) {
+ right_point[i] = in_poly[Stack[i]];
+ }
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+
+ for (int i = top2 - 1; i >= 0; i--) {
+ left_point[i] = in_poly[Stack[i]];
+ }
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+ for (int i = 0; i < n_poly; i++) {
+ for (int j = 0; j < n_input; j++) {
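+  // One warp per mask channel: the lanes split this mask group's channel
+  // range, and the partial dot products are warp-reduced below.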
+ if (point_same(in_poly[i], input_poly[j])) {
+ points_to_convex_ind[i] = j;
+ break;
+ }
+ }
+ }
+}
+
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q,
+ T* point_grad, const int idx) {
+ Point ps1[MAXN], ps2[MAXN];
+
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = (double)p[i * 2];
+ convex[i].y = (double)p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+ Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+
+ int n1 = n_convex;
+ int n2 = 4;
+
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = (double)convex[i].x;
+ ps1[i].y = (double)convex[i].y;
+ }
+
+ for (int i = 0; i < n2; i++) {
+ ps2[i].x = (double)q[i * 2];
+ ps2[i].y = (double)q[i * 2 + 1];
+ }
+
+ int polygon_index_box_index[18];
+ for (int i = 0; i < n1; i++) {
+ polygon_index_box_index[i] = i;
+ polygon_index_box_index[i + n1] = i;
+ }
+
+ double grad_A[18] = {};
+ double grad_AB[18] = {};
+ double grad_C[18] = {};
+
+ double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
+ double S_pred =
+ polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
+ if (S_pred < 0) {
+ for (int i = 0; i < n_convex * 2; i++) {
+ grad_A[i] = -grad_A[i];
+ }
+ }
+ double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+
+ double iou = inter_area / union_area;
+ double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
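+  // Generalized IoU: IoU - |C \ (A u B)| / |C|, where C (polygon_area) is the
+  // convex hull enclosing both polygons.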
+
+ // printf("%d:live\n", idx);
+ double rot_giou = iou - (polygon_area - union_area) / polygon_area;
+
+ float grad_point_temp[18] = {};
+
+ for (int i = 0; i < n_convex; i++) {
+ int grad_point = points_to_convex_ind[i];
+ grad_point_temp[2 * grad_point] =
+ (float)((union_area + inter_area) / (union_area * union_area) *
+ grad_AB[2 * i] -
+ iou / union_area * grad_A[2 * i] -
+ 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
+ (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
+ grad_point_temp[2 * grad_point + 1] =
+ (float)((union_area + inter_area) / (union_area * union_area) *
+ grad_AB[2 * i + 1] -
+ iou / union_area * grad_A[2 * i + 1] -
+ 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
+ (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
+ }
+
+ for (int i = 0; i < 9; i++) {
+ point_grad[2 * i] = grad_point_temp[2 * i];
+ point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
+ }
+ return (float)rot_giou;
+}
+
+template <typename T>
+__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
+ const int gt_n_boxes, const T* ex_boxes,
+ const T* gt_boxes, T* point_grad) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T* cur_box = ex_boxes + index * 18;
+ const T* cur_gt_box = gt_boxes + index * 8;
+ T* cur_grad = point_grad + index * 19;
+    T giou = devrIoU<T>(cur_box, cur_gt_box, cur_grad, threadIdx.x);
+ cur_grad[18] = giou;
+ }
+}
+
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
+ double s1, s2;
+ s1 = cross(a, b, c);
+ s2 = cross(a, b, d);
+ if (sig(s1) == 0 && sig(s2) == 0) return 2;
+ if (sig(s2 - s1) == 0) return 0;
+ p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+ p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+ return 1;
+}
+
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
+ Point pp[MAXN];
+ int m = 0;
+ p[n] = p[0];
+ for (int i = 0; i < n; i++) {
+ if (sig(cross(a, b, p[i])) > 0) {
+ pp[m] = p[i];
+ m++;
+ }
+ if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+ lineCross(a, b, p[i], p[i + 1], pp[m]);
+ m++;
+ }
+ }
+ n = 0;
+ for (int i = 0; i < m; i++) {
+ if (!i || !(point_same(pp[i], pp[i - 1]))) {
+ p[n] = pp[i];
+ n++;
+ }
+ }
+
+ while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
+ Point o(0, 0);
+ int s1 = sig(cross(o, a, b));
+ int s2 = sig(cross(o, c, d));
+ if (s1 == 0 || s2 == 0) return 0.0;
+ if (s1 == -1) {
+ Point* i = &a;
+ Point* j = &b;
+ swap1(i, j);
+ }
+ if (s2 == -1) {
+ Point* i = &c;
+ Point* j = &d;
+ swap1(i, j);
+ }
+ Point p[10] = {o, a, b};
+ int n = 3;
+
+ polygon_cut(p, n, o, c);
+ polygon_cut(p, n, c, d);
+ polygon_cut(p, n, d, o);
+ double res = area(p, n);
+ if (s1 * s2 == -1) res = -res;
+ return res;
+}
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
+ int n2) {
+ if (area(ps1, n1) < 0) reverse1(ps1, n1);
+ if (area(ps2, n2) < 0) reverse1(ps2, n2);
+ ps1[n1] = ps1[0];
+ ps2[n2] = ps2[0];
+ double res = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n2; j++) {
+ res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
+ }
+ }
+ return res;
+}
+
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q) {
+ Point ps1[MAXN], ps2[MAXN];
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = (double)p[i * 2];
+ convex[i].y = (double)p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+ Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+ int n1 = n_convex;
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = (double)convex[i].x;
+ ps1[i].y = (double)convex[i].y;
+ }
+ int n2 = 4;
+ for (int i = 0; i < n2; i++) {
+ ps2[i].x = (double)q[i * 2];
+ ps2[i].y = (double)q[i * 2 + 1];
+ }
+ double inter_area = intersectAreaO(ps1, n1, ps2, n2);
+ double S_pred = area(ps1, n1);
+ double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+ double iou = inter_area / union_area;
+ return (float)iou;
+}
+
+template <typename T>
+__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
+ const int gt_n_boxes, const T* ex_boxes,
+ const T* gt_boxes, T* iou) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T* cur_box = ex_boxes + index * 18;
+ for (int i = 0; i < gt_n_boxes; i++) {
+      iou[index * gt_n_boxes + i] = devrIoU<T>(cur_box, gt_boxes + i * 8);
+ }
+ }
+}
+#endif // CONVEX_IOU_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f910561ec309cd50fd6d4da131ab36cdf3ca963a
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
@@ -0,0 +1,231 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
+// Original licence: Under MIT License
+
+#ifndef CORRELATION_CUDA
+#define CORRELATION_CUDA
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Using <torch/extension.h> is recommended in the official documentation in
+// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
+// However, we use <torch/types.h> for compatibility with CUDA 9.0
+// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
+#include <torch/types.h>
+
+#include <iostream>
+#include <vector>
+
+using namespace torch;
+
+#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>
+#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
+#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
+
+template <typename scalar_t>
+__global__ void correlation_forward_cuda_kernel(
+ const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
+ int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
+ int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,
+ int oH, int oW) {
+ const int iH = rInput1.size(1);
+ const int iW = rInput1.size(2);
+ const int C = rInput1.size(3);
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y * blockDim.y + threadIdx.y;
+ const int w = blockIdx.z * blockDim.z + threadIdx.z;
+
+ if (h >= oH || w >= oW) return;
+
+ const int thread = threadIdx.x;
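+  // One warp per output location: the WARP_SIZE lanes of threadIdx.x split
+  // the channel dimension, and the partial products are shuffle-reduced below.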
+
+ const int start_i = -padH + h * dH;
+ const int start_j = -padW + w * dW;
+
+ const int patchRadH = dilation_patchH * (patchH - 1) / 2;
+ const int patchRadW = dilation_patchW * (patchW - 1) / 2;
+
+ for (int ph = 0; ph < patchH; ++ph) {
+ int ph_dilated = ph * dilation_patchH - patchRadH;
+ for (int pw = 0; pw < patchW; ++pw) {
+ int pw_dilated = pw * dilation_patchW - patchRadW;
+ scalar_t prod_sum = 0.0f;
+ for (int i = 0; i < kH; ++i) {
+ int i1 = start_i + i * dilationH;
+ int i2 = i1 + ph_dilated;
+ if (WITHIN_BOUNDS(i1, i2, iH, iH)) {
+ for (int j = 0; j < kW; ++j) {
+ int j1 = start_j + j * dilationW;
+ int j2 = j1 + pw_dilated;
+ if (WITHIN_BOUNDS(j1, j2, iW, iW)) {
+ for (int c = thread; c < C; c += WARP_SIZE) {
+ scalar_t v1 = rInput1[n][i1][j1][c];
+ scalar_t v2 = rInput2[n][i2][j2][c];
+ prod_sum += v1 * v2;
+ }
+ }
+ }
+ }
+ }
+ // accumulate
+ for (int offset = 16; offset > 0; offset /= 2)
+#ifdef MMCV_WITH_HIP
+ prod_sum += __shfl_down(float(prod_sum), offset);
+#else
+ prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
+#endif
+ if (thread == 0) {
+ output[n][ph][pw][h][w] = prod_sum;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input1(
+ const TensorAcc5R grad_output, const TensorAcc4R input2,
+ TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
+ const int patchW, const int padH, const int padW, const int dilationH,
+ const int dilationW, const int dilation_patchH, const int dilation_patchW,
+ const int dH, const int dW) {
+ const int iH = input2.size(1);
+ const int iW = input2.size(2);
+ const int C = input2.size(3);
+
+ const int H = grad_output.size(3);
+ const int W = grad_output.size(4);
+
+ const int patchRadH = (patchH - 1) / 2;
+ const int patchRadW = (patchW - 1) / 2;
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+
+ const int h_2 = h + padH;
+ const int w_2 = w + padW;
+ const int min_h = h_2 - kH * dilationH;
+ const int min_w = w_2 - kW * dilationW;
+
+ extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+ for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+ const int ph = i / patchW;
+ const int pw = i % patchW;
+ int i1 = h + dilation_patchH * (ph - patchRadH);
+ int j1 = w + dilation_patchW * (pw - patchRadW);
+
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ scalar_t grad_val = 0.0f;
+ for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+ int i2 = (h_3) / dH;
+ if (i2 * dH != h_3) continue;
+ for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+ int j2 = (w_3) / dW;
+ if (j2 * dW != w_3) continue;
+ if (WITHIN_BOUNDS(i2, j2, H, W)) {
+ grad_val += grad_output[n][ph][pw][i2][j2];
+ }
+ }
+ }
+ grad_cache[i] = grad_val;
+ }
+ }
+ __syncthreads();
+
+ for (int c = threadIdx.x; c < C; c += blockDim.x) {
+ scalar_t grad_input_val = 0.0f;
+ for (int ph = 0; ph < patchH; ++ph) {
+ int i1 = h + dilation_patchH * (ph - patchRadH);
+ for (int pw = 0; pw < patchW; ++pw) {
+ int j1 = w + dilation_patchW * (pw - patchRadW);
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+ }
+ }
+ }
+ grad_input1[n][c][h][w] = grad_input_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input2(
+ const TensorAcc5R grad_output, const TensorAcc4R input1,
+ TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
+ int padW, int dilationH, int dilationW, int dilation_patchH,
+ int dilation_patchW, int dH, int dW) {
+ const int iH = input1.size(1);
+ const int iW = input1.size(2);
+ const int C = input1.size(3);
+
+ const int patchRadH = (patchH - 1) / 2;
+ const int patchRadW = (patchW - 1) / 2;
+
+ const int H = grad_output.size(3);
+ const int W = grad_output.size(4);
+
+ const int dilatedKH = kH * dilationH;
+ const int dilatedKW = kW * dilationW;
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+
+ extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+ for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+ const int ph = i / patchW;
+ const int pw = i % patchW;
+ int i1 = h - dilation_patchH * (ph - patchRadH);
+ int j1 = w - dilation_patchW * (pw - patchRadW);
+
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ scalar_t grad_val = 0.0f;
+
+ const int h_2 = i1 + padH;
+ const int w_2 = j1 + padW;
+ const int min_h = h_2 - dilatedKH;
+ const int min_w = w_2 - dilatedKW;
+
+ for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+ int i2 = (h_3) / dH;
+ if (i2 * dH != h_3) continue;
+ for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+ int j2 = (w_3) / dW;
+ if (j2 * dW != w_3) continue;
+ if (WITHIN_BOUNDS(i2, j2, H, W)) {
+ grad_val += grad_output[n][ph][pw][i2][j2];
+ }
+ }
+ }
+ grad_cache[i] = grad_val;
+ }
+ }
+ __syncthreads();
+
+ for (int c = threadIdx.x; c < C; c += blockDim.x) {
+ scalar_t grad_input_val = 0.0f;
+ for (int ph = 0; ph < patchH; ++ph) {
+ int i1 = h - dilation_patchH * (ph - patchRadH);
+ for (int pw = 0; pw < patchW; ++pw) {
+ int j1 = w - dilation_patchW * (pw - patchRadW);
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+ }
+ }
+ }
+ grad_input2[n][c][h][w] = grad_input_val;
+ }
+}
+#endif
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6b4d1bbd85bad1b87ee5d6b8a3cd3b29e3cbc411
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
@@ -0,0 +1,367 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer
+ *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer
+ *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#ifndef DEFORM_CONV_CUDA_KERNEL_CUH
+#define DEFORM_CONV_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+template <typename T>
+__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
+ const int height, const int width, T h,
+ T w) {
+ if (h <= -1 || height <= h || w <= -1 || width <= w) {
+ return 0;
+ }
+
+ int h_low = floorf(h);
+ int w_low = floorf(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ T lh = h - h_low;
+ T lw = w - w_low;
+ T hh = 1 - lh, hw = 1 - lw;
+
+ T v1 = 0;
+ if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
+ T v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = input[h_low * data_width + w_high];
+ T v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = input[h_high * data_width + w_low];
+ T v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = input[h_high * data_width + w_high];
+
+ T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
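+
+// The weights above are the standard bilinear coefficients: with
+// lh = h - floor(h) and lw = w - floor(w),
+//   w1 = (1 - lh) * (1 - lw), w2 = (1 - lh) * lw,
+//   w3 = lh * (1 - lw),       w4 = lh * lw,
+// so the four weights sum to 1 and corners falling outside the image
+// contribute 0 through the v1..v4 guards.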
+
+template <typename T>
+__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
+ const int w, const int height,
+ const int width) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
+template <typename T>
+__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
+ const int width, const T *im_data,
+ const int data_width, const int bp_dir) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+
+ if (bp_dir == 0) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ } else if (bp_dir == 1) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
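+
+// Together, get_gradient_weight and get_coordinate_weight are the two partial
+// derivatives of the bilinear sample: the former is d(sample)/d(input pixel),
+// used to scatter gradients into the image, while the latter differentiates
+// the bilinear weights with respect to the sampling coordinate itself, along
+// h when bp_dir == 0 and along w when bp_dir == 1.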
+
+template <typename T>
+__global__ void deformable_im2col_gpu_kernel(
+ const int n, const T *data_im, const T *data_offset, const int height,
+ const int width, const int kernel_h, const int kernel_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int num_channels, const int deformable_group, const int height_col,
+ const int width_col, T *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+    // index: index of the output matrix
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+ T *data_col_ptr =
+ data_col +
+ ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ const T *data_im_ptr =
+ data_im + (b_col * num_channels + c_im) * height * width;
+ const T *data_offset_ptr =
+ data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i) {
+ for (int j = 0; j < kernel_w; ++j) {
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+ w_col;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        T val = static_cast<T>(0);
+ const T h_im = h_in + i * dilation_h + offset_h;
+ const T w_im = w_in + j * dilation_w + offset_w;
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
+ h_im, w_im);
+ *data_col_ptr = val;
+ data_col_ptr += batch_size * height_col * width_col;
+ }
+ }
+ }
+}
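+
+// Layout note: the column buffer written above is indexed as
+//   ((c_im * kernel_h * kernel_w + i * kernel_w + j) * batch_size + b)
+//       * height_col * width_col + h_col * width_col + w_col,
+// i.e. every input channel expands into kernel_h * kernel_w rows, which is
+// what lets the wrapper turn the deformable convolution into a plain GEMM
+// against the reshaped weight.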
+
+template <typename T>
+__global__ void deformable_col2im_gpu_kernel(
+ const int n, const T *data_col, const T *data_offset, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int deformable_group, const int height_col, const int width_col,
+ T *grad_im) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
+ const int i =
+ (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ const int c =
+ index / width_col / height_col / batch_size / kernel_w / kernel_h;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / channel_per_deformable_group;
+
+ int w_out = index % width_col;
+ int h_out = (index / width_col) % height_col;
+ int b = (index / width_col / height_col) % batch_size;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+ const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+ const T cur_top_grad = data_col[index];
+ const int cur_h = (int)cur_inv_h_data;
+ const int cur_w = (int)cur_inv_w_data;
+ for (int dy = -2; dy <= 2; dy++) {
+ for (int dx = -2; dx <= 2; dx++) {
+ if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+ cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+ abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+ int cur_bottom_grad_pos =
+ ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+ T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
+ cur_h + dy, cur_w + dx, height, width);
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+ }
+ }
+ }
+ }
+}
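+
+// The 5x5 window (dy, dx in [-2, 2]) above is a conservative cover: only
+// pixels within distance < 1 of the fractional sampling point receive a
+// non-zero bilinear weight, and atomicAdd is needed because many column
+// entries can scatter into the same input pixel.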
+
+template <typename T>
+__global__ void deformable_col2im_coord_gpu_kernel(
+ const int n, const T *data_col, const T *data_im, const T *data_offset,
+ const int channels, const int height, const int width, const int kernel_h,
+ const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int offset_channels, const int deformable_group, const int height_col,
+ const int width_col, T *grad_offset) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ T val = 0;
+ int w = index % width_col;
+ int h = (index / width_col) % height_col;
+ int c = (index / width_col / height_col) % offset_channels;
+ int b = (index / width_col / height_col) / offset_channels;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+ const int col_step = kernel_h * kernel_w;
+ int cnt = 0;
+ const T *data_col_ptr = data_col + deformable_group_index *
+ channel_per_deformable_group *
+ batch_size * width_col * height_col;
+ const T *data_im_ptr =
+ data_im + (b * deformable_group + deformable_group_index) *
+ channel_per_deformable_group / kernel_h / kernel_w *
+ height * width;
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+ col_c += col_step) {
+ const int col_pos =
+ (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+ const int bp_dir = offset_c % 2;
+
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+ int i =
+ (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ int w_out = col_pos % width_col;
+ int h_out = (col_pos / width_col) % height_col;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+ const int data_offset_h_ptr =
+ (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+ const int data_offset_w_ptr =
+ (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+ w_out);
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ T inv_h = h_in + i * dilation_h + offset_h;
+ T inv_w = w_in + j * dilation_w + offset_w;
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+ inv_h = inv_w = -2;
+ const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
+ data_im_ptr + cnt * height * width,
+ width, bp_dir);
+ val += weight * data_col_ptr[col_pos];
+ cnt += 1;
+ }
+
+ grad_offset[index] = val;
+ }
+}
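+
+// Each thread here owns one offset channel at one spatial location and
+// integrates over all column channels of its deformable group (the col_c
+// loop), pairing get_coordinate_weight with the matching column value;
+// bp_dir = offset_c % 2 picks differentiation along h (even channels) or
+// w (odd channels), matching the interleaved offset layout used in im2col.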
+
+#endif // DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..86c4bc66dd2fb289340a4fb1714edb5db1e798c4
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
@@ -0,0 +1,186 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void deform_roi_pool_forward_cuda_kernel(
+ const int nthreads, const T* input, const T* rois, const T* offset,
+ T* output, const int pooled_height, const int pooled_width,
+ const T spatial_scale, const int sampling_ratio, const T gamma,
+ const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_rois = rois + n * 5;
+ int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
+ T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
+ T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
+ T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
+
+ T roi_width = roi_end_w - roi_start_w;
+ T roi_height = roi_end_h - roi_start_h;
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic an integral
+    int roi_bin_grid_h =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+    int roi_bin_grid_w =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+ // Compute roi offset
+ if (offset != NULL) {
+ const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
+ ph * pooled_width + pw;
+ T offset_roi_w = gamma * roi_width * offset_cur_w[0];
+ T offset_roi_h =
+ gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
+ roi_start_w += offset_roi_w;
+ roi_start_h += offset_roi_h;
+ }
+
+ // We do average pooling inside a bin
+ const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+ T val = bilinear_interpolate(offset_input, height, width, y, x, index);
+ output_val += val;
+ }
+ }
+ output[index] = output_val / count;
+ }
+}
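+
+// In summary, every output bin averages roi_bin_grid_h x roi_bin_grid_w
+// bilinear samples; when an offset tensor is supplied, the whole sampling
+// window is first shifted by gamma-scaled, RoI-normalized offsets, which is
+// the "deformable" part of deformable RoI pooling.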
+
+template <typename T>
+__global__ void deform_roi_pool_backward_cuda_kernel(
+ const int nthreads, const T* grad_output, const T* input, const T* rois,
+ const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
+ const int pooled_width, const T spatial_scale, const int sampling_ratio,
+ const T gamma, const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_rois = rois + n * 5;
+ int roi_batch_ind = offset_rois[0];
+ const T* offset_input =
+ input + ((roi_batch_ind * channels + c) * height * width);
+ T* offset_grad_input =
+ grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
+ T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
+ T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
+ T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
+
+ T roi_width = roi_end_w - roi_start_w;
+ T roi_height = roi_end_h - roi_start_h;
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic an integral
+    int roi_bin_grid_h =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+    int roi_bin_grid_w =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+ // Compute roi offset
+ if (offset != NULL) {
+ const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
+ ph * pooled_width + pw;
+ T offset_roi_w = gamma * roi_width * offset_cur_w[0];
+ T offset_roi_h =
+ gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
+ roi_start_w += offset_roi_w;
+ roi_start_h += offset_roi_h;
+ }
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+ const T grad_output_this_bin = grad_output[index] / count;
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high, index);
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ grad_output_this_bin * w1);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ grad_output_this_bin * w2);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ grad_output_this_bin * w3);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ grad_output_this_bin * w4);
+ if (offset != NULL) {
+ T input_00 = offset_input[y_low * width + x_low];
+ T input_10 = offset_input[y_low * width + x_high];
+ T input_01 = offset_input[y_high * width + x_low];
+ T input_11 = offset_input[y_high * width + x_high];
+ T ogx = gamma * roi_width * grad_output_this_bin *
+ (input_11 * (y - y_low) + input_10 * (y_high - y) +
+ input_01 * (y_low - y) + input_00 * (y - y_high));
+ T ogy = gamma * roi_height * grad_output_this_bin *
+ (input_11 * (x - x_low) + input_01 * (x_high - x) +
+ input_10 * (x_low - x) + input_00 * (x - x_high));
+ atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
+ ph * pooled_width + pw,
+ ogx);
+ atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
+ pooled_width * pooled_height + ph * pooled_width + pw,
+ ogy);
+ }
+ }
+ }
+ }
+ }
+}
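+
+// The ogx / ogy terms above apply the chain rule through the bilinear
+// interpolation: they are the derivative of the sampled value with respect
+// to the x / y sampling coordinate, scaled by gamma * roi_width (resp.
+// gamma * roi_height) because the stored offsets are expressed in
+// RoI-relative units.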
+
+#endif // DEFORM_ROI_POOL_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..053977a3011692b22a5dce6050fcfec4797f092c
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh
@@ -0,0 +1,137 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Adapted from
+// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAX_NUM_VERT_IDX 9
+#define INTERSECTION_OFFSET 8
+#define EPSILON 1e-8
+
+inline int opt_n_thread(int work_size) {
+  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
+ return max(min(1 << pow_2, THREADS_PER_BLOCK), 1);
+}
+
+/*
+compare normalized vertices (vertices around (0,0))
+returns true if vertex1 < vertex2.
+order: the minimum is at the x-axis and grows in the anti-clockwise direction
+*/
+__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {
+ if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON)
+ return false; // if equal, return false
+
+ if (y1 > 0 && y2 < 0) return true;
+ if (y1 < 0 && y2 > 0) return false;
+
+ float n1 = x1 * x1 + y1 * y1 + EPSILON;
+ float n2 = x2 * x2 + y2 * y2 + EPSILON;
+ float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2;
+
+ if (y1 > 0 && y2 > 0) {
+ if (diff > EPSILON)
+ return true;
+ else
+ return false;
+ }
+ if (y1 < 0 && y2 < 0) {
+ if (diff < EPSILON)
+ return true;
+ else
+ return false;
+ }
+ return false;
+}
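+
+// compare_vertices orders vertices by polar angle without calling atan2:
+// the sign of y separates the upper and lower half-planes, and within each
+// half the surrogate |x| * x / (x^2 + y^2) (i.e. sign(x) * cos^2(theta)) is
+// monotone in the angle, so sorting with this comparator walks the polygon
+// anti-clockwise.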
+
+__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel(
+ int b, int n, int m, const float *__restrict__ vertices,
+ const bool *__restrict__ mask, const int *__restrict__ num_valid,
+ int *__restrict__ idx) {
+ int batch_idx = blockIdx.x;
+ vertices += batch_idx * n * m * 2;
+ mask += batch_idx * n * m;
+ num_valid += batch_idx * n;
+ idx += batch_idx * n * MAX_NUM_VERT_IDX;
+
+ int index = threadIdx.x; // index of polygon
+ int stride = blockDim.x;
+ for (int i = index; i < n; i += stride) {
+ int pad; // index of arbitrary invalid intersection point (not box corner!)
+ for (int j = INTERSECTION_OFFSET; j < m; ++j) {
+ if (!mask[i * m + j]) {
+ pad = j;
+ break;
+ }
+ }
+ if (num_valid[i] < 3) {
+ // not enough vertices, take an invalid intersection point
+ // (zero padding)
+ for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {
+ idx[i * MAX_NUM_VERT_IDX + j] = pad;
+ }
+ } else {
+ // sort the valid vertices
+ // note the number of valid vertices is known
+ // note: check that num_valid[i] < MAX_NUM_VERT_IDX
+ for (int j = 0; j < num_valid[i]; ++j) {
+        // initialize with a sentinel that orders after every valid vertex
+ float x_min = 1;
+ float y_min = -EPSILON;
+ int i_take = 0;
+ int i2;
+ float x2, y2;
+ if (j != 0) {
+ i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];
+ x2 = vertices[i * m * 2 + i2 * 2 + 0];
+ y2 = vertices[i * m * 2 + i2 * 2 + 1];
+ }
+ for (int k = 0; k < m; ++k) {
+ float x = vertices[i * m * 2 + k * 2 + 0];
+ float y = vertices[i * m * 2 + k * 2 + 1];
+ if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {
+ if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {
+ x_min = x;
+ y_min = y;
+ i_take = k;
+ }
+ }
+ }
+ idx[i * MAX_NUM_VERT_IDX + j] = i_take;
+ }
+ // duplicate the first idx
+ idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];
+
+ // pad zeros
+ for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {
+ idx[i * MAX_NUM_VERT_IDX + j] = pad;
+ }
+
+      // corner case: the two boxes are exactly the same. In that case idx
+      // contains duplicate elements, which breaks the shoelace formula; by
+      // definition, the duplicates can only appear in the first 8 positions
+      // (they are "corners in box", not "intersection of edges"), so they
+      // are collapsed below.
+ if (num_valid[i] == 8) {
+ int counter = 0;
+ for (int j = 0; j < 4; ++j) {
+ int check = idx[i * MAX_NUM_VERT_IDX + j];
+ for (int k = 4; k < INTERSECTION_OFFSET; ++k) {
+ if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;
+ }
+ }
+ if (counter == 4) {
+ idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];
+ for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {
+ idx[i * MAX_NUM_VERT_IDX + j] = pad;
+ }
+ }
+ }
+
+ // TODO: still might need to cover some other corner cases :(
+ }
+ }
+}
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d3801a02c1c8f44874fb84fa884cc23bee25c331
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh
@@ -0,0 +1,152 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
+#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
+ int idx1, int idx2) {
+ const float v1 = dists[idx1], v2 = dists[idx2];
+ const int i1 = dists_i[idx1], i2 = dists_i[idx2];
+ dists[idx1] = max(v1, v2);
+ dists_i[idx1] = v2 > v1 ? i2 : i1;
+}
+
+template <unsigned int block_size>
+__global__ void furthest_point_sampling_forward_cuda_kernel(
+ int b, int n, int m, const float *__restrict__ dataset,
+ float *__restrict__ temp, int *__restrict__ idxs) {
+ // dataset: (B, N, 3)
+ // tmp: (B, N)
+ // output:
+ // idx: (B, M)
+
+ if (m <= 0) return;
+ __shared__ float dists[block_size];
+ __shared__ int dists_i[block_size];
+
+ int batch_index = blockIdx.x;
+ dataset += batch_index * n * 3;
+ temp += batch_index * n;
+ idxs += batch_index * m;
+
+ int tid = threadIdx.x;
+ const int stride = block_size;
+
+ int old = 0;
+ if (threadIdx.x == 0) idxs[0] = old;
+
+ __syncthreads();
+ for (int j = 1; j < m; j++) {
+ int besti = 0;
+ float best = -1;
+ float x1 = dataset[old * 3 + 0];
+ float y1 = dataset[old * 3 + 1];
+ float z1 = dataset[old * 3 + 2];
+ for (int k = tid; k < n; k += stride) {
+ float x2, y2, z2;
+ x2 = dataset[k * 3 + 0];
+ y2 = dataset[k * 3 + 1];
+ z2 = dataset[k * 3 + 2];
+ // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
+ // if (mag <= 1e-3)
+ // continue;
+
+ float d =
+ (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
+ float d2 = min(d, temp[k]);
+ temp[k] = d2;
+ besti = d2 > best ? k : besti;
+ best = d2 > best ? d2 : best;
+ }
+ dists[tid] = best;
+ dists_i[tid] = besti;
+ __syncthreads();
+
+#pragma unroll
+ for (int block_size_thres = 1024; block_size_thres >= 2;
+ block_size_thres >>= 1) {
+ const int tid_thres = block_size_thres / 2;
+ if (block_size >= block_size_thres && tid < tid_thres) {
+ __update(dists, dists_i, tid, tid + tid_thres);
+ }
+ __syncthreads();
+ }
+
+ old = dists_i[0];
+ if (tid == 0) idxs[j] = old;
+ }
+}
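+
+// The unrolled loop above is a shared-memory tree reduction over
+// (distance, index) pairs: block_size must be a power of two (at most 1024)
+// so each round halves the active threads, leaving the farthest point's
+// index in dists_i[0].
+//
+// A minimal launch sketch (hypothetical host-side code; the real dispatch
+// lives in the .cu wrapper, not in this header):
+//   furthest_point_sampling_forward_cuda_kernel<1024>
+//       <<<b, 1024>>>(b, n, m, dataset, temp, idxs);
+// with temp pre-filled with a large value (e.g. 1e10) so that the first
+// min(d, temp[k]) keeps the true squared distance.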
+
+// Modified from
+// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu
+template <unsigned int block_size>
+__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel(
+ int b, int n, int m, const float *__restrict__ dataset,
+ float *__restrict__ temp, int *__restrict__ idxs) {
+ // dataset: (B, N, N)
+ // tmp: (B, N)
+ // output:
+ // idx: (B, M)
+
+ if (m <= 0) return;
+ __shared__ float dists[block_size];
+ __shared__ int dists_i[block_size];
+
+ int batch_index = blockIdx.x;
+ dataset += batch_index * n * n;
+ temp += batch_index * n;
+ idxs += batch_index * m;
+
+ int tid = threadIdx.x;
+ const int stride = block_size;
+
+ int old = 0;
+ if (threadIdx.x == 0) idxs[0] = old;
+
+ __syncthreads();
+ for (int j = 1; j < m; j++) {
+ int besti = 0;
+ float best = -1;
+ // float x1 = dataset[old * 3 + 0];
+ // float y1 = dataset[old * 3 + 1];
+ // float z1 = dataset[old * 3 + 2];
+ for (int k = tid; k < n; k += stride) {
+ // float x2, y2, z2;
+ // x2 = dataset[k * 3 + 0];
+ // y2 = dataset[k * 3 + 1];
+ // z2 = dataset[k * 3 + 2];
+
+ // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) *
+ // (z2 - z1);
+ float d = dataset[old * n + k];
+
+ float d2 = min(d, temp[k]);
+ temp[k] = d2;
+ besti = d2 > best ? k : besti;
+ best = d2 > best ? d2 : best;
+ }
+ dists[tid] = best;
+ dists_i[tid] = besti;
+ __syncthreads();
+
+#pragma unroll
+ for (int block_size_thres = 1024; block_size_thres >= 2;
+ block_size_thres >>= 1) {
+ const int tid_thres = block_size_thres / 2;
+ if (block_size >= block_size_thres && tid < tid_thres) {
+ __update(dists, dists_i, tid, tid + tid_thres);
+ }
+ __syncthreads();
+ }
+
+ old = dists_i[0];
+ if (tid == 0) idxs[j] = old;
+ }
+}
+
+#endif // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6d932434cba245833e661b8c7e140601940bc35b
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
@@ -0,0 +1,58 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef GATHER_POINTS_CUDA_KERNEL_CUH
+#define GATHER_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define TOTAL_THREADS 1024
+
+template <typename T>
+__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
+ const T *points,
+ const int *__restrict__ idx,
+ T *out) {
+ // points: (B, C, N)
+ // idx: (B, M)
+ // output:
+ // out: (B, C, M)
+
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b || c_idx >= c) return;
+
+ out += bs_idx * c * m + c_idx * m + pt_idx;
+ idx += bs_idx * m + pt_idx;
+ points += bs_idx * c * n + c_idx * n;
+ out[0] = points[idx[0]];
+ }
+}
+
+template <typename T>
+__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
+ const T *grad_out,
+ const int *__restrict__ idx,
+ T *grad_points) {
+ // grad_out: (B, C, M)
+ // idx: (B, M)
+ // output:
+ // grad_points: (B, C, N)
+
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b || c_idx >= c) return;
+
+ grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+ idx += bs_idx * m + pt_idx;
+ grad_points += bs_idx * c * n + c_idx * n;
+
+ atomicAdd(grad_points + idx[0], grad_out[0]);
+ }
+}
+
+#endif // GATHER_POINTS_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..dfad66fc16d8759f614d7f36fa961673976b1d95
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
@@ -0,0 +1,65 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
+#ifndef GROUP_POINTS_CUDA_KERNEL_CUH
+#define GROUP_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void group_points_forward_cuda_kernel(int b, int c, int n,
+ int npoints, int nsample,
+ const T *points,
+ const int *__restrict__ idx,
+ T *out) {
+ // points: (B, C, N)
+ // idx: (B, npoints, nsample)
+ // output:
+ // out: (B, C, npoints, nsample)
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+ if (bs_idx >= b || c_idx >= c) return;
+
+ int pt_idx = index / nsample;
+ int sample_idx = index % nsample;
+
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+ int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+ int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+ pt_idx * nsample + sample_idx;
+
+ out[out_idx] = points[in_idx];
+ }
+}
+
+template <typename T>
+__global__ void group_points_backward_cuda_kernel(int b, int c, int n,
+ int npoints, int nsample,
+ const T *grad_out,
+ const int *__restrict__ idx,
+ T *grad_points) {
+ // grad_out: (B, C, npoints, nsample)
+ // idx: (B, npoints, nsample)
+ // output:
+ // grad_points: (B, C, N)
+ int bs_idx = blockIdx.z;
+ int c_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+ int pt_idx = index / nsample;
+ if (bs_idx >= b || c_idx >= c) return;
+
+ int sample_idx = index % nsample;
+ grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+ pt_idx * nsample + sample_idx;
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+ atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+ }
+}
+
+#endif // GROUP_POINTS_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9ebdcad15eee05a9f412ef34eb12d3553874a4dc
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
@@ -0,0 +1,367 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef IOU3D_CUDA_KERNEL_CUH
+#define IOU3D_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+const int THREADS_PER_BLOCK_IOU3D = 16;
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+__device__ const float EPS = 1e-8;
+
+struct Point {
+ float x, y;
+ __device__ Point() {}
+ __device__ Point(double _x, double _y) { x = _x, y = _y; }
+
+ __device__ void set(float _x, float _y) {
+ x = _x;
+ y = _y;
+ }
+
+ __device__ Point operator+(const Point &b) const {
+ return Point(x + b.x, y + b.y);
+ }
+
+ __device__ Point operator-(const Point &b) const {
+ return Point(x - b.x, y - b.y);
+ }
+};
+
+__device__ inline float cross(const Point &a, const Point &b) {
+ return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2,
+ const Point &p0) {
+ return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2,
+ const Point &q1, const Point &q2) {
+ int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
+ min(q1.x, q2.x) <= max(p1.x, p2.x) &&
+ min(p1.y, p2.y) <= max(q1.y, q2.y) &&
+ min(q1.y, q2.y) <= max(p1.y, p2.y);
+ return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p) {
+ // params: box (7) [x, y, z, dx, dy, dz, heading]
+ const float MARGIN = 1e-2;
+
+ float center_x = box[0], center_y = box[1];
+ // rotate the point in the opposite direction of box
+ float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
+ float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+ float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+ return (fabs(rot_x) < box[3] / 2 + MARGIN &&
+ fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0,
+ const Point &q1, const Point &q0,
+ Point &ans_point) {
+ // fast exclusion
+ if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+ // check cross standing
+ float s1 = cross(q0, p1, p0);
+ float s2 = cross(p1, q1, p0);
+ float s3 = cross(p0, q1, q0);
+ float s4 = cross(q1, p1, q0);
+
+ if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+ // calculate intersection of two lines
+ float s5 = cross(q1, p1, p0);
+ if (fabs(s5 - s1) > EPS) {
+ ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+ ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+ } else {
+ float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+ float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+ float D = a0 * b1 - a1 * b0;
+
+ ans_point.x = (b0 * c1 - b1 * c0) / D;
+ ans_point.y = (a1 * c0 - a0 * c1) / D;
+ }
+
+ return 1;
+}
+
+__device__ inline void rotate_around_center(const Point ¢er,
+ const float angle_cos,
+ const float angle_sin, Point &p) {
+ float new_x =
+ (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x;
+ float new_y =
+ (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+ p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b,
+ const Point ¢er) {
+ return atan2(a.y - center.y, a.x - center.x) >
+ atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b) {
+ // params box_a: [x, y, z, dx, dy, dz, heading]
+ // params box_b: [x, y, z, dx, dy, dz, heading]
+
+ float a_angle = box_a[6], b_angle = box_b[6];
+ float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
+ a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+ float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+ float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+ float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+ float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+ Point center_a(box_a[0], box_a[1]);
+ Point center_b(box_b[0], box_b[1]);
+
+ Point box_a_corners[5];
+ box_a_corners[0].set(a_x1, a_y1);
+ box_a_corners[1].set(a_x2, a_y1);
+ box_a_corners[2].set(a_x2, a_y2);
+ box_a_corners[3].set(a_x1, a_y2);
+
+ Point box_b_corners[5];
+ box_b_corners[0].set(b_x1, b_y1);
+ box_b_corners[1].set(b_x2, b_y1);
+ box_b_corners[2].set(b_x2, b_y2);
+ box_b_corners[3].set(b_x1, b_y2);
+
+ // get oriented corners
+ float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+ float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+ for (int k = 0; k < 4; k++) {
+ rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+ rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+ }
+
+ box_a_corners[4] = box_a_corners[0];
+ box_b_corners[4] = box_b_corners[0];
+
+ // get intersection of lines
+ Point cross_points[16];
+ Point poly_center;
+ int cnt = 0, flag = 0;
+
+ poly_center.set(0, 0);
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ flag = intersection(box_a_corners[i + 1], box_a_corners[i],
+ box_b_corners[j + 1], box_b_corners[j],
+ cross_points[cnt]);
+ if (flag) {
+ poly_center = poly_center + cross_points[cnt];
+ cnt++;
+ }
+ }
+ }
+
+ // check corners
+ for (int k = 0; k < 4; k++) {
+ if (check_in_box2d(box_a, box_b_corners[k])) {
+ poly_center = poly_center + box_b_corners[k];
+ cross_points[cnt] = box_b_corners[k];
+ cnt++;
+ }
+ if (check_in_box2d(box_b, box_a_corners[k])) {
+ poly_center = poly_center + box_a_corners[k];
+ cross_points[cnt] = box_a_corners[k];
+ cnt++;
+ }
+ }
+
+ poly_center.x /= cnt;
+ poly_center.y /= cnt;
+
+ // sort the points of polygon
+ Point temp;
+ for (int j = 0; j < cnt - 1; j++) {
+ for (int i = 0; i < cnt - j - 1; i++) {
+ if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
+ temp = cross_points[i];
+ cross_points[i] = cross_points[i + 1];
+ cross_points[i + 1] = temp;
+ }
+ }
+ }
+
+ // get the overlap areas
+ float area = 0;
+ for (int k = 0; k < cnt - 1; k++) {
+ area += cross(cross_points[k] - cross_points[0],
+ cross_points[k + 1] - cross_points[0]);
+ }
+
+ return fabs(area) / 2.0;
+}
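+
+// box_overlap builds the intersection polygon of two rotated rectangles
+// explicitly: candidate vertices are the pairwise edge intersections plus
+// any corner of one box lying inside the other; they are sorted
+// anti-clockwise around their centroid, and the final cross-product sum is
+// the shoelace formula for the polygon's area.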
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b) {
+ // params box_a: [x, y, z, dx, dy, dz, heading]
+ // params box_b: [x, y, z, dx, dy, dz, heading]
+ float sa = box_a[3] * box_a[4];
+ float sb = box_b[3] * box_b[4];
+ float s_overlap = box_overlap(box_a, box_b);
+ return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
+ const int num_a, const float *boxes_a, const int num_b,
+ const float *boxes_b, float *ans_overlap) {
+ // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+ // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+ CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
+ if (a_idx >= num_a || b_idx >= num_b) {
+ return;
+ }
+
+ const float *cur_box_a = boxes_a + a_idx * 7;
+ const float *cur_box_b = boxes_b + b_idx * 7;
+ float cur_overlap = box_overlap(cur_box_a, cur_box_b);
+ ans_overlap[a_idx * num_b + b_idx] = cur_overlap;
+ }
+}
+
+__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num,
+ const float nms_overlap_thresh,
+ const float *boxes,
+ unsigned long long *mask) {
+ // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+ // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+ const int blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+ // if (row_start > col_start) return;
+
+ const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+ const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+
+ __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 7 + 0] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+ block_boxes[threadIdx.x * 7 + 1] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+ block_boxes[threadIdx.x * 7 + 2] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+ block_boxes[threadIdx.x * 7 + 3] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+ block_boxes[threadIdx.x * 7 + 4] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+ block_boxes[threadIdx.x * 7 + 5] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+ block_boxes[threadIdx.x * 7 + 6] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+ const float *cur_box = boxes + cur_box_idx * 7;
+
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
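+
+// Bookkeeping note: THREADS_PER_BLOCK_NMS equals the 64 bits of an
+// unsigned long long, so each mask word records, for one box, which of the
+// 64 boxes in one column tile overlap it beyond the threshold. The host side
+// is then expected to walk this bitmask serially to select the surviving
+// boxes, keeping only the quadratic pairwise IoU work on the GPU.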
+
+__device__ inline float iou_normal(float const *const a, float const *const b) {
+ // params: a: [x, y, z, dx, dy, dz, heading]
+ // params: b: [x, y, z, dx, dy, dz, heading]
+
+ float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
+ right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+ float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
+ bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+ float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+ float interS = width * height;
+ float Sa = a[3] * a[4];
+ float Sb = b[3] * b[4];
+ return interS / fmaxf(Sa + Sb - interS, EPS);
+}
+
+__global__ void iou3d_nms3d_normal_forward_cuda_kernel(
+ const int boxes_num, const float nms_overlap_thresh, const float *boxes,
+ unsigned long long *mask) {
+ // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+ // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+ const int blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+ // if (row_start > col_start) return;
+
+ const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+ const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+ THREADS_PER_BLOCK_NMS);
+
+ __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 7 + 0] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+ block_boxes[threadIdx.x * 7 + 1] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+ block_boxes[threadIdx.x * 7 + 2] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+ block_boxes[threadIdx.x * 7 + 3] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+ block_boxes[threadIdx.x * 7 + 4] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+ block_boxes[threadIdx.x * 7 + 5] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+ block_boxes[threadIdx.x * 7 + 6] =
+ boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+ const float *cur_box = boxes + cur_box_idx * 7;
+
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+ mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
+
+#endif // IOU3D_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3cf52bb90eb27d02b28c52069c760c8a38f83f08
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh
@@ -0,0 +1,92 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Modified from
+// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
+#ifndef KNN_CUDA_KERNEL_CUH
+#define KNN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+inline __device__ void swap_float(float *x, float *y) {
+ float tmp = *x;
+ *x = *y;
+ *y = tmp;
+}
+
+inline __device__ void swap_int(int *x, int *y) {
+ int tmp = *x;
+ *x = *y;
+ *y = tmp;
+}
+
+__device__ void reheap(float *dist, int *idx, int k) {
+ int root = 0;
+ int child = root * 2 + 1;
+ while (child < k) {
+ if (child + 1 < k && dist[child + 1] > dist[child]) child++;
+ if (dist[root] > dist[child]) return;
+ swap_float(&dist[root], &dist[child]);
+ swap_int(&idx[root], &idx[child]);
+ root = child;
+ child = root * 2 + 1;
+ }
+}
+
+__device__ void heap_sort(float *dist, int *idx, int k) {
+ int i;
+ for (i = k - 1; i > 0; i--) {
+ swap_float(&dist[0], &dist[i]);
+ swap_int(&idx[0], &idx[i]);
+ reheap(dist, idx, i);
+ }
+}
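+
+// reheap maintains a max-heap over the current k best (smallest) distances:
+// a candidate enters only by displacing the heap root (the largest of the
+// k), and heap_sort then repeatedly pops the root so the neighbors come out
+// in ascending distance order.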
+
+// input: xyz (b, n, 3) new_xyz (b, m, 3)
+// output: idx (b, m, nsample) dist2 (b, m, nsample)
+template <typename T>
+__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
+ const T *xyz, const T *new_xyz,
+ int *__restrict__ idx, T *dist2) {
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b) return;
+
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
+ xyz += bs_idx * n * 3;
+ idx += bs_idx * m * nsample + pt_idx * nsample;
+ dist2 += bs_idx * m * nsample + pt_idx * nsample;
+
+ T new_x = new_xyz[0];
+ T new_y = new_xyz[1];
+ T new_z = new_xyz[2];
+
+ float best_dist[100];
+ int best_idx[100];
+ for (int i = 0; i < nsample; i++) {
+ best_dist[i] = 1e10;
+ best_idx[i] = 0;
+ }
+ for (int i = 0; i < n; i++) {
+ T x = xyz[i * 3 + 0];
+ T y = xyz[i * 3 + 1];
+ T z = xyz[i * 3 + 2];
+ T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+ (new_z - z) * (new_z - z);
+ if (d2 < best_dist[0]) {
+ best_dist[0] = d2;
+ best_idx[0] = i;
+ reheap(best_dist, best_idx, nsample);
+ }
+ }
+ heap_sort(best_dist, best_idx, nsample);
+ for (int i = 0; i < nsample; i++) {
+ idx[i] = best_idx[i];
+ dist2[i] = best_dist[i];
+ }
+ }
+}
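+
+// Capacity note: best_dist / best_idx are fixed local arrays of length 100,
+// so this kernel implicitly assumes nsample <= 100; the calling wrapper is
+// expected to enforce that bound before launch.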
+
+#endif // KNN_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1a0bd040e823eaaa79f96e525f961a8b8fbeafb5
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh
@@ -0,0 +1,62 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH
+#define MASKED_CONV2D_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename scalar_t>
+__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
+ const int height, const int width,
+ const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int64_t *mask_h_idx,
+ const int64_t *mask_w_idx,
+ const int mask_cnt, scalar_t *data_col) {
+ // mask_cnt * channels
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int m_index = index % mask_cnt;
+ const int h_col = mask_h_idx[m_index];
+ const int w_col = mask_w_idx[m_index];
+ const int c_im = index / mask_cnt;
+ const int c_col = c_im * kernel_h * kernel_w;
+ const int h_offset = h_col - pad_h;
+ const int w_offset = w_col - pad_w;
+ scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;
+ for (int i = 0; i < kernel_h; ++i) {
+ int h_im = h_offset + i;
+ for (int j = 0; j < kernel_w; ++j) {
+ int w_im = w_offset + j;
+ if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+ *data_col_ptr =
+ (scalar_t)data_im[(c_im * height + h_im) * width + w_im];
+ } else {
+ *data_col_ptr = 0.0;
+ }
+ data_col_ptr += mask_cnt;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
+ const int height, const int width,
+ const int channels,
+ const int64_t *mask_h_idx,
+ const int64_t *mask_w_idx,
+ const int mask_cnt, scalar_t *data_im) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int m_index = index % mask_cnt;
+ const int h_im = mask_h_idx[m_index];
+ const int w_im = mask_w_idx[m_index];
+ const int c_im = index / mask_cnt;
+ // compute the start and end of the output
+ data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
+ }
+}
+
+#endif // MASKED_CONV2D_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..df56e743669c3426f6abb113e4209d0cc60f2baf
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh
@@ -0,0 +1,300 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
+#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAXN 20
+__device__ const float PI = 3.1415926;
+
+struct Point {
+ float x, y;
+ __device__ Point() {}
+ __device__ Point(float x, float y) : x(x), y(y) {}
+};
+
+__device__ inline void swap1(Point *a, Point *b) {
+ Point temp;
+ temp.x = a->x;
+ temp.y = a->y;
+
+ a->x = b->x;
+ a->y = b->y;
+
+ b->x = temp.x;
+ b->y = temp.y;
+}
+__device__ inline float cross(Point o, Point a, Point b) {
+ return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
+}
+
+__device__ inline float dis(Point a, Point b) {
+ return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+}
+__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {
+ float convex_points[2][MAXN];
+ for (int j = 0; j < n_points; j++) {
+ convex_points[0][j] = ps[j].x;
+ }
+ for (int j = 0; j < n_points; j++) {
+ convex_points[1][j] = ps[j].y;
+ }
+
+ Point edges[MAXN];
+ float edges_angles[MAXN];
+ float unique_angles[MAXN];
+ int n_edges = n_points - 1;
+ int n_unique = 0;
+ int unique_flag = 0;
+
+ for (int i = 0; i < n_edges; i++) {
+ edges[i].x = ps[i + 1].x - ps[i].x;
+ edges[i].y = ps[i + 1].y - ps[i].y;
+ }
+ for (int i = 0; i < n_edges; i++) {
+ edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
+ if (edges_angles[i] >= 0) {
+ edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
+ } else {
+ edges_angles[i] =
+ edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
+ }
+ }
+ unique_angles[0] = edges_angles[0];
+ n_unique += 1;
+ for (int i = 1; i < n_edges; i++) {
+ for (int j = 0; j < n_unique; j++) {
+ if (edges_angles[i] == unique_angles[j]) {
+ unique_flag += 1;
+ }
+ }
+ if (unique_flag == 0) {
+ unique_angles[n_unique] = edges_angles[i];
+ n_unique += 1;
+ unique_flag = 0;
+ } else {
+ unique_flag = 0;
+ }
+ }
+
+ float minarea = 1e12;
+ for (int i = 0; i < n_unique; i++) {
+ float R[2][2];
+ float rot_points[2][MAXN];
+ R[0][0] = cos(unique_angles[i]);
+ R[0][1] = sin(unique_angles[i]);
+ R[1][0] = -sin(unique_angles[i]);
+ R[1][1] = cos(unique_angles[i]);
+ // R x Points
+ for (int m = 0; m < 2; m++) {
+ for (int n = 0; n < n_points; n++) {
+ float sum = 0.0;
+ for (int k = 0; k < 2; k++) {
+ sum = sum + R[m][k] * convex_points[k][n];
+ }
+ rot_points[m][n] = sum;
+ }
+ }
+
+ // xmin;
+ float xmin, ymin, xmax, ymax;
+ xmin = 1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
+ continue;
+ } else {
+ if (rot_points[0][j] < xmin) {
+ xmin = rot_points[0][j];
+ }
+ }
+ }
+ // ymin
+ ymin = 1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
+ continue;
+ } else {
+ if (rot_points[1][j] < ymin) {
+ ymin = rot_points[1][j];
+ }
+ }
+ }
+ // xmax
+ xmax = -1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
+ continue;
+ } else {
+ if (rot_points[0][j] > xmax) {
+ xmax = rot_points[0][j];
+ }
+ }
+ }
+ // ymax
+ ymax = -1e12;
+ for (int j = 0; j < n_points; j++) {
+ if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
+ continue;
+ } else {
+ if (rot_points[1][j] > ymax) {
+ ymax = rot_points[1][j];
+ }
+ }
+ }
+ float area = (xmax - xmin) * (ymax - ymin);
+ if (area < minarea) {
+ minarea = area;
+ minbox[0] = unique_angles[i];
+ minbox[1] = xmin;
+ minbox[2] = ymin;
+ minbox[3] = xmax;
+ minbox[4] = ymax;
+ }
+ }
+}
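+
+// minBoundingRect is a rotating-calipers-style search: the minimum-area
+// rectangle enclosing a convex polygon is aligned with one of its edges, so
+// it suffices to try each unique edge angle (folded into [0, pi/2)), rotate
+// the hull by that angle, and keep the axis-aligned bounding box with the
+// smallest area together with the angle that produced it.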
+
+// convex_find
+__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
+ int n_input = n_poly;
+ Point input_poly[20];
+ for (int i = 0; i < n_input; i++) {
+ input_poly[i].x = in_poly[i].x;
+ input_poly[i].y = in_poly[i].y;
+ }
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[20], top1, top2;
+ // float sign;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point *j = &(in_poly[0]);
+ Point *k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+
+ for (int i = 0; i <= top1; i++) {
+ right_point[i] = in_poly[Stack[i]];
+ }
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+
+ for (int i = top2 - 1; i >= 0; i--) {
+ left_point[i] = in_poly[Stack[i]];
+ }
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+}
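+
+// Jarvis builds the convex hull gift-wrapping style: starting from the
+// bottom-most point it wraps one chain up to the top-most point and a second
+// chain back down, always picking the most extreme turn (ties broken by
+// distance), then stitches the right and left chains together and updates
+// n_poly to the hull size.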
+
+template <typename T>
+__device__ inline void Findminbox(T const *const p, T *minpoints) {
+ Point ps1[MAXN];
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = p[i * 2];
+ convex[i].y = p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ Jarvis(convex, n_convex);
+ int n1 = n_convex;
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = convex[i].x;
+ ps1[i].y = convex[i].y;
+ }
+ ps1[n1].x = convex[0].x;
+ ps1[n1].y = convex[0].y;
+
+ float minbbox[5] = {0};
+ minBoundingRect(ps1, n1 + 1, minbbox);
+ float angle = minbbox[0];
+ float xmin = minbbox[1];
+ float ymin = minbbox[2];
+ float xmax = minbbox[3];
+ float ymax = minbbox[4];
+ float R[2][2];
+
+ R[0][0] = cos(angle);
+ R[0][1] = sin(angle);
+ R[1][0] = -sin(angle);
+ R[1][1] = cos(angle);
+
+ minpoints[0] = xmax * R[0][0] + ymin * R[1][0];
+ minpoints[1] = xmax * R[0][1] + ymin * R[1][1];
+ minpoints[2] = xmin * R[0][0] + ymin * R[1][0];
+ minpoints[3] = xmin * R[0][1] + ymin * R[1][1];
+ minpoints[4] = xmin * R[0][0] + ymax * R[1][0];
+ minpoints[5] = xmin * R[0][1] + ymax * R[1][1];
+ minpoints[6] = xmax * R[0][0] + ymax * R[1][0];
+ minpoints[7] = xmax * R[0][1] + ymax * R[1][1];
+}
+
+template <typename T>
+__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,
+ const T *ex_boxes, T *minbox) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T *cur_box = ex_boxes + index * 18;
+ T *cur_min_box = minbox + index * 8;
+ Findminbox(cur_box, cur_min_box);
+ }
+}
+
+#endif // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
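`Findminbox` stores the rectangle in the rotated frame and maps its four corners back with the transpose of `R`; because `R` is orthogonal, rotating by `R` and then by its transpose is the identity. A quick self-contained check of that round trip:

```
#include <cmath>
#include <cstdio>

// Rotating a point by angle a and mapping it back with the transpose
// recovers the original coordinates (R is orthogonal).
int main() {
  double a = 0.7, x = 3.0, y = -2.0;
  double R[2][2] = {{std::cos(a), std::sin(a)}, {-std::sin(a), std::cos(a)}};
  double rx = R[0][0] * x + R[0][1] * y;    // forward rotation
  double ry = R[1][0] * x + R[1][1] * y;
  double bx = rx * R[0][0] + ry * R[1][0];  // inverse via transpose,
  double by = rx * R[0][1] + ry * R[1][1];  // exactly as in Findminbox
  std::printf("%.6f %.6f\n", bx, by);       // prints 3.000000 -2.000000
  return 0;
}
```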
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ca0e91a25246569bb7de04649ab4f5afe233670c
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh
@@ -0,0 +1,399 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer
+ *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer
+ *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
+#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+template <typename T>
+__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
+ const int height, const int width, T h, T w) {
+ int h_low = floorf(h);
+ int w_low = floorf(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ T lh = h - h_low;
+ T lw = w - w_low;
+ T hh = 1 - lh, hw = 1 - lw;
+
+ T v1 = 0;
+ if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
+ T v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = input[h_low * data_width + w_high];
+ T v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = input[h_high * data_width + w_low];
+ T v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = input[h_high * data_width + w_high];
+
+ T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
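`dmcn_im2col_bilinear` samples one input plane at a fractional location, treating out-of-range corners as zero. A plain CPU reference of the same four-corner interpolation (a sketch; `bilinear_sample` is not an mmcv symbol):

```
#include <cmath>

// Bilinear sampling with zero padding outside the image, mirroring
// dmcn_im2col_bilinear above.
float bilinear_sample(const float* img, int height, int width, float h, float w) {
  int h0 = (int)std::floor(h), w0 = (int)std::floor(w);
  int h1 = h0 + 1, w1 = w0 + 1;
  float lh = h - h0, lw = w - w0;          // fractional offsets
  auto at = [&](int r, int c) -> float {   // zero outside the image
    return (r >= 0 && r < height && c >= 0 && c < width) ? img[r * width + c] : 0.f;
  };
  return (1 - lh) * (1 - lw) * at(h0, w0) + (1 - lh) * lw * at(h0, w1) +
         lh * (1 - lw) * at(h1, w0) + lh * lw * at(h1, w1);
}
```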
+template <typename T>
+__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,
+ const int w, const int height,
+ const int width) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
+template <typename T>
+__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,
+ const int height, const int width,
+ const T *im_data, const int data_width,
+ const int bp_dir) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+
+ if (bp_dir == 0) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ } else if (bp_dir == 1) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
+
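`dmcn_get_coordinate_weight` is the analytic derivative of the bilinear sample with respect to the sampling coordinate: the four weights are linear in `h` and `w`, so differentiating leaves signed sums of the corner values (`bp_dir` selects ∂/∂h or ∂/∂w). A self-contained check that the analytic form matches a finite difference:

```
#include <cmath>
#include <cstdio>

// Bilinear sample of a 2x2 patch at fractional (h, w) in [0, 1].
static float blerp(const float v[4], float h, float w) {
  return (1 - h) * (1 - w) * v[0] + (1 - h) * w * v[1] +
         h * (1 - w) * v[2] + h * w * v[3];
}

int main() {
  const float v[4] = {1.f, 2.f, 3.f, 4.f};
  float h = 0.3f, w = 0.6f, eps = 1e-3f;
  // Analytic d/dh: weights are linear in h, so the gradient is a signed sum.
  float analytic = -(1 - w) * v[0] - w * v[1] + (1 - w) * v[2] + w * v[3];
  float numeric = (blerp(v, h + eps, w) - blerp(v, h - eps, w)) / (2 * eps);
  std::printf("analytic=%f numeric=%f\n", analytic, numeric);  // both 2.0
  return 0;
}
```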
+template <typename T>
+__global__ void modulated_deformable_im2col_gpu_kernel(
+ const int n, const T *data_im, const T *data_offset, const T *data_mask,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int num_channels, const int deformable_group, const int height_col,
+ const int width_col, T *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+    // index of the output matrix element
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+
+ T *data_col_ptr =
+ data_col +
+ ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ const T *data_im_ptr =
+ data_im + (b_col * num_channels + c_im) * height * width;
+ const T *data_offset_ptr =
+ data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ const T *data_mask_ptr =
+ data_mask + (b_col * deformable_group + deformable_group_index) *
+ kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i) {
+ for (int j = 0; j < kernel_w; ++j) {
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+ w_col;
+ const int data_mask_hw_ptr =
+ ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T mask = data_mask_ptr[data_mask_hw_ptr];
+        T val = static_cast<T>(0);
+ const T h_im = h_in + i * dilation_h + offset_h;
+ const T w_im = w_in + j * dilation_w + offset_w;
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im,
+ w_im);
+ *data_col_ptr = val * mask;
+ data_col_ptr += batch_size * height_col * width_col;
+ }
+ }
+ }
+}
+
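The im2col kernel above writes one value per (input channel × kernel tap, batch, output pixel); downstream, this column buffer is matrix-multiplied with the flattened weights. The effective flattened index, as a small sketch (`col_index` is illustrative, not an mmcv symbol):

```
#include <cstddef>

// Layout of the column buffer written above:
// ((c_im * kh * kw + i * kw + j) * batch + b) * out_h * out_w + y * out_w + x
std::size_t col_index(int c_im, int i, int j, int b, int y, int x,
                      int kh, int kw, int batch, int out_h, int out_w) {
  std::size_t c_col = (std::size_t)c_im * kh * kw + i * kw + j;
  return ((c_col * batch + b) * out_h + y) * (std::size_t)out_w + x;
}
```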
+template <typename T>
+__global__ void modulated_deformable_col2im_gpu_kernel(
+ const int n, const T *data_col, const T *data_offset, const T *data_mask,
+ const int channels, const int height, const int width, const int kernel_h,
+ const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int deformable_group, const int height_col, const int width_col,
+ T *grad_im) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
+ const int i =
+ (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ const int c =
+ index / width_col / height_col / batch_size / kernel_w / kernel_h;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / channel_per_deformable_group;
+
+ int w_out = index % width_col;
+ int h_out = (index / width_col) % height_col;
+ int b = (index / width_col / height_col) % batch_size;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+ const T *data_mask_ptr =
+ data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
+ kernel_w * height_col * width_col;
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+ const int data_mask_hw_ptr =
+ ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T mask = data_mask_ptr[data_mask_hw_ptr];
+ const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+ const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+ const T cur_top_grad = data_col[index] * mask;
+ const int cur_h = (int)cur_inv_h_data;
+ const int cur_w = (int)cur_inv_w_data;
+ for (int dy = -2; dy <= 2; dy++) {
+ for (int dx = -2; dx <= 2; dx++) {
+ if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+ cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+ abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+ int cur_bottom_grad_pos =
+ ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+ T weight =
+ dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
+ cur_h + dy, cur_w + dx, height, width);
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+ }
+ }
+ }
+ }
+}
+
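The col2im kernel scatters each column gradient back into `grad_im` with the bilinear weights; `atomicAdd` is required because many output positions map to the same input pixel. Its 5×5 `(dy, dx)` window is conservative — only the 2×2 cells inside the bilinear support receive nonzero weight. A serial CPU sketch of the equivalent scatter (`scatter_bilinear_grad` is not an mmcv symbol):

```
#include <cmath>

// Serial scatter-add for one column gradient g at fractional location (h, w).
void scatter_bilinear_grad(float* grad_im, int height, int width,
                           float h, float w, float g) {
  int h0 = (int)std::floor(h), w0 = (int)std::floor(w);
  for (int dy = 0; dy <= 1; ++dy)
    for (int dx = 0; dx <= 1; ++dx) {
      int r = h0 + dy, c = w0 + dx;
      if (r < 0 || r >= height || c < 0 || c >= width) continue;
      // Bilinear weight of (r, c) for the fractional location (h, w).
      float wt = (1 - std::fabs(h - r)) * (1 - std::fabs(w - c));
      grad_im[r * width + c] += wt * g;  // atomicAdd in the CUDA kernel
    }
}
```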
+template <typename T>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(
+ const int n, const T *data_col, const T *data_im, const T *data_offset,
+ const T *data_mask, const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w, const int dilation_h,
+ const int dilation_w, const int channel_per_deformable_group,
+ const int batch_size, const int offset_channels, const int deformable_group,
+ const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ T val = 0, mval = 0;
+ int w = index % width_col;
+ int h = (index / width_col) % height_col;
+ int c = (index / width_col / height_col) % offset_channels;
+ int b = (index / width_col / height_col) / offset_channels;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+ const int col_step = kernel_h * kernel_w;
+ int cnt = 0;
+ const T *data_col_ptr = data_col + deformable_group_index *
+ channel_per_deformable_group *
+ batch_size * width_col * height_col;
+ const T *data_im_ptr =
+ data_im + (b * deformable_group + deformable_group_index) *
+ channel_per_deformable_group / kernel_h / kernel_w *
+ height * width;
+ const T *data_offset_ptr =
+ data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+ const T *data_mask_ptr =
+ data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
+ kernel_w * height_col * width_col;
+
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+ col_c += col_step) {
+ const int col_pos =
+ (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+ const int bp_dir = offset_c % 2;
+
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+ int i =
+ (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ int w_out = col_pos % width_col;
+ int h_out = (col_pos / width_col) % height_col;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+ const int data_offset_h_ptr =
+ (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+ const int data_offset_w_ptr =
+ (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+ w_out);
+ const int data_mask_hw_ptr =
+ (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+ const T mask = data_mask_ptr[data_mask_hw_ptr];
+ T inv_h = h_in + i * dilation_h + offset_h;
+ T inv_w = w_in + j * dilation_w + offset_w;
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+ inv_h = inv_w = -2;
+ else
+ mval += data_col_ptr[col_pos] *
+ dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,
+ height, width, inv_h, inv_w);
+ const T weight = dmcn_get_coordinate_weight(
+ inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
+ width, bp_dir);
+ val += weight * data_col_ptr[col_pos] * mask;
+ cnt += 1;
+ }
+ // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+ grad_offset[index] = val;
+ if (offset_c % 2 == 0)
+ // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
+ // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
+ // height_col + h) * width_col + w], mask_req, mval);
+ grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
+ kernel_w +
+ offset_c / 2) *
+ height_col +
+ h) *
+ width_col +
+ w] = mval;
+ }
+}
+
+#endif // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..12225ffdb3b1691ad9edabcd1663109f67ef1a6f
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
@@ -0,0 +1,801 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#ifndef DEFORM_ATTN_CUDA_KERNEL
+#define DEFORM_ATTN_CUDA_KERNEL
+
+#include "common_cuda_helper.hpp"
+#include "pytorch_cuda_helper.hpp"
+
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(
+ const scalar_t *&bottom_data, const int &height, const int &width,
+ const int &nheads, const int &channels, const scalar_t &h,
+ const scalar_t &w, const int &m, const int &c) {
+ const int h_low = floorf(h);
+ const int w_low = floorf(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const scalar_t lh = h - h_low;
+ const scalar_t lw = w - w_low;
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = nheads * channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+ const int base_ptr = m * channels + c;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0) {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+ v1 = bottom_data[ptr1];
+ }
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1) {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+ v2 = bottom_data[ptr2];
+ }
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0) {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+ v3 = bottom_data[ptr3];
+ }
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1) {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+ v4 = bottom_data[ptr4];
+ }
+
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(
+ const scalar_t *&bottom_data, const int &height, const int &width,
+ const int &nheads, const int &channels, const scalar_t &h,
+ const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
+ const scalar_t &attn_weight, scalar_t *&grad_value,
+ scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
+ const int h_low = floorf(h);
+ const int w_low = floorf(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const scalar_t lh = h - h_low;
+ const scalar_t lw = w - w_low;
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = nheads * channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+ const int base_ptr = m * channels + c;
+
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+ const scalar_t top_grad_value = top_grad * attn_weight;
+ scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0) {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+ v1 = bottom_data[ptr1];
+ grad_h_weight -= hw * v1;
+ grad_w_weight -= hh * v1;
+ atomicAdd(grad_value + ptr1, w1 * top_grad_value);
+ }
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1) {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+ v2 = bottom_data[ptr2];
+ grad_h_weight -= lw * v2;
+ grad_w_weight += hh * v2;
+ atomicAdd(grad_value + ptr2, w2 * top_grad_value);
+ }
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0) {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+ v3 = bottom_data[ptr3];
+ grad_h_weight += hw * v3;
+ grad_w_weight -= lh * v3;
+ atomicAdd(grad_value + ptr3, w3 * top_grad_value);
+ }
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1) {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+ v4 = bottom_data[ptr4];
+ grad_h_weight += lw * v4;
+ grad_w_weight += lh * v4;
+ atomicAdd(grad_value + ptr4, w4 * top_grad_value);
+ }
+
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ *grad_attn_weight = top_grad * val;
+ *grad_sampling_loc = width * grad_w_weight * top_grad_value;
+ *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(
+ const scalar_t *&bottom_data, const int &height, const int &width,
+ const int &nheads, const int &channels, const scalar_t &h,
+ const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
+ const scalar_t &attn_weight, scalar_t *&grad_value,
+ scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {
+ const int h_low = floorf(h);
+ const int w_low = floorf(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const scalar_t lh = h - h_low;
+ const scalar_t lw = w - w_low;
+ const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = nheads * channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+ const int base_ptr = m * channels + c;
+
+ const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+ const scalar_t top_grad_value = top_grad * attn_weight;
+ scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0) {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+ v1 = bottom_data[ptr1];
+ grad_h_weight -= hw * v1;
+ grad_w_weight -= hh * v1;
+ atomicAdd(grad_value + ptr1, w1 * top_grad_value);
+ }
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1) {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+ v2 = bottom_data[ptr2];
+ grad_h_weight -= lw * v2;
+ grad_w_weight += hh * v2;
+ atomicAdd(grad_value + ptr2, w2 * top_grad_value);
+ }
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0) {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+ v3 = bottom_data[ptr3];
+ grad_h_weight += hw * v3;
+ grad_w_weight -= lh * v3;
+ atomicAdd(grad_value + ptr3, w3 * top_grad_value);
+ }
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1) {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+ v4 = bottom_data[ptr4];
+ grad_h_weight += lw * v4;
+ grad_w_weight += lh * v4;
+ atomicAdd(grad_value + ptr4, w4 * top_grad_value);
+ }
+
+ const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ atomicAdd(grad_attn_weight, top_grad * val);
+ atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+ atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(
+ const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes,
+ const int64_t *data_level_start_index, const scalar_t *data_sampling_loc,
+ const scalar_t *data_attn_weight, const int batch_size,
+ const int spatial_size, const int num_heads, const int channels,
+ const int num_levels, const int num_query, const int num_point,
+ scalar_t *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ scalar_t *data_col_ptr = data_col + index;
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+ scalar_t col = 0;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const scalar_t *data_value_ptr =
+ data_value +
+ (data_value_ptr_init_offset + level_start_id * qid_stride);
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h,
+ spatial_w, num_heads, channels,
+ h_im, w_im, m_col, c_col) *
+ weight;
+ }
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ }
+ }
+ *data_col_ptr = col;
+ }
+}
+
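Sampling locations arrive normalized to `[0, 1]` per level; `loc * spatial - 0.5` converts them to pixel coordinates under the `align_corners=False` convention, so 0.5 lands midway between the two center pixels. A one-line check:

```
#include <cstdio>

int main() {
  float loc = 0.5f;                 // center of the feature map
  int spatial = 8;                  // level height or width
  float p = loc * spatial - 0.5f;   // same conversion as the kernel
  std::printf("%f\n", p);           // 3.5: midway between pixels 3 and 4
  return 0;
}
```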
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+ __shared__ scalar_t cache_grad_attn_weight[blockSize];
+ unsigned int tid = threadIdx.x;
+ const int qid_stride = num_heads * channels;
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ scalar_t *grad_sampling_loc_out =
+ grad_sampling_loc + (grad_sampling_ptr << 1);
+ scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+ if (tid == 0) {
+ scalar_t _grad_w = cache_grad_sampling_loc[0],
+ _grad_h = cache_grad_sampling_loc[1],
+ _grad_a = cache_grad_attn_weight[0];
+ int sid = 2;
+ for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {
+ _grad_w += cache_grad_sampling_loc[sid];
+ _grad_h += cache_grad_sampling_loc[sid + 1];
+ _grad_a += cache_grad_attn_weight[_tid];
+ sid += 2;
+ }
+
+ *grad_sampling_loc_out = _grad_w;
+ *(grad_sampling_loc_out + 1) = _grad_h;
+ *grad_attn_weight_out = _grad_a;
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight_out += grad_weight_stride;
+ grad_sampling_loc_out += grad_loc_stride;
+ }
+ }
+ }
+}
+
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+ __shared__ scalar_t cache_grad_attn_weight[blockSize];
+ unsigned int tid = threadIdx.x;
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ scalar_t *grad_sampling_loc_out =
+ grad_sampling_loc + (grad_sampling_ptr << 1);
+ scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+
+ for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
+ if (tid < s) {
+ const unsigned int xid1 = tid << 1;
+ const unsigned int xid2 = (tid + s) << 1;
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1];
+ }
+ __syncthreads();
+ }
+
+ if (tid == 0) {
+ *grad_sampling_loc_out = cache_grad_sampling_loc[0];
+ *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
+ *grad_attn_weight_out = cache_grad_attn_weight[0];
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight_out += grad_weight_stride;
+ grad_sampling_loc_out += grad_loc_stride;
+ }
+ }
+ }
+}
+
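The `_v1` variant lets thread 0 sum the shared-memory caches serially, while `_v2` above uses a classic power-of-two tree reduction: each round folds the upper half of the buffer into the lower half, with a barrier between rounds. A serial analog of the tree:

```
#include <cstdio>

int main() {
  float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  for (unsigned s = 8 / 2; s > 0; s >>= 1)  // rounds: stride 4, 2, 1
    for (unsigned t = 0; t < s; ++t)        // "threads" below the stride
      buf[t] += buf[t + s];                 // __syncthreads() between rounds on GPU
  std::printf("%f\n", buf[0]);              // 36
  return 0;
}
```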
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ extern __shared__ int _s[];
+  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+ scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+ unsigned int tid = threadIdx.x;
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ scalar_t *grad_sampling_loc_out =
+ grad_sampling_loc + (grad_sampling_ptr << 1);
+ scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+ if (tid == 0) {
+ scalar_t _grad_w = cache_grad_sampling_loc[0],
+ _grad_h = cache_grad_sampling_loc[1],
+ _grad_a = cache_grad_attn_weight[0];
+ int sid = 2;
+ for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {
+ _grad_w += cache_grad_sampling_loc[sid];
+ _grad_h += cache_grad_sampling_loc[sid + 1];
+ _grad_a += cache_grad_attn_weight[_tid];
+ sid += 2;
+ }
+
+ *grad_sampling_loc_out = _grad_w;
+ *(grad_sampling_loc_out + 1) = _grad_h;
+ *grad_attn_weight_out = _grad_a;
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight_out += grad_weight_stride;
+ grad_sampling_loc_out += grad_loc_stride;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ extern __shared__ int _s[];
+  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+ scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+ unsigned int tid = threadIdx.x;
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ scalar_t *grad_sampling_loc_out =
+ grad_sampling_loc + (grad_sampling_ptr << 1);
+ scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+
+ for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+ s >>= 1, spre >>= 1) {
+ if (tid < s) {
+ const unsigned int xid1 = tid << 1;
+ const unsigned int xid2 = (tid + s) << 1;
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1];
+ if (tid + (s << 1) < spre) {
+ cache_grad_attn_weight[tid] +=
+ cache_grad_attn_weight[tid + (s << 1)];
+ cache_grad_sampling_loc[xid1] +=
+ cache_grad_sampling_loc[xid2 + (s << 1)];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+ }
+ }
+ __syncthreads();
+ }
+
+ if (tid == 0) {
+ *grad_sampling_loc_out = cache_grad_sampling_loc[0];
+ *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
+ *grad_attn_weight_out = cache_grad_attn_weight[0];
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight_out += grad_weight_stride;
+ grad_sampling_loc_out += grad_loc_stride;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ extern __shared__ int _s[];
+  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+ scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+ unsigned int tid = threadIdx.x;
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ scalar_t *grad_sampling_loc_out =
+ grad_sampling_loc + (grad_sampling_ptr << 1);
+ scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+ *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+ *(cache_grad_attn_weight + threadIdx.x) = 0;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ cache_grad_sampling_loc + (threadIdx.x << 1),
+ cache_grad_attn_weight + threadIdx.x);
+ }
+
+ __syncthreads();
+
+ for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+ s >>= 1, spre >>= 1) {
+ if (tid < s) {
+ const unsigned int xid1 = tid << 1;
+ const unsigned int xid2 = (tid + s) << 1;
+ cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+ cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1];
+ if (tid + (s << 1) < spre) {
+ cache_grad_attn_weight[tid] +=
+ cache_grad_attn_weight[tid + (s << 1)];
+ cache_grad_sampling_loc[xid1] +=
+ cache_grad_sampling_loc[xid2 + (s << 1)];
+ cache_grad_sampling_loc[xid1 + 1] +=
+ cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+ }
+ }
+ __syncthreads();
+ }
+
+ if (tid == 0) {
+ atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
+ atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
+ atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
+ }
+ __syncthreads();
+
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight_out += grad_weight_stride;
+ grad_sampling_loc_out += grad_loc_stride;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(
+ const int n, const scalar_t *grad_col, const scalar_t *data_value,
+ const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+ const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+ const int batch_size, const int spatial_size, const int num_heads,
+ const int channels, const int num_levels, const int num_query,
+ const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+ scalar_t *grad_attn_weight) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ int _temp = index;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ const int m_col = _temp % num_heads;
+ _temp /= num_heads;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const scalar_t top_grad = grad_col[index];
+
+ int data_weight_ptr = sampling_index * num_levels * num_point;
+ int data_loc_w_ptr = data_weight_ptr << 1;
+ const int grad_sampling_ptr = data_weight_ptr;
+ scalar_t *grad_sampling_loc_out =
+ grad_sampling_loc + (grad_sampling_ptr << 1);
+ scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
+ const int grad_weight_stride = 1;
+ const int grad_loc_stride = 2;
+ const int qid_stride = num_heads * channels;
+ const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+ for (int l_col = 0; l_col < num_levels; ++l_col) {
+ const int level_start_id = data_level_start_index[l_col];
+ const int spatial_h_ptr = l_col << 1;
+ const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+ const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+ const int value_ptr_offset =
+ data_value_ptr_init_offset + level_start_id * qid_stride;
+ const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+ scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+ for (int p_col = 0; p_col < num_point; ++p_col) {
+ const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+ const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+ const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+ const scalar_t h_im = loc_h * spatial_h - 0.5;
+ const scalar_t w_im = loc_w * spatial_w - 0.5;
+ if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+ ms_deform_attn_col2im_bilinear_gm(
+ data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+ w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+ grad_sampling_loc_out, grad_attn_weight_out);
+ }
+ data_weight_ptr += 1;
+ data_loc_w_ptr += 2;
+ grad_attn_weight_out += grad_weight_stride;
+ grad_sampling_loc_out += grad_loc_stride;
+ }
+ }
+ }
+}
+#endif // DEFORM_ATTN_CUDA_KERNEL
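This header only defines the kernel variants; the host launcher (in mmcv's `ms_deform_attn_cuda.cu`, not part of this diff) picks between them based on the channel count and shared-memory capacity. A hedged sketch of that kind of selection logic — the threshold and the power-of-two test here are assumptions for illustration, not mmcv's exact code:

```
#include <cstdio>

enum Variant { SHM_BLOCKSIZE_AWARE, SHM_DYNAMIC, GLOBAL_ATOMIC };

// Assumed selection logic for the backward pass, for illustration only.
Variant pick_col2im_variant(int channels) {
  if (channels > 1024) return GLOBAL_ATOMIC;     // shared memory too small
  bool pow2 = (channels & (channels - 1)) == 0;  // fixed-size templates need 2^k
  return pow2 ? SHM_BLOCKSIZE_AWARE : SHM_DYNAMIC;
}

int main() {
  std::printf("%d %d %d\n", pick_col2im_variant(64),
              pick_col2im_variant(96), pick_col2im_variant(2048));
  return 0;
}
```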
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/external/cv/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..281d9f0b409f54260a81a79ad96ab09fde9580ce
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
@@ -0,0 +1,117 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef NMS_CUDA_KERNEL_CUH
+#define NMS_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+int const threadsPerBlock = sizeof(unsigned long long int) * 8;
+
+__device__ inline bool devIoU(float const *const a, float const *const b,
+ const int offset, const float threshold) {
+ float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
+ float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
+ float width = fmaxf(right - left + offset, 0.f),
+ height = fmaxf(bottom - top + offset, 0.f);
+ float interS = width * height;
+ float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
+ float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
+ return interS > threshold * (Sa + Sb - interS);
+}
+
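`devIoU` tests whether the IoU of two corner-format boxes exceeds the threshold; `offset` is 0 or 1 depending on whether the right/bottom coordinates are exclusive or inclusive endpoints. A plain CPU version (a sketch; `iou_exceeds` is not an mmcv symbol):

```
#include <algorithm>

// IoU-over-threshold test for corner-format boxes {x1, y1, x2, y2}.
bool iou_exceeds(const float a[4], const float b[4], float offset, float thr) {
  float w = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]) + offset);
  float h = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]) + offset);
  float inter = w * h;
  float sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
  float sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
  return inter > thr * (sa + sb - inter);
}
```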
+__global__ static void nms_cuda(const int n_boxes, const float iou_threshold,
+ const int offset, const float *dev_boxes,
+ unsigned long long *dev_mask) {
+ int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
+ CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+ const int tid = threadIdx.x;
+
+ if (row_start > col_start) return;
+
+ const int row_size =
+ fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ __shared__ float block_boxes[threadsPerBlock * 4];
+ if (tid < col_size) {
+ block_boxes[tid * 4 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
+ block_boxes[tid * 4 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
+ block_boxes[tid * 4 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
+ block_boxes[tid * 4 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
+ }
+ __syncthreads();
+
+ if (tid < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + tid;
+ const float *cur_box = dev_boxes + cur_box_idx * 4;
+ int i = 0;
+ unsigned long long int t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = tid + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
+ t |= 1ULL << i;
+ }
+ }
+ dev_mask[cur_box_idx * gridDim.y + col_start] = t;
+ }
+ }
+}
+
+__global__ static void gather_keep_from_mask(bool *keep,
+ const unsigned long long *dev_mask,
+ const int n_boxes) {
+ const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
+ const int tid = threadIdx.x;
+
+ // mark the bboxes which have been removed.
+ extern __shared__ unsigned long long removed[];
+
+ // initialize removed.
+ for (int i = tid; i < col_blocks; i += blockDim.x) {
+ removed[i] = 0;
+ }
+ __syncthreads();
+
+ for (int nblock = 0; nblock < col_blocks; ++nblock) {
+ auto removed_val = removed[nblock];
+ __syncthreads();
+ const int i_offset = nblock * threadsPerBlock;
+#pragma unroll
+ for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {
+ const int i = i_offset + inblock;
+ if (i >= n_boxes) break;
+      // select a candidate, check if it should be kept.
+ if (!(removed_val & (1ULL << inblock))) {
+ if (tid == 0) {
+ // mark the output.
+ keep[i] = true;
+ }
+ auto p = dev_mask + i * col_blocks;
+ // remove all bboxes which overlap the candidate.
+ for (int j = tid; j < col_blocks; j += blockDim.x) {
+ if (j >= nblock) removed[j] |= p[j];
+ }
+ __syncthreads();
+ removed_val = removed[nblock];
+ }
+ }
+ }
+}
+
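`dev_mask` holds, for each box, a bitset of the later (lower-scored) boxes it suppresses; `gather_keep_from_mask` walks boxes in score order inside one block, skipping any box already marked removed. A serial reference with equivalent semantics (`decode_keep` is illustrative; the word size is 64 since the mask entries are `unsigned long long`):

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Decode the suppression mask into keep flags, serially.
std::vector<bool> decode_keep(const std::vector<uint64_t>& mask, int n_boxes) {
  const int col_blocks = (n_boxes + 63) / 64;
  std::vector<uint64_t> removed(col_blocks, 0);
  std::vector<bool> keep(n_boxes, false);
  for (int i = 0; i < n_boxes; ++i) {  // boxes sorted by descending score
    if (removed[i / 64] & (1ULL << (i % 64))) continue;  // already suppressed
    keep[i] = true;                                      // survivor
    const uint64_t* p = &mask[(std::size_t)i * col_blocks];
    for (int j = 0; j < col_blocks; ++j) removed[j] |= p[j];
  }
  return keep;
}
```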
+#endif // NMS_CUDA_KERNEL_CUH
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..bba3b8258f6b8798b9d1a651bfda29c48bb5376a
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh
@@ -0,0 +1,141 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#ifndef NMS_QUADRI_CUDA_CUH
+#define NMS_QUADRI_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+__host__ __device__ inline int divideUP(const int x, const int y) {
+ return (((x) + (y)-1) / (y));
+}
+
+namespace {
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+template <typename T>
+__global__ void nms_quadri_cuda_kernel(const int n_boxes,
+ const float iou_threshold,
+ const T* dev_boxes,
+ unsigned long long* dev_mask,
+ const int multi_label) {
+ if (multi_label == 1) {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ // Compared to nms_cuda_kernel, where each box is represented with 4 values
+ // (x1, y1, x2, y2), each rotated box is represented with 8 values
+ // (x1, y1, ..., x4, y4) here.
+ __shared__ T block_boxes[threadsPerBlock * 8];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 8 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];
+ block_boxes[threadIdx.x * 8 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];
+ block_boxes[threadIdx.x * 8 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];
+ block_boxes[threadIdx.x * 8 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];
+ block_boxes[threadIdx.x * 8 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];
+ block_boxes[threadIdx.x * 8 + 5] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];
+ block_boxes[threadIdx.x * 8 + 6] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];
+ block_boxes[threadIdx.x * 8 + 7] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const T* cur_box = dev_boxes + cur_box_idx * 9;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ // Instead of devIoU used by original horizontal nms, here
+ // we use the single_box_iou_quadri function from
+ // box_iou_rotated_utils.h
+ if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) >
+ iou_threshold) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = divideUP(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ } else {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ // Compared to nms_cuda_kernel, where each box is represented with 4 values
+ // (x1, y1, x2, y2), each rotated box is represented with 8 values
+    // (x1, y1, ..., x4, y4) here.
+ __shared__ T block_boxes[threadsPerBlock * 8];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 8 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0];
+ block_boxes[threadIdx.x * 8 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1];
+ block_boxes[threadIdx.x * 8 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2];
+ block_boxes[threadIdx.x * 8 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3];
+ block_boxes[threadIdx.x * 8 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4];
+ block_boxes[threadIdx.x * 8 + 5] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5];
+ block_boxes[threadIdx.x * 8 + 6] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6];
+ block_boxes[threadIdx.x * 8 + 7] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const T* cur_box = dev_boxes + cur_box_idx * 8;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ // Instead of devIoU used by original horizontal nms, here
+ // we use the single_box_iou_quadri function from
+ // box_iou_rotated_utils.h
+ if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) >
+ iou_threshold) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = divideUP(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
+
+#endif
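
The kernel above only fills `dev_mask`: for each box it records, one 64-bit word per 64-box column tile, which boxes it overlaps beyond `iou_threshold`. Turning that mask into the final keep set is a separate greedy sweep over score-sorted boxes. Below is a minimal host-side sketch of that standard reduction (the same pattern as torchvision's NMS kernel); the function and variable names are illustrative, not the exact mmcv wrapper API.

```cuda
// Greedy reduction of the NMS bitmask into kept indices (host side).
// Illustrative sketch: assumes `mask` is the device mask copied back to
// the host and that boxes are pre-sorted by descending score.
#include <cstdint>
#include <vector>

std::vector<int64_t> reduce_nms_mask(const std::vector<uint64_t>& mask,
                                     int n_boxes) {
  const int col_blocks = (n_boxes + 63) / 64;    // == divideUP(n_boxes, 64)
  std::vector<uint64_t> removed(col_blocks, 0);  // suppression bits so far
  std::vector<int64_t> keep;
  for (int i = 0; i < n_boxes; i++) {
    const int block = i / 64, bit = i % 64;
    if (!(removed[block] & (1ULL << bit))) {  // box i is not suppressed
      keep.push_back(i);
      // Fold box i's mask row into the running suppression set; tiles
      // before `block` can be skipped because those boxes are decided.
      for (int j = block; j < col_blocks; j++)
        removed[j] |= mask[i * col_blocks + j];
    }
  }
  return keep;
}
```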
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/external/cv/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..747327afb83900177dd4721f1b0ba99153f658d7
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh
@@ -0,0 +1,133 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
+#ifndef NMS_ROTATED_CUDA_CUH
+#define NMS_ROTATED_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+__host__ __device__ inline int divideUP(const int x, const int y) {
+ return (((x) + (y)-1) / (y));
+}
+
+namespace {
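+// 64: one thread (and one mask bit) per box in a column tile.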
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+template <typename T>
+__global__ void nms_rotated_cuda_kernel(const int n_boxes,
+ const float iou_threshold,
+ const T* dev_boxes,
+ unsigned long long* dev_mask,
+ const int multi_label) {
+ // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+ if (multi_label == 1) {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ // Compared to nms_cuda_kernel, where each box is represented with 4 values
+ // (x1, y1, x2, y2), each rotated box is represented with 5 values
+ // (x_center, y_center, width, height, angle_degrees) here.
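+ // The multi-label layout appends one trailing field per box (the class
+ // label), so global reads below use a stride of 6 while only the 5 box
+ // values are staged in shared memory.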
+ __shared__ T block_boxes[threadsPerBlock * 5];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 5 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
+ block_boxes[threadIdx.x * 5 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
+ block_boxes[threadIdx.x * 5 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
+ block_boxes[threadIdx.x * 5 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
+ block_boxes[threadIdx.x * 5 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const T* cur_box = dev_boxes + cur_box_idx * 6;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ // Instead of the devIoU used by the original horizontal NMS, here
+ // we use the single_box_iou_rotated function from
+ // box_iou_rotated_utils.hpp
+ if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) >
+ iou_threshold) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = divideUP(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ } else {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ // Compared to nms_cuda_kernel, where each box is represented with 4 values
+ // (x1, y1, x2, y2), each rotated box is represented with 5 values
+ // (x_center, y_center, width, height, angle_degrees) here.
+ __shared__ T block_boxes[threadsPerBlock * 5];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 5 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+ block_boxes[threadIdx.x * 5 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+ block_boxes[threadIdx.x * 5 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+ block_boxes[threadIdx.x * 5 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+ block_boxes[threadIdx.x * 5 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const T* cur_box = dev_boxes + cur_box_idx * 5;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ // Instead of the devIoU used by the original horizontal NMS, here
+ // we use the single_box_iou_rotated function from
+ // box_iou_rotated_utils.hpp
+ if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) >
+ iou_threshold) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = divideUP(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+ }
+}
+
+#endif
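
The launch configuration follows from the tiling: one 64-thread block per (row tile, column tile) pair of boxes. A sketch of a launch for the rotated kernel, assuming float boxes already sorted by descending score (`boxes_dev`, `mask_dev`, and `stream` are illustrative names, not the exact mmcv wrapper):

```cuda
// Illustrative launch sketch; divideUP and threadsPerBlock (= 64) are the
// helpers defined in this header.
const int col_blocks = divideUP(n_boxes, threadsPerBlock);
dim3 blocks(col_blocks, col_blocks);  // one block per 64x64 tile of box pairs
dim3 threads(threadsPerBlock);        // each thread owns one column box
nms_rotated_cuda_kernel<float><<<blocks, threads, 0, stream>>>(
    n_boxes, iou_threshold, boxes_dev, mask_dev, multi_label);
```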
diff --git a/external/cv/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh b/external/cv/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7918a57452bbde9dc7c249b0c3dd2774aa1961bf
--- /dev/null
+++ b/external/cv/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019, SenseTime.
+ */
+
+#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
+#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_
+
+#ifndef __CUDACC__
+#error cudawarpfunction.cuh should only be included by .cu files
+#endif
+#include <cuda.h>
+
+#include <parrots/foundation/common.hpp>  // parrots base types (path as in upstream mmcv)
+
+#ifdef PARROTS_USE_HALF
+#include <parrots/foundation/float16.hpp>  // float16 type used below (path as in upstream mmcv)
+#endif
+#ifdef __CUDA_ARCH__
+#define CUDA_INTRINSIC_FUNC(Expr) Expr
+#else
+#define CUDA_INTRINSIC_FUNC(Expr)
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#ifdef PARROTS_USE_HALF
+
+#if CUDA_VERSION < 9000
+
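+// float16 has no native shuffle support; each wrapper below routes the
+// shuffle through the underlying 16-bit storage member (.y).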
+__device__ inline float16 __shfl(float16 var, int srcLane, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width););
+}
+
+__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width););
+}
+
+__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width););
+}
+
+__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) {
+ CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width););
+}
+
+#else // CUDA_VERSION >= 9000
+
+__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width);
+ return r;);
+}
+
+__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var,
+ unsigned delta, int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(
+ float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;);
+}
+
+__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var,
+ unsigned delta,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(
+ float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;);
+}
+
+__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var,
+ int laneMask, int width) {
+ CUDA_INTRINSIC_FUNC(float16 r;
+ r.y = __shfl_xor_sync(mask, var.y, laneMask, width);
+ return r;);
+}
+
+#endif // CUDA_VERSION < 9000
+
+#endif // PARROTS_USE_HALF
+
+// warp shuffle interface with a dummy mask
+#if CUDA_VERSION < 9000
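+// Toolkits before CUDA 9.0 provide no *_sync variants, so these wrappers
+// accept the mask argument and ignore it.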
+
+template <typename T>
+__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane,
+ int width = warpSize) {
+ CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width););
+}
+
+template <typename T>