Spaces: Running on Zero
Upload 20 files
- difpoint/model/__init__.py +6 -0
- difpoint/model/__pycache__/__init__.cpython-310.pyc +0 -0
- difpoint/model/__pycache__/__init__.cpython-38.pyc +0 -0
- difpoint/model/__pycache__/model.cpython-310.pyc +0 -0
- difpoint/model/__pycache__/model.cpython-38.pyc +0 -0
- difpoint/model/__pycache__/model_utils.cpython-310.pyc +0 -0
- difpoint/model/__pycache__/model_utils.cpython-38.pyc +0 -0
- difpoint/model/__pycache__/point_model.cpython-310.pyc +0 -0
- difpoint/model/__pycache__/point_model.cpython-38.pyc +0 -0
- difpoint/model/model.py +409 -0
- difpoint/model/model_utils.py +35 -0
- difpoint/model/point_model.py +38 -0
- difpoint/model/temporaltrans/__pycache__/temptrans.cpython-310.pyc +0 -0
- difpoint/model/temporaltrans/__pycache__/temptrans.cpython-38.pyc +0 -0
- difpoint/model/temporaltrans/__pycache__/transformer_utils.cpython-310.pyc +0 -0
- difpoint/model/temporaltrans/__pycache__/transformer_utils.cpython-38.pyc +0 -0
- difpoint/model/temporaltrans/pointnet_util.py +311 -0
- difpoint/model/temporaltrans/pointtransformerv2.py +250 -0
- difpoint/model/temporaltrans/temptrans.py +347 -0
- difpoint/model/temporaltrans/transformer_utils.py +146 -0
difpoint/model/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .model import ConditionalPointCloudDiffusionModel


def get_model():
    model = ConditionalPointCloudDiffusionModel()
    return model
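A minimal usage sketch for the factory above (a hypothetical call site, not part of this commit; it assumes the difpoint package and its dependencies are importable):

from difpoint.model import get_model

model = get_model()            # ConditionalPointCloudDiffusionModel with the default beta schedule
print(type(model).__name__)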
difpoint/model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (325 Bytes)

difpoint/model/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (328 Bytes)

difpoint/model/__pycache__/model.cpython-310.pyc
ADDED
Binary file (5.33 kB)

difpoint/model/__pycache__/model.cpython-38.pyc
ADDED
Binary file (5.25 kB)

difpoint/model/__pycache__/model_utils.cpython-310.pyc
ADDED
Binary file (1.69 kB)

difpoint/model/__pycache__/model_utils.cpython-38.pyc
ADDED
Binary file (1.7 kB)

difpoint/model/__pycache__/point_model.cpython-310.pyc
ADDED
Binary file (1.78 kB)

difpoint/model/__pycache__/point_model.cpython-38.pyc
ADDED
Binary file (1.73 kB)
difpoint/model/model.py
ADDED
@@ -0,0 +1,409 @@
import inspect
from typing import Optional
from einops import rearrange
import torch
import torch.nn.functional as F
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from diffusers.schedulers.scheduling_pndm import PNDMScheduler

from torch import Tensor
from tqdm import tqdm
from diffusers import ModelMixin
from .model_utils import get_custom_betas
from .point_model import PointModel

import copy
class ConditionalPointCloudDiffusionModel(ModelMixin):
    def __init__(
        self,
        beta_start: float = 1e-5,
        beta_end: float = 8e-3,
        beta_schedule: str = 'linear',
        point_cloud_model: str = 'simple',
        point_cloud_model_embed_dim: int = 64,
    ):
        super().__init__()
        self.in_channels = 70  # 3 for 3D point positions
        self.out_channels = 70

        # Checks
        # Create diffusion model schedulers which define the sampling timesteps
        scheduler_kwargs = {}
        if beta_schedule == 'custom':
            scheduler_kwargs.update(dict(trained_betas=get_custom_betas(beta_start=beta_start, beta_end=beta_end)))
        else:
            scheduler_kwargs.update(dict(beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule))
        self.schedulers_map = {
            'ddpm': DDPMScheduler(**scheduler_kwargs, clip_sample=False),
            'ddim': DDIMScheduler(**scheduler_kwargs, clip_sample=False),
            'pndm': PNDMScheduler(**scheduler_kwargs),
        }
        self.scheduler = self.schedulers_map['ddim']  # this can be changed for inference

        # Create point cloud model for processing point cloud at each diffusion step
        self.point_model = PointModel(
            model_type=point_cloud_model,
            embed_dim=point_cloud_model_embed_dim,
            in_channels=self.in_channels,
            out_channels=self.out_channels,
        )

    def forward_train(
        self,
        pc: Optional[Tensor],
        ref_kps: Optional[Tensor],
        ori_kps: Optional[Tensor],
        aud_feat: Optional[Tensor],
        mode: str = 'train',
        return_intermediate_steps: bool = False
    ):

        # Normalize colors and convert to tensor
        x_0 = pc
        B, Nf, Np, D = x_0.shape  # batch, nums of frames, nums of points, 3

        x_0 = x_0[:, :, :, 0]  # batch, nums of frames, 70

        # Sample random noise
        noise = torch.randn_like(x_0)

        # Sample random timesteps for each point_cloud
        timestep = torch.randint(0, self.scheduler.num_train_timesteps, (B,),
                                 device=self.device, dtype=torch.long)

        # Add noise to points
        x_t = self.scheduler.add_noise(x_0, noise, timestep)

        # Conditioning
        ref_kps = ref_kps[:, :, 0]

        x_t_input = torch.cat([ori_kps.unsqueeze(1), ref_kps.unsqueeze(1), x_t], dim=1)

        # x_t_input = torch.cat([ref_kps.unsqueeze(1), x_t], dim=1)

        # ori_kps_repeat = torch.repeat_interleave(ori_kps.unsqueeze(1), repeats=Nf+1, dim=1)

        # x_t_input = torch.cat([x_t_input, ori_kps_repeat], dim=-1)  # B, 32+1, 51+45

        aud_feat = torch.cat([torch.zeros(B, 2, 512).cuda(), aud_feat], 1)

        # Augmentation for audio feature
        if mode in 'train':
            if torch.rand(1) > 0.3:
                mean = torch.mean(aud_feat)
                std = torch.std(aud_feat)
                sample = torch.normal(mean=torch.full(aud_feat.shape, mean), std=torch.full(aud_feat.shape, std)).cuda()
                aud_feat = sample + aud_feat
            else:
                pass
        else:
            pass

        # Forward
        noise_pred = self.point_model(x_t_input, timestep, context=aud_feat)  # torch.cat([mel_feat,style_embed],-1))
        noise_pred = noise_pred[:, 2:]
        #
        # Check
        if not noise_pred.shape == noise.shape:
            raise ValueError(f'{noise_pred.shape=} and {noise.shape=}')

        # Loss
        loss = F.mse_loss(noise_pred, noise)

        loss_pose = F.mse_loss(noise_pred[:, :, :6], noise[:, :, :6])
        loss_exp = F.mse_loss(noise_pred[:, :, 6:], noise[:, :, 6:])

        # Whether to return intermediate steps
        if return_intermediate_steps:
            return loss, (x_0, x_t, noise, noise_pred)

        return loss, loss_exp, loss_pose

    # def forward_train(
    #         self,
    #         pc: Optional[Tensor],
    #         ref_kps: Optional[Tensor],
    #         ori_kps: Optional[Tensor],
    #         aud_feat: Optional[Tensor],
    #         mode: str = 'train',
    #         return_intermediate_steps: bool = False
    # ):
    #
    #     # Normalize colors and convert to tensor
    #     x_0 = pc
    #     B, Nf, Np, D = x_0.shape  # batch, nums of frames, nums of points, 3
    #
    #     # ori_kps = torch.repeat_interleave(ori_kps.unsqueeze(1), Nf, dim=1)  # B, Nf, 45
    #     #
    #     # ref_kps = ref_kps[:, :, 0]
    #     # ref_kps = torch.repeat_interleave(ref_kps.unsqueeze(1), Nf, dim=1)  # B, Nf, 91
    #
    #     x_0 = x_0[:, :, :, 0]
    #
    #     # Sample random noise
    #     noise = torch.randn_like(x_0)
    #
    #     # Sample random timesteps for each point_cloud
    #     timestep = torch.randint(0, self.scheduler.num_train_timesteps, (B,),
    #                              device=self.device, dtype=torch.long)
    #
    #     # Add noise to points
    #     x_t = self.scheduler.add_noise(x_0, noise, timestep)
    #
    #     # Conditioning
    #     ref_kps = ref_kps[:, :, 0]
    #
    #     # x_t_input = torch.cat([ref_kps.unsqueeze(1), x_t], dim=1)
    #
    #     # x_0 = torch.cat([x_0, ref_kps, ori_kps], dim=2)  # B, Nf, 91+91+45
    #
    #     x_t_input = torch.cat([ref_kps.unsqueeze(1), x_t], dim=1)
    #     # x_t_input = torch.cat([ori_kps.unsqueeze(1), ref_kps.unsqueeze(1), x_t], dim=1)
    #
    #     aud_feat = torch.cat([torch.zeros(B, 1, 512).cuda(), aud_feat], 1)
    #
    #     # Augmentation for audio feature
    #     if mode in 'train':
    #         if torch.rand(1) > 0.3:
    #             mean = torch.mean(aud_feat)
    #             std = torch.std(aud_feat)
    #             sample = torch.normal(mean=torch.full(aud_feat.shape, mean), std=torch.full(aud_feat.shape, std)).cuda()
    #             aud_feat = sample + aud_feat
    #         else:
    #             pass
    #     else:
    #         pass
    #
    #     # Forward
    #     noise_pred = self.point_model(x_t_input, timestep, context=aud_feat)
    #     noise_pred = noise_pred[:, 1:]
    #
    #     # Check
    #     # if not noise_pred.shape == noise.shape:
    #     #     raise ValueError(f'{noise_pred.shape=} and {noise.shape=}')
    #
    #     # Loss
    #     loss = F.mse_loss(noise_pred, noise)
    #
    #     # loss_kp = F.mse_loss(noise_pred[:, :, :45], noise[:, :, :45])
    #
    #     # Whether to return intermediate steps
    #     if return_intermediate_steps:
    #         return loss, (x_0, x_t, noise, noise_pred)
    #
    #     return loss

    # @torch.no_grad()
    # def forward_sample(
    #         self,
    #         num_points: int,
    #         ref_kps: Optional[Tensor],
    #         ori_kps: Optional[Tensor],
    #         aud_feat: Optional[Tensor],
    #         # Optional overrides
    #         scheduler: Optional[str] = 'ddpm',
    #         # Inference parameters
    #         num_inference_steps: Optional[int] = 1000,
    #         eta: Optional[float] = 0.0,  # for DDIM
    #         # Whether to return all the intermediate steps in generation
    #         return_sample_every_n_steps: int = -1,
    #         # Whether to disable tqdm
    #         disable_tqdm: bool = False,
    # ):
    #
    #     # Get scheduler from mapping, or use self.scheduler if None
    #     scheduler = self.scheduler if scheduler is None else self.schedulers_map[scheduler]
    #
    #     # Get the size of the noise
    #     Np = num_points
    #     Nf = aud_feat.size(1)
    #     B = 1
    #     D = 3
    #     device = self.device
    #
    #     # Sample noise
    #     x_t = torch.randn(B, Nf, Np, D, device=device)
    #
    #     x_t = x_t[:, :, :, 0]
    #
    #     # ori_kps = torch.repeat_interleave(ori_kps.unsqueeze(1), Nf, dim=1)  # B, Nf, 45
    #
    #     ref_kps = ref_kps[:, :, 0]
    #     # ref_kps = torch.repeat_interleave(ref_kps.unsqueeze(1), Nf, dim=1)  # B, Nf, 91
    #
    #     # Set timesteps
    #     accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
    #     extra_set_kwargs = {"offset": 1} if accepts_offset else {}
    #     scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
    #
    #     # Prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
    #     # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
    #     # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    #     # and should be between [0, 1]
    #     accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
    #     extra_step_kwargs = {"eta": eta} if accepts_eta else {}
    #
    #     # Loop over timesteps
    #     all_outputs = []
    #     return_all_outputs = (return_sample_every_n_steps > 0)
    #     progress_bar = tqdm(scheduler.timesteps.to(device), desc=f'Sampling ({x_t.shape})', disable=disable_tqdm)
    #
    #     # ori_kps = torch.repeat_interleave(ori_kps[:, 6:].unsqueeze(1), Nf + 1, dim=1)
    #     aud_feat = torch.cat([torch.zeros(B, 1, 512).cuda(), aud_feat], 1)
    #     # aud_feat = torch.cat([ori_kps, aud_feat], -1)
    #
    #     # aud_feat = torch.cat([torch.zeros(B, 2, 512).cuda(), aud_feat], 1)
    #
    #     for i, t in enumerate(progress_bar):
    #
    #         # Conditioning
    #         x_t_input = torch.cat([ref_kps.unsqueeze(1).detach(), x_t], dim=1)
    #         # x_t_input = torch.cat([ori_kps.unsqueeze(1).detach(), ref_kps.unsqueeze(1).detach(), x_t], dim=1)
    #         # x_t_input = torch.cat([x_t, ref_kps, ori_kps], dim=2)  # B, Nf, 91+91+45
    #
    #         # Forward
    #         # noise_pred = self.point_model(x_t_input, t.reshape(1).expand(B), context=aud_feat)[:, 1:]
    #         noise_pred = self.point_model(x_t_input, t.reshape(1).expand(B), context=aud_feat)[:, 1:]
    #
    #         # noise_pred = noise_pred[:, :, :51]
    #
    #         # Step
    #         # x_t = x_t[:, :, :51]
    #         x_t = scheduler.step(noise_pred, t, x_t, **extra_step_kwargs).prev_sample
    #
    #         # Append to output list if desired
    #         if (return_all_outputs and (i % return_sample_every_n_steps == 0 or i == len(scheduler.timesteps) - 1)):
    #             all_outputs.append(x_t)
    #
    #     # Convert output back into a point cloud, undoing normalization and scaling
    #     output = x_t
    #     output = torch.stack([output, output, output], -1)
    #     if return_all_outputs:
    #         all_outputs = torch.stack(all_outputs, dim=1)  # (B, sample_steps, N, D)
    #     return (output, all_outputs) if return_all_outputs else output

    @torch.no_grad()
    def forward_sample(
        self,
        num_points: int,
        ref_kps: Optional[Tensor],
        ori_kps: Optional[Tensor],
        aud_feat: Optional[Tensor],
        # Optional overrides
        scheduler: Optional[str] = 'ddpm',
        # Inference parameters
        num_inference_steps: Optional[int] = 1000,
        eta: Optional[float] = 0.0,  # for DDIM
        # Whether to return all the intermediate steps in generation
        return_sample_every_n_steps: int = -1,
        # Whether to disable tqdm
        disable_tqdm: bool = False,
    ):

        # Get scheduler from mapping, or use self.scheduler if None
        scheduler = self.scheduler if scheduler is None else self.schedulers_map[scheduler]

        # Get the size of the noise
        Np = num_points
        Nf = aud_feat.size(1)
        B = 1
        D = 3
        device = self.device

        # Sample noise
        x_t = torch.randn(B, Nf, Np, D, device=device)

        x_t = x_t[:, :, :, 0]

        ref_kps = ref_kps[:, :, 0]

        # Set timesteps
        accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {"offset": 1} if accepts_offset else {}
        scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

        # Prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
        extra_step_kwargs = {"eta": eta} if accepts_eta else {}

        # Loop over timesteps
        all_outputs = []
        return_all_outputs = (return_sample_every_n_steps > 0)
        progress_bar = tqdm(scheduler.timesteps.to(device), desc=f'Sampling ({x_t.shape})', disable=disable_tqdm)

        # ori_kps = torch.repeat_interleave(ori_kps[:, 6:].unsqueeze(1), Nf + 1, dim=1)
        # aud_feat = torch.cat([torch.zeros(B, 1, 512).cuda(), aud_feat], 1)
        # aud_feat = torch.cat([ori_kps, aud_feat], -1)

        aud_feat = torch.cat([torch.zeros(B, 2, 512).cuda(), aud_feat], 1)

        for i, t in enumerate(progress_bar):

            # Conditioning
            # x_t_input = torch.cat([ref_kps.unsqueeze(1), x_t], dim=1)
            #
            # ori_kps_repeat = torch.repeat_interleave(ori_kps.unsqueeze(1), repeats=Nf + 1, dim=1)
            #
            # x_t_input = torch.cat([x_t_input.detach(), ori_kps_repeat.detach()], dim=-1)  # B, 32+1, 51+45

            x_t_input = torch.cat([ori_kps.unsqueeze(1).detach(), ref_kps.unsqueeze(1).detach(), x_t], dim=1)
            # x_t_input = torch.cat([ref_kps.unsqueeze(1).detach(), x_t], dim=1)

            # Forward
            # noise_pred = self.point_model(x_t_input, t.reshape(1).expand(B), context=aud_feat)[:, 1:]
            noise_pred = self.point_model(x_t_input, t.reshape(1).expand(B), context=aud_feat)[:, 2:]

            # Step
            x_t = scheduler.step(noise_pred, t, x_t, **extra_step_kwargs).prev_sample

            # Append to output list if desired
            if (return_all_outputs and (i % return_sample_every_n_steps == 0 or i == len(scheduler.timesteps) - 1)):
                all_outputs.append(x_t)

        # Convert output back into a point cloud, undoing normalization and scaling
        output = x_t
        output = torch.stack([output, output, output], -1)
        if return_all_outputs:
            all_outputs = torch.stack(all_outputs, dim=1)  # (B, sample_steps, N, D)
        return (output, all_outputs) if return_all_outputs else output

    def forward(self, batch: dict, mode: str = 'train', **kwargs):
        """A wrapper around the forward method for training and inference"""

        if mode == 'train':
            return self.forward_train(
                pc=batch['sequence_keypoints'],
                ref_kps=batch['ref_keypoint'],
                ori_kps=batch['ori_keypoint'],
                aud_feat=batch['aud_feat'],
                mode='train',
                **kwargs)
        elif mode == 'val':
            return self.forward_train(
                pc=batch['sequence_keypoints'],
                ref_kps=batch['ref_keypoint'],
                ori_kps=batch['ori_keypoint'],
                aud_feat=batch['aud_feat'],
                mode='val',
                **kwargs)
        elif mode == 'sample':
            num_points = 68
            return self.forward_sample(
                num_points=num_points,
                ref_kps=batch['ref_keypoint'],
                ori_kps=batch['ori_keypoint'],
                aud_feat=batch['aud_feat'],
                **kwargs)
        else:
            raise NotImplementedError()
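A hedged sketch of how the wrapper above might be driven for training. The batch keys come from forward() in this file; the tensor shapes are assumptions inferred from the indexing in forward_train (70 keypoint channels, 512-dim audio features), and a CUDA device is required because of the hard-coded .cuda() calls:

import torch
from difpoint.model import get_model

model = get_model().cuda()

B, Nf = 2, 32                                                # assumed batch size and frame count
batch = {
    'sequence_keypoints': torch.randn(B, Nf, 70, 3).cuda(),  # x_0; only channel 0 is used
    'ref_keypoint': torch.randn(B, 70, 3).cuda(),            # reference keypoints, sliced to [:, :, 0]
    'ori_keypoint': torch.randn(B, 70).cuda(),               # original keypoints, prepended as one frame
    'aud_feat': torch.randn(B, Nf, 512).cuda(),              # per-frame audio features
}

loss, loss_exp, loss_pose = model(batch, mode='train')       # dispatches to forward_train
loss.backward()

For mode='sample', forward() fixes num_points at 68 and passes the same ref_keypoint, ori_keypoint and aud_feat entries to forward_sample, which returns the denoised keypoint sequence stacked into three identical channels.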
difpoint/model/model_utils.py
ADDED
@@ -0,0 +1,35 @@
import cv2
import numpy as np
import torch
import torch.nn as nn


def set_requires_grad(module: nn.Module, requires_grad: bool):
    for p in module.parameters():
        p.requires_grad_(requires_grad)


def compute_distance_transform(mask: torch.Tensor):
    image_size = mask.shape[-1]
    distance_transform = torch.stack([
        torch.from_numpy(cv2.distanceTransform(
            (1 - m), distanceType=cv2.DIST_L2, maskSize=cv2.DIST_MASK_3
        ) / (image_size / 2))
        for m in mask.squeeze(1).detach().cpu().numpy().astype(np.uint8)
    ]).unsqueeze(1).clip(0, 1).to(mask.device)
    return distance_transform


def default(x, d):
    return d if x is None else x

def get_custom_betas(beta_start: float, beta_end: float, warmup_frac: float = 0.3, num_train_timesteps: int = 1000):
    """Custom beta schedule"""
    betas = np.linspace(beta_start, beta_end, num_train_timesteps, dtype=np.float32)
    warmup_frac = 0.3
    warmup_time = int(num_train_timesteps * warmup_frac)
    warmup_steps = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
    warmup_time = min(warmup_time, num_train_timesteps)
    betas[:warmup_time] = warmup_steps[:warmup_time]
    return betas
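As a point of reference, get_custom_betas is consumed through the diffusers trained_betas argument; this sketch simply mirrors the 'custom' branch of model.py above rather than adding new behaviour:

from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from difpoint.model.model_utils import get_custom_betas

betas = get_custom_betas(beta_start=1e-5, beta_end=8e-3)           # length-1000 numpy array with a 30% warmup segment
scheduler = DDPMScheduler(trained_betas=betas, clip_sample=False)  # scheduler uses the custom schedule as-is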
difpoint/model/point_model.py
ADDED
@@ -0,0 +1,38 @@
import torch
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers import ModelMixin
from torch import Tensor

from .temporaltrans.temptrans import SimpleTemperalPointModel, SimpleTransModel

class PointModel(ModelMixin, ConfigMixin):
    @register_to_config
    def __init__(
        self,
        model_type: str = 'pvcnn',
        in_channels: int = 3,
        out_channels: int = 3,
        embed_dim: int = 64,
        dropout: float = 0.1,
        width_multiplier: int = 1,
        voxel_resolution_multiplier: int = 1,
    ):
        super().__init__()
        self.model_type = model_type
        if self.model_type == 'simple':
            self.autocast_context = torch.autocast('cuda', dtype=torch.float32)
            self.model = SimpleTransModel(
                embed_dim=embed_dim,
                num_classes=out_channels,
                extra_feature_channels=(in_channels - 3),
            )
            self.model.output_projection.bias.data.normal_(0, 1e-6)
            self.model.output_projection.weight.data.normal_(0, 1e-6)
        else:
            raise NotImplementedError()

    def forward(self, inputs: Tensor, t: Tensor, context=None) -> Tensor:
        """ Receives input of shape (B, N, in_channels) and returns output
        of shape (B, N, out_channels) """
        with self.autocast_context:
            return self.model(inputs, t, context)
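A hedged instantiation sketch for PointModel (the shapes are assumptions based on the 70-channel keypoint layout used elsewhere in this commit; BaseTemperalPointModel from transformer_utils.py is assumed to supply dim, time_mlp and output_projection, and a CUDA device is needed for the torch.autocast('cuda', ...) context):

import torch
from difpoint.model.point_model import PointModel

net = PointModel(model_type='simple', in_channels=70, out_channels=70, embed_dim=64).cuda()
x = torch.randn(2, 34, 70).cuda()        # (B, frames, in_channels): 2 conditioning frames + 32 noisy frames
t = torch.randint(0, 1000, (2,)).cuda()  # one diffusion timestep per batch element
ctx = torch.randn(2, 34, 512).cuda()     # per-frame audio context, matching the frame count of x
eps = net(x, t, context=ctx)             # (B, frames, out_channels) noise prediction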
difpoint/model/temporaltrans/__pycache__/temptrans.cpython-310.pyc
ADDED
Binary file (11 kB)

difpoint/model/temporaltrans/__pycache__/temptrans.cpython-38.pyc
ADDED
Binary file (11.1 kB)

difpoint/model/temporaltrans/__pycache__/transformer_utils.cpython-310.pyc
ADDED
Binary file (5.09 kB)

difpoint/model/temporaltrans/__pycache__/transformer_utils.cpython-38.pyc
ADDED
Binary file (5.09 kB)
difpoint/model/temporaltrans/pointnet_util.py
ADDED
@@ -0,0 +1,311 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time
import numpy as np


# reference https://github.com/yanx27/Pointnet_Pointnet2_pytorch, modified by Yang You


def timeit(tag, t):
    print("{}: {}s".format(tag, time() - t))
    return time()

def pc_normalize(pc):
    centroid = np.mean(pc, axis=0)
    pc = pc - centroid
    m = np.max(np.sqrt(np.sum(pc**2, axis=1)))
    pc = pc / m
    return pc

def square_distance(src, dst):
    """
    Calculate Euclid distance between each two points.
    src^T * dst = xn * xm + yn * ym + zn * zm;
    sum(src^2, dim=-1) = xn*xn + yn*yn + zn*zn;
    sum(dst^2, dim=-1) = xm*xm + ym*ym + zm*zm;
    dist = (xn - xm)^2 + (yn - ym)^2 + (zn - zm)^2
         = sum(src**2,dim=-1)+sum(dst**2,dim=-1)-2*src^T*dst
    Input:
        src: source points, [B, N, C]
        dst: target points, [B, M, C]
    Output:
        dist: per-point square distance, [B, N, M]
    """
    return torch.sum((src[:, :, None] - dst[:, None]) ** 2, dim=-1)


def index_points(points, idx):
    """
    Input:
        points: input points data, [B, N, C]
        idx: sample index data, [B, S, [K]]
    Return:
        new_points:, indexed points data, [B, S, [K], C]
    """
    raw_size = idx.size()
    idx = idx.reshape(raw_size[0], -1)
    res = torch.gather(points, 1, idx[..., None].expand(-1, -1, points.size(-1)))
    return res.reshape(*raw_size, -1)


def farthest_point_sample(xyz, npoint):
    """
    Input:
        xyz: pointcloud data, [B, N, 3]
        npoint: number of samples
    Return:
        centroids: sampled pointcloud index, [B, npoint]
    """
    device = xyz.device
    B, N, C = xyz.shape
    centroids = torch.zeros(B, npoint, dtype=torch.long).to(device)
    distance = torch.ones(B, N).to(device) * 1e10
    farthest = torch.randint(0, N, (B,), dtype=torch.long).to(device)
    batch_indices = torch.arange(B, dtype=torch.long).to(device)
    for i in range(npoint):
        centroids[:, i] = farthest
        centroid = xyz[batch_indices, farthest, :].view(B, 1, 3)
        dist = torch.sum((xyz - centroid) ** 2, -1)
        distance = torch.min(distance, dist)
        farthest = torch.max(distance, -1)[1]
    return centroids


def query_ball_point(radius, nsample, xyz, new_xyz):
    """
    Input:
        radius: local region radius
        nsample: max sample number in local region
        xyz: all points, [B, N, 3]
        new_xyz: query points, [B, S, 3]
    Return:
        group_idx: grouped points index, [B, S, nsample]
    """
    device = xyz.device
    B, N, C = xyz.shape
    _, S, _ = new_xyz.shape
    group_idx = torch.arange(N, dtype=torch.long).to(device).view(1, 1, N).repeat([B, S, 1])
    sqrdists = square_distance(new_xyz, xyz)
    group_idx[sqrdists > radius ** 2] = N
    group_idx = group_idx.sort(dim=-1)[0][:, :, :nsample]
    group_first = group_idx[:, :, 0].view(B, S, 1).repeat([1, 1, nsample])
    mask = group_idx == N
    group_idx[mask] = group_first[mask]
    return group_idx


def sample_and_group(npoint, radius, nsample, xyz, points, returnfps=False, knn=False):
    """
    Input:
        npoint:
        radius:
        nsample:
        xyz: input points position data, [B, N, 3]
        points: input points data, [B, N, D]
    Return:
        new_xyz: sampled points position data, [B, npoint, nsample, 3]
        new_points: sampled points data, [B, npoint, nsample, 3+D]
    """
    B, N, C = xyz.shape
    S = npoint
    fps_idx = farthest_point_sample(xyz, npoint)  # [B, npoint]
    torch.cuda.empty_cache()
    new_xyz = index_points(xyz, fps_idx)
    torch.cuda.empty_cache()
    if knn:
        dists = square_distance(new_xyz, xyz)  # B x npoint x N
        idx = dists.argsort()[:, :, :nsample]  # B x npoint x K
    else:
        idx = query_ball_point(radius, nsample, xyz, new_xyz)
    torch.cuda.empty_cache()
    grouped_xyz = index_points(xyz, idx)  # [B, npoint, nsample, C]
    torch.cuda.empty_cache()
    grouped_xyz_norm = grouped_xyz - new_xyz.view(B, S, 1, C)
    torch.cuda.empty_cache()

    if points is not None:
        grouped_points = index_points(points, idx)
        new_points = torch.cat([grouped_xyz_norm, grouped_points], dim=-1)  # [B, npoint, nsample, C+D]
    else:
        new_points = grouped_xyz_norm
    if returnfps:
        return new_xyz, new_points, grouped_xyz, fps_idx
    else:
        return new_xyz, new_points


def sample_and_group_all(xyz, points):
    """
    Input:
        xyz: input points position data, [B, N, 3]
        points: input points data, [B, N, D]
    Return:
        new_xyz: sampled points position data, [B, 1, 3]
        new_points: sampled points data, [B, 1, N, 3+D]
    """
    device = xyz.device
    B, N, C = xyz.shape
    new_xyz = torch.zeros(B, 1, C).to(device)
    grouped_xyz = xyz.view(B, 1, N, C)
    if points is not None:
        new_points = torch.cat([grouped_xyz, points.view(B, 1, N, -1)], dim=-1)
    else:
        new_points = grouped_xyz
    return new_xyz, new_points


class PointNetSetAbstraction(nn.Module):
    def __init__(self, npoint, radius, nsample, in_channel, mlp, group_all, knn=False):
        super(PointNetSetAbstraction, self).__init__()
        self.npoint = npoint
        self.radius = radius
        self.nsample = nsample
        self.knn = knn
        self.mlp_convs = nn.ModuleList()
        self.mlp_bns = nn.ModuleList()
        last_channel = in_channel
        for out_channel in mlp:
            self.mlp_convs.append(nn.Conv2d(last_channel, out_channel, 1))
            self.mlp_bns.append(nn.BatchNorm2d(out_channel))
            last_channel = out_channel
        self.group_all = group_all

    def forward(self, xyz, points):
        """
        Input:
            xyz: input points position data, [B, N, C]
            points: input points data, [B, N, C]
        Return:
            new_xyz: sampled points position data, [B, S, C]
            new_points_concat: sample points feature data, [B, S, D']
        """
        if self.group_all:
            new_xyz, new_points = sample_and_group_all(xyz, points)
        else:
            new_xyz, new_points = sample_and_group(self.npoint, self.radius, self.nsample, xyz, points, knn=self.knn)
        # new_xyz: sampled points position data, [B, npoint, C]
        # new_points: sampled points data, [B, npoint, nsample, C+D]
        new_points = new_points.permute(0, 3, 2, 1)  # [B, C+D, nsample, npoint]
        for i, conv in enumerate(self.mlp_convs):
            bn = self.mlp_bns[i]
            new_points = F.relu(bn(conv(new_points)))

        new_points = torch.max(new_points, 2)[0].transpose(1, 2)
        return new_xyz, new_points


class PointNetSetAbstractionMsg(nn.Module):
    def __init__(self, npoint, radius_list, nsample_list, in_channel, mlp_list, knn=False):
        super(PointNetSetAbstractionMsg, self).__init__()
        self.npoint = npoint
        self.radius_list = radius_list
        self.nsample_list = nsample_list
        self.knn = knn
        self.conv_blocks = nn.ModuleList()
        self.bn_blocks = nn.ModuleList()
        for i in range(len(mlp_list)):
            convs = nn.ModuleList()
            bns = nn.ModuleList()
            last_channel = in_channel + 3
            for out_channel in mlp_list[i]:
                convs.append(nn.Conv2d(last_channel, out_channel, 1))
                bns.append(nn.BatchNorm2d(out_channel))
                last_channel = out_channel
            self.conv_blocks.append(convs)
            self.bn_blocks.append(bns)

    def forward(self, xyz, points, seed_idx=None):
        """
        Input:
            xyz: input points position data, [B, C, N]
            points: input points data, [B, D, N]
        Return:
            new_xyz: sampled points position data, [B, C, S]
            new_points_concat: sample points feature data, [B, D', S]
        """

        B, N, C = xyz.shape
        S = self.npoint
        new_xyz = index_points(xyz, farthest_point_sample(xyz, S) if seed_idx is None else seed_idx)
        new_points_list = []
        for i, radius in enumerate(self.radius_list):
            K = self.nsample_list[i]
            if self.knn:
                dists = square_distance(new_xyz, xyz)  # B x npoint x N
                group_idx = dists.argsort()[:, :, :K]  # B x npoint x K
            else:
                group_idx = query_ball_point(radius, K, xyz, new_xyz)
            grouped_xyz = index_points(xyz, group_idx)
            grouped_xyz -= new_xyz.view(B, S, 1, C)
            if points is not None:
                grouped_points = index_points(points, group_idx)
                grouped_points = torch.cat([grouped_points, grouped_xyz], dim=-1)
            else:
                grouped_points = grouped_xyz

            grouped_points = grouped_points.permute(0, 3, 2, 1)  # [B, D, K, S]
            for j in range(len(self.conv_blocks[i])):
                conv = self.conv_blocks[i][j]
                bn = self.bn_blocks[i][j]
                grouped_points = F.relu(bn(conv(grouped_points)))
            new_points = torch.max(grouped_points, 2)[0]  # [B, D', S]
            new_points_list.append(new_points)

        new_points_concat = torch.cat(new_points_list, dim=1).transpose(1, 2)
        return new_xyz, new_points_concat


# NoteL this function swaps N and C
class PointNetFeaturePropagation(nn.Module):
    def __init__(self, in_channel, mlp):
        super(PointNetFeaturePropagation, self).__init__()
        self.mlp_convs = nn.ModuleList()
        self.mlp_bns = nn.ModuleList()
        last_channel = in_channel
        for out_channel in mlp:
            self.mlp_convs.append(nn.Conv1d(last_channel, out_channel, 1))
            self.mlp_bns.append(nn.BatchNorm1d(out_channel))
            last_channel = out_channel

    def forward(self, xyz1, xyz2, points1, points2):
        """
        Input:
            xyz1: input points position data, [B, C, N]
            xyz2: sampled input points position data, [B, C, S]
            points1: input points data, [B, D, N]
            points2: input points data, [B, D, S]
        Return:
            new_points: upsampled points data, [B, D', N]
        """
        xyz1 = xyz1.permute(0, 2, 1)
        xyz2 = xyz2.permute(0, 2, 1)

        points2 = points2.permute(0, 2, 1)
        B, N, C = xyz1.shape
        _, S, _ = xyz2.shape

        if S == 1:
            interpolated_points = points2.repeat(1, N, 1)
        else:
            dists = square_distance(xyz1, xyz2)
            dists, idx = dists.sort(dim=-1)
            dists, idx = dists[:, :, :3], idx[:, :, :3]  # [B, N, 3]

            dist_recip = 1.0 / (dists + 1e-8)
            norm = torch.sum(dist_recip, dim=2, keepdim=True)
            weight = dist_recip / norm
            interpolated_points = torch.sum(index_points(points2, idx) * weight.view(B, N, 3, 1), dim=2)

        if points1 is not None:
            points1 = points1.permute(0, 2, 1)
            new_points = torch.cat([points1, interpolated_points], dim=-1)
        else:
            new_points = interpolated_points

        new_points = new_points.permute(0, 2, 1)
        for i, conv in enumerate(self.mlp_convs):
            bn = self.mlp_bns[i]
            new_points = F.relu(bn(conv(new_points)))
        return new_points
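A small self-contained sketch of the sampling and grouping utilities above on random data (the torch.cuda.empty_cache() calls inside sample_and_group are no-ops when CUDA is not initialised):

import torch
from difpoint.model.temporaltrans.pointnet_util import farthest_point_sample, sample_and_group

xyz = torch.rand(2, 1024, 3)               # (B, N, 3) point positions
fps_idx = farthest_point_sample(xyz, 128)  # (B, 128) indices of the FPS centroids
new_xyz, new_points = sample_and_group(
    npoint=128, radius=0.2, nsample=16, xyz=xyz, points=None)
# new_xyz: (2, 128, 3) centroids; new_points: (2, 128, 16, 3) per-group offsets from each centroid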
difpoint/model/temporaltrans/pointtransformerv2.py
ADDED
@@ -0,0 +1,250 @@
from .transformer_utils import BaseTemperalPointModel
from copy import deepcopy
import torch
import einops
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
from einops import rearrange
import pointops
from pointcept.models.utils import offset2batch, batch2offset
class PointBatchNorm(nn.Module):
    """
    Batch Normalization for Point Clouds data in shape of [B*N, C], [B*N, L, C]
    """

    def __init__(self, embed_channels):
        super().__init__()
        self.norm = nn.BatchNorm1d(embed_channels)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if input.dim() == 3:
            return (
                self.norm(input.transpose(1, 2).contiguous())
                .transpose(1, 2)
                .contiguous()
            )
        elif input.dim() == 2:
            return self.norm(input)
        else:
            raise NotImplementedError
#https://github.com/Pointcept/Pointcept/blob/main/pointcept/models/point_transformer_v2/point_transformer_v2m2_base.py
class GroupedVectorAttention(nn.Module):
    def __init__(
        self,
        embed_channels,
        groups,
        attn_drop_rate=0.0,
        qkv_bias=True,
        pe_multiplier=False,
        pe_bias=True,
    ):
        super(GroupedVectorAttention, self).__init__()
        self.embed_channels = embed_channels
        self.groups = groups
        assert embed_channels % groups == 0
        self.attn_drop_rate = attn_drop_rate
        self.qkv_bias = qkv_bias
        self.pe_multiplier = pe_multiplier
        self.pe_bias = pe_bias

        self.linear_q = nn.Sequential(
            nn.Linear(embed_channels, embed_channels, bias=qkv_bias),
            PointBatchNorm(embed_channels),
            nn.ReLU(inplace=True),
        )
        self.linear_k = nn.Sequential(
            nn.Linear(embed_channels, embed_channels, bias=qkv_bias),
            PointBatchNorm(embed_channels),
            nn.ReLU(inplace=True),
        )

        self.linear_v = nn.Linear(embed_channels, embed_channels, bias=qkv_bias)

        if self.pe_multiplier:
            self.linear_p_multiplier = nn.Sequential(
                nn.Linear(3, embed_channels),
                PointBatchNorm(embed_channels),
                nn.ReLU(inplace=True),
                nn.Linear(embed_channels, embed_channels),
            )
        if self.pe_bias:
            self.linear_p_bias = nn.Sequential(
                nn.Linear(3, embed_channels),
                PointBatchNorm(embed_channels),
                nn.ReLU(inplace=True),
                nn.Linear(embed_channels, embed_channels),
            )
        self.weight_encoding = nn.Sequential(
            nn.Linear(embed_channels, groups),
            PointBatchNorm(groups),
            nn.ReLU(inplace=True),
            nn.Linear(groups, groups),
        )
        self.softmax = nn.Softmax(dim=1)
        self.attn_drop = nn.Dropout(attn_drop_rate)

    def forward(self, feat, coord, reference_index):
        query, key, value = (
            self.linear_q(feat),
            self.linear_k(feat),
            self.linear_v(feat),
        )
        key = pointops.grouping(reference_index, key, coord, with_xyz=True)
        value = pointops.grouping(reference_index, value, coord, with_xyz=False)
        pos, key = key[:, :, 0:3], key[:, :, 3:]
        relation_qk = key - query.unsqueeze(1)
        if self.pe_multiplier:
            pem = self.linear_p_multiplier(pos)
            relation_qk = relation_qk * pem
        if self.pe_bias:
            peb = self.linear_p_bias(pos)
            relation_qk = relation_qk + peb
            value = value + peb

        weight = self.weight_encoding(relation_qk)
        weight = self.attn_drop(self.softmax(weight))

        mask = torch.sign(reference_index + 1)
        weight = torch.einsum("n s g, n s -> n s g", weight, mask)
        value = einops.rearrange(value, "n ns (g i) -> n ns g i", g=self.groups)
        feat = torch.einsum("n s g i, n s g -> n g i", value, weight)
        feat = einops.rearrange(feat, "n g i -> n (g i)")
        return feat

class BlockSequence(nn.Module):
    def __init__(
        self,
        depth,
        embed_channels,
        groups,
        neighbours=16,
        qkv_bias=True,
        pe_multiplier=False,
        pe_bias=True,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        enable_checkpoint=False,
    ):
        super(BlockSequence, self).__init__()

        if isinstance(drop_path_rate, list):
            drop_path_rates = drop_path_rate
            assert len(drop_path_rates) == depth
        elif isinstance(drop_path_rate, float):
            drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)]
        else:
            drop_path_rates = [0.0 for _ in range(depth)]

        self.neighbours = neighbours
        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                embed_channels=embed_channels,
                groups=groups,
                qkv_bias=qkv_bias,
                pe_multiplier=pe_multiplier,
                pe_bias=pe_bias,
                attn_drop_rate=attn_drop_rate,
                drop_path_rate=drop_path_rates[i],
                enable_checkpoint=enable_checkpoint,
            )
            self.blocks.append(block)

    def forward(self, points):
        coord, feat, offset = points
        # reference index query of neighbourhood attention
        # for windows attention, modify reference index query method
        reference_index, _ = pointops.knn_query(self.neighbours, coord, offset)
        for block in self.blocks:
            points = block(points, reference_index)
        return points

class GVAPatchEmbed(nn.Module):
    def __init__(
        self,
        depth,
        in_channels,
        embed_channels,
        groups,
        neighbours=16,
        qkv_bias=True,
        pe_multiplier=False,
        pe_bias=True,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        enable_checkpoint=False,
    ):
        super(GVAPatchEmbed, self).__init__()
        self.in_channels = in_channels
        self.embed_channels = embed_channels
        self.proj = nn.Sequential(
            nn.Linear(in_channels, embed_channels, bias=False),
            PointBatchNorm(embed_channels),
            nn.ReLU(inplace=True),
        )
        self.blocks = BlockSequence(
            depth=depth,
            embed_channels=embed_channels,
            groups=groups,
            neighbours=neighbours,
            qkv_bias=qkv_bias,
            pe_multiplier=pe_multiplier,
            pe_bias=pe_bias,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            enable_checkpoint=enable_checkpoint,
        )

    def forward(self, points):
        coord, feat, offset = points
        feat = self.proj(feat)
        return self.blocks([coord, feat, offset])


class Block(nn.Module):
    def __init__(
        self,
        embed_channels,
        groups,
        qkv_bias=True,
        pe_multiplier=False,
        pe_bias=True,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        enable_checkpoint=False,
    ):
        super(Block, self).__init__()
        self.attn = GroupedVectorAttention(
            embed_channels=embed_channels,
            groups=groups,
            qkv_bias=qkv_bias,
            attn_drop_rate=attn_drop_rate,
            pe_multiplier=pe_multiplier,
            pe_bias=pe_bias,
        )
        self.fc1 = nn.Linear(embed_channels, embed_channels, bias=False)
        self.fc3 = nn.Linear(embed_channels, embed_channels, bias=False)
        self.norm1 = PointBatchNorm(embed_channels)
        self.norm2 = PointBatchNorm(embed_channels)
        self.norm3 = PointBatchNorm(embed_channels)
        self.act = nn.ReLU(inplace=True)
        self.enable_checkpoint = enable_checkpoint
        self.drop_path = (
            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        )

    def forward(self, points, reference_index):
        coord, feat, offset = points
        identity = feat
        feat = self.act(self.norm1(self.fc1(feat)))
        feat = (
            self.attn(feat, coord, reference_index)
            if not self.enable_checkpoint
            else checkpoint(self.attn, feat, coord, reference_index)
        )
        feat = self.act(self.norm2(feat))
        feat = self.norm3(self.fc3(feat))
        feat = identity + self.drop_path(feat)
        feat = self.act(feat)
        return [coord, feat, offset]
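Note that Block above references DropPath and checkpoint without importing them in this hunk; in the upstream Pointcept point_transformer_v2m2_base.py these come from timm and torch.utils.checkpoint, so a user of this module would presumably need imports along these lines:

from timm.models.layers import DropPath        # assumed source, following the upstream Pointcept code
from torch.utils.checkpoint import checkpoint  # assumed source for the enable_checkpoint branch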
difpoint/model/temporaltrans/temptrans.py
ADDED
@@ -0,0 +1,347 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torch import nn
|
5 |
+
from einops import rearrange
|
6 |
+
from .transformer_utils import BaseTemperalPointModel
|
7 |
+
import math
|
8 |
+
from einops_exts import check_shape, rearrange_many
|
9 |
+
from functools import partial
|
10 |
+
|
11 |
+
class SinusoidalPosEmb(nn.Module):
|
12 |
+
def __init__(self, dim):
|
13 |
+
super().__init__()
|
14 |
+
self.dim = dim
|
15 |
+
|
16 |
+
def forward(self, x):
|
17 |
+
device = x.device
|
18 |
+
half_dim = self.dim // 2
|
19 |
+
emb = math.log(10000) / (half_dim - 1)
|
20 |
+
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
|
21 |
+
emb = x[:, None] * emb[None, :]
|
22 |
+
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
|
23 |
+
return emb
|
24 |
+
|
25 |
+
class RelativePositionBias(nn.Module):
|
26 |
+
def __init__(
|
27 |
+
self,
|
28 |
+
heads = 8,
|
29 |
+
num_buckets = 32,
|
30 |
+
max_distance = 128
|
31 |
+
):
|
32 |
+
super().__init__()
|
33 |
+
self.num_buckets = num_buckets
|
34 |
+
self.max_distance = max_distance
|
35 |
+
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
|
36 |
+
|
37 |
+
@staticmethod
|
38 |
+
def _relative_position_bucket(relative_position, num_buckets = 32, max_distance = 128):
|
39 |
+
ret = 0
|
40 |
+
n = -relative_position
|
41 |
+
|
42 |
+
num_buckets //= 2
|
43 |
+
ret += (n < 0).long() * num_buckets
|
44 |
+
n = torch.abs(n)
|
45 |
+
|
46 |
+
max_exact = num_buckets // 2
|
47 |
+
is_small = n < max_exact
|
48 |
+
|
49 |
+
val_if_large = max_exact + (
|
50 |
+
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
|
51 |
+
).long()
|
52 |
+
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
|
53 |
+
|
54 |
+
ret += torch.where(is_small, n, val_if_large)
|
55 |
+
return ret
|
56 |
+
|
57 |
+
def forward(self, n, device):
|
58 |
+
q_pos = torch.arange(n, dtype = torch.long, device = device)
|
59 |
+
k_pos = torch.arange(n, dtype = torch.long, device = device)
|
60 |
+
rel_pos = rearrange(k_pos, 'j -> 1 j') - rearrange(q_pos, 'i -> i 1')
|
61 |
+
rp_bucket = self._relative_position_bucket(rel_pos, num_buckets = self.num_buckets, max_distance = self.max_distance)
|
62 |
+
values = self.relative_attention_bias(rp_bucket)
|
63 |
+
return rearrange(values, 'i j h -> h i j')
|
64 |
+
def exists(x):
|
65 |
+
return x is not None
|
66 |
+
|
67 |
+
class Residual(nn.Module):
|
68 |
+
def __init__(self, fn):
|
69 |
+
super().__init__()
|
70 |
+
self.fn = fn
|
71 |
+
|
72 |
+
def forward(self, x, *args, **kwargs):
|
73 |
+
return self.fn(x, *args, **kwargs) + x
|
74 |
+
class LayerNorm(nn.Module):
|
75 |
+
def __init__(self, dim, eps = 1e-5):
|
76 |
+
super().__init__()
|
77 |
+
self.eps = eps
|
78 |
+
self.gamma = nn.Parameter(torch.ones(1, 1, dim))
|
79 |
+
self.beta = nn.Parameter(torch.zeros(1, 1, dim))
|
80 |
+
|
81 |
+
def forward(self, x):
|
82 |
+
var = torch.var(x, dim = -1, unbiased = False, keepdim = True)
|
83 |
+
mean = torch.mean(x, dim = -1, keepdim = True)
|
84 |
+
return (x - mean) / (var + self.eps).sqrt() * self.gamma + self.beta
|
85 |
+
|
86 |
+
class PreNorm(nn.Module):
|
87 |
+
def __init__(self, dim, fn):
|
88 |
+
super().__init__()
|
89 |
+
self.fn = fn
|
90 |
+
self.norm = LayerNorm(dim)
|
91 |
+
|
92 |
+
def forward(self, x, **kwargs):
|
93 |
+
x = self.norm(x)
|
94 |
+
return self.fn(x, **kwargs)
|
95 |
+
|
96 |
+
|
97 |
+
class EinopsToAndFrom(nn.Module):
|
98 |
+
def __init__(self, from_einops, to_einops, fn):
|
99 |
+
super().__init__()
|
100 |
+
self.from_einops = from_einops
|
101 |
+
self.to_einops = to_einops
|
102 |
+
self.fn = fn
|
103 |
+
|
104 |
+
def forward(self, x, **kwargs):
|
105 |
+
shape = x.shape
|
106 |
+
reconstitute_kwargs = dict(tuple(zip(self.from_einops.split(' '), shape)))
|
107 |
+
x = rearrange(x, f'{self.from_einops} -> {self.to_einops}')
|
108 |
+
x = self.fn(x, **kwargs)
|
109 |
+
x = rearrange(x, f'{self.to_einops} -> {self.from_einops}', **reconstitute_kwargs)
|
110 |
+
return x
|
111 |
+
|
112 |
+
class Attention(nn.Module):
|
113 |
+
def __init__(
|
114 |
+
self, dim, heads=4, attn_head_dim=None, casual_attn=False,rotary_emb = None):
|
115 |
+
super().__init__()
|
116 |
+
self.num_heads = heads
|
117 |
+
head_dim = dim // heads
|
118 |
+
self.casual_attn = casual_attn
|
119 |
+
|
120 |
+
if attn_head_dim is not None:
|
121 |
+
head_dim = attn_head_dim
|
122 |
+
|
123 |
+
all_head_dim = head_dim * self.num_heads
|
124 |
+
self.scale = head_dim ** -0.5
|
125 |
+
self.to_qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
|
126 |
+
self.proj = nn.Linear(all_head_dim, dim)
|
127 |
+
self.rotary_emb = rotary_emb
|
128 |
+
|
129 |
+
def forward(self, x, pos_bias = None):
|
130 |
+
N, device = x.shape[-2], x.device
|
131 |
+
qkv = self.to_qkv(x).chunk(3, dim = -1)
|
132 |
+
|
133 |
+
q, k, v = rearrange_many(qkv, '... n (h d) -> ... h n d', h=self.num_heads)
|
134 |
+
|
135 |
+
q = q * self.scale
|
136 |
+
|
137 |
+
if exists(self.rotary_emb):
|
138 |
+
q = self.rotary_emb.rotate_queries_or_keys(q)
|
139 |
+
k = self.rotary_emb.rotate_queries_or_keys(k)
|
140 |
+
|
141 |
+
sim = torch.einsum('... h i d, ... h j d -> ... h i j', q, k)
|
142 |
+
|
143 |
+
if exists(pos_bias):
|
144 |
+
sim = sim + pos_bias
|
145 |
+
|
146 |
+
if self.casual_attn:
|
147 |
+
mask = torch.tril(torch.ones(sim.size(-1), sim.size(-2))).to(device)
|
148 |
+
sim = sim.masked_fill(mask[..., :, :] == 0, float('-inf'))
|
149 |
+
|
150 |
+
attn = sim.softmax(dim = -1)
|
151 |
+
x = torch.einsum('... h i j, ... h j d -> ... h i d', attn, v)
|
152 |
+
x = rearrange(x, '... h n d -> ... n (h d)')
|
153 |
+
x = self.proj(x)
|
154 |
+
return x
|
155 |
+
|
156 |
+
|
157 |
+
class Block(nn.Module):
|
158 |
+
def __init__(self, dim, dim_out):
|
159 |
+
super().__init__()
|
160 |
+
self.proj = nn.Linear(dim, dim_out)
|
161 |
+
self.norm = LayerNorm(dim)
|
162 |
+
self.act = nn.SiLU()
|
163 |
+
|
164 |
+
def forward(self, x, scale_shift=None):
|
165 |
+
x = self.proj(x)
|
166 |
+
|
167 |
+
if exists(scale_shift):
|
168 |
+
x = self.norm(x)
|
169 |
+
scale, shift = scale_shift
|
170 |
+
x = x * (scale + 1) + shift
|
171 |
+
return self.act(x)
|
172 |
+
|
173 |
+
|
174 |
+
class ResnetBlock(nn.Module):
|
175 |
+
def __init__(self, dim, dim_out, cond_dim=None):
|
176 |
+
super().__init__()
|
177 |
+
self.mlp = nn.Sequential(
|
178 |
+
nn.SiLU(),
|
179 |
+
nn.Linear(cond_dim, dim_out * 2)
|
180 |
+
) if exists(cond_dim) else None
|
181 |
+
|
182 |
+
self.block1 = Block(dim, dim_out)
|
183 |
+
self.block2 = Block(dim_out, dim_out)
|
184 |
+
|
185 |
+
def forward(self, x, cond_emb=None):
|
186 |
+
scale_shift = None
|
187 |
+
if exists(self.mlp):
|
188 |
+
assert exists(cond_emb), 'time emb must be passed in'
|
189 |
+
cond_emb = self.mlp(cond_emb)
|
190 |
+
#cond_emb = rearrange(cond_emb, 'b f c -> b f 1 c')
|
191 |
+
scale_shift = cond_emb.chunk(2, dim=-1)
|
192 |
+
|
193 |
+
h = self.block1(x, scale_shift=scale_shift)
|
194 |
+
h = self.block2(h)
|
195 |
+
return h + x
|
196 |
+
|
197 |
+
from rotary_embedding_torch import RotaryEmbedding


class SimpleTransModel(BaseTemperalPointModel):
    """
    A simple transformer-style model that processes a sequence of per-frame motion
    features (one 70-dimensional vector per frame) with conditional MLP blocks and
    temporal attention.
    """

    def get_layers(self):
        # self.input_projection = nn.Linear(
        #     in_features=51,
        #     out_features=self.dim
        # )

        self.input_projection = nn.Linear(
            in_features=70,
            out_features=self.dim
        )

        cond_dim = 512 + self.timestep_embed_dim

        num_head = self.dim // 64
        rotary_emb = RotaryEmbedding(min(32, num_head))

        self.time_rel_pos_bias = RelativePositionBias(heads=num_head, max_distance=128)  # realistically will not be able to generate that many frames of video... yet

        temporal_casual_attn = lambda dim: Attention(dim, heads=num_head, casual_attn=False, rotary_emb=rotary_emb)

        cond_block = partial(ResnetBlock, cond_dim=cond_dim)

        layers = nn.ModuleList([])

        for _ in range(self.num_layers):
            layers.append(nn.ModuleList([
                cond_block(self.dim, self.dim),
                cond_block(self.dim, self.dim),
                Residual(PreNorm(self.dim, temporal_casual_attn(self.dim)))
            ]))

        return layers

    def forward(self, inputs: torch.Tensor, timesteps: torch.Tensor, context=None):
        """
        Apply the model to an input batch.
        :param inputs: a [B x F x C] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param context: conditioning features concatenated to the timestep embedding.
        """
        # Prepare inputs
        batch, num_frames, channels = inputs.size()
        device = inputs.device
        # assert channels == 3

        # Positional encoding of point coords
        # inputs = rearrange(inputs, 'b f p c->(b f) p c')
        # pos_emb = self.positional_encoding(inputs)
        x = self.input_projection(inputs)
        # x = rearrange(x, '(b f) p c-> b f p c', b=batch)

        t_emb = self.time_mlp(timesteps) if exists(self.time_mlp) else None
        t_emb = t_emb[:, None, :].expand(-1, num_frames, -1)  # b f c
        if context is not None:
            t_emb = torch.cat([t_emb, context], -1)

        time_rel_pos_bias = self.time_rel_pos_bias(num_frames, device=device)

        for block1, block2, temporal_casual_attn in self.layers:
            x = block1(x, t_emb)
            x = block2(x, t_emb)
            x = temporal_casual_attn(x, pos_bias=time_rel_pos_bias)

        # Project
        x = self.output_projection(x)
        return x

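# A minimal forward-pass sketch (not part of the original file): the constructor
# arguments and tensor shapes below are assumptions for illustration; the 70-dim
# input and 512-dim context match the projections defined in get_layers above.
#
#   model = SimpleTransModel(num_classes=70, embed_dim=512, extra_feature_channels=0,
#                            dim=512, num_layers=6)
#   inputs = torch.randn(2, 25, 70)           # (batch, frames, motion features)
#   timesteps = torch.randint(0, 1000, (2,))
#   context = torch.randn(2, 25, 512)         # per-frame conditioning (e.g. audio features)
#   out = model(inputs, timesteps, context)   # -> (2, 25, 70)
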
class SimpleTemperalPointModel(BaseTemperalPointModel):
    """
    A simple model that processes a sequence of point sets by applying conditional
    MLP blocks to each point individually, together with spatial attention over
    points and temporal attention over frames.
    """

    def get_layers(self):
        audio_dim = 512

        cond_dim = audio_dim + self.timestep_embed_dim

        num_head = 4
        rotary_emb = RotaryEmbedding(min(32, num_head))
        self.time_rel_pos_bias = RelativePositionBias(heads=num_head, max_distance=128)  # realistically will not be able to generate that many frames of video... yet

        # Attention across frames (per point) and across points (per frame)
        temporal_casual_attn = lambda dim: EinopsToAndFrom('b f p c', 'b p f c', Attention(dim, heads=num_head, casual_attn=False, rotary_emb=rotary_emb))

        spatial_kp_attn = lambda dim: EinopsToAndFrom('b f p c', 'b f p c', Attention(dim, heads=num_head))

        cond_block = partial(ResnetBlock, cond_dim=cond_dim)

        layers = nn.ModuleList([])

        for _ in range(self.num_layers):
            layers.append(nn.ModuleList([
                cond_block(self.dim, self.dim),
                cond_block(self.dim, self.dim),
                Residual(PreNorm(self.dim, spatial_kp_attn(self.dim))),
                Residual(PreNorm(self.dim, temporal_casual_attn(self.dim)))
            ]))

        return layers

    def forward(self, inputs: torch.Tensor, timesteps: torch.Tensor, context=None):
        """
        Apply the model to an input batch.
        :param inputs: a [B x F x P x C] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param context: conditioning features concatenated to the timestep embedding.
        """
        # Prepare inputs
        batch, num_frames, num_points, channels = inputs.size()
        device = inputs.device
        # assert channels == 3

        # Positional encoding of point coords
        inputs = rearrange(inputs, 'b f p c->(b f) p c')
        pos_emb = self.positional_encoding(inputs)
        x = self.input_projection(torch.cat([inputs, pos_emb], -1))
        x = rearrange(x, '(b f) p c-> b f p c', b=batch)

        t_emb = self.time_mlp(timesteps) if exists(self.time_mlp) else None
        t_emb = t_emb[:, None, :].expand(-1, num_frames, -1)  # b f c
        if context is not None:
            t_emb = torch.cat([t_emb, context], -1)

        time_rel_pos_bias = self.time_rel_pos_bias(num_frames, device=device)

        for block1, block2, spatial_kp_attn, temporal_casual_attn in self.layers:
            x = block1(x, t_emb)
            x = block2(x, t_emb)
            x = spatial_kp_attn(x)
            x = temporal_casual_attn(x, pos_bias=time_rel_pos_bias)

        # Project
        x = self.output_projection(x)
        return x
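# Usage note (a sketch, not part of the original file): this variant keeps an
# explicit point dimension, taking inputs of shape (batch, frames, points, 3) and
# applying spatial attention over points and temporal attention over frames. The
# per-frame conditioning chunked in ResnetBlock.forward has shape (b, f, c); when
# the number of frames differs from the number of points, re-enabling the
# commented-out rearrange(cond_emb, 'b f c -> b f 1 c') there appears to be
# needed for the shapes to broadcast against the (b, f, p, c) activations.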
difpoint/model/temporaltrans/transformer_utils.py
ADDED
@@ -0,0 +1,146 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from einops_exts import check_shape, rearrange_many
from torch import Size, Tensor


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

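# A minimal sketch (not part of the original file): for a 1-D batch of timesteps
# the module returns one `dim`-dimensional embedding per timestep, with the first
# half of the channels holding sines and the second half cosines. Values below are
# assumptions for illustration.
#
#   emb = SinusoidalPosEmb(dim=128)
#   t = torch.tensor([0., 250., 999.])   # (B,)
#   e = emb(t)                           # -> (3, 128)
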
def map_positional_encoding(v: Tensor, freq_bands: Tensor) -> Tensor:
    """Map v to positional encoding representation phi(v)

    Arguments:
        v (Tensor): input features (B, IFeatures)
        freq_bands (Tensor): frequency bands (N_freqs, )

    Returns:
        phi(v) (Tensor): Fourier features (B, 3 + (2 * N_freqs) * 3)
    """
    pe = [v]
    for freq in freq_bands:
        fv = freq * v
        pe += [torch.sin(fv), torch.cos(fv)]
    return torch.cat(pe, dim=-1)


class FeatureMapping(nn.Module):
    """FeatureMapping nn.Module

    Maps v to features following transformation phi(v)

    Arguments:
        i_dim (int): input dimensions
        o_dim (int): output dimensions
    """

    def __init__(self, i_dim: int, o_dim: int) -> None:
        super().__init__()
        self.i_dim = i_dim
        self.o_dim = o_dim

    def forward(self, v: Tensor) -> Tensor:
        """FeatureMapping forward pass

        Arguments:
            v (Tensor): input features (B, IFeatures)

        Returns:
            phi(v) (Tensor): mapped features (B, OFeatures)
        """
        raise NotImplementedError("Forward pass not implemented yet!")


class PositionalEncoding(FeatureMapping):
    """PositionalEncoding module

    Maps v to positional encoding representation phi(v)

    Arguments:
        i_dim (int): input dimension for v
        N_freqs (int): #frequency to sample (default: 10)
    """

    def __init__(
        self,
        i_dim: int,
        N_freqs: int = 10,
    ) -> None:
        super().__init__(i_dim, 3 + (2 * N_freqs) * 3)
        self.N_freqs = N_freqs

        a, b = 1, self.N_freqs - 1
        freq_bands = 2 ** torch.linspace(a, b, self.N_freqs)
        self.register_buffer("freq_bands", freq_bands)

    def forward(self, v: Tensor) -> Tensor:
        """Map v to positional encoding representation phi(v)

        Arguments:
            v (Tensor): input features (B, IFeatures)

        Returns:
            phi(v) (Tensor): Fourier features (B, 3 + (2 * N_freqs) * 3)
        """
        return map_positional_encoding(v, self.freq_bands)

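# A minimal sketch (not part of the original file): with the settings used in this
# repository (i_dim=3, N_freqs=10) each 3-D point maps to 3 + 2 * 10 * 3 = 63
# Fourier features (the raw coordinates plus a sine and cosine per frequency band).
# The point count below is an assumption for illustration.
#
#   pe = PositionalEncoding(i_dim=3, N_freqs=10)
#   pts = torch.randn(68, 3)
#   feats = pe(pts)                      # -> (68, 63)
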
class BaseTemperalPointModel(nn.Module):
    """ A base class providing useful methods for point cloud processing. """

    def __init__(
        self,
        *,
        num_classes,
        embed_dim,
        extra_feature_channels,
        dim: int = 768,
        num_layers: int = 6
    ):
        super().__init__()

        self.extra_feature_channels = extra_feature_channels
        self.timestep_embed_dim = 256
        self.output_dim = num_classes
        self.dim = dim
        self.num_layers = num_layers

        # Timestep embedding: sinusoidal encoding followed by a small MLP
        self.time_mlp = nn.Sequential(
            SinusoidalPosEmb(dim),
            nn.Linear(dim, self.timestep_embed_dim),
            nn.SiLU(),
            nn.Linear(self.timestep_embed_dim, self.timestep_embed_dim)
        )

        self.positional_encoding = PositionalEncoding(i_dim=3, N_freqs=10)
        positional_encoding_d_out = 3 + (2 * 10) * 3

        # Input projection (point coords, point coord encodings, other features, and timestep embeddings)
        self.input_projection = nn.Linear(
            in_features=(3 + positional_encoding_d_out),
            out_features=self.dim
        )  # b f p c

        # Transformer layers
        self.layers = self.get_layers()

        # Output projection
        self.output_projection = nn.Linear(self.dim, self.output_dim)

    def get_layers(self):
        raise NotImplementedError('This method should be implemented by subclasses')

    def forward(self, inputs: torch.Tensor, t: torch.Tensor):
        raise NotImplementedError('This method should be implemented by subclasses')
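# A minimal subclass sketch (not part of the original file), illustrating the
# contract that concrete models such as SimpleTransModel fulfil: get_layers()
# builds the layer stack consumed by forward(), which takes a noisy sample and
# the diffusion timesteps. TinyPointModel and its shapes are assumptions.
#
#   class TinyPointModel(BaseTemperalPointModel):
#       def get_layers(self):
#           return nn.ModuleList([nn.Linear(self.dim, self.dim) for _ in range(self.num_layers)])
#
#       def forward(self, inputs, t):
#           # inputs: (..., 3 + 63), i.e. raw coords concatenated with their positional encoding
#           x = self.input_projection(inputs)
#           for layer in self.layers:
#               x = F.silu(layer(x))
#           return self.output_projection(x)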