Spaces:

3DAIGC
/

LAM

Runtime error

LAM

File size: 7,810 Bytes

17cd746

# 
# Toyota Motor Europe NV/SA and its affiliated companies retain all intellectual 
# property and proprietary rights in and to this software and related documentation. 
# Any commercial use, reproduction, disclosure or distribution of this software and 
# related documentation without an express license agreement from Toyota Motor Europe NV/SA 
# is strictly prohibited.
#


from typing import Tuple, Literal
import torch
import torch.nn.functional as F
import math
import numpy as np
from scipy.spatial.transform import Rotation


def align_cameras_to_axes(
    R: torch.Tensor,
    T: torch.Tensor,
    target_convention: Literal["opengl", "opencv"] = None,
):
    """align the averaged axes of cameras with the world axes.

    Args:
        R: rotation matrix (N, 3, 3)
        T: translation vector (N, 3)
    """
    # The column vectors of R are the basis vectors of each camera.
    # We construct new bases by taking the mean directions of axes, then use Gram-Schmidt
    # process to make them orthonormal
    bases_c2w = gram_schmidt_orthogonalization(R.mean(0))
    if target_convention == "opengl":
        bases_c2w[:, [1, 2]] *= -1  # flip y and z axes
    elif target_convention == "opencv":
        pass
    bases_w2c = bases_c2w.t()

    # convert the camera poses into the new coordinate system
    R = bases_w2c[None, ...] @ R
    T = bases_w2c[None, ...] @ T
    return R, T


def convert_camera_convention(camera_convention_conversion: str, R: torch.Tensor, K: torch.Tensor, H: int, W: int):
    if camera_convention_conversion is not None:
        if camera_convention_conversion == "opencv->opengl":
            R[:, :3, [1, 2]] *= -1
            # flip y of the principal point
            K[..., 1, 2] = H - K[..., 1, 2]
        elif camera_convention_conversion == "opencv->pytorch3d":
            R[:, :3, [0, 1]] *= -1
            # flip x and y of the principal point
            K[..., 0, 2] = W - K[..., 0, 2]
            K[..., 1, 2] = H - K[..., 1, 2]
        elif camera_convention_conversion == "opengl->pytorch3d":
            R[:, :3, [0, 2]] *= -1
            # flip x of the principal point
            K[..., 0, 2] = W - K[..., 0, 2]
        else:
            raise ValueError(
                f"Unknown camera coordinate conversion: {camera_convention_conversion}."
            )
    return R, K


def gram_schmidt_orthogonalization(M: torch.tensor):
    """conducting Gram-Schmidt process to transform column vectors into orthogonal bases

    Args:
        M: An matrix (num_rows, num_cols)
    Return:
        M: An matrix with orthonormal column vectors (num_rows, num_cols)
    """
    num_rows, num_cols = M.shape
    for c in range(1, num_cols):
        M[:, [c - 1, c]] = F.normalize(M[:, [c - 1, c]], p=2, dim=0)
        M[:, [c]] -= M[:, :c] @ (M[:, :c].T @ M[:, [c]])

    M[:, -1] = F.normalize(M[:, -1], p=2, dim=0)
    return M


def projection_from_intrinsics(K: np.ndarray, image_size: Tuple[int], near: float=0.01, far:float=10, flip_y: bool=False, z_sign=-1):
    """
    Transform points from camera space (x: right, y: up, z: out) to clip space (x: right, y: down, z: in)
    Args:
        K: Intrinsic matrix, (N, 3, 3)
            K = [[
                        [fx, 0, cx],
                        [0, fy, cy],
                        [0,  0,  1],
                ]
            ]
        image_size: (height, width)
    Output:
        proj = [[
                [2*fx/w, 0.0,     (w - 2*cx)/w,             0.0                     ],
                [0.0,    2*fy/h, (h - 2*cy)/h,             0.0                     ],
                [0.0,    0.0,     z_sign*(far+near) / (far-near), -2*far*near / (far-near)],
                [0.0,    0.0,     z_sign,                     0.0                     ]
            ]
        ]
    """

    B = K.shape[0]
    h, w = image_size

    if K.shape[-2:] == (3, 3):
        fx = K[..., 0, 0]
        fy = K[..., 1, 1]
        cx = K[..., 0, 2]
        cy = K[..., 1, 2]
    elif K.shape[-1] == 4:
        # fx, fy, cx, cy = K[..., [0, 1, 2, 3]].split(1, dim=-1)
        fx = K[..., [0]]
        fy = K[..., [1]]
        cx = K[..., [2]]
        cy = K[..., [3]]
    else:
        raise ValueError(f"Expected K to be (N, 3, 3) or (N, 4) but got: {K.shape}")

    proj = np.zeros([B, 4, 4])
    proj[:, 0, 0]  = fx * 2 / w 
    proj[:, 1, 1]  = fy * 2 / h
    proj[:, 0, 2]  = (w - 2 * cx) / w
    proj[:, 1, 2]  = (h - 2 * cy) / h
    proj[:, 2, 2]  = z_sign * (far+near) / (far-near)
    proj[:, 2, 3]  = -2*far*near / (far-near)
    proj[:, 3, 2]  = z_sign

    if flip_y:
        proj[:, 1, 1] *= -1
    return proj


class OrbitCamera:
    def __init__(self, W, H, r=2, fovy=60, znear=1e-8, zfar=10, convention: Literal["opengl", "opencv"]="opengl"):
        self.image_width = W
        self.image_height = H
        self.radius_default = r
        self.fovy_default = fovy
        self.znear = znear
        self.zfar = zfar
        self.convention = convention

        self.up = np.array([0, 1, 0], dtype=np.float32)
        self.reset()
    
    def reset(self):
        """ The internal state of the camera is based on the OpenGL convention, but 
            properties are converted to the target convention when queried.
        """
        self.rot = Rotation.from_matrix([[1, 0, 0], [0, 1, 0], [0, 0, 1]])  # OpenGL convention
        self.look_at = np.array([0, 0, 0], dtype=np.float32)  # look at this point
        self.radius = self.radius_default  # camera distance from center
        self.fovy = self.fovy_default
        if self.convention == "opencv":
            self.z_sign = 1
            self.y_sign = 1
        elif self.convention == "opengl":
            self.z_sign = -1
            self.y_sign = -1
        else:
            raise ValueError(f"Unknown convention: {self.convention}")

    @property
    def fovx(self):
        return self.fovy / self.image_height * self.image_width
    
    @property
    def intrinsics(self):
        focal = self.image_height / (2 * np.tan(np.radians(self.fovy) / 2))
        return np.array([focal, focal, self.image_width // 2, self.image_height // 2])
    
    @property
    def projection_matrix(self):
        return projection_from_intrinsics(self.intrinsics[None], (self.image_height, self.image_width), self.znear, self.zfar, z_sign=self.z_sign)[0]
    
    @property
    def world_view_transform(self):
        return np.linalg.inv(self.pose)  # world2cam

    @property
    def full_proj_transform(self):
        return self.projection_matrix @ self.world_view_transform

    @property
    def pose(self):
        # first move camera to radius
        pose = np.eye(4, dtype=np.float32)
        pose[2, 3] += self.radius

        # rotate
        rot = np.eye(4, dtype=np.float32)
        rot[:3, :3] = self.rot.as_matrix()
        pose = rot @ pose

        # translate
        pose[:3, 3] -= self.look_at

        if self.convention == "opencv":
            pose[:, [1, 2]] *= -1
        elif self.convention == "opengl":
            pass
        else:
            raise ValueError(f"Unknown convention: {self.convention}")
        return pose

    def orbit(self, dx, dy):
        # rotate along camera up/side axis!
        side = self.rot.as_matrix()[:3, 0]
        rotvec_x = self.up * np.radians(-0.3 * dx)
        rotvec_y = side * np.radians(-0.3 * dy)
        self.rot = Rotation.from_rotvec(rotvec_x) * Rotation.from_rotvec(rotvec_y) * self.rot

    def scale(self, delta):
        self.radius *= 1.1 ** (-delta)

    def pan(self, dx, dy, dz=0):
        # pan in camera coordinate system (careful on the sensitivity!)
        d = np.array([dx, -dy, dz])  # the y axis is flipped
        self.look_at += 2 * self.rot.as_matrix()[:3, :3] @ d * self.radius / self.image_height * math.tan(np.radians(self.fovy) / 2)