#include "viewer_cuda.h"

// NOTE(review): the copy of this file that was reviewed had every "<...>"
// span stripped (template argument lists, kernel launch configurations, cast
// targets) and was collapsed onto three lines. Those spans are restored below
// from what the surrounding code requires; each restoration is marked with
// NOTE(review) and should be confirmed against the upstream file.

// Threads per block. Both kernels compute their global index as
// blockIdx.x * THREADS + threadIdx.x, so they must be launched with exactly
// THREADS threads per block.
#define THREADS 64

// Number of blocks needed to cover batch_size items with THREADS threads each.
#define NUM_BLOCKS(batch_size) (((batch_size) + THREADS - 1) / THREADS)

// NOTE(review): only "for (size_t k = threadIdx.x; k" survived in the reviewed
// copy; the condition and increment are reconstructed — TODO confirm. The
// macro is not used by any code visible in this file.
#define GPU_1D_KERNEL_LOOP(k, n) \
  for (size_t k = threadIdx.x; k < n; k += blockDim.x)

// NOTE(review): the __device__ helpers called below,
//   invSE3(t, q, t_out, q_out)   and   actSE3(t, q, X, Y),
// were defined at this point in the original file, but their text was lost in
// the reviewed copy. They are deliberately NOT reconstructed here; restore
// them from the upstream source before building.


// Reports a failed kernel launch (for example an invalid launch configuration)
// as a c10::Error instead of letting it surface at some later CUDA call.
static inline void checkKernelLaunch(const char* kernelName) {
  const cudaError_t err = cudaGetLastError();
  TORCH_CHECK(err == cudaSuccess,
      kernelName, " launch failed: ", cudaGetErrorString(err));
}


// Per-pixel back-projection and multi-view consistency count for frame
// `index`.
//
// Launch layout: 1-D grid of NUM_BLOCKS(ht * wd) blocks with THREADS threads
// each, one thread per pixel of disps[index] (ht = disps.size(1),
// wd = disps.size(2)). Uses 30 floats of static shared memory per block.
//
// For every pixel k = i * wd + j that is not on the last row or last column:
//   * points[k] = (X0[0], X0[1], X0[2]) / X0[3], where
//       X0 = ((j - intrinsic[2]) / intrinsic[0],
//             (i - intrinsic[3]) / intrinsic[1], 1, disps[index][i][j])
//     and intrinsic = 8 * intrinsics[index];
//   * colors[k] = images[index][{2, 1, 0}][i][j] (channel order reversed);
//   * count[k] starts at 0, or at -100 when the forward-difference disparity
//     gradient magnitude exceeds 0.01, and is incremented once for every other
//     frame jx in [0, nFrames) whose disparity at the reprojected pixel agrees
//     with the reprojected disparity to within `thresh` percent. A frame is
//     only considered when the reprojection lands inside the image and the
//     source disparity is > 0.1.
// Pixels on the last row / last column get count[k] = 0 and leave points[k]
// and colors[k] untouched.
//
// NOTE(review): the element type of `images` / `colors` was not recoverable
// from the reviewed copy, so it is a template parameter here (selected on the
// host from images.scalar_type()). The float element types and the ranks of
// the remaining accessors are inferred from how they are indexed and from how
// the host wrapper allocates them — TODO confirm.
template <typename scalar_t>
__global__ void iproj_kernel(
    const int index,
    const int nFrames,
    const float thresh,
    const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> images,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    const torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> disps,
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> intrinsics,
    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> points,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> colors,
    torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> count)
{
  // t/q: pose of frame `index`; t1/q1: its inverse; t2/q2: pose of the frame
  // currently being compared against.
  __shared__ float t[3], t1[3], t2[3];
  __shared__ float q[4], q1[4], q2[4];
  __shared__ float intrinsic[4], intrinsic1[4];

  // A pose row is read as [0..2] -> t and [3..6] -> q.
  if (threadIdx.x < 3) {
    t[threadIdx.x] = poses[index][threadIdx.x + 0];
  }

  if (threadIdx.x < 4) {
    q[threadIdx.x] = poses[index][threadIdx.x + 3];
    // NOTE(review): the factor 8 is kept from the original; the reason for it
    // is not visible in this file.
    intrinsic[threadIdx.x] = 8 * intrinsics[index][threadIdx.x];
  }

  __syncthreads();

  if (threadIdx.x == 0) {
    invSE3(t, q, t1, q1);
  }

  __syncthreads();

  const int ht = disps.size(1);
  const int wd = disps.size(2);
  const int k = blockIdx.x * THREADS + threadIdx.x;

  // The original placed the per-frame loop, including its shared-memory loads
  // and __syncthreads(), inside these two guards. Threads that failed a guard
  // then skipped the barrier (undefined behaviour), and a block whose threads
  // 0..3 were border pixels never loaded the comparison pose at all. The
  // guards are therefore turned into per-thread flags, and the loop below is
  // executed by every thread of the block.
  const bool inImage = k < ht * wd;
  const int i = inImage ? k / wd : 0;
  const int j = inImage ? k % wd : 0;
  const bool active = inImage && (i < ht - 1) && (j < wd - 1);

  float X0[4], X1[4];
  float d = 0;
  float votes = 0;  // value that ends up in count[k]

  if (active) {
    d = disps[index][i][j];
    const float dx = disps[index][i][j+1] - disps[index][i][j];
    const float dy = disps[index][i+1][j] - disps[index][i][j];

    // Start far below zero so that the host-side tests "count >= 0" and
    // "count >= 2.0" reject pixels with a large disparity gradient.
    if (sqrt(dx*dx + dy*dy) > 0.01) {
      votes = -100;
    }

    X0[0] = ((float) j - intrinsic[2]) / intrinsic[0];
    X0[1] = ((float) i - intrinsic[3]) / intrinsic[1];
    X0[2] = 1;
    X0[3] = d;

    actSE3(t1, q1, X0, X1);

    // NOTE(review): the stored point is derived from X0; X1 is only used for
    // the reprojection below. Kept exactly as in the original — confirm that
    // storing X0 rather than X1 is intended.
    points[k][0] = X0[0] / X0[3];
    points[k][1] = X0[1] / X0[3];
    points[k][2] = X0[2] / X0[3];

    colors[k][0] = images[index][2][i][j];
    colors[k][1] = images[index][1][i][j];
    colors[k][2] = images[index][0][i][j];
  }

  for (int jx = 0; jx < nFrames; jx++) {
    // jx and index are identical for every thread, so this branch (and hence
    // the barriers below) is uniform across the block.
    if (jx == index) continue;

    if (threadIdx.x < 3) {
      t2[threadIdx.x] = poses[jx][threadIdx.x + 0];
    }

    if (threadIdx.x < 4) {
      q2[threadIdx.x] = poses[jx][threadIdx.x + 3];
      intrinsic1[threadIdx.x] = 8 * intrinsics[jx][threadIdx.x];
    }

    __syncthreads();

    if (active) {
      float X2[4];
      actSE3(t2, q2, X1, X2);

      const float x1 = intrinsic1[0] * (X2[0] / X2[2]) + intrinsic1[2];
      const float y1 = intrinsic1[1] * (X2[1] / X2[2]) + intrinsic1[3];

      // NOTE(review): cast target restored as int (the results initialise
      // `const int` variables).
      const int i1 = static_cast<int>(round(y1));
      const int j1 = static_cast<int>(round(x1));

      if ((i1 >= 0) && (i1 < ht) && (j1 >= 0) && (j1 < wd) && (d > 0.1)) {
        const float z1 = disps[jx][i1][j1];
        const float z2 = X2[3] / X2[2];

        // Relative disagreement in percent, symmetric in z1 and z2.
        if (100 * (max(z1/z2, z2/z1) - 1) < thresh) {
          votes += 1;
        }
      }
    }

    // Every thread must be done reading t2/q2/intrinsic1 before the next
    // iteration overwrites them.
    __syncthreads();
  }

  if (inImage) {
    count[k] = votes;
  }
}


// Back-projects frame `index` into a point cloud and filters it by mask and
// multi-view consistency.
//
// Parameters:
//   index          frame to back-project; indexes poses, disps, masks,
//                  intrinsics and images along dimension 0.
//   nFrames        number of frames [0, nFrames) used for consistency checks.
//   thresh         consistency tolerance passed to iproj_kernel.
//   showForeground keep pixels with mask < 0.5 and count >= 0.
//   showBackground keep pixels with mask > 0.5 and count >= 2.0.
//   images, poses, disps, masks, intrinsics
//                  input tensors; the kernel reads them through 32-bit packed
//                  accessors, so they are presumably expected on the CUDA
//                  device — TODO confirm with the caller.
//
// Returns a PointCloud built from {number of kept points, kept points, kept
// colors}. Foreground points (if requested) precede background points.
// Pixels whose mask equals 0.5 exactly are in neither group.
PointCloud backproject_and_filter(
    const int index,
    const int nFrames,
    const float thresh,
    const bool showForeground,
    const bool showBackground,
    const torch::Tensor images,
    const torch::Tensor poses,
    const torch::Tensor disps,
    const torch::Tensor masks,
    const torch::Tensor intrinsics)
{
  const int ht = disps.size(1);
  const int wd = disps.size(2);
  const int nPoints = ht * wd;

  torch::Tensor points = torch::zeros({nPoints, 3}, disps.options());
  torch::Tensor colors = torch::zeros({nPoints, 3}, images.options());
  torch::Tensor count = torch::zeros({nPoints}, disps.options());

  // A grid of zero blocks is an invalid launch configuration, so skip the
  // kernel entirely for empty images.
  if (nPoints > 0) {
    // NOTE(review): launch configuration restored from the kernel's indexing
    // (blockIdx.x * THREADS + threadIdx.x over ht * wd pixels). The image
    // element type is dispatched at run time because it could not be
    // recovered from the reviewed copy.
    AT_DISPATCH_ALL_TYPES(images.scalar_type(), "iproj_kernel", [&] {
      iproj_kernel<scalar_t><<<NUM_BLOCKS(nPoints), THREADS>>>(
          index, nFrames, thresh,
          images.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
          poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
          disps.packed_accessor32<float,3,torch::RestrictPtrTraits>(),
          intrinsics.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
          points.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
          colors.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
          count.packed_accessor32<float,1,torch::RestrictPtrTraits>());
    });
    checkKernelLaunch("iproj_kernel");
  }

  // Flatten the mask so it lines up with the k = i * wd + j point indexing.
  torch::Tensor m = masks[index].reshape({-1});

  torch::Tensor pointsFiltered = torch::zeros({0, 3}, points.options());
  torch::Tensor colorsFiltered = torch::zeros({0, 3}, colors.options());

  if (showForeground) {
    const torch::Tensor keep = (count >= 0) & (m < 0.5);
    pointsFiltered = torch::cat({pointsFiltered, at::index(points, {keep})}, 0);
    colorsFiltered = torch::cat({colorsFiltered, at::index(colors, {keep})}, 0);
  }

  if (showBackground) {
    const torch::Tensor keep = (count >= 2.0) & (m > 0.5);
    pointsFiltered = torch::cat({pointsFiltered, at::index(points, {keep})}, 0);
    colorsFiltered = torch::cat({colorsFiltered, at::index(colors, {keep})}, 0);
  }

  const int mPoints = pointsFiltered.size(0);
  return {mPoints, pointsFiltered, colorsFiltered};
}


// Converts each pose row into a 4x4 matrix, one thread per pose.
//
// Launch layout: 1-D grid of NUM_BLOCKS(poses.size(0)) blocks with THREADS
// threads each.
//
// A pose row is read as [0..2] -> t0 and [3..6] -> q0. The pose is first
// passed through invSE3; the output matrix holds the rotation matrix of the
// resulting quaternion q (with q[3] as the scalar part) in its upper-left 3x3
// block, the resulting translation t in its last column, and (0, 0, 0, 1) as
// its last row.
//
// NOTE(review): accessor element type (float) and ranks (2 and 3) are
// restored from the indexing below and from the float locals — TODO confirm.
__global__ void pose_to_matrix_kernel(
    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
    torch::PackedTensorAccessor32<float,3,torch::RestrictPtrTraits> mat4x4)
{
  const int index = blockIdx.x * THREADS + threadIdx.x;

  float t0[3], t[3];
  float q0[4], q[4];

  if (index < poses.size(0)) {
    t0[0] = poses[index][0];
    t0[1] = poses[index][1];
    t0[2] = poses[index][2];

    q0[0] = poses[index][3];
    q0[1] = poses[index][4];
    q0[2] = poses[index][5];
    q0[3] = poses[index][6];

    invSE3(t0, q0, t, q);

    mat4x4[index][0][0] = 1 - 2*q[1]*q[1] - 2*q[2]*q[2];
    mat4x4[index][0][1] = 2*q[0]*q[1] - 2*q[3]*q[2];
    mat4x4[index][0][2] = 2*q[0]*q[2] + 2*q[3]*q[1];
    mat4x4[index][0][3] = t[0];

    mat4x4[index][1][0] = 2*q[0]*q[1] + 2*q[3]*q[2];
    mat4x4[index][1][1] = 1 - 2*q[0]*q[0] - 2*q[2]*q[2];
    mat4x4[index][1][2] = 2*q[1]*q[2] - 2*q[3]*q[0];
    mat4x4[index][1][3] = t[1];

    mat4x4[index][2][0] = 2*q[0]*q[2] - 2*q[3]*q[1];
    mat4x4[index][2][1] = 2*q[1]*q[2] + 2*q[3]*q[0];
    mat4x4[index][2][2] = 1 - 2*q[0]*q[0] - 2*q[1]*q[1];
    mat4x4[index][2][3] = t[2];

    mat4x4[index][3][0] = 0.0;
    mat4x4[index][3][1] = 0.0;
    mat4x4[index][3][2] = 0.0;
    mat4x4[index][3][3] = 1.0;
  }
}


// Host wrapper for pose_to_matrix_kernel.
//
// Takes `poses` with one pose per row and returns a tensor of shape
// {poses.size(0), 4, 4} created with the same options as `poses`. An empty
// input yields an empty {0, 4, 4} tensor without launching the kernel.
torch::Tensor poseToMatrix(const torch::Tensor poses) {
  const int num = poses.size(0);
  torch::Tensor mat4x4 = torch::zeros({num, 4, 4}, poses.options());

  // A grid of zero blocks is an invalid launch configuration.
  if (num > 0) {
    // NOTE(review): launch configuration restored from the kernel's indexing.
    pose_to_matrix_kernel<<<NUM_BLOCKS(num), THREADS>>>(
        poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
        mat4x4.packed_accessor32<float,3,torch::RestrictPtrTraits>());
    checkKernelLaunch("pose_to_matrix_kernel");
  }

  return mat4x4;
}