Spaces:
Runtime error
Runtime error
File size: 14,712 Bytes
8bb8404 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 |
import numpy as np
import time
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.cuda.amp import custom_bwd, custom_fwd
try:
import _raymarching as _backend
except ImportError:
from .backend import _backend
# ----------------------------------------
# utils
# ----------------------------------------
class _near_far_from_aabb(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, rays_o, rays_d, aabb, min_near=0.2):
''' near_far_from_aabb, CUDA implementation
Calculate rays' intersection time (near and far) with aabb
Args:
rays_o: float, [N, 3]
rays_d: float, [N, 3]
aabb: float, [6], (xmin, ymin, zmin, xmax, ymax, zmax)
min_near: float, scalar
Returns:
nears: float, [N]
fars: float, [N]
'''
if not rays_o.is_cuda: rays_o = rays_o.cuda()
if not rays_d.is_cuda: rays_d = rays_d.cuda()
rays_o = rays_o.contiguous().view(-1, 3)
rays_d = rays_d.contiguous().view(-1, 3)
N = rays_o.shape[0] # num rays
nears = torch.empty(N, dtype=rays_o.dtype, device=rays_o.device)
fars = torch.empty(N, dtype=rays_o.dtype, device=rays_o.device)
_backend.near_far_from_aabb(rays_o, rays_d, aabb, N, min_near, nears, fars)
return nears, fars
near_far_from_aabb = _near_far_from_aabb.apply
class _sph_from_ray(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, rays_o, rays_d, radius):
''' sph_from_ray, CUDA implementation
get spherical coordinate on the background sphere from rays.
Assume rays_o are inside the Sphere(radius).
Args:
rays_o: [N, 3]
rays_d: [N, 3]
radius: scalar, float
Return:
coords: [N, 2], in [-1, 1], theta and phi on a sphere. (further-surface)
'''
if not rays_o.is_cuda: rays_o = rays_o.cuda()
if not rays_d.is_cuda: rays_d = rays_d.cuda()
rays_o = rays_o.contiguous().view(-1, 3)
rays_d = rays_d.contiguous().view(-1, 3)
N = rays_o.shape[0] # num rays
coords = torch.empty(N, 2, dtype=rays_o.dtype, device=rays_o.device)
_backend.sph_from_ray(rays_o, rays_d, radius, N, coords)
return coords
sph_from_ray = _sph_from_ray.apply
class _morton3D(Function):
@staticmethod
def forward(ctx, coords):
''' morton3D, CUDA implementation
Args:
coords: [N, 3], int32, in [0, 128) (for some reason there is no uint32 tensor in torch...)
TODO: check if the coord range is valid! (current 128 is safe)
Returns:
indices: [N], int32, in [0, 128^3)
'''
if not coords.is_cuda: coords = coords.cuda()
N = coords.shape[0]
indices = torch.empty(N, dtype=torch.int32, device=coords.device)
_backend.morton3D(coords.int(), N, indices)
return indices
morton3D = _morton3D.apply
class _morton3D_invert(Function):
@staticmethod
def forward(ctx, indices):
''' morton3D_invert, CUDA implementation
Args:
indices: [N], int32, in [0, 128^3)
Returns:
coords: [N, 3], int32, in [0, 128)
'''
if not indices.is_cuda: indices = indices.cuda()
N = indices.shape[0]
coords = torch.empty(N, 3, dtype=torch.int32, device=indices.device)
_backend.morton3D_invert(indices.int(), N, coords)
return coords
morton3D_invert = _morton3D_invert.apply
class _packbits(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, grid, thresh, bitfield=None):
''' packbits, CUDA implementation
Pack up the density grid into a bit field to accelerate ray marching.
Args:
grid: float, [C, H * H * H], assume H % 2 == 0
thresh: float, threshold
Returns:
bitfield: uint8, [C, H * H * H / 8]
'''
if not grid.is_cuda: grid = grid.cuda()
grid = grid.contiguous()
C = grid.shape[0]
H3 = grid.shape[1]
N = C * H3 // 8
if bitfield is None:
bitfield = torch.empty(N, dtype=torch.uint8, device=grid.device)
_backend.packbits(grid, N, thresh, bitfield)
return bitfield
packbits = _packbits.apply
# ----------------------------------------
# train functions
# ----------------------------------------
class _march_rays_train(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, rays_o, rays_d, bound, density_bitfield, C, H, nears, fars, step_counter=None, mean_count=-1, perturb=False, align=-1, force_all_rays=False, dt_gamma=0, max_steps=1024):
''' march rays to generate points (forward only)
Args:
rays_o/d: float, [N, 3]
bound: float, scalar
density_bitfield: uint8: [CHHH // 8]
C: int
H: int
nears/fars: float, [N]
step_counter: int32, (2), used to count the actual number of generated points.
mean_count: int32, estimated mean steps to accelerate training. (but will randomly drop rays if the actual point count exceeded this threshold.)
perturb: bool
align: int, pad output so its size is dividable by align, set to -1 to disable.
force_all_rays: bool, ignore step_counter and mean_count, always calculate all rays. Useful if rendering the whole image, instead of some rays.
dt_gamma: float, called cone_angle in instant-ngp, exponentially accelerate ray marching if > 0. (very significant effect, but generally lead to worse performance)
max_steps: int, max number of sampled points along each ray, also affect min_stepsize.
Returns:
xyzs: float, [M, 3], all generated points' coords. (all rays concated, need to use `rays` to extract points belonging to each ray)
dirs: float, [M, 3], all generated points' view dirs.
deltas: float, [M, 2], all generated points' deltas. (first for RGB, second for Depth)
rays: int32, [N, 3], all rays' (index, point_offset, point_count), e.g., xyzs[rays[i, 1]:rays[i, 2]] --> points belonging to rays[i, 0]
'''
if not rays_o.is_cuda: rays_o = rays_o.cuda()
if not rays_d.is_cuda: rays_d = rays_d.cuda()
if not density_bitfield.is_cuda: density_bitfield = density_bitfield.cuda()
rays_o = rays_o.contiguous().view(-1, 3)
rays_d = rays_d.contiguous().view(-1, 3)
density_bitfield = density_bitfield.contiguous()
N = rays_o.shape[0] # num rays
M = N * max_steps # init max points number in total
# running average based on previous epoch (mimic `measured_batch_size_before_compaction` in instant-ngp)
# It estimate the max points number to enable faster training, but will lead to random ignored rays if underestimated.
if not force_all_rays and mean_count > 0:
if align > 0:
mean_count += align - mean_count % align
M = mean_count
xyzs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
dirs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
deltas = torch.zeros(M, 2, dtype=rays_o.dtype, device=rays_o.device)
rays = torch.empty(N, 3, dtype=torch.int32, device=rays_o.device) # id, offset, num_steps
if step_counter is None:
step_counter = torch.zeros(2, dtype=torch.int32, device=rays_o.device) # point counter, ray counter
if perturb:
noises = torch.rand(N, dtype=rays_o.dtype, device=rays_o.device)
else:
noises = torch.zeros(N, dtype=rays_o.dtype, device=rays_o.device)
_backend.march_rays_train(rays_o, rays_d, density_bitfield, bound, dt_gamma, max_steps, N, C, H, M, nears, fars, xyzs, dirs, deltas, rays, step_counter, noises) # m is the actually used points number
#print(step_counter, M)
# only used at the first (few) epochs.
if force_all_rays or mean_count <= 0:
m = step_counter[0].item() # D2H copy
if align > 0:
m += align - m % align
xyzs = xyzs[:m]
dirs = dirs[:m]
deltas = deltas[:m]
torch.cuda.empty_cache()
return xyzs, dirs, deltas, rays
march_rays_train = _march_rays_train.apply
class _composite_rays_train(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, sigmas, rgbs, deltas, rays, T_thresh=1e-4):
''' composite rays' rgbs, according to the ray marching formula.
Args:
rgbs: float, [M, 3]
sigmas: float, [M,]
deltas: float, [M, 2]
rays: int32, [N, 3]
Returns:
weights_sum: float, [N,], the alpha channel
depth: float, [N, ], the Depth
image: float, [N, 3], the RGB channel (after multiplying alpha!)
'''
sigmas = sigmas.contiguous()
rgbs = rgbs.contiguous()
M = sigmas.shape[0]
N = rays.shape[0]
weights_sum = torch.empty(N, dtype=sigmas.dtype, device=sigmas.device)
depth = torch.empty(N, dtype=sigmas.dtype, device=sigmas.device)
image = torch.empty(N, 3, dtype=sigmas.dtype, device=sigmas.device)
_backend.composite_rays_train_forward(sigmas, rgbs, deltas, rays, M, N, T_thresh, weights_sum, depth, image)
ctx.save_for_backward(sigmas, rgbs, deltas, rays, weights_sum, depth, image)
ctx.dims = [M, N, T_thresh]
return weights_sum, depth, image
@staticmethod
@custom_bwd
def backward(ctx, grad_weights_sum, grad_depth, grad_image):
# NOTE: grad_depth is not used now! It won't be propagated to sigmas.
grad_weights_sum = grad_weights_sum.contiguous()
grad_image = grad_image.contiguous()
sigmas, rgbs, deltas, rays, weights_sum, depth, image = ctx.saved_tensors
M, N, T_thresh = ctx.dims
grad_sigmas = torch.zeros_like(sigmas)
grad_rgbs = torch.zeros_like(rgbs)
_backend.composite_rays_train_backward(grad_weights_sum, grad_image, sigmas, rgbs, deltas, rays, weights_sum, image, M, N, T_thresh, grad_sigmas, grad_rgbs)
return grad_sigmas, grad_rgbs, None, None, None
composite_rays_train = _composite_rays_train.apply
# ----------------------------------------
# infer functions
# ----------------------------------------
class _march_rays(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, bound, density_bitfield, C, H, near, far, align=-1, perturb=False, dt_gamma=0, max_steps=1024):
''' march rays to generate points (forward only, for inference)
Args:
n_alive: int, number of alive rays
n_step: int, how many steps we march
rays_alive: int, [N], the alive rays' IDs in N (N >= n_alive, but we only use first n_alive)
rays_t: float, [N], the alive rays' time, we only use the first n_alive.
rays_o/d: float, [N, 3]
bound: float, scalar
density_bitfield: uint8: [CHHH // 8]
C: int
H: int
nears/fars: float, [N]
align: int, pad output so its size is dividable by align, set to -1 to disable.
perturb: bool/int, int > 0 is used as the random seed.
dt_gamma: float, called cone_angle in instant-ngp, exponentially accelerate ray marching if > 0. (very significant effect, but generally lead to worse performance)
max_steps: int, max number of sampled points along each ray, also affect min_stepsize.
Returns:
xyzs: float, [n_alive * n_step, 3], all generated points' coords
dirs: float, [n_alive * n_step, 3], all generated points' view dirs.
deltas: float, [n_alive * n_step, 2], all generated points' deltas (here we record two deltas, the first is for RGB, the second for depth).
'''
if not rays_o.is_cuda: rays_o = rays_o.cuda()
if not rays_d.is_cuda: rays_d = rays_d.cuda()
rays_o = rays_o.contiguous().view(-1, 3)
rays_d = rays_d.contiguous().view(-1, 3)
M = n_alive * n_step
if align > 0:
M += align - (M % align)
xyzs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
dirs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
deltas = torch.zeros(M, 2, dtype=rays_o.dtype, device=rays_o.device) # 2 vals, one for rgb, one for depth
if perturb:
# torch.manual_seed(perturb) # test_gui uses spp index as seed
noises = torch.rand(n_alive, dtype=rays_o.dtype, device=rays_o.device)
else:
noises = torch.zeros(n_alive, dtype=rays_o.dtype, device=rays_o.device)
_backend.march_rays(n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, bound, dt_gamma, max_steps, C, H, density_bitfield, near, far, xyzs, dirs, deltas, noises)
return xyzs, dirs, deltas
march_rays = _march_rays.apply
class _composite_rays(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32) # need to cast sigmas & rgbs to float
def forward(ctx, n_alive, n_step, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image, T_thresh=1e-2):
''' composite rays' rgbs, according to the ray marching formula. (for inference)
Args:
n_alive: int, number of alive rays
n_step: int, how many steps we march
rays_alive: int, [n_alive], the alive rays' IDs in N (N >= n_alive)
rays_t: float, [N], the alive rays' time
sigmas: float, [n_alive * n_step,]
rgbs: float, [n_alive * n_step, 3]
deltas: float, [n_alive * n_step, 2], all generated points' deltas (here we record two deltas, the first is for RGB, the second for depth).
In-place Outputs:
weights_sum: float, [N,], the alpha channel
depth: float, [N,], the depth value
image: float, [N, 3], the RGB channel (after multiplying alpha!)
'''
_backend.composite_rays(n_alive, n_step, T_thresh, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image)
return tuple()
composite_rays = _composite_rays.apply |