Spaces:
Running
on
Zero
Running
on
Zero
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. | |
// | |
// NVIDIA CORPORATION and its licensors retain all intellectual property | |
// and proprietary rights in and to this software, related documentation | |
// and any modifications thereto. Any use, reproduction, disclosure or | |
// distribution of this software and related documentation without an express | |
// license agreement from NVIDIA CORPORATION is strictly prohibited. | |
//------------------------------------------------------------------------ | |
// Forward kernel. | |
template <bool ENABLE_DA> | |
static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p) | |
{ | |
// Calculate pixel position. | |
int px = blockIdx.x * blockDim.x + threadIdx.x; | |
int py = blockIdx.y * blockDim.y + threadIdx.y; | |
int pz = blockIdx.z; | |
if (px >= p.width || py >= p.height || pz >= p.depth) | |
return; | |
// Pixel index. | |
int pidx = px + p.width * (py + p.height * pz); | |
// Output ptrs. | |
float* out = p.out + pidx * p.numAttr; | |
float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0; | |
// Fetch rasterizer output. | |
float4 r = ((float4*)p.rast)[pidx]; | |
int triIdx = float_to_triidx(r.w) - 1; | |
bool triValid = (triIdx >= 0 && triIdx < p.numTriangles); | |
// If no geometry in entire warp, zero the output and exit. | |
// Otherwise force barys to zero and output with live threads. | |
if (__all_sync(0xffffffffu, !triValid)) | |
{ | |
for (int i=0; i < p.numAttr; i++) | |
out[i] = 0.f; | |
if (ENABLE_DA) | |
for (int i=0; i < p.numDiffAttr; i++) | |
outDA[i] = make_float2(0.f, 0.f); | |
return; | |
} | |
// Fetch vertex indices. | |
int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0; | |
int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0; | |
int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0; | |
// Bail out if corrupt indices. | |
if (vi0 < 0 || vi0 >= p.numVertices || | |
vi1 < 0 || vi1 >= p.numVertices || | |
vi2 < 0 || vi2 >= p.numVertices) | |
return; | |
// In instance mode, adjust vertex indices by minibatch index unless broadcasting. | |
if (p.instance_mode && !p.attrBC) | |
{ | |
vi0 += pz * p.numVertices; | |
vi1 += pz * p.numVertices; | |
vi2 += pz * p.numVertices; | |
} | |
// Pointers to attributes. | |
const float* a0 = p.attr + vi0 * p.numAttr; | |
const float* a1 = p.attr + vi1 * p.numAttr; | |
const float* a2 = p.attr + vi2 * p.numAttr; | |
// Barys. If no triangle, force all to zero -> output is zero. | |
float b0 = triValid ? r.x : 0.f; | |
float b1 = triValid ? r.y : 0.f; | |
float b2 = triValid ? (1.f - r.x - r.y) : 0.f; | |
// Interpolate and write attributes. | |
for (int i=0; i < p.numAttr; i++) | |
out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i]; | |
// No diff attrs? Exit. | |
if (!ENABLE_DA) | |
return; | |
// Read bary pixel differentials if we have a triangle. | |
float4 db = make_float4(0.f, 0.f, 0.f, 0.f); | |
if (triValid) | |
db = ((float4*)p.rastDB)[pidx]; | |
// Unpack a bit. | |
float dudx = db.x; | |
float dudy = db.y; | |
float dvdx = db.z; | |
float dvdy = db.w; | |
// Calculate the pixel differentials of chosen attributes. | |
for (int i=0; i < p.numDiffAttr; i++) | |
{ | |
// Input attribute index. | |
int j = p.diff_attrs_all ? i : p.diffAttrs[i]; | |
if (j < 0) | |
j += p.numAttr; // Python-style negative indices. | |
// Zero output if invalid index. | |
float dsdx = 0.f; | |
float dsdy = 0.f; | |
if (j >= 0 && j < p.numAttr) | |
{ | |
float s0 = a0[j]; | |
float s1 = a1[j]; | |
float s2 = a2[j]; | |
float dsdu = s0 - s2; | |
float dsdv = s1 - s2; | |
dsdx = dudx*dsdu + dvdx*dsdv; | |
dsdy = dudy*dsdu + dvdy*dsdv; | |
} | |
// Write. | |
outDA[i] = make_float2(dsdx, dsdy); | |
} | |
} | |
// Template specializations. | |
__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); } | |
__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); } | |
//------------------------------------------------------------------------ | |
// Gradient kernel. | |
template <bool ENABLE_DA> | |
static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p) | |
{ | |
// Temporary space for coalesced atomics. | |
CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT); | |
// Calculate pixel position. | |
int px = blockIdx.x * blockDim.x + threadIdx.x; | |
int py = blockIdx.y * blockDim.y + threadIdx.y; | |
int pz = blockIdx.z; | |
if (px >= p.width || py >= p.height || pz >= p.depth) | |
return; | |
// Pixel index. | |
int pidx = px + p.width * (py + p.height * pz); | |
// Fetch triangle ID. If none, output zero bary/db gradients and exit. | |
float4 r = ((float4*)p.rast)[pidx]; | |
int triIdx = float_to_triidx(r.w) - 1; | |
if (triIdx < 0 || triIdx >= p.numTriangles) | |
{ | |
((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); | |
if (ENABLE_DA) | |
((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); | |
return; | |
} | |
// Fetch vertex indices. | |
int vi0 = p.tri[triIdx * 3 + 0]; | |
int vi1 = p.tri[triIdx * 3 + 1]; | |
int vi2 = p.tri[triIdx * 3 + 2]; | |
// Bail out if corrupt indices. | |
if (vi0 < 0 || vi0 >= p.numVertices || | |
vi1 < 0 || vi1 >= p.numVertices || | |
vi2 < 0 || vi2 >= p.numVertices) | |
return; | |
// In instance mode, adjust vertex indices by minibatch index unless broadcasting. | |
if (p.instance_mode && !p.attrBC) | |
{ | |
vi0 += pz * p.numVertices; | |
vi1 += pz * p.numVertices; | |
vi2 += pz * p.numVertices; | |
} | |
// Initialize coalesced atomics. | |
CA_SET_GROUP(triIdx); | |
// Pointers to inputs. | |
const float* a0 = p.attr + vi0 * p.numAttr; | |
const float* a1 = p.attr + vi1 * p.numAttr; | |
const float* a2 = p.attr + vi2 * p.numAttr; | |
const float* pdy = p.dy + pidx * p.numAttr; | |
// Pointers to outputs. | |
float* ga0 = p.gradAttr + vi0 * p.numAttr; | |
float* ga1 = p.gradAttr + vi1 * p.numAttr; | |
float* ga2 = p.gradAttr + vi2 * p.numAttr; | |
// Barys and bary gradient accumulators. | |
float b0 = r.x; | |
float b1 = r.y; | |
float b2 = 1.f - r.x - r.y; | |
float gb0 = 0.f; | |
float gb1 = 0.f; | |
// Loop over attributes and accumulate attribute gradients. | |
for (int i=0; i < p.numAttr; i++) | |
{ | |
float y = pdy[i]; | |
float s0 = a0[i]; | |
float s1 = a1[i]; | |
float s2 = a2[i]; | |
gb0 += y * (s0 - s2); | |
gb1 += y * (s1 - s2); | |
caAtomicAdd(ga0 + i, b0 * y); | |
caAtomicAdd(ga1 + i, b1 * y); | |
caAtomicAdd(ga2 + i, b2 * y); | |
} | |
// Write the bary gradients. | |
((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f); | |
// If pixel differentials disabled, we're done. | |
if (!ENABLE_DA) | |
return; | |
// Calculate gradients based on attribute pixel differentials. | |
const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr; | |
float gdudx = 0.f; | |
float gdudy = 0.f; | |
float gdvdx = 0.f; | |
float gdvdy = 0.f; | |
// Read bary pixel differentials. | |
float4 db = ((float4*)p.rastDB)[pidx]; | |
float dudx = db.x; | |
float dudy = db.y; | |
float dvdx = db.z; | |
float dvdy = db.w; | |
for (int i=0; i < p.numDiffAttr; i++) | |
{ | |
// Input attribute index. | |
int j = p.diff_attrs_all ? i : p.diffAttrs[i]; | |
if (j < 0) | |
j += p.numAttr; // Python-style negative indices. | |
// Check that index is valid. | |
if (j >= 0 && j < p.numAttr) | |
{ | |
float2 dsdxy = dda[i]; | |
float dsdx = dsdxy.x; | |
float dsdy = dsdxy.y; | |
float s0 = a0[j]; | |
float s1 = a1[j]; | |
float s2 = a2[j]; | |
// Gradients of db. | |
float dsdu = s0 - s2; | |
float dsdv = s1 - s2; | |
gdudx += dsdu * dsdx; | |
gdudy += dsdu * dsdy; | |
gdvdx += dsdv * dsdx; | |
gdvdy += dsdv * dsdy; | |
// Gradients of attributes. | |
float du = dsdx*dudx + dsdy*dudy; | |
float dv = dsdx*dvdx + dsdy*dvdy; | |
caAtomicAdd(ga0 + j, du); | |
caAtomicAdd(ga1 + j, dv); | |
caAtomicAdd(ga2 + j, -du - dv); | |
} | |
} | |
// Write. | |
((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy); | |
} | |
// Template specializations. | |
__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); } | |
__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); } | |
//------------------------------------------------------------------------ | |