Spaces:

microsoft
/

TRELLIS

Running on Zero

App Files Files Community

TRELLIS / extensions /nvdiffrast /nvdiffrast /common /interpolate.cu

JeffreyXiang

update

15fe7bc 6 months ago

raw

history blame

8.88 kB

	// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
	//
	// NVIDIA CORPORATION and its licensors retain all intellectual property
	// and proprietary rights in and to this software, related documentation
	// and any modifications thereto. Any use, reproduction, disclosure or
	// distribution of this software and related documentation without an express
	// license agreement from NVIDIA CORPORATION is strictly prohibited.

	#include "common.h"
	#include "interpolate.h"

	//------------------------------------------------------------------------
	// Forward kernel.

	template <bool ENABLE_DA>
	static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p)
	{
	// Calculate pixel position.
	int px = blockIdx.x * blockDim.x + threadIdx.x;
	int py = blockIdx.y * blockDim.y + threadIdx.y;
	int pz = blockIdx.z;
	if (px >= p.width \|\| py >= p.height \|\| pz >= p.depth)
	return;

	// Pixel index.
	int pidx = px + p.width * (py + p.height * pz);

	// Output ptrs.
	float* out = p.out + pidx * p.numAttr;
	float2* outDA = ENABLE_DA ? (((float2)p.outDA) + pidx p.numDiffAttr) : 0;

	// Fetch rasterizer output.
	float4 r = ((float4*)p.rast)[pidx];
	int triIdx = float_to_triidx(r.w) - 1;
	bool triValid = (triIdx >= 0 && triIdx < p.numTriangles);

	// If no geometry in entire warp, zero the output and exit.
	// Otherwise force barys to zero and output with live threads.
	if (__all_sync(0xffffffffu, !triValid))
	{
	for (int i=0; i < p.numAttr; i++)
	out[i] = 0.f;
	if (ENABLE_DA)
	for (int i=0; i < p.numDiffAttr; i++)
	outDA[i] = make_float2(0.f, 0.f);
	return;
	}

	// Fetch vertex indices.
	int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0;
	int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0;
	int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0;

	// Bail out if corrupt indices.
	if (vi0 < 0 \|\| vi0 >= p.numVertices \|\|
	vi1 < 0 \|\| vi1 >= p.numVertices \|\|
	vi2 < 0 \|\| vi2 >= p.numVertices)
	return;

	// In instance mode, adjust vertex indices by minibatch index unless broadcasting.
	if (p.instance_mode && !p.attrBC)
	{
	vi0 += pz * p.numVertices;
	vi1 += pz * p.numVertices;
	vi2 += pz * p.numVertices;
	}

	// Pointers to attributes.
	const float* a0 = p.attr + vi0 * p.numAttr;
	const float* a1 = p.attr + vi1 * p.numAttr;
	const float* a2 = p.attr + vi2 * p.numAttr;

	// Barys. If no triangle, force all to zero -> output is zero.
	float b0 = triValid ? r.x : 0.f;
	float b1 = triValid ? r.y : 0.f;
	float b2 = triValid ? (1.f - r.x - r.y) : 0.f;

	// Interpolate and write attributes.
	for (int i=0; i < p.numAttr; i++)
	out[i] = b0a0[i] + b1a1[i] + b2*a2[i];

	// No diff attrs? Exit.
	if (!ENABLE_DA)
	return;

	// Read bary pixel differentials if we have a triangle.
	float4 db = make_float4(0.f, 0.f, 0.f, 0.f);
	if (triValid)
	db = ((float4*)p.rastDB)[pidx];

	// Unpack a bit.
	float dudx = db.x;
	float dudy = db.y;
	float dvdx = db.z;
	float dvdy = db.w;

	// Calculate the pixel differentials of chosen attributes.
	for (int i=0; i < p.numDiffAttr; i++)
	{
	// Input attribute index.
	int j = p.diff_attrs_all ? i : p.diffAttrs[i];
	if (j < 0)
	j += p.numAttr; // Python-style negative indices.

	// Zero output if invalid index.
	float dsdx = 0.f;
	float dsdy = 0.f;
	if (j >= 0 && j < p.numAttr)
	{
	float s0 = a0[j];
	float s1 = a1[j];
	float s2 = a2[j];
	float dsdu = s0 - s2;
	float dsdv = s1 - s2;
	dsdx = dudxdsdu + dvdxdsdv;
	dsdy = dudydsdu + dvdydsdv;
	}

	// Write.
	outDA[i] = make_float2(dsdx, dsdy);
	}
	}

	// Template specializations.
	__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); }
	__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); }

	//------------------------------------------------------------------------
	// Gradient kernel.

	template <bool ENABLE_DA>
	static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p)
	{
	// Temporary space for coalesced atomics.
	CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

	// Calculate pixel position.
	int px = blockIdx.x * blockDim.x + threadIdx.x;
	int py = blockIdx.y * blockDim.y + threadIdx.y;
	int pz = blockIdx.z;
	if (px >= p.width \|\| py >= p.height \|\| pz >= p.depth)
	return;

	// Pixel index.
	int pidx = px + p.width * (py + p.height * pz);

	// Fetch triangle ID. If none, output zero bary/db gradients and exit.
	float4 r = ((float4*)p.rast)[pidx];
	int triIdx = float_to_triidx(r.w) - 1;
	if (triIdx < 0 \|\| triIdx >= p.numTriangles)
	{
	((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
	if (ENABLE_DA)
	((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
	return;
	}

	// Fetch vertex indices.
	int vi0 = p.tri[triIdx * 3 + 0];
	int vi1 = p.tri[triIdx * 3 + 1];
	int vi2 = p.tri[triIdx * 3 + 2];

	// Bail out if corrupt indices.
	if (vi0 < 0 \|\| vi0 >= p.numVertices \|\|
	vi1 < 0 \|\| vi1 >= p.numVertices \|\|
	vi2 < 0 \|\| vi2 >= p.numVertices)
	return;

	// In instance mode, adjust vertex indices by minibatch index unless broadcasting.
	if (p.instance_mode && !p.attrBC)
	{
	vi0 += pz * p.numVertices;
	vi1 += pz * p.numVertices;
	vi2 += pz * p.numVertices;
	}

	// Initialize coalesced atomics.
	CA_SET_GROUP(triIdx);

	// Pointers to inputs.
	const float* a0 = p.attr + vi0 * p.numAttr;
	const float* a1 = p.attr + vi1 * p.numAttr;
	const float* a2 = p.attr + vi2 * p.numAttr;
	const float* pdy = p.dy + pidx * p.numAttr;

	// Pointers to outputs.
	float* ga0 = p.gradAttr + vi0 * p.numAttr;
	float* ga1 = p.gradAttr + vi1 * p.numAttr;
	float* ga2 = p.gradAttr + vi2 * p.numAttr;

	// Barys and bary gradient accumulators.
	float b0 = r.x;
	float b1 = r.y;
	float b2 = 1.f - r.x - r.y;
	float gb0 = 0.f;
	float gb1 = 0.f;

	// Loop over attributes and accumulate attribute gradients.
	for (int i=0; i < p.numAttr; i++)
	{
	float y = pdy[i];
	float s0 = a0[i];
	float s1 = a1[i];
	float s2 = a2[i];
	gb0 += y * (s0 - s2);
	gb1 += y * (s1 - s2);
	caAtomicAdd(ga0 + i, b0 * y);
	caAtomicAdd(ga1 + i, b1 * y);
	caAtomicAdd(ga2 + i, b2 * y);
	}

	// Write the bary gradients.
	((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f);

	// If pixel differentials disabled, we're done.
	if (!ENABLE_DA)
	return;

	// Calculate gradients based on attribute pixel differentials.
	const float2* dda = ((float2)p.dda) + pidx p.numDiffAttr;
	float gdudx = 0.f;
	float gdudy = 0.f;
	float gdvdx = 0.f;
	float gdvdy = 0.f;

	// Read bary pixel differentials.
	float4 db = ((float4*)p.rastDB)[pidx];
	float dudx = db.x;
	float dudy = db.y;
	float dvdx = db.z;
	float dvdy = db.w;

	for (int i=0; i < p.numDiffAttr; i++)
	{
	// Input attribute index.
	int j = p.diff_attrs_all ? i : p.diffAttrs[i];
	if (j < 0)
	j += p.numAttr; // Python-style negative indices.

	// Check that index is valid.
	if (j >= 0 && j < p.numAttr)
	{
	float2 dsdxy = dda[i];
	float dsdx = dsdxy.x;
	float dsdy = dsdxy.y;

	float s0 = a0[j];
	float s1 = a1[j];
	float s2 = a2[j];

	// Gradients of db.
	float dsdu = s0 - s2;
	float dsdv = s1 - s2;
	gdudx += dsdu * dsdx;
	gdudy += dsdu * dsdy;
	gdvdx += dsdv * dsdx;
	gdvdy += dsdv * dsdy;

	// Gradients of attributes.
	float du = dsdxdudx + dsdydudy;
	float dv = dsdxdvdx + dsdydvdy;
	caAtomicAdd(ga0 + j, du);
	caAtomicAdd(ga1 + j, dv);
	caAtomicAdd(ga2 + j, -du - dv);
	}
	}

	// Write.
	((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy);
	}

	// Template specializations.
	__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); }
	__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); }

	//------------------------------------------------------------------------