Spaces:

geyik1
/

dnm3d

Sleeping

App Files Files Community

dnm3d / extensions /nvdiffrast /common /extensions_nvdiffrast_nvdiffrast_common_texture_.cu

geyik1

Upload 174 files

111ba01 verified 7 months ago

raw

history blame

47.8 kB

	// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
	//
	// NVIDIA CORPORATION and its licensors retain all intellectual property
	// and proprietary rights in and to this software, related documentation
	// and any modifications thereto. Any use, reproduction, disclosure or
	// distribution of this software and related documentation without an express
	// license agreement from NVIDIA CORPORATION is strictly prohibited.

	#include "common.h"
	#include "texture.h"

	//------------------------------------------------------------------------
	// Memory access and math helpers.

	static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; }
	static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; }
	static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2s] += b.z c; a[3s] += b.w c; }
	static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; }
	static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; }
	static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2s]; v.w += b[3s]; a = v; }
	static __device__ __forceinline__ bool isfinite_vec3(const float3& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z); }
	static __device__ __forceinline__ bool isfinite_vec4(const float4& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z) && isfinite(a.w); }
	template<class T> static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); }
	template<class T> static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); }

	//------------------------------------------------------------------------
	// Cube map wrapping for smooth filtering across edges and corners. At corners,
	// one of the texture coordinates will be negative. For correct interpolation,
	// the missing texel must take the average color of the other three.

	static __constant__ uint32_t c_cubeWrapMask1[48] =
	{
	0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01,
	0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01,
	0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219,
	0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219,
	0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11,
	0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11,
	};

	static __constant__ uint8_t c_cubeWrapMask2[48] =
	{
	0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40,
	0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64,
	0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44,
	};

	static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w)
	{
	// Calculate case number.
	int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1;
	int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3;
	int c = cx + cy;
	if (c >= 5)
	c--;
	c = (face << 3) + c;

	// Compute coordinates and faces.
	unsigned int m = c_cubeWrapMask1[c];
	int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0;
	int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0;
	int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1;
	int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1;
	int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0;
	int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0;
	int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1;
	int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1;
	int f0 = ((m >> 16) & 15) - 1;
	int f1 = ((m >> 20) & 15) - 1;
	int f2 = ((m >> 24) & 15) - 1;
	int f3 = ((m >> 28) ) - 1;

	// Flips.
	unsigned int f = c_cubeWrapMask2[c];
	int w1 = w - 1;
	if (f & 0x01) x0 = w1 - x0;
	if (f & 0x02) x1 = w1 - x1;
	if (f & 0x04) x2 = w1 - x2;
	if (f & 0x08) x3 = w1 - x3;
	if (f & 0x10) y0 = w1 - y0;
	if (f & 0x20) y1 = w1 - y1;
	if (f & 0x40) y2 = w1 - y2;
	if (f & 0x80) y3 = w1 - y3;

	// Done.
	int4 tcOut;
	tcOut.x = x0 + (y0 + f0 * w) * w;
	tcOut.y = x1 + (y1 + f1 * w) * w;
	tcOut.z = x2 + (y2 + f2 * w) * w;
	tcOut.w = x3 + (y3 + f3 * w) * w;
	return tcOut;
	}

	//------------------------------------------------------------------------
	// Cube map indexing and gradient functions.

	// Map a 3D lookup vector into an (s,t) face coordinates (returned in first .
	// two parameters) and face index.
	static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z)
	{
	float ax = fabsf(x);
	float ay = fabsf(y);
	float az = fabsf(z);
	int idx;
	float c;
	if (az > fmaxf(ax, ay)) { idx = 4; c = z; }
	else if (ay > ax) { idx = 2; c = y; y = z; }
	else { idx = 0; c = x; x = z; }
	if (c < 0.f) idx += 1;
	float m = __frcp_rz(fabsf(c)) * .5;
	float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31));
	float m1 = (idx != 2) ? -m : m;
	x = x * m0 + .5;
	y = y * m1 + .5;
	if (!isfinite(x) \|\| !isfinite(y))
	return -1; // Invalid uv.
	x = fminf(fmaxf(x, 0.f), 1.f);
	y = fminf(fmaxf(y, 0.f), 1.f);
	return idx;
	}

	// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector.
	static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv)
	{
	float ax = fabsf(uv.x);
	float ay = fabsf(uv.y);
	float az = fabsf(uv.z);
	int idx;
	float c;
	float c0 = gu;
	float c1 = gv;
	if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; }
	else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; }
	else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; }
	if (c < 0.f) idx += idx;
	float m = __frcp_rz(fabsf(c));
	c0 = (idx & 0x34) ? -c0 : c0;
	c1 = (idx & 0x2e) ? -c1 : c1;
	float gl = (c0 + c1) * m;
	float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu;
	float gy = (idx & 0x0c) ? gl : -gv;
	float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv;
	gz = (idx & 0x09) ? -gz : gz;
	float3 res = make_float3(gx, gy, gz) * (m * .5f);
	if (!isfinite_vec3(res))
	return make_float3(0.f, 0.f, 0.f); // Invalid uv.
	return res;
	}

	// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two
	// indexCubeMapGrad() functions rolled together.
	static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1)
	{
	float ax = fabsf(uv.x);
	float ay = fabsf(uv.y);
	float az = fabsf(uv.z);
	int idx;
	float c, c0, c1;
	if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; }
	else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; }
	else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; }
	if (c < 0.f) idx += idx;
	float m = __frcp_rz(fabsf(c));
	c0 = (idx & 0x34) ? -c0 : c0;
	c1 = (idx & 0x2e) ? -c1 : c1;
	float gl0 = (dw.x * c0 + dw.z * c1) * m;
	float gl1 = (dw.y * c0 + dw.w * c1) * m;
	float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x;
	float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y;
	float gy0 = (idx & 0x0c) ? gl0 : -dw.z;
	float gy1 = (idx & 0x0c) ? gl1 : -dw.w;
	float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z;
	float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w;
	if (idx & 0x09)
	{
	gz0 = -gz0;
	gz1 = -gz1;
	}
	g0 = make_float3(gx0, gy0, gz0) * (m * .5f);
	g1 = make_float3(gx1, gy1, gz1) * (m * .5f);
	if (!isfinite_vec3(g0) \|\| !isfinite_vec3(g1))
	{
	g0 = make_float3(0.f, 0.f, 0.f); // Invalid uv.
	g1 = make_float3(0.f, 0.f, 0.f);
	}
	}

	// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector.
	// Result is (ds/dX, ds/dY, dt/dX, dt/dY).
	static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY)
	{
	float ax = fabsf(uv.x);
	float ay = fabsf(uv.y);
	float az = fabsf(uv.z);
	int idx;
	float c, gu, gv;
	if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
	else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
	else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
	if (c < 0.f) idx += idx;
	if (idx & 0x09)
	{
	dvdX.z = -dvdX.z;
	dvdY.z = -dvdY.z;
	}
	float m = __frcp_rz(fabsf(c));
	float dm = m * .5f;
	float mm = m * dm;
	gu *= (idx & 0x34) ? -mm : mm;
	gv *= (idx & 0x2e) ? -mm : mm;

	float4 res;
	if (idx & 0x03)
	{
	res = make_float4(gu * dvdX.x + dm * dvdX.z,
	gu * dvdY.x + dm * dvdY.z,
	gv * dvdX.x - dm * dvdX.y,
	gv * dvdY.x - dm * dvdY.y);
	}
	else if (idx & 0x0c)
	{
	res = make_float4(gu * dvdX.y + dm * dvdX.x,
	gu * dvdY.y + dm * dvdY.x,
	gv * dvdX.y + dm * dvdX.z,
	gv * dvdY.y + dm * dvdY.z);
	}
	else // (idx & 0x30)
	{
	res = make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x,
	gu * dvdY.z + copysignf(dm, c) * dvdY.x,
	gv * dvdX.z - dm * dvdX.y,
	gv * dvdY.z - dm * dvdY.y);
	}

	if (!isfinite_vec4(res))
	return make_float4(0.f, 0.f, 0.f, 0.f);

	return res;
	}

	// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face
	// coordinates change w.r.t. 3D texture coordinate vector, returned as follows:
	// \| d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx \|
	// \| d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy \|
	// \| d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz \|
	static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz)
	{
	float ax = fabsf(uv.x);
	float ay = fabsf(uv.y);
	float az = fabsf(uv.z);
	int idx;
	float c, gu, gv;
	if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
	else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
	else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
	if (c < 0.f) idx += idx;

	if (idx & 0x09)
	{
	dvdX.z = -dvdX.z;
	dvdY.z = -dvdY.z;
	}

	float m = __frcp_rz(c);
	float dm = -m * fabsf(m) * .5;
	float mm = m * m * .5;
	float mu = (idx & 0x34) ? -mm : mm;
	float mv = (idx & 0x2e) ? -mm : mm;
	gu = -2.0 m * mu;
	gv = -2.0 m * mv;

	if (idx & 0x03)
	{
	dx.x = gu * dvdX.x + dm * dvdX.z;
	dx.y = gu * dvdY.x + dm * dvdY.z;
	dx.z = gv * dvdX.x - dm * dvdX.y;
	dx.w = gv * dvdY.x - dm * dvdY.y;
	dy.x = 0.f;
	dy.y = 0.f;
	dy.z = mv * dvdX.x;
	dy.w = mv * dvdY.x;
	dz.x = mu * dvdX.x;
	dz.y = mu * dvdY.x;
	dz.z = 0.f;
	dz.w = 0.f;
	}
	else if (idx & 0x0c)
	{
	dx.x = mu * dvdX.y;
	dx.y = mu * dvdY.y;
	dx.z = 0.f;
	dx.w = 0.f;
	dy.x = gu * dvdX.y + dm * dvdX.x;
	dy.y = gu * dvdY.y + dm * dvdY.x;
	dy.z = gv * dvdX.y + dm * dvdX.z;
	dy.w = gv * dvdY.y + dm * dvdY.z;
	dz.x = 0.f;
	dz.y = 0.f;
	dz.z = mv * dvdX.y;
	dz.w = mv * dvdY.y;
	}
	else // (idx & 0x30)
	{
	dx.x = mu * dvdX.z;
	dx.y = mu * dvdY.z;
	dx.z = 0.f;
	dx.w = 0.f;
	dy.x = 0.f;
	dy.y = 0.f;
	dy.z = mv * dvdX.z;
	dy.w = mv * dvdY.z;
	dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x;
	dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x;
	dz.z = gv * dvdX.z - dm * dvdX.y;
	dz.w = gv * dvdY.z - dm * dvdY.y;
	}
	}

	//------------------------------------------------------------------------
	// General texture indexing.

	template <bool CUBE_MODE>
	static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz)
	{
	int w = p.texWidth;
	int h = p.texHeight;
	float u = uv.x;
	float v = uv.y;

	// Cube map indexing.
	if (CUBE_MODE)
	{
	// No wrap. Fold face index into tz right away.
	int idx = indexCubeMap(u, v, uv.z); // Rewrites u, v.
	if (idx < 0)
	return -1; // Invalid uv.
	tz = 6 * tz + idx;
	}
	else
	{
	// Handle boundary.
	if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
	{
	u = u - (float)__float2int_rd(u);
	v = v - (float)__float2int_rd(v);
	}
	}

	u = u * (float)w;
	v = v * (float)h;

	int iu = __float2int_rd(u);
	int iv = __float2int_rd(v);

	// In zero boundary mode, return texture address -1.
	if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
	{
	if (iu < 0 \|\| iu >= w \|\| iv < 0 \|\| iv >= h)
	return -1;
	}

	// Otherwise clamp and calculate the coordinate properly.
	iu = min(max(iu, 0), w-1);
	iv = min(max(iv, 0), h-1);
	return iu + w * (iv + tz * h);
	}

	template <bool CUBE_MODE>
	static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level)
	{
	// Mip level size.
	int2 sz = mipLevelSize(p, level);
	int w = sz.x;
	int h = sz.y;

	// Compute texture-space u, v.
	float u = uv.x;
	float v = uv.y;
	bool clampU = false;
	bool clampV = false;

	// Cube map indexing.
	int face = 0;
	if (CUBE_MODE)
	{
	// Neither clamp or wrap.
	face = indexCubeMap(u, v, uv.z); // Rewrites u, v.
	if (face < 0)
	{
	tcOut.x = tcOut.y = tcOut.z = tcOut.w = -1; // Invalid uv.
	return make_float2(0.f, 0.f);
	}
	u = u * (float)w - 0.5f;
	v = v * (float)h - 0.5f;
	}
	else
	{
	if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
	{
	// Wrap.
	u = u - (float)__float2int_rd(u);
	v = v - (float)__float2int_rd(v);
	}

	// Move to texel space.
	u = u * (float)w - 0.5f;
	v = v * (float)h - 0.5f;

	if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP)
	{
	// Clamp to center of edge texels.
	u = fminf(fmaxf(u, 0.f), w - 1.f);
	v = fminf(fmaxf(v, 0.f), h - 1.f);
	clampU = (u == 0.f \|\| u == w - 1.f);
	clampV = (v == 0.f \|\| v == h - 1.f);
	}
	}

	// Compute texel coordinates and weights.
	int iu0 = __float2int_rd(u);
	int iv0 = __float2int_rd(v);
	int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients with clamped.
	int iv1 = iv0 + (clampV ? 0 : 1);
	u -= (float)iu0;
	v -= (float)iv0;

	// Cube map wrapping.
	bool cubeWrap = CUBE_MODE && (iu0 < 0 \|\| iv0 < 0 \|\| iu1 >= w \|\| iv1 >= h);
	if (cubeWrap)
	{
	tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w);
	tcOut += 6 * tz * w * h; // Bring in tz.
	return make_float2(u, v); // Done.
	}

	// Fold cube map face into tz.
	if (CUBE_MODE)
	tz = 6 * tz + face;

	// Wrap overflowing texel indices.
	if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
	{
	if (iu0 < 0) iu0 += w;
	if (iv0 < 0) iv0 += h;
	if (iu1 >= w) iu1 -= w;
	if (iv1 >= h) iv1 -= h;
	}

	// Coordinates with tz folded in.
	int iu0z = iu0 + tz * w * h;
	int iu1z = iu1 + tz * w * h;
	tcOut.x = iu0z + w * iv0;
	tcOut.y = iu1z + w * iv0;
	tcOut.z = iu0z + w * iv1;
	tcOut.w = iu1z + w * iv1;

	// Invalidate texture addresses outside unit square if we are in zero mode.
	if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
	{
	bool iu0_out = (iu0 < 0 \|\| iu0 >= w);
	bool iu1_out = (iu1 < 0 \|\| iu1 >= w);
	bool iv0_out = (iv0 < 0 \|\| iv0 >= h);
	bool iv1_out = (iv1 < 0 \|\| iv1 >= h);
	if (iu0_out \|\| iv0_out) tcOut.x = -1;
	if (iu1_out \|\| iv0_out) tcOut.y = -1;
	if (iu0_out \|\| iv1_out) tcOut.z = -1;
	if (iu1_out \|\| iv1_out) tcOut.w = -1;
	}

	// All done.
	return make_float2(u, v);
	}

	//------------------------------------------------------------------------
	// Mip level calculation.

	template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
	static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
	{
	// Do nothing if mips not in use.
	if (FILTER_MODE == TEX_MODE_NEAREST \|\| FILTER_MODE == TEX_MODE_LINEAR)
	return;

	// Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero.
	if (!BIAS_ONLY)
	{
	// Get pixel derivatives of texture coordinates.
	float4 uvDA;
	float3 dvdX, dvdY; // Gradients use these later.
	if (CUBE_MODE)
	{
	// Fetch.
	float2 d0 = ((const float2)p.uvDA)[3 pidx + 0];
	float2 d1 = ((const float2)p.uvDA)[3 pidx + 1];
	float2 d2 = ((const float2)p.uvDA)[3 pidx + 2];

	// Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}.
	dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX
	dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY
	uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y}
	}
	else
	{
	// Fetch.
	uvDA = ((const float4*)p.uvDA)[pidx];
	}

	// Scaling factors.
	float uscl = p.texWidth;
	float vscl = p.texHeight;

	// d[s,t]/d[X,Y].
	float dsdx = uvDA.x * uscl;
	float dsdy = uvDA.y * uscl;
	float dtdx = uvDA.z * vscl;
	float dtdy = uvDA.w * vscl;

	// Calculate footprint axis lengths.
	float A = dsdxdsdx + dtdxdtdx;
	float B = dsdydsdy + dtdydtdy;
	float C = dsdxdsdy + dtdxdtdy;
	float l2b = 0.5 * (A + B);
	float l2n = 0.25 * (A-B)(A-B) + CC;
	float l2a = sqrt(l2n);
	float lenMinorSqr = fmaxf(0.0, l2b - l2a);
	float lenMajorSqr = l2b + l2a;

	// Footprint vs. mip level gradient.
	if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
	{
	float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2).
	float AB = dw * .5f * (A - B);
	float Cw = dw * C;
	float l2aw = dw * l2a;
	float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw);
	float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw);
	float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw);
	float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw);

	float4 d_f_dw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY);
	if (!CUBE_MODE)
	*pdw = isfinite_vec4(d_f_dw) ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f);

	// In cube maps, there is also a texture coordinate vs. mip level gradient.
	// Only output nonzero vectors if both are free of inf/Nan garbage.
	if (CUBE_MODE)
	{
	float4 dx, dy, dz;
	indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz);
	float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x);
	float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y);
	float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z);
	float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w);

	float3 d_f_dv = make_float3(0.f, 0.f, 0.f);
	d_f_dv += d_dsdX_dv * d_f_ddsdX;
	d_f_dv += d_dsdY_dv * d_f_ddsdY;
	d_f_dv += d_dtdX_dv * d_f_ddtdX;
	d_f_dv += d_dtdY_dv * d_f_ddtdY;

	bool finite = isfinite_vec4(d_f_dw) && isfinite_vec3(d_f_dv);
	*pdw = finite ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f);
	*pdfdv = finite ? d_f_dv : make_float3(0.f, 0.f, 0.f);
	}
	}

	// Finally, calculate mip level.
	flevel = .5f * __log2f(lenMajorSqr); // May be inf/NaN, but clamp fixes it.
	}

	// Bias the mip level and clamp.
	if (p.mipLevelBias)
	flevel += p.mipLevelBias[pidx];
	flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);

	// Calculate levels depending on filter mode.
	level0 = __float2int_rd(flevel);

	// Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode.
	if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f)
	{
	level1 = min(level0 + 1, p.mipLevelMax);
	flevel -= level0; // Fractional part. Zero if clamped on last level.
	}
	}

	//------------------------------------------------------------------------
	// Texel fetch and accumulator helpers that understand cube map corners.

	template<class T>
	static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner)
	{
	// For invalid cube map uv, tc will be all negative, and all texel values will be zero.
	if (corner)
	{
	T avg = zero_value<T>();
	if (tc.x >= 0) avg += (a00 = ((const T)&pIn[tc.x]));
	if (tc.y >= 0) avg += (a10 = ((const T)&pIn[tc.y]));
	if (tc.z >= 0) avg += (a01 = ((const T)&pIn[tc.z]));
	if (tc.w >= 0) avg += (a11 = ((const T)&pIn[tc.w]));
	avg *= 0.33333333f;
	if (tc.x < 0) a00 = avg;
	if (tc.y < 0) a10 = avg;
	if (tc.z < 0) a01 = avg;
	if (tc.w < 0) a11 = avg;
	}
	else
	{
	a00 = (tc.x >= 0) ? ((const T)&pIn[tc.x]) : zero_value<T>();
	a10 = (tc.y >= 0) ? ((const T)&pIn[tc.y]) : zero_value<T>();
	a01 = (tc.z >= 0) ? ((const T)&pIn[tc.z]) : zero_value<T>();
	a11 = (tc.w >= 0) ? ((const T)&pIn[tc.w]) : zero_value<T>();
	}
	}

	static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM)
	{
	// For invalid cube map uv, tc will be all negative, and no accumulation will take place.
	if (corner)
	{
	float cb;
	if (tc.x < 0) cb = c.x;
	if (tc.y < 0) cb = c.y;
	if (tc.z < 0) cb = c.z;
	if (tc.w < 0) cb = c.w;
	cb *= 0.33333333f;
	if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb);
	if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb);
	if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb);
	if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb);
	}
	else
	{
	if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x);
	if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y);
	if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z);
	if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w);
	}
	}

	//------------------------------------------------------------------------
	// Mip builder kernel.

	template<class T, int C>
	static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p)
	{
	// Sizes.
	int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1);
	int2 sz_out = mipLevelSize(p, p.mipLevelOut);

	// Calculate pixel position.
	int px = blockIdx.x * blockDim.x + threadIdx.x;
	int py = blockIdx.y * blockDim.y + threadIdx.y;
	int pz = blockIdx.z;
	if (px >= sz_out.x \|\| py >= sz_out.y)
	return;

	// Pixel indices.
	int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y));
	int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down.
	int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz));

	// Input and output pointers.
	const float* pin = p.tex[p.mipLevelOut - 1];
	float* pout = (float*)p.tex[p.mipLevelOut];

	// Special case: Input texture height or width is 1.
	if (sz_in.x == 1 \|\| sz_in.y == 1)
	{
	if (sz_in.y == 1)
	pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right.

	for (int i=0; i < p.channels; i += C)
	{
	T v0 = ((const T)&pin[pidx_in0 + i]);
	T v1 = ((const T)&pin[pidx_in1 + i]);
	T avg = .5f * (v0 + v1);
	#if TEX_DEBUG_MIP_RETAIN_VARIANCE
	avg = (avg - .5f) * 1.41421356f + .5f;
	#endif
	((T)&pout[pidx_out + i]) = avg;
	}

	return;
	}

	for (int i=0; i < p.channels; i += C)
	{
	T v0 = ((const T)&pin[pidx_in0 + i]);
	T v1 = ((const T)&pin[pidx_in0 + i + p.channels]);
	T v2 = ((const T)&pin[pidx_in1 + i]);
	T v3 = ((const T)&pin[pidx_in1 + i + p.channels]);
	T avg = .25f * (v0 + v1 + v2 + v3);
	#if TEX_DEBUG_MIP_RETAIN_VARIANCE
	avg = (avg - .5f) * 2.f + .5f;
	#endif
	((T)&pout[pidx_out + i]) = avg;
	}
	}

	// Template specializations.
	__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate<float, 1>(p); }
	__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate<float2, 2>(p); }
	__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate<float4, 4>(p); }

	//------------------------------------------------------------------------
	// Forward kernel.

	template <class T, int C, bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
	static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
	{
	// Calculate pixel position.
	int px = blockIdx.x * blockDim.x + threadIdx.x;
	int py = blockIdx.y * blockDim.y + threadIdx.y;
	int pz = blockIdx.z;
	int tz = (p.texDepth == 1) ? 0 : pz;
	if (px >= p.imgWidth \|\| py >= p.imgHeight \|\| pz >= p.n)
	return;

	// Pixel index.
	int pidx = px + p.imgWidth * (py + p.imgHeight * pz);

	// Output ptr.
	float* pOut = p.out + pidx * p.channels;

	// Get UV.
	float3 uv;
	if (CUBE_MODE)
	uv = ((const float3*)p.uv)[pidx];
	else
	uv = make_float3(((const float2*)p.uv)[pidx], 0.f);

	// Nearest mode.
	if (FILTER_MODE == TEX_MODE_NEAREST)
	{
	int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
	tc *= p.channels;
	const float* pIn = p.tex[0];

	// Copy if valid tc, otherwise output zero.
	for (int i=0; i < p.channels; i += C)
	((T)&pOut[i]) = (tc >= 0) ? ((const T)&pIn[tc + i]) : zero_value<T>();

	return; // Exit.
	}

	// Calculate mip level. In 'linear' mode these will all stay zero.
	float flevel = 0.f; // Fractional level.
	int level0 = 0; // Discrete level 0.
	int level1 = 0; // Discrete level 1.
	calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);

	// Get texel indices and pointer for level 0.
	int4 tc0 = make_int4(0, 0, 0, 0);
	float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
	const float* pIn0 = p.tex[level0];
	bool corner0 = CUBE_MODE && ((tc0.x \| tc0.y \| tc0.z \| tc0.w) < 0);
	tc0 *= p.channels;

	// Bilinear fetch.
	if (FILTER_MODE == TEX_MODE_LINEAR \|\| FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
	{
	// Interpolate.
	for (int i=0; i < p.channels; i += C, tc0 += C)
	{
	T a00, a10, a01, a11;
	fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
	((T)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0);
	}
	return; // Exit.
	}

	// Get texel indices and pointer for level 1.
	int4 tc1 = make_int4(0, 0, 0, 0);
	float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
	const float* pIn1 = p.tex[level1];
	bool corner1 = CUBE_MODE && ((tc1.x \| tc1.y \| tc1.z \| tc1.w) < 0);
	tc1 *= p.channels;

	// Trilinear fetch.
	for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C)
	{
	// First level.
	T a00, a10, a01, a11;
	fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0);
	T a = bilerp(a00, a10, a01, a11, uv0);

	// Second level unless in magnification mode.
	if (flevel > 0.f)
	{
	T b00, b10, b01, b11;
	fetchQuad<T>(b00, b10, b01, b11, pIn1, tc1, corner1);
	T b = bilerp(b00, b10, b01, b11, uv1);
	a = lerp(a, b, flevel); // Interpolate between levels.
	}

	// Write.
	((T)&pOut[i]) = a;
	}
	}

	// Template specializations.
	__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }

	//------------------------------------------------------------------------
	// Gradient mip puller kernel.

	template<class T, int C>
	static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p)
	{
	// Calculate pixel position.
	int px = blockIdx.x * blockDim.x + threadIdx.x;
	int py = blockIdx.y * blockDim.y + threadIdx.y;
	int pz = blockIdx.z;
	if (px >= p.texWidth \|\| py >= p.texHeight)
	return;

	// Number of wide elements.
	int c = p.channels;
	if (C == 2) c >>= 1;
	if (C == 4) c >>= 2;

	// Dynamically allocated shared memory for holding a texel.
	extern __shared__ float s_texelAccum[];
	int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x;
	int sharedStride = blockDim.x * blockDim.y;
	# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride))

	// Clear the texel.
	for (int i=0; i < p.channels; i++)
	*TEXEL_ACCUM(i) = 0.f;

	// Track texel position and accumulation weight over the mip stack.
	int x = px;
	int y = py;
	float w = 1.f;

	// Pull gradients from all levels.
	int2 sz = mipLevelSize(p, 0); // Previous level size.
	for (int level=1; level <= p.mipLevelMax; level++)
	{
	// Weight decay depends on previous level size.
	if (sz.x > 1) w *= .5f;
	if (sz.y > 1) w *= .5f;

	// Current level size and coordinates.
	sz = mipLevelSize(p, level);
	x >>= 1;
	y >>= 1;

	T* pIn = (T)(p.gradTex[level] + (x + sz.x (y + sz.y * pz)) * p.channels);
	for (int i=0; i < c; i++)
	accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w);
	}

	// Add to main texture gradients.
	T* pOut = (T)(p.gradTex[0] + (px + p.texWidth (py + p.texHeight * pz)) * p.channels);
	for (int i=0; i < c; i++)
	accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride);
	}

	// Template specializations.
	__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate<float, 1>(p); }
	__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate<float2, 2>(p); }
	__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate<float4, 4>(p); }

	//------------------------------------------------------------------------
	// Gradient kernel.

	template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
	static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
	{
	// Temporary space for coalesced atomics.
	CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

	// Calculate pixel position.
	int px = blockIdx.x * blockDim.x + threadIdx.x;
	int py = blockIdx.y * blockDim.y + threadIdx.y;
	int pz = blockIdx.z;
	int tz = (p.texDepth == 1) ? 0 : pz;
	if (px >= p.imgWidth \|\| py >= p.imgHeight \|\| pz >= p.n)
	return;

	// Pixel index.
	int pidx = px + p.imgWidth * (py + p.imgHeight * pz);

	// Early exit if output gradients are zero.
	const float* pDy = p.dy + pidx * p.channels;
	unsigned int dmax = 0u;
	if ((p.channels & 3) == 0)
	{
	for (int i=0; i < p.channels; i += 4)
	{
	uint4 dy = ((const uint4)&pDy[i]);
	dmax \|= (dy.x \| dy.y \| dy.z \| dy.w);
	}
	}
	else
	{
	for (int i=0; i < p.channels; i++)
	dmax \|= __float_as_uint(pDy[i]);
	}

	// Store zeros and exit.
	if (__uint_as_float(dmax) == 0.f)
	{
	if (CUBE_MODE)
	{
	if (FILTER_MODE != TEX_MODE_NEAREST)
	((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
	if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
	{
	if (p.gradUVDA)
	{
	((float2)p.gradUVDA)[3 pidx + 0] = make_float2(0.f, 0.f);
	((float2)p.gradUVDA)[3 pidx + 1] = make_float2(0.f, 0.f);
	((float2)p.gradUVDA)[3 pidx + 2] = make_float2(0.f, 0.f);
	}
	if (p.gradMipLevelBias)
	p.gradMipLevelBias[pidx] = 0.f;
	}
	}
	else
	{
	if (FILTER_MODE != TEX_MODE_NEAREST)
	((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
	if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
	{
	if (p.gradUVDA)
	((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
	if (p.gradMipLevelBias)
	p.gradMipLevelBias[pidx] = 0.f;
	}
	}
	return;
	}

	// Get UV.
	float3 uv;
	if (CUBE_MODE)
	uv = ((const float3*)p.uv)[pidx];
	else
	uv = make_float3(((const float2*)p.uv)[pidx], 0.f);

	// Nearest mode - texture gradients only.
	if (FILTER_MODE == TEX_MODE_NEAREST)
	{
	int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
	if (tc < 0)
	return; // Outside texture.

	tc *= p.channels;
	float* pOut = p.gradTex[0];

	// Accumulate texture gradients.
	for (int i=0; i < p.channels; i++)
	caAtomicAddTexture(pOut, 0, tc + i, pDy[i]);

	return; // Exit.
	}

	// Calculate mip level. In 'linear' mode these will all stay zero.
	float4 dw = make_float4(0.f, 0.f, 0.f, 0.f);
	float3 dfdv = make_float3(0.f, 0.f, 0.f);
	float flevel = 0.f; // Fractional level.
	int level0 = 0; // Discrete level 0.
	int level1 = 0; // Discrete level 1.
	calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);

	// UV gradient accumulators.
	float gu = 0.f;
	float gv = 0.f;

	// Get texel indices and pointers for level 0.
	int4 tc0 = make_int4(0, 0, 0, 0);
	float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
	const float* pIn0 = p.tex[level0];
	float* pOut0 = p.gradTex[level0];
	bool corner0 = CUBE_MODE && ((tc0.x \| tc0.y \| tc0.z \| tc0.w) < 0);
	tc0 *= p.channels;

	// Texel weights.
	float uv011 = uv0.x * uv0.y;
	float uv010 = uv0.x - uv011;
	float uv001 = uv0.y - uv011;
	float uv000 = 1.f - uv0.x - uv001;
	float4 tw0 = make_float4(uv000, uv010, uv001, uv011);

	// Attribute weights.
	int2 sz0 = mipLevelSize(p, level0);
	float sclu0 = (float)sz0.x;
	float sclv0 = (float)sz0.y;

	// Bilinear mode - texture and uv gradients.
	if (FILTER_MODE == TEX_MODE_LINEAR \|\| FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
	{
	for (int i=0; i < p.channels; i++, tc0 += 1)
	{
	float dy = pDy[i];
	accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP);

	float a00, a10, a01, a11;
	fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
	float ad = (a11 + a00 - a10 - a01);
	gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0;
	gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0;
	}

	// Store UV gradients and exit.
	if (CUBE_MODE)
	((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv);
	else
	((float2*)p.gradUV)[pidx] = make_float2(gu, gv);

	return;
	}

	// Accumulate fractional mip level gradient.
	float df = 0; // dL/df.

	// Get texel indices and pointers for level 1.
	int4 tc1 = make_int4(0, 0, 0, 0);
	float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
	const float* pIn1 = p.tex[level1];
	float* pOut1 = p.gradTex[level1];
	bool corner1 = CUBE_MODE && ((tc1.x \| tc1.y \| tc1.z \| tc1.w) < 0);
	tc1 *= p.channels;

	// Texel weights.
	float uv111 = uv1.x * uv1.y;
	float uv110 = uv1.x - uv111;
	float uv101 = uv1.y - uv111;
	float uv100 = 1.f - uv1.x - uv101;
	float4 tw1 = make_float4(uv100, uv110, uv101, uv111);

	// Attribute weights.
	int2 sz1 = mipLevelSize(p, level1);
	float sclu1 = (float)sz1.x;
	float sclv1 = (float)sz1.y;

	// Trilinear mode.
	for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1)
	{
	float dy = pDy[i];
	float dy0 = (1.f - flevel) * dy;
	accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP);

	// UV gradients for first level.
	float a00, a10, a01, a11;
	fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0);
	float ad = (a11 + a00 - a10 - a01);
	gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0;
	gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0;

	// Second level unless in magnification mode.
	if (flevel > 0.f)
	{
	// Texture gradients for second level.
	float dy1 = flevel * dy;
	accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP);

	// UV gradients for second level.
	float b00, b10, b01, b11;
	fetchQuad<float>(b00, b10, b01, b11, pIn1, tc1, corner1);
	float bd = (b11 + b00 - b10 - b01);
	gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1;
	gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1;

	// Mip level gradient.
	float a = bilerp(a00, a10, a01, a11, uv0);
	float b = bilerp(b00, b10, b01, b11, uv1);
	df += (b-a) * dy;
	}
	}

	// Store UV gradients.
	if (CUBE_MODE)
	((float3)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv df);
	else
	((float2*)p.gradUV)[pidx] = make_float2(gu, gv);

	// Store mip level bias gradient.
	if (p.gradMipLevelBias)
	p.gradMipLevelBias[pidx] = df;

	// Store UV pixel differential gradients.
	if (!BIAS_ONLY)
	{
	// Final gradients.
	dw = df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) dL/df.

	// Store them.
	if (CUBE_MODE)
	{
	// Remap from dL/(d{s,t}/s{X,Y}) to dL/(d{x,y,z}/d{X,Y}).
	float3 g0, g1;
	indexCubeMapGrad4(uv, dw, g0, g1);
	((float2)p.gradUVDA)[3 pidx + 0] = make_float2(g0.x, g1.x);
	((float2)p.gradUVDA)[3 pidx + 1] = make_float2(g0.y, g1.y);
	((float2)p.gradUVDA)[3 pidx + 2] = make_float2(g0.z, g1.z);
	}
	else
	((float4*)p.gradUVDA)[pidx] = dw;
	}
	}

	// Template specializations.
	__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_NEAREST>(p); }
	__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR>(p); }
	__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
	__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
	__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }

	//------------------------------------------------------------------------