|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "common.h" |
|
#include "texture.h" |
|
|
|
|
|
|
|
|
|
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; } |
|
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; } |
|
static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; } |
|
static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; } |
|
static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; } |
|
static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; } |
|
static __device__ __forceinline__ bool isfinite_vec3(const float3& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z); } |
|
static __device__ __forceinline__ bool isfinite_vec4(const float4& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z) && isfinite(a.w); } |
|
template<class T> static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); } |
|
template<class T> static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); } |
|
|
|
|
|
|
|
|
|
|
|
|
|
static __constant__ uint32_t c_cubeWrapMask1[48] = |
|
{ |
|
0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01, |
|
0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01, |
|
0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219, |
|
0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219, |
|
0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11, |
|
0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11, |
|
}; |
|
|
|
static __constant__ uint8_t c_cubeWrapMask2[48] = |
|
{ |
|
0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40, |
|
0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64, |
|
0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44, |
|
}; |
|
|
|
static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w) |
|
{ |
|
|
|
int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1; |
|
int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3; |
|
int c = cx + cy; |
|
if (c >= 5) |
|
c--; |
|
c = (face << 3) + c; |
|
|
|
|
|
unsigned int m = c_cubeWrapMask1[c]; |
|
int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0; |
|
int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0; |
|
int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1; |
|
int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1; |
|
int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0; |
|
int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0; |
|
int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1; |
|
int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1; |
|
int f0 = ((m >> 16) & 15) - 1; |
|
int f1 = ((m >> 20) & 15) - 1; |
|
int f2 = ((m >> 24) & 15) - 1; |
|
int f3 = ((m >> 28) ) - 1; |
|
|
|
|
|
unsigned int f = c_cubeWrapMask2[c]; |
|
int w1 = w - 1; |
|
if (f & 0x01) x0 = w1 - x0; |
|
if (f & 0x02) x1 = w1 - x1; |
|
if (f & 0x04) x2 = w1 - x2; |
|
if (f & 0x08) x3 = w1 - x3; |
|
if (f & 0x10) y0 = w1 - y0; |
|
if (f & 0x20) y1 = w1 - y1; |
|
if (f & 0x40) y2 = w1 - y2; |
|
if (f & 0x80) y3 = w1 - y3; |
|
|
|
|
|
int4 tcOut; |
|
tcOut.x = x0 + (y0 + f0 * w) * w; |
|
tcOut.y = x1 + (y1 + f1 * w) * w; |
|
tcOut.z = x2 + (y2 + f2 * w) * w; |
|
tcOut.w = x3 + (y3 + f3 * w) * w; |
|
return tcOut; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z) |
|
{ |
|
float ax = fabsf(x); |
|
float ay = fabsf(y); |
|
float az = fabsf(z); |
|
int idx; |
|
float c; |
|
if (az > fmaxf(ax, ay)) { idx = 4; c = z; } |
|
else if (ay > ax) { idx = 2; c = y; y = z; } |
|
else { idx = 0; c = x; x = z; } |
|
if (c < 0.f) idx += 1; |
|
float m = __frcp_rz(fabsf(c)) * .5; |
|
float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31)); |
|
float m1 = (idx != 2) ? -m : m; |
|
x = x * m0 + .5; |
|
y = y * m1 + .5; |
|
if (!isfinite(x) || !isfinite(y)) |
|
return -1; |
|
x = fminf(fmaxf(x, 0.f), 1.f); |
|
y = fminf(fmaxf(y, 0.f), 1.f); |
|
return idx; |
|
} |
|
|
|
|
|
static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv) |
|
{ |
|
float ax = fabsf(uv.x); |
|
float ay = fabsf(uv.y); |
|
float az = fabsf(uv.z); |
|
int idx; |
|
float c; |
|
float c0 = gu; |
|
float c1 = gv; |
|
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; } |
|
else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; } |
|
else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; } |
|
if (c < 0.f) idx += idx; |
|
float m = __frcp_rz(fabsf(c)); |
|
c0 = (idx & 0x34) ? -c0 : c0; |
|
c1 = (idx & 0x2e) ? -c1 : c1; |
|
float gl = (c0 + c1) * m; |
|
float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu; |
|
float gy = (idx & 0x0c) ? gl : -gv; |
|
float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv; |
|
gz = (idx & 0x09) ? -gz : gz; |
|
float3 res = make_float3(gx, gy, gz) * (m * .5f); |
|
if (!isfinite_vec3(res)) |
|
return make_float3(0.f, 0.f, 0.f); |
|
return res; |
|
} |
|
|
|
|
|
|
|
static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1) |
|
{ |
|
float ax = fabsf(uv.x); |
|
float ay = fabsf(uv.y); |
|
float az = fabsf(uv.z); |
|
int idx; |
|
float c, c0, c1; |
|
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; } |
|
else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; } |
|
else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; } |
|
if (c < 0.f) idx += idx; |
|
float m = __frcp_rz(fabsf(c)); |
|
c0 = (idx & 0x34) ? -c0 : c0; |
|
c1 = (idx & 0x2e) ? -c1 : c1; |
|
float gl0 = (dw.x * c0 + dw.z * c1) * m; |
|
float gl1 = (dw.y * c0 + dw.w * c1) * m; |
|
float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x; |
|
float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y; |
|
float gy0 = (idx & 0x0c) ? gl0 : -dw.z; |
|
float gy1 = (idx & 0x0c) ? gl1 : -dw.w; |
|
float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z; |
|
float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w; |
|
if (idx & 0x09) |
|
{ |
|
gz0 = -gz0; |
|
gz1 = -gz1; |
|
} |
|
g0 = make_float3(gx0, gy0, gz0) * (m * .5f); |
|
g1 = make_float3(gx1, gy1, gz1) * (m * .5f); |
|
if (!isfinite_vec3(g0) || !isfinite_vec3(g1)) |
|
{ |
|
g0 = make_float3(0.f, 0.f, 0.f); |
|
g1 = make_float3(0.f, 0.f, 0.f); |
|
} |
|
} |
|
|
|
|
|
|
|
static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY) |
|
{ |
|
float ax = fabsf(uv.x); |
|
float ay = fabsf(uv.y); |
|
float az = fabsf(uv.z); |
|
int idx; |
|
float c, gu, gv; |
|
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } |
|
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } |
|
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } |
|
if (c < 0.f) idx += idx; |
|
if (idx & 0x09) |
|
{ |
|
dvdX.z = -dvdX.z; |
|
dvdY.z = -dvdY.z; |
|
} |
|
float m = __frcp_rz(fabsf(c)); |
|
float dm = m * .5f; |
|
float mm = m * dm; |
|
gu *= (idx & 0x34) ? -mm : mm; |
|
gv *= (idx & 0x2e) ? -mm : mm; |
|
|
|
float4 res; |
|
if (idx & 0x03) |
|
{ |
|
res = make_float4(gu * dvdX.x + dm * dvdX.z, |
|
gu * dvdY.x + dm * dvdY.z, |
|
gv * dvdX.x - dm * dvdX.y, |
|
gv * dvdY.x - dm * dvdY.y); |
|
} |
|
else if (idx & 0x0c) |
|
{ |
|
res = make_float4(gu * dvdX.y + dm * dvdX.x, |
|
gu * dvdY.y + dm * dvdY.x, |
|
gv * dvdX.y + dm * dvdX.z, |
|
gv * dvdY.y + dm * dvdY.z); |
|
} |
|
else |
|
{ |
|
res = make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x, |
|
gu * dvdY.z + copysignf(dm, c) * dvdY.x, |
|
gv * dvdX.z - dm * dvdX.y, |
|
gv * dvdY.z - dm * dvdY.y); |
|
} |
|
|
|
if (!isfinite_vec4(res)) |
|
return make_float4(0.f, 0.f, 0.f, 0.f); |
|
|
|
return res; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz) |
|
{ |
|
float ax = fabsf(uv.x); |
|
float ay = fabsf(uv.y); |
|
float az = fabsf(uv.z); |
|
int idx; |
|
float c, gu, gv; |
|
if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } |
|
else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } |
|
else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } |
|
if (c < 0.f) idx += idx; |
|
|
|
if (idx & 0x09) |
|
{ |
|
dvdX.z = -dvdX.z; |
|
dvdY.z = -dvdY.z; |
|
} |
|
|
|
float m = __frcp_rz(c); |
|
float dm = -m * fabsf(m) * .5; |
|
float mm = m * m * .5; |
|
float mu = (idx & 0x34) ? -mm : mm; |
|
float mv = (idx & 0x2e) ? -mm : mm; |
|
gu *= -2.0 * m * mu; |
|
gv *= -2.0 * m * mv; |
|
|
|
if (idx & 0x03) |
|
{ |
|
dx.x = gu * dvdX.x + dm * dvdX.z; |
|
dx.y = gu * dvdY.x + dm * dvdY.z; |
|
dx.z = gv * dvdX.x - dm * dvdX.y; |
|
dx.w = gv * dvdY.x - dm * dvdY.y; |
|
dy.x = 0.f; |
|
dy.y = 0.f; |
|
dy.z = mv * dvdX.x; |
|
dy.w = mv * dvdY.x; |
|
dz.x = mu * dvdX.x; |
|
dz.y = mu * dvdY.x; |
|
dz.z = 0.f; |
|
dz.w = 0.f; |
|
} |
|
else if (idx & 0x0c) |
|
{ |
|
dx.x = mu * dvdX.y; |
|
dx.y = mu * dvdY.y; |
|
dx.z = 0.f; |
|
dx.w = 0.f; |
|
dy.x = gu * dvdX.y + dm * dvdX.x; |
|
dy.y = gu * dvdY.y + dm * dvdY.x; |
|
dy.z = gv * dvdX.y + dm * dvdX.z; |
|
dy.w = gv * dvdY.y + dm * dvdY.z; |
|
dz.x = 0.f; |
|
dz.y = 0.f; |
|
dz.z = mv * dvdX.y; |
|
dz.w = mv * dvdY.y; |
|
} |
|
else |
|
{ |
|
dx.x = mu * dvdX.z; |
|
dx.y = mu * dvdY.z; |
|
dx.z = 0.f; |
|
dx.w = 0.f; |
|
dy.x = 0.f; |
|
dy.y = 0.f; |
|
dy.z = mv * dvdX.z; |
|
dy.w = mv * dvdY.z; |
|
dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x; |
|
dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x; |
|
dz.z = gv * dvdX.z - dm * dvdX.y; |
|
dz.w = gv * dvdY.z - dm * dvdY.y; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
template <bool CUBE_MODE> |
|
static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz) |
|
{ |
|
int w = p.texWidth; |
|
int h = p.texHeight; |
|
float u = uv.x; |
|
float v = uv.y; |
|
|
|
|
|
if (CUBE_MODE) |
|
{ |
|
|
|
int idx = indexCubeMap(u, v, uv.z); |
|
if (idx < 0) |
|
return -1; |
|
tz = 6 * tz + idx; |
|
} |
|
else |
|
{ |
|
|
|
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) |
|
{ |
|
u = u - (float)__float2int_rd(u); |
|
v = v - (float)__float2int_rd(v); |
|
} |
|
} |
|
|
|
u = u * (float)w; |
|
v = v * (float)h; |
|
|
|
int iu = __float2int_rd(u); |
|
int iv = __float2int_rd(v); |
|
|
|
|
|
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) |
|
{ |
|
if (iu < 0 || iu >= w || iv < 0 || iv >= h) |
|
return -1; |
|
} |
|
|
|
|
|
iu = min(max(iu, 0), w-1); |
|
iv = min(max(iv, 0), h-1); |
|
return iu + w * (iv + tz * h); |
|
} |
|
|
|
template <bool CUBE_MODE> |
|
static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level) |
|
{ |
|
|
|
int2 sz = mipLevelSize(p, level); |
|
int w = sz.x; |
|
int h = sz.y; |
|
|
|
|
|
float u = uv.x; |
|
float v = uv.y; |
|
bool clampU = false; |
|
bool clampV = false; |
|
|
|
|
|
int face = 0; |
|
if (CUBE_MODE) |
|
{ |
|
|
|
face = indexCubeMap(u, v, uv.z); |
|
if (face < 0) |
|
{ |
|
tcOut.x = tcOut.y = tcOut.z = tcOut.w = -1; |
|
return make_float2(0.f, 0.f); |
|
} |
|
u = u * (float)w - 0.5f; |
|
v = v * (float)h - 0.5f; |
|
} |
|
else |
|
{ |
|
if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) |
|
{ |
|
|
|
u = u - (float)__float2int_rd(u); |
|
v = v - (float)__float2int_rd(v); |
|
} |
|
|
|
|
|
u = u * (float)w - 0.5f; |
|
v = v * (float)h - 0.5f; |
|
|
|
if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP) |
|
{ |
|
|
|
u = fminf(fmaxf(u, 0.f), w - 1.f); |
|
v = fminf(fmaxf(v, 0.f), h - 1.f); |
|
clampU = (u == 0.f || u == w - 1.f); |
|
clampV = (v == 0.f || v == h - 1.f); |
|
} |
|
} |
|
|
|
|
|
int iu0 = __float2int_rd(u); |
|
int iv0 = __float2int_rd(v); |
|
int iu1 = iu0 + (clampU ? 0 : 1); |
|
int iv1 = iv0 + (clampV ? 0 : 1); |
|
u -= (float)iu0; |
|
v -= (float)iv0; |
|
|
|
|
|
bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h); |
|
if (cubeWrap) |
|
{ |
|
tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w); |
|
tcOut += 6 * tz * w * h; |
|
return make_float2(u, v); |
|
} |
|
|
|
|
|
if (CUBE_MODE) |
|
tz = 6 * tz + face; |
|
|
|
|
|
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) |
|
{ |
|
if (iu0 < 0) iu0 += w; |
|
if (iv0 < 0) iv0 += h; |
|
if (iu1 >= w) iu1 -= w; |
|
if (iv1 >= h) iv1 -= h; |
|
} |
|
|
|
|
|
int iu0z = iu0 + tz * w * h; |
|
int iu1z = iu1 + tz * w * h; |
|
tcOut.x = iu0z + w * iv0; |
|
tcOut.y = iu1z + w * iv0; |
|
tcOut.z = iu0z + w * iv1; |
|
tcOut.w = iu1z + w * iv1; |
|
|
|
|
|
if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) |
|
{ |
|
bool iu0_out = (iu0 < 0 || iu0 >= w); |
|
bool iu1_out = (iu1 < 0 || iu1 >= w); |
|
bool iv0_out = (iv0 < 0 || iv0 >= h); |
|
bool iv1_out = (iv1 < 0 || iv1 >= h); |
|
if (iu0_out || iv0_out) tcOut.x = -1; |
|
if (iu1_out || iv0_out) tcOut.y = -1; |
|
if (iu0_out || iv1_out) tcOut.z = -1; |
|
if (iu1_out || iv1_out) tcOut.w = -1; |
|
} |
|
|
|
|
|
return make_float2(u, v); |
|
} |
|
|
|
|
|
|
|
|
|
template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE> |
|
static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv) |
|
{ |
|
|
|
if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR) |
|
return; |
|
|
|
|
|
if (!BIAS_ONLY) |
|
{ |
|
|
|
float4 uvDA; |
|
float3 dvdX, dvdY; |
|
if (CUBE_MODE) |
|
{ |
|
|
|
float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0]; |
|
float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1]; |
|
float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2]; |
|
|
|
|
|
dvdX = make_float3(d0.x, d1.x, d2.x); |
|
dvdY = make_float3(d0.y, d1.y, d2.y); |
|
uvDA = indexCubeMapGradST(uv, dvdX, dvdY); |
|
} |
|
else |
|
{ |
|
|
|
uvDA = ((const float4*)p.uvDA)[pidx]; |
|
} |
|
|
|
|
|
float uscl = p.texWidth; |
|
float vscl = p.texHeight; |
|
|
|
|
|
float dsdx = uvDA.x * uscl; |
|
float dsdy = uvDA.y * uscl; |
|
float dtdx = uvDA.z * vscl; |
|
float dtdy = uvDA.w * vscl; |
|
|
|
|
|
float A = dsdx*dsdx + dtdx*dtdx; |
|
float B = dsdy*dsdy + dtdy*dtdy; |
|
float C = dsdx*dsdy + dtdx*dtdy; |
|
float l2b = 0.5 * (A + B); |
|
float l2n = 0.25 * (A-B)*(A-B) + C*C; |
|
float l2a = sqrt(l2n); |
|
float lenMinorSqr = fmaxf(0.0, l2b - l2a); |
|
float lenMajorSqr = l2b + l2a; |
|
|
|
|
|
if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) |
|
{ |
|
float dw = 0.72134752f / (l2n + l2a * l2b); |
|
float AB = dw * .5f * (A - B); |
|
float Cw = dw * C; |
|
float l2aw = dw * l2a; |
|
float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw); |
|
float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw); |
|
float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw); |
|
float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw); |
|
|
|
float4 d_f_dw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY); |
|
if (!CUBE_MODE) |
|
*pdw = isfinite_vec4(d_f_dw) ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f); |
|
|
|
|
|
|
|
if (CUBE_MODE) |
|
{ |
|
float4 dx, dy, dz; |
|
indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz); |
|
float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x); |
|
float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y); |
|
float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z); |
|
float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w); |
|
|
|
float3 d_f_dv = make_float3(0.f, 0.f, 0.f); |
|
d_f_dv += d_dsdX_dv * d_f_ddsdX; |
|
d_f_dv += d_dsdY_dv * d_f_ddsdY; |
|
d_f_dv += d_dtdX_dv * d_f_ddtdX; |
|
d_f_dv += d_dtdY_dv * d_f_ddtdY; |
|
|
|
bool finite = isfinite_vec4(d_f_dw) && isfinite_vec3(d_f_dv); |
|
*pdw = finite ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f); |
|
*pdfdv = finite ? d_f_dv : make_float3(0.f, 0.f, 0.f); |
|
} |
|
} |
|
|
|
|
|
flevel = .5f * __log2f(lenMajorSqr); |
|
} |
|
|
|
|
|
if (p.mipLevelBias) |
|
flevel += p.mipLevelBias[pidx]; |
|
flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax); |
|
|
|
|
|
level0 = __float2int_rd(flevel); |
|
|
|
|
|
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f) |
|
{ |
|
level1 = min(level0 + 1, p.mipLevelMax); |
|
flevel -= level0; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
template<class T> |
|
static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner) |
|
{ |
|
|
|
if (corner) |
|
{ |
|
T avg = zero_value<T>(); |
|
if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x])); |
|
if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y])); |
|
if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z])); |
|
if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w])); |
|
avg *= 0.33333333f; |
|
if (tc.x < 0) a00 = avg; |
|
if (tc.y < 0) a10 = avg; |
|
if (tc.z < 0) a01 = avg; |
|
if (tc.w < 0) a11 = avg; |
|
} |
|
else |
|
{ |
|
a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value<T>(); |
|
a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value<T>(); |
|
a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value<T>(); |
|
a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value<T>(); |
|
} |
|
} |
|
|
|
static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM) |
|
{ |
|
|
|
if (corner) |
|
{ |
|
float cb; |
|
if (tc.x < 0) cb = c.x; |
|
if (tc.y < 0) cb = c.y; |
|
if (tc.z < 0) cb = c.z; |
|
if (tc.w < 0) cb = c.w; |
|
cb *= 0.33333333f; |
|
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb); |
|
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb); |
|
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb); |
|
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb); |
|
} |
|
else |
|
{ |
|
if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x); |
|
if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y); |
|
if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z); |
|
if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
template<class T, int C> |
|
static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p) |
|
{ |
|
|
|
int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1); |
|
int2 sz_out = mipLevelSize(p, p.mipLevelOut); |
|
|
|
|
|
int px = blockIdx.x * blockDim.x + threadIdx.x; |
|
int py = blockIdx.y * blockDim.y + threadIdx.y; |
|
int pz = blockIdx.z; |
|
if (px >= sz_out.x || py >= sz_out.y) |
|
return; |
|
|
|
|
|
int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y)); |
|
int pidx_in1 = pidx_in0 + p.channels * sz_in.x; |
|
int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz)); |
|
|
|
|
|
const float* pin = p.tex[p.mipLevelOut - 1]; |
|
float* pout = (float*)p.tex[p.mipLevelOut]; |
|
|
|
|
|
if (sz_in.x == 1 || sz_in.y == 1) |
|
{ |
|
if (sz_in.y == 1) |
|
pidx_in1 = pidx_in0 + p.channels; |
|
|
|
for (int i=0; i < p.channels; i += C) |
|
{ |
|
T v0 = *((const T*)&pin[pidx_in0 + i]); |
|
T v1 = *((const T*)&pin[pidx_in1 + i]); |
|
T avg = .5f * (v0 + v1); |
|
#if TEX_DEBUG_MIP_RETAIN_VARIANCE |
|
avg = (avg - .5f) * 1.41421356f + .5f; |
|
#endif |
|
*((T*)&pout[pidx_out + i]) = avg; |
|
} |
|
|
|
return; |
|
} |
|
|
|
for (int i=0; i < p.channels; i += C) |
|
{ |
|
T v0 = *((const T*)&pin[pidx_in0 + i]); |
|
T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]); |
|
T v2 = *((const T*)&pin[pidx_in1 + i]); |
|
T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]); |
|
T avg = .25f * (v0 + v1 + v2 + v3); |
|
#if TEX_DEBUG_MIP_RETAIN_VARIANCE |
|
avg = (avg - .5f) * 2.f + .5f; |
|
#endif |
|
*((T*)&pout[pidx_out + i]) = avg; |
|
} |
|
} |
|
|
|
|
|
__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate<float, 1>(p); } |
|
__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate<float2, 2>(p); } |
|
__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate<float4, 4>(p); } |
|
|
|
|
|
|
|
|
|
template <class T, int C, bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE> |
|
static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p) |
|
{ |
|
|
|
int px = blockIdx.x * blockDim.x + threadIdx.x; |
|
int py = blockIdx.y * blockDim.y + threadIdx.y; |
|
int pz = blockIdx.z; |
|
int tz = (p.texDepth == 1) ? 0 : pz; |
|
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) |
|
return; |
|
|
|
|
|
int pidx = px + p.imgWidth * (py + p.imgHeight * pz); |
|
|
|
|
|
float* pOut = p.out + pidx * p.channels; |
|
|
|
|
|
float3 uv; |
|
if (CUBE_MODE) |
|
uv = ((const float3*)p.uv)[pidx]; |
|
else |
|
uv = make_float3(((const float2*)p.uv)[pidx], 0.f); |
|
|
|
|
|
if (FILTER_MODE == TEX_MODE_NEAREST) |
|
{ |
|
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz); |
|
tc *= p.channels; |
|
const float* pIn = p.tex[0]; |
|
|
|
|
|
for (int i=0; i < p.channels; i += C) |
|
*((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value<T>(); |
|
|
|
return; |
|
} |
|
|
|
|
|
float flevel = 0.f; |
|
int level0 = 0; |
|
int level1 = 0; |
|
calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0); |
|
|
|
|
|
int4 tc0 = make_int4(0, 0, 0, 0); |
|
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0); |
|
const float* pIn0 = p.tex[level0]; |
|
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); |
|
tc0 *= p.channels; |
|
|
|
|
|
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) |
|
{ |
|
|
|
for (int i=0; i < p.channels; i += C, tc0 += C) |
|
{ |
|
T a00, a10, a01, a11; |
|
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0); |
|
*((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0); |
|
} |
|
return; |
|
} |
|
|
|
|
|
int4 tc1 = make_int4(0, 0, 0, 0); |
|
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1); |
|
const float* pIn1 = p.tex[level1]; |
|
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); |
|
tc1 *= p.channels; |
|
|
|
|
|
for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C) |
|
{ |
|
|
|
T a00, a10, a01, a11; |
|
fetchQuad<T>(a00, a10, a01, a11, pIn0, tc0, corner0); |
|
T a = bilerp(a00, a10, a01, a11, uv0); |
|
|
|
|
|
if (flevel > 0.f) |
|
{ |
|
T b00, b10, b01, b11; |
|
fetchQuad<T>(b00, b10, b01, b11, pIn1, tc1, corner1); |
|
T b = bilerp(b00, b10, b01, b11, uv1); |
|
a = lerp(a, b, flevel); |
|
} |
|
|
|
|
|
*((T*)&pOut[i]) = a; |
|
} |
|
} |
|
|
|
|
|
__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate<float, 1, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
|
|
|
|
|
|
|
|
template<class T, int C> |
|
static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p) |
|
{ |
|
|
|
int px = blockIdx.x * blockDim.x + threadIdx.x; |
|
int py = blockIdx.y * blockDim.y + threadIdx.y; |
|
int pz = blockIdx.z; |
|
if (px >= p.texWidth || py >= p.texHeight) |
|
return; |
|
|
|
|
|
int c = p.channels; |
|
if (C == 2) c >>= 1; |
|
if (C == 4) c >>= 2; |
|
|
|
|
|
extern __shared__ float s_texelAccum[]; |
|
int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x; |
|
int sharedStride = blockDim.x * blockDim.y; |
|
# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride)) |
|
|
|
|
|
for (int i=0; i < p.channels; i++) |
|
*TEXEL_ACCUM(i) = 0.f; |
|
|
|
|
|
int x = px; |
|
int y = py; |
|
float w = 1.f; |
|
|
|
|
|
int2 sz = mipLevelSize(p, 0); |
|
for (int level=1; level <= p.mipLevelMax; level++) |
|
{ |
|
|
|
if (sz.x > 1) w *= .5f; |
|
if (sz.y > 1) w *= .5f; |
|
|
|
|
|
sz = mipLevelSize(p, level); |
|
x >>= 1; |
|
y >>= 1; |
|
|
|
T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels); |
|
for (int i=0; i < c; i++) |
|
accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w); |
|
} |
|
|
|
|
|
T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels); |
|
for (int i=0; i < c; i++) |
|
accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride); |
|
} |
|
|
|
|
|
__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate<float, 1>(p); } |
|
__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate<float2, 2>(p); } |
|
__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate<float4, 4>(p); } |
|
|
|
|
|
|
|
|
|
template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE> |
|
static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p) |
|
{ |
|
|
|
CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT); |
|
|
|
|
|
int px = blockIdx.x * blockDim.x + threadIdx.x; |
|
int py = blockIdx.y * blockDim.y + threadIdx.y; |
|
int pz = blockIdx.z; |
|
int tz = (p.texDepth == 1) ? 0 : pz; |
|
if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) |
|
return; |
|
|
|
|
|
int pidx = px + p.imgWidth * (py + p.imgHeight * pz); |
|
|
|
|
|
const float* pDy = p.dy + pidx * p.channels; |
|
unsigned int dmax = 0u; |
|
if ((p.channels & 3) == 0) |
|
{ |
|
for (int i=0; i < p.channels; i += 4) |
|
{ |
|
uint4 dy = *((const uint4*)&pDy[i]); |
|
dmax |= (dy.x | dy.y | dy.z | dy.w); |
|
} |
|
} |
|
else |
|
{ |
|
for (int i=0; i < p.channels; i++) |
|
dmax |= __float_as_uint(pDy[i]); |
|
} |
|
|
|
|
|
if (__uint_as_float(dmax) == 0.f) |
|
{ |
|
if (CUBE_MODE) |
|
{ |
|
if (FILTER_MODE != TEX_MODE_NEAREST) |
|
((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f); |
|
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) |
|
{ |
|
if (p.gradUVDA) |
|
{ |
|
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f); |
|
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f); |
|
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f); |
|
} |
|
if (p.gradMipLevelBias) |
|
p.gradMipLevelBias[pidx] = 0.f; |
|
} |
|
} |
|
else |
|
{ |
|
if (FILTER_MODE != TEX_MODE_NEAREST) |
|
((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f); |
|
if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) |
|
{ |
|
if (p.gradUVDA) |
|
((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); |
|
if (p.gradMipLevelBias) |
|
p.gradMipLevelBias[pidx] = 0.f; |
|
} |
|
} |
|
return; |
|
} |
|
|
|
|
|
float3 uv; |
|
if (CUBE_MODE) |
|
uv = ((const float3*)p.uv)[pidx]; |
|
else |
|
uv = make_float3(((const float2*)p.uv)[pidx], 0.f); |
|
|
|
|
|
if (FILTER_MODE == TEX_MODE_NEAREST) |
|
{ |
|
int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz); |
|
if (tc < 0) |
|
return; |
|
|
|
tc *= p.channels; |
|
float* pOut = p.gradTex[0]; |
|
|
|
|
|
for (int i=0; i < p.channels; i++) |
|
caAtomicAddTexture(pOut, 0, tc + i, pDy[i]); |
|
|
|
return; |
|
} |
|
|
|
|
|
float4 dw = make_float4(0.f, 0.f, 0.f, 0.f); |
|
float3 dfdv = make_float3(0.f, 0.f, 0.f); |
|
float flevel = 0.f; |
|
int level0 = 0; |
|
int level1 = 0; |
|
calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv); |
|
|
|
|
|
float gu = 0.f; |
|
float gv = 0.f; |
|
|
|
|
|
int4 tc0 = make_int4(0, 0, 0, 0); |
|
float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0); |
|
const float* pIn0 = p.tex[level0]; |
|
float* pOut0 = p.gradTex[level0]; |
|
bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); |
|
tc0 *= p.channels; |
|
|
|
|
|
float uv011 = uv0.x * uv0.y; |
|
float uv010 = uv0.x - uv011; |
|
float uv001 = uv0.y - uv011; |
|
float uv000 = 1.f - uv0.x - uv001; |
|
float4 tw0 = make_float4(uv000, uv010, uv001, uv011); |
|
|
|
|
|
int2 sz0 = mipLevelSize(p, level0); |
|
float sclu0 = (float)sz0.x; |
|
float sclv0 = (float)sz0.y; |
|
|
|
|
|
if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) |
|
{ |
|
for (int i=0; i < p.channels; i++, tc0 += 1) |
|
{ |
|
float dy = pDy[i]; |
|
accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP); |
|
|
|
float a00, a10, a01, a11; |
|
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0); |
|
float ad = (a11 + a00 - a10 - a01); |
|
gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0; |
|
gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0; |
|
} |
|
|
|
|
|
if (CUBE_MODE) |
|
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv); |
|
else |
|
((float2*)p.gradUV)[pidx] = make_float2(gu, gv); |
|
|
|
return; |
|
} |
|
|
|
|
|
float df = 0; |
|
|
|
|
|
int4 tc1 = make_int4(0, 0, 0, 0); |
|
float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1); |
|
const float* pIn1 = p.tex[level1]; |
|
float* pOut1 = p.gradTex[level1]; |
|
bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); |
|
tc1 *= p.channels; |
|
|
|
|
|
float uv111 = uv1.x * uv1.y; |
|
float uv110 = uv1.x - uv111; |
|
float uv101 = uv1.y - uv111; |
|
float uv100 = 1.f - uv1.x - uv101; |
|
float4 tw1 = make_float4(uv100, uv110, uv101, uv111); |
|
|
|
|
|
int2 sz1 = mipLevelSize(p, level1); |
|
float sclu1 = (float)sz1.x; |
|
float sclv1 = (float)sz1.y; |
|
|
|
|
|
for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1) |
|
{ |
|
float dy = pDy[i]; |
|
float dy0 = (1.f - flevel) * dy; |
|
accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP); |
|
|
|
|
|
float a00, a10, a01, a11; |
|
fetchQuad<float>(a00, a10, a01, a11, pIn0, tc0, corner0); |
|
float ad = (a11 + a00 - a10 - a01); |
|
gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0; |
|
gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0; |
|
|
|
|
|
if (flevel > 0.f) |
|
{ |
|
|
|
float dy1 = flevel * dy; |
|
accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP); |
|
|
|
|
|
float b00, b10, b01, b11; |
|
fetchQuad<float>(b00, b10, b01, b11, pIn1, tc1, corner1); |
|
float bd = (b11 + b00 - b10 - b01); |
|
gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1; |
|
gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1; |
|
|
|
|
|
float a = bilerp(a00, a10, a01, a11, uv0); |
|
float b = bilerp(b00, b10, b01, b11, uv1); |
|
df += (b-a) * dy; |
|
} |
|
} |
|
|
|
|
|
if (CUBE_MODE) |
|
((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df); |
|
else |
|
((float2*)p.gradUV)[pidx] = make_float2(gu, gv); |
|
|
|
|
|
if (p.gradMipLevelBias) |
|
p.gradMipLevelBias[pidx] = df; |
|
|
|
|
|
if (!BIAS_ONLY) |
|
{ |
|
|
|
dw *= df; |
|
|
|
|
|
if (CUBE_MODE) |
|
{ |
|
|
|
float3 g0, g1; |
|
indexCubeMapGrad4(uv, dw, g0, g1); |
|
((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x); |
|
((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y); |
|
((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z); |
|
} |
|
else |
|
((float4*)p.gradUVDA)[pidx] = dw; |
|
} |
|
} |
|
|
|
|
|
__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_NEAREST>(p); } |
|
__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR>(p); } |
|
__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate<true, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<false, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); } |
|
__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate<true, true, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); } |
|
|
|
|
|
|