|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "common.h" |
|
#include "interpolate.h" |
|
|
|
|
|
|
|
|
|
template <bool ENABLE_DA> |
|
static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p) |
|
{ |
|
|
|
int px = blockIdx.x * blockDim.x + threadIdx.x; |
|
int py = blockIdx.y * blockDim.y + threadIdx.y; |
|
int pz = blockIdx.z; |
|
if (px >= p.width || py >= p.height || pz >= p.depth) |
|
return; |
|
|
|
|
|
int pidx = px + p.width * (py + p.height * pz); |
|
|
|
|
|
float* out = p.out + pidx * p.numAttr; |
|
float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0; |
|
|
|
|
|
float4 r = ((float4*)p.rast)[pidx]; |
|
int triIdx = float_to_triidx(r.w) - 1; |
|
bool triValid = (triIdx >= 0 && triIdx < p.numTriangles); |
|
|
|
|
|
|
|
if (__all_sync(0xffffffffu, !triValid)) |
|
{ |
|
for (int i=0; i < p.numAttr; i++) |
|
out[i] = 0.f; |
|
if (ENABLE_DA) |
|
for (int i=0; i < p.numDiffAttr; i++) |
|
outDA[i] = make_float2(0.f, 0.f); |
|
return; |
|
} |
|
|
|
|
|
int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0; |
|
int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0; |
|
int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0; |
|
|
|
|
|
if (vi0 < 0 || vi0 >= p.numVertices || |
|
vi1 < 0 || vi1 >= p.numVertices || |
|
vi2 < 0 || vi2 >= p.numVertices) |
|
return; |
|
|
|
|
|
if (p.instance_mode && !p.attrBC) |
|
{ |
|
vi0 += pz * p.numVertices; |
|
vi1 += pz * p.numVertices; |
|
vi2 += pz * p.numVertices; |
|
} |
|
|
|
|
|
const float* a0 = p.attr + vi0 * p.numAttr; |
|
const float* a1 = p.attr + vi1 * p.numAttr; |
|
const float* a2 = p.attr + vi2 * p.numAttr; |
|
|
|
|
|
float b0 = triValid ? r.x : 0.f; |
|
float b1 = triValid ? r.y : 0.f; |
|
float b2 = triValid ? (1.f - r.x - r.y) : 0.f; |
|
|
|
|
|
for (int i=0; i < p.numAttr; i++) |
|
out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i]; |
|
|
|
|
|
if (!ENABLE_DA) |
|
return; |
|
|
|
|
|
float4 db = make_float4(0.f, 0.f, 0.f, 0.f); |
|
if (triValid) |
|
db = ((float4*)p.rastDB)[pidx]; |
|
|
|
|
|
float dudx = db.x; |
|
float dudy = db.y; |
|
float dvdx = db.z; |
|
float dvdy = db.w; |
|
|
|
|
|
for (int i=0; i < p.numDiffAttr; i++) |
|
{ |
|
|
|
int j = p.diff_attrs_all ? i : p.diffAttrs[i]; |
|
if (j < 0) |
|
j += p.numAttr; |
|
|
|
|
|
float dsdx = 0.f; |
|
float dsdy = 0.f; |
|
if (j >= 0 && j < p.numAttr) |
|
{ |
|
float s0 = a0[j]; |
|
float s1 = a1[j]; |
|
float s2 = a2[j]; |
|
float dsdu = s0 - s2; |
|
float dsdv = s1 - s2; |
|
dsdx = dudx*dsdu + dvdx*dsdv; |
|
dsdy = dudy*dsdu + dvdy*dsdv; |
|
} |
|
|
|
|
|
outDA[i] = make_float2(dsdx, dsdy); |
|
} |
|
} |
|
|
|
|
|
__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); } |
|
__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); } |
|
|
|
|
|
|
|
|
|
template <bool ENABLE_DA> |
|
static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p) |
|
{ |
|
|
|
CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT); |
|
|
|
|
|
int px = blockIdx.x * blockDim.x + threadIdx.x; |
|
int py = blockIdx.y * blockDim.y + threadIdx.y; |
|
int pz = blockIdx.z; |
|
if (px >= p.width || py >= p.height || pz >= p.depth) |
|
return; |
|
|
|
|
|
int pidx = px + p.width * (py + p.height * pz); |
|
|
|
|
|
float4 r = ((float4*)p.rast)[pidx]; |
|
int triIdx = float_to_triidx(r.w) - 1; |
|
if (triIdx < 0 || triIdx >= p.numTriangles) |
|
{ |
|
((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); |
|
if (ENABLE_DA) |
|
((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); |
|
return; |
|
} |
|
|
|
|
|
int vi0 = p.tri[triIdx * 3 + 0]; |
|
int vi1 = p.tri[triIdx * 3 + 1]; |
|
int vi2 = p.tri[triIdx * 3 + 2]; |
|
|
|
|
|
if (vi0 < 0 || vi0 >= p.numVertices || |
|
vi1 < 0 || vi1 >= p.numVertices || |
|
vi2 < 0 || vi2 >= p.numVertices) |
|
return; |
|
|
|
|
|
if (p.instance_mode && !p.attrBC) |
|
{ |
|
vi0 += pz * p.numVertices; |
|
vi1 += pz * p.numVertices; |
|
vi2 += pz * p.numVertices; |
|
} |
|
|
|
|
|
CA_SET_GROUP(triIdx); |
|
|
|
|
|
const float* a0 = p.attr + vi0 * p.numAttr; |
|
const float* a1 = p.attr + vi1 * p.numAttr; |
|
const float* a2 = p.attr + vi2 * p.numAttr; |
|
const float* pdy = p.dy + pidx * p.numAttr; |
|
|
|
|
|
float* ga0 = p.gradAttr + vi0 * p.numAttr; |
|
float* ga1 = p.gradAttr + vi1 * p.numAttr; |
|
float* ga2 = p.gradAttr + vi2 * p.numAttr; |
|
|
|
|
|
float b0 = r.x; |
|
float b1 = r.y; |
|
float b2 = 1.f - r.x - r.y; |
|
float gb0 = 0.f; |
|
float gb1 = 0.f; |
|
|
|
|
|
for (int i=0; i < p.numAttr; i++) |
|
{ |
|
float y = pdy[i]; |
|
float s0 = a0[i]; |
|
float s1 = a1[i]; |
|
float s2 = a2[i]; |
|
gb0 += y * (s0 - s2); |
|
gb1 += y * (s1 - s2); |
|
caAtomicAdd(ga0 + i, b0 * y); |
|
caAtomicAdd(ga1 + i, b1 * y); |
|
caAtomicAdd(ga2 + i, b2 * y); |
|
} |
|
|
|
|
|
((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f); |
|
|
|
|
|
if (!ENABLE_DA) |
|
return; |
|
|
|
|
|
const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr; |
|
float gdudx = 0.f; |
|
float gdudy = 0.f; |
|
float gdvdx = 0.f; |
|
float gdvdy = 0.f; |
|
|
|
|
|
float4 db = ((float4*)p.rastDB)[pidx]; |
|
float dudx = db.x; |
|
float dudy = db.y; |
|
float dvdx = db.z; |
|
float dvdy = db.w; |
|
|
|
for (int i=0; i < p.numDiffAttr; i++) |
|
{ |
|
|
|
int j = p.diff_attrs_all ? i : p.diffAttrs[i]; |
|
if (j < 0) |
|
j += p.numAttr; |
|
|
|
|
|
if (j >= 0 && j < p.numAttr) |
|
{ |
|
float2 dsdxy = dda[i]; |
|
float dsdx = dsdxy.x; |
|
float dsdy = dsdxy.y; |
|
|
|
float s0 = a0[j]; |
|
float s1 = a1[j]; |
|
float s2 = a2[j]; |
|
|
|
|
|
float dsdu = s0 - s2; |
|
float dsdv = s1 - s2; |
|
gdudx += dsdu * dsdx; |
|
gdudy += dsdu * dsdy; |
|
gdvdx += dsdv * dsdx; |
|
gdvdy += dsdv * dsdy; |
|
|
|
|
|
float du = dsdx*dudx + dsdy*dudy; |
|
float dv = dsdx*dvdx + dsdy*dvdy; |
|
caAtomicAdd(ga0 + j, du); |
|
caAtomicAdd(ga1 + j, dv); |
|
caAtomicAdd(ga2 + j, -du - dv); |
|
} |
|
} |
|
|
|
|
|
((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy); |
|
} |
|
|
|
|
|
__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); } |
|
__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); } |
|
|
|
|
|
|