#include "antialias.h"

//------------------------------------------------------------------------
// Helpers.

#define F32_MAX (3.402823466e+38f)

// True if the sign bits of a and b agree.
static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }

// True if n0/d0 > n1/d1, evaluated without division: cross-multiply, and flip the result when the denominators differ in sign.
static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }

// Index (0, 1, or 2) of the largest of the three rationals n0/d0, n1/d1, n2/d2.
static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
{
    bool g10 = rational_gt(n1, n0, d1, d0);
    bool g20 = rational_gt(n2, n0, d2, d0);
    bool g21 = rational_gt(n2, n1, d2, d1);
    if (g20 && g21) return 2;
    if (g10) return 1;
    return 0;
}
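
// A small worked example (comment only, not part of the build) of the
// division-free comparison above: asking whether 3/4 > 2/3 becomes
// 3*3 > 2*4, i.e. 9 > 8, which is true and the denominators share a sign,
// so rational_gt(3.f, 2.f, 4.f, 3.f) returns true. With denominators of
// opposite sign, e.g. 1/(-2) vs 1/2, the cross-product test inverts,
// which is why the result is compared against same_sign(d0, d1).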

//------------------------------------------------------------------------
// Work items written into the antialiasing work buffer. Accessed directly as int4 in the kernels.

struct AAWorkItem
{
    enum
    {
        EDGE_MASK     = 3,  // Edge index in the lowest bits.
        FLAG_DOWN_BIT = 2,  // Neighbor is below instead of to the right.
        FLAG_TRI1_BIT = 3,  // Chosen triangle is the neighbor pixel's (tri1).
    };

    int          px, py;    // Pixel x, y.
    unsigned int pz_flags;  // High 16 bits = pixel z (minibatch index), low bits = edge index and flags.
    float        alpha;     // Antialiasing alpha value. Zero if no antialiasing.
};
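
// A sketch (comment only, mirroring the packing done by the kernels below,
// with hypothetical local variable names) of how pz_flags is built and
// taken apart:
//
//   unsigned int pz_flags = (pz << 16)                              // minibatch index
//                         | di                                      // edge index 0..2
//                         | (d    << AAWorkItem::FLAG_DOWN_BIT)     // down vs. right neighbor
//                         | (tri1 << AAWorkItem::FLAG_TRI1_BIT);    // neighbor pixel's triangle
//
//   int pz   = (int)(pz_flags >> 16);
//   int di   = pz_flags & AAWorkItem::EDGE_MASK;
//   int d    = (pz_flags >> AAWorkItem::FLAG_DOWN_BIT) & 1;
//   int tri1 = (pz_flags >> AAWorkItem::FLAG_TRI1_BIT) & 1;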

//------------------------------------------------------------------------
// Edge hashing.

#define JENKINS_MAGIC (0x9e3779b9u)

// Jenkins mix step used to hash the 64-bit edge keys.
static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
{
    a -= b; a -= c; a ^= (c>>13);
    b -= c; b -= a; b ^= (a<<8);
    c -= a; c -= b; c ^= (b>>13);
    a -= b; a -= c; a ^= (c>>12);
    b -= c; b -= a; b ^= (a<<16);
    c -= a; c -= b; c ^= (b>>5);
    a -= b; a -= c; a ^= (c>>3);
    b -= c; b -= a; b ^= (a<<10);
    c -= a; c -= b; c ^= (b>>15);
}

// Open-addressed probing of the edge hash: the key is mixed into an initial
// slot index and a skip value, and the skip is forced odd so that, with a
// power-of-two table size, the probe sequence eventually visits every slot.
class HashIndex
{
public:
    __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
    {
        m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1;
        m_idx  = (uint32_t)(key & 0xffffffffu);
        m_skip = (uint32_t)(key >> 32);
        uint32_t dummy = JENKINS_MAGIC;
        jenkins_mix(m_idx, m_skip, dummy);
        m_idx  &= m_mask;
        m_skip &= m_mask;
        m_skip |= 1;
    }
    __device__ __forceinline__ int  get(void) const { return m_idx; }
    __device__ __forceinline__ void next(void)      { m_idx = (m_idx + m_skip) & m_mask; }
private:
    uint32_t m_idx, m_skip, m_mask;
};
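
// Layout of one edge-hash entry (a uint4 in p.evHash), as read and written by
// hash_insert() / hash_find() below. Descriptive comment only:
//
//   entry.x, entry.y : the 64-bit edge key (low word, high word), claimed with atomicCAS.
//   entry.z          : first stored value (opposite vertex index + 1), or 0 if unused.
//   entry.w          : second stored value, for the other triangle sharing the edge, or 0 if unused.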

// Insert value v for the given edge key. Each entry stores up to two values,
// one per triangle sharing the edge.
static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
{
    HashIndex idx(p, key);
    while(1)
    {
        uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
        if (prev == 0 || prev == key)
            break;
        idx.next();
    }
    int* q = (int*)&p.evHash[idx.get()];
    int a = atomicCAS(q+2, 0, v);
    if (a != 0 && a != v)
        atomicCAS(q+3, 0, v);
}

// Look up the entry for the given edge key. Returns the two stored values;
// a component is zero if the key was not found or that slot is unused.
static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
{
    HashIndex idx(p, key);
    while(1)
    {
        uint4 entry = p.evHash[idx.get()];
        uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
        if (k == key || k == 0)
            return make_int2((int)entry.z, (int)entry.w);
        idx.next();
    }
}

// Record that vertex vn lies opposite to edge (va, vb).
static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
{
    if (va == vb)
        return;

    uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
    uint64_t v1 = (uint32_t)max(va, vb) + 1;
    uint64_t vk = v0 | (v1 << 32); // hash key
    hash_insert(p, vk, vn + 1);
}

// Given edge (va, vb) and the opposite vertex vr on this triangle, return the
// opposite vertex on the neighboring triangle, or -1 if there is none.
static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
{
    if (va == vb)
        return -1;

    uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
    uint64_t v1 = (uint32_t)max(va, vb) + 1;
    uint64_t vk = v0 | (v1 << 32); // hash key
    int2 vn = hash_find(p, vk) - 1;
    if (vn.x == vr) return vn.y;
    if (vn.y == vr) return vn.x;
    return -1;
}
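
// Comment-only example of the edge hash roundtrip: suppose the mesh has two
// triangles (0, 1, 2) and (2, 1, 3) sharing the edge between vertices 1 and 2.
// AntialiasFwdMeshKernel below inserts opposite vertex 0 for that edge from
// the first triangle and opposite vertex 3 from the second. Later,
// evhash_find_vertex(p, 2, 1, 0) returns 3 and evhash_find_vertex(p, 1, 2, 3)
// returns 0, while querying an unshared (silhouette) edge returns -1.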

//------------------------------------------------------------------------
// Mesh analysis kernel.

__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
{
    // One thread per triangle.
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= p.numTriangles)
        return;

    // Fetch vertex indices.
    int v0 = p.tri[idx * 3 + 0];
    int v1 = p.tri[idx * 3 + 1];
    int v2 = p.tri[idx * 3 + 2];

    // Skip triangles with out-of-range vertex indices.
    if (v0 < 0 || v0 >= p.numVertices ||
        v1 < 0 || v1 >= p.numVertices ||
        v2 < 0 || v2 >= p.numVertices)
        return;

    // Skip degenerate triangles.
    if (v0 == v1 || v1 == v2 || v2 == v0)
        return;

    // Insert all three edges with their opposite vertices into the edge hash.
    evhash_insert_vertex(p, v1, v2, v0);
    evhash_insert_vertex(p, v2, v0, v1);
    evhash_insert_vertex(p, v0, v1, v2);
}

//------------------------------------------------------------------------
// Discontinuity finder kernel.

__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
    int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.n)
        return;

    // Pointer to our TriIdx and fetch.
    int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
    float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.

    // Look right, clamp at edge.
    int pidx1 = pidx0;
    if (px < p.width - 1)
        pidx1 += 4;
    float tri1 = p.rasterOut[pidx1];

    // Look down, clamp at edge.
    int pidx2 = pidx0;
    if (py < p.height - 1)
        pidx2 += p.width << 2;
    float tri2 = p.rasterOut[pidx2];

    // Determine amount of work.
    int count = 0;
    if (tri1 != tri0) count  = 1;
    if (tri2 != tri0) count += 1;
    if (!count)
        return; // Exit warp.

    // Coalesce work counter update to once per CTA.
    __shared__ int s_temp;
    s_temp = 0;
    __syncthreads();
    int idx = atomicAdd(&s_temp, count);
    __syncthreads();
    if (idx == 0)
    {
        int base = atomicAdd(&p.workBuffer[0].x, s_temp);
        s_temp = base + 1; // Don't clobber the counters in the first slot.
    }
    __syncthreads();
    idx += s_temp;

    // Write to memory.
    if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
    if (tri2 != tri0) p.workBuffer[idx]   = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
}
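
// Work buffer layout assumed by the kernels above and below (descriptive
// comment only): workBuffer[0] holds the counters, with .x = number of work
// items appended by this kernel and .y = the persistent-thread consumption
// counter used by the analysis and gradient kernels; the actual AAWorkItem
// records start at workBuffer[1].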

//------------------------------------------------------------------------
// Forward analysis kernel.

__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
{
    __shared__ int s_base;
    int workCount = p.workBuffer[0].x;
    for(;;)
    {
        // Persistent threads work fetcher.
        __syncthreads();
        if (threadIdx.x == 0)
            s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
        __syncthreads();
        int thread_idx = s_base + threadIdx.x;
        if (thread_idx >= workCount)
            return;

        int4* pItem = p.workBuffer + thread_idx + 1;
        int4 item = *pItem;
        int px = item.x;
        int py = item.y;
        int pz = (int)(((unsigned int)item.z) >> 16);
        int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;

        int pixel0 = px + p.width * (py + p.height * pz);
        int pixel1 = pixel0 + (d ? p.width : 1);
        float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
        float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
        int tri0 = float_to_triidx(zt0.y) - 1;
        int tri1 = float_to_triidx(zt1.y) - 1;

        // Select triangle based on background / depth.
        int tri = (tri0 >= 0) ? tri0 : tri1;
        if (tri0 >= 0 && tri1 >= 0)
            tri = (zt0.x < zt1.x) ? tri0 : tri1;
        if (tri == tri1)
        {
            // Calculate with respect to the neighbor pixel if we chose that triangle.
            px += 1 - d;
            py += d;
        }

        // Bail out if triangle index is corrupt.
        if (tri < 0 || tri >= p.numTriangles)
            continue;

        // Fetch vertex indices.
        int vi0 = p.tri[tri * 3 + 0];
        int vi1 = p.tri[tri * 3 + 1];
        int vi2 = p.tri[tri * 3 + 2];

        // Bail out if vertex indices are corrupt.
        if (vi0 < 0 || vi0 >= p.numVertices ||
            vi1 < 0 || vi1 >= p.numVertices ||
            vi2 < 0 || vi2 >= p.numVertices)
            continue;

        // Fetch opposite vertex indices. Use the vertex itself (always a silhouette) if no opposite vertex exists.
        int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
        int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
        int op2 = evhash_find_vertex(p, vi1, vi0, vi2);

        // Instance mode: Adjust vertex indices based on minibatch index.
        if (p.instance_mode)
        {
            int vbase = pz * p.numVertices;
            vi0 += vbase;
            vi1 += vbase;
            vi2 += vbase;
            if (op0 >= 0) op0 += vbase;
            if (op1 >= 0) op1 += vbase;
            if (op2 >= 0) op2 += vbase;
        }

        // Fetch vertex positions.
        float4 p0 = ((float4*)p.pos)[vi0];
        float4 p1 = ((float4*)p.pos)[vi1];
        float4 p2 = ((float4*)p.pos)[vi2];
        float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
        float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
        float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];

        // Project vertices to pixel space.
        float w0  = 1.f / p0.w;
        float w1  = 1.f / p1.w;
        float w2  = 1.f / p2.w;
        float ow0 = 1.f / o0.w;
        float ow1 = 1.f / o1.w;
        float ow2 = 1.f / o2.w;
        float fx = (float)px + .5f - p.xh;
        float fy = (float)py + .5f - p.yh;
        float x0 = p0.x * w0 * p.xh - fx;
        float y0 = p0.y * w0 * p.yh - fy;
        float x1 = p1.x * w1 * p.xh - fx;
        float y1 = p1.y * w1 * p.yh - fy;
        float x2 = p2.x * w2 * p.xh - fx;
        float y2 = p2.y * w2 * p.yh - fy;
        float ox0 = o0.x * ow0 * p.xh - fx;
        float oy0 = o0.y * ow0 * p.yh - fy;
        float ox1 = o1.x * ow1 * p.xh - fx;
        float oy1 = o1.y * ow1 * p.yh - fy;
        float ox2 = o2.x * ow2 * p.xh - fx;
        float oy2 = o2.y * ow2 * p.yh - fy;

        // Signs to kill non-silhouette edges.
        float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0);      // Triangle itself.
        float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0);  // Wings.
        float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
        float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);

        // If no matching signs anywhere, skip the rest.
        if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
        {
            // XY flip for horizontal edges.
            if (d)
            {
                swap(x0, y0);
                swap(x1, y1);
                swap(x2, y2);
            }

            float dx0 = x2 - x1;
            float dx1 = x0 - x2;
            float dx2 = x1 - x0;
            float dy0 = y2 - y1;
            float dy1 = y0 - y2;
            float dy2 = y1 - y0;

            // Check if an edge crosses between us and the neighbor pixel.
            float dc = -F32_MAX;
            float ds = (tri == tri0) ? 1.f : -1.f;
            float d0 = ds * (x1*dy0 - y1*dx0);
            float d1 = ds * (x2*dy1 - y2*dx1);
            float d2 = ds * (x0*dy2 - y0*dx2);

            if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
            if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
            if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;

            int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
            if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
            if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
            if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
            float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.

            // Adjust output image if a suitable edge was found.
            if (dc > -eps && dc < 1.f + eps)
            {
                dc = fminf(fmaxf(dc, 0.f), 1.f);
                float alpha = ds * (.5f - dc);
                const float* pColor0 = p.color + pixel0 * p.channels;
                const float* pColor1 = p.color + pixel1 * p.channels;
                float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
                for (int i=0; i < p.channels; i++)
                    atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));

                // Rewrite the work item's flags and alpha. Keep original px, py.
                unsigned int flags = pz << 16;
                flags |= di;
                flags |= d << AAWorkItem::FLAG_DOWN_BIT;
                flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
                ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
            }
        }
    }
}
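
// Explanatory comment only: dc in [0, 1] can be read as the position where
// the silhouette edge crosses the segment between the two pixel centers
// (0 at pixel0, 1 at the neighbor), and alpha = ds * (0.5 - dc) is the signed
// blend weight with |alpha| <= 0.5. The affected pixel (pixel0 when
// alpha > 0, otherwise pixel1) receives output += alpha * (color1 - color0);
// the gradient kernel below replays this from the stored work item to route
// dL/d(output) into color and vertex position gradients.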

//------------------------------------------------------------------------
// Gradient kernel.

__global__ void AntialiasGradKernel(const AntialiasKernelParams p)
{
    // Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
    __shared__ int s_base; // Work counter communication across entire CTA.

    int workCount = p.workBuffer[0].x;

    for(;;)
    {
        // Persistent threads work fetcher.
        __syncthreads();
        if (threadIdx.x == 0)
            s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
        __syncthreads();
        int thread_idx = s_base + threadIdx.x;
        if (thread_idx >= workCount)
            return;

        // Read work item filled out by forward kernel.
        int4 item = p.workBuffer[thread_idx + 1];
        unsigned int amask = __ballot_sync(0xffffffffu, item.w);
        if (item.w == 0)
            continue; // No effect.

        // Unpack work item and replicate setup from forward analysis kernel.
        int px = item.x;
        int py = item.y;
        int pz = (int)(((unsigned int)item.z) >> 16);
        int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
        float alpha = __int_as_float(item.w);
        int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
        int di = item.z & AAWorkItem::EDGE_MASK;
        float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31));
        int pixel0 = px + p.width * (py + p.height * pz);
        int pixel1 = pixel0 + (d ? p.width : 1);
        int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
        if (tri1)
        {
            px += 1 - d;
            py += d;
        }

        // Bail out if triangle index is corrupt.
        bool triFail = (tri < 0 || tri >= p.numTriangles);
        amask = __ballot_sync(amask, !triFail);
        if (triFail)
            continue;

        // Outgoing color gradients.
        float* pGrad0 = p.gradColor + pixel0 * p.channels;
        float* pGrad1 = p.gradColor + pixel1 * p.channels;

        // Incoming color gradients.
        const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;

        // Position gradient weight based on colors and incoming gradients.
        float dd = 0.f;
        const float* pColor0 = p.color + pixel0 * p.channels;
        const float* pColor1 = p.color + pixel1 * p.channels;

        // Loop over channels and accumulate.
        for (int i=0; i < p.channels; i++)
        {
            float dy = pDy[i];
            if (dy != 0.f)
            {
                // Update position gradient weight.
                dd += dy * (pColor1[i] - pColor0[i]);

                // Update color gradients. No coalescing because all have different targets.
                float v = alpha * dy;
                atomicAdd(&pGrad0[i], -v);
                atomicAdd(&pGrad1[i], v);
            }
        }

        // If position weight is zero, skip the rest.
        bool noGrad = (dd == 0.f);
        amask = __ballot_sync(amask, !noGrad);
        if (noGrad)
            continue;

        // Fetch vertex indices of the active edge and their positions.
        int i1 = (di < 2) ? (di + 1) : 0;
        int i2 = (i1 < 2) ? (i1 + 1) : 0;
        int vi1 = p.tri[3 * tri + i1];
        int vi2 = p.tri[3 * tri + i2];

        // Bail out if vertex indices are corrupt.
        bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
        amask = __ballot_sync(amask, !vtxFail);
        if (vtxFail)
            continue;

        // Instance mode: Adjust vertex indices based on minibatch index.
        if (p.instance_mode)
        {
            vi1 += pz * p.numVertices;
            vi2 += pz * p.numVertices;
        }

        // Fetch vertex positions.
        float4 p1 = ((float4*)p.pos)[vi1];
        float4 p2 = ((float4*)p.pos)[vi2];

        // Project vertices to pixel space.
        float pxh = p.xh;
        float pyh = p.yh;
        float fx = (float)px + .5f - pxh;
        float fy = (float)py + .5f - pyh;

        // XY flip for horizontal edges.
        if (d)
        {
            swap(p1.x, p1.y);
            swap(p2.x, p2.y);
            swap(pxh, pyh);
            swap(fx, fy);
        }

        // Gradient calculation setup.
        float w1 = 1.f / p1.w;
        float w2 = 1.f / p2.w;
        float x1 = p1.x * w1 * pxh - fx;
        float y1 = p1.y * w1 * pyh - fy;
        float x2 = p2.x * w2 * pxh - fx;
        float y2 = p2.y * w2 * pyh - fy;
        float dx = x2 - x1;
        float dy = y2 - y1;
        float db = x1*dy - y1*dx;

        // Compute inverse delta-y with epsilon.
        float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
        float iy = 1.f / (dy + ep);

        // Compute position gradients.
        float dby = db * iy;
        float iw1 = -w1 * iy * dd;
        float iw2 =  w2 * iy * dd;
        float gp1x = iw1 * pxh * y2;
        float gp2x = iw2 * pxh * y1;
        float gp1y = iw1 * pyh * (dby - x2);
        float gp2y = iw2 * pyh * (dby - x1);
        float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
        float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;

        // XY flip the gradients.
        if (d)
        {
            swap(gp1x, gp1y);
            swap(gp2x, gp2y);
        }

        // Kill position gradients if alpha was saturated.
        if (fabsf(alpha) >= 0.5f)
        {
            gp1x = gp1y = gp1w = 0.f;
            gp2x = gp2y = gp2w = 0.f;
        }

        // Initialize coalesced atomics. Match both triangle ID and edge index.
        // Also note that some threads may be inactive.
        CA_SET_GROUP_MASK(tri ^ (di << 30), amask);

        // Accumulate gradients.
        caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
        caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
    }
}

//------------------------------------------------------------------------