Spaces:

geyik1
/

dnm3d

Sleeping

File size: 8,883 Bytes

111ba01

// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include "common.h"
#include "interpolate.h"

//------------------------------------------------------------------------
// Forward kernel.

template <bool ENABLE_DA>
static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.depth)
        return;

    // Pixel index.
    int pidx = px + p.width * (py + p.height * pz);

    // Output ptrs.
    float* out = p.out + pidx * p.numAttr;
    float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0;

    // Fetch rasterizer output.
    float4 r = ((float4*)p.rast)[pidx];
    int triIdx = float_to_triidx(r.w) - 1;
    bool triValid = (triIdx >= 0 && triIdx < p.numTriangles);

    // If no geometry in entire warp, zero the output and exit.
    // Otherwise force barys to zero and output with live threads.
    if (__all_sync(0xffffffffu, !triValid))
    {
        for (int i=0; i < p.numAttr; i++)
            out[i] = 0.f;
        if (ENABLE_DA)
            for (int i=0; i < p.numDiffAttr; i++)
                outDA[i] = make_float2(0.f, 0.f);
        return;
    }

    // Fetch vertex indices.
    int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0;
    int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0;
    int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0;

    // Bail out if corrupt indices.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index unless broadcasting.
    if (p.instance_mode && !p.attrBC)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Pointers to attributes.
    const float* a0 = p.attr + vi0 * p.numAttr;
    const float* a1 = p.attr + vi1 * p.numAttr;
    const float* a2 = p.attr + vi2 * p.numAttr;

    // Barys. If no triangle, force all to zero -> output is zero.
    float b0 = triValid ? r.x : 0.f;
    float b1 = triValid ? r.y : 0.f;
    float b2 = triValid ? (1.f - r.x - r.y) : 0.f;

    // Interpolate and write attributes.
    for (int i=0; i < p.numAttr; i++)
        out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i];

    // No diff attrs? Exit.
    if (!ENABLE_DA)
        return;

    // Read bary pixel differentials if we have a triangle.
    float4 db = make_float4(0.f, 0.f, 0.f, 0.f);
    if (triValid)
        db = ((float4*)p.rastDB)[pidx];

    // Unpack a bit.
    float dudx = db.x;
    float dudy = db.y;
    float dvdx = db.z;
    float dvdy = db.w;

    // Calculate the pixel differentials of chosen attributes.    
    for (int i=0; i < p.numDiffAttr; i++)
    {   
        // Input attribute index.
        int j = p.diff_attrs_all ? i : p.diffAttrs[i];
        if (j < 0)
            j += p.numAttr; // Python-style negative indices.

        // Zero output if invalid index.
        float dsdx = 0.f;
        float dsdy = 0.f;
        if (j >= 0 && j < p.numAttr)
        {
            float s0 = a0[j];
            float s1 = a1[j];
            float s2 = a2[j];
            float dsdu = s0 - s2;
            float dsdv = s1 - s2;
            dsdx = dudx*dsdu + dvdx*dsdv;
            dsdy = dudy*dsdu + dvdy*dsdv;
        }

        // Write.
        outDA[i] = make_float2(dsdx, dsdy);
    }
}

// Template specializations.
__global__ void InterpolateFwdKernel  (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); }
__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); }

//------------------------------------------------------------------------
// Gradient kernel.

template <bool ENABLE_DA>
static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p)
{
    // Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.depth)
        return;

    // Pixel index.
    int pidx = px + p.width * (py + p.height * pz);

    // Fetch triangle ID. If none, output zero bary/db gradients and exit.
    float4 r = ((float4*)p.rast)[pidx];
    int triIdx = float_to_triidx(r.w) - 1;
    if (triIdx < 0 || triIdx >= p.numTriangles)
    {
        ((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
        if (ENABLE_DA)
            ((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
        return;
    }

    // Fetch vertex indices.
    int vi0 = p.tri[triIdx * 3 + 0];
    int vi1 = p.tri[triIdx * 3 + 1];
    int vi2 = p.tri[triIdx * 3 + 2];

    // Bail out if corrupt indices.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index unless broadcasting.
    if (p.instance_mode && !p.attrBC)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Initialize coalesced atomics.
    CA_SET_GROUP(triIdx);

    // Pointers to inputs.
    const float* a0 = p.attr + vi0 * p.numAttr;
    const float* a1 = p.attr + vi1 * p.numAttr;
    const float* a2 = p.attr + vi2 * p.numAttr;
    const float* pdy = p.dy + pidx * p.numAttr;

    // Pointers to outputs.
    float* ga0 = p.gradAttr + vi0 * p.numAttr;
    float* ga1 = p.gradAttr + vi1 * p.numAttr;
    float* ga2 = p.gradAttr + vi2 * p.numAttr;

    // Barys and bary gradient accumulators.
    float b0 = r.x;
    float b1 = r.y;
    float b2 = 1.f - r.x - r.y;
    float gb0 = 0.f;
    float gb1 = 0.f;

    // Loop over attributes and accumulate attribute gradients.
    for (int i=0; i < p.numAttr; i++)
    {
        float y = pdy[i];
        float s0 = a0[i];
        float s1 = a1[i];
        float s2 = a2[i];
        gb0 += y * (s0 - s2);
        gb1 += y * (s1 - s2);
        caAtomicAdd(ga0 + i, b0 * y);
        caAtomicAdd(ga1 + i, b1 * y);
        caAtomicAdd(ga2 + i, b2 * y);
    }

    // Write the bary gradients.
    ((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f);

    // If pixel differentials disabled, we're done.
    if (!ENABLE_DA)
        return;

    // Calculate gradients based on attribute pixel differentials.
    const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr;
    float gdudx = 0.f;
    float gdudy = 0.f;
    float gdvdx = 0.f;
    float gdvdy = 0.f;

    // Read bary pixel differentials.
    float4 db = ((float4*)p.rastDB)[pidx];
    float dudx = db.x;
    float dudy = db.y;
    float dvdx = db.z;
    float dvdy = db.w;

    for (int i=0; i < p.numDiffAttr; i++)
    {
        // Input attribute index.
        int j = p.diff_attrs_all ? i : p.diffAttrs[i];
        if (j < 0)
            j += p.numAttr; // Python-style negative indices.

        // Check that index is valid.
        if (j >= 0 && j < p.numAttr)
        {
            float2 dsdxy = dda[i];
            float dsdx = dsdxy.x;
            float dsdy = dsdxy.y;

            float s0 = a0[j];
            float s1 = a1[j];
            float s2 = a2[j];

            // Gradients of db.
            float dsdu = s0 - s2;
            float dsdv = s1 - s2;
            gdudx += dsdu * dsdx;
            gdudy += dsdu * dsdy;
            gdvdx += dsdv * dsdx;
            gdvdy += dsdv * dsdy;

            // Gradients of attributes.
            float du = dsdx*dudx + dsdy*dudy;
            float dv = dsdx*dvdx + dsdy*dvdy;
            caAtomicAdd(ga0 + j, du);
            caAtomicAdd(ga1 + j, dv);
            caAtomicAdd(ga2 + j, -du - dv);
        }
    }

    // Write.
    ((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy);
}

// Template specializations.
__global__ void InterpolateGradKernel  (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); }
__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); }

//------------------------------------------------------------------------