// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Common op attribute parser.

static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA)
{
    if (enableDA)
    {
        OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all));
        if (!p.diff_attrs_all)
        {
            std::vector<int> diff_attrs_vec;
            OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec));
            OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list"));
            OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"));
            p.numDiffAttr = diff_attrs_vec.size();
            memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int));
        }
    }
}
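
//------------------------------------------------------------------------
// Shape conventions, summarized from the sanity checks in the ops below:
//   attr     [numVertices, numAttr]                 (non-instance mode)
//            [minibatch, numVertices, numAttr]      (instance mode; minibatch 1 broadcasts)
//   rast     [minibatch, height, width, 4]
//   tri      [numTriangles, 3]
//   rast_db  [minibatch, height, width, 4]          (only when attribute differentials are enabled)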

//------------------------------------------------------------------------
// Forward TensorFlow op.

template <bool ENABLE_DA>
struct InterpolateFwdOp : public OpKernel
{
    InterpolateKernelParams m_attribs;

    InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
    }

    void Compute(OpKernelContext* ctx)
    {
        InterpolateKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();

        // Get input.
        const Tensor& attr    = ctx->input(0);
        const Tensor& rast    = ctx->input(1);
        const Tensor& tri     = ctx->input(2);
        const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2);
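        // Note: when ENABLE_DA is false the op has no rast_db input, so the reference
        // above is bound to the tri tensor as a harmless placeholder; p.rastDB is left
        // null below and the non-DA kernel never touches it.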

        // Instance rendering mode?
        p.instance_mode = attr.dims() > 2;

        // Extract input dimensions.
        if (p.instance_mode)
        {
            p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
            p.numAttr     = (attr.dims() > 2) ? attr.dim_size(2) : 0;
        }
        else
        {
            p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
            p.numAttr     = (attr.dims() > 1) ? attr.dim_size(1) : 0;
        }
        p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
        p.height       = (rast.dims() > 1) ? rast.dim_size(1) : 0;
        p.width        = (rast.dims() > 2) ? rast.dim_size(2) : 0;
        p.depth        = (rast.dims() > 0) ? rast.dim_size(0) : 0;

        // Sanity checks.
        OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape [>0, >0, >0, 4]"));
        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
        OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
        if (p.instance_mode)
            OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr"));
        if (ENABLE_DA)
        {
            OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape [>0, >0, >0, 4]"));
            OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db"));
            OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db"));
        }

        // All diff attrs mode.
        if (p.diff_attrs_all)
            p.numDiffAttr = p.numAttr;
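        // (In this mode the diff_attrs index list parsed at construction time is
        // ignored; image-space derivatives are produced for every attribute.)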

        // Get input pointers.
        p.attr   = attr.flat<float>().data();
        p.rast   = rast.flat<float>().data();
        p.tri    = tri.flat<int>().data();
        p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0;
        p.rastDB = ENABLE_DA ? rast_db.flat<float>().data() : 0;
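        // attrBC != 0 marks the instance-mode broadcast case where a single attribute
        // tensor (minibatch dimension 1) is shared by every minibatch entry of rast.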

        // Allocate main output tensor.
        Tensor* out_tensor = NULL;
        TensorShape out_shape;
        out_shape.AddDim(p.depth);
        out_shape.AddDim(p.height);
        out_shape.AddDim(p.width);
        out_shape.AddDim(p.numAttr);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
        p.out = out_tensor->flat<float>().data();
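
        // Each differentiable attribute gets two channels in out_da (its image-space
        // derivatives w.r.t. pixel x and y), hence the last dimension numDiffAttr * 2.
        // Without ENABLE_DA, numDiffAttr stays zero and an empty tensor is allocated so
        // the op still produces both declared outputs.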
        // Allocate pixel differential output tensor.
        Tensor* out_da_tensor = NULL;
        out_shape.set_dim(3, p.numDiffAttr * 2);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor));
        p.outDA = ENABLE_DA ? out_da_tensor->flat<float>().data() : 0;
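
        // The kernels use vectorized float4/float2 loads and stores on these buffers.
        // TensorFlow's GPU allocator normally hands out memory with generous alignment,
        // so these checks are expected to pass in practice; they are defensive guards.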
        // Verify that buffers are aligned to allow float2/float4 operations.
        OP_REQUIRES(ctx, !((uintptr_t)p.rast   & 15), errors::Internal("rast input tensor not aligned to float4"));
        OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
        if (ENABLE_DA)
            OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2"));
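
        // The block covers a 2D pixel tile of at most IP_FWD_MAX_KERNEL_BLOCK_WIDTH x
        // IP_FWD_MAX_KERNEL_BLOCK_HEIGHT; the grid tiles the full image, with the
        // minibatch (p.depth) covered by the third grid dimension.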
        // Choose launch parameters.
        dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.width, p.height, p.depth);

        // Launch CUDA kernel.
        void* args[] = {&p};
        void* func = ENABLE_DA ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel;
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
    }
};

REGISTER_OP("InterpolateFwd")
    .Input      ("attr: float")
    .Input      ("rast: float")
    .Input      ("tri: int32")
    .Output     ("out: float")
    .Output     ("out_da: float");

REGISTER_OP("InterpolateFwdDa")
    .Input      ("attr: float")
    .Input      ("rast: float")
    .Input      ("tri: int32")
    .Input      ("rast_db: float")
    .Output     ("out: float")
    .Output     ("out_da: float")
    .Attr       ("diff_attrs_all: int")
    .Attr       ("diff_attrs: list(int)");

REGISTER_KERNEL_BUILDER(Name("InterpolateFwd")  .Device(DEVICE_GPU), InterpolateFwdOp<false>);
REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp<true>);
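
// The non-Da variant omits the rast_db input and the diff_attrs attributes, matching
// enableDA == false in the attribute parser; both variants still declare out_da,
// which is simply empty when attribute differentials are disabled.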

//------------------------------------------------------------------------
// Gradient TensorFlow op.

template <bool ENABLE_DA>
struct InterpolateGradOp : public OpKernel
{
    InterpolateKernelParams m_attribs;

    InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
    }

    void Compute(OpKernelContext* ctx)
    {
        InterpolateKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();

        // Get input.
        const Tensor& attr    = ctx->input(0);
        const Tensor& rast    = ctx->input(1);
        const Tensor& tri     = ctx->input(2);
        const Tensor& dy      = ctx->input(3);
        const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3);
        const Tensor& dda     = ctx->input(ENABLE_DA ? 5 : 3);
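        // As in the forward op, rast_db and dda fall back to an existing input (dy)
        // as unused placeholders when ENABLE_DA is false.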

        // Instance rendering mode?
        p.instance_mode = attr.dims() > 2;

        // Extract input dimensions.
        if (p.instance_mode)
        {
            p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
            p.numAttr     = (attr.dims() > 2) ? attr.dim_size(2) : 0;
        }
        else
        {
            p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
            p.numAttr     = (attr.dims() > 1) ? attr.dim_size(1) : 0;
        }
        p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
        p.depth        = (rast.dims() > 0) ? rast.dim_size(0) : 0;
        p.height       = (rast.dims() > 1) ? rast.dim_size(1) : 0;
        p.width        = (rast.dims() > 2) ? rast.dim_size(2) : 0;
        int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1;
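        // attr_depth is the minibatch size of the attribute tensor: 1 in non-instance
        // mode, and 1 as well when a single attribute tensor is broadcast in instance mode.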

        // Sanity checks.
        OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape [>0, >0, >0, 4]"));
        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
        OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]"));
        OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("attribute count mismatch between inputs dy, attr"));
        OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr"));
        if (ENABLE_DA)
        {
            OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]"));
            OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda"));
        }

        // All diff attrs mode.
        if (p.diff_attrs_all)
            p.numDiffAttr = p.numAttr;

        // Get input pointers.
        p.attr   = attr.flat<float>().data();
        p.rast   = rast.flat<float>().data();
        p.tri    = tri.flat<int>().data();
        p.dy     = dy.flat<float>().data();
        p.rastDB = ENABLE_DA ? rast_db.flat<float>().data() : 0;
        p.dda    = ENABLE_DA ? dda.flat<float>().data() : 0;
        p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;

        // Allocate attribute gradient output tensor.
        Tensor* grad_attr_tensor = NULL;
        TensorShape grad_attr_shape;
        if (p.instance_mode)
            grad_attr_shape.AddDim(attr_depth);
        grad_attr_shape.AddDim(p.numVertices);
        grad_attr_shape.AddDim(p.numAttr);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor));
        p.gradAttr = grad_attr_tensor->flat<float>().data();
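        // grad_attr mirrors the shape of attr: [attr_depth, numVertices, numAttr] in
        // instance mode, [numVertices, numAttr] otherwise.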

        // Allocate bary gradient output tensor.
        Tensor* grad_rast_tensor = NULL;
        TensorShape grad_rast_shape;
        grad_rast_shape.AddDim(p.depth);
        grad_rast_shape.AddDim(p.height);
        grad_rast_shape.AddDim(p.width);
        grad_rast_shape.AddDim(4);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor));
        p.gradRaster = grad_rast_tensor->flat<float>().data();

        // Allocate bary pixel diff gradient output tensor.
        if (ENABLE_DA)
        {
            Tensor* grad_rast_db_tensor = NULL;
            OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor));
            p.gradRasterDB = grad_rast_db_tensor->flat<float>().data();
        }
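
        // The attribute gradient buffer is accumulated into by the kernel (many pixels
        // contribute to the same vertex), so it must start from zero. The per-pixel
        // gradient outputs are written in full and need no clearing.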
        // Clear attribute gradients.
        cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream);

        // Verify that buffers are aligned to allow float2/float4 operations.
        OP_REQUIRES(ctx, !((uintptr_t)p.rast       & 15), errors::Internal("rast input tensor not aligned to float4"));
        OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4"));
        if (ENABLE_DA)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.dda          & 7),  errors::Internal("dda input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.rastDB       & 15), errors::Internal("rast_db input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4"));
        }

        // Choose launch parameters.
        dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.width, p.height, p.depth);

        // Launch CUDA kernel.
        void* args[] = {&p};
        void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel;
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
    }
};

REGISTER_OP("InterpolateGrad")
    .Input      ("attr: float")
    .Input      ("rast: float")
    .Input      ("tri: int32")
    .Input      ("dy: float")
    .Output     ("grad_attr: float")
    .Output     ("grad_rast: float");

REGISTER_OP("InterpolateGradDa")
    .Input      ("attr: float")
    .Input      ("rast: float")
    .Input      ("tri: int32")
    .Input      ("dy: float")
    .Input      ("rast_db: float")
    .Input      ("dda: float")
    .Output     ("grad_attr: float")
    .Output     ("grad_rast: float")
    .Output     ("grad_rast_db: float")
    .Attr       ("diff_attrs_all: int")
    .Attr       ("diff_attrs: list(int)");

REGISTER_KERNEL_BUILDER(Name("InterpolateGrad")  .Device(DEVICE_GPU), InterpolateGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp<true>);
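
// The gradient ops are registered as stand-alone kernels; hooking them into
// TensorFlow's autodiff (feeding dy/dda from the forward op's output gradients) is
// presumably handled by the Python-side wrapper, e.g. through a custom gradient.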

//------------------------------------------------------------------------