// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Forward TensorFlow op.

struct RasterizeFwdOp : public OpKernel
{
    RasterizeGLState m_glState; // OpenGL-related persistent state.
    int m_tri_const; // 1 if triangle array is known to be constant.

    RasterizeFwdOp(OpKernelConstruction* ctx):
        OpKernel(ctx)
    {
        memset(&m_glState, 0, sizeof(RasterizeGLState));
        OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
        OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
    }
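    // Note: attributes are read once at construction. enable_db selects whether the second
    // output (out_db) is produced, and tri_const allows the triangle buffer to be uploaded
    // only on the first launch (see Compute below).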
    void Compute(OpKernelContext* ctx)
    {
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();

        // Check that input shapes are correct.
        const Tensor& pos = ctx->input(0);
        const Tensor& tri = ctx->input(1);
        const Tensor& resolution = ctx->input(2);
        const Tensor& ranges = ctx->input(3);

        // Determine number of outputs.
        int num_outputs = m_glState.enableDB ? 2 : 1;

        // Determine instance mode and check input dimensions.
        bool instance_mode = pos.dims() > 2;
        if (instance_mode)
        {
            OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]"));
            OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
            OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
        }
        else
        {
            OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
            OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
            OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
            OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
        }

        // Get output shape.
        const int32_t* res_in = resolution.flat<int32_t>().data(); // This is in CPU memory.
        int height = res_in[0];
        int width  = res_in[1];
        int depth  = instance_mode ? pos.dim_size(0) : ranges.dim_size(0);
        OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]"));

        // Get position and triangle buffer sizes in int32/float32.
        int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1);
        int triCount = 3 * tri.dim_size(0);
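        // posCount and triCount are element counts: 4 floats per vertex position and 3 indices
        // per triangle. They are passed below to size the GL vertex/index buffers.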
        // Init context and GL?
        bool initCtx = !m_glState.glFBO;
        if (initCtx)
        {
            const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
            int cudaDeviceIdx = g ? g->gpu_id : -1;
            rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
        }
        else
            setGLContext(m_glState.glctx); // (Re-)Activate GL context.
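        // The GL context, framebuffer and buffers in m_glState persist across Compute() calls,
        // so only the first invocation pays the context creation cost; later calls just
        // re-activate the existing context.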
        // Resize all buffers.
        bool changes = false;
        rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
        if (changes)
        {
#ifdef _WIN32
            // Workaround for occasional blank first frame on Windows.
            releaseGLContext();
            setGLContext(m_glState.glctx);
#endif
        }

        // Copy input data to GL and render.
        const float* posPtr = pos.flat<float>().data();
        const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
        const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat<int32_t>().data() : NULL; // Copy triangles only if needed.
        int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0;
        rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1);
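        // The trailing -1 is presumably a depth-peeling pass index handled inside
        // rasterizeRender; -1 appears to request a single, non-peeled rasterization pass.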
        // Allocate output tensors.
        TensorShape output_shape;
        output_shape.AddDim(depth);
        output_shape.AddDim(height);
        output_shape.AddDim(width);
        output_shape.AddDim(4);
        float* outputPtr[2];
        for (int i=0; i < 2; i++)
        {
            if (i >= num_outputs)
                output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor.
            Tensor* output_tensor = NULL;
            OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor));
            if (i < num_outputs)
                outputPtr[i] = output_tensor->flat<float>().data();
        }
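        // Both outputs are nominally [depth, height, width, 4]. When enable_db is 0, out_db is
        // still allocated so the op produces both declared outputs, but with zero channels.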
        // Copy rasterized results into CUDA buffers.
        rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth);

        // Done. Release GL context.
        releaseGLContext();
    }
};

REGISTER_OP("RasterizeFwd")
    .Input ("pos: float")
    .Input ("tri: int32")
    .Input ("resolution: int32")
    .Input ("ranges: int32")
    .Output ("out: float")
    .Output ("out_db: float")
    .Attr ("enable_db: int")
    .Attr ("tri_const: int");

REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp);
//------------------------------------------------------------------------
// Gradient TensorFlow op.

template <bool ENABLE_DB>
struct RasterizeGradOp : public OpKernel
{
    RasterizeGradParams m_attribs;

    RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
    }

    void Compute(OpKernelContext* ctx)
    {
        RasterizeGradParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();

        // Input tensors.
        const Tensor& pos = ctx->input(0);
        const Tensor& tri = ctx->input(1);
        const Tensor& out = ctx->input(2);
        const Tensor& dy  = ctx->input(3);
        const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3);
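        // When ENABLE_DB is false the op has no ddb input; input(3) is reused here only so the
        // reference stays valid, and it is never read (p.ddb is set to 0 below).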
        // Determine instance mode.
        p.instance_mode = (pos.dims() > 2) ? 1 : 0;

        // Shape is taken from the rasterizer output tensor.
        OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4"));
        p.depth  = out.dim_size(0);
        p.height = out.dim_size(1);
        p.width  = out.dim_size(2);
        OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]"));

        // Check other shapes.
        if (p.instance_mode)
            OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]"));
        else
            OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]"));
        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
        OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]"));
        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]"));
        if (ENABLE_DB)
            OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]"));

        // Populate parameters.
        p.numTriangles = tri.dim_size(0);
        p.numVertices  = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0);
        p.pos = pos.flat<float>().data();
        p.tri = tri.flat<int>().data();
        p.out = out.flat<float>().data();
        p.dy  = dy.flat<float>().data();
        p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;

        // Set up pixel position to clip space x, y transform.
        p.xs = 2.f / (float)p.width;
        p.xo = 1.f / (float)p.width - 1.f;
        p.ys = 2.f / (float)p.height;
        p.yo = 1.f / (float)p.height - 1.f;
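        // With these factors, pixel index px maps to the clip-space x coordinate of its center:
        //   x = px * xs + xo = (2*px + 1) / width - 1
        // and likewise for y with ys/yo.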
        // Allocate output tensor for position gradients.
        Tensor* grad_tensor = NULL;
        TensorShape grad_shape;
        if (p.instance_mode)
            grad_shape.AddDim(p.depth);
        grad_shape.AddDim(p.numVertices);
        grad_shape.AddDim(4);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor));
        p.grad = grad_tensor->flat<float>().data();

        // Clear the output buffers.
        size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float);
        cudaMemsetAsync(p.grad, 0, gradBytes, stream);
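        // The gradient buffer must start from zero because the kernel presumably accumulates
        // per-pixel contributions into it (e.g. via atomic adds) rather than overwriting it.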
        // Verify that buffers are aligned to allow float2/float4 operations.
        OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
        OP_REQUIRES(ctx, !((uintptr_t)p.dy  & 7),  errors::Internal("dy input tensor not aligned to float2"));
        if (ENABLE_DB)
            OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4"));

        // Choose launch parameters.
        dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
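        // blockSize tiles the image plane up to the given maximum block extents; gridSize is
        // then sized so the grid covers the full width x height x depth output volume.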
        // Launch CUDA kernel.
        void* args[] = {&p};
        void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
    }
};

REGISTER_OP("RasterizeGrad")
    .Input ("pos: float")
    .Input ("tri: int32")
    .Input ("out: float")
    .Input ("dy: float")
    .Output ("grad: float");

REGISTER_OP("RasterizeGradDb")
    .Input ("pos: float")
    .Input ("tri: int32")
    .Input ("out: float")
    .Input ("dy: float")
    .Input ("ddb: float")
    .Output ("grad: float");

REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp<true>);
//------------------------------------------------------------------------