// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
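// Note: this file appears to be compiled as part of a larger translation
// unit (in the nvdiffrast source tree it is #included from tf_all.cu, which
// supplies the TensorFlow headers and the shared rasterizer helpers), so it
// is not self-contained.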
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
}
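// Attribute semantics, as used below: enable_db selects whether the op
// produces the auxiliary out_db output (it fixes num_outputs in Compute()),
// and tri_const is a caller promise that the triangle tensor never changes,
// letting Compute() skip re-uploading triangles to GL after the first call.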
void Compute(OpKernelContext* ctx)
{
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Check that input shapes are correct.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& resolution = ctx->input(2);
const Tensor& ranges = ctx->input(3);
// Determine number of outputs.
int num_outputs = m_glState.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.dims() > 2;
if (instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
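// In instance mode, pos holds per-instance vertex positions [N, V, 4] and the
// whole triangle list is rendered once per instance; in range mode, pos is a
// shared [V, 4] buffer and each row of ranges appears to select a
// [start, count] sub-range of tri to render into one output image.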
// Get output shape.
const int32_t* res_in = resolution.flat<int32_t>().data(); // This is in CPU memory.
int height = res_in[0];
int width = res_in[1];
int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0);
OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]"));
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1);
int triCount = 3 * tri.dim_size(0);
// Need to initialize the GL context and resources (first call)?
bool initCtx = !m_glState.glFBO;
if (initCtx)
{
const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
int cudaDeviceIdx = g ? g->gpu_id : -1;
rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
}
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
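// The GL context lives in m_glState across Compute() calls: it is created
// lazily on the first call, bound to the CUDA device TensorFlow assigned to
// this op, and merely re-activated on subsequent calls.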
// Resize all buffers.
bool changes = false;
rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
if (changes)
{
#ifdef _WIN32
// Workaround for occasional blank first frame on Windows.
releaseGLContext();
setGLContext(m_glState.glctx);
#endif
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat<int32_t>().data() : NULL; // Copy triangles only if needed.
int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0;
rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1);
// Allocate output tensors.
TensorShape output_shape;
output_shape.AddDim(depth);
output_shape.AddDim(height);
output_shape.AddDim(width);
output_shape.AddDim(4);
float* outputPtr[2];
for (int i=0; i < 2; i++)
{
if (i >= num_outputs)
output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor.
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor));
if (i < num_outputs)
outputPtr[i] = output_tensor->flat<float>().data();
}
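// TensorFlow requires every registered output to be produced, so when
// enable_db is off the second output is still allocated, but with zero
// channels ([depth, height, width, 0]) so it carries no data. outputPtr[1]
// is then left unset, which is harmless as long as the copy below only
// consults it when enableDB is on.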
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth);
// Done. Release GL context.
releaseGLContext();
}
};
REGISTER_OP("RasterizeFwd")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("resolution: int32")
.Input ("ranges: int32")
.Output ("out: float")
.Output ("out_db: float")
.Attr ("enable_db: int")
.Attr ("tri_const: int");
REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp);
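// The HostMemory() declarations above are what make the raw CPU-side reads
// of resolution (and ranges) in Compute() valid: TensorFlow keeps those two
// inputs in host memory rather than device memory.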
//------------------------------------------------------------------------
// Gradient TensorFlow op.
template <bool ENABLE_DB>
struct RasterizeGradOp : public OpKernel
{
RasterizeGradParams m_attribs;
RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
}
void Compute(OpKernelContext* ctx)
{
RasterizeGradParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Input tensors.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& out = ctx->input(2);
const Tensor& dy = ctx->input(3);
const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3);
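// When ENABLE_DB is false there is no fifth input, so ddb above simply
// aliases dy and is never read (p.ddb is set to 0 below).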
// Determine instance mode.
p.instance_mode = (pos.dims() > 2) ? 1 : 0;
// Shape is taken from the rasterizer output tensor.
OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4"));
p.depth = out.dim_size(0);
p.height = out.dim_size(1);
p.width = out.dim_size(2);
OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]"));
// Check other shapes.
if (p.instance_mode)
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]"));
else
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]"));
if (ENABLE_DB)
OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]"));
// Populate parameters.
p.numTriangles = tri.dim_size(0);
p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0);
p.pos = pos.flat<float>().data();
p.tri = tri.flat<int>().data();
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
p.ys = 2.f / (float)p.height;
p.yo = 1.f / (float)p.height - 1.f;
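// The scale/offset pair maps an integer pixel index to the clip-space
// coordinate of the pixel center; e.g. for x (and symmetrically for y):
//
//   x_clip = p.xs * px + p.xo = 2.f * (px + 0.5f) / p.width - 1.f
//
// so pixel centers land in (-1, 1). This is presumably how the gradient
// kernel reconstructs clip-space positions from pixel indices.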
// Allocate output tensor for position gradients.
Tensor* grad_tensor = NULL;
TensorShape grad_shape;
if (p.instance_mode)
grad_shape.AddDim(p.depth);
grad_shape.AddDim(p.numVertices);
grad_shape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor));
p.grad = grad_tensor->flat<float>().data();
// Clear the output buffers.
size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float);
cudaMemsetAsync(p.grad, 0, gradBytes, stream);
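// The gradient kernel presumably accumulates contributions from many pixels
// into the same vertex entries (e.g. via atomic adds), so the buffer must
// start zeroed; the async memset is ordered on the same stream as the
// kernel launch below.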
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
if (ENABLE_DB)
OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4"));
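// The bit tests above are standard power-of-two alignment checks:
// (addr & 15) == 0 means 16-byte (float4) alignment and (addr & 7) == 0
// means 8-byte (float2) alignment, as required for vectorized loads.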
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
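// blockSize tiles the image plane up to the given maximum block width and
// height; gridSize then covers the full width x height x depth volume, with
// the minibatch dimension presumably mapped to grid z.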
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("RasterizeGrad")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Output ("grad: float");
REGISTER_OP("RasterizeGradDb")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Input ("ddb: float")
.Output ("grad: float");
REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp<true>);
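// Two separate gradient ops exist because TensorFlow op signatures are
// static: RasterizeGradDb consumes the extra incoming gradient ddb for the
// out_db output, while RasterizeGrad omits it. Both map onto the same
// OpKernel struct templated on ENABLE_DB.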
//------------------------------------------------------------------------