// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
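// Note: this file appears to be compiled as part of a larger translation
// unit (in the nvdiffrast source tree it is #included from tf_all.cu, which
// supplies the TensorFlow headers and the shared rasterizer helpers), so it
// is not self-contained.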
//------------------------------------------------------------------------
// Forward TensorFlow op.
struct RasterizeFwdOp : public OpKernel
{
RasterizeGLState m_glState; // OpenGL-related persistent state.
int m_tri_const; // 1 if triangle array is known to be constant.
RasterizeFwdOp(OpKernelConstruction* ctx):
OpKernel(ctx)
{
memset(&m_glState, 0, sizeof(RasterizeGLState));
OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
}
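// Attribute semantics, as used below: enable_db selects whether the op
// produces the auxiliary out_db output (it fixes num_outputs in Compute()),
// and tri_const is a caller promise that the triangle tensor never changes,
// letting Compute() skip re-uploading triangles to GL after the first call.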
void Compute(OpKernelContext* ctx)
{
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Check that input shapes are correct.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& resolution = ctx->input(2);
const Tensor& ranges = ctx->input(3);
// Determine number of outputs.
int num_outputs = m_glState.enableDB ? 2 : 1;
// Determine instance mode and check input dimensions.
bool instance_mode = pos.dims() > 2;
if (instance_mode)
{
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
}
else
{
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
}
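// In instance mode, pos holds per-instance vertex positions [N, V, 4] and the
// whole triangle list is rendered once per instance; in range mode, pos is a
// shared [V, 4] buffer and each row of ranges appears to select a
// [start, count] sub-range of tri to render into one output image.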
// Get output shape.
const int32_t* res_in = resolution.flat<int32_t>().data(); // This is in CPU memory.
int height = res_in[0];
int width = res_in[1];
int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0);
OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]"));
// Get position and triangle buffer sizes in int32/float32.
int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1);
int triCount = 3 * tri.dim_size(0);
// Need to initialize the GL context and resources (first call)?
bool initCtx = !m_glState.glFBO;
if (initCtx)
{
const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
int cudaDeviceIdx = g ? g->gpu_id : -1;
rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
}
else
setGLContext(m_glState.glctx); // (Re-)Activate GL context.
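// The GL context lives in m_glState across Compute() calls: it is created
// lazily on the first call, bound to the CUDA device TensorFlow assigned to
// this op, and merely re-activated on subsequent calls.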
// Resize all buffers.
bool changes = false;
rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
if (changes)
{
#ifdef _WIN32
// Workaround for occasional blank first frame on Windows.
releaseGLContext();
setGLContext(m_glState.glctx);
#endif
}
// Copy input data to GL and render.
const float* posPtr = pos.flat<float>().data();
const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat<int32_t>().data(); // This is in CPU memory.
const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat<int32_t>().data() : NULL; // Copy triangles only if needed.
int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0;
rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1);
// Allocate output tensors.
TensorShape output_shape;
output_shape.AddDim(depth);
output_shape.AddDim(height);
output_shape.AddDim(width);
output_shape.AddDim(4);
float* outputPtr[2];
for (int i=0; i < 2; i++)
{
if (i >= num_outputs)
output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor.
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor));
if (i < num_outputs)
outputPtr[i] = output_tensor->flat<float>().data();
}
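// TensorFlow requires every registered output to be produced, so when
// enable_db is off the second output is still allocated, but with zero
// channels ([depth, height, width, 0]) so it carries no data. outputPtr[1]
// is then left unset, which is harmless as long as the copy below only
// consults it when enableDB is on.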
// Copy rasterized results into CUDA buffers.
rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth);
// Done. Release GL context.
releaseGLContext();
}
};
REGISTER_OP("RasterizeFwd")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("resolution: int32")
.Input ("ranges: int32")
.Output ("out: float")
.Output ("out_db: float")
.Attr ("enable_db: int")
.Attr ("tri_const: int");
REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp);
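// The HostMemory() declarations above are what make the raw CPU-side reads
// of resolution (and ranges) in Compute() valid: TensorFlow keeps those two
// inputs in host memory rather than device memory.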
//------------------------------------------------------------------------
// Gradient TensorFlow op.
template <bool ENABLE_DB>
struct RasterizeGradOp : public OpKernel
{
RasterizeGradParams m_attribs;
RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
{
memset(&m_attribs, 0, sizeof(m_attribs));
}
void Compute(OpKernelContext* ctx)
{
RasterizeGradParams& p = m_attribs;
cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
// Input tensors.
const Tensor& pos = ctx->input(0);
const Tensor& tri = ctx->input(1);
const Tensor& out = ctx->input(2);
const Tensor& dy = ctx->input(3);
const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3);
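// When ENABLE_DB is false there is no fifth input, so ddb above simply
// aliases dy and is never read (p.ddb is set to 0 below).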
// Determine instance mode.
p.instance_mode = (pos.dims() > 2) ? 1 : 0;
// Shape is taken from the rasterizer output tensor.
OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4"));
p.depth = out.dim_size(0);
p.height = out.dim_size(1);
p.width = out.dim_size(2);
OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]"));
// Check other shapes.
if (p.instance_mode)
OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]"));
else
OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]"));
OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]"));
OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]"));
if (ENABLE_DB)
OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]"));
// Populate parameters.
p.numTriangles = tri.dim_size(0);
p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0);
p.pos = pos.flat<float>().data();
p.tri = tri.flat<int>().data();
p.out = out.flat<float>().data();
p.dy = dy.flat<float>().data();
p.ddb = ENABLE_DB ? ddb.flat<float>().data() : 0;
// Set up pixel position to clip space x, y transform.
p.xs = 2.f / (float)p.width;
p.xo = 1.f / (float)p.width - 1.f;
p.ys = 2.f / (float)p.height;
p.yo = 1.f / (float)p.height - 1.f;
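// The scale/offset pair maps an integer pixel index to the clip-space
// coordinate of the pixel center; e.g. for x (and symmetrically for y):
//
//   x_clip = p.xs * px + p.xo = 2.f * (px + 0.5f) / p.width - 1.f
//
// so pixel centers land in (-1, 1). This is presumably how the gradient
// kernel reconstructs clip-space positions from pixel indices.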
// Allocate output tensor for position gradients.
Tensor* grad_tensor = NULL;
TensorShape grad_shape;
if (p.instance_mode)
grad_shape.AddDim(p.depth);
grad_shape.AddDim(p.numVertices);
grad_shape.AddDim(4);
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor));
p.grad = grad_tensor->flat<float>().data();
// Clear the output buffers.
size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float);
cudaMemsetAsync(p.grad, 0, gradBytes, stream);
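// The gradient kernel presumably accumulates contributions from many pixels
// into the same vertex entries (e.g. via atomic adds), so the buffer must
// start zeroed; the async memset is ordered on the same stream as the
// kernel launch below.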
// Verify that buffers are aligned to allow float2/float4 operations.
OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
if (ENABLE_DB)
OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4"));
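// The bit tests above are standard power-of-two alignment checks:
// (addr & 15) == 0 means 16-byte (float4) alignment and (addr & 7) == 0
// means 8-byte (float2) alignment, as required for vectorized loads.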
// Choose launch parameters.
dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
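// blockSize tiles the image plane up to the given maximum block width and
// height; gridSize then covers the full width x height x depth volume, with
// the minibatch dimension presumably mapped to grid z.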
// Launch CUDA kernel.
void* args[] = {&p};
void* func = ENABLE_DB ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel;
OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
}
};
REGISTER_OP("RasterizeGrad")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Output ("grad: float");
REGISTER_OP("RasterizeGradDb")
.Input ("pos: float")
.Input ("tri: int32")
.Input ("out: float")
.Input ("dy: float")
.Input ("ddb: float")
.Output ("grad: float");
REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp<false>);
REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp<true>);
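// Two separate gradient ops exist because TensorFlow op signatures are
// static: RasterizeGradDb consumes the extra incoming gradient ddb for the
// out_db output, while RasterizeGrad omits it. Both map onto the same
// OpKernel struct templated on ENABLE_DB.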
//------------------------------------------------------------------------