// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include "rasterize_gl.h"
#include "glutil.h"
#include <cstring>
#include <string>
#include <vector>
#define STRINGIFY_SHADER_SOURCE(x) #x

//------------------------------------------------------------------------
// Helpers.

#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
static int ROUND_UP_BITS(uint32_t x, uint32_t y)
{
    // Round x up so that it has at most y bits of mantissa.
    if (x < (1u << y))
        return x;
    uint32_t m = 0;
    while (x & ~m)
        m = (m << 1) | 1u;
    m >>= y;
    if (!(x & m))
        return x;
    return (x | m) + 1u;
}
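// A few worked examples for the helpers above (illustration only; values
// verified by hand):
//
//   ROUND_UP(33, 32)         == 64      // next multiple of 32
//   ROUND_UP_BITS(96, 2)     == 96      // 0b1100000 already has 2 mantissa bits
//   ROUND_UP_BITS(100, 2)    == 128     // 0b1100100 -> 0b10000000
//   ROUND_UP_BITS(100000, 2) == 131072  // rounds up to 2^17
//
// Limiting sizes to two mantissa bits makes buffer growth geometric: each
// step between representable sizes is roughly 14-25% of the current size,
// so repeated small increases cannot force a reallocation on every call.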
//------------------------------------------------------------------------
// Draw command struct used by rasterizer.

struct GLDrawCmd
{
    uint32_t count;
    uint32_t instanceCount;
    uint32_t firstIndex;
    uint32_t baseVertex;
    uint32_t baseInstance;
};

//------------------------------------------------------------------------
// GL helpers.

static void compileGLShader(NVDR_CTX_ARGS, const RasterizeGLState& s, GLuint* pShader, GLenum shaderType, const char* src_buf)
{
    std::string src(src_buf);

    // Set preprocessor directives.
    int n = src.find('\n') + 1; // After first line containing #version directive.
    if (s.enableZModify)
        src.insert(n, "#define IF_ZMODIFY(x) x\n");
    else
        src.insert(n, "#define IF_ZMODIFY(x)\n");

    const char *cstr = src.c_str();
    *pShader = 0;
    NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType));
    NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &cstr, 0));
    NVDR_CHECK_GL_ERROR(glCompileShader(*pShader));
}

static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader)
{
    *pProgram = 0;

    GLuint glProgram = 0;
    NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram());
    NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader));
    NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader));
    NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader));
    NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram));

    GLint linkStatus = 0;
    NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus));
    if (!linkStatus)
    {
        GLint infoLen = 0;
        NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen));
        if (infoLen)
        {
            const char* hdr = "glLinkProgram() failed:\n";
            std::vector<char> info(strlen(hdr) + infoLen);
            strcpy(&info[0], hdr);
            NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)]));
            NVDR_CHECK(0, &info[0]);
        }
        NVDR_CHECK(0, "glLinkProgram() failed");
    }

    *pProgram = glProgram;
}

//------------------------------------------------------------------------
// Shared C++ functions.

void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
{
    // Create GL context and set it current.
    s.glctx = createGLContext(cudaDeviceIdx);
    setGLContext(s.glctx);

    // Version check.
    GLint vMajor = 0;
    GLint vMinor = 0;
    glGetIntegerv(GL_MAJOR_VERSION, &vMajor);
    glGetIntegerv(GL_MINOR_VERSION, &vMinor);
    glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
    LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
    NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");

    // Enable depth modification workaround on A100 and later.
    int capMajor = 0;
    NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&capMajor, cudaDevAttrComputeCapabilityMajor, cudaDeviceIdx));
    s.enableZModify = (capMajor >= 8);

    // Number of output buffers.
    int num_outputs = s.enableDB ? 2 : 1;

    // Set up vertex shader.
    compileGLShader(NVDR_CTX_PARAMS, s, &s.glVertexShader, GL_VERTEX_SHADER,
        "#version 330\n"
        "#extension GL_ARB_shader_draw_parameters : enable\n"
        STRINGIFY_SHADER_SOURCE(
            layout(location = 0) in vec4 in_pos;
            out int v_layer;
            out int v_offset;
            void main()
            {
                int layer = gl_DrawIDARB;
                gl_Position = in_pos;
                v_layer = layer;
                v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here.
            }
        )
    );
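    // A note on the draw-parameters extension used above (observations from
    // this file, not additional guarantees): the non-trivial render path below
    // issues one indirect draw command per output layer via
    // glMultiDrawElementsIndirect, so gl_DrawIDARB doubles as the layer index.
    // Instancing itself is unused (every command has instanceCount = 1), which
    // leaves gl_BaseInstanceARB free to smuggle a per-command triangle ID
    // offset into the geometry shader.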
    // Geometry and fragment shaders depend on whether bary differential output is enabled.
    if (s.enableDB)
    {
        // Set up geometry shader. Calculation of per-pixel bary differentials is based on:
        //   u = (u/w) / (1/w)
        //   --> du/dX = d((u/w) / (1/w))/dX
        //   --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w
        // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                layout(triangles) in;
                layout(triangle_strip, max_vertices=3) out;
                layout(location = 0) uniform vec2 vp_scale;
                in int v_layer[];
                in int v_offset[];
                out vec4 var_uvzw;
                out vec4 var_db;
                void main()
                {
                    // Plane equations for bary differentials.
                    float w0 = gl_in[0].gl_Position.w;
                    float w1 = gl_in[1].gl_Position.w;
                    float w2 = gl_in[2].gl_Position.w;
                    vec2 p0 = gl_in[0].gl_Position.xy;
                    vec2 p1 = gl_in[1].gl_Position.xy;
                    vec2 p2 = gl_in[2].gl_Position.xy;
                    vec2 e0 = p0*w2 - p2*w0;
                    vec2 e1 = p1*w2 - p2*w1;
                    float a = e0.x*e1.y - e0.y*e1.x;

                    // Clamp area to an epsilon to avoid arbitrarily high bary differentials.
                    float eps = 1e-6f; // ~1 pixel in 1k x 1k image.
                    float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign.
                    float ia = 1.f / ca; // Inverse area.

                    vec2 ascl = ia * vp_scale;
                    float dudx =  e1.y * ascl.x;
                    float dudy = -e1.x * ascl.y;
                    float dvdx = -e0.y * ascl.x;
                    float dvdy =  e0.x * ascl.y;

                    float duwdx = w2 * dudx;
                    float dvwdx = w2 * dvdx;
                    float duvdx = w0 * dudx + w1 * dvdx;
                    float duwdy = w2 * dudy;
                    float dvwdy = w2 * dvdy;
                    float duvdy = w0 * dudy + w1 * dvdy;

                    vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy);
                    vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy);
                    vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy);

                    int layer_id = v_layer[0];
                    int prim_id = gl_PrimitiveIDIn + v_offset[0];

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id;
                    gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w);
                    var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w);
                    var_db = db0; EmitVertex();

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id;
                    gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w);
                    var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w);
                    var_db = db1; EmitVertex();

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id;
                    gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w);
                    var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w);
                    var_db = db2; EmitVertex();
                }
            )
        );

        // Set up fragment shader.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                in vec4 var_db;
                layout(location = 0) out vec4 out_raster;
                layout(location = 1) out vec4 out_db;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    out_db = var_db * var_uvzw.w;
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );

        // Set up fragment shader for depth peeling.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                in vec4 var_db;
                layout(binding = 0) uniform sampler2DArray out_prev;
                layout(location = 0) out vec4 out_raster;
                layout(location = 1) out vec4 out_db;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
                    vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
                    float depth_new = var_uvzw.z / var_uvzw.w;
                    if (prev.w == 0 || depth_new <= prev.z)
                        discard;
                    out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float);
                    out_db = var_db * var_uvzw.w;
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
    }
    else
    {
        // Geometry shader without bary differential output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
            "#version 330\n"
            STRINGIFY_SHADER_SOURCE(
                layout(triangles) in;
                layout(triangle_strip, max_vertices=3) out;
                in int v_layer[];
                in int v_offset[];
                out vec4 var_uvzw;
                void main()
                {
                    int layer_id = v_layer[0];
                    int prim_id = gl_PrimitiveIDIn + v_offset[0];

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id;
                    gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w);
                    var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w);
                    EmitVertex();

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id;
                    gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w);
                    var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w);
                    EmitVertex();

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id;
                    gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w);
                    var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w);
                    EmitVertex();
                }
            )
        );

        // Fragment shader without bary differential output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                layout(location = 0) out vec4 out_raster;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
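        // A note on the triangle ID encoding in the fragment shaders above
        // (the decode side lives elsewhere and is an assumption here): IDs up
        // to 2^24 = 0x01000000 are exactly representable in float32 and are
        // stored directly. Larger IDs are encoded by adding the integer ID to
        // the bit pattern 0x4a800000 (the float 2^22). Because that is plain
        // integer addition on the bit pattern, distinct IDs still map to
        // distinct floats, and a consumer can invert the mapping by
        // reinterpreting the float's bits and subtracting 0x4a800000.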
        // Depth peeling variant of fragment shader.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                layout(binding = 0) uniform sampler2DArray out_prev;
                layout(location = 0) out vec4 out_raster;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
                    vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
                    float depth_new = var_uvzw.z / var_uvzw.w;
                    if (prev.w == 0 || depth_new <= prev.z)
                        discard;
                    out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float);
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
    }

    // Finalize programs.
    constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader);
    constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP);

    // Construct main fbo and bind permanently.
    NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO));
    NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO));

    // Enable two color attachments.
    GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
    NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers));

    // Construct vertex array object.
    NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO));
    NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO));

    // Construct position buffer, bind permanently, enable, set ptr.
    NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer));
    NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer));
    NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0));
    NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0));

    // Construct index buffer and bind permanently.
    NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer));
    NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer));

    // Set up depth test.
    NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
    NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS));
    NVDR_CHECK_GL_ERROR(glClearDepth(1.0));

    // Create and bind output buffers. Storage is allocated later.
    NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
    for (int i=0; i < num_outputs; i++)
    {
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
        NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0));
    }

    // Create and bind depth/stencil buffer. Storage is allocated later.
    NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer));
    NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
    NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0));

    // Create texture name for previous output buffer (depth peeling).
    NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
}
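// Illustration only: the call order these entry points appear designed for,
// inferred from this file. The caller is assumed to own the RasterizeGLState
// `s`, the CUDA stream, and the device pointers; none of this block is built.
#if 0
rasterizeInitGLContext(NVDR_CTX_PARAMS, s, cudaDeviceIdx);      // Once per context.
bool changes = false;
rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, changes, posCount, triCount, width, height, depth);
rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance,
                triPtr, triCount, rangesPtr, width, height, depth, -1);  // peeling_idx < 1: no peeling input.
rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth);
rasterizeReleaseBuffers(NVDR_CTX_PARAMS, s);                    // At teardown.
#endif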
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
{
    changes = false;

    // Resize vertex buffer?
    if (posCount > s.posCount)
    {
        if (s.cudaPosBuffer)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));
        s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64;
        LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32";
        NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW));
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
        changes = true;
    }

    // Resize triangle buffer?
    if (triCount > s.triCount)
    {
        if (s.cudaTriBuffer)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));
        s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64;
        LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32";
        NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW));
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
        changes = true;
    }

    // Resize framebuffer?
    if (width > s.width || height > s.height || depth > s.depth)
    {
        int num_outputs = s.enableDB ? 2 : 1;

        if (s.cudaColorBuffer[0])
            for (int i=0; i < num_outputs; i++)
                NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));

        if (s.cudaPrevOutBuffer)
        {
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer));
            s.cudaPrevOutBuffer = 0;
        }

        // New framebuffer size.
        s.width  = (width > s.width)   ? width  : s.width;
        s.height = (height > s.height) ? height : s.height;
        s.depth  = (depth > s.depth)   ? depth  : s.depth;
        s.width  = ROUND_UP(s.width, 32);
        s.height = ROUND_UP(s.height, 32);
        LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";

        // Allocate color buffers.
        for (int i=0; i < num_outputs; i++)
        {
            NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
            NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
        }

        // Allocate depth/stencil buffer.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
        NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));

        // (Re-)register all GL buffers into Cuda.
        for (int i=0; i < num_outputs; i++)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));

        changes = true;
    }
}
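// A note on the growth policy above: sizes are quantized by ROUND_UP_BITS so
// that gradually growing inputs reallocate only a logarithmic number of times;
// for example, a request for 100000 position floats allocates
// ROUND_UP_BITS(100000, 2) == 131072 (2^17). Framebuffer dimensions are
// likewise rounded up to multiples of 32 pixels and only ever grow, so the GL
// buffers may be larger than any single render request.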
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
{
    // Copy the inputs only on the first depth peeling iteration, or when not peeling at all.
    if (peeling_idx < 1)
    {
        if (triPtr)
        {
            // Copy both position and triangle buffers.
            void* glPosPtr = NULL;
            void* glTriPtr = NULL;
            size_t posBytes = 0;
            size_t triBytes = 0;
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer));
            NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
            NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch");
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream));
        }
        else
        {
            // Copy position buffer only. Triangles are already copied and known to be constant.
            void* glPosPtr = NULL;
            size_t posBytes = 0;
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
            NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
        }
    }

    // Select program based on whether we have a depth peeling input or not.
    if (peeling_idx < 1)
    {
        // Normal case: no peeling, or peeling disabled.
        NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
    }
    else
    {
        // If we don't have a third buffer yet, create one.
        if (!s.cudaPrevOutBuffer)
        {
            NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));
            NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
        }

        // Swap the GL buffers.
        GLuint glTempBuffer = s.glPrevOutBuffer;
        s.glPrevOutBuffer = s.glColorBuffer[0];
        s.glColorBuffer[0] = glTempBuffer;

        // Swap the Cuda buffers.
        cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer;
        s.cudaPrevOutBuffer = s.cudaColorBuffer[0];
        s.cudaColorBuffer[0] = cudaTempBuffer;

        // Bind the new output buffer.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0]));
        NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0));

        // Bind old buffer as the input texture.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));

        // Activate the correct program.
        NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP));
    }
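    // A note on the swap above: depth peeling ping-pongs two color buffers.
    // The previous pass's output is rebound as the input texture out_prev, and
    // the depth peeling fragment shader discards fragments that are not
    // strictly behind the depth stored there (or where nothing was stored), so
    // each pass extracts the next-nearest surface. A hypothetical three-layer
    // peel would call rasterizeRender() with peeling_idx = 0, 1, 2, copying
    // results out after each pass.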
    // Set viewport, clear color buffer(s) and depth/stencil buffer.
    NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
    NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT));

    // If outputting bary differentials, set the resolution uniform.
    if (s.enableDB)
        NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));

    // Set the dummy uniform if depth modification workaround is active.
    if (s.enableZModify)
        NVDR_CHECK_GL_ERROR(glUniform1f(1, 0.f));

    // Render the meshes.
    if (depth == 1 && !rangesPtr)
    {
        // Trivial case.
        NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0));
    }
    else
    {
        // Populate a buffer for draw commands and execute it.
        std::vector<GLDrawCmd> drawCmdBuffer(depth);
        if (!rangesPtr)
        {
            // Fill in range array to instantiate the same triangles for each output layer.
            // Triangle IDs start at zero (i.e., one) for each layer, so they correspond to
            // the first dimension in addressing the triangle array.
            for (int i=0; i < depth; i++)
            {
                GLDrawCmd& cmd = drawCmdBuffer[i];
                cmd.firstIndex    = 0;
                cmd.count         = triCount;
                cmd.baseVertex    = vtxPerInstance * i;
                cmd.baseInstance  = 0;
                cmd.instanceCount = 1;
            }
        }
        else
        {
            // Fill in the range array according to user-given ranges. Triangle IDs point
            // to the input triangle array, NOT index within range, so they correspond to
            // the first dimension in addressing the triangle array.
            for (int i=0, j=0; i < depth; i++)
            {
                GLDrawCmd& cmd = drawCmdBuffer[i];
                int first = rangesPtr[j++];
                int count = rangesPtr[j++];
                NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values");
                NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer");
                cmd.firstIndex    = first * 3;
                cmd.count         = count * 3;
                cmd.baseVertex    = 0;
                cmd.baseInstance  = first;
                cmd.instanceCount = 1;
            }
        }

        // Draw!
        NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &drawCmdBuffer[0], depth, sizeof(GLDrawCmd)));
    }
}

void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth)
{
    // Copy color buffers to output tensors.
    cudaArray_t array = 0;
    cudaChannelFormatDesc arrayDesc = {};   // For error checking.
    cudaExtent arrayExt = {};               // For error checking.
    int num_outputs = s.enableDB ? 2 : 1;
    NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream));
    for (int i=0; i < num_outputs; i++)
    {
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0));
        NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array));
        NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch");
        NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch");
        NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch");
        cudaMemcpy3DParms p = {0};
        p.srcArray = array;
        p.dstPtr.ptr = outputPtr[i];
        p.dstPtr.pitch = width * 4 * sizeof(float);
        p.dstPtr.xsize = width;
        p.dstPtr.ysize = height;
        p.extent.width = width;
        p.extent.height = height;
        p.extent.depth = depth;
        p.kind = cudaMemcpyDeviceToDevice;
        NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream));
    }
    NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream));
}
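// A note on the 3D copy above: each color buffer is mapped as a CUDA array
// and copied into a tight linear tensor of shape [depth, height, width, 4] in
// float32, so the destination pitch is exactly one row, width * 4 *
// sizeof(float) bytes (e.g. 8192 bytes for a 512-pixel-wide output). Only the
// requested width/height/depth window is copied even when the GL buffers were
// rounded up to larger sizes.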
void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s)
{
    int num_outputs = s.enableDB ? 2 : 1;

    if (s.cudaPosBuffer)
    {
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));
        s.cudaPosBuffer = 0;
    }

    if (s.cudaTriBuffer)
    {
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));
        s.cudaTriBuffer = 0;
    }

    for (int i=0; i < num_outputs; i++)
    {
        if (s.cudaColorBuffer[i])
        {
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));
            s.cudaColorBuffer[i] = 0;
        }
    }

    if (s.cudaPrevOutBuffer)
    {
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer));
        s.cudaPrevOutBuffer = 0;
    }
}

//------------------------------------------------------------------------