Spaces:
Running
Running
namespace c10::cuda { | |
// Copy string from `src` to `dst` | |
static __device__ void dstrcpy(char* dst, const char* src) { | |
int i = 0; | |
// Copy string from source to destination, ensuring that it | |
// isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1` | |
while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) { | |
*dst++ = *src++; | |
} | |
*dst = '\0'; | |
} | |
static __device__ void dsa_add_new_assertion_failure( | |
DeviceAssertionsData* assertions_data, | |
const char* assertion_msg, | |
const char* filename, | |
const char* function_name, | |
const int line_number, | |
const uint32_t caller, | |
const dim3 block_id, | |
const dim3 thread_id) { | |
// `assertions_data` may be nullptr if device-side assertion checking | |
// is disabled at run-time. If it is disabled at compile time this | |
// function will never be called | |
if (!assertions_data) { | |
return; | |
} | |
// Atomically increment so other threads can fail at the same time | |
// Note that incrementing this means that the CPU can observe that | |
// a failure has happened and can begin to respond before we've | |
// written information about that failure out to the buffer. | |
const auto nid = atomicAdd(&(assertions_data->assertion_count), 1); | |
if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) { | |
// At this point we're ran out of assertion buffer space. | |
// We could print a message about this, but that'd get | |
// spammy if a lot of threads did it, so we just silently | |
// ignore any other assertion failures. In most cases the | |
// failures will all probably be analogous anyway. | |
return; | |
} | |
// Write information about the assertion failure to memory. | |
// Note that this occurs only after the `assertion_count` | |
// increment broadcasts that there's been a problem. | |
auto& self = assertions_data->assertions[nid]; | |
dstrcpy(self.assertion_msg, assertion_msg); | |
dstrcpy(self.filename, filename); | |
dstrcpy(self.function_name, function_name); | |
self.line_number = line_number; | |
self.caller = caller; | |
self.block_id[0] = block_id.x; | |
self.block_id[1] = block_id.y; | |
self.block_id[2] = block_id.z; | |
self.thread_id[0] = thread_id.x; | |
self.thread_id[1] = thread_id.y; | |
self.thread_id[2] = thread_id.z; | |
} | |
// Emulates a kernel assertion. The assertion won't stop the kernel's progress, | |
// so you should assume everything the kernel produces is garbage if there's an | |
// assertion failure. | |
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are | |
// arguments of the kernel and therefore accessible. | |
do { \ | |
if (C10_UNLIKELY(!(condition))) { \ | |
/* Has an atomic element so threads can fail at the same time */ \ | |
c10::cuda::dsa_add_new_assertion_failure( \ | |
assertions_data, \ | |
C10_STRINGIZE(condition), \ | |
__FILE__, \ | |
__FUNCTION__, \ | |
__LINE__, \ | |
assertion_caller_id, \ | |
blockIdx, \ | |
threadIdx); \ | |
/* Now that the kernel has failed we early exit the kernel, but */ \ | |
/* otherwise keep going and rely on the host to check UVM and */ \ | |
/* determine we've had a problem */ \ | |
return; \ | |
} \ | |
} while (false) | |
} // namespace c10::cuda | |