#pragma once

#include <c10/cuda/CUDAMacros.h>

#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

#ifdef USE_CUDA
#define TORCH_USE_CUDA_DSA
#endif

/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;

namespace c10::cuda {

/// Holds information about any device-side assertions that fail.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionData {
  /// Stringification of the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// File the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Name of the function the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Line number the assertion was at
  int line_number{};
  /// Number uniquely identifying the kernel launch that triggered the
  /// assertion
  uint32_t caller{};
  /// block_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t block_id[3]{};
  /// thread_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t thread_id[3]{};
};

/// Used to hold assertions generated by the device
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
  /// Total number of assertions found; a subset of these will be recorded
  /// in `assertions`
  int32_t assertion_count{};
  /// An array of assertions that will be written to in a race-free manner
  // NOLINTNEXTLINE(*-c-arrays)
  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
};
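
// How the device side fills this buffer, sketched here for illustration only
// (the real device-side recording helper is defined elsewhere): a failing
// thread can claim a slot race-free with an atomicAdd on `assertion_count`
// and only write its entry if the slot is still within
// C10_CUDA_DSA_ASSERTION_COUNT; later failures are counted but dropped, which
// is why the constant above must not be too small.
//
//   __device__ void record_assertion_failure_sketch(
//       DeviceAssertionsData* data,
//       uint32_t caller_id) {
//     const int32_t slot = atomicAdd(&data->assertion_count, 1);
//     if (slot >= C10_CUDA_DSA_ASSERTION_COUNT) {
//       return; // buffer full: the failure is counted but not recorded
//     }
//     DeviceAssertionData& entry = data->assertions[slot];
//     entry.caller = caller_id;
//     entry.block_id[0] = blockIdx.x;
//     entry.thread_id[0] = threadIdx.x;
//     // strncpy-style copies of assertion_msg/filename/function_name elided
//   }
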
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures
struct CUDAKernelLaunchInfo {
  /// Filename of the code where the kernel was launched from
  const char* launch_filename;
  /// Function from which the kernel was launched
  const char* launch_function;
  /// Line number of where the code was launched from
  uint32_t launch_linenum;
  /// Backtrace of where the kernel was launched from, only populated if
  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
  std::string launch_stacktrace;
  /// Kernel that was launched
  const char* kernel_name;
  /// Device the kernel was launched on
  int device;
  /// Stream the kernel was launched on
  int32_t stream;
  /// A number that uniquely identifies the kernel launch
  uint64_t generation_number;
};

/// Circular buffer used to hold information about kernel launches. This is
/// later used to reconstruct how a device-side kernel assertion failure
/// occurred. CUDAKernelLaunchRegistry is used as a singleton.
class C10_CUDA_API CUDAKernelLaunchRegistry {
 private:
  /// Assume that this is the max number of kernel launches that might ever be
  /// enqueued across all streams on a single device
  static constexpr int max_kernel_launches = 1024;
  /// How many kernel launch infos we've inserted. Used to ensure that circular
  /// queue doesn't provide false information by always increasing, but also to
  /// mark where we are inserting into the queue
  uint64_t generation_number = 0;
  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
  mutable std::mutex read_write_mutex;
  /// Used to prevent race conditions in GPU memory allocation
  mutable std::mutex gpu_alloc_mutex;
  /// Pointer to managed memory keeping track of device-side assertions. There
  /// is one entry for each possible device the process might work with. Unused
  /// entries are nullptrs. We could also use an unordered_set here, but this
  /// vector design will be faster and the wasted memory is small since we
  /// expect the number of GPUs per node will always be small
  std::vector<
      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
      uvm_assertions;
  /// A single circular buffer holds information about every kernel launch the
  /// process makes across all devices.
  std::vector<CUDAKernelLaunchInfo> kernel_launches;
  bool check_env_for_enable_launch_stacktracing() const;
  bool check_env_for_dsa_enabled() const;

 public:
  CUDAKernelLaunchRegistry();
  /// Register a new kernel launch and obtain a generation number back to be
  /// passed to the kernel
  uint32_t insert(
      const char* launch_filename,
      const char* launch_function,
      const uint32_t launch_linenum,
      const char* kernel_name,
      const int32_t stream_id);
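  // A sketch of how a launch site could use insert(); in practice this is
  // wrapped by the TORCH_DSA_KERNEL_LAUNCH machinery, and the names below are
  // purely illustrative:
  //
  //   auto& registry = c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();
  //   const uint32_t caller_id = registry.insert(
  //       __FILE__, __FUNCTION__, __LINE__, "my_kernel", stream_id);
  //   // `caller_id` is then handed to the kernel so that any assertion
  //   // failure it records can be traced back to this launch.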
  /// Get copies of the kernel launch registry and each device's assertion
  /// failure buffer so they can be inspected without raising race conditions
  std::
      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
      snapshot() const;
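  // A sketch of consuming a snapshot (illustrative only): each recorded
  // failure carries the `caller` id of its launch, which can be matched
  // against the `generation_number` of the copied launch infos (modulo the
  // circular buffer) to report where the offending kernel came from.
  //
  //   const auto [assertion_buffers, launches] = registry.snapshot();
  //   for (const auto& buf : assertion_buffers) {
  //     const int32_t recorded =
  //         std::min<int32_t>(buf.assertion_count, C10_CUDA_DSA_ASSERTION_COUNT);
  //     for (int32_t i = 0; i < recorded; i++) {
  //       const DeviceAssertionData& failure = buf.assertions[i];
  //       // ... find the launch whose generation_number matches failure.caller
  //     }
  //   }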
  /// Get a pointer to the current device's assertion failure buffer. If no such
  /// buffer exists then one is created. This means that the first kernel launch
  /// made on each device will be slightly slower because memory allocations are
  /// required
  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
  /// Gets the global singleton of the registry
  static CUDAKernelLaunchRegistry& get_singleton_ref();
  /// If not all devices support DSA, we disable it
  const bool do_all_devices_support_managed_memory = false;
  /// Whether or not to gather stack traces when launching kernels
  bool gather_launch_stacktrace = false;
  /// Whether or not host-side DSA is enabled at run-time
  /// Note: Device-side code cannot be enabled/disabled at run-time
  bool enabled_at_runtime = false;
  /// Whether or not a device has indicated a failure
  bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
  const bool enabled_at_compile_time = true;
#else
  const bool enabled_at_compile_time = false;
#endif
};

std::string c10_retrieve_device_side_assertion_info();
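
// A sketch of the host-side failure path (illustrative, not the exact call
// sites used inside the CUDA error-checking machinery):
//
//   if (CUDAKernelLaunchRegistry::get_singleton_ref().has_failed()) {
//     const std::string report = c10_retrieve_device_side_assertion_info();
//     // attach `report` to the error that is surfaced to the user
//   }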

} // namespace c10::cuda

// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these.
#define TORCH_DSA_KERNEL_ARGS                                               \
  [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data,  \
      [[maybe_unused]] uint32_t assertion_caller_id
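
// A hypothetical kernel written against this convention (the kernel name and
// its other parameters are illustrative only):
//
//   __global__ void example_kernel(const float* in, float* out, TORCH_DSA_KERNEL_ARGS) {
//     // `assertions_data` and `assertion_caller_id` are now available to the
//     // device-side assertion macros to record failures against this launch.
//   }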

// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
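
// A hypothetical device-side helper receiving the forwarded arguments:
//
//   __device__ void check_positive(int value, TORCH_DSA_KERNEL_ARGS);
//
//   __global__ void example_kernel(int value, TORCH_DSA_KERNEL_ARGS) {
//     check_positive(value, TORCH_DSA_KERNEL_ARGS_PASS);
//   }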