#pragma once

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <type_traits>

// Function-qualifier macros that let one declaration be built by hipcc, by a
// CUDA compiler (nvcc or NVHPC in CUDA mode), or by a plain host C++ compiler.
//
//   HOST_DEVICE_INLINE  - function usable from both host and device code
//   DEVICE_INLINE       - device-only function
//   HOST_INLINE         - host-only function
#if defined(__HIPCC__)
  // NOTE(review): unlike the other two branches, the HIP variants carry no
  // inline specifier. Kept as-is; confirm this is intentional before using
  // these macros on non-template functions defined in headers.
  #define HOST_DEVICE_INLINE __host__ __device__
  #define DEVICE_INLINE __device__
  #define HOST_INLINE __host__
#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
  #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
  #define DEVICE_INLINE __device__ __forceinline__
  #define HOST_INLINE __host__ __forceinline__
#else
  // No GPU compiler detected: the execution-space qualifiers are unavailable,
  // so all three macros collapse to a plain `inline`.
  #define HOST_DEVICE_INLINE inline
  #define DEVICE_INLINE inline
  #define HOST_INLINE inline
#endif

// Evaluates a CUDA runtime call and terminates the process if it did not
// return cudaSuccess.
//
// `cmd` is evaluated exactly once. On failure the macro writes the call site
// (__FILE__:__LINE__) and the text from cudaGetErrorString() to stderr, then
// calls exit(EXIT_FAILURE).
//
// The do { ... } while (0) wrapper makes the expansion a single statement, so
// the macro is safe inside an unbraced if/else; the caller supplies the
// trailing semicolon. The temporary has a deliberately unusual name so that it
// cannot capture a caller variable (for example one called `e`) that appears
// inside `cmd`, and `cmd` itself is parenthesized before use.
#define CUDA_CHECK(cmd)                                              \
  do {                                                               \
    const cudaError_t cuda_check_status_ = (cmd);                    \
    if (cuda_check_status_ != cudaSuccess) {                         \
      fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__,   \
              __LINE__, cudaGetErrorString(cuda_check_status_));     \
      exit(EXIT_FAILURE);                                            \
    }                                                                \
  } while (0)

// Device-attribute query helpers. Only the declarations appear in this part of
// the header; no definition is visible here.

// Returns an integer value for `attribute` on the device `device_id`.
// NOTE(review): presumably `attribute` is a device-attribute enumerator passed
// as an integer and the result comes from the runtime's attribute query --
// confirm against the definition.
int64_t get_device_attribute(int64_t attribute, int64_t device_id);

// NOTE(review): judging by the name, this reports the maximum shared memory
// available per block on device `device_id`; the exact attribute queried and
// the unit of the result are not visible here -- confirm against the
// definition.
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);

namespace cuda_utils { |
|
|
|
template <typename T> |
|
HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T> |
|
ceil_div(T a, T b) { |
|
return (a + b - 1) / b; |
|
} |
|
|
|
}; |