Spaces:
Running
Running
File size: 1,356 Bytes
c61ccee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
namespace at::cuda {
/**
Computes ceil(a / b)
*/
template <typename T>
__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) {
return (a + b - 1) / b;
}
namespace {
// Threads per block for our apply kernel
// FIXME: use occupancy calculator instead
constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512;
constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4;
template <int step = 1>
inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
if (curDevice == -1) return false;
uint64_t numel_per_thread = static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(step);
uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread);
uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0];
if (numBlocks > maxGridX)
numBlocks = maxGridX;
grid = dim3(numBlocks);
return true;
}
constexpr int getApplyBlocksPerSM() {
return AT_APPLY_BLOCKS_PER_SM;
}
constexpr int getApplyBlockSize() {
return AT_APPLY_THREADS_PER_BLOCK;
}
inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
return dim3(max_threads_per_block);
}
} // anonymous namespace
} // namespace at::cuda
|