Spaces:
Running
Running
// Collection of direct PTX functions | |
namespace at::cuda { | |
template <typename T> | |
struct Bitfield {}; | |
template <> | |
struct Bitfield<unsigned int> { | |
static __device__ __host__ __forceinline__ | |
unsigned int getBitfield(unsigned int val, int pos, int len) { | |
pos &= 0xff; | |
len &= 0xff; | |
unsigned int m = (1u << len) - 1u; | |
return (val >> pos) & m; | |
unsigned int ret; | |
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); | |
return ret; | |
} | |
static __device__ __host__ __forceinline__ | |
unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { | |
pos &= 0xff; | |
len &= 0xff; | |
unsigned int m = (1u << len) - 1u; | |
toInsert &= m; | |
toInsert <<= pos; | |
m <<= pos; | |
return (val & ~m) | toInsert; | |
unsigned int ret; | |
asm("bfi.b32 %0, %1, %2, %3, %4;" : | |
"=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); | |
return ret; | |
} | |
}; | |
template <> | |
struct Bitfield<uint64_t> { | |
static __device__ __host__ __forceinline__ | |
uint64_t getBitfield(uint64_t val, int pos, int len) { | |
pos &= 0xff; | |
len &= 0xff; | |
uint64_t m = (1u << len) - 1u; | |
return (val >> pos) & m; | |
uint64_t ret; | |
asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); | |
return ret; | |
} | |
static __device__ __host__ __forceinline__ | |
uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) { | |
pos &= 0xff; | |
len &= 0xff; | |
uint64_t m = (1u << len) - 1u; | |
toInsert &= m; | |
toInsert <<= pos; | |
m <<= pos; | |
return (val & ~m) | toInsert; | |
uint64_t ret; | |
asm("bfi.b64 %0, %1, %2, %3, %4;" : | |
"=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len)); | |
return ret; | |
} | |
}; | |
__device__ __forceinline__ int getLaneId() { | |
return __lane_id(); | |
int laneId; | |
asm("mov.s32 %0, %%laneid;" : "=r"(laneId) ); | |
return laneId; | |
} | |
__device__ __forceinline__ unsigned long long int getLaneMaskLt() { | |
const std::uint64_t m = (1ull << getLaneId()) - 1ull; | |
return m; | |
} | |
__device__ __forceinline__ unsigned getLaneMaskLt() { | |
unsigned mask; | |
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); | |
return mask; | |
} | |
__device__ __forceinline__ unsigned long long int getLaneMaskLe() { | |
std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1)); | |
return m; | |
} | |
__device__ __forceinline__ unsigned getLaneMaskLe() { | |
unsigned mask; | |
asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); | |
return mask; | |
} | |
__device__ __forceinline__ unsigned long long int getLaneMaskGt() { | |
const std::uint64_t m = getLaneMaskLe(); | |
return m ? ~m : m; | |
} | |
__device__ __forceinline__ unsigned getLaneMaskGt() { | |
unsigned mask; | |
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); | |
return mask; | |
} | |
__device__ __forceinline__ unsigned long long int getLaneMaskGe() { | |
const std::uint64_t m = getLaneMaskLt(); | |
return ~m; | |
} | |
__device__ __forceinline__ unsigned getLaneMaskGe() { | |
unsigned mask; | |
asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); | |
return mask; | |
} | |
} // namespace at::cuda | |