Spaces:
Running
Running
/* Main entry for c10/macros. | |
* | |
* In your code, include c10/macros/Macros.h directly, instead of individual | |
* files in this folder. | |
*/ | |
// For build systems that do not directly depend on CMake and directly build | |
// from the source directory (such as Buck), one may not have a cmake_macros.h | |
// file at all. In this case, the build system is responsible for providing | |
// correct macro definitions corresponding to the cmake_macros.h.in file. | |
// | |
// In such scenarios, one should define the macro | |
// C10_USING_CUSTOM_GENERATED_MACROS | |
// to inform this header that it does not need to include the cmake_macros.h | |
// file. | |
__attribute__((no_sanitize("float-divide-by-zero"))) | |
__attribute__((no_sanitize("signed-integer-overflow"))) | |
__attribute__((no_sanitize("pointer-overflow"))) | |
// Detect address sanitizer as some stuff doesn't work with it | |
// for clang | |
// for gcc | |
// Disable the copy and assignment operator for a class. Note that this will | |
// disable the usage of the class in std containers. | |
classname(const classname&) = delete; \ | |
classname& operator=(const classname&) = delete | |
/** | |
* C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with | |
* str and ending with a number that varies with the line. | |
*/ | |
/// C10_NODISCARD - Warn if a type or return value is discarded. | |
// Technically, we should check if __cplusplus > 201402L here, because | |
// [[nodiscard]] is only defined in C++17. However, some compilers | |
// we care about don't advertise being C++17 (e.g., clang), but | |
// support the attribute anyway. In fact, this is not just a good idea, | |
// it's the law: clang::warn_unused_result doesn't work on nvcc + clang | |
// and the best workaround for this case is to use [[nodiscard]] | |
// instead; see https://github.com/pytorch/pytorch/issues/13118 | |
// | |
// Note to future editors: if you have noticed that a compiler is | |
// misbehaving (e.g., it advertises support, but the support doesn't | |
// actually work, or it is emitting warnings). Some compilers which | |
// are strict about the matter include MSVC, which will complain: | |
// | |
// error C2429: attribute 'nodiscard' requires compiler flag '/std:c++latest' | |
// | |
// Exhibits: | |
// - MSVC 19.14: https://godbolt.org/z/Dzd7gn (requires /std:c++latest) | |
// - Clang 8.0.0: https://godbolt.org/z/3PYL4Z (always advertises support) | |
// - gcc 8.3: https://godbolt.org/z/4tLMQS (always advertises support) | |
// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious | |
// error when __has_cpp_attribute is given a scoped attribute in C mode. | |
// TODO: It's possible this is still triggering | |
// https://github.com/pytorch/pytorch/issues/13118 on Windows; if it is, better | |
// fix it. | |
// suppress an unused variable. | |
// Direct port of LLVM_ATTRIBUTE_USED. | |
// Simply define the namespace, in case a dependent library want to refer to | |
// the c10 namespace but not any nontrivial files. | |
namespace c10 {} | |
namespace c10::cuda {} | |
namespace c10::hip {} | |
namespace c10::xpu {} | |
// Since C10 is the core library for caffe2 (and aten), we will simply reroute | |
// all abstractions defined in c10 to be available in caffe2 as well. | |
// This is only for backwards compatibility. Please use the symbols from the | |
// c10 namespace where possible. | |
namespace caffe2 { | |
using namespace c10; | |
} | |
namespace at { | |
using namespace c10; | |
} | |
namespace at::cuda { | |
using namespace c10::cuda; | |
} // namespace at::cuda | |
// WARNING!!! THIS IS A GIANT HACK!!! | |
// This line means you cannot simultaneously include c10/hip | |
// and c10/cuda and then use them from the at::cuda namespace. | |
// This is true in practice, because HIPIFY works inplace on | |
// files in ATen/cuda, so it assumes that c10::hip is available | |
// from at::cuda. This namespace makes that happen. When | |
// HIPIFY is no longer out-of-place, we can switch the cuda | |
// here to hip and everyone is happy. | |
namespace at::cuda { | |
using namespace c10::hip; | |
} // namespace at::cuda | |
namespace at::xpu { | |
using namespace c10::xpu; | |
} // namespace at::xpu | |
// C10_LIKELY/C10_UNLIKELY | |
// | |
// These macros provide parentheses, so you can use these macros as: | |
// | |
// if C10_LIKELY(some_expr) { | |
// ... | |
// } | |
// | |
// NB: static_cast to boolean is mandatory in C++, because __builtin_expect | |
// takes a long argument, which means you may trigger the wrong conversion | |
// without it. | |
// | |
/// C10_NOINLINE - Functions whose declaration is annotated with this will not | |
/// be inlined. | |
// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. | |
// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. | |
// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 | |
// Designates functions callable from the host (CPU) and the device (GPU) | |
// constants from | |
// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) | |
// The maximum number of threads per multiprocessor is 1024 for Turing | |
// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and | |
// 2048 for all other architectures. You'll get warnings if you exceed these | |
// constants. Hence, the following macros adjust the input values from the user | |
// to resolve potential warnings. | |
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; | |
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; | |
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; | |
// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently | |
constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; | |
// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block | |
// size. 256 is a good number for this fallback and should give good occupancy | |
// and versatility across all architectures. | |
constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; | |
// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it | |
// turns out that although __launch_bounds__ can take constexpr, it | |
// can't take a constexpr that has anything to do with templates. | |
// Currently we use launch_bounds that depend on template arguments in | |
// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK | |
// and C10_MIN_BLOCKS_PER_SM are kept as macros. | |
// Suppose you were planning to write __launch_bounds__(a, b), based on your | |
// performance tuning on a modern GPU. Instead, you should write | |
// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), | |
// which will also properly respect limits on old architectures. | |
(((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ | |
: CUDA_THREADS_PER_BLOCK_FALLBACK) | |
((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ | |
? (blocks_per_sm) \ | |
: ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) / \ | |
(threads_per_block)))) | |
// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ | |
__launch_bounds__( \ | |
256, 4) // default launch bounds that should give good occupancy and | |
// versatility across all architectures. | |
__launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) | |
__launch_bounds__( \ | |
(C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ | |
(C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) | |
// CUDA_KERNEL_ASSERT checks the assertion | |
// even when NDEBUG is defined. This is useful for important assertions in CUDA | |
// code that would otherwise be suppressed when building Release. | |
(defined(USE_ROCM) && ROCM_VERSION < 40100) | |
// Those platforms do not support assert() | |
extern "C" { | |
C10_IMPORT | |
extern SYCL_EXTERNAL void _wassert( | |
const wchar_t* wexpr, | |
const wchar_t* wfile, | |
unsigned line); | |
__host__ __device__ | |
void | |
_wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); | |
} | |
if (C10_UNLIKELY(!(cond))) { \ | |
(void)(_wassert( \ | |
_CRT_WIDE( | |
_CRT_WIDE(__FILE__), \ | |
static_cast<unsigned>(__LINE__)), \ | |
0); \ | |
} | |
if (C10_UNLIKELY(!(cond))) { \ | |
(void)(_wassert( \ | |
_CRT_WIDE( | |
_CRT_WIDE(__FILE__), \ | |
static_cast<unsigned>(__LINE__)), \ | |
0); \ | |
} | |
extern "C" { | |
extern SYCL_EXTERNAL void __assert_fail( | |
const char* expr, | |
const char* file, | |
unsigned int line, | |
const char* func); | |
// CUDA supports __assert_fail function which are common for both device | |
// and host side code. | |
__host__ __device__ | |
// This forward declaration matching the declaration of __assert_fail | |
// exactly how it is in glibc in case parts of the program are compiled with | |
// different NDEBUG settings. Otherwise we might get 'ambiguous declaration' | |
// error. Note: On ROCm - this declaration serves for host side compilation. | |
void | |
__assert_fail( | |
const char* assertion, | |
const char* file, | |
unsigned int line, | |
const char* function) noexcept __attribute__((__noreturn__)); | |
} | |
// ROCm disable kernel assert by default | |
if (C10_UNLIKELY(!(cond))) { \ | |
__assert_fail( \ | |
} | |
if (C10_UNLIKELY(!(cond))) { \ | |
__assert_fail( \ | |
} | |
defined(__APPLE__) && \ | |
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) | |
// Note [static constexpr char* members for windows NVCC] | |
// The Windows NVCC compiler doesn't handle static constexpr class members, | |
// although it's fixed in a later version. | |
// (see | |
// https://developercommunity.visualstudio.com/t/intellisense-error-c11-static-constexpr-member-ini/245425) | |
// | |
// If we want to ensure that our field is static under all builds, then we need | |
// to work around it specifically for windows NVCC by making it (a) const, (b) | |
// defined outside of the class definition We need to define it outside of the | |
// class definition because of the C++ standard; char* is not an integral type | |
// (see | |
// https://stackoverflow.com/questions/24278473/intellisense-a-member-of-type-const-char-const-cannot-have-an-in-class-in) | |
// | |
// So instead of this: | |
// struct Foo { | |
// static constexpr const char* name = "foo"; | |
// } | |
// In Windows NVCC, we end up with this: | |
// struct Foo { | |
// static const char* name; | |
// } | |
// const char* Foo::name = "foo"; | |
// | |
// This gives us a small perf hit for any code that wants to access these field | |
// members, but right now it isn't used in any perf-critical code paths. | |
static const char* field; | |
const char* cls::field = val; | |
static constexpr const char* field = val; | |
static const char* field; | |
const char* cls::field = val; | |
static constexpr const char* field = val; | |
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) | |
_C10_PRAGMA_(clang diagnostic ignored flag) | |
_C10_PRAGMA_(clang diagnostic push) \ | |
_C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ | |
_C10_PRAGMA_(clang diagnostic ignored warning) | |
_C10_PRAGMA_(GCC diagnostic push) \ | |
_C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ | |
_C10_PRAGMA_(GCC diagnostic ignored warning) | |