import os  # noqa: C101
import sys
from typing import Any, Callable, Dict, Optional, TYPE_CHECKING

import torch


def is_fbcode():
    return not hasattr(torch.version, "git_version")


# add some debug printouts
debug = False
# add inf and NaN checkers
debug_check_inf_and_nan = False
# Whether to disable a progress bar for autotuning
disable_progress = True
# Whether to enable printing the source code for each future
verbose_progress = False
# use fx aot graph codegen cache
fx_graph_cache = os.environ.get("TORCHINDUCTOR_FX_GRAPH_CACHE") == "1"
# use cpp wrapper instead of python wrapper
cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1"
# codegen cpp wrapper code in an ABI compatible mode
abi_compatible = (
    os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1"
)
c_shim_version = os.environ.get(
    "TORCHINDUCTOR_C_SHIM_VERSION", "1" if is_fbcode() else "2"
)
# dead code elimination
dce = False
# assume weight tensors are fixed size
static_weight_shapes = True
# put correctness assertions in generated code
size_asserts = os.environ.get("TORCHINDUCTOR_SIZE_ASSERTS", "1") == "1"
nan_asserts = os.environ.get("TORCHINDUCTOR_NAN_ASSERTS") == "1"
# enable loop reordering based on input orders
pick_loop_orders = True
# reuse a kernel input as the output
inplace_buffers = True
# reuse a buffer for an unrelated purpose
allow_buffer_reuse = True
# Enable pooled allocations for non-output tensors
memory_planning = os.environ.get("TORCHINDUCTOR_MEMORY_PLANNING", "0") == "1"
# How to organize memory under memory_planning=True:
# - "none": do not try to pool storage, just reuse
# - "intermediates": all non-outputs share storage, outputs each get unique storage
# - "outputs": two pools, one for intermediates (freed on return) and one for outputs
# - "combined": a single pool for both intermediates and outputs
memory_pool = os.environ.get("TORCHINDUCTOR_MEMORY_POOL", "intermediates")
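# Illustrative example (not part of the defaults): enable memory planning and pick a
# pooling strategy from user code before compiling. `my_model` is a placeholder for
# your own nn.Module.
#
#   import torch
#   torch._inductor.config.memory_planning = True
#   torch._inductor.config.memory_pool = "outputs"
#   compiled = torch.compile(my_model)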
# codegen benchmark harness
benchmark_harness = True
# fuse pointwise into templates
epilogue_fusion = True
# do epilogue fusions before other fusions
epilogue_fusion_first = False
# enable pattern match+replace optimizations
pattern_matcher = True
# register custom graph optimization pass hook. so far, pre/post passes are
# only applied before/after pattern_matcher in post_grad_passes.
#
# def my_custom_pre_pass(graph: torch.fx.graph.Graph):
#     # my custom graph optimization pass
#     ...
#
# def my_custom_post_pass(graph: torch.fx.graph.Graph):
#     # my custom graph optimization pass
#     ...
#
# torch._inductor.config.post_grad_custom_pre_pass = my_custom_pre_pass
# torch._inductor.config.post_grad_custom_post_pass = my_custom_post_pass
post_grad_custom_pre_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
post_grad_custom_post_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
# Registers a custom pregrad pass. Note that the pre-grad IR is 1.
# non-functional, 2. non-normalized, and 3. prone to change. Ideally we should
# use post-grad passes.
pre_grad_custom_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None
# Optimize away split cat patterns (Experimental)
split_cat_fx_passes = True
# Optimize conv-batchnorm if batchnorm is in eval mode. Slightly reduces numerical stability.
efficient_conv_bn_eval_fx_passes = False
# Enable predispatch aten IR for export
is_predispatch = False
# Deprecated
group_fusion = False
# Deprecated
batch_fusion = True
# Pre grad group/batch fusion and options in order, set to empty dict to disable fusion.
# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions()` to see available fusions.
pre_grad_fusion_options: Dict[str, Dict[str, Any]] = {
    "batch_linear": {},
    "batch_linear_lhs": {},
    "batch_layernorm": {},
    "batch_tanh": {},
    "batch_relu": {},
    "batch_sigmoid": {},
}
# Post grad group/batch fusion and options, set to empty dict to disable fusion.
# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions(False)` to see available fusions.
post_grad_fusion_options: Dict[str, Dict[str, Any]] = {}
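# Illustrative example (not part of the defaults): pre-grad fusions can be disabled
# entirely, or restricted to a subset of the default names listed above; the full set
# of names comes from list_group_batch_fusions().
#
#   torch._inductor.config.pre_grad_fusion_options = {}  # disable all pre-grad fusions
#   # or keep only a subset of the defaults:
#   torch._inductor.config.pre_grad_fusion_options = {"batch_linear": {}, "batch_layernorm": {}}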
# enable reordering pass for improving memory locality
reorder_for_locality = True
# Scale down RBLOCK for better occupancy
dynamic_scale_rblock = os.environ.get("TORCHINDUCTOR_DYNAMIC_SCALE_RBLOCK", "1") == "1"
# this forces fusion for int_mm with mul. Needed when you want to avoid realizing the int32
# but the mul gets fused with other pointwise ops instead.
force_fuse_int_mm_with_mul = False
# for pattern torch.mm(a, b.to(dtype)) with cuda tensors,
# enable the torch._inductor.kernel.mm.tuned_mixed_mm fused kernel.
# Autotune will compare perf with the normal cast->then->mm option.
use_mixed_mm = False
# enable runtime numeric check for pre/post grad fx passes
# floating point provides limited accuracy (about 7 decimal digits for single precision
# floating point numbers, about 16 decimal digits for double precision floating point numbers)
# according to the PyTorch documentation.
# https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations
fx_passes_numeric_check: Dict[str, Any] = {
    "pre_grad": False,
    "precision": 1e-4,
    "num_iterations": 1,
    "requires_optimizer": True,
}
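# Illustrative example (not part of the defaults): enable the pre-grad numeric check
# and loosen its tolerance by mutating the dict above.
#
#   torch._inductor.config.fx_passes_numeric_check["pre_grad"] = True
#   torch._inductor.config.fx_passes_numeric_check["precision"] = 1e-3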
# for pattern torch.mm(a, b.to(dtype)) with cuda tensors, always use
# torch._inductor.kernel.mm.tuned_mixed_mm's fused kernel.
# Autotune will not compare with normal cast->then->mm option.
# (if force_mixed_mm is true, the use_mixed_mm flag will be ignored)
force_mixed_mm = False
# enable reordering pass for increasing overlap between compute and communication
reorder_for_compute_comm_overlap = False
# passes (in execution order) for increasing overlap between compute and communication
# for built-in passes, use string name; for user-defined passes, pass in the function handle
reorder_for_compute_comm_overlap_passes = [
    "reorder_compute_for_overlap",
    "sink_waits",
    "raise_comms",
]
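# Illustrative example (the pass signature below is an assumption; built-in passes
# operate on and return the scheduler's node list, but the exact interface may differ
# between versions): mix a user-defined pass with the built-in ones.
#
#   def my_overlap_pass(snodes):  # hypothetical user-defined pass
#       # inspect / reorder scheduler nodes here, then return them
#       return snodes
#
#   torch._inductor.config.reorder_for_compute_comm_overlap = True
#   torch._inductor.config.reorder_for_compute_comm_overlap_passes = [
#       "sink_waits",
#       my_overlap_pass,
#       "raise_comms",
#   ]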
# runtime estimation function for ops
# for the built-in estimation function, pass in "default"; for a user-defined estimation function, pass in the function handle
estimate_op_runtime = "default"
# unit: GB/s, uni-directional P2P bandwidth per card
# default value is NVLink
intra_node_bw = 300
# unit: GB/s, uni-directional P2P bandwidth per node
# default value is InfiniBand
inter_node_bw = 25
# enable slow autotuning passes to select algorithms
max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1"
# enable slow autotuning passes to select pointwise/reduction algorithms
max_autotune_pointwise = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE") == "1"
# enable slow autotuning passes to select gemm algorithms
max_autotune_gemm = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_GEMM") == "1"
# enable autotune local cache
use_autotune_local_cache = True
# enable autotune remote cache
use_autotune_remote_cache = (
    os.environ.get("TORCH_INDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1"
)
# force cublas and triton to use the same precision; cublas supports TF32 for matmul operations
# when m, n, k are multiples of 16, 16, 8, whereas triton supports TF32 for matmul operations
# for any combination of m, n, k, regardless of their alignment. Setting this flag ensures
# that triton does not use TF32 wherever cublas would not use TF32.
force_same_precision = (
    True if is_fbcode() else os.environ.get("TORCHINDUCTOR_FORCE_SAME_PRECISION") == "1"
)
# Specify candidate backends for gemm autotune.
# Possible choices are combinations of: ATen, Triton, CUTLASS.
# ATen: default PyTorch ATen kernels.
# Triton: Triton templates defined in torch inductor.
# CUTLASS: CUTLASS templates and kernels.
max_autotune_gemm_backends = os.environ.get(
    "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON"
).upper()
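# Illustrative example ("train.py" is a placeholder for your own script): enable
# max-autotune for gemms and include the CUTLASS backend in the candidate set, either
# via environment variables
#
#   TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 \
#   TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS="ATEN,TRITON,CUTLASS" python train.py
#
# or from Python before compiling:
#
#   torch._inductor.config.max_autotune_gemm = True
#   torch._inductor.config.max_autotune_gemm_backends = "ATEN,TRITON,CUTLASS"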
# the value used as a fallback for the unbacked SymInts
# that can appear in the input shapes (e.g., in autotuning)
unbacked_symint_fallback = 8192
# enable searching global and local cache regardless of `max_autotune`
search_autotune_cache = os.environ.get("TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHE") == "1"
save_args = os.environ.get("TORCHINDUCTOR_SAVE_ARGS") == "1"
# If this is False, autotuning will not create a subprocess
autotune_in_subproc = os.environ.get("TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC") == "1"
# If autotuning in a subprocess, whether to use multiple devices
autotune_multi_device = os.environ.get("TORCHINDUCTOR_AUTOTUNE_MULTI_DEVICE") == "1"
coordinate_descent_tuning = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_TUNING") == "1"
)
coordinate_descent_check_all_directions = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONS") == "1"
)
coordinate_descent_search_radius = int(
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_RADIUS", "1")
)
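# Illustrative example (not part of the defaults): turn on coordinate-descent tuning
# of kernel configs, and optionally make the search more exhaustive.
#
#   torch._inductor.config.coordinate_descent_tuning = True
#   torch._inductor.config.coordinate_descent_check_all_directions = True
#   torch._inductor.config.coordinate_descent_search_radius = 2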
# Disabled by default on ROCm, opt-in if the model uses NHWC convolutions
layout_opt_default = "1" if not torch.version.hip else "0"
layout_optimization = (
    os.environ.get("TORCHINDUCTOR_LAYOUT_OPTIMIZATION", layout_opt_default) == "1"
)
force_layout_optimization = os.environ.get("TORCHINDUCTOR_FORCE_LAYOUT_OPT", "0") == "1"
# Whether to keep the output strides the same as eager after layout optimization.
keep_output_stride = os.environ.get("TORCHINDUCTOR_KEEP_OUTPUT_STRIDE", "1") == "1"
# Enabling this will let the compiler print warning messages if a generated triton
# kernel has inputs with mixed layouts. This is helpful for perf debugging,
# since a kernel with mixed-layout inputs may run much slower than one whose inputs
# have uniform layouts.
warn_mix_layout = os.environ.get("TORCHINDUCTOR_WARN_MIX_LAYOUT") == "1"
# control store vs recompute heuristic
# For fanouts, rematerialization can lead to exponential blowup, so use a
# smaller threshold
realize_reads_threshold = 4
realize_opcount_threshold = 30
# Threshold to prevent excessive accumulation of ops in one buffer during lowering
realize_acc_reads_threshold = 8
# fallback to eager for random/dropout; this is slow but useful for debugging
fallback_random = False
# automatically create fallbacks when encountering an unhandled op
implicit_fallbacks = True
# fuse even in cases without common reads
aggressive_fusion = False
# For each fused kernel in the wrapper, comment with the nodes that get fused.
# Useful for debugging fusion.
debug_fusion = os.environ.get("TORCHINDUCTOR_DEBUG_FUSION") == "1"
benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1"
enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "")
# how many nodes to allow into a single fusion
max_fusion_size = 64
# max number of inputs to generate cat as a pointwise op with masked loads
max_pointwise_cat_inputs = 8
# replace small reductions with pointwise; disable with `= 1`
unroll_reductions_threshold = 8
# Add extra comments to output code (causes compile cache misses)
comment_origin = False
# Convert 1x1 convs into matmuls
conv_1x1_as_mm = False
# Enable split reductions for better utilization when the dimension
# being reduced over is large (by splitting it)
split_reductions = True
benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1"
# Enable constant and index_expr folding
constant_and_index_propagation = True
# we always add constants into graph.constants without
# performing any constant-inlining optimization
always_keep_tensor_constants = False
# assert that indirect indexing does not read / write out of bounds
assert_indirect_indexing = True
# constant folding on the joint graph
joint_graph_constant_folding = True
# Enable indirect_indexing asserts for decompositions and lowerings
debug_index_asserts = False
# warnings intended for PyTorch developers, disable for point releases
is_nightly_or_source = "dev" in torch.__version__ or "git" in torch.__version__
developer_warnings = is_fbcode() or is_nightly_or_source
# The multiprocessing start method to use for inductor workers in the codecache.
# TODO: fork is not safe in a multithreaded environment, we should evaluate changing
# the default to spawn.
worker_start_method = "fork"


def decide_compile_threads():
    """
    Precedence for deciding compile_threads:
    1. The user can override it via TORCHINDUCTOR_COMPILE_THREADS. One may want to
       disable async compiling by setting this to 1 to make pdb happy.
    2. Set to 1 on the win32 platform or in an fbcode build.
    3. Otherwise, decide based on the number of CPU cores.
    """
    if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ:
        return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"])
    elif sys.platform == "win32" or is_fbcode():
        return 1
    else:
        cpu_count = (
            len(os.sched_getaffinity(0))
            if hasattr(os, "sched_getaffinity")
            else os.cpu_count()
        )
        assert cpu_count
        return min(32, cpu_count)


compile_threads = decide_compile_threads()
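# Illustrative example ("train.py" is a placeholder for your own script): force serial
# compilation, e.g. when stepping through compilation with pdb, per the precedence
# described above.
#
#   TORCHINDUCTOR_COMPILE_THREADS=1 python train.py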
# gemm autotuning global cache dir
if is_fbcode():
    from libfb.py import parutil

    try:
        if __package__:
            global_cache_dir = parutil.get_dir_path(
                os.path.join(__package__.replace(".", os.sep), "fb/cache")
            )
        else:
            global_cache_dir = parutil.get_dir_path("fb/cache")
    except ValueError:
        global_cache_dir = None
else:
    global_cache_dir = None

# If a kernel is fused, its name is generated from the origin node op names;
# for larger kernels, limit this
kernel_name_max_ops = 10
# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "1") == "1"
# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
# Mark the wrapper call in PyTorch profiler
profiler_mark_wrapper_call = False
# Generate hook calls to torch._inductor.hooks.run_intermediate_hooks for
# every intermediate for which we can correlate it with an intermediate
# from the original FX graph
generate_intermediate_hooks = False
# Populate traceback field on IRNode; good for debugging why origin_node is
# not populated, or finding out where an IRNode was constructed
debug_ir_traceback = False
# used for debugging to make sure config is properly set
_raise_error_for_testing = False
_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "")
profile_bandwidth = _profile_var != ""
profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var
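# Illustrative example ("train.py" and "/tmp/bw.txt" are placeholders): setting
# TORCHINDUCTOR_PROFILE to "1" profiles bandwidth for all generated kernels; any other
# non-empty value is additionally used as a regex to restrict which kernels are profiled.
#
#   TORCHINDUCTOR_PROFILE=1 python train.py
#   TORCHINDUCTOR_PROFILE="triton_.*" TORCHINDUCTOR_PROFILE_OUTPUT=/tmp/bw.txt python train.py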
# Specify a file where we print out the profiling results.
# None means we do not dump results to a file.
profile_bandwidth_output = os.environ.get("TORCHINDUCTOR_PROFILE_OUTPUT", None)
# TODO: remove later
disable_cpp_codegen = False
# Freezing will attempt to inline weights as constants in optimization
# and run constant folding and other optimizations on them. After freezing, weights
# can no longer be updated.
freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1"
# Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead
# of potentially keeping multiple copies of weights.
freezing_discard_parameters: bool = False
# Kill switch for allowing temporary tensors to be allocated as stack arrays. Tests
# should be run with this flag both on and off to make sure we have coverage.
allow_stack_allocation: bool = (
    os.environ.get("TORCHINDUCTOR_STACK_ALLOCATION", "1") == "1"
)
# Enables an alternate DSO interface (the "minimal ArrayRef interface") intended
# to maximize performance for use cases that it can accommodate at the expense of
# generality. In brief:
# - inputs and outputs are ArrayRefTensor<T> (note that strides are required, but the
#   tensor must be contiguous)
# - constant handling is unchanged because it is not a per-inference-iteration bottleneck
#
# When the DSO is generated in this mode, the usual interface will also be supported,
# but performance for that interface may be degraded.
use_minimal_arrayref_interface: bool = False
# decompose some memory bound matmul/bmm to mul
decompose_mem_bound_mm: bool = False

# config specific to codegen/cpp.py
class cpp:
    # set to torch.get_num_threads()
    threads = -1
    # Do not generate loops when the condition doesn't hold, like:
    # for(long i0=4096; i0<4096; i0+=1)
    no_redundant_loops = True
    # Assume number of threads is dynamic, don't specialize thread number.
    # Kernels don't recompile on thread number changes with this flag on.
    # For single-threaded workload, turning it on would incur a slight
    # performance degradation.
    dynamic_threads = False
    simdlen: Optional[int] = None
    min_chunk_size = 4096
    cxx = (
        None,  # download gcc12 from conda-forge if conda is installed
        # "g++-12",
        # "g++-11",
        # "g++-10",
        # "clang++",
        os.environ.get("CXX", "clang++" if sys.platform == "darwin" else "g++"),
        # "g++.par",
    )
    # Allow kernel performance profiling via PyTorch profiler
    enable_kernel_profile = False
    # enable weight prepacking for better performance; may lead to a large memory footprint
    weight_prepack = True
    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY: Optional[str] = None
    inject_log1p_bug_TESTING_ONLY: Optional[str] = None
    # If None, autodetect whether or not AVX512/AVX2 can be used. Otherwise,
    # force usage as specified, without testing.
    vec_isa_ok: Optional[bool] = None
    # similar to config.triton.descriptive_names
    descriptive_names = "original_aten"
    # how many nodes to allow into a single horizontal fusion
    max_horizontal_fusion_size = 16
    # Make scatter_reduce fall back when reduce is sum, to avoid the performance
    # regression from using atomic_add.
    fallback_scatter_reduce_sum = True
    # Use -funsafe-math-optimizations when compiling
    enable_unsafe_math_opt_flag = False
    # Use -ffp-contract when compiling
    enable_floating_point_contract_flag = False

# config specific to codegen/triton.py
class triton:
    # Use cudagraphs on output code
    cudagraphs = False
    # Use cudagraph trees for memory pooling if `cudagraphs` is True
    cudagraph_trees = True
    # assertions not on the fast path, steady state
    slow_path_cudagraph_asserts = True
    # TODO - need to debug why this prevents cleanup
    cudagraph_trees_history_recording = False
    # assertions on the fast path
    fast_path_cudagraph_asserts = False
    # skip warmup for cudagraph trees
    skip_cudagraph_warmup = False
    # Synchronize before and after every compiled graph.
    debug_sync_graph = False
    # Synchronize after every kernel launch, to help pinpoint bugs
    debug_sync_kernel = False
    # Always load full blocks (rather than broadcasting inside the block)
    dense_indexing = False
    # limit tiling dimensions
    max_tiles = 2
    # use triton.autotune for pointwise ops with complex layouts
    # this should only be disabled for debugging/testing
    autotune_pointwise = True
    # max autotune gemm with cublasLt
    autotune_cublasLt = True
    # should we stop a fusion to allow better tiling?
    tiling_prevents_pointwise_fusion = True
    tiling_prevents_reduction_fusion = True
    # should we give different names to kernels
    # Note: This is orthogonal to descriptive_names - this is deciding whether
    # our triton kernel names should all be `triton_` (to maximize caching) or
    # whether they should be unique.
    unique_kernel_names = os.environ.get("TORCHINDUCTOR_UNIQUE_KERNEL_NAMES") == "1"
    # should we put op names in kernel names
    # False: No special names (just triton__1, triton__2, etc.)
    # "torch": Maps to the fx op in the Dynamo graph (module name, method name, etc.)
    # "original_aten": Maps to the highest-level aten op (i.e. pre-decompositions)
    # "inductor_node": Maps to the node name in the FX graph passed to Inductor
    descriptive_names = "original_aten"
    # use alternate codegen for smaller reductions
    persistent_reductions = (
        os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1"
    )
    # 0/False: disable
    # 1/True: enable, use tuning to pick between different subkernels
    # 2: enable, force using persistent reduction (for debugging)
    # 3: enable, force using non-persistent reduction (for debugging)
    multi_kernel = int(os.environ.get("TORCHINDUCTOR_MULTI_KERNEL", "0"))
    # hint to Triton when arguments are divisible by 16
    divisible_by_16 = True
    # these are not enforced, but they are used by asserts in triton_heuristics.py
    # NOTE: mobilevit_s in timm_models required X to be set to the higher value 2048
    # Max RBLOCK will be large for multi-kernel since we do more aggressive
    # persistent reduction.
    max_block = {
        "X": 2048,
        "Y": 1024,
        "Z": 1024,
        "R": 4096 * (16 if multi_kernel else 1),
    }
    # Minimum RBLOCK to be used for a TritonSplitScanKernel
    # NOTE: This also indirectly controls the size of workspace buffer required
    min_split_scan_rblock = 256
    # Store the generated cubin files for cpp wrapper code to load
    store_cubin = False
    # the max number of spills we allow for the configs we benchmark.
    # Setting this to 0 means we skip a config if it spills even a single
    # register.
    # Setting it to a larger value allows a config that spills a small number
    # of registers to be benchmarked.
    #
    # NOTE: triton will always report >0 register spills for kernels using sin/cos.
    # (check this issue https://github.com/openai/triton/issues/1756 )
    # So far we see a fixed 8 spilled registers for kernels using sin/cos.
    # Raise the threshold to 16 to be safe.
    # We should revisit this once we understand more of the source of register spills.
    spill_threshold: int = 16
    # Generate code containing the newer tl.make_block_ptr() API for loads/stores
    use_block_ptr = False
    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY: Optional[str] = None


class aot_inductor:
    # AOTInductor output path
    # If an absolute path is specified, the generated lib files will be stored under the directory;
    # If a relative path is specified, it will be used as a subdirectory under the default caching path;
    # If not specified, a temp directory will be created under the default caching path.
    # If the specified path contains something like "model.so", the sub-string will be used
    # to name the generated library.
    output_path = ""
    debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1"
    # Serialized tree spec for flattening inputs
    serialized_in_spec = ""
    # Serialized tree spec for flattening outputs
    serialized_out_spec = ""
    # flag to decide whether to create a submodule for constant graph.
    use_runtime_constant_folding: bool = False

class cuda:
    # CUDA arch to use for CUDA template kernel compilation.
    # e.g. "70", "75", "80", "90", etc.
    # When arch is None, Inductor uses torch.cuda.get_device_capability(0).
    arch: Optional[str] = None
    # CUDA version to use for CUDA template kernel compilation.
    # e.g. "11.4", "12.1", etc.
    # When version is None, Inductor uses torch.version.cuda.
    version: Optional[str] = None
    # Optimization level for the host compiler.
    compile_opt_level = "-O1"
    # Whether to enable device LTO (link-time-optimization).
    enable_cuda_lto = False
    # Whether to keep intermediate files during compilation.
    enable_ptxas_info = False
    # Whether to enable debug info, e.g. line numbers, cutlass debug info.
    enable_debug_info = False
    # Whether to use fast math.
    use_fast_math = False
    # Path to the CUTLASS repo root directory.
    # The default path only works under a PyTorch local development environment.
    cutlass_dir = os.environ.get(
        "TORCHINDUCTOR_CUTLASS_DIR",
        os.path.abspath(
            os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/")
        ),
    )
    # Configures the maximum number of CUTLASS configs to profile in max_autotune.
    # By default it's None, so that all CUTLASS configs are tuned.
    # This is mainly used to reduce test time in CI.
    cutlass_max_profiling_configs: Optional[int] = None
    # Path to CUDA NVCC.
    # NVCC search order:
    # 1) cuda_cxx set in this config
    # 2) CUDACXX environment variable
    # 3) CUDA_HOME environment variable
    # 4) default system search PATH
    cuda_cxx: Optional[str] = None
    # If set to True, it will ensure that only GEMM ops capable of
    # epilogue fusion via CUTLASS Epilogue Visitor Trees (EVT)
    # are enabled for the CUTLASS backend.
    cutlass_only_evt_capable_ops: bool = False

# create a directory containing lots of debug information
class trace:
    # master switch for all debugging flags below
    enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"
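    # Illustrative example ("train.py" is a placeholder for your own script): enable
    # the debug artifacts below for a single run via the environment.
    #
    #   TORCH_COMPILE_DEBUG=1 python train.py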
    # Save debug information to a temporary directory.
    # If not specified, a temp directory will be created by the system.
    debug_dir: Optional[str] = None
    # Save python logger calls >= logging.DEBUG
    debug_log = False
    # Save python logger calls >= logging.INFO
    info_log = False
    # Save input FX graph (post decomps, pre optimization)
    fx_graph = True
    # Save FX graph after transformations
    fx_graph_transformed = True
    # Save TorchInductor IR before fusion pass
    ir_pre_fusion = True
    # Save TorchInductor IR after fusion pass
    ir_post_fusion = True
    # Copy generated code to trace dir
    output_code = True
    # SVG figure showing post-fusion graph
    graph_diagram = os.environ.get("INDUCTOR_POST_FUSION_SVG", "0") == "1"
    # SVG figure showing fx with fusion
    draw_orig_fx_graph = os.environ.get("INDUCTOR_ORIG_FX_SVG", "0") == "1"
    # We draw our fx graphs with the "record" shape attribute by default.
    # Sometimes, when the graph is very complex, we may hit dot errors like:
    # "flat edge between adjacent nodes one of which has a record shape -
    # replace records with HTML-like labels"
    # and thus fail to generate a graph. So, let's give the user an option
    # to specify the shape attribute for the dot graph. For example, passing
    # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like labels
    # to work around the above failure.
    dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None)
    # Store cProfile (see snakeviz to view)
    compile_profile = False
    # Upload the .tar.gz file
    # Needs to be overridden based on specific environment needs
    upload_tar: Optional[Callable[[str], None]] = None
    log_autotuning_results: bool = False


_save_config_ignore = {
    # workaround: "Can't pickle <function ...>"
    "trace.upload_tar",
}

if TYPE_CHECKING:
    from torch.utils._config_typing import *  # noqa: F401, F403

from torch.utils._config_module import install_config_module

# adds patch, save_config, etc
install_config_module(sys.modules[__name__])
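# Illustrative example (a minimal sketch; the exact argument forms accepted by patch()
# may vary between versions): after install_config_module, this module exposes helpers
# such as patch() for temporarily overriding options.
#
#   with torch._inductor.config.patch(max_autotune=True):
#       compiled = torch.compile(my_model)   # my_model is a placeholder
#       out = compiled(example_input)        # example_input is a placeholder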