# mypy: ignore-errors

import functools
import importlib
import logging
import os
import tempfile

import torch

from .common import device_from_inputs, fake_tensor_unsupported
from .registry import register_backend

log = logging.getLogger(__name__)

@register_backend
@fake_tensor_unsupported
def tvm(gm, example_inputs, *, scheduler=None, trials=20000):
    import tvm  # type: ignore[import]
    from tvm import relay  # type: ignore[import]
    from tvm.contrib import graph_executor  # type: ignore[import]

    jit_mod = torch.jit.trace(gm, example_inputs)
    device = device_from_inputs(example_inputs)
    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
    example_outputs = gm(*example_inputs)
    if len(example_outputs) == 0:
        log.warning("Explicitly falling back to eager due to zero output")
        return gm.forward
    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
    if device.type == "cuda":
        dev = tvm.cuda(device.index)
        target = tvm.target.cuda()
    else:
        dev = tvm.cpu(0)
        target = tvm.target.Target(llvm_target())

    if scheduler is None:
        scheduler = os.environ.get("TVM_SCHEDULER", None)
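    # The scheduler can also be selected without code changes via the
    # environment, e.g. TVM_SCHEDULER=meta_schedule; the recognized values are
    # "default", "auto_scheduler" and "meta_schedule" (see the branches below).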
    if scheduler == "auto_scheduler":
        from tvm import auto_scheduler

        # NamedTemporaryFile() creates an empty file on disk right away, so
        # "no tuning log yet" is expressed as "the log is still empty", and the
        # path-based os/TVM APIs below need the file's name, not the object.
        log_file = tempfile.NamedTemporaryFile()

        if os.path.getsize(log_file.name) == 0:
            tasks, task_weights = auto_scheduler.extract_tasks(
                mod["main"], params, target
            )
            if tasks:
                for task in tasks:
                    print(task.compute_dag)
                tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
                assert trials > 0
                tune_option = auto_scheduler.TuningOptions(
                    num_measure_trials=trials,
                    measure_callbacks=[auto_scheduler.RecordToFile(log_file.name)],
                    early_stopping=2000,
                )
                try:
                    tuner.tune(tune_option)
                except Exception:
                    if os.path.exists(log_file.name):
                        os.unlink(log_file.name)
                    raise
            else:
                print("No tasks")
        with auto_scheduler.ApplyHistoryBest(log_file.name):
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
            ):
                lib = relay.build(mod, target=target, params=params)
    elif scheduler == "meta_schedule":
        from tvm import meta_schedule as ms

        with tempfile.TemporaryDirectory() as work_dir:
            if device.type != "cuda":
                # meta_schedule needs num-cores to be specified;
                # here we use the maximum physical core count
                target = tvm.target.Target(
                    f"{llvm_target()} --num-cores {ms.utils.cpu_count(logical=False)}"
                )
            # TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch
            # once USE_PT_TVMDSOOP is updated and turned on by default in TVM.
            database = ms.relay_integration.tune_relay(
                mod=mod,
                target=target,
                work_dir=work_dir,
                max_trials_global=20000,
                num_trials_per_iter=64,
                params=params,
                strategy="evolutionary",
            )
            lib = ms.relay_integration.compile_relay(
                database=database,
                mod=mod,
                target=target,
                params=params,
            )
    elif scheduler == "default" or not scheduler:
        # no autotuning
        with tvm.transform.PassContext(opt_level=10):
            lib = relay.build(mod, target=target, params=params)
    else:
        raise NotImplementedError(
            "This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. "
            "There are three available options: default, auto_scheduler and meta_schedule."
        )
    m = graph_executor.GraphModule(lib["default"](dev))
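
    # The graph executor addresses inputs by the "inp_{idx}" names generated in
    # shape_list above; exec_tvm below sets inputs using the same naming scheme.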

    def to_torch_tensor(nd_tensor):
        """A helper function to transfer an NDArray to torch.Tensor."""
        if nd_tensor.dtype == "bool":
            # DLPack does not support boolean, so it can't be handled by
            # torch.utils.dlpack.from_dlpack. Work around this by going through
            # numpy, although this brings additional data copy overhead.
            return torch.from_numpy(nd_tensor.numpy())
        return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())

    def to_tvm_tensor(torch_tensor):
        """A helper function to transfer a torch.Tensor to NDArray."""
        if torch_tensor.dtype == torch.bool:
            # Same reason as above: fall back to numpy conversion, which
            # could introduce data copy overhead.
            return tvm.nd.array(torch_tensor.cpu().numpy())
        return tvm.nd.from_dlpack(torch_tensor)

    def exec_tvm(*i_args):
        args = [a.contiguous() for a in i_args]
        shape_info, _ = m.get_input_info()
        active_inputs = {name for name, _ in shape_info.items()}
        for idx, arg in enumerate(args, 0):
            if arg.dim() != 0:
                if arg.requires_grad:
                    arg = arg.detach()
                inp_name = f"inp_{idx}"
                if inp_name not in active_inputs:
                    log.warning(
                        "input %s skipped as not found in tvm's runtime library",
                        inp_name,
                    )
                    continue
                m.set_input(
                    inp_name,
                    to_tvm_tensor(arg),
                )
        m.run()
        return [to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())]

    return exec_tvm

tvm_meta_schedule = functools.partial(tvm, scheduler="meta_schedule")
tvm_auto_scheduler = functools.partial(tvm, scheduler="auto_scheduler")
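
# Minimal usage sketch (hypothetical example; assumes TVM is installed and that
# this module is importable as torch._dynamo.backends.tvm):
#
#   import torch
#   from torch._dynamo.backends.tvm import has_tvm, tvm_meta_schedule
#
#   def fn(x, y):
#       return torch.relu(x) + y
#
#   if has_tvm():
#       # "tvm" is registered above via @register_backend; the pre-configured
#       # partials can be passed directly as callable backends.
#       compiled = torch.compile(fn, backend="tvm")
#       tuned = torch.compile(fn, backend=tvm_meta_schedule)
#       out = compiled(torch.randn(8, 8), torch.randn(8, 8))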

def has_tvm():
    try:
        importlib.import_module("tvm")
        return True
    except ImportError:
        return False

@functools.lru_cache(None)
def llvm_target():
    # Guard the /proc/cpuinfo read so non-Linux hosts don't raise FileNotFoundError.
    if os.path.exists("/proc/cpuinfo") and "avx512" in open("/proc/cpuinfo").read():
        return "llvm -mcpu=skylake-avx512"
    return "llvm -mcpu=core-avx2"