# mypy: ignore-errors
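"""TVM backend for TorchDynamo.

Traces the incoming FX ``GraphModule`` with ``torch.jit.trace``, imports it
into TVM's Relay IR, optionally tunes it with ``auto_scheduler`` or
``meta_schedule``, and returns a callable that runs the compiled module
through TVM's graph executor.
"""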

import functools
import importlib
import logging
import os
import tempfile

import torch

from .common import device_from_inputs, fake_tensor_unsupported
from .registry import register_backend

log = logging.getLogger(__name__)


@register_backend
@fake_tensor_unsupported
def tvm(gm, example_inputs, *, scheduler=None, trials=20000):
    import tvm  # type: ignore[import]
    from tvm import relay  # type: ignore[import]
    from tvm.contrib import graph_executor  # type: ignore[import]

    jit_mod = torch.jit.trace(gm, example_inputs)
    device = device_from_inputs(example_inputs)
    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
    example_outputs = gm(*example_inputs)
    if len(example_outputs) == 0:
        log.warning("Explicitly falling back to eager: the graph has zero outputs")
        return gm.forward
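
    # Lower the traced module to TVM's Relay IR. The inp_{idx} names in
    # shape_list are reused by exec_tvm below to bind runtime arguments.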
    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
    if device.type == "cuda":
        dev = tvm.cuda(device.index)
        target = tvm.target.cuda()
    else:
        dev = tvm.cpu(0)
        target = tvm.target.Target(llvm_target())

    if scheduler is None:
        scheduler = os.environ.get("TVM_SCHEDULER", None)
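
    # Recognized schedulers: "default" (no autotuning), "auto_scheduler",
    # and "meta_schedule"; anything else raises NotImplementedError below.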
if scheduler == "auto_scheduler": | |
from tvm import auto_scheduler | |
log_file = tempfile.NamedTemporaryFile() | |
if not os.path.exists(log_file): | |
tasks, task_weights = auto_scheduler.extract_tasks( | |
mod["main"], params, target | |
) | |
for task in tasks: | |
print(task.compute_dag) | |
else: | |
print("No tasks") | |
if len(tasks) != 0: | |
tuner = auto_scheduler.TaskScheduler(tasks, task_weights) | |
if not os.path.exists(log_file): | |
assert trials > 0 | |
tune_option = auto_scheduler.TuningOptions( | |
num_measure_trials=trials, | |
measure_callbacks=[auto_scheduler.RecordToFile(log_file)], | |
early_stopping=2000, | |
) | |
try: | |
tuner.tune(tune_option) | |
except Exception: | |
if os.path.exists(log_file): | |
os.unlink(log_file) | |
raise | |
with auto_scheduler.ApplyHistoryBest(log_file): | |
with tvm.transform.PassContext( | |
opt_level=3, config={"relay.backend.use_auto_scheduler": True} | |
): | |
lib = relay.build(mod, target=target, params=params) | |
elif scheduler == "meta_schedule": | |
from tvm import meta_schedule as ms | |
with tempfile.TemporaryDirectory() as work_dir: | |
if device.type != "cuda": | |
# meta_schedule needs num-cores to be specified | |
# here we use the maximum core count | |
target = tvm.target.Target( | |
f"{llvm_target()} --num-cores {ms.utils.cpu_count(logical=False)}" | |
) | |
# TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch | |
# once USE_PT_TVMDSOOP is updated and turned on by default in TVM. | |
database = ms.relay_integration.tune_relay( | |
mod=mod, | |
target=target, | |
work_dir=work_dir, | |
max_trials_global=20000, | |
num_trials_per_iter=64, | |
params=params, | |
strategy="evolutionary", | |
) | |
lib = ms.relay_integration.compile_relay( | |
database=database, | |
mod=mod, | |
target=target, | |
params=params, | |
) | |
elif scheduler == "default" or not scheduler: | |
# no autotuning | |
with tvm.transform.PassContext(opt_level=10): | |
lib = relay.build(mod, target=target, params=params) | |
else: | |
raise NotImplementedError( | |
"This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. " | |
"There are three available options: default, auto_scheduler and meta_schedule." | |
) | |
    m = graph_executor.GraphModule(lib["default"](dev))
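    # Note: lib["default"] returns the factory for the compiled runtime
    # module; calling it on `dev` instantiates the module on that device.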

    def to_torch_tensor(nd_tensor):
        """Convert a TVM NDArray to a torch.Tensor."""
        if nd_tensor.dtype == "bool":
            # DLPack does not support booleans, so this case can't be handled
            # by torch.utils.dlpack.from_dlpack. Work around it by going
            # through numpy, although this adds a data-copy overhead.
            return torch.from_numpy(nd_tensor.numpy())
        return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())

    def to_tvm_tensor(torch_tensor):
        """Convert a torch.Tensor to a TVM NDArray."""
        if torch_tensor.dtype == torch.bool:
            # Same reason as above: fall back to a numpy conversion, which
            # can introduce data-copy overhead.
            return tvm.nd.array(torch_tensor.cpu().numpy())
        return tvm.nd.from_dlpack(torch_tensor)

    def exec_tvm(*i_args):
        args = [a.contiguous() for a in i_args]
        shape_info, _ = m.get_input_info()
        active_inputs = {name for name, _ in shape_info.items()}
        for idx, arg in enumerate(args):
            if arg.dim() != 0:
                if arg.requires_grad:
                    arg = arg.detach()
                inp_name = f"inp_{idx}"
                if inp_name not in active_inputs:
                    log.warning(
                        "input %s skipped as not found in tvm's runtime library",
                        inp_name,
                    )
                    continue
                m.set_input(
                    inp_name,
                    to_tvm_tensor(arg),
                )
        m.run()
        return [to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())]

    return exec_tvm


tvm_meta_schedule = functools.partial(tvm, scheduler="meta_schedule")
tvm_auto_scheduler = functools.partial(tvm, scheduler="auto_scheduler")
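
# Usage sketch (assumes a working local TVM installation): once this module
# is imported, the backend is registered under its function name, so it can
# be selected by name through torch.compile, e.g.
#
#     opt_mod = torch.compile(mod, backend="tvm")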


def has_tvm():
    try:
        importlib.import_module("tvm")
        return True
    except ImportError:
        return False


@functools.lru_cache(None)
def llvm_target():
    # Note: /proc/cpuinfo only exists on Linux; on other platforms this
    # will raise. Prefer a wider vector target when AVX-512 is available.
    if "avx512" in open("/proc/cpuinfo").read():
        return "llvm -mcpu=skylake-avx512"
    return "llvm -mcpu=core-avx2"