# NOTE(review): removed stray non-Python text ("Spaces:" / "Sleeping") that
# appeared at the top of the file — it was not valid Python and looked like
# an accidental paste/extraction artifact.
import bisect | |
import itertools | |
import math | |
from collections import defaultdict, namedtuple | |
from operator import attrgetter | |
from typing import Any, Dict, List, Optional, Tuple | |
import torch | |
from torch.autograd import DeviceType | |
# Explicit public API of this module; everything else is an internal helper.
__all__ = [
    "EventList",
    "FormattedTimesMixin",
    "Interval",
    "Kernel",
    "FunctionEvent",
    "FunctionEventAvg",
    "StringTable",
    "MemRecordsAcc",
]
class EventList(list):
    """A list of Events (for pretty printing)."""

    def __init__(self, *args, **kwargs):
        # Pop our keyword-only configuration before forwarding the rest to
        # the underlying list constructor.
        use_cuda = kwargs.pop("use_cuda", True)
        use_device = kwargs.pop("use_device", None)
        profile_memory = kwargs.pop("profile_memory", False)
        with_flops = kwargs.pop("with_flops", False)
        super().__init__(*args, **kwargs)
        self._use_cuda = use_cuda
        self._use_device = use_device
        self._profile_memory = profile_memory
        self._tree_built = False
        self._with_flops = with_flops

    def _build_tree(self):
        # Order matters: parent/child links must exist before duplicate
        # removal (which rewires them) and stack propagation (which walks them).
        self._populate_cpu_children()
        self._remove_dup_nodes()
        self._set_backward_stacktraces()
        self._tree_built = True

    def __str__(self):
        return self.table()

    def _remove_dup_nodes(self):
        # Repeatedly collapse an event into its parent when the parent has the
        # same name and exactly one child (i.e. a redundant nesting level):
        # the child's children and kernels are lifted into the parent.
        # Loop until a full pass finds nothing to collapse.
        while True:
            to_delete = set()
            for idx in range(len(self)):
                if (
                    self[idx].cpu_parent is not None
                    and self[idx].cpu_parent.name == self[idx].name
                    and len(self[idx].cpu_parent.cpu_children) == 1
                ):
                    self[idx].cpu_parent.cpu_children = self[idx].cpu_children
                    self[idx].cpu_parent.kernels = self[idx].kernels  # lift kernels up
                    for ch in self[idx].cpu_children:
                        ch.cpu_parent = self[idx].cpu_parent
                    to_delete.add(idx)
            if len(to_delete) == 0:
                break
            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
            self.clear()
            self.extend(new_evts)

    def _populate_cpu_children(self):
        """Populate child events into each underlying FunctionEvent object.

        One event is a child of another if [s1, e1) is inside [s2, e2). Where
        s1 and e1 would be start and end of the child event's interval. And
        s2 and e2 start and end of the parent event's interval

        Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
        be a parent of two other intervals.

        If for any reason two intervals intersect only partially, this function
        will not record a parent child relationship between then.
        """
        # Some events can be async (i.e. start and end on different threads),
        # since it's generally undefined how to attribute children ranges to
        # async ranges, we do not use them when calculating nested ranges and stats
        sync_events = [
            evt
            for evt in self
            if not evt.is_async and evt.device_type == DeviceType.CPU
        ]
        events = sorted(
            sync_events,
            key=attrgetter("thread"),
        )
        # Group by both thread and node_id, so that events that happen to have
        # the same thread_id but are from different nodes aren't incorrectly
        # grouped together.
        threads = itertools.groupby(
            events, key=lambda event: (event.thread, event.node_id)
        )
        # For each thread we keep a stack of current nested parents.
        # We maintain the invariant that each interval is a subset of all other
        # intervals lower in the stack.
        #
        # First we sort the intervals by their start time. Then we iterate over them.
        # Every time we see a new interval we remove several parents from
        # the top until we restore the invariant. Then parent child relationship
        # if recorded if the stack is not empty.
        # Finally we add new interval to the list
        #
        # Algorithm has O(N * log(N)) complexity where N is number of
        # intervals
        for thread_id, thread_events in threads:
            # Sort by start ascending; ties broken by end descending so an
            # enclosing interval is visited before intervals it contains.
            thread_events_ = sorted(
                thread_events,
                key=lambda event: [event.time_range.start, -event.time_range.end],
            )
            current_events: List[FunctionEvent] = []
            for event in thread_events_:
                while len(current_events) > 0:
                    parent = current_events[-1]
                    if (
                        event.time_range.start >= parent.time_range.end
                        or event.time_range.end > parent.time_range.end
                    ):
                        # this can't be a parent
                        current_events.pop()
                    else:
                        parent.append_cpu_child(event)
                        assert (
                            event.cpu_parent is None
                        ), f"There is already a CPU parent event for {event.key}"
                        event.set_cpu_parent(parent)
                        break
                current_events.append(event)

    def _set_backward_stacktraces(self):
        # Copy the forward-pass stack trace onto each backward event that is
        # correlated with it via (sequence_nr, thread).
        def bw_parent(evt):
            # Walk up the parent chain to the enclosing backward-function
            # event, or None if this event is not part of a backward pass.
            if evt is None:
                return None
            elif evt.scope == 1:  # BACKWARD_FUNCTION
                return evt
            else:
                return bw_parent(evt.cpu_parent)

        fwd_stacks = {}
        for evt in self:
            if bw_parent(evt) is None and evt.stack is not None:
                t = (evt.sequence_nr, evt.thread)
                if t not in fwd_stacks:
                    fwd_stacks[t] = evt.stack
        for evt in self:
            p = bw_parent(evt)
            if p is not None:
                assert p.fwd_thread is not None
                t = (p.sequence_nr, p.fwd_thread)
                if t in fwd_stacks:
                    evt.stack = fwd_stacks[t]
                else:
                    evt.stack = []

    @property
    def self_cpu_time_total(self):
        # Exposed as a property (not a method) so that it can be read as a
        # plain attribute, consistent with the same-named metric on
        # FunctionEvent / FunctionEventAvg and with _build_table's usage.
        return sum([event.self_cpu_time_total for event in self])

    def table(
        self,
        sort_by=None,
        row_limit=100,
        max_src_column_width=75,
        max_name_column_width=55,
        max_shapes_column_width=80,
        header=None,
        top_level_events_only=False,
    ):
        """Print an EventList as a nicely formatted table.

        Args:
            sort_by (str, optional): Attribute used to sort entries. By default
                they are printed in the same order as they were registered.
                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
                ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``,
                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``.
            top_level_events_only(bool, optional): Boolean flag to determine the
                selection of events to display. If true, the profiler will only
                display events at top level like top-level invocation of python
                `lstm`, python `add` or other functions, nested events like low-level
                cpu/cuda ops events are omitted for profiler result readability.

        Returns:
            A string containing the table.
        """
        return _build_table(
            self,
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
            top_level_events_only=top_level_events_only,
        )

    def export_chrome_trace(self, path):
        """Export an EventList as a Chrome tracing tools file.

        The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.

        Args:
            path (str): Path where the trace will be written.
        """
        import os

        device_name = "cuda" if not self._use_device else self._use_device
        with open(path, "w") as f:
            chrome_events = []
            next_id = 0
            # Use file IO over using json.dump since JSON dumping is very slow and
            # this technique is proven to give a 4x speedup.
            f.write("[")
            for evt in self:
                if evt.trace_name is None:
                    continue
                f.write(
                    '{{"name": "{}", '
                    '"ph": "X", '
                    '"ts": {}, '
                    '"dur": {}, '
                    '"tid": {}, '
                    '"pid": "CPU functions", '
                    '"args": {{}}}}, '.format(
                        evt.trace_name,
                        evt.time_range.start,
                        evt.time_range.elapsed_us(),
                        evt.thread
                        if not evt.is_remote
                        else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
                    )
                )
                for k in evt.kernels:
                    # 's' and 'f' draw Flow arrows from
                    # the CPU launch to the GPU kernel
                    f.write(
                        f'{{"name": "{evt.trace_name}", '
                        '"ph": "s", '
                        f'"ts": {evt.time_range.start}, '
                        f'"tid": {evt.thread}, '
                        '"pid": "CPU functions", '
                        f'"id": {next_id}, '
                        f'"cat": "cpu_to_{device_name}", '
                        '"args": {}}, '
                    )
                    # Note: use torch.profiler to get device kernel trace
                    next_id += 1
            if len(self) > 0:
                # remove trailing whitespace and comma
                f.seek(f.tell() - 2, os.SEEK_SET)
                f.truncate()
            f.write("]")

    def supported_export_stacks_metrics(self):
        # Metric names accepted by export_stacks().
        return [
            "self_cpu_time_total",
            "self_cuda_time_total",
            "self_privateuse1_time_total",
        ]

    def export_stacks(self, path: str, metric: str):
        """Write events as collapsed-stack lines ("frame1;frame2 value"),
        suitable for flame-graph tooling.

        Raises:
            ValueError: if ``metric`` is not one of
                :meth:`supported_export_stacks_metrics`.
        """
        if metric not in self.supported_export_stacks_metrics():
            raise ValueError(
                "metric should be one of: "
                + str(self.supported_export_stacks_metrics())
            )
        # Frames may contain separator characters; normalize them to '_'.
        translate_table = str.maketrans(" ;\t\n", "____")
        with open(path, "w") as f:
            for evt in self:
                if evt.stack and len(evt.stack) > 0:
                    metric_value = getattr(evt, metric)
                    if int(metric_value) > 0:
                        stack_str = ""
                        for entry in reversed(evt.stack):
                            stack_str += entry.translate(translate_table)
                            stack_str += ";"
                        stack_str = stack_str[:-1] + " " + str(int(metric_value))
                        f.write(stack_str + "\n")

    def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
        """Averages all function events over their keys.

        Args:
            group_by_input_shapes: group entries by
                (event name, input shapes) rather than just event name.
                This is useful to see which input shapes contribute to the runtime
                the most and may help with size-specific optimizations or
                choosing the best candidates for quantization (aka fitting a roof line)

            group_by_stack_n: group by top n stack trace entries

        Returns:
            An EventList containing FunctionEventAvg objects.
        """
        assert self._tree_built
        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
            # Grouping key: name/node/device/legacy always participate;
            # shapes and stack prefix are opt-in.
            key = [
                str(event.key),
                str(event.node_id),
                str(event.device_type),
                str(event.is_legacy),
            ]
            if group_by_input_shapes:
                key.append(str(event.input_shapes))
            if group_by_stack_n > 0:
                key += event.stack[:group_by_stack_n]
            return tuple(key)

        for evt in self:
            stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)
        avg_list = EventList(
            stats.values(),
            use_cuda=self._use_cuda,
            use_device=self._use_device,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
        )
        for evt in avg_list:
            evt.stack = evt.stack[:group_by_stack_n]
            if not group_by_input_shapes:
                evt.input_shapes = ""
        return avg_list

    def total_average(self):
        """Averages all events.

        Returns:
            A FunctionEventAvg object.
        """
        total_stat = FunctionEventAvg()
        for evt in self:
            total_stat += evt
            # Reset the key each iteration so FunctionEventAvg.add() does not
            # reject events with differing keys.
            total_stat.key = None
        total_stat.key = "Total"
        return total_stat
def _format_time(time_us): | |
"""Define how to format time in FunctionEvent.""" | |
US_IN_SECOND = 1000.0 * 1000.0 | |
US_IN_MS = 1000.0 | |
if time_us >= US_IN_SECOND: | |
return f"{time_us / US_IN_SECOND:.3f}s" | |
if time_us >= US_IN_MS: | |
return f"{time_us / US_IN_MS:.3f}ms" | |
return f"{time_us:.3f}us" | |
def _format_time_share(time_us, total_time_us): | |
"""Define how to format time in FunctionEvent.""" | |
if total_time_us == 0: | |
assert time_us == 0, f"Expected time_us == 0 but got {time_us}" | |
return "NaN" | |
return f"{time_us * 100.0 / total_time_us:.2f}%" | |
def _format_memory(nbytes): | |
"""Return a formatted memory size string.""" | |
KB = 1024 | |
MB = 1024 * KB | |
GB = 1024 * MB | |
if abs(nbytes) >= GB: | |
return f"{nbytes * 1.0 / GB:.2f} Gb" | |
elif abs(nbytes) >= MB: | |
return f"{nbytes * 1.0 / MB:.2f} Mb" | |
elif abs(nbytes) >= KB: | |
return f"{nbytes * 1.0 / KB:.2f} Kb" | |
else: | |
return str(nbytes) + " b" | |
def _attr_formatter(name):
    """Return a read-only property that renders attribute *name* through
    :func:`_format_time` (evaluated lazily, at attribute access time)."""
    return property(lambda self: _format_time(getattr(self, name)))


class FormattedTimesMixin:
    """Helpers for FunctionEvent and FunctionEventAvg.

    The subclass should define `*_time_total` and `count` attributes.
    """

    cpu_time_str = _attr_formatter("cpu_time")
    cuda_time_str = _attr_formatter("cuda_time")
    privateuse1_time_str = _attr_formatter("privateuse1_time")
    cpu_time_total_str = _attr_formatter("cpu_time_total")
    cuda_time_total_str = _attr_formatter("cuda_time_total")
    privateuse1_time_total_str = _attr_formatter("privateuse1_time_total")
    self_cpu_time_total_str = _attr_formatter("self_cpu_time_total")
    self_cuda_time_total_str = _attr_formatter("self_cuda_time_total")
    self_privateuse1_time_total_str = _attr_formatter("self_privateuse1_time_total")

    # These must be properties: the `*_str` formatters above do
    # `getattr(self, "cpu_time")` and pass the result to a float format
    # spec, which would raise TypeError on a bound method.
    @property
    def cpu_time(self):
        # Average CPU time per call; 0.0 when nothing was recorded.
        return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count  # type: ignore[attr-defined]

    @property
    def cuda_time(self):
        # Average CUDA time per call; 0.0 when nothing was recorded.
        return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count  # type: ignore[attr-defined]

    @property
    def privateuse1_time(self):
        # Average PrivateUse1-device time per call; 0.0 when nothing was recorded.
        return 0.0 if self.count == 0 else 1.0 * self.privateuse1_time_total / self.count  # type: ignore[attr-defined]
class Interval:
    """A simple [start, end] time span (units follow the ``_us`` naming
    used by callers, i.e. microseconds)."""

    def __init__(self, start, end):
        self.start, self.end = start, end

    def elapsed_us(self):
        r"""
        Returns the length of the interval
        """
        return self.end - self.start
Kernel = namedtuple("Kernel", ["name", "device", "duration"]) | |
class FunctionEvent(FormattedTimesMixin):
    """Profiling information about a single function."""

    def __init__(
        self,
        id,
        name,
        thread,
        start_us,
        end_us,
        fwd_thread=None,
        input_shapes=None,
        stack=None,
        scope=0,
        use_device=None,
        cpu_memory_usage=0,
        cuda_memory_usage=0,
        privateuse1_memory_usage=0,
        is_async=False,
        is_remote=False,
        sequence_nr=-1,
        node_id=-1,
        device_type=DeviceType.CPU,
        device_index=0,
        is_legacy=False,
        flops=None,
        trace_name=None,
        concrete_inputs=None,
    ):
        self.id: int = id
        self.node_id: int = node_id
        self.name: str = name
        self.trace_name: str = trace_name
        self.time_range: Interval = Interval(start_us, end_us)
        self.thread: int = thread
        self.fwd_thread: Optional[int] = fwd_thread
        self.kernels: List[Kernel] = []
        self.count: int = 1
        self.cpu_children: List[FunctionEvent] = []
        self.cpu_parent: Optional[FunctionEvent] = None
        self.input_shapes: Tuple[int, ...] = input_shapes
        self.concrete_inputs: List[Any] = concrete_inputs
        self.stack: List = stack
        self.scope: int = scope
        self.use_device: Optional[str] = use_device
        self.cpu_memory_usage: int = cpu_memory_usage
        self.cuda_memory_usage: int = cuda_memory_usage
        self.privateuse1_memory_usage: int = privateuse1_memory_usage
        self.is_async: bool = is_async
        self.is_remote: bool = is_remote
        self.sequence_nr: int = sequence_nr
        self.device_type: DeviceType = device_type
        self.device_index: int = device_index
        self.is_legacy: bool = is_legacy
        self.flops: Optional[int] = flops

    def append_kernel(self, name, device, duration):
        # Kernels are attributed to CPU-side launch events only.
        assert self.device_type == DeviceType.CPU
        self.kernels.append(Kernel(name, device, duration))

    def append_cpu_child(self, child):
        """Append a CPU child of type FunctionEvent.

        One is supposed to append only direct children to the event to have
        correct self cpu time being reported.
        """
        assert self.device_type == DeviceType.CPU
        assert isinstance(child, FunctionEvent)
        assert child.device_type == DeviceType.CPU
        self.cpu_children.append(child)

    def set_cpu_parent(self, parent):
        """Set the immediate CPU parent of type FunctionEvent.

        One profiling FunctionEvent should have only one CPU parent such that
        the child's range interval is completely inside the parent's. We use
        this connection to determine the event is from top-level op or not.
        """
        assert self.device_type == DeviceType.CPU
        assert isinstance(parent, FunctionEvent)
        assert parent.device_type == DeviceType.CPU
        self.cpu_parent = parent

    # Note: async events don't have children, are not used when computing 'self'
    # metrics of other events, have only total cpu time
    #
    # All derived metrics below are properties: the rest of this module reads
    # them as plain attributes (e.g. `self.cuda_time_total - sum(...)`,
    # `event.self_cuda_time_total > 0`), so exposing them as methods would
    # break that arithmetic.
    @property
    def self_cpu_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_memory_usage - sum(
            [child.cpu_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cuda_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cuda_memory_usage - sum(
            [child.cuda_memory_usage for child in self.cpu_children]
        )

    @property
    def self_privateuse1_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.privateuse1_memory_usage - sum(
            [child.privateuse1_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cpu_time_total(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_time_total - sum(
            [child.cpu_time_total for child in self.cpu_children]
        )

    @property
    def cuda_time_total(self):
        if self.is_async or self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return sum(kinfo.duration for kinfo in self.kernels) + sum(
                    ch.cuda_time_total for ch in self.cpu_children
                )
            else:
                # each legacy cpu events has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type == DeviceType.CUDA
            return self.time_range.elapsed_us()

    @property
    def self_cuda_time_total(self):
        if self.is_async or self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.cuda_time_total - sum(
                [child.cuda_time_total for child in self.cpu_children]
            )
        else:
            assert self.device_type == DeviceType.CUDA
            return self.cuda_time_total

    @property
    def cpu_time_total(self):
        if self.device_type == DeviceType.CPU:
            return self.time_range.elapsed_us()
        else:
            return 0

    @property
    def self_privateuse1_time_total(self):
        if self.is_async or not self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.privateuse1_time_total - sum(
                [child.privateuse1_time_total for child in self.cpu_children]
            )
        else:
            assert self.device_type == DeviceType.CUDA
            return self.privateuse1_time_total

    @property
    def privateuse1_time_total(self):
        if self.is_async or not self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return sum(kinfo.duration for kinfo in self.kernels) + sum(
                    ch.privateuse1_time_total for ch in self.cpu_children
                )
            else:
                # each legacy cpu events has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type == DeviceType.PrivateUse1
            return self.time_range.elapsed_us()

    @property
    def key(self):
        # Grouping/display key used by table building and key_averages().
        return self.name

    def __repr__(self):
        device_name = "cuda" if not self.use_device else self.use_device
        device_time = (
            self.cuda_time_str if not self.use_device else self.privateuse1_time_str
        )
        device_memory_usage = (
            self.cuda_memory_usage
            if not self.use_device
            else self.privateuse1_memory_usage
        )
        return (
            "<FunctionEvent id={} name={} device_type={} node_id={} cpu_time={} start_us={} end_us={} "
            "cpu_children={} {}_time={} name={} thread={} input_shapes={} "
            "cpu_memory_usage={} {}_memory_usage={} is_async={} is_remote={} seq_nr={} is_legacy={}>".format(
                self.id,
                self.name,
                self.device_type,
                self.node_id,
                self.cpu_time_str,
                self.time_range.start,
                self.time_range.end,
                str([child.id for child in self.cpu_children]),
                device_name,
                device_time,
                self.name,
                self.thread,
                str(self.input_shapes),
                self.cpu_memory_usage,
                device_name,
                device_memory_usage,
                self.is_async,
                self.is_remote,
                self.sequence_nr,
                self.is_legacy,
            )
        )
class FunctionEventAvg(FormattedTimesMixin):
    """Used to average stats over multiple FunctionEvent objects."""
    def __init__(self):
        # `key` stays None until the first event is added; add() uses that to
        # detect the first accumulation and copy descriptive fields over.
        self.key: Optional[str] = None
        self.count: int = 0
        self.node_id: int = 0
        self.is_async: bool = False
        self.is_remote: bool = False
        self.use_device: Optional[str] = None
        # Additive accumulators; totals are in the same units as the
        # corresponding FunctionEvent fields they sum.
        self.cpu_time_total: int = 0
        self.cuda_time_total: int = 0
        self.privateuse1_time_total: int = 0
        self.self_cpu_time_total: int = 0
        self.self_cuda_time_total: int = 0
        self.self_privateuse1_time_total: int = 0
        self.input_shapes: Optional[List[List[int]]] = None
        self.stack: Optional[List] = None
        self.scope: Optional[int] = None
        self.cpu_memory_usage: int = 0
        self.cuda_memory_usage: int = 0
        self.privateuse1_memory_usage: int = 0
        self.self_cpu_memory_usage: int = 0
        self.self_cuda_memory_usage: int = 0
        self.self_privateuse1_memory_usage: int = 0
        self.cpu_children: Optional[List[FunctionEvent]] = None
        self.cpu_parent: Optional[FunctionEvent] = None
        self.device_type: DeviceType = DeviceType.CPU
        self.is_legacy: bool = False
        self.flops: int = 0
    def add(self, other):
        """Accumulate another FunctionEvent/FunctionEventAvg with the same key.

        Returns self, so it can back ``__iadd__``.
        """
        if self.key is None:
            # First function being recorded as part of FunctionEventAvg, propagate
            # fields.
            self.key = other.key
            self.node_id = other.node_id
            self.is_async = other.is_async
            self.is_remote = other.is_remote
            self.cpu_parent = other.cpu_parent
            self.cpu_children = other.cpu_children
            self.input_shapes = other.input_shapes
            self.stack = other.stack
            self.scope = other.scope
            self.device_type = other.device_type
            self.is_legacy = other.is_legacy
            self.use_device = other.use_device
        assert isinstance(other, (FunctionEvent, FunctionEventAvg))
        assert other.key == self.key
        self.cpu_time_total += other.cpu_time_total
        self.cuda_time_total += other.cuda_time_total
        self.privateuse1_time_total += other.privateuse1_time_total
        self.self_cpu_time_total += other.self_cpu_time_total
        self.self_cuda_time_total += other.self_cuda_time_total
        self.self_privateuse1_time_total += other.self_privateuse1_time_total
        self.cpu_memory_usage += other.cpu_memory_usage
        self.cuda_memory_usage += other.cuda_memory_usage
        self.privateuse1_memory_usage += other.privateuse1_memory_usage
        self.self_cpu_memory_usage += other.self_cpu_memory_usage
        self.self_cuda_memory_usage += other.self_cuda_memory_usage
        self.self_privateuse1_memory_usage += other.self_privateuse1_memory_usage
        self.count += other.count
        # flops is initialized to 0, so the None branch only triggers if a
        # prior add() copied None from an event without flops data.
        if self.flops is None:
            self.flops = other.flops
        elif other.flops is not None:
            self.flops += other.flops
        return self
    def __iadd__(self, other):
        return self.add(other)
    def __repr__(self):
        # Report CUDA columns by default; switch to the PrivateUse1 backend's
        # numbers when use_device is set.
        device_name = "cuda" if not self.use_device else self.use_device
        self_device_time = (
            self.self_cuda_time_total_str
            if not self.use_device
            else self.self_privateuse1_time_total_str
        )
        device_time = (
            self.cuda_time_str if not self.use_device else self.privateuse1_time_str
        )
        device_memory = (
            self.cuda_memory_usage
            if not self.use_device
            else self.privateuse1_memory_usage
        )
        return (
            "<FunctionEventAvg key={} self_cpu_time={} cpu_time={} "
            " self_{}_time={} {}_time={} input_shapes={} "
            "cpu_memory_usage={} {}_memory_usage={}>".format(
                self.key,
                self.self_cpu_time_total_str,
                self.cpu_time_str,
                device_name,
                self_device_time,
                device_name,
                device_time,
                str(self.input_shapes),
                self.cpu_memory_usage,
                device_name,
                device_memory,
            )
        )
class StringTable(defaultdict):
    """Memoizing map from raw (possibly mangled C++) names to display names."""

    def __missing__(self, key):
        # manage cases like 't' (demangled to 'unsigned short') separately,
        # for now simply check the length to avoid unexpected results for
        # the short sequences
        demangled = torch._C._demangle(key) if len(key) > 1 else key
        self[key] = demangled
        return demangled
class MemRecordsAcc:
    """Acceleration structure for accessing mem_records in interval."""

    def __init__(self, mem_records):
        self._mem_records = mem_records
        # Start times sorted ascending, with a parallel list mapping each
        # sorted position back to its index in `mem_records`.
        self._start_uses: List[int] = []
        self._indices: List[int] = []
        if mem_records:
            ordered = sorted(
                (rec[0].start_us(), idx) for idx, rec in enumerate(mem_records)
            )
            self._start_uses, self._indices = zip(*ordered)  # type: ignore[assignment]

    def in_interval(self, start_us, end_us):
        r"""
        Return all records in the given interval
        """
        # Binary-search the sorted start times for the [start_us, end_us] slice.
        lo = bisect.bisect_left(self._start_uses, start_us)
        hi = bisect.bisect_right(self._start_uses, end_us)
        for pos in range(lo, hi):
            yield self._mem_records[self._indices[pos]]
def _filter_stack_entry(entry): | |
filtered_entries = [ | |
("autograd/__init__", "_make_grads"), | |
("autograd/__init__", "backward"), | |
("torch/tensor", "backward"), | |
("_internal/common_utils", "prof_callable"), | |
("_internal/common_utils", "prof_func_call"), | |
("_internal/common_utils", "prof_meth_call"), | |
] | |
return all(not (f[0] in entry and f[1] in entry) for f in filtered_entries) | |
MEMORY_EVENT_NAME = "[memory]" | |
OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]" | |
def _filter_name(name): | |
# ignoring the following utility ops | |
filtered_out_names = [ | |
MEMORY_EVENT_NAME, # used only for the top-level memory events | |
OUT_OF_MEMORY_EVENT_NAME, | |
"profiler::_record_function_enter", | |
"profiler::_record_function_enter_new", | |
"profiler::_record_function_exit", | |
"aten::is_leaf", | |
"aten::output_nr", | |
"aten::_version", | |
] | |
return name in filtered_out_names | |
def _rewrite_name(name, with_wildcard=False):
    """Demangle and optionally rewrite the provided event name.

    with_wildcard - whether to replace certain numbered event names with a
    wildcard name to aggregate them together in the profiler table output.
    """
    demangled = StringTable()[name]
    if with_wildcard and demangled.startswith("ProfilerStep#"):
        demangled = "ProfilerStep*"
    return demangled
def _build_table(
    events,
    sort_by=None,
    header=None,
    row_limit=100,
    max_src_column_width=75,
    max_name_column_width=55,
    max_shapes_column_width=80,
    with_flops=False,
    profile_memory=False,
    top_level_events_only=False,
):
    """Print a summary of events (which can be a list of FunctionEvent or FunctionEventAvg).

    Returns the formatted table as a single string ("" for no events).
    Columns are added dynamically based on which kinds of data (CUDA time,
    device memory, shapes, stacks, flops, ...) are actually present.
    """
    if len(events) == 0:
        return ""
    # Probe the events once to decide which optional column groups to emit.
    has_cuda_time = any(event.self_cuda_time_total > 0 for event in events)
    has_cuda_mem = any(event.self_cuda_memory_usage > 0 for event in events)
    has_privateuse1_time = any(
        event.self_privateuse1_time_total > 0 for event in events
    )
    has_privateuse1_mem = any(
        event.self_privateuse1_memory_usage > 0 for event in events
    )
    use_device = events[0].use_device
    # PrivateUse1 columns are labeled with the device name, so it must be set.
    if not use_device and (has_privateuse1_mem or has_privateuse1_time):
        raise RuntimeError(
            "use_device is None, but there is private device performance data."
        )
    has_input_shapes = any(
        (event.input_shapes is not None and len(event.input_shapes) > 0)
        for event in events
    )
    if sort_by is not None:
        # NOTE(review): the re-wrapped EventList passes `use_cuda=has_cuda_time`
        # rather than the original list's _use_cuda — presumably intentional,
        # but worth confirming.
        events = EventList(
            sorted(events, key=lambda evt: getattr(evt, sort_by), reverse=True),
            use_cuda=has_cuda_time,
            use_device=use_device,
            profile_memory=profile_memory,
            with_flops=with_flops,
        )
    # Column widths: sized to the longest content, then clamped to the caller
    # supplied maxima (the +4 leaves padding).
    name_column_width = max([len(evt.key) for evt in events]) + 4
    if max_name_column_width is not None:
        name_column_width = min(name_column_width, max_name_column_width)
    shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4
    if max_shapes_column_width is not None:
        shapes_column_width = min(shapes_column_width, max_shapes_column_width)
    DEFAULT_COLUMN_WIDTH = 12
    flops_column_width = DEFAULT_COLUMN_WIDTH
    src_column_width = None
    stacks = []
    for evt in events:
        if evt.stack is not None and len(evt.stack) > 0:
            stacks.append(evt.stack)
    has_stack = len(stacks) > 0
    if has_stack:
        src_column_width = (
            max([max([len(entry) for entry in stack]) for stack in stacks]) + 4
        )
        if max_src_column_width is not None:
            src_column_width = min(src_column_width, max_src_column_width)
    # Base columns always present; optional groups appended below in a fixed
    # order that must match the row_values construction further down.
    headers = [
        "Name",
        "Self CPU %",
        "Self CPU",
        "CPU total %",
        "CPU total",
        "CPU time avg",
    ]
    if has_cuda_time:
        headers.extend(
            [
                "Self CUDA",
                "Self CUDA %",
                "CUDA total",
                "CUDA time avg",
            ]
        )
    if has_privateuse1_time:
        privateuse1 = use_device.upper()
        headers.extend(
            [
                f"Self {privateuse1}",
                f"Self {privateuse1} %",
                f"{privateuse1} total",
                f"{privateuse1} time avg",
            ]
        )
    if profile_memory:
        headers.extend(
            [
                "CPU Mem",
                "Self CPU Mem",
            ]
        )
        if has_cuda_mem:
            headers.extend(
                [
                    "CUDA Mem",
                    "Self CUDA Mem",
                ]
            )
        if has_privateuse1_mem:
            privateuse1 = use_device.upper()
            headers.extend(
                [
                    f"{privateuse1} Mem",
                    f"Self {privateuse1} Mem",
                ]
            )
    headers.append("# of Calls")
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any(evt.node_id != -1 for evt in events)
    if append_node_id:
        headers.append("Node ID")
    # Have to use a list because nonlocal is Py3 only...
    SPACING_SIZE = 2
    row_format_lst = [""]
    header_sep_lst = [""]
    line_length_lst = [-SPACING_SIZE]
    MAX_STACK_ENTRY = 5

    def add_column(padding, text_dir=">"):
        # Grow the shared row format / separator / total line length by one
        # column of the given width and alignment.
        row_format_lst[0] += (
            "{: " + text_dir + str(padding) + "}" + (" " * SPACING_SIZE)
        )
        header_sep_lst[0] += "-" * padding + (" " * SPACING_SIZE)
        line_length_lst[0] += padding + SPACING_SIZE

    def auto_scale_flops(flops):
        # Choose a FLOPs unit so the smallest value prints in a sane range;
        # returns (multiplier to apply to raw flops, column header label).
        flop_headers = [
            "FLOPs",
            "KFLOPs",
            "MFLOPs",
            "GFLOPs",
            "TFLOPs",
            "PFLOPs",
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])

    add_column(name_column_width)
    for _ in headers[1:]:
        add_column(DEFAULT_COLUMN_WIDTH)
    if has_input_shapes:
        headers.append("Input Shapes")
        add_column(shapes_column_width)
    if has_stack:
        headers.append("Source Location")
        add_column(src_column_width, text_dir="<")
    if with_flops:
        # Auto-scaling of flops header
        raw_flops = []
        for evt in events:
            if evt.flops > 0:
                raw_flops.append(evt.flops)
        if len(raw_flops) != 0:
            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
            headers.append(f"Total {flops_header}")
            add_column(flops_column_width)
        else:
            with_flops = False  # can't find any valid flops
    row_format = row_format_lst[0]
    header_sep = header_sep_lst[0]
    line_length = line_length_lst[0]
    add_column = None  # type: ignore[assignment]
    # Have to use a list because nonlocal is Py3 only...
    result = []

    def append(s):
        result.append(s)
        result.append("\n")  # Yes, newline after the end as well

    # Totals for the percentage columns and the footer summary lines.
    sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events])
    sum_self_cuda_time_total = 0
    sum_self_privateuse1_time_total = 0
    for evt in events:
        if evt.device_type == DeviceType.CPU:
            # in legacy profiler, kernel info is stored in cpu events
            if evt.is_legacy:
                if not use_device:
                    sum_self_cuda_time_total += evt.self_cuda_time_total
                else:
                    sum_self_privateuse1_time_total += evt.self_privateuse1_time_total
        elif evt.device_type == DeviceType.CUDA:
            # in kineto profiler, there're events with the correct device type (e.g. CUDA)
            sum_self_cuda_time_total += evt.self_cuda_time_total
        elif evt.device_type == DeviceType.PrivateUse1:
            sum_self_privateuse1_time_total += evt.self_privateuse1_time_total
    # Actual printing
    if header is not None:
        append("=" * line_length)
        append(header)
    if top_level_events_only:
        append("=" * line_length)
        append("This report only display top-level ops statistics")
    append(header_sep)
    append(row_format.format(*headers))
    append(header_sep)

    def trim_path(path, src_column_width):
        # Keep the tail of over-long paths and mark the truncation with "...".
        if len(path) > src_column_width:
            offset = len(path) - src_column_width
            path = path[offset:]
            if len(path) > 3:
                path = "..." + path[3:]
        return path

    event_limit = 0
    for evt in events:
        if event_limit == row_limit:
            break
        if top_level_events_only and evt.cpu_parent is not None:
            continue
        else:
            event_limit += 1
        name = evt.key
        if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
            name = name[: (max_name_column_width - 3)] + "..."
        # Build the row in the exact same column order as `headers` above.
        row_values = [
            name,
            # Self CPU total %, 0 for async events.
            _format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total),
            evt.self_cpu_time_total_str,  # Self CPU total
            # CPU total %, 0 for async events.
            _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total)
            if not evt.is_async
            else 0,
            evt.cpu_time_total_str,  # CPU total
            evt.cpu_time_str,  # CPU time avg
        ]
        if has_cuda_time:
            row_values.extend(
                [
                    evt.self_cuda_time_total_str,
                    # CUDA time total %
                    _format_time_share(
                        evt.self_cuda_time_total, sum_self_cuda_time_total
                    ),
                    evt.cuda_time_total_str,
                    evt.cuda_time_str,  # Cuda time avg
                ]
            )
        if has_privateuse1_time:
            row_values.extend(
                [
                    evt.self_privateuse1_time_total_str,
                    # PrivateUse1 time total %
                    _format_time_share(
                        evt.self_privateuse1_time_total, sum_self_privateuse1_time_total
                    ),
                    evt.privateuse1_time_total_str,
                    evt.privateuse1_time_str,  # PrivateUse1 time avg
                ]
            )
        if profile_memory:
            row_values.extend(
                [
                    # CPU Mem Total
                    _format_memory(evt.cpu_memory_usage),
                    # Self CPU Mem Total
                    _format_memory(evt.self_cpu_memory_usage),
                ]
            )
            if has_cuda_mem:
                row_values.extend(
                    [
                        # CUDA Mem Total
                        _format_memory(evt.cuda_memory_usage),
                        # Self CUDA Mem Total
                        _format_memory(evt.self_cuda_memory_usage),
                    ]
                )
            if has_privateuse1_mem:
                row_values.extend(
                    [
                        # PrivateUse1 Mem Total
                        _format_memory(evt.privateuse1_memory_usage),
                        # Self PrivateUse1 Mem Total
                        _format_memory(evt.self_privateuse1_memory_usage),
                    ]
                )
        row_values.append(
            evt.count,  # Number of calls
        )
        if append_node_id:
            row_values.append(evt.node_id)
        if has_input_shapes:
            row_values.append(str(evt.input_shapes)[:shapes_column_width])
        if with_flops:
            if evt.flops <= 0:
                row_values.append("--")
            else:
                row_values.append(f"{evt.flops * flops_scale:8.3f}")  # type: ignore[possibly-undefined]
        if has_stack:
            src_field = ""
            if len(evt.stack) > 0:
                src_field = trim_path(evt.stack[0], src_column_width)
            row_values.append(src_field)
        append(row_format.format(*row_values))
        if has_stack:
            # Remaining stack frames get their own rows with every other
            # column blanked out.
            empty_headers = [""] * (len(headers) - 1)
            for entry in evt.stack[1:MAX_STACK_ENTRY]:
                append(
                    row_format.format(
                        *(empty_headers + [trim_path(entry, src_column_width)])
                    )
                )
            empty_headers.append("")
            append(row_format.format(*empty_headers))
    append(header_sep)
    append(f"Self CPU time total: {_format_time(sum_self_cpu_time_total)}")
    if has_cuda_time:
        append(f"Self CUDA time total: {_format_time(sum_self_cuda_time_total)}")
    if has_privateuse1_time:
        append(
            f"Self {use_device.upper()} time total: {_format_time(sum_self_privateuse1_time_total)}"
        )
    return "".join(result)