File size: 17,082 Bytes
c61ccee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
//  Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/Tensor.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/mps/MPSAllocatorInterface.h>

#include <os/signpost.h>
#include <os/log.h>

#include <sstream>
#include <string>
#include <atomic>
#include <unordered_map>
#include <utility>
#include <ctime>

namespace at::mps {

namespace Profiler {

struct BaseInfo {
  // kinds of profiling entries tracked by the MPS profiler
  enum class Type {
    GRAPH,
    KERNEL,
    COPY,
    CPU_FALLBACK,
  };

  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
      type(infoType), profileId(Id), handle(Handle) { }
  virtual ~BaseInfo() = default;

  // which kind of profiling entry this is
  Type type;
  // unique profile ID for execution instances of operations or copies
  uint64_t profileId;
  // IDs generated by os_signpost; event-based and interval-based signposts
  // may be active simultaneously, so each gets its own ID.
  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
  // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
  std::atomic<double> totalGpuTime{0.0};
  // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
  std::atomic<double> totalSchedulingTime{0.0};
  // set once the operation or copy execution has completed
  std::atomic_bool completed{false};
  // opaque handle identifying this profile-info instance (usually a pointer value)
  const uintptr_t handle;

  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;

  // Builds a descriptive string for a tensor.
  // Format: Device[(buf#ID:retainCount)]:ScalarType[tensor.sizes()]
  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
    if (!tensor.defined()) {
      return "undefined";
    }
    std::stringstream desc;
    const auto deviceType = tensor.device().type();
    desc << c10::DeviceTypeName(deviceType);
    // see comments for INCLUDE_BUFFER_ID
    if (includeBufferId && deviceType == at::kMPS) {
      id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
      desc << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
           << ":" << buffer.retainCount << ")";
    }
    desc << ":" << tensor.scalar_type() << tensor.sizes();
    return desc.str();
  }

  // monotonic raw timestamp in nanoseconds
  static uint64_t getTime() {
    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
  }
};

struct OperationInfo : BaseInfo {
  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
      BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }

  // number of times this operation has executed
  uint64_t runCount = 0;
  // string key identifying the graph/kernel this info belongs to
  std::string strKey;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  // Builds a string for a kernel invocation by joining the kernel name with
  // descriptions of its tensors (format: kernelName:tensor1:tensor2:...)
  static std::string buildKernelString(const std::string& kernelName,
                                       const TensorList& tensors,
                                       bool includeBufferId = false) {
    std::stringstream result;
    result << kernelName;
    for (const Tensor& t : tensors) {
      result << ":" << BaseInfo::buildTensorString(t, includeBufferId);
    }
    return result.str();
  }
};

struct CpuFbInfo : BaseInfo {
  CpuFbInfo(uint64_t Id, const std::string& OpName) :
      BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }

  uint64_t runCount = 0;
  // the current and total overhead of copies in bytes required to convert the Op's
  // input tensors from MPS to CPU and then output from CPU back to MPS
  size_t currentCopyOverhead = 0;
  size_t totalCopyOverhead = 0;
  std::string opName;
  std::string strKey;
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  void updateCopyOverhead(const TensorList& tensors) {
    currentCopyOverhead = 0;
    for (const Tensor& tensor: tensors) {
      if (tensor.defined()) {
        currentCopyOverhead += tensor.nbytes();
      }
    }
    totalCopyOverhead += currentCopyOverhead;
  }
};

struct CopyInfo : BaseInfo {
  // direction of the copy relative to the MPS device
  enum class Kind {
    MPS_TO_MPS,
    MPS_TO_CPU,
    CPU_TO_MPS,
  };

  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
           BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
           length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }

  Kind kind;
  size_t length;
  bool isNonBlocking;
  bool usesBlitter;
  std::string srcStrKey;
  std::string dstStrKey;
  // for copies that don't go through a blitter, we measure CPU time instead
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);

  // Returns true if the tensor (when present) or the raw buffer resides on the MPS device.
  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
    if (tensor.has_value()) {
      return tensor->device().type() == at::kMPS;
    }
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
    // getUnalignedBufferSize() returns -1 when the buffer is not on the MPS device
    return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
  }

  // Classifies a copy by where its source and destination storages live.
  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
    const bool srcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
    const bool dstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
    // at least one side must involve the MPS device
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(srcOnMPS || dstOnMPS);
    if (srcOnMPS == dstOnMPS) {
      return Kind::MPS_TO_MPS;
    }
    return srcOnMPS ? Kind::MPS_TO_CPU : Kind::CPU_TO_MPS;
  }
};

// Aggregated statistics for one copy kind (MPS->MPS, MPS->CPU, or CPU->MPS).
// Reuses CopyInfo's accumulators (totalGpuTime, etc.) via inheritance; the
// base CopyInfo is constructed with null/zero placeholders since a stat
// entry does not describe a single concrete copy.
struct CopyStat : CopyInfo {
  explicit CopyStat(std::string CopyKindStr) :
          CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
  // total number of copies
  size_t totalCount = 0;
  // number of Scalar copies (i.e., less than sizeof(int64))
  size_t scalarsCount = 0;
  // number of blocking copies (i.e., require syncing to GPU)
  size_t blockingCount = 0;
  // number of copies that used memcpy(), instead of Metal Blit Encoder
  size_t memcpyCount = 0;
  // accumulated GPU time in ms for the scalar copies
  std::atomic<double> scalarsGpuTime{0.0};
  // copy kind in string type
  std::string kindStr;
};

// Profiler for MPS operations, copies, and CPU fallbacks. Emits os_signpost
// events/intervals for tracing in Instruments, and optionally logs per-op
// info and aggregate statistics. Configured via bit-flag options below.
class MPSProfiler {
public:
  // lower 16 bits used for profiler options
  enum ProfileOptions : uint32_t {
    OPTIONS_NONE = 0,
    // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
    // (used for convenience to not compute bit flags by OR-ing manually)
    // trace all signpost types using events
    ALL_SIGNPOST_EVENTS    = (1 << 0),
    // trace all signpost types using intervals
    ALL_SIGNPOST_INTERVALS = (1 << 1),
    // always wait for command buffer to finish executing after each commit
    WAIT_UNTIL_COMPLETED   = (1 << 2),
    // for interval-based signposts, include the scheduling portion of
    // Graph/Kernel/Copy executions as well.
    // if flag is disable, only "GPU run time" is included in interval,
    // and not schedule time.
    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),

    // use these if you need to trace signposts types individually (rarely required)
    // trace signpost using intervals
    USE_INTERVALS = (1 << 4),
    // trace signpost by emitting events
    USE_EVENTS    = (1 << 5),
    // used for sanity check (Change this when new option added)
    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
  };

  // when adding new types, #define the type string in MPSProfiler.mm as well.
  // upper 16 bits used for event types
  enum SignpostTypes : uint32_t {
    SIGNPOST_NONE = 0,
    // trace signposts for PyTorch operation executions
    RUN_OPERATION = (1 << 16),
    // trace signposts for blitter copies
    BLIT_COPY     = (1 << 17),
    // trace signposts for ops that fall back on CPU
    CPU_FALLBACK  = (1 << 18),
    // used for sanity check (Change this when new type added)
    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
  };

  enum LogOptions : uint32_t {
    LOG_NONE = 0,

    // Info logging options during execution
    // -------------------------------------
    // prints operation info (id/key/run_count) during execution
    OPERATION_INFO      = (1 << 0),
    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
    COPY_INFO           = (1 << 1),
    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
    CPU_FALLBACK_INFO   = (1 << 2),

    // Profiling Statistics logging options when process terminates
    // ------------------------------------------------------------
    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates
    // this is convenient to not combine following stats bit flags manually
    ALL_STATS           = (1 << 3),
    // prints operation stats (GPU times, run count, etc.) before process terminates
    OPERATION_STATS     = (1 << 4),
    // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates
    COPY_STATS          = (1 << 5),
    // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
    // for tensors, etc.) before process terminates
    CPU_FALLBACK_STATS  = (1 << 6),

    // Metadata format options when logging the info
    // ---------------------------------------------
    // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
    INCLUDE_GPU_TIME    = (1 << 7),
    // if enabled, includes GPU scheduling time in metadata separately
    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
    INCLUDE_KERNEL_TIME = (1 << 8),
    // if enabled, includes the unique buffer ID in metadata for the storage
    // of a tensor that was allocated on MPSAllocator. This is useful (along with
    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
    // with various operations.
    INCLUDE_BUFFER_ID   = (1 << 9),

    // used for sanity check (Change this when new option added)
    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
  };

  explicit MPSProfiler();
  ~MPSProfiler();

  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
                            const OptionalTensorRef srcTensor,
                            const OptionalTensorRef dstTensor,
                            size_t length, bool isNonBlocking, bool usesBlitter = true);
  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
  void beginProfileGPUInterval(const void* handle);

  void endProfileCopy(uint64_t profileId, SyncType syncType);
  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
  void endProfileCPUFallback(const std::string& opName);

  // these are used to hook into Python bindings for torch.mps.profiler module.
  // this enables generating OS Signpost traces from MPSProfiler on-demand
  // during runtime (instead of environment variables).
  // The "mode" could be either "interval", "event", or both "interval,event"
  // for interval-based and/or event-based signpost tracing.
  // NOTE: fully qualified as std::string (was unqualified "string", which
  // relied on a using-declaration leaking from transitively included headers).
  void StartTrace(const std::string& mode, bool waitUntilCompleted);
  void StopTrace();

  // convenience functions to indicate whether signpost tracing or
  // logging are enabled for the SignpostTypes
  bool isOperationProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
  }
  bool isCopyProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
  }
  bool isCPUFallbackProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
  }
  bool isSignpostTracingEnabled() const {
    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
  }

 private:
  // indicates what type of signpost types are enabled and traced by MPS profiler.
  uint32_t m_signpost_types = 0;
  uint32_t m_profile_options = 0;
  uint32_t m_log_options = 0;
  uint64_t m_kernel_counter = 0;
  uint64_t m_graph_counter = 0;
  uint64_t m_cpu_fb_counter = 0;
  uint64_t m_copy_counter = 0;
  // technically, it's possible to trace both events and intervals at the same time
  // so we use separate os_log categories for them
  os_log_t m_os_log_events;
  os_log_t m_os_log_intervals;
  // stats logging could run either from destructor or signal handler
  // so this is used to check if logging has already started.
  std::atomic_bool hasLoggedStats{false};
  // indicates there are pending completionHandler callbacks that haven't been called yet.
  std::atomic_bool hasPendingCompletionHandlers{false};
  // used to capture sigint signal to log profiling stats
  static struct sigaction currentSigint, previousSigint;

  // We use the following lists for two reasons:
  // 1- for interval-based signposts the "begin" point won't be in same function
  // as the "end" point where we need to be able to retrieve signpost's info
  // 2- if Operations info need to be logged when process ends using LogOptions::OPERATION_INFO.

  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
  // the string key for this map is the op name that we fall back to execute on CPU
  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
  // this list contains the info for copies, and its key is the unique profileId
  // which is generated from m_copy_counter
  // The copyInfo list is not retained.
  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
  // a short list that contains copy stats
  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};

  void initialize();
  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
                           os_signpost_id_t interval_signpost_id,
                           double gpuTime, double schedulingTime);
  void addProfilerScheduledHandler(BaseInfo& info);
  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                         const std::string& msg) const;
  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                             const std::string& msg) const;
  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;

  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
  // returns true if logging the profiling info "during the execution" is enabled
  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
  // logs all the profiling stats that are enabled
  void logProfilingStats();
  // logs kernel profiling stats when the process ends.
  void logOperationsProfilingStats(std::FILE* f) const;
  // logs CPU Fallback profiling stats when the process ends.
  void logCPUFallbackProfilingStats(std::FILE* f) const;
  // logs copy profiling stats when the process ends.
  void logCopyProfilingStats(std::FILE* f) const;

  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
  static void handleIntSignal(int signal);
};

} // namespace Profiler

Profiler::MPSProfiler& getMPSProfiler();

} // namespace at::mps