// Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/Tensor.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/mps/MPSAllocatorInterface.h>

#include <os/signpost.h>
#include <os/log.h>

#include <sstream>
#include <string>
#include <atomic>
#include <unordered_map>
#include <utility>
#include <ctime>
namespace at::mps {

namespace Profiler {

struct BaseInfo {
  // profiling info types
  enum class Type {
    GRAPH,
    KERNEL,
    COPY,
    CPU_FALLBACK,
  };

  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
      type(infoType), profileId(Id), handle(Handle) { }
  virtual ~BaseInfo() = default;

  // type of profiling info
  Type type;
  // unique profile ID for execution instances of operations or copies
  uint64_t profileId;
  // IDs generated by os_signpost. Since it's possible to use event-based and
  // interval-based signposts at the same time, we need a separate ID for each.
  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
  // accumulated GPU time in ms (obtained from the CompletionHandler's "GPUEndTime - GPUStartTime")
  std::atomic<double> totalGpuTime{0.0};
  // accumulated scheduling time in ms (obtained from the CompletionHandler's "KernelEndTime - KernelStartTime")
  std::atomic<double> totalSchedulingTime{0.0};
  // indicates if the operation or copy execution has completed
  std::atomic_bool completed{false};
  // handle used to identify the profile info's instance (usually the pointer)
  const uintptr_t handle;

  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
  // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()])
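  // e.g., "MPS:Float[2, 3]" or, with includeBufferId enabled, something like
  // "MPS(buf#7:1):Float[2, 3]" (the buffer id and retain count here are illustrative)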
  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
    if (tensor.defined()) {
      std::stringstream tensorStr;
      auto deviceType = tensor.device().type();
      tensorStr << c10::DeviceTypeName(deviceType);
      // see comments for INCLUDE_BUFFER_ID
      if (includeBufferId && deviceType == at::kMPS) {
        id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
        tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
                  << ":" << buffer.retainCount << ")";
      }
      tensorStr << ":"
                << tensor.scalar_type() << tensor.sizes();
      return tensorStr.str();
    } else {
      return "undefined";
    }
  }
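  // returns a monotonic timestamp in nanoseconds, used for CPU-side timing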
  static uint64_t getTime() {
    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
  }
};

struct OperationInfo : BaseInfo {
  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
      BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }

  uint64_t runCount = 0;
  std::string strKey;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
  // builds a string for a kernel and its tensor arguments
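  // e.g., "add_out_mps:MPS:Float[2, 3]:MPS:Float[2, 3]" (the kernel name here is illustrative)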
  static std::string buildKernelString(const std::string& kernelName,
                                       const TensorList& tensors,
                                       bool includeBufferId = false) {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor: tensors) {
      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
};

struct CpuFbInfo : BaseInfo {
  CpuFbInfo(uint64_t Id, const std::string& OpName) :
      BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }

  uint64_t runCount = 0;
  // the current and total overhead, in bytes, of the copies required to convert the
  // Op's input tensors from MPS to CPU and its outputs from CPU back to MPS
  size_t currentCopyOverhead = 0;
  size_t totalCopyOverhead = 0;
  std::string opName;
  std::string strKey;
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  // recomputes the current copy overhead from the op's tensors and adds it to the running total
  void updateCopyOverhead(const TensorList& tensors) {
    currentCopyOverhead = 0;
    for (const Tensor& tensor: tensors) {
      if (tensor.defined()) {
        currentCopyOverhead += tensor.nbytes();
      }
    }
    totalCopyOverhead += currentCopyOverhead;
  }
};

struct CopyInfo : BaseInfo {
  enum class Kind {
    MPS_TO_MPS,
    MPS_TO_CPU,
    CPU_TO_MPS,
  };

  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
      BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
      length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }

  Kind kind;
  size_t length;
  bool isNonBlocking;
  bool usesBlitter;
  std::string srcStrKey;
  std::string dstStrKey;
  // for copies that don't use blitters, we measure CPU time
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);

  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
    if (tensor.has_value()) {
      return tensor->device().type() == at::kMPS;
    }
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
    // getUnalignedBufferSize() returns -1 if the input buffer is not on the MPS device
    return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
  }
  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
    const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
    const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
    if (isSrcOnMPS && !isDstOnMPS) {
      return Kind::MPS_TO_CPU;
    } else if (!isSrcOnMPS && isDstOnMPS) {
      return Kind::CPU_TO_MPS;
    }
    return Kind::MPS_TO_MPS;
  }
};

struct CopyStat : CopyInfo {
  explicit CopyStat(std::string CopyKindStr) :
      CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}

  // total number of copies
  size_t totalCount = 0;
  // number of Scalar copies (i.e., smaller than sizeof(int64_t))
  size_t scalarsCount = 0;
  // number of blocking copies (i.e., those that require syncing to the GPU)
  size_t blockingCount = 0;
  // number of copies that used memcpy() instead of the Metal Blit Encoder
  size_t memcpyCount = 0;
  // accumulated GPU time in ms for the scalar copies
  std::atomic<double> scalarsGpuTime{0.0};
  // copy kind as a string
  std::string kindStr;
};

class MPSProfiler {
 public:
  // lower 16 bits are used for profiler options
  enum ProfileOptions : uint32_t {
    OPTIONS_NONE = 0,

    // ALL_* covers all signpost types (RUN_OPERATION, BLIT_COPY, CPU_FALLBACK, etc.);
    // provided for convenience, so callers don't have to OR the bit flags manually
    // trace all signpost types using events
    ALL_SIGNPOST_EVENTS = (1 << 0),
    // trace all signpost types using intervals
    ALL_SIGNPOST_INTERVALS = (1 << 1),

    // always wait for the command buffer to finish executing after each commit
    WAIT_UNTIL_COMPLETED = (1 << 2),
    // for interval-based signposts, also include the scheduling portion of
    // Graph/Kernel/Copy executions. If this flag is disabled, only the
    // "GPU run time" is included in the interval, not the scheduling time.
    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),

    // use these if you need to trace signpost types individually (rarely required)
    // trace signposts using intervals
    USE_INTERVALS = (1 << 4),
    // trace signposts by emitting events
    USE_EVENTS = (1 << 5),

    // used for sanity checks (change this when a new option is added)
    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
  };
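  // e.g., (ALL_SIGNPOST_INTERVALS | INCLUDE_SCHEDULE_INTERVAL) traces all types with
  // intervals that also cover the scheduling portion (an illustrative combination)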

  // when adding new types, #define the type string in MPSProfiler.mm as well.
  // upper 16 bits are used for event types
  enum SignpostTypes : uint32_t {
    SIGNPOST_NONE = 0,
    // trace signposts for PyTorch operation executions
    RUN_OPERATION = (1 << 16),
    // trace signposts for blitter copies
    BLIT_COPY = (1 << 17),
    // trace signposts for ops that fall back on the CPU
    CPU_FALLBACK = (1 << 18),
    // used for sanity checks (change this when a new type is added)
    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
  };
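  // e.g., (RUN_OPERATION | BLIT_COPY) traces operation executions and blitter
  // copies, but not CPU fallbacks (an illustrative combination)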

  enum LogOptions : uint32_t {
    LOG_NONE = 0,

    // Info logging options during execution
    // -------------------------------------
    // prints operation info (id/key/run_count) during execution
    OPERATION_INFO = (1 << 0),
    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
    COPY_INFO = (1 << 1),
    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
    CPU_FALLBACK_INFO = (1 << 2),

    // Profiling statistics logging options when the process terminates
    // -----------------------------------------------------------------
    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before the
    // process terminates; provided for convenience, so the following stats bit flags
    // don't have to be combined manually
    ALL_STATS = (1 << 3),
    // prints operation stats (GPU times, run count, etc.) before the process terminates
    OPERATION_STATS = (1 << 4),
    // prints copy stats (GPU times, copy kinds, sizes, etc.) before the process terminates
    COPY_STATS = (1 << 5),
    // prints CPU Fallback stats (CPU times, run times, sizes of MPS<->CPU copies
    // for tensors, etc.) before the process terminates
    CPU_FALLBACK_STATS = (1 << 6),

    // Metadata format options when logging the info
    // ----------------------------------------------
    // if enabled, includes the GPU run time in the metadata (i.e., GPUEndTime-GPUStartTime
    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
    INCLUDE_GPU_TIME = (1 << 7),
    // if enabled, additionally includes the GPU scheduling time in the metadata
    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
    // (e.g., [GPU=0.324 ms, KRNL=0.036 ms])
    INCLUDE_KERNEL_TIME = (1 << 8),
    // if enabled, includes the unique buffer ID in the metadata for the storage
    // of a tensor that was allocated on the MPSAllocator. This is useful (along with
    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
    // in various operations.
    INCLUDE_BUFFER_ID = (1 << 9),

    // used for sanity checks (change this when a new option is added)
    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
  };
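  // e.g., (ALL_STATS | INCLUDE_GPU_TIME) would print all aggregated stats, with GPU
  // times in the metadata, when the process terminates (an illustrative combination)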

  explicit MPSProfiler();
  ~MPSProfiler();

  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels.
  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
                            const OptionalTensorRef srcTensor,
                            const OptionalTensorRef dstTensor,
                            size_t length, bool isNonBlocking, bool usesBlitter = true);
  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
  void beginProfileGPUInterval(const void* handle);

  void endProfileCopy(uint64_t profileId, SyncType syncType);
  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
  void endProfileCPUFallback(const std::string& opName);
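
  // A minimal usage sketch (illustrative only; "graph" and the string key below are
  // hypothetical, and callers may first check isOperationProfilingEnabled()):
  //   auto& profiler = at::mps::getMPSProfiler();
  //   uint64_t profileId = profiler.beginProfileKernel(graph, "add_out_mps", /*isGraph=*/true);
  //   // ... encode and commit the graph's work on the MPS stream ...
  //   profiler.endProfileKernel(graph);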

  // these are used to hook into Python bindings for the torch.mps.profiler module.
  // this enables generating OS Signpost traces from MPSProfiler on demand
  // during runtime (instead of via environment variables).
  // the "mode" can be "interval", "event", or both ("interval,event")
  // for interval-based and/or event-based signpost tracing.
  void StartTrace(const std::string& mode, bool waitUntilCompleted);
  void StopTrace();
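  // e.g., getMPSProfiler().StartTrace("interval,event", /*waitUntilCompleted=*/false);
  // would enable both interval-based and event-based tracing until StopTrace() is called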

  // convenience functions to indicate whether signpost tracing or
  // logging is enabled for each of the SignpostTypes
  bool isOperationProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
  }
  bool isCopyProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
  }
  bool isCPUFallbackProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
  }
  bool isSignpostTracingEnabled() const {
    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
  }

 private:
  // indicates which signpost types are enabled and traced by the MPS profiler
  uint32_t m_signpost_types = 0;
  uint32_t m_profile_options = 0;
  uint32_t m_log_options = 0;
  uint64_t m_kernel_counter = 0;
  uint64_t m_graph_counter = 0;
  uint64_t m_cpu_fb_counter = 0;
  uint64_t m_copy_counter = 0;
  // technically, it's possible to trace both events and intervals at the same time,
  // so we use separate os_log categories for them
  os_log_t m_os_log_events;
  os_log_t m_os_log_intervals;
  // stats logging can run either from the destructor or from a signal handler,
  // so this is used to check whether logging has already started
  std::atomic_bool hasLoggedStats{false};
  // indicates there are pending completionHandler callbacks that haven't been called yet
  std::atomic_bool hasPendingCompletionHandlers{false};
  // used to capture the SIGINT signal to log profiling stats
  static struct sigaction currentSigint, previousSigint;

  // We use the following lists for two reasons:
  // 1) for interval-based signposts, the "begin" point won't be in the same function
  //    as the "end" point, where we need to be able to retrieve the signpost's info
  // 2) the operations' info may need to be logged when the process ends, via
  //    LogOptions::OPERATION_INFO

  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels.
  // this list is retained and can be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
  // the string key for this map is the name of the op that we fall back to execute on the CPU.
  // this list is retained and can be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
  // this list contains the info for copies; its key is the unique profileId,
  // which is generated from m_copy_counter.
  // the copyInfo list is not retained.
  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
  // a short list that contains the copy stats
  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};

  void initialize();
  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
                           os_signpost_id_t interval_signpost_id,
                           double gpuTime, double schedulingTime);
  void addProfilerScheduledHandler(BaseInfo& info);
  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                         const std::string& msg) const;
  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                             const std::string& msg) const;
  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;

  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
  // returns true if logging the profiling info "during the execution" is enabled
  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
  // logs all the profiling stats that are enabled
  void logProfilingStats();
  // logs operation (graph/kernel) profiling stats when the process ends
  void logOperationsProfilingStats(std::FILE* f) const;
  // logs CPU Fallback profiling stats when the process ends
  void logCPUFallbackProfilingStats(std::FILE* f) const;
  // logs copy profiling stats when the process ends
  void logCopyProfilingStats(std::FILE* f) const;

  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
  static void handleIntSignal(int signal);
};
} // namespace Profiler

// returns a reference to the shared MPS profiler instance
Profiler::MPSProfiler& getMPSProfiler();

} // namespace at::mps