// This header provides C++ wrappers around commonly used CUDA API functions. | |
// The benefit of using C++ here is that we can raise an exception in the | |
// event of an error, rather than explicitly pass around error codes. This | |
// leads to more natural APIs. | |
// | |
// The naming convention used here matches the naming convention of torch.cuda | |
namespace c10::cuda { | |
// NB: In the past, we were inconsistent about whether or not this reported
// an error if there were driver problems or not. Based on experience
// interacting with users, it seems that people basically ~never want this
// function to fail; it should just return zero if things are not working.
// Oblige them.
// It still might log a warning the first time it's invoked.
C10_CUDA_API DeviceIndex device_count() noexcept;
// Version of device_count that throws if no devices are detected
C10_CUDA_API DeviceIndex device_count_ensure_non_zero();
// Returns the index of the current CUDA device (throws on error).
C10_CUDA_API DeviceIndex current_device();
// Sets the current CUDA device (throws on error).
C10_CUDA_API void set_device(DeviceIndex device);
// Blocks the host until all work on the current device has completed.
C10_CUDA_API void device_synchronize();
// Depending on the current SyncDebugMode, either does nothing, warns, or
// raises an error to report that a synchronizing call occurred.
C10_CUDA_API void warn_or_error_on_sync();
// Raw CUDA device management functions: thin wrappers that return the
// cudaError_t to the caller instead of throwing.
C10_CUDA_API cudaError_t GetDeviceCount(int* dev_count);
C10_CUDA_API cudaError_t GetDevice(DeviceIndex* device);
C10_CUDA_API cudaError_t SetDevice(DeviceIndex device);
C10_CUDA_API cudaError_t MaybeSetDevice(DeviceIndex device);
// Sets `device` as current and returns the previously current device.
C10_CUDA_API DeviceIndex ExchangeDevice(DeviceIndex device);
C10_CUDA_API DeviceIndex MaybeExchangeDevice(DeviceIndex device);
C10_CUDA_API void SetTargetDevice();
// How synchronizing calls should be reported: silently allowed, warned
// about, or treated as errors (see warn_or_error_on_sync()).
enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
// this is a holder for c10 global state (similar to at GlobalContext) | |
// currently it's used to store cuda synchronization warning state, | |
// but can be expanded to hold other related global state, e.g. to | |
// record stream usage | |
class WarningState { | |
public: | |
void set_sync_debug_mode(SyncDebugMode l) { | |
sync_debug_mode = l; | |
} | |
SyncDebugMode get_sync_debug_mode() { | |
return sync_debug_mode; | |
} | |
private: | |
SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED; | |
}; | |
// Accessor for the process-wide WarningState singleton (Meyers singleton:
// constructed on first use, shared by all callers).
C10_CUDA_API __inline__ WarningState& warning_state() {
  static WarningState state;
  return state;
}
// the subsequent functions are defined in the header because for performance
// reasons we want them to be inline

// Copies `nbytes` bytes from `src` to `dst` on `stream`, then blocks the
// host until the copy completes.  This is a synchronizing call, so it may
// warn or raise depending on the current SyncDebugMode (see warning_state()).
C10_CUDA_API void __inline__ memcpy_and_sync(
    void* dst,
    const void* src,
    int64_t nbytes,
    cudaMemcpyKind kind,
    cudaStream_t stream) {
  if (C10_UNLIKELY(
          warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
    warn_or_error_on_sync();
  }
  // Notify any attached Python GPU-trace hooks about the stream sync.
  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  if (C10_UNLIKELY(interp)) {
    (*interp)->trace_gpu_stream_synchronization(
        reinterpret_cast<uintptr_t>(stream));
  }
// BUGFIX: the HIP and CUDA paths were both unconditional, which double-copied
// on ROCm and referenced the undefined hipMemcpyWithStream symbol on CUDA.
// Guard the ROCm fast path (copy + sync in one call) behind the HIP build.
#if defined(TORCH_HIP_VERSION) && (TORCH_HIP_VERSION >= 301)
  C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
#else
  C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
#endif
}
// Blocks the host until all work queued on `stream` has completed.
// This is a synchronizing call, so it may warn or raise depending on the
// current SyncDebugMode (see warning_state()).
C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) {
  if (C10_UNLIKELY(
          warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
    warn_or_error_on_sync();
  }
  // Notify any attached Python GPU-trace hooks about the stream sync.
  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  if (C10_UNLIKELY(interp)) {
    (*interp)->trace_gpu_stream_synchronization(
        reinterpret_cast<uintptr_t>(stream));
  }
  C10_CUDA_CHECK(cudaStreamSynchronize(stream));
}
// Returns true if a CUDA primary context exists on `device_index`.
C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index);
// Returns the index of some device with an active primary context, or
// nullopt if no device has one.
C10_CUDA_API c10::optional<DeviceIndex> getDeviceIndexWithPrimaryContext();
} // namespace c10::cuda | |