#pragma once
#include <cstdint>
#include <utility>
#include <cuda_runtime_api.h>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
/*
* Stream pool note.
*
* A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
* are backed by cuStreams, but they use several pools to minimize the costs
* associated with creating, retaining, and destroying cuStreams.
*
* There are three pools per device, and a device's pools are lazily created.
*
* The first pool contains only the default stream. When the default stream
* is requested it's returned.
*
* The second pool is the "low priority" or "default priority" streams. In
* HIP builds there is no distinction between streams in this pool and streams
* in the third pool (below). There are 32 of these streams per device, and
* when a stream is requested one of these streams is returned round-robin.
* That is, the first stream requested is at index 0, the second at index 1...
* to index 31, then index 0 again.
*
* This means that if 33 low priority streams are requested, the first and
* last streams requested are actually the same stream (under the covers)
* and kernels enqueued on them cannot run concurrently.
*
* The third pool is the "high priority" streams. The third pool acts like
* the second pool except the streams are created with a higher priority.
*
* These pools suggest that stream users should prefer many short-lived streams,
* as the cost of acquiring and releasing streams is effectively zero. If
* many longer-lived streams are required in performance critical scenarios
* then the functionality here may need to be extended to allow, for example,
* "reserving" a subset of the pool so that other streams do not accidentally
* overlap the performance critical streams.
*
* Note: although the notion of "current stream for device" is thread local
* (every OS thread has a separate current stream, as one might expect),
* the stream pool is global across all threads; stream 0 is always stream 0
* no matter which thread you use it on. Multiple threads can synchronize
* on the same stream. Although the CUDA documentation is not very clear
* on the matter, streams are thread safe; e.g., it is safe to enqueue
* a kernel on the same stream from two different threads.
*/
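/*
 * Example (illustrative sketch, not part of the API below): two pool
 * streams acquired back to back are distinct, so work enqueued on them may
 * overlap. `myKernel`, `grid`, and `block` are hypothetical; everything
 * else comes from this header.
 *
 *   c10::cuda::CUDAStream s0 = c10::cuda::getStreamFromPool();
 *   c10::cuda::CUDAStream s1 = c10::cuda::getStreamFromPool();
 *   myKernel<<<grid, block, 0, s0>>>(); // implicit cudaStream_t conversion
 *   myKernel<<<grid, block, 0, s1>>>();
 *   s0.synchronize();
 *   s1.synchronize();
 */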
namespace c10::cuda {
static constexpr int max_compile_time_stream_priorities = 4;
// Value object representing a CUDA stream. This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a CUDAStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a CUDA stream.
explicit CUDAStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
}
/// Construct a CUDAStream from a Stream with no error checking.
/// This constructor uses the "named" constructor idiom, and can
/// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const CUDAStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const CUDAStream& other) const noexcept {
return unwrap() != other.unwrap();
}
/// Implicit conversion to cudaStream_t.
operator cudaStream_t() const {
return stream();
}
/// Implicit conversion to Stream (a.k.a., forget that the stream is a
/// CUDA stream).
operator Stream() const {
return unwrap();
}
/// Used to avoid baking the device type explicitly into the Python-side API.
DeviceType device_type() const {
return DeviceType::CUDA;
}
/// Get the CUDA device index that this stream is associated with.
DeviceIndex device_index() const {
return stream_.device_index();
}
/// Get the full Device that this stream is associated with. The Device
/// is guaranteed to be a CUDA device.
Device device() const {
return Device(DeviceType::CUDA, device_index());
}
/// Return the stream ID corresponding to this particular stream.
StreamId id() const {
return stream_.id();
}
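/// Return true if all work enqueued on this stream has completed.
/// cudaErrorNotReady is swallowed (and the error state cleared); any other
/// error is raised via C10_CUDA_CHECK.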
bool query() const {
DeviceGuard guard{stream_.device()};
cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
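/// Block the calling CPU thread until all work enqueued on this stream
/// has completed.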
void synchronize() const {
DeviceGuard guard{stream_.device()};
c10::cuda::stream_synchronize(stream());
}
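/// Return the priority this stream was created with. Lower numbers denote
/// higher priorities.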
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
return priority;
}
/// Explicit conversion to cudaStream_t.
cudaStream_t stream() const;
/// Explicit conversion to Stream.
Stream unwrap() const {
return stream_;
}
/// Reversibly pack a CUDAStream into a struct representation.
/// Previously the stream's data was packed into a single int64_t,
/// as it was assumed the fields would not require more than
/// 64 bits of storage in total.
/// See https://github.com/pytorch/pytorch/issues/75854
/// for more information regarding newer platforms that may violate
/// this assumption.
///
/// The CUDAStream can be unpacked using unpack().
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
// Unpack a CUDAStream from the 3 fields generated by pack3().
static CUDAStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
}
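// Example (illustrative sketch): a pack3()/unpack3() round trip, e.g. for
// shuttling a stream across an FFI boundary that can carry three scalars.
// Assumes StreamData3 exposes stream_id, device_index, and device_type
// fields, mirroring unpack3's parameters.
//
//   c10::cuda::CUDAStream s = c10::cuda::getCurrentCUDAStream();
//   c10::StreamData3 d = s.pack3();
//   c10::cuda::CUDAStream s2 = c10::cuda::CUDAStream::unpack3(
//       d.stream_id, d.device_index, d.device_type);
//   TORCH_INTERNAL_ASSERT(s2 == s);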
static std::tuple<int, int> priority_range() {
// Note: this returns the range of priority **supported by PyTorch**, not
// the range of priority **supported by CUDA**. The former is a subset of
// the latter.
int least_priority = 0, greatest_priority = 0;
C10_CUDA_CHECK(
cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
#ifdef USE_ROCM
// See Note [HIP stream priorities]
TORCH_INTERNAL_ASSERT(
least_priority == 1, "Unexpected HIP stream priority range");
least_priority = 0;
#else
TORCH_INTERNAL_ASSERT(
least_priority == 0, "Unexpected CUDA stream priority range");
#endif
TORCH_INTERNAL_ASSERT(
greatest_priority <= -1, "Unexpected CUDA stream priority range");
greatest_priority = std::max(
-c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
return std::make_tuple(least_priority, greatest_priority);
}
// Deleted for now; use CUDAEvent::block instead
// void synchronize_with(const CUDAEvent& event) const;
private:
Stream stream_;
};
/**
* Get a new stream from the CUDA stream pool. You can think of this
* as "creating" a new stream, but no such creation actually happens;
* instead, streams are preallocated from the pool and returned in a
* round-robin fashion.
*
* You can request a stream from the high priority pool by setting
* isHighPriority to true, or a stream for a specific device by setting
* device (defaulting to the current CUDA device).
*/
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
// no default priority to disambiguate overloads
C10_API CUDAStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
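// Example (illustrative sketch): requesting the highest-priority stream
// that PyTorch supports, using CUDAStream::priority_range(). Recall that
// lower numbers denote higher priorities.
//
//   auto [least, greatest] = c10::cuda::CUDAStream::priority_range();
//   c10::cuda::CUDAStream hi = c10::cuda::getStreamFromPool(greatest);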
/**
 * Get a CUDAStream from an externally allocated one.
 *
 * This is mainly for interoperability with other libraries where we want
 * to operate on a non-torch-allocated stream for data exchange or similar
 * purposes.
*/
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
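// Example (illustrative sketch): wrapping a stream created outside of
// torch. This merely wraps the handle; the caller presumably remains
// responsible for keeping the stream alive while the wrapper is in use and
// for destroying it afterwards.
//
//   cudaStream_t ext = nullptr;
//   C10_CUDA_CHECK(cudaStreamCreate(&ext));
//   c10::cuda::CUDAStream s =
//       c10::cuda::getStreamFromExternal(ext, /*device_index=*/0);
//   // ... use s like any other CUDAStream ...
//   s.synchronize();
//   C10_CUDA_CHECK(cudaStreamDestroy(ext));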
/**
* Get the default CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The default stream is
* where most computation occurs when you aren't explicitly using
* streams.
*/
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
/**
* Get the current CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The current CUDA stream
* will usually be the default CUDA stream for the device, but it may
* be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
* or 'CUDAStreamGuard'.
*/
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
/**
* Set the current stream on the device of the passed in stream to be
* the passed in stream. Yes, you read that right: this function
* has *nothing* to do with the current device: it toggles the current
* stream of the device of the passed stream.
*
 * Confused? Avoid using this function; prefer 'CUDAStreamGuard' instead
 * (which will switch both your current device and current stream in the way
 * you expect, and reset them back to their original states afterwards).
*/
C10_API void setCurrentCUDAStream(CUDAStream stream);
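// Example (illustrative sketch): the RAII alternative recommended above.
// CUDAStreamGuard (declared in c10/cuda/CUDAGuard.h) sets both the current
// device and the current stream, and restores both on scope exit. `stream`
// is a placeholder for any CUDAStream.
//
//   {
//     c10::cuda::CUDAStreamGuard guard(stream);
//     // work launched here runs on `stream`
//   } // previous device and stream restored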
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
} // namespace c10::cuda
namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
size_t operator()(c10::cuda::CUDAStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std
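// Example (illustrative sketch): the specialization above, together with
// CUDAStream's operator==, lets streams be used as keys in unordered
// containers (requires <unordered_map>).
//
//   std::unordered_map<c10::cuda::CUDAStream, int> launch_counts;
//   launch_counts[c10::cuda::getCurrentCUDAStream()] += 1;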