Upload llama.cpp/ggml/src/ggml-cuda/common.cuh with huggingface_hub

llama.cpp/ggml/src/ggml-cuda/common.cuh (new file, 688 lines)
#pragma once

#include "ggml.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIPBLAS)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif
#include "ggml-common.h"

#include <cstdio>
#include <array>
#include <cassert>
#include <cfloat>
#include <string>
#include <vector>

#if defined(GGML_USE_HIPBLAS)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIPBLAS)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX  11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define CC_PASCAL     600
#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA      700
#define CC_TURING     750
#define CC_AMPERE     800
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
#define CC_QY1        210
#define CC_QY2        220

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
    do { \
        auto err_ = (err); \
        if (err_ != (success)) { \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        } \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
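
// Illustrative usage sketch (editor's note, not part of the upstream file):
// CUDA_CHECK wraps any call returning cudaError_t and aborts with file/line
// context on failure, e.g.:
//
//     void * buf = nullptr;
//     CUDA_CHECK(cudaMalloc(&buf, 1024));                // aborts via ggml_cuda_error on failure
//     CUDA_CHECK(cudaMemsetAsync(buf, 0, 1024, stream));
//
// Because CUDA_CHECK_GEN stringizes its argument (#err), the error message
// reports the exact failing statement.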

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
    return cublasGetStatusString(err);
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
    switch (err) {
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
        default:                             return "unknown error";
    }
}
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100
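
// Illustrative sketch (editor's note, hypothetical indexing fragment):
// GGML_CUDA_ASSUME feeds an invariant to the optimizer via __builtin_assume, e.g.:
//
//     const int i = threadIdx.x % QI4_0;
//     GGML_CUDA_ASSUME(i >= 0 && i < QI4_0); // lets the compiler drop range checks
//
// On toolkits older than CUDA 11.1 the macro expands to nothing, so the condition
// must be side-effect free and may only be used as an optimization hint, never
// for correctness.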

#ifdef GGML_CUDA_F16
typedef half  dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float  dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16

#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING

#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
#define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)

static constexpr bool fast_fp16_available(const int cc) {
    return cc >= CC_PASCAL && cc != 610;
}

static constexpr bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

static constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
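
// Illustrative host-side dispatch sketch (editor's note, hypothetical fragment):
// the *_available(cc) helpers mirror the compile-time feature macros above so that
// host code can pick a kernel variant for the device actually present, e.g.:
//
//     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
//     if (fp16_mma_available(cc)) {
//         // launch the tensor-core (mma) path
//     } else if (fast_fp16_available(cc)) {
//         // launch the vectorized half2 path
//     } else {
//         // launch the fp32 fallback
//     }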

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
}

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}
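
// How the butterfly reduction works (editor's note, not upstream code): each
// iteration XORs the lane id with mask = 16, 8, 4, 2, 1, pairing every lane with
// a partner and exchanging partial sums, so after log2(32) = 5 steps every lane
// of the warp holds the full sum. A typical kernel fragment:
//
//     float partial = /* per-thread value */;
//     partial = warp_reduce_sum(partial);    // all 32 lanes now hold the warp total
//     if (threadIdx.x % WARP_SIZE == 0) {
//         atomicAdd(dst, partial);           // one atomic per warp instead of per thread
//     }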

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
}

static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
        reinterpret_cast<half&>(a.x) += __low2half(a_other);
        reinterpret_cast<half&>(a.y) += __high2half(a_other);
    }
    return a;
#else
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
    }
    return a;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}

static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif // CUDART_VERSION >= CUDART_HMAX

#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}

static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}

#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
#endif // CUDART_VERSION < CUDART_HMASK
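
// Worked example (editor's note, not upstream code): __hgt2_mask performs a
// packed half2 greater-than and returns a per-half bitmask, emulating the
// CUDA >= 12.0 intrinsic on older toolkits. For a = (1.0, 2.0), b = (1.5, 1.5):
//
//     low  half: 1.0 > 1.5 is false -> 0x0000FFFF * 0 = 0x00000000
//     high half: 2.0 > 1.5 is true  -> 0xFFFF0000 * 1 = 0xFFFF0000
//
// so the result is 0xFFFF0000: all bits set exactly in the halves where a > b.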

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;

#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
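
// Worked example (editor's note, not upstream code): ggml_cuda_dp4a treats each
// int as four packed int8 values and accumulates their dot product into c. With
// a = {1, 2, 3, 4} and b = {5, 6, 7, 8} packed into two ints, and c = 10:
//
//     result = 10 + 1*5 + 2*6 + 3*7 + 4*8 = 10 + 70 = 80
//
// On NVIDIA hardware with compute capability >= 6.1 this is a single __dp4a
// instruction; the AMD and scalar fallback branches above compute the same value
// element by element.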

// TODO: move to ggml-common.h
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
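
// Background note (editor's assumption based on the ALiBi paper, not stated in
// this file): the caller is expected to pass m0 = 2^(-8/n_head_log2) and
// m1 = 2^(-4/n_head_log2), so heads with index h < n_head_log2 get slopes
// m0^(h+1) and the remaining heads get the interpolated slopes
// m1^(2*(h - n_head_log2) + 1). Passing max_bias <= 0 disables the
// head-dependent scaling by returning a uniform slope of 1.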

template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};
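
// Illustrative sketch (editor's note, hypothetical kernel not in this file): the
// traits let per-type quantization constants be resolved at compile time inside
// templated kernels, e.g.:
//
//     template <ggml_type type>
//     static __global__ void example_kernel(const void * vx, float * dst, int64_t n) {
//         constexpr int qk = ggml_cuda_type_traits<type>::qk; // values per block
//         constexpr int qr = ggml_cuda_type_traits<type>::qr; // quantization ratio
//         // ... dequantize/compute using qk and qr ...
//     }
//
//     // instantiated as e.g. example_kernel<GGML_TYPE_Q4_0><<<grid, block>>>(...)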

//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int     cc;                 // compute capability
        int     nsm;                // number of streaming multiprocessors
        size_t  smpb;               // max. shared memory per block
        size_t  smpbo;              // max. shared memory per block (with opt-in)
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int  ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void   free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
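
// Illustrative usage sketch (editor's note, hypothetical fragment):
// ggml_cuda_pool_alloc is an RAII guard over a pool allocation; the buffer is
// returned to the pool when the guard leaves scope, even on early return:
//
//     void example(ggml_cuda_pool & pool, int64_t n) {
//         ggml_cuda_pool_alloc<float> tmp(pool, n); // n is an element count, not bytes
//         float * data = tmp.get();
//         // ... use data on the device ...
//     } // destructor returns the buffer to the pool here
//
// Copy and move are deleted above, so ownership of the allocation cannot be
// duplicated by accident.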


// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};


#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
#define USE_CUDA_GRAPH
#endif

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    std::vector<char **> updated_kernel_arg;
#endif
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
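
// Illustrative lifecycle sketch (editor's note, not part of the upstream file):
// streams, cuBLAS handles, and pools in ggml_backend_cuda_context are all created
// lazily on first use and destroyed together with the context:
//
//     ggml_backend_cuda_context ctx(/*device=*/0);
//     cudaStream_t   s = ctx.stream();        // creates a non-blocking stream on demand
//     cublasHandle_t h = ctx.cublas_handle(); // creates the handle + TF32 math mode on demand
//     ggml_cuda_pool & p = ctx.pool();        // creates the per-device memory pool on demand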