Arrcttacsrks committed · Commit c64a3bf · verified · 1 parent: d3f96d8

Upload llama.cpp/ggml/src/ggml-cuda/common.cuh with huggingface_hub
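The commit message above matches the default message that huggingface_hub's upload_file() generates, so the file was most likely pushed with a call along these lines. This is a minimal sketch, not the uploader's actual script; the repo id and local path are placeholders assumed for illustration.

# Hypothetical reconstruction of the upload; repo_id and local path are placeholders.
from huggingface_hub import HfApi

api = HfApi()  # auth comes from `huggingface-cli login` or the HF_TOKEN environment variable
api.upload_file(
    path_or_fileobj="llama.cpp/ggml/src/ggml-cuda/common.cuh",  # local file to push (assumed path)
    path_in_repo="llama.cpp/ggml/src/ggml-cuda/common.cuh",     # destination path inside the repo
    repo_id="Arrcttacsrks/<repo-name>",                         # placeholder; the actual repo id is not shown here
)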

llama.cpp/ggml/src/ggml-cuda/common.cuh ADDED
@@ -0,0 +1,688 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cuda.h"
+
+#include <cstdint>
+#include <memory>
+
+#if defined(GGML_USE_HIPBLAS)
+#define GGML_COMMON_DECL_HIP
+#define GGML_COMMON_IMPL_HIP
+#else
+#define GGML_COMMON_DECL_CUDA
+#define GGML_COMMON_IMPL_CUDA
+#if defined(GGML_USE_MUSA)
+#define GGML_COMMON_DECL_MUSA
+#define GGML_COMMON_IMPL_MUSA
+#endif
+#endif
+#include "ggml-common.h"
+
+#include <cstdio>
+#include <array>
+#include <cassert>
+#include <cfloat>
+#include <string>
+#include <vector>
+
+#if defined(GGML_USE_HIPBLAS)
+#include "vendors/hip.h"
+#elif defined(GGML_USE_MUSA)
+#include "vendors/musa.h"
+#else
+#include "vendors/cuda.h"
+#endif // defined(GGML_USE_HIPBLAS)
+
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
+#define WARP_SIZE 32
+#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
+#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
+
+#define CC_PASCAL 600
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_VOLTA 700
+#define CC_TURING 750
+#define CC_AMPERE 800
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
+#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
+#define CC_QY1 210
+#define CC_QY2 220
+
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define GGML_CUDA_MAX_STREAMS 8
+
+[[noreturn]]
+void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+
+#define CUDA_CHECK_GEN(err, success, error_fn) \
+    do { \
+        auto err_ = (err); \
+        if (err_ != (success)) { \
+            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
+        } \
+    } while (0)
+
+#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
+
+#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
+static const char * cublas_get_error_str(const cublasStatus_t err) {
+    return cublasGetStatusString(err);
+}
+#else
+static const char * cublas_get_error_str(const cublasStatus_t err) {
+    switch (err) {
+        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+        default: return "unknown error";
+    }
+}
+#endif // CUDART_VERSION >= 12000
+
+#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
+
+#if !defined(GGML_USE_HIPBLAS)
+static const char * cu_get_error_str(CUresult err) {
+    const char * err_str;
+    cuGetErrorString(err, &err_str);
+    return err_str;
+}
+#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
+#endif
+
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef float2 dfloat2;
+#endif // GGML_CUDA_F16
+
+#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#define FP16_AVAILABLE
+#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+
+#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+#define FAST_FP16_AVAILABLE
+#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#define FP16_MMA_AVAILABLE
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#define INT8_MMA_AVAILABLE
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#define FLASH_ATTN_AVAILABLE
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+
+static constexpr bool fast_fp16_available(const int cc) {
+    return cc >= CC_PASCAL && cc != 610;
+}
+
+static constexpr bool fp16_mma_available(const int cc) {
+    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+}
+
+static constexpr bool int8_mma_available(const int cc) {
+    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+}
+
+[[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    GGML_UNUSED(arch_list);
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    __trap();
+
+    GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+static __device__ __forceinline__ int warp_reduce_sum(int x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+    return __reduce_add_sync(0xffffffff, x);
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+}
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#ifdef FP16_AVAILABLE
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+        reinterpret_cast<half&>(a.x) += __low2half(a_other);
+        reinterpret_cast<half&>(a.y) += __high2half(a_other);
+    }
+    return a;
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
+static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
+#ifdef FP16_AVAILABLE
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    return __float2half(fmaxf(__half2float(a), __half2float(b)));
+#else
+    return __hmax(a, b);
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+
+#else
+    NO_DEVICE_CODE;
+    GGML_UNUSED(b);
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+#if CUDART_VERSION >= CUDART_HMAX
+    return __hmax2(a, b);
+#else
+    half2 ret;
+    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
+    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
+    return ret;
+#endif // CUDART_VERSION >= CUDART_HMAX
+
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+}
+
+static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+#else
+    GGML_UNUSED(x);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+
+#if CUDART_VERSION < CUDART_HMASK
+static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
+    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
+    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
+    return mask_low | mask_high;
+}
+#endif // CUDART_VERSION < CUDART_HMASK
+
+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
+// TODO: move to ggml-common.h
+static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+
+static __device__ __forceinline__ float get_alibi_slope(
+    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
+) {
+    if (max_bias <= 0.0f) {
+        return 1.0f;
+    }
+    const float base = h < n_head_log2 ? m0 : m1;
+    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+    return powf(base, exph);
+}
+
+template <ggml_type type>
+struct ggml_cuda_type_traits;
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_F16> {
+    static constexpr int qk = 1;
+    static constexpr int qr = 1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
+    static constexpr int qk = QK4_0;
+    static constexpr int qr = QR4_0;
+    static constexpr int qi = QI4_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
+    static constexpr int qk = QK4_1;
+    static constexpr int qr = QR4_1;
+    static constexpr int qi = QI4_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
+    static constexpr int qk = QK5_0;
+    static constexpr int qr = QR5_0;
+    static constexpr int qi = QI5_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
+    static constexpr int qk = QK5_1;
+    static constexpr int qr = QR5_1;
+    static constexpr int qi = QI5_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
+    static constexpr int qk = QK8_0;
+    static constexpr int qr = QR8_0;
+    static constexpr int qi = QI8_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_K;
+    static constexpr int qi = QI2_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_K;
+    static constexpr int qi = QI3_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_K;
+    static constexpr int qi = QI4_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR5_K;
+    static constexpr int qi = QI5_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR6_K;
+    static constexpr int qi = QI6_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_XXS;
+    static constexpr int qi = QI2_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_XS;
+    static constexpr int qi = QI2_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR2_S;
+    static constexpr int qi = QI2_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_XXS;
+    static constexpr int qi = QI3_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR1_S;
+    static constexpr int qi = QI1_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR1_M;
+    static constexpr int qi = QI1_M;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
+    static constexpr int qk = QK4_NL;
+    static constexpr int qr = QR4_NL;
+    static constexpr int qi = QI4_NL;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR4_XS;
+    static constexpr int qi = QI4_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR3_S;
+    static constexpr int qi = QI3_S;
+};
+
+//////////////////////
+
+struct ggml_cuda_device_info {
+    int device_count;
+
+    struct cuda_device_info {
+        int     cc;                 // compute capability
+        int     nsm;                // number of streaming multiprocessors
+        size_t  smpb;               // max. shared memory per block
+        size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    vmm;                // virtual memory support
+        size_t  vmm_granularity;    // granularity of virtual memory
+        size_t  total_vram;
+    };
+
+    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
+
+    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_cuda_device_info & ggml_cuda_info();
+
+void ggml_cuda_set_device(int device);
+int ggml_cuda_get_device();
+
+struct ggml_cuda_pool {
+    virtual ~ggml_cuda_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
+};
+
+template<typename T>
+struct ggml_cuda_pool_alloc {
+    ggml_cuda_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    ggml_cuda_pool_alloc() = default;
+
+    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
+    }
+
+    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_cuda_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_cuda_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
+    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
+    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
+    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
+};
+
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+
+#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
+#define USE_CUDA_GRAPH
+#endif
+
+struct ggml_graph_node_properties {
+    void * node_address;
+    ggml_op node_op;
+    int64_t ne[GGML_MAX_DIMS];
+    size_t nb[GGML_MAX_DIMS];
+    void * src_address[GGML_MAX_SRC];
+    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+};
+
+struct ggml_cuda_graph {
+#ifdef USE_CUDA_GRAPH
+    ~ggml_cuda_graph() {
+        if (instance != nullptr) {
+            CUDA_CHECK(cudaGraphExecDestroy(instance));
+        }
+        if (graph != nullptr) {
+            CUDA_CHECK(cudaGraphDestroy(graph));
+        }
+    }
+    cudaGraph_t graph = nullptr;
+    cudaGraphExec_t instance = nullptr;
+    size_t num_nodes = 0;
+    std::vector<cudaGraphNode_t> nodes;
+    std::vector<cudaKernelNodeParams> params;
+    bool disable_due_to_gpu_arch = false;
+    bool disable_due_to_too_many_updates = false;
+    bool disable_due_to_failed_graph_capture = false;
+    int number_consecutive_updates = 0;
+    std::vector<ggml_graph_node_properties> ggml_graph_properties;
+    std::vector<char **> updated_kernel_arg;
+#endif
+};
+
+struct ggml_backend_cuda_context {
+    int device;
+    std::string name;
+    cudaEvent_t copy_event = nullptr;
+
+    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
+    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+
+    explicit ggml_backend_cuda_context(int device) :
+        device(device),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_cuda_context() {
+        if (copy_event != nullptr) {
+            CUDA_CHECK(cudaEventDestroy(copy_event));
+        }
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+                if (streams[i][j] != nullptr) {
+                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+                }
+            }
+            if (cublas_handles[i] != nullptr) {
+                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+            }
+        }
+    }
+
+    cudaStream_t stream(int device, int stream) {
+        if (streams[device][stream] == nullptr) {
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
+        }
+        return streams[device][stream];
+    }
+
+    cudaStream_t stream() {
+        return stream(device, 0);
+    }
+
+    cublasHandle_t cublas_handle(int device) {
+        if (cublas_handles[device] == nullptr) {
+            ggml_cuda_set_device(device);
+            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
+            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
+        }
+        return cublas_handles[device];
+    }
+
+    cublasHandle_t cublas_handle() {
+        return cublas_handle(device);
+    }
+
+    // pool
+    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+
+    ggml_cuda_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(device);
+        }
+        return *pools[device];
+    }
+
+    ggml_cuda_pool & pool() {
+        return pool(device);
+    }
+};