HGB committed
Commit e9fd7b3 · Parent: 6f97087

Initial commit for testing triton flash attention

Files changed (2):
  1. triton.py +26 -0
  2. triton_flash_atn.py +527 -0
triton.py ADDED
@@ -0,0 +1,26 @@
+ import torch
+ from triton_flash_atn import _attention
+
+ # Define dimensions
+ batch_size = 2
+ num_heads = 4
+ seq_len = 128
+ head_dim = 64
+
+ # Create random input tensors for Q, K, V
+ q = torch.randn(batch_size, num_heads, seq_len, head_dim,
+                 dtype=torch.float16, device='cuda')
+ k = torch.randn(batch_size, num_heads, seq_len, head_dim,
+                 dtype=torch.float16, device='cuda')
+ v = torch.randn(batch_size, num_heads, seq_len, head_dim,
+                 dtype=torch.float16, device='cuda')
+
+ # Define whether the attention is causal and the scaling factor
+ causal = False
+ sm_scale = 1.0 / (head_dim ** 0.5)
+
+ # Apply flash attention
+ attention = _attention.apply
+ output = attention(q, k, v, causal, sm_scale)
+
+ print(output)
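A quick way to sanity-check the kernel is to compare its output against a plain PyTorch attention computation. One caveat: because the driver above is named triton.py, running it from this directory shadows the installed triton package when triton_flash_atn.py executes `import triton`, so a check like the sketch below is best saved under a different filename (`test_attention.py` is only an illustrative name). The sketch assumes a CUDA device, fp16 inputs, and loose fp16 tolerances; `reference_attention` is not part of the committed files.

import torch

from triton_flash_atn import attention  # module-level alias for _attention.apply


def reference_attention(q, k, v, causal, sm_scale):
    # Plain PyTorch attention, used only as a correctness reference.
    scores = (q @ k.transpose(-2, -1)).float() * sm_scale
    if causal:
        n = q.shape[-2]
        keep_mask = torch.ones(n, n, dtype=torch.bool, device=q.device).tril()
        scores = scores.masked_fill(~keep_mask, float("-inf"))
    return (torch.softmax(scores, dim=-1) @ v.float()).to(q.dtype)


q, k, v = (torch.randn(2, 4, 128, 64, dtype=torch.float16, device="cuda")
           for _ in range(3))
sm_scale = 1.0 / (64 ** 0.5)

out_tri = attention(q, k, v, False, sm_scale)
out_ref = reference_attention(q, k, v, False, sm_scale)
print("max abs diff:", (out_tri - out_ref).abs().max().item())
assert torch.allclose(out_tri, out_ref, atol=2e-2, rtol=0)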
triton_flash_atn.py ADDED
@@ -0,0 +1,527 @@
+ """
+ Fused Attention
+ ===============
+
+ This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao (https://tridao.me/publications/flash2/flash2.pdf)
+ Credits: OpenAI kernel team
+
+ Extra Credits:
+ - Original flash attention paper (https://arxiv.org/abs/2205.14135)
+ - Rabe and Staats (https://arxiv.org/pdf/2112.05682v2.pdf)
+
+ """
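For orientation while reading the kernels below: the forward pass keeps, per query row, a running maximum m_i, a running softmax denominator l_i, and an unnormalized output accumulator acc, and folds in one K/V block at a time. The kernels work in base-2 exponentials, which is why qk_scale is multiplied by log2(e) ≈ 1.44269504. The following is a minimal PyTorch sketch of one update step of that recurrence, mirroring the non-masked branch of _attn_fwd_inner; the helper name is illustrative only and not part of the committed file.

import torch


def online_softmax_update(acc, l_i, m_i, q, k_blk, v_blk, qk_scale):
    # One step of the streaming-softmax recurrence, in base-2 exponentials
    # (qk_scale is assumed to already include the log2(e) factor).
    qk = q @ k_blk.transpose(-1, -2)                # scores against this K block
    m_new = torch.maximum(m_i, qk.max(dim=-1).values * qk_scale)
    p = torch.exp2(qk * qk_scale - m_new[:, None])  # unnormalized probabilities
    alpha = torch.exp2(m_i - m_new)                 # rescales the old partial results
    l_i = l_i * alpha + p.sum(dim=-1)               # running softmax denominator
    acc = acc * alpha[:, None] + p @ v_blk          # running unnormalized output
    return acc, l_i, m_new

Dividing acc by l_i after the last block normalizes the output, which is what the epilogue of _attn_fwd does.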
+
+ import torch
+
+ import triton
+ import triton.language as tl
+
+
+ def is_hip():
+     return triton.runtime.driver.active.get_current_target().backend == "hip"  # driver API of recent Triton releases
+
+
+ @triton.jit
+ def _attn_fwd_inner(acc, l_i, m_i, q,  #
+                     K_block_ptr, V_block_ptr,  #
+                     start_m, qk_scale,  #
+                     BLOCK_M: tl.constexpr, HEAD_DIM: tl.constexpr, BLOCK_N: tl.constexpr,  #
+                     STAGE: tl.constexpr, offs_m: tl.constexpr, offs_n: tl.constexpr,  #
+                     N_CTX: tl.constexpr, fp8_v: tl.constexpr):
+     # range of values handled by this stage
+     if STAGE == 1:
+         lo, hi = 0, start_m * BLOCK_M
+     elif STAGE == 2:
+         lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M
+         lo = tl.multiple_of(lo, BLOCK_M)
+     # causal = False
+     else:
+         lo, hi = 0, N_CTX
+     K_block_ptr = tl.advance(K_block_ptr, (0, lo))
+     V_block_ptr = tl.advance(V_block_ptr, (lo, 0))
+     # loop over k, v and update accumulator
+     for start_n in range(lo, hi, BLOCK_N):
+         start_n = tl.multiple_of(start_n, BLOCK_N)
+         # -- compute qk ----
+         k = tl.load(K_block_ptr)
+         qk = tl.dot(q, k)
+         if STAGE == 2:
+             mask = offs_m[:, None] >= (start_n + offs_n[None, :])
+             qk = qk * qk_scale + tl.where(mask, 0, -1.0e6)
+             m_ij = tl.maximum(m_i, tl.max(qk, 1))
+             qk -= m_ij[:, None]
+         else:
+             m_ij = tl.maximum(m_i, tl.max(qk, 1) * qk_scale)
+             qk = qk * qk_scale - m_ij[:, None]
+         p = tl.math.exp2(qk)
+         l_ij = tl.sum(p, 1)
+         # -- update m_i and l_i
+         alpha = tl.math.exp2(m_i - m_ij)
+         l_i = l_i * alpha + l_ij
+         # -- update output accumulator --
+         acc = acc * alpha[:, None]
+         # update acc
+         v = tl.load(V_block_ptr)
+         if fp8_v:
+             p = p.to(tl.float8e5)
+         else:
+             p = p.to(tl.float16)
+         acc = tl.dot(p, v, acc)
+         # update m_i and l_i
+         m_i = m_ij
+         V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+         K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+     return acc, l_i, m_i
+
+
+ # We don't run auto-tuning every time to keep the tutorial fast. Keeping
+ # the code below and commenting out the equivalent parameters is convenient for
+ # re-tuning.
+ configs = [
+     triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN}, num_stages=s, num_warps=w)
+     for BM in [64, 128]
+     for BN in [32, 64]
+     for s in ([1] if is_hip() else [3, 4, 7])
+     for w in [4, 8]
+ ]
+
+
+ def keep(conf):
+     BLOCK_M = conf.kwargs["BLOCK_M"]
+     BLOCK_N = conf.kwargs["BLOCK_N"]
+     if BLOCK_M * BLOCK_N < 128 * 128 and conf.num_warps == 8:
+         return False
+     return True
+
+
+ @triton.autotune(list(filter(keep, configs)), key=["N_CTX"])
+ @triton.jit
+ def _attn_fwd(Q, K, V, sm_scale, M, Out,  #
+               stride_qz, stride_qh, stride_qm, stride_qk,  #
+               stride_kz, stride_kh, stride_kn, stride_kk,  #
+               stride_vz, stride_vh, stride_vk, stride_vn,  #
+               stride_oz, stride_oh, stride_om, stride_on,  #
+               Z, H, N_CTX,  #
+               BLOCK_M: tl.constexpr,  #
+               BLOCK_N: tl.constexpr,  #
+               HEAD_DIM: tl.constexpr,  #
+               STAGE: tl.constexpr  #
+               ):
+     tl.static_assert(BLOCK_N <= HEAD_DIM)
+     start_m = tl.program_id(0)
+     off_hz = tl.program_id(1)
+     off_z = off_hz // H
+     off_h = off_hz % H
+     qvk_offset = off_z.to(tl.int64) * stride_qz + \
+         off_h.to(tl.int64) * stride_qh
+
+     # block pointers
+     Q_block_ptr = tl.make_block_ptr(
+         base=Q + qvk_offset,
+         shape=(N_CTX, HEAD_DIM),
+         strides=(stride_qm, stride_qk),
+         offsets=(start_m * BLOCK_M, 0),
+         block_shape=(BLOCK_M, HEAD_DIM),
+         order=(1, 0),
+     )
+     v_order: tl.constexpr = (
+         0, 1) if V.dtype.element_ty == tl.float8e5 else (1, 0)
+     V_block_ptr = tl.make_block_ptr(
+         base=V + qvk_offset,
+         shape=(N_CTX, HEAD_DIM),
+         strides=(stride_vk, stride_vn),
+         offsets=(0, 0),
+         block_shape=(BLOCK_N, HEAD_DIM),
+         order=v_order,
+     )
+     K_block_ptr = tl.make_block_ptr(
+         base=K + qvk_offset,
+         shape=(HEAD_DIM, N_CTX),
+         strides=(stride_kk, stride_kn),
+         offsets=(0, 0),
+         block_shape=(HEAD_DIM, BLOCK_N),
+         order=(0, 1),
+     )
+     O_block_ptr = tl.make_block_ptr(
+         base=Out + qvk_offset,
+         shape=(N_CTX, HEAD_DIM),
+         strides=(stride_om, stride_on),
+         offsets=(start_m * BLOCK_M, 0),
+         block_shape=(BLOCK_M, HEAD_DIM),
+         order=(1, 0),
+     )
+     # initialize offsets
+     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+     offs_n = tl.arange(0, BLOCK_N)
+     # initialize pointer to m and l
+     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+     l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
+     acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
+     # load scales
+     qk_scale = sm_scale
+     qk_scale *= 1.44269504  # = 1/ln(2) = log2(e); the kernel works in base-2 exponentials
+     # load q: it will stay in SRAM throughout
+     q = tl.load(Q_block_ptr)
+     # stage 1: off-band
+     # For causal = True, STAGE = 3 and _attn_fwd_inner gets 1 as its STAGE
+     # For causal = False, STAGE = 1, and _attn_fwd_inner gets 3 as its STAGE
+     if STAGE & 1:
+         acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_block_ptr, V_block_ptr,  #
+                                         start_m, qk_scale,  #
+                                         BLOCK_M, HEAD_DIM, BLOCK_N,  #
+                                         4 - STAGE, offs_m, offs_n, N_CTX, V.dtype.element_ty == tl.float8e5  #
+                                         )
+     # stage 2: on-band
+     if STAGE & 2:
+         # barrier makes it easier for the compiler to schedule the
+         # two loops independently
+         acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_block_ptr, V_block_ptr,  #
+                                         start_m, qk_scale,  #
+                                         BLOCK_M, HEAD_DIM, BLOCK_N,  #
+                                         2, offs_m, offs_n, N_CTX, V.dtype.element_ty == tl.float8e5  #
+                                         )
+     # epilogue
+     m_i += tl.math.log2(l_i)
+     acc = acc / l_i[:, None]
+     m_ptrs = M + off_hz * N_CTX + offs_m
+     tl.store(m_ptrs, m_i)
+     tl.store(O_block_ptr, acc.to(Out.type.element_ty))
+
+
+ @triton.jit
+ def _attn_bwd_preprocess(O, DO,  #
+                          Delta,  #
+                          Z, H, N_CTX,  #
+                          BLOCK_M: tl.constexpr, HEAD_DIM: tl.constexpr  #
+                          ):
+     off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
+     off_hz = tl.program_id(1)
+     off_n = tl.arange(0, HEAD_DIM)
+     # load
+     o = tl.load(O + off_hz * HEAD_DIM * N_CTX +
+                 off_m[:, None] * HEAD_DIM + off_n[None, :])
+     do = tl.load(DO + off_hz * HEAD_DIM * N_CTX +
+                  off_m[:, None] * HEAD_DIM + off_n[None, :]).to(tl.float32)
+     delta = tl.sum(o * do, axis=1)
+     # write-back
+     tl.store(Delta + off_hz * N_CTX + off_m, delta)
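A note on the Delta buffer computed by _attn_bwd_preprocess: the softmax backward pass needs, per query row, the correction term delta_i = sum_j P_ij * dP_ij, and because O = P V and dP = dO V^T, this equals sum_k O_ik * dO_ik. That is why the row-wise sum of o * do above is enough, without re-materializing the attention matrix; the kernels below then form dS = P * (dP - delta) as dsT / ds. A small PyTorch check of that identity (shapes and names here are illustrative, not from the committed file):

import torch

torch.manual_seed(0)
n, d = 8, 4
p = torch.softmax(torch.randn(n, n), dim=-1)    # attention probabilities P
v = torch.randn(n, d)
do = torch.randn(n, d)                          # upstream gradient dO
o = p @ v                                       # forward output O = P V
dp = do @ v.t()                                 # dP = dO V^T
delta_from_p = (p * dp).sum(dim=-1)             # sum_j P_ij * dP_ij
delta_from_o = (o * do).sum(dim=-1)             # what _attn_bwd_preprocess stores
assert torch.allclose(delta_from_p, delta_from_o, atol=1e-5)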
+
+
+ # The main inner-loop logic for computing dK and dV.
+ @triton.jit
+ def _attn_bwd_dkdv(dk, dv,  #
+                    Q, k, v, sm_scale,  #
+                    DO,  #
+                    M, D,  #
+                    # shared by Q/K/V/DO.
+                    stride_tok, stride_d,  #
+                    H, N_CTX, BLOCK_M1: tl.constexpr,  #
+                    BLOCK_N1: tl.constexpr,  #
+                    HEAD_DIM: tl.constexpr,  #
+                    # Filled in by the wrapper.
+                    start_n, start_m, num_steps,  #
+                    MASK: tl.constexpr):
+     offs_m = start_m + tl.arange(0, BLOCK_M1)
+     offs_n = start_n + tl.arange(0, BLOCK_N1)
+     offs_k = tl.arange(0, HEAD_DIM)
+     qT_ptrs = Q + offs_m[None, :] * stride_tok + offs_k[:, None] * stride_d
+     do_ptrs = DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
+     # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
+     tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
+     curr_m = start_m
+     step_m = BLOCK_M1
+     for blk_idx in range(num_steps):
+         qT = tl.load(qT_ptrs)
+         # Load m before computing qk to reduce pipeline stall.
+         offs_m = curr_m + tl.arange(0, BLOCK_M1)
+         m = tl.load(M + offs_m)
+         qkT = tl.dot(k, qT)
+         pT = tl.math.exp2(qkT - m[None, :])
+         # Autoregressive masking.
+         if MASK:
+             mask = (offs_m[None, :] >= offs_n[:, None])
+             pT = tl.where(mask, pT, 0.0)
+         do = tl.load(do_ptrs)
+         # Compute dV.
+         ppT = pT
+         ppT = ppT.to(tl.float16)
+         dv += tl.dot(ppT, do)
+         # D (= delta) is pre-divided by ds_scale.
+         Di = tl.load(D + offs_m)
+         # Compute dP and dS.
+         dpT = tl.dot(v, tl.trans(do)).to(tl.float32)
+         dsT = pT * (dpT - Di[None, :])
+         dsT = dsT.to(tl.float16)
+         dk += tl.dot(dsT, tl.trans(qT))
+         # Increment pointers.
+         curr_m += step_m
+         qT_ptrs += step_m * stride_tok
+         do_ptrs += step_m * stride_tok
+     return dk, dv
+
+
+ # the main inner-loop logic for computing dQ
+ @triton.jit
+ def _attn_bwd_dq(dq, q, K, V,  #
+                  do, m, D,
+                  # shared by Q/K/V/DO.
+                  stride_tok, stride_d,  #
+                  H, N_CTX,  #
+                  BLOCK_M2: tl.constexpr,  #
+                  BLOCK_N2: tl.constexpr,  #
+                  HEAD_DIM: tl.constexpr,
+                  # Filled in by the wrapper.
+                  start_m, start_n, num_steps,  #
+                  MASK: tl.constexpr):
+     offs_m = start_m + tl.arange(0, BLOCK_M2)
+     offs_n = start_n + tl.arange(0, BLOCK_N2)
+     offs_k = tl.arange(0, HEAD_DIM)
+     kT_ptrs = K + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d
+     vT_ptrs = V + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d
+     # D (= delta) is pre-divided by ds_scale.
+     Di = tl.load(D + offs_m)
+     # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
+     tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
+     curr_n = start_n
+     step_n = BLOCK_N2
+     for blk_idx in range(num_steps):
+         kT = tl.load(kT_ptrs)
+         vT = tl.load(vT_ptrs)
+         qk = tl.dot(q, kT)
+         p = tl.math.exp2(qk - m)
+         # Autoregressive masking.
+         if MASK:
+             offs_n = curr_n + tl.arange(0, BLOCK_N2)
+             mask = (offs_m[:, None] >= offs_n[None, :])
+             p = tl.where(mask, p, 0.0)
+         # Compute dP and dS.
+         dp = tl.dot(do, vT).to(tl.float32)
+         ds = p * (dp - Di[:, None])
+         ds = ds.to(tl.float16)
+         # Compute dQ.
+         # NOTE: We need to de-scale dq in the end, because kT was pre-scaled.
+         dq += tl.dot(ds, tl.trans(kT))
+         # Increment pointers.
+         curr_n += step_n
+         kT_ptrs += step_n * stride_tok
+         vT_ptrs += step_n * stride_tok
+     return dq
+
+
+ @triton.jit
+ def _attn_bwd(Q, K, V, sm_scale,  #
+               DO,  #
+               DQ, DK, DV,  #
+               M, D,
+               # shared by Q/K/V/DO.
+               stride_z, stride_h, stride_tok, stride_d,  #
+               H, N_CTX,  #
+               BLOCK_M1: tl.constexpr,  #
+               BLOCK_N1: tl.constexpr,  #
+               BLOCK_M2: tl.constexpr,  #
+               BLOCK_N2: tl.constexpr,  #
+               BLK_SLICE_FACTOR: tl.constexpr,  #
+               HEAD_DIM: tl.constexpr):
+     LN2: tl.constexpr = 0.6931471824645996  # = ln(2)
+
+     bhid = tl.program_id(2)
+     off_chz = (bhid * N_CTX).to(tl.int64)
+     adj = (stride_h * (bhid % H) + stride_z * (bhid // H)).to(tl.int64)
+     pid = tl.program_id(0)
+
+     # offset pointers for batch/head
+     Q += adj
+     K += adj
+     V += adj
+     DO += adj
+     DQ += adj
+     DK += adj
+     DV += adj
+     M += off_chz
+     D += off_chz
+
+     # load scales
+     offs_k = tl.arange(0, HEAD_DIM)
+
+     start_n = pid * BLOCK_N1
+     start_m = start_n
+
+     MASK_BLOCK_M1: tl.constexpr = BLOCK_M1 // BLK_SLICE_FACTOR
+     offs_n = start_n + tl.arange(0, BLOCK_N1)
+
+     dv = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
+     dk = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
+
+     # load K and V: they stay in SRAM throughout the inner loop.
+     k = tl.load(K + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)
+     v = tl.load(V + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)
+
+     num_steps = BLOCK_N1 // MASK_BLOCK_M1
+
+     dk, dv = _attn_bwd_dkdv(dk, dv,  #
+                             Q, k, v, sm_scale,  #
+                             DO,  #
+                             M, D,  #
+                             stride_tok, stride_d,  #
+                             H, N_CTX,  #
+                             MASK_BLOCK_M1, BLOCK_N1, HEAD_DIM,  #
+                             start_n, start_m, num_steps,  #
+                             MASK=True  #
+                             )
+
+     start_m += num_steps * MASK_BLOCK_M1
+     num_steps = (N_CTX - start_m) // BLOCK_M1
+
+     # Compute dK and dV for non-masked blocks.
+     dk, dv = _attn_bwd_dkdv(  #
+         dk, dv,  #
+         Q, k, v, sm_scale,  #
+         DO,  #
+         M, D,  #
+         stride_tok, stride_d,  #
+         H, N_CTX,  #
+         BLOCK_M1, BLOCK_N1, HEAD_DIM,  #
+         start_n, start_m, num_steps,  #
+         MASK=False  #
+     )
+
+     dv_ptrs = DV + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
+     tl.store(dv_ptrs, dv)
+
+     # Write back dK.
+     dk *= sm_scale
+     dk_ptrs = DK + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
+     tl.store(dk_ptrs, dk)
+
+     # THIS BLOCK DOES DQ:
+     start_m = pid * BLOCK_M2
+     end_n = start_m + BLOCK_M2
+
+     MASK_BLOCK_N2: tl.constexpr = BLOCK_N2 // BLK_SLICE_FACTOR
+     offs_m = start_m + tl.arange(0, BLOCK_M2)
+
+     q = tl.load(Q + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)
+     dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
+     do = tl.load(DO + offs_m[:, None] * stride_tok +
+                  offs_k[None, :] * stride_d)
+
+     m = tl.load(M + offs_m)
+     m = m[:, None]
+
+     # Compute dQ for masked (diagonal) blocks.
+     # NOTE: This code scans each row of QK^T backward (from right to left,
+     # but inside each call to _attn_bwd_dq, from left to right), but that's
+     # not due to anything important. I just wanted to reuse the loop
+     # structure for dK & dV above as much as possible.
+     num_steps = BLOCK_M2 // MASK_BLOCK_N2
+     dq = _attn_bwd_dq(dq, q, K, V,  #
+                       do, m, D,  #
+                       stride_tok, stride_d,  #
+                       H, N_CTX,  #
+                       BLOCK_M2, MASK_BLOCK_N2, HEAD_DIM,  #
+                       start_m, end_n - num_steps * MASK_BLOCK_N2, num_steps,  #
+                       MASK=True  #
+                       )
+     end_n -= num_steps * MASK_BLOCK_N2
+     # stage 2
+     num_steps = end_n // BLOCK_N2
+     dq = _attn_bwd_dq(dq, q, K, V,  #
+                       do, m, D,  #
+                       stride_tok, stride_d,  #
+                       H, N_CTX,  #
+                       BLOCK_M2, BLOCK_N2, HEAD_DIM,  #
+                       start_m, end_n - num_steps * BLOCK_N2, num_steps,  #
+                       MASK=False  #
+                       )
+     # Write back dQ.
+     dq_ptrs = DQ + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
+     dq *= LN2
+     tl.store(dq_ptrs, dq)
+
+
+ class _attention(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, q, k, v, causal, sm_scale):
+         # shape constraints
+         HEAD_DIM_Q, HEAD_DIM_K = q.shape[-1], k.shape[-1]
+         # when v is in float8_e5m2 it is transposed.
+         HEAD_DIM_V = v.shape[-2] if v.dtype == torch.float8_e5m2 else v.shape[-1]
+         assert HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V
+         assert HEAD_DIM_K in {16, 32, 64, 128, 256}
+         o = torch.empty_like(q)
+         stage = 3 if causal else 1
+         extra_kern_args = {}
+         # Tuning for AMD target
+         if is_hip():
+             waves_per_eu = 3 if HEAD_DIM_K <= 64 else 2
+             extra_kern_args = {"waves_per_eu": waves_per_eu,
+                                "allow_flush_denorm": True}
+
+         def grid(args):
+             return (triton.cdiv(q.shape[2], args["BLOCK_M"]), q.shape[0] * q.shape[1], 1)
+         M = torch.empty((q.shape[0], q.shape[1], q.shape[2]),
+                         device=q.device, dtype=torch.float32)
+         _attn_fwd[grid](
+             q, k, v, sm_scale, M, o,  #
+             q.stride(0), q.stride(1), q.stride(2), q.stride(3),  #
+             k.stride(0), k.stride(1), k.stride(2), k.stride(3),  #
+             v.stride(0), v.stride(1), v.stride(2), v.stride(3),  #
+             o.stride(0), o.stride(1), o.stride(2), o.stride(3),  #
+             q.shape[0], q.shape[1],  #
+             N_CTX=q.shape[2],  #
+             HEAD_DIM=HEAD_DIM_K,  #
+             STAGE=stage,  #
+             **extra_kern_args)
+
+         ctx.save_for_backward(q, k, v, o, M)
+         ctx.grid = grid
+         ctx.sm_scale = sm_scale
+         ctx.HEAD_DIM = HEAD_DIM_K
+         ctx.causal = causal
+         return o
+
+     @staticmethod
+     def backward(ctx, do):
+         q, k, v, o, M = ctx.saved_tensors
+         assert do.is_contiguous()
+         assert q.stride() == k.stride() == v.stride() == o.stride() == do.stride()
+         dq = torch.empty_like(q)
+         dk = torch.empty_like(k)
+         dv = torch.empty_like(v)
+         BATCH, N_HEAD, N_CTX = q.shape[:3]
+         PRE_BLOCK = 128
+         NUM_WARPS, NUM_STAGES = 4, 5
+         BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 32, 128, 128, 32
+         BLK_SLICE_FACTOR = 2
+         RCP_LN2 = 1.4426950408889634  # = 1.0 / ln(2)
+         arg_k = k
+         arg_k = arg_k * (ctx.sm_scale * RCP_LN2)
+         PRE_BLOCK = 128
+         assert N_CTX % PRE_BLOCK == 0
+         pre_grid = (N_CTX // PRE_BLOCK, BATCH * N_HEAD)
+         delta = torch.empty_like(M)
+         _attn_bwd_preprocess[pre_grid](
+             o, do,  #
+             delta,  #
+             BATCH, N_HEAD, N_CTX,  #
+             BLOCK_M=PRE_BLOCK, HEAD_DIM=ctx.HEAD_DIM  #
+         )
+         grid = (N_CTX // BLOCK_N1, 1, BATCH * N_HEAD)
+         _attn_bwd[grid](
+             q, arg_k, v, ctx.sm_scale, do, dq, dk, dv,  #
+             M, delta,  #
+             q.stride(0), q.stride(1), q.stride(2), q.stride(3),  #
+             N_HEAD, N_CTX,  #
+             BLOCK_M1=BLOCK_M1, BLOCK_N1=BLOCK_N1,  #
+             BLOCK_M2=BLOCK_M2, BLOCK_N2=BLOCK_N2,  #
+             BLK_SLICE_FACTOR=BLK_SLICE_FACTOR,  #
+             HEAD_DIM=ctx.HEAD_DIM,  #
+             num_warps=NUM_WARPS,  #
+             num_stages=NUM_STAGES  #
+         )
+
+         return dq, dk, dv, None, None
+
+
+ attention = _attention.apply
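Since the backward kernels are wired into _attention.backward, the exported `attention` callable can also be exercised end to end with autograd. Note that the backward path applies causal masking regardless of ctx.causal, so a gradient comparison should use the causal case. A minimal sketch of such a check (assuming a CUDA device, fp16 inputs, a sequence length that is a multiple of 128 as required by the asserts in backward, and loose fp16 tolerances; none of these names come from the committed files):

import torch

from triton_flash_atn import attention

torch.manual_seed(0)
batch, heads, seq_len, head_dim = 1, 2, 256, 64   # seq_len must be a multiple of 128
q, k, v = (torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16,
                       device="cuda", requires_grad=True) for _ in range(3))
sm_scale = 1.0 / (head_dim ** 0.5)
do = torch.randn_like(q)

# Triton forward + backward.
out_tri = attention(q, k, v, True, sm_scale)
out_tri.backward(do)
dq_tri, dk_tri, dv_tri = (t.grad.clone() for t in (q, k, v))
q.grad = k.grad = v.grad = None

# Plain PyTorch reference (fp32 softmax), causal.
keep_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device="cuda").tril()
scores = (q @ k.transpose(-2, -1)).float() * sm_scale
out_ref = (torch.softmax(scores.masked_fill(~keep_mask, float("-inf")), dim=-1)
           .to(q.dtype) @ v)
out_ref.backward(do)

print("out max abs diff:", (out_tri - out_ref).abs().max().item())
print("dq  max abs diff:", (dq_tri - q.grad).abs().max().item())
print("dk  max abs diff:", (dk_tri - k.grad).abs().max().item())
print("dv  max abs diff:", (dv_tri - v.grad).abs().max().item())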