kernels-community
/

flash-mla

kernel

flash-mla

deepseek

kernel-builder

Model card Files Files and versions Community

drbh committed on Feb 28

Commit

5cb0596

1 Parent(s): 8acf152

fix: readability refactors

Browse files

Files changed (1) hide show

flash_mla/flash_mla_api.cu +6 -43

flash_mla/flash_mla_api.cu CHANGED Viewed

@@ -1,8 +1,6 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include <cutlass/fast_math.h>
 #include "flash_mla.h"
@@ -12,42 +10,6 @@
 // CHECK_SHAPE(x, dims...): passes `x.sizes() == torch::IntArrayRef({dims...})`
 // to TORCH_CHECK. The failure message is assembled from the stringified
 // argument expression and the stringified dimension list exactly as written
 // at the call site: `<x> must have shape (<dims>)`.
 // NOTE(review): `x` is expanded without parentheses, so it should be a plain
 // identifier or postfix expression.
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
 // CHECK_CONTIGUOUS(x): passes `x.is_contiguous()` to TORCH_CHECK. The failure
 // message is the stringified argument expression followed by
 // " must be contiguous".
 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-//
-// #include <cmath>
-// #include "cute/tensor.hpp"
-#include <cute/tensor.hpp>
-// __global__ void relu_kernel(float *__restrict__ out,
-//                             float const *__restrict__ input,
-//                             const int d) {
-//   const int64_t token_idx = blockIdx.x;
-//   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-//     auto x = input[token_idx * d + idx];
-//     out[token_idx * d + idx] = x > 0.0f ? x : 0.0f;
-//   }
-// }
-// void relu(torch::Tensor &out,
-//           torch::Tensor const &input)
-// {
-//   TORCH_CHECK(input.scalar_type() == at::ScalarType::Float &&
-//                   input.scalar_type() == at::ScalarType::Float,
-//               "relu_kernel only supports float32");
-//   int d = input.size(-1);
-//   int64_t num_tokens = input.numel() / d;
-//   dim3 grid(num_tokens);
-//   dim3 block(std::min(d, 1024));
-//   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-//   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-//   relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(),
-//                                           input.data_ptr<float>(), d);
-// }
 std::vector<at::Tensor>
 get_mla_metadata(
     at::Tensor &seqlens_k,
@@ -98,16 +60,17 @@ mha_fwd_kvcache_mla(
     // TODO: fix for optional
     // std::optional<const at::Tensor> &vcache_,    // num_blocks x page_block_size x num_heads_k x head_size_v
-    const at::Tensor &vcache_,    // num_blocks x page_block_size x num_heads_k x head_size_v
     const int64_t head_size_v,
-    const at::Tensor &seqlens_k,                 // batch_size
-    const at::Tensor &block_table,               // batch_size x max_num_blocks_per_seq
     // TODO: should be float
     const double softmax_scale,
     const bool is_causal_,
-    const at::Tensor &tile_scheduler_metadata,   // num_sm_parts x TileSchedulerMetaDataSize
-    const at::Tensor &num_splits,                 // batch_size + 1
     // TODO: remove this once determined why build is adding this parameter
     const int64_t unknown_param

 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include <cutlass/fast_math.h>
 #include "flash_mla.h"
 // CHECK_SHAPE(x, dims...): passes `x.sizes() == torch::IntArrayRef({dims...})`
 // to TORCH_CHECK. The failure message is assembled from the stringified
 // argument expression and the stringified dimension list exactly as written
 // at the call site: `<x> must have shape (<dims>)`.
 // NOTE(review): `x` is expanded without parentheses, so it should be a plain
 // identifier or postfix expression.
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
 // CHECK_CONTIGUOUS(x): passes `x.is_contiguous()` to TORCH_CHECK. The failure
 // message is the stringified argument expression followed by
 // " must be contiguous".
 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 std::vector<at::Tensor>
 get_mla_metadata(
     at::Tensor &seqlens_k,
     // TODO: fix for optional
     // std::optional<const at::Tensor> &vcache_,    // num_blocks x page_block_size x num_heads_k x head_size_v
+    const at::Tensor &vcache_,                      // num_blocks x page_block_size x num_heads_k x head_size_v
     const int64_t head_size_v,
+    const at::Tensor &seqlens_k,                    // batch_size
+    const at::Tensor &block_table,                  // batch_size x max_num_blocks_per_seq
     // TODO: should be float
     const double softmax_scale,
     const bool is_causal_,
+    const at::Tensor &tile_scheduler_metadata,      // num_sm_parts x TileSchedulerMetaDataSize
+    const at::Tensor &num_splits,                   // batch_size + 1
     // TODO: remove this once determined why build is adding this parameter
     const int64_t unknown_param