danieldk (HF Staff) committed
Commit 8aa00a3 · 1 parent: d26f884

Sync to vLLM 20250627

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. attention/attention_dtypes.h +7 -0
  2. attention/attention_generic.cuh +65 -0
  3. attention/dtype_bfloat16.cuh +463 -0
  4. attention/dtype_float16.cuh +504 -0
  5. attention/dtype_float32.cuh +251 -0
  6. attention/dtype_fp8.cuh +41 -0
  7. build.toml +236 -87
  8. compressed_tensors/int8_quant_kernels.cu +154 -104
  9. core/math.hpp +23 -2
  10. core/registration.h +0 -27
  11. core/scalar_type.hpp +4 -1
  12. cutlass_extensions/common.hpp +38 -11
  13. cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +14 -12
  14. cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +166 -33
  15. cutlass_w8a8/Epilogues.md +32 -12
  16. cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu +23 -0
  17. cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +279 -0
  18. cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu +1 -2
  19. cutlass_w8a8/c3x/scaled_mm_helper.hpp +75 -0
  20. cutlass_w8a8/c3x/scaled_mm_kernels.hpp +5 -0
  21. cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh +72 -3
  22. cutlass_w8a8/common.hpp +0 -27
  23. cutlass_w8a8/scaled_mm_c2x.cuh +8 -3
  24. cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh +1 -1
  25. cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh +1 -1
  26. cutlass_w8a8/scaled_mm_c3x.cu +0 -87
  27. cutlass_w8a8/scaled_mm_c3x.cuh +0 -160
  28. cutlass_w8a8/scaled_mm_c3x_sm100.cu +5 -21
  29. cutlass_w8a8/scaled_mm_c3x_sm90.cu +5 -50
  30. cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh +0 -96
  31. cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh +0 -140
  32. cutlass_w8a8/scaled_mm_entry.cu +55 -13
  33. dispatch_utils.h +48 -0
  34. flake.lock +78 -27
  35. fp8/amd/hip_float8.h +0 -137
  36. fp8/amd/hip_float8_impl.h +0 -316
  37. fp8/amd/quant_utils.cuh +262 -168
  38. fp8/common.cu +58 -40
  39. fp8/common.cuh +60 -55
  40. fp8/nvidia/quant_utils.cuh +1 -1
  41. gptq_marlin/awq_marlin_repack.cu +5 -5
  42. gptq_marlin/dequant.h +507 -0
  43. gptq_marlin/generate_kernels.py +126 -0
  44. gptq_marlin/gptq_marlin.cu +0 -0
  45. gptq_marlin/gptq_marlin_repack.cu +7 -8
  46. gptq_marlin/kernel.h +38 -0
  47. gptq_marlin/kernel_bf16_kfe2m1f.cu +39 -0
  48. gptq_marlin/kernel_bf16_kfe4m3fn.cu +69 -0
  49. gptq_marlin/kernel_bf16_ku4.cu +129 -0
  50. gptq_marlin/kernel_bf16_ku4b8.cu +159 -0
attention/attention_dtypes.h ADDED
@@ -0,0 +1,7 @@
+ #pragma once
+
+ #include "attention_generic.cuh"
+ #include "dtype_float16.cuh"
+ #include "dtype_float32.cuh"
+ #include "dtype_bfloat16.cuh"
+ #include "dtype_fp8.cuh"
attention/attention_generic.cuh ADDED
@@ -0,0 +1,65 @@
+ /*
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Copyright (c) 2023, The vLLM team.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ #pragma once
+
+ #include <stdint.h>
+
+ namespace vllm {
+
+ // A vector type to store Q, K, V elements.
+ template <typename T, int VEC_SIZE>
+ struct Vec {};
+
+ // A vector type to store FP32 accumulators.
+ template <typename T>
+ struct FloatVec {};
+
+ // Template vector operations.
+ template <typename Acc, typename A, typename B>
+ inline __device__ Acc mul(A a, B b);
+
+ template <typename T>
+ inline __device__ float sum(T v);
+
+ template <typename T>
+ inline __device__ float dot(T a, T b) {
+ return sum(mul<T, T, T>(a, b));
+ }
+
+ template <typename A, typename T>
+ inline __device__ float dot(T a, T b) {
+ return sum(mul<A, T, T>(a, b));
+ }
+
+ template <typename T>
+ inline __device__ void zero(T& dst) {
+ constexpr int WORDS = sizeof(T) / 4;
+ union {
+ T raw;
+ uint32_t words[WORDS];
+ } tmp;
+
+ #pragma unroll
+ for (int ii = 0; ii < WORDS; ++ii) {
+ tmp.words[ii] = 0u;
+ }
+ dst = tmp.raw;
+ }
+
+ } // namespace vllm
attention/dtype_bfloat16.cuh ADDED
@@ -0,0 +1,463 @@
1
+ /*
2
+ * Adapted from
3
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
4
+ * and
5
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
6
+ * Copyright (c) 2023, The vLLM team.
7
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
8
+ *
9
+ * Licensed under the Apache License, Version 2.0 (the "License");
10
+ * you may not use this file except in compliance with the License.
11
+ * You may obtain a copy of the License at
12
+ *
13
+ * http://www.apache.org/licenses/LICENSE-2.0
14
+ *
15
+ * Unless required by applicable law or agreed to in writing, software
16
+ * distributed under the License is distributed on an "AS IS" BASIS,
17
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ * See the License for the specific language governing permissions and
19
+ * limitations under the License.
20
+ */
21
+ #pragma once
22
+
23
+ #include "attention_generic.cuh"
24
+ #include "dtype_float32.cuh"
25
+
26
+ #ifndef USE_ROCM
27
+ #include <cuda_bf16.h>
28
+ #include <cuda_fp16.h>
29
+ #else
30
+ #include <hip/hip_bf16.h>
31
+ #include <hip/hip_fp16.h>
32
+
33
+ typedef __hip_bfloat162 __nv_bfloat162;
34
+ typedef __hip_bfloat16 __nv_bfloat16;
35
+ #endif
36
+
37
+ #include <stdint.h>
38
+
39
+ namespace vllm {
40
+
41
+ // Define custom BF16 vector data types.
42
+ struct bf16_4_t {
43
+ __nv_bfloat162 x;
44
+ __nv_bfloat162 y;
45
+ };
46
+
47
+ struct bf16_8_t {
48
+ __nv_bfloat162 x;
49
+ __nv_bfloat162 y;
50
+ __nv_bfloat162 z;
51
+ __nv_bfloat162 w;
52
+ };
53
+
54
+ // BF16 vector types for Q, K, V.
55
+ template <>
56
+ struct Vec<__nv_bfloat16, 1> {
57
+ using Type = __nv_bfloat16;
58
+ };
59
+ template <>
60
+ struct Vec<__nv_bfloat16, 2> {
61
+ using Type = __nv_bfloat162;
62
+ };
63
+ template <>
64
+ struct Vec<__nv_bfloat16, 4> {
65
+ using Type = bf16_4_t;
66
+ };
67
+ template <>
68
+ struct Vec<__nv_bfloat16, 8> {
69
+ using Type = bf16_8_t;
70
+ };
71
+
72
+ // FP32 accumulator vector types corresponding to Vec.
73
+ template <>
74
+ struct FloatVec<__nv_bfloat16> {
75
+ using Type = float;
76
+ };
77
+ template <>
78
+ struct FloatVec<__nv_bfloat162> {
79
+ using Type = float2;
80
+ };
81
+ template <>
82
+ struct FloatVec<bf16_4_t> {
83
+ using Type = Float4_;
84
+ };
85
+ template <>
86
+ struct FloatVec<bf16_8_t> {
87
+ using Type = Float8_;
88
+ };
89
+
90
+ // Utility functions for type conversions.
91
+ inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
92
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
93
+ assert(false);
94
+ #else
95
+ return __bfloat1622float2(val);
96
+ #endif
97
+ __builtin_unreachable(); // Suppress missing return statement warning
98
+ }
99
+
100
+ inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
101
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
102
+ assert(false);
103
+ #else
104
+ return __bfloat162bfloat162(val);
105
+ #endif
106
+ __builtin_unreachable(); // Suppress missing return statement warning
107
+ }
108
+
109
+ // Vector addition.
110
+ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
111
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
112
+ assert(false);
113
+ #else
114
+ #ifndef USE_ROCM
115
+ return a + b;
116
+ #else
117
+ return __hadd(a, b);
118
+ #endif
119
+ #endif
120
+ __builtin_unreachable(); // Suppress missing return statement warning
121
+ }
122
+
123
+ inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) {
124
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
125
+ assert(false);
126
+ #else
127
+ return __hadd2(a, b);
128
+ #endif
129
+ __builtin_unreachable(); // Suppress missing return statement warning
130
+ }
131
+
132
+ inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) {
133
+ bf16_4_t c;
134
+ c.x = add(a.x, b.x);
135
+ c.y = add(a.y, b.y);
136
+ return c;
137
+ }
138
+
139
+ inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b) {
140
+ bf16_8_t c;
141
+ c.x = add(a.x, b.x);
142
+ c.y = add(a.y, b.y);
143
+ c.z = add(a.z, b.z);
144
+ c.w = add(a.w, b.w);
145
+ return c;
146
+ }
147
+
148
+ inline __device__ float2 add(__nv_bfloat162 a, float2 fb) {
149
+ float2 fa = bf1622float2(a);
150
+ return add(fa, fb);
151
+ }
152
+
153
+ inline __device__ Float4_ add(bf16_4_t a, Float4_ fb) {
154
+ Float4_ fc;
155
+ fc.x = add(a.x, fb.x);
156
+ fc.y = add(a.y, fb.y);
157
+ return fc;
158
+ }
159
+
160
+ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
161
+ Float8_ fc;
162
+ fc.x = add(a.x, fb.x);
163
+ fc.y = add(a.y, fb.y);
164
+ fc.z = add(a.z, fb.z);
165
+ fc.w = add(a.w, fb.w);
166
+ return fc;
167
+ }
168
+
169
+ // Vector multiplication.
170
+ template <>
171
+ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
172
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
173
+ assert(false);
174
+ #else
175
+ return __hmul(a, b);
176
+ #endif
177
+ __builtin_unreachable(); // Suppress missing return statement warning
178
+ }
179
+
180
+ template <>
181
+ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
182
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
183
+ assert(false);
184
+ #else
185
+ return __hmul2(a, b);
186
+ #endif
187
+ __builtin_unreachable(); // Suppress missing return statement warning
188
+ }
189
+
190
+ template <>
191
+ inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
192
+ return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
193
+ }
194
+
195
+ template <>
196
+ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
197
+ bf16_4_t c;
198
+ c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
199
+ c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y);
200
+ return c;
201
+ }
202
+
203
+ template <>
204
+ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
205
+ __nv_bfloat162 s = bf162bf162(a);
206
+ bf16_4_t c;
207
+ c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x);
208
+ c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y);
209
+ return c;
210
+ }
211
+
212
+ template <>
213
+ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
214
+ bf16_8_t c;
215
+ c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
216
+ c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y);
217
+ c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.z, b.z);
218
+ c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.w, b.w);
219
+ return c;
220
+ }
221
+
222
+ template <>
223
+ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
224
+ __nv_bfloat162 s = bf162bf162(a);
225
+ bf16_8_t c;
226
+ c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x);
227
+ c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y);
228
+ c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.z);
229
+ c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.w);
230
+ return c;
231
+ }
232
+
233
+ template <>
234
+ inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
235
+ float fa = __bfloat162float(a);
236
+ float fb = __bfloat162float(b);
237
+ return fa * fb;
238
+ }
239
+
240
+ template <>
241
+ inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
242
+ float2 fa = bf1622float2(a);
243
+ float2 fb = bf1622float2(b);
244
+ return mul<float2, float2, float2>(fa, fb);
245
+ }
246
+
247
+ template <>
248
+ inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
249
+ return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
250
+ }
251
+
252
+ template <>
253
+ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
254
+ Float4_ fc;
255
+ fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
256
+ fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.y, b.y);
257
+ return fc;
258
+ }
259
+
260
+ template <>
261
+ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
262
+ __nv_bfloat162 s = bf162bf162(a);
263
+ Float4_ fc;
264
+ fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.x);
265
+ fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.y);
266
+ return fc;
267
+ }
268
+
269
+ template <>
270
+ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
271
+ Float8_ fc;
272
+ fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
273
+ fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.y, b.y);
274
+ fc.z = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.z, b.z);
275
+ fc.w = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.w, b.w);
276
+ return fc;
277
+ }
278
+
279
+ template <>
280
+ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
281
+ __nv_bfloat162 s = bf162bf162(a);
282
+ Float8_ fc;
283
+ fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.x);
284
+ fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.y);
285
+ fc.z = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.z);
286
+ fc.w = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.w);
287
+ return fc;
288
+ }
289
+
290
+ // Vector fused multiply-add.
291
+ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
292
+ __nv_bfloat162 c) {
293
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
294
+ assert(false);
295
+ #else
296
+ return __hfma2(a, b, c);
297
+ #endif
298
+ __builtin_unreachable(); // Suppress missing return statement warning
299
+ }
300
+
301
+ inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
302
+ __nv_bfloat162 c) {
303
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
304
+ assert(false);
305
+ #else
306
+ return __hfma2(bf162bf162(a), b, c);
307
+ #endif
308
+ __builtin_unreachable(); // Suppress missing return statement warning
309
+ }
310
+
311
+ inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) {
312
+ bf16_4_t d;
313
+ d.x = fma(a.x, b.x, c.x);
314
+ d.y = fma(a.y, b.y, c.y);
315
+ return d;
316
+ }
317
+
318
+ inline __device__ bf16_4_t fma(__nv_bfloat16 a, bf16_4_t b, bf16_4_t c) {
319
+ __nv_bfloat162 s = bf162bf162(a);
320
+ bf16_4_t d;
321
+ d.x = fma(s, b.x, c.x);
322
+ d.y = fma(s, b.y, c.y);
323
+ return d;
324
+ }
325
+
326
+ inline __device__ bf16_8_t fma(bf16_8_t a, bf16_8_t b, bf16_8_t c) {
327
+ bf16_8_t d;
328
+ d.x = fma(a.x, b.x, c.x);
329
+ d.y = fma(a.y, b.y, c.y);
330
+ d.z = fma(a.z, b.z, c.z);
331
+ d.w = fma(a.w, b.w, c.w);
332
+ return d;
333
+ }
334
+
335
+ inline __device__ bf16_8_t fma(__nv_bfloat16 a, bf16_8_t b, bf16_8_t c) {
336
+ __nv_bfloat162 s = bf162bf162(a);
337
+ bf16_8_t d;
338
+ d.x = fma(s, b.x, c.x);
339
+ d.y = fma(s, b.y, c.y);
340
+ d.z = fma(s, b.z, c.z);
341
+ d.w = fma(s, b.w, c.w);
342
+ return d;
343
+ }
344
+
345
+ inline __device__ float fma(__nv_bfloat16 a, __nv_bfloat16 b, float fc) {
346
+ return __bfloat162float(a) * __bfloat162float(b) + fc;
347
+ }
348
+
349
+ inline __device__ float2 fma(__nv_bfloat162 a, __nv_bfloat162 b, float2 fc) {
350
+ float2 fa = bf1622float2(a);
351
+ float2 fb = bf1622float2(b);
352
+ return fma(fa, fb, fc);
353
+ }
354
+
355
+ inline __device__ float2 fma(__nv_bfloat16 a, __nv_bfloat162 b, float2 fc) {
356
+ return fma(bf162bf162(a), b, fc);
357
+ }
358
+
359
+ inline __device__ Float4_ fma(bf16_4_t a, bf16_4_t b, Float4_ fc) {
360
+ Float4_ fd;
361
+ fd.x = fma(a.x, b.x, fc.x);
362
+ fd.y = fma(a.y, b.y, fc.y);
363
+ return fd;
364
+ }
365
+
366
+ inline __device__ Float4_ fma(__nv_bfloat16 a, bf16_4_t b, Float4_ fc) {
367
+ __nv_bfloat162 s = bf162bf162(a);
368
+ Float4_ fd;
369
+ fd.x = fma(s, b.x, fc.x);
370
+ fd.y = fma(s, b.y, fc.y);
371
+ return fd;
372
+ }
373
+
374
+ inline __device__ Float8_ fma(bf16_8_t a, bf16_8_t b, Float8_ fc) {
375
+ Float8_ fd;
376
+ fd.x = fma(a.x, b.x, fc.x);
377
+ fd.y = fma(a.y, b.y, fc.y);
378
+ fd.z = fma(a.z, b.z, fc.z);
379
+ fd.w = fma(a.w, b.w, fc.w);
380
+ return fd;
381
+ }
382
+
383
+ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
384
+ __nv_bfloat162 s = bf162bf162(a);
385
+ Float8_ fd;
386
+ fd.x = fma(s, b.x, fc.x);
387
+ fd.y = fma(s, b.y, fc.y);
388
+ fd.z = fma(s, b.z, fc.z);
389
+ fd.w = fma(s, b.w, fc.w);
390
+ return fd;
391
+ }
392
+
393
+ // Vector sum.
394
+ template <>
395
+ inline __device__ float sum(__nv_bfloat16 v) {
396
+ return __bfloat162float(v);
397
+ }
398
+
399
+ template <>
400
+ inline __device__ float sum(__nv_bfloat162 v) {
401
+ float2 vf = bf1622float2(v);
402
+ return vf.x + vf.y;
403
+ }
404
+
405
+ template <>
406
+ inline __device__ float sum(bf16_4_t v) {
407
+ return sum(v.x) + sum(v.y);
408
+ }
409
+
410
+ template <>
411
+ inline __device__ float sum(bf16_8_t v) {
412
+ return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
413
+ }
414
+
415
+ // From float32 to bfloat16.
416
+ inline __device__ void from_float(__nv_bfloat16& dst, float src) {
417
+ dst = __float2bfloat16(src);
418
+ }
419
+
420
+ inline __device__ void from_float(__nv_bfloat162& dst, float2 src) {
421
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
422
+ assert(false);
423
+ #else
424
+ dst = __float22bfloat162_rn(src);
425
+ #endif
426
+ }
427
+
428
+ inline __device__ void from_float(bf16_4_t& dst, Float4_ src) {
429
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
430
+ assert(false);
431
+ #else
432
+ dst.x = __float22bfloat162_rn(src.x);
433
+ dst.y = __float22bfloat162_rn(src.y);
434
+ #endif
435
+ }
436
+
437
+ inline __device__ void from_float(bf16_8_t& dst, Float8_ src) {
438
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
439
+ assert(false);
440
+ #else
441
+ dst.x = __float22bfloat162_rn(src.x);
442
+ dst.y = __float22bfloat162_rn(src.y);
443
+ dst.z = __float22bfloat162_rn(src.z);
444
+ dst.w = __float22bfloat162_rn(src.w);
445
+ #endif
446
+ }
447
+
448
+ // From bfloat16 to float32.
449
+ inline __device__ float to_float(__nv_bfloat16 u) {
450
+ return __bfloat162float(u);
451
+ }
452
+
453
+ // Zero-out a variable.
454
+ inline __device__ void zero(__nv_bfloat16& dst) {
455
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
456
+ assert(false);
457
+ #else
458
+ // Same as CUDART_ZERO_BF16 introduced in CUDA 12.2.
459
+ dst = __ushort_as_bfloat16((unsigned short)0x0000U);
460
+ #endif
461
+ }
462
+
463
+ } // namespace vllm
attention/dtype_float16.cuh ADDED
@@ -0,0 +1,504 @@
1
+ /*
2
+ * Adapted from
3
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
4
+ * and
5
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
6
+ * Copyright (c) 2023, The vLLM team.
7
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
8
+ *
9
+ * Licensed under the Apache License, Version 2.0 (the "License");
10
+ * you may not use this file except in compliance with the License.
11
+ * You may obtain a copy of the License at
12
+ *
13
+ * http://www.apache.org/licenses/LICENSE-2.0
14
+ *
15
+ * Unless required by applicable law or agreed to in writing, software
16
+ * distributed under the License is distributed on an "AS IS" BASIS,
17
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ * See the License for the specific language governing permissions and
19
+ * limitations under the License.
20
+ */
21
+ #pragma once
22
+
23
+ #include "attention_generic.cuh"
24
+ #include "dtype_float32.cuh"
25
+
26
+ #ifdef USE_ROCM
27
+ #include <hip/hip_fp16.h>
28
+ #endif
29
+
30
+ #include <stdint.h>
31
+
32
+ namespace vllm {
33
+
34
+ // FP16 vector types for Q, K, V.
35
+ template <>
36
+ struct Vec<uint16_t, 1> {
37
+ using Type = uint16_t;
38
+ };
39
+ template <>
40
+ struct Vec<uint16_t, 2> {
41
+ using Type = uint32_t;
42
+ };
43
+ template <>
44
+ struct Vec<uint16_t, 4> {
45
+ using Type = uint2;
46
+ };
47
+ template <>
48
+ struct Vec<uint16_t, 8> {
49
+ using Type = uint4;
50
+ };
51
+
52
+ // FP32 accumulator vector types corresponding to Vec.
53
+ template <>
54
+ struct FloatVec<uint16_t> {
55
+ using Type = float;
56
+ };
57
+ template <>
58
+ struct FloatVec<uint32_t> {
59
+ using Type = float2;
60
+ };
61
+ template <>
62
+ struct FloatVec<uint2> {
63
+ using Type = Float4_;
64
+ };
65
+ template <>
66
+ struct FloatVec<uint4> {
67
+ using Type = Float8_;
68
+ };
69
+
70
+ // Utility functions for type conversions.
71
+ inline __device__ uint32_t h0_h0(uint16_t a) {
72
+ #ifndef USE_ROCM
73
+ uint32_t b;
74
+ asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
75
+ return b;
76
+ #else
77
+ union {
78
+ uint32_t u32;
79
+ uint16_t u16[2];
80
+ } tmp;
81
+ tmp.u16[0] = a;
82
+ tmp.u16[1] = a;
83
+ return tmp.u32;
84
+ #endif
85
+ }
86
+
87
+ inline __device__ float half_to_float(uint16_t h) {
88
+ float f;
89
+ #ifndef USE_ROCM
90
+ asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
91
+ #else
92
+ asm volatile("v_cvt_f32_f16 %0, %1;" : "=v"(f) : "v"(h));
93
+ #endif
94
+ return f;
95
+ }
96
+
97
+ inline __device__ float2 half2_to_float2(uint32_t v) {
98
+ #ifndef USE_ROCM
99
+ uint16_t lo, hi;
100
+ asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
101
+ return make_float2(half_to_float(lo), half_to_float(hi));
102
+ #else
103
+ union {
104
+ uint32_t u32;
105
+ uint16_t u16[2];
106
+ } tmp;
107
+ tmp.u32 = v;
108
+ float2 ret;
109
+ ret.x = half_to_float(tmp.u16[0]);
110
+ ret.y = half_to_float(tmp.u16[1]);
111
+ return ret;
112
+ #endif
113
+ }
114
+
115
+ inline __device__ uint16_t float_to_half(float f) {
116
+ union {
117
+ uint32_t u32;
118
+ uint16_t u16[2];
119
+ } tmp;
120
+ #ifndef USE_ROCM
121
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
122
+ #else
123
+ asm volatile("v_cvt_f16_f32 %0, %1;\n" : "=v"(tmp.u32) : "v"(f));
124
+ #endif
125
+ return tmp.u16[0];
126
+ }
127
+
128
+ inline __device__ uint32_t float2_to_half2(float2 f) {
129
+ union {
130
+ uint32_t u32;
131
+ uint16_t u16[2];
132
+ } tmp;
133
+ #ifndef USE_ROCM
134
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
135
+ asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
136
+ : "=r"(tmp.u32)
137
+ : "f"(f.y), "f"(f.x));
138
+ #else
139
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
140
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
141
+ #endif
142
+ #else
143
+ tmp.u16[0] = float_to_half(f.x);
144
+ tmp.u16[1] = float_to_half(f.y);
145
+ #endif
146
+ return tmp.u32;
147
+ }
148
+
149
+ // Vector addition.
150
+ inline __device__ uint16_t add(uint16_t a, uint16_t b) {
151
+ uint16_t c;
152
+ #ifndef USE_ROCM
153
+ asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
154
+ #else
155
+ asm volatile("v_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
156
+ #endif
157
+ return c;
158
+ }
159
+
160
+ inline __device__ uint32_t add(uint32_t a, uint32_t b) {
161
+ uint32_t c;
162
+ #ifndef USE_ROCM
163
+ asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
164
+ #else
165
+ asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
166
+ #endif
167
+ return c;
168
+ }
169
+
170
+ inline __device__ uint2 add(uint2 a, uint2 b) {
171
+ uint2 c;
172
+ c.x = add(a.x, b.x);
173
+ c.y = add(a.y, b.y);
174
+ return c;
175
+ }
176
+
177
+ inline __device__ uint4 add(uint4 a, uint4 b) {
178
+ uint4 c;
179
+ c.x = add(a.x, b.x);
180
+ c.y = add(a.y, b.y);
181
+ c.z = add(a.z, b.z);
182
+ c.w = add(a.w, b.w);
183
+ return c;
184
+ }
185
+
186
+ inline __device__ float2 add(uint32_t a, float2 fb) {
187
+ float2 fa = half2_to_float2(a);
188
+ return add(fa, fb);
189
+ }
190
+
191
+ inline __device__ Float4_ add(uint2 a, Float4_ fb) {
192
+ Float4_ fc;
193
+ fc.x = add(a.x, fb.x);
194
+ fc.y = add(a.y, fb.y);
195
+ return fc;
196
+ }
197
+
198
+ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
199
+ Float8_ fc;
200
+ fc.x = add(a.x, fb.x);
201
+ fc.y = add(a.y, fb.y);
202
+ fc.z = add(a.z, fb.z);
203
+ fc.w = add(a.w, fb.w);
204
+ return fc;
205
+ }
206
+
207
+ // Vector multiplication.
208
+ template <>
209
+ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
210
+ uint16_t c;
211
+ #ifndef USE_ROCM
212
+ asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
213
+ #else
214
+ asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
215
+ #endif
216
+ return c;
217
+ }
218
+
219
+ template <>
220
+ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
221
+ uint32_t c;
222
+ #ifndef USE_ROCM
223
+ asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
224
+ #else
225
+ asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
226
+ #endif
227
+ return c;
228
+ }
229
+
230
+ template <>
231
+ inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
232
+ return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
233
+ }
234
+
235
+ template <>
236
+ inline __device__ uint2 mul(uint2 a, uint2 b) {
237
+ uint2 c;
238
+ c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
239
+ c.y = mul<uint32_t, uint32_t, uint32_t>(a.y, b.y);
240
+ return c;
241
+ }
242
+
243
+ template <>
244
+ inline __device__ uint2 mul(uint16_t a, uint2 b) {
245
+ uint32_t s = h0_h0(a);
246
+ uint2 c;
247
+ c.x = mul<uint32_t, uint32_t, uint32_t>(s, b.x);
248
+ c.y = mul<uint32_t, uint32_t, uint32_t>(s, b.y);
249
+ return c;
250
+ }
251
+
252
+ template <>
253
+ inline __device__ uint4 mul(uint4 a, uint4 b) {
254
+ uint4 c;
255
+ c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
256
+ c.y = mul<uint32_t, uint32_t, uint32_t>(a.y, b.y);
257
+ c.z = mul<uint32_t, uint32_t, uint32_t>(a.z, b.z);
258
+ c.w = mul<uint32_t, uint32_t, uint32_t>(a.w, b.w);
259
+ return c;
260
+ }
261
+
262
+ template <>
263
+ inline __device__ uint4 mul(uint16_t a, uint4 b) {
264
+ uint32_t s = h0_h0(a);
265
+ uint4 c;
266
+ c.x = mul<uint32_t, uint32_t, uint32_t>(s, b.x);
267
+ c.y = mul<uint32_t, uint32_t, uint32_t>(s, b.y);
268
+ c.z = mul<uint32_t, uint32_t, uint32_t>(s, b.z);
269
+ c.w = mul<uint32_t, uint32_t, uint32_t>(s, b.w);
270
+ return c;
271
+ }
272
+
273
+ template <>
274
+ inline __device__ float mul(uint16_t a, uint16_t b) {
275
+ float fa = half_to_float(a);
276
+ float fb = half_to_float(b);
277
+ return fa * fb;
278
+ }
279
+
280
+ template <>
281
+ inline __device__ float2 mul(uint32_t a, uint32_t b) {
282
+ float2 fa = half2_to_float2(a);
283
+ float2 fb = half2_to_float2(b);
284
+ return mul<float2, float2, float2>(fa, fb);
285
+ }
286
+
287
+ template <>
288
+ inline __device__ float2 mul(uint16_t a, uint32_t b) {
289
+ return mul<float2, uint32_t, uint32_t>(h0_h0(a), b);
290
+ }
291
+
292
+ template <>
293
+ inline __device__ Float4_ mul(uint2 a, uint2 b) {
294
+ Float4_ fc;
295
+ fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
296
+ fc.y = mul<float2, uint32_t, uint32_t>(a.y, b.y);
297
+ return fc;
298
+ }
299
+
300
+ template <>
301
+ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
302
+ uint32_t s = h0_h0(a);
303
+ Float4_ fc;
304
+ fc.x = mul<float2, uint32_t, uint32_t>(s, b.x);
305
+ fc.y = mul<float2, uint32_t, uint32_t>(s, b.y);
306
+ return fc;
307
+ }
308
+
309
+ template <>
310
+ inline __device__ Float8_ mul(uint4 a, uint4 b) {
311
+ Float8_ fc;
312
+ fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
313
+ fc.y = mul<float2, uint32_t, uint32_t>(a.y, b.y);
314
+ fc.z = mul<float2, uint32_t, uint32_t>(a.z, b.z);
315
+ fc.w = mul<float2, uint32_t, uint32_t>(a.w, b.w);
316
+ return fc;
317
+ }
318
+
319
+ template <>
320
+ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
321
+ uint32_t s = h0_h0(a);
322
+ Float8_ fc;
323
+ fc.x = mul<float2, uint32_t, uint32_t>(s, b.x);
324
+ fc.y = mul<float2, uint32_t, uint32_t>(s, b.y);
325
+ fc.z = mul<float2, uint32_t, uint32_t>(s, b.z);
326
+ fc.w = mul<float2, uint32_t, uint32_t>(s, b.w);
327
+ return fc;
328
+ }
329
+
330
+ // Vector fused multiply-add.
331
+ inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
332
+ uint32_t d;
333
+ #ifndef USE_ROCM
334
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
335
+ : "=r"(d)
336
+ : "r"(a), "r"(b), "r"(c));
337
+ #else
338
+ asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
339
+ : "=v"(d)
340
+ : "v"(a), "v"(b), "v"(c));
341
+ #endif
342
+ return d;
343
+ }
344
+
345
+ inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) {
346
+ return fma(h0_h0(a), b, c);
347
+ }
348
+
349
+ inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) {
350
+ uint2 d;
351
+ d.x = fma(a.x, b.x, c.x);
352
+ d.y = fma(a.y, b.y, c.y);
353
+ return d;
354
+ }
355
+
356
+ inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) {
357
+ uint32_t s = h0_h0(a);
358
+ uint2 d;
359
+ d.x = fma(s, b.x, c.x);
360
+ d.y = fma(s, b.y, c.y);
361
+ return d;
362
+ }
363
+
364
+ inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) {
365
+ uint4 d;
366
+ d.x = fma(a.x, b.x, c.x);
367
+ d.y = fma(a.y, b.y, c.y);
368
+ d.z = fma(a.z, b.z, c.z);
369
+ d.w = fma(a.w, b.w, c.w);
370
+ return d;
371
+ }
372
+
373
+ inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) {
374
+ uint32_t s = h0_h0(a);
375
+ uint4 d;
376
+ d.x = fma(s, b.x, c.x);
377
+ d.y = fma(s, b.y, c.y);
378
+ d.z = fma(s, b.z, c.z);
379
+ d.w = fma(s, b.w, c.w);
380
+ return d;
381
+ }
382
+
383
+ inline __device__ float fma(uint16_t a, uint16_t b, float fc) {
384
+ float fa = half_to_float(a);
385
+ float fb = half_to_float(b);
386
+ return fa * fb + fc;
387
+ }
388
+
389
+ inline __device__ float2 fma(uint32_t a, uint32_t b, float2 fc) {
390
+ float2 fa = half2_to_float2(a);
391
+ float2 fb = half2_to_float2(b);
392
+ return fma(fa, fb, fc);
393
+ }
394
+
395
+ inline __device__ float2 fma(uint16_t a, uint32_t b, float2 fc) {
396
+ return fma(h0_h0(a), b, fc);
397
+ }
398
+
399
+ inline __device__ Float4_ fma(uint2 a, uint2 b, Float4_ fc) {
400
+ Float4_ fd;
401
+ fd.x = fma(a.x, b.x, fc.x);
402
+ fd.y = fma(a.y, b.y, fc.y);
403
+ return fd;
404
+ }
405
+
406
+ inline __device__ Float4_ fma(uint16_t a, uint2 b, Float4_ fc) {
407
+ uint32_t s = h0_h0(a);
408
+ Float4_ fd;
409
+ fd.x = fma(s, b.x, fc.x);
410
+ fd.y = fma(s, b.y, fc.y);
411
+ return fd;
412
+ }
413
+
414
+ inline __device__ Float8_ fma(uint4 a, uint4 b, Float8_ fc) {
415
+ Float8_ fd;
416
+ fd.x = fma(a.x, b.x, fc.x);
417
+ fd.y = fma(a.y, b.y, fc.y);
418
+ fd.z = fma(a.z, b.z, fc.z);
419
+ fd.w = fma(a.w, b.w, fc.w);
420
+ return fd;
421
+ }
422
+
423
+ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
424
+ uint32_t s = h0_h0(a);
425
+ Float8_ fd;
426
+ fd.x = fma(s, b.x, fc.x);
427
+ fd.y = fma(s, b.y, fc.y);
428
+ fd.z = fma(s, b.z, fc.z);
429
+ fd.w = fma(s, b.w, fc.w);
430
+ return fd;
431
+ }
432
+
433
+ // Vector sum.
434
+ template <>
435
+ inline __device__ float sum(uint16_t v) {
436
+ return half_to_float(v);
437
+ }
438
+
439
+ template <>
440
+ inline __device__ float sum(uint32_t v) {
441
+ float2 tmp = half2_to_float2(v);
442
+ return tmp.x + tmp.y;
443
+ }
444
+
445
+ template <>
446
+ inline __device__ float sum(uint2 v) {
447
+ uint32_t c = add(v.x, v.y);
448
+ return sum(c);
449
+ }
450
+
451
+ template <>
452
+ inline __device__ float sum(uint4 v) {
453
+ uint32_t c = add(v.x, v.y);
454
+ c = add(c, v.z);
455
+ c = add(c, v.w);
456
+ return sum(c);
457
+ }
458
+
459
+ // From float32 to float16.
460
+ inline __device__ void from_float(uint16_t& dst, float src) {
461
+ dst = float_to_half(src);
462
+ }
463
+
464
+ inline __device__ void from_float(uint32_t& dst, float2 src) {
465
+ dst = float2_to_half2(src);
466
+ }
467
+
468
+ inline __device__ void from_float(uint2& dst, Float4_ src) {
469
+ dst.x = float2_to_half2(src.x);
470
+ dst.y = float2_to_half2(src.y);
471
+ }
472
+
473
+ inline __device__ void from_float(uint4& dst, Float8_ src) {
474
+ dst.x = float2_to_half2(src.x);
475
+ dst.y = float2_to_half2(src.y);
476
+ dst.z = float2_to_half2(src.z);
477
+ dst.w = float2_to_half2(src.w);
478
+ }
479
+
480
+ // From float16 to float32.
481
+ inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
482
+
483
+ inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
484
+
485
+ inline __device__ Float4_ to_float(uint2 u) {
486
+ Float4_ tmp;
487
+ tmp.x = half2_to_float2(u.x);
488
+ tmp.y = half2_to_float2(u.y);
489
+ return tmp;
490
+ }
491
+
492
+ inline __device__ Float8_ to_float(uint4 u) {
493
+ Float8_ tmp;
494
+ tmp.x = half2_to_float2(u.x);
495
+ tmp.y = half2_to_float2(u.y);
496
+ tmp.z = half2_to_float2(u.z);
497
+ tmp.w = half2_to_float2(u.w);
498
+ return tmp;
499
+ }
500
+
501
+ // Zero-out a variable.
502
+ inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
503
+
504
+ } // namespace vllm
attention/dtype_float32.cuh ADDED
@@ -0,0 +1,251 @@
1
+ /*
2
+ * Adapted from
3
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
4
+ * and
5
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
6
+ * Copyright (c) 2023, The vLLM team.
7
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
8
+ *
9
+ * Licensed under the Apache License, Version 2.0 (the "License");
10
+ * you may not use this file except in compliance with the License.
11
+ * You may obtain a copy of the License at
12
+ *
13
+ * http://www.apache.org/licenses/LICENSE-2.0
14
+ *
15
+ * Unless required by applicable law or agreed to in writing, software
16
+ * distributed under the License is distributed on an "AS IS" BASIS,
17
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ * See the License for the specific language governing permissions and
19
+ * limitations under the License.
20
+ */
21
+ #pragma once
22
+
23
+ #include "attention_generic.cuh"
24
+
25
+ #include <stdint.h>
26
+
27
+ namespace vllm {
28
+
29
+ // Define custom FP32 vector data types.
30
+ struct Float4_ {
31
+ float2 x;
32
+ float2 y;
33
+ };
34
+
35
+ struct Float8_ {
36
+ float2 x;
37
+ float2 y;
38
+ float2 z;
39
+ float2 w;
40
+ };
41
+
42
+ // FP32 vector types for Q, K, V.
43
+ template <>
44
+ struct Vec<float, 1> {
45
+ using Type = float;
46
+ };
47
+ template <>
48
+ struct Vec<float, 2> {
49
+ using Type = float2;
50
+ };
51
+ template <>
52
+ struct Vec<float, 4> {
53
+ using Type = float4;
54
+ };
55
+
56
+ // FP32 accumulator vector types corresponding to Vec.
57
+ template <>
58
+ struct FloatVec<float> {
59
+ using Type = float;
60
+ };
61
+ template <>
62
+ struct FloatVec<float2> {
63
+ using Type = float2;
64
+ };
65
+ template <>
66
+ struct FloatVec<float4> {
67
+ using Type = float4;
68
+ };
69
+
70
+ // Vector addition.
71
+ inline __device__ float add(float a, float b) { return a + b; }
72
+
73
+ inline __device__ float2 add(float2 a, float2 b) {
74
+ float2 c;
75
+ c.x = add(a.x, b.x);
76
+ c.y = add(a.y, b.y);
77
+ return c;
78
+ }
79
+
80
+ inline __device__ float4 add(float4 a, float4 b) {
81
+ float4 c;
82
+ c.x = add(a.x, b.x);
83
+ c.y = add(a.y, b.y);
84
+ c.z = add(a.z, b.z);
85
+ c.w = add(a.w, b.w);
86
+ return c;
87
+ }
88
+
89
+ // Vector multiplication.
90
+ template <>
91
+ inline __device__ float mul<float, float>(float a, float b) {
92
+ return a * b;
93
+ }
94
+
95
+ template <>
96
+ inline __device__ float2 mul(float2 a, float2 b) {
97
+ float2 c;
98
+ c.x = a.x * b.x;
99
+ c.y = a.y * b.y;
100
+ return c;
101
+ }
102
+
103
+ template <>
104
+ inline __device__ float2 mul(float a, float2 b) {
105
+ float2 c;
106
+ c.x = a * b.x;
107
+ c.y = a * b.y;
108
+ return c;
109
+ }
110
+
111
+ template <>
112
+ inline __device__ float4 mul(float4 a, float4 b) {
113
+ float4 c;
114
+ c.x = a.x * b.x;
115
+ c.y = a.y * b.y;
116
+ c.z = a.z * b.z;
117
+ c.w = a.w * b.w;
118
+ return c;
119
+ }
120
+
121
+ template <>
122
+ inline __device__ float4 mul(float a, float4 b) {
123
+ float4 c;
124
+ c.x = a * b.x;
125
+ c.y = a * b.y;
126
+ c.z = a * b.z;
127
+ c.w = a * b.w;
128
+ return c;
129
+ }
130
+
131
+ // Vector fused multiply-add.
132
+ inline __device__ float fma(float a, float b, float c) { return a * b + c; }
133
+
134
+ inline __device__ float2 fma(float2 a, float2 b, float2 c) {
135
+ float2 d;
136
+ d.x = fma(a.x, b.x, c.x);
137
+ d.y = fma(a.y, b.y, c.y);
138
+ return d;
139
+ }
140
+
141
+ inline __device__ float2 fma(float a, float2 b, float2 c) {
142
+ float2 d;
143
+ d.x = fma(a, b.x, c.x);
144
+ d.y = fma(a, b.y, c.y);
145
+ return d;
146
+ }
147
+
148
+ inline __device__ float4 fma(float4 a, float4 b, float4 c) {
149
+ float4 d;
150
+ d.x = fma(a.x, b.x, c.x);
151
+ d.y = fma(a.y, b.y, c.y);
152
+ d.z = fma(a.z, b.z, c.z);
153
+ d.w = fma(a.w, b.w, c.w);
154
+ return d;
155
+ }
156
+
157
+ inline __device__ float4 fma(float a, float4 b, float4 c) {
158
+ float4 d;
159
+ d.x = fma(a, b.x, c.x);
160
+ d.y = fma(a, b.y, c.y);
161
+ d.z = fma(a, b.z, c.z);
162
+ d.w = fma(a, b.w, c.w);
163
+ return d;
164
+ }
165
+
166
+ inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) {
167
+ Float4_ d;
168
+ d.x = fma(a, b.x, c.x);
169
+ d.y = fma(a, b.y, c.y);
170
+ return d;
171
+ }
172
+
173
+ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
174
+ Float8_ d;
175
+ d.x = fma(a, b.x, c.x);
176
+ d.y = fma(a, b.y, c.y);
177
+ d.z = fma(a, b.z, c.z);
178
+ d.w = fma(a, b.w, c.w);
179
+ return d;
180
+ }
181
+
182
+ // Vector sum.
183
+ template <>
184
+ inline __device__ float sum(float v) {
185
+ return v;
186
+ }
187
+
188
+ template <>
189
+ inline __device__ float sum(float2 v) {
190
+ return v.x + v.y;
191
+ }
192
+
193
+ template <>
194
+ inline __device__ float sum(float4 v) {
195
+ return v.x + v.y + v.z + v.w;
196
+ }
197
+
198
+ template <>
199
+ inline __device__ float sum(Float4_ v) {
200
+ return v.x.x + v.x.y + v.y.x + v.y.y;
201
+ }
202
+
203
+ template <>
204
+ inline __device__ float sum(Float8_ v) {
205
+ return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
206
+ }
207
+
208
+ // Vector dot product.
209
+ inline __device__ float dot(float a, float b) { return a * b; }
210
+
211
+ inline __device__ float dot(float2 a, float2 b) {
212
+ float2 c = mul<float2, float2, float2>(a, b);
213
+ return c.x + c.y;
214
+ }
215
+
216
+ inline __device__ float dot(Float4_ a, Float4_ b) {
217
+ float2 acc = mul<float2, float2, float2>(a.x, b.x);
218
+ acc = fma(a.y, b.y, acc);
219
+ return acc.x + acc.y;
220
+ }
221
+
222
+ inline __device__ float dot(Float8_ a, Float8_ b) {
223
+ float2 acc = mul<float2, float2, float2>(a.x, b.x);
224
+ acc = fma(a.y, b.y, acc);
225
+ acc = fma(a.z, b.z, acc);
226
+ acc = fma(a.w, b.w, acc);
227
+ return acc.x + acc.y;
228
+ }
229
+
230
+ // From float to float.
231
+ inline __device__ void from_float(float& dst, float src) { dst = src; }
232
+
233
+ inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
234
+
235
+ inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
236
+
237
+ // From float to float.
238
+ inline __device__ float to_float(float u) { return u; }
239
+
240
+ inline __device__ float2 to_float(float2 u) { return u; }
241
+
242
+ inline __device__ float4 to_float(float4 u) { return u; }
243
+
244
+ inline __device__ Float4_ to_float(Float4_ u) { return u; }
245
+
246
+ inline __device__ Float8_ to_float(Float8_ u) { return u; }
247
+
248
+ // Zero-out a variable.
249
+ inline __device__ void zero(float& dst) { dst = 0.f; }
250
+
251
+ } // namespace vllm
attention/dtype_fp8.cuh ADDED
@@ -0,0 +1,41 @@
+ #pragma once
+
+ #include "attention_generic.cuh"
+
+ #include <stdint.h>
+ #ifdef ENABLE_FP8
+ #ifndef USE_ROCM
+ #include <cuda_fp8.h>
+ #endif // USE_ROCM
+ #endif // ENABLE_FP8
+
+ namespace vllm {
+
+ enum class Fp8KVCacheDataType {
+ kAuto = 0,
+ kFp8E4M3 = 1,
+ kFp8E5M2 = 2,
+ };
+
+ // fp8 vector types for quantization of kv cache
+ template <>
+ struct Vec<uint8_t, 1> {
+ using Type = uint8_t;
+ };
+
+ template <>
+ struct Vec<uint8_t, 2> {
+ using Type = uint16_t;
+ };
+
+ template <>
+ struct Vec<uint8_t, 4> {
+ using Type = uint32_t;
+ };
+
+ template <>
+ struct Vec<uint8_t, 8> {
+ using Type = uint2;
+ };
+
+ } // namespace vllm
build.toml CHANGED
@@ -1,112 +1,261 @@
1
  [general]
2
  name = "quantization"
 
3
 
4
  [torch]
 
5
  src = [
6
- "core/registration.h",
7
- "core/scalar_type.hpp",
8
- "torch-ext/torch_binding.cpp",
9
- "torch-ext/torch_binding.h"
10
  ]
11
- include = [ "." ]
12
 
13
- [kernel.cutlass_w8a8]
14
- cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
 
 
 
 
 
 
 
 
 
 
 
 
15
  src = [
16
- "core/math.hpp",
17
- "cutlass_w8a8/common.hpp",
18
- "cutlass_w8a8/scaled_mm_c2x.cu",
19
- "cutlass_w8a8/scaled_mm_c2x.cuh",
20
- "cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh",
21
- "cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh",
22
- "cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh",
23
- "cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh",
24
- "cutlass_w8a8/scaled_mm_entry.cu",
25
- "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp",
26
- "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp",
27
- ]
28
- include = [ "." ]
29
- depends = [ "cutlass_3_6", "torch" ]
 
 
 
 
 
 
30
 
31
- [kernel.cutlass_w8a8_hopper]
32
- cuda-capabilities = [ "9.0", "9.0a" ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  src = [
34
- "core/math.hpp",
35
- "cutlass_w8a8/common.hpp",
36
- "cutlass_w8a8/scaled_mm_c3x.cu",
37
- "cutlass_w8a8/scaled_mm_c3x.cuh",
38
- "cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh",
39
- "cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh",
40
- "cutlass_extensions/common.cpp",
41
- "cutlass_extensions/common.hpp",
42
- "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp",
43
- "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp",
44
- ]
45
- include = [ "." ]
46
- depends = [ "cutlass_3_6", "torch" ]
47
 
48
  [kernel.fp8_common]
49
- language = "cuda-hipify"
50
- cuda-capabilities = [ "7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
51
- rocm-archs = [ "gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100", "gfx1101" ]
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  src = [
53
- "fp8/amd/hip_float8.h",
54
- "fp8/amd/hip_float8_impl.h",
55
- "fp8/common.cu",
56
- "fp8/common.cuh",
57
- "dispatch_utils.h",
58
- "vectorization.cuh"
59
- ]
60
- include = [ "." ]
61
- depends = [ "torch" ]
62
 
63
- [kernel.fp8_marlin]
64
- cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
 
 
 
 
 
 
65
  src = [
66
- "fp8/fp8_marlin.cu",
67
- "gptq_marlin/marlin.cuh",
68
- "gptq_marlin/marlin_dtypes.cuh",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  ]
70
- depends = [ "torch" ]
71
 
72
- [kernel.int8_common]
73
- language = "cuda-hipify"
74
- cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
75
- rocm-archs = [ "gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100", "gfx1101" ]
 
 
 
 
 
 
 
 
 
 
76
  src = [
77
- "compressed_tensors/int8_quant_kernels.cu",
78
- "dispatch_utils.h"
 
 
 
 
 
 
79
  ]
80
- include = [ "." ]
81
- depends = [ "torch" ]
82
 
83
- [kernel.gptq_marlin]
84
- cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  src = [
86
- "core/scalar_type.hpp",
87
- "gptq_marlin/awq_marlin_repack.cu",
88
- "gptq_marlin/gptq_marlin.cu",
89
- "gptq_marlin/gptq_marlin_repack.cu",
90
- "gptq_marlin/marlin.cuh",
91
- "gptq_marlin/marlin_dtypes.cuh"
92
- ]
93
- include = [ "." ]
94
- depends = [ "torch" ]
 
 
95
 
96
  [kernel.marlin]
97
- cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0" ]
 
 
 
 
 
 
 
 
 
 
 
 
98
  src = [
99
- "core/scalar_type.hpp",
100
- "marlin/dense/common/base.h",
101
- "marlin/dense/common/mem.h",
102
- "marlin/dense/marlin_cuda_kernel.cu",
103
- "marlin/qqq/marlin_qqq_gemm_kernel.cu",
104
- "marlin/sparse/common/base.h",
105
- "marlin/sparse/common/mem.h",
106
- "marlin/sparse/common/mma.h",
107
- "marlin/sparse/marlin_24_cuda_kernel.cu"
108
- ]
109
- include = [ "." ]
110
- depends = [ "torch" ]
111
-
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  [general]
2
  name = "quantization"
3
+ universal = false
4
 
5
  [torch]
6
+ include = ["."]
7
  src = [
8
+ "core/scalar_type.hpp",
9
+ "torch-ext/torch_binding.cpp",
10
+ "torch-ext/torch_binding.h",
 
11
  ]
 
12
 
13
+ [kernel.gptq_marlin]
14
+ backend = "cuda"
15
+ cuda-capabilities = [
16
+ "8.0",
17
+ "8.6",
18
+ "8.7",
19
+ "8.9",
20
+ "9.0",
21
+ "10.0",
22
+ "10.1",
23
+ "12.0",
24
+ ]
25
+ depends = ["torch"]
26
+ include = ["."]
27
  src = [
28
+ "core/scalar_type.hpp",
29
+ "gptq_marlin/awq_marlin_repack.cu",
30
+ "gptq_marlin/dequant.h",
31
+ "gptq_marlin/gptq_marlin.cu",
32
+ "gptq_marlin/gptq_marlin_repack.cu",
33
+ "gptq_marlin/kernel.h",
34
+ "gptq_marlin/kernel_bf16_kfe2m1f.cu",
35
+ "gptq_marlin/kernel_bf16_kfe4m3fn.cu",
36
+ "gptq_marlin/kernel_bf16_ku4.cu",
37
+ "gptq_marlin/kernel_bf16_ku4b8.cu",
38
+ "gptq_marlin/kernel_bf16_ku8b128.cu",
39
+ "gptq_marlin/kernel_fp16_kfe2m1f.cu",
40
+ "gptq_marlin/kernel_fp16_kfe4m3fn.cu",
41
+ "gptq_marlin/kernel_fp16_ku4.cu",
42
+ "gptq_marlin/kernel_fp16_ku4b8.cu",
43
+ "gptq_marlin/kernel_fp16_ku8b128.cu",
44
+ "gptq_marlin/marlin.cuh",
45
+ "gptq_marlin/marlin_dtypes.cuh",
46
+ "gptq_marlin/marlin_template.h",
47
+ ]
48
 
49
+ [kernel.fp8_common_rocm]
50
+ backend = "rocm"
51
+ depends = ["torch"]
52
+ rocm-archs = [
53
+ "gfx906",
54
+ "gfx908",
55
+ "gfx90a",
56
+ "gfx940",
57
+ "gfx941",
58
+ "gfx942",
59
+ "gfx1030",
60
+ "gfx1100",
61
+ "gfx1101",
62
+ ]
63
+ include = ["."]
64
+ src = [
65
+ "attention/attention_dtypes.h",
66
+ "attention/attention_generic.cuh",
67
+ "attention/dtype_bfloat16.cuh",
68
+ "attention/dtype_float16.cuh",
69
+ "attention/dtype_float32.cuh",
70
+ "attention/dtype_fp8.cuh",
71
+ "fp8/amd/quant_utils.cuh",
72
+ "fp8/common.cu",
73
+ "fp8/common.cuh",
74
+ "dispatch_utils.h",
75
+ "utils.cuh",
76
+ "vectorization.cuh",
77
+ ]
78
+
79
+ [kernel.int8_common]
80
+ backend = "cuda"
81
+ cuda-capabilities = [
82
+ "7.0",
83
+ "7.2",
84
+ "7.5",
85
+ "8.0",
86
+ "8.6",
87
+ "8.7",
88
+ "8.9",
89
+ "9.0",
90
+ "10.0",
91
+ "10.1",
92
+ "12.0",
93
+ ]
94
+ depends = ["torch"]
95
+ include = ["."]
96
  src = [
97
+ "compressed_tensors/int8_quant_kernels.cu",
98
+ "dispatch_utils.h",
99
+ "vectorization_utils.cuh",
100
+ ]
 
 
 
 
 
 
 
 
 
101
 
102
  [kernel.fp8_common]
103
+ backend = "cuda"
104
+ cuda-capabilities = [
105
+ "7.0",
106
+ "7.2",
107
+ "7.5",
108
+ "8.0",
109
+ "8.6",
110
+ "8.7",
111
+ "8.9",
112
+ "9.0",
113
+ "10.0",
114
+ "10.1",
115
+ "12.0",
116
+ ]
117
+ depends = ["torch"]
118
+ include = ["."]
119
  src = [
120
+ "fp8/common.cu",
121
+ "fp8/common.cuh",
122
+ "dispatch_utils.h",
123
+ "utils.cuh",
124
+ "vectorization.cuh",
125
+ ]
 
 
 
126
 
127
+ [kernel.cutlass_w8a8_hopper]
128
+ backend = "cuda"
129
+ cuda-capabilities = ["9.0a"]
130
+ depends = [
131
+ "cutlass_3_9",
132
+ "torch",
133
+ ]
134
+ include = ["."]
135
  src = [
136
+ "cuda_utils.h",
137
+ "core/math.hpp",
138
+ "cutlass_w8a8/c3x/cutlass_gemm_caller.cuh",
139
+ "cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu",
140
+ "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu",
141
+ "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh",
142
+ "cutlass_w8a8/c3x/scaled_mm.cuh",
143
+ "cutlass_w8a8/c3x/scaled_mm_kernels.hpp",
144
+ "cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu",
145
+ "cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh",
146
+ "cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu",
147
+ "cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh",
148
+ "cutlass_w8a8/c3x/scaled_mm_helper.hpp",
149
+ "cutlass_w8a8/scaled_mm_c3x_sm90.cu",
150
+ "cutlass_extensions/common.cpp",
151
+ "cutlass_extensions/common.hpp",
152
+ "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp",
153
+ "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp",
154
+ "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp",
155
+ "cutlass_extensions/gemm/dispatch_policy.hpp",
156
+ "cutlass_extensions/gemm/collective/collective_builder.hpp",
157
+ "cutlass_extensions/gemm/collective/fp8_accumulation.hpp",
158
+ "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp",
159
  ]
 
160
 
161
+
162
+
163
+ [kernel.cutlass_w8a8_blackwell]
164
+ backend = "cuda"
165
+ cuda-capabilities = [
166
+ "10.0a",
167
+ "10.1a",
168
+ "12.0a",
169
+ ]
170
+ depends = [
171
+ "cutlass_3_9",
172
+ "torch",
173
+ ]
174
+ include = ["."]
175
  src = [
176
+ "cuda_utils.h",
177
+ "cutlass_w8a8/scaled_mm_c3x_sm100.cu",
178
+ "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu",
179
+ "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh",
180
+ "cutlass_w8a8/c3x/scaled_mm_helper.hpp",
181
+ "cutlass_w8a8/c3x/scaled_mm_kernels.hpp",
182
+ "cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu",
183
+ "cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh",
184
  ]
 
 
185
 
186
+ [kernel.cutlass_w8a8]
187
+ backend = "cuda"
188
+ cuda-capabilities = [
189
+ "7.5",
190
+ "8.0",
191
+ "8.6",
192
+ "8.7",
193
+ "8.9",
194
+ "9.0",
195
+ "10.0",
196
+ "10.1",
197
+ "12.0",
198
+ ]
199
+ depends = [
200
+ "cutlass_3_9",
201
+ "torch",
202
+ ]
203
+ include = ["."]
204
  src = [
205
+ "core/math.hpp",
206
+ "cutlass_w8a8/scaled_mm_c2x.cu",
207
+ "cutlass_w8a8/scaled_mm_c2x.cuh",
208
+ "cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh",
209
+ "cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh",
210
+ "cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh",
211
+ "cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh",
212
+ "cutlass_w8a8/scaled_mm_entry.cu",
213
+ "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp",
214
+ "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp",
215
+ ]
216
 
217
  [kernel.marlin]
218
+ backend = "cuda"
219
+ cuda-capabilities = [
220
+ "8.0",
221
+ "8.6",
222
+ "8.7",
223
+ "8.9",
224
+ "9.0",
225
+ "10.0",
226
+ "10.1",
227
+ "12.0",
228
+ ]
229
+ depends = ["torch"]
230
+ include = ["."]
231
  src = [
232
+ "core/scalar_type.hpp",
233
+ "marlin/dense/common/base.h",
234
+ "marlin/dense/common/mem.h",
235
+ "marlin/dense/marlin_cuda_kernel.cu",
236
+ "marlin/qqq/marlin_qqq_gemm_kernel.cu",
237
+ "marlin/sparse/common/base.h",
238
+ "marlin/sparse/common/mem.h",
239
+ "marlin/sparse/common/mma.h",
240
+ "marlin/sparse/marlin_24_cuda_kernel.cu",
241
+ ]
 
 
 
242
 
243
+ [kernel.int8_common_rocm]
244
+ backend = "rocm"
245
+ depends = ["torch"]
246
+ rocm-archs = [
247
+ "gfx906",
248
+ "gfx908",
249
+ "gfx90a",
250
+ "gfx940",
251
+ "gfx941",
252
+ "gfx942",
253
+ "gfx1030",
254
+ "gfx1100",
255
+ "gfx1101",
256
+ ]
257
+ include = ["."]
258
+ src = [
259
+ "compressed_tensors/int8_quant_kernels.cu",
260
+ "dispatch_utils.h",
261
+ ]
compressed_tensors/int8_quant_kernels.cu CHANGED
@@ -1,15 +1,17 @@
1
  #include <ATen/cuda/CUDAContext.h>
2
  #include <torch/all.h>
 
3
  #include <cmath>
4
 
5
- #include "dispatch_utils.h"
 
6
 
7
  #ifndef USE_ROCM
8
- #include <cub/util_type.cuh>
9
  #include <cub/cub.cuh>
 
10
  #else
11
- #include <hipcub/util_type.hpp>
12
  #include <hipcub/hipcub.hpp>
 
13
  #endif
14
 
15
  static inline __device__ int8_t float_to_int8_rn(float x) {
@@ -26,7 +28,13 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
26
  float dst = std::nearbyint(x);
27
 
28
  // saturate
29
- dst = std::clamp(dst, i8_min, i8_max);
 
 
 
 
 
 
30
  return static_cast<int8_t>(dst);
31
  #else
32
  // CUDA path
@@ -79,7 +87,13 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
79
  static_cast<int32_t>(std::numeric_limits<int8_t>::max());
80
 
81
  // saturate
82
- int32_t dst = std::clamp(x, i8_min, i8_max);
 
 
 
 
 
 
83
  return static_cast<int8_t>(dst);
84
  #else
85
  // CUDA path
@@ -91,134 +105,170 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
91
 
92
  namespace vllm {
93
 
94
- template <typename scalar_t, typename scale_type>
95
  __global__ void static_scaled_int8_quant_kernel(
96
- scalar_t const* __restrict__ input, int8_t* __restrict__ out,
97
- scale_type const* scale_ptr, const int hidden_size) {
98
- int const tid = threadIdx.x;
99
- int64_t const token_idx = blockIdx.x;
100
- scale_type const scale = *scale_ptr;
 
101
 
102
  // Must be performed using 64-bit math to avoid integer overflow.
103
- out += token_idx * hidden_size;
104
- input += token_idx * hidden_size;
105
 
106
- for (int i = tid; i < hidden_size; i += blockDim.x) {
107
- out[i] = float_to_int8_rn(static_cast<float>(input[i]) / scale);
108
- }
 
 
109
  }
110
 
111
- template <typename scalar_t, typename scale_type, typename azp_type>
112
  __global__ void static_scaled_int8_azp_quant_kernel(
113
- scalar_t const* __restrict__ input, int8_t* __restrict__ out,
114
- scale_type const* scale_ptr, azp_type const* azp_ptr,
115
- const int hidden_size) {
116
- int const tid = threadIdx.x;
117
- int64_t const token_idx = blockIdx.x;
118
- scale_type const scale = *scale_ptr;
119
- azp_type const azp = *azp_ptr;
 
120
 
121
  // Must be performed using 64-bit math to avoid integer overflow.
122
- out += token_idx * hidden_size;
123
- input += token_idx * hidden_size;
124
-
125
- for (int i = tid; i < hidden_size; i += blockDim.x) {
126
- auto const val = static_cast<float>(input[i]);
127
- auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
128
- out[i] = quant_val;
129
- }
 
130
  }
131
 
132
- template <typename scalar_t, typename scale_type>
133
  __global__ void dynamic_scaled_int8_quant_kernel(
134
- scalar_t const* __restrict__ input, int8_t* __restrict__ out,
135
- scale_type* scale, const int hidden_size) {
136
- int const tid = threadIdx.x;
137
- int64_t const token_idx = blockIdx.x;
138
- float absmax_val = 0.0f;
139
- float const zero = 0.0f;
140
 
141
  // Must be performed using 64-bit math to avoid integer overflow.
142
- out += token_idx * hidden_size;
143
- input += token_idx * hidden_size;
144
-
145
- for (int i = tid; i < hidden_size; i += blockDim.x) {
146
- float val = static_cast<float>(input[i]);
147
- val = val > zero ? val : -val;
148
- absmax_val = val > absmax_val ? val : absmax_val;
 
149
  }
150
-
151
- using BlockReduce = cub::BlockReduce<float, 1024>;
152
- __shared__ typename BlockReduce::TempStorage reduceStorage;
153
- float const block_absmax_val_maybe =
154
- BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
155
- __shared__ float block_absmax_val;
156
  if (tid == 0) {
157
- block_absmax_val = block_absmax_val_maybe;
158
- scale[token_idx] = block_absmax_val / 127.0f;
159
  }
160
  __syncthreads();
161
 
162
- float const tmp_scale = 127.0f / block_absmax_val;
163
- for (int i = tid; i < hidden_size; i += blockDim.x) {
164
- out[i] = float_to_int8_rn(static_cast<float>(input[i]) * tmp_scale);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  }
 
 
 
 
 
 
 
166
  }
167
 
168
- template <typename scalar_t, typename scale_type, typename azp_type>
169
  __global__ void dynamic_scaled_int8_azp_quant_kernel(
170
- scalar_t const* __restrict__ input, int8_t* __restrict__ out,
171
- scale_type* scale, azp_type* azp, const int hidden_size) {
172
- int64_t const token_idx = blockIdx.x;
 
 
173
 
174
  // Must be performed using 64-bit math to avoid integer overflow.
175
- out += token_idx * hidden_size;
176
- input += token_idx * hidden_size;
177
-
178
- // Scan for the min and max value for this token
179
- float max_val = std::numeric_limits<float>::min();
180
- float min_val = std::numeric_limits<float>::max();
181
- for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
182
- auto val = static_cast<float>(input[i]);
183
- max_val = std::max(max_val, val);
184
- min_val = std::min(min_val, val);
185
- }
186
 
187
- // Reduce the max and min values across the block
188
- using BlockReduce = cub::BlockReduce<float, 1024>;
189
- __shared__ typename BlockReduce::TempStorage reduceStorage;
190
- max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
191
- __syncthreads(); // Make sure min doesn't mess with max shared memory
192
- min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
193
-
194
- __shared__ scale_type scale_sh;
195
- __shared__ azp_type azp_sh;
196
-
197
- // Compute the scale and zero point and store them, only on the first thread
198
- if (threadIdx.x == 0) {
199
- float const scale_val = (max_val - min_val) / 255.0f;
200
- // Use rounding to even (same as torch.round)
201
- auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
202
- auto const azp_val = static_cast<azp_type>(azp_float);
203
-
204
- // Store the scale and azp into shared and global
205
- scale[token_idx] = scale_sh = scale_val;
206
- azp[token_idx] = azp_sh = azp_val;
207
  }
208
 
209
- // Wait for the scale and azp to be computed
210
- __syncthreads();
211
 
212
- float const scale_val = scale_sh;
213
- azp_type const azp_val = azp_sh;
 
 
 
 
 
214
 
215
- // Quantize the values
216
- for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
217
- auto const val = static_cast<float>(input[i]);
218
- auto const quant_val =
219
- int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
220
- out[i] = quant_val;
 
 
 
221
  }
 
 
 
 
 
 
 
 
 
 
 
 
222
  }
223
 
224
  } // namespace vllm
@@ -235,7 +285,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
235
  int const hidden_size = input.size(-1);
236
  int const num_tokens = input.numel() / hidden_size;
237
  dim3 const grid(num_tokens);
238
- dim3 const block(std::min(hidden_size, 1024));
239
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
240
  VLLM_DISPATCH_FLOATING_TYPES(
241
  input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@@ -266,7 +316,7 @@ void dynamic_scaled_int8_quant(
266
  int const hidden_size = input.size(-1);
267
  int const num_tokens = input.numel() / hidden_size;
268
  dim3 const grid(num_tokens);
269
- dim3 const block(std::min(hidden_size, 1024));
270
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
271
  VLLM_DISPATCH_FLOATING_TYPES(
272
  input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
 
1
  #include <ATen/cuda/CUDAContext.h>
2
  #include <torch/all.h>
3
+
4
  #include <cmath>
5
 
6
+ #include "../dispatch_utils.h"
7
+ #include "../vectorization_utils.cuh"
8
 
9
  #ifndef USE_ROCM
 
10
  #include <cub/cub.cuh>
11
+ #include <cub/util_type.cuh>
12
  #else
 
13
  #include <hipcub/hipcub.hpp>
14
+ #include <hipcub/util_type.hpp>
15
  #endif
16
 
17
  static inline __device__ int8_t float_to_int8_rn(float x) {
 
28
  float dst = std::nearbyint(x);
29
 
30
  // saturate
31
+
32
+ // See https://github.com/pytorch/pytorch/issues/127666
33
+ // See https://github.com/llvm/llvm-project/issues/95183
34
+ // hip-clang's std::clamp calls a __glibcxx_assert_fail host function when
35
+ // building on Arch/gcc14. The following replaces std::clamp with equivalent logic.
36
+ // dst = std::clamp(dst, i8_min, i8_max);
37
+ dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst;
38
  return static_cast<int8_t>(dst);
39
  #else
40
  // CUDA path
 
87
  static_cast<int32_t>(std::numeric_limits<int8_t>::max());
88
 
89
  // saturate
90
+
91
+ // See https://github.com/pytorch/pytorch/issues/127666
92
+ // See https://github.com/llvm/llvm-project/issues/95183
93
+ // hip-clang std::clamp __glibcxx_assert_fail host function when building on
94
+ // Arch/gcc14. The following replaces std::clamp usage with similar logic
95
+ // int32_t dst = std::clamp(x, i8_min, i8_max);
96
+ int32_t dst = (x < i8_min) ? i8_min : (x > i8_max) ? i8_max : x;
97
  return static_cast<int8_t>(dst);
98
  #else
99
  // CUDA path
 
105
 
106
  namespace vllm {
107
 
108
+ template <typename scalar_t, typename scale_t>
109
  __global__ void static_scaled_int8_quant_kernel(
110
+ const scalar_t* __restrict__ input, int8_t* __restrict__ output,
111
+ const scale_t* scale_ptr, const int hidden_size) {
112
+ const int tid = threadIdx.x;
113
+ const int stride = blockDim.x;
114
+ const int64_t token_idx = blockIdx.x;
115
+ const float scale = *scale_ptr;
116
 
117
  // Must be performed using 64-bit math to avoid integer overflow.
118
+ const scalar_t* row_in = input + token_idx * hidden_size;
119
+ int8_t* row_out = output + token_idx * hidden_size;
120
 
121
+ vectorize_with_alignment<16>(
122
+ row_in, row_out, hidden_size, tid, stride,
123
+ [=] __device__(int8_t& dst, const scalar_t& src) {
124
+ dst = float_to_int8_rn(static_cast<float>(src) / scale);
125
+ });
126
  }
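For reference, a hedged scalar-equivalent of the kernel above, ignoring the 16-byte vectorized fast path that `vectorize_with_alignment` presumably provides (its implementation lives in `vectorization_utils.cuh` and is not shown in this diff); `float_to_int8_rn` is the helper defined earlier in this file:

```cpp
// Scalar-equivalent sketch of static_scaled_int8_quant_kernel (illustrative).
template <typename scalar_t>
__global__ void static_scaled_int8_quant_scalar_ref(
    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
    const float* scale_ptr, const int hidden_size) {
  const float scale = *scale_ptr;
  const int64_t token_idx = blockIdx.x;  // one block per token
  const scalar_t* row_in = input + token_idx * hidden_size;
  int8_t* row_out = output + token_idx * hidden_size;
  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    row_out[i] = float_to_int8_rn(static_cast<float>(row_in[i]) / scale);
  }
}
```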
127
 
128
+ template <typename scalar_t, typename scale_t, typename azp_t>
129
  __global__ void static_scaled_int8_azp_quant_kernel(
130
+ const scalar_t* __restrict__ input, int8_t* __restrict__ output,
131
+ const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) {
132
+ const int tid = threadIdx.x;
133
+ const int stride = blockDim.x;
134
+ const int64_t token_idx = blockIdx.x;
135
+ const float scale = *scale_ptr;
136
+ const azp_t azp = *azp_ptr;
137
+ const float inv_s = 1.0f / scale;
138
 
139
  // Must be performed using 64-bit math to avoid integer overflow.
140
+ const scalar_t* row_in = input + token_idx * hidden_size;
141
+ int8_t* row_out = output + token_idx * hidden_size;
142
+
143
+ vectorize_with_alignment<16>(
144
+ row_in, row_out, hidden_size, tid, stride,
145
+ [=] __device__(int8_t& dst, const scalar_t& src) {
146
+ const auto v = static_cast<float>(src) * inv_s;
147
+ dst = int32_to_int8(float_to_int32_rn(v) + azp);
148
+ });
149
  }
150
 
151
+ template <typename scalar_t, typename scale_t>
152
  __global__ void dynamic_scaled_int8_quant_kernel(
153
+ const scalar_t* __restrict__ input, int8_t* __restrict__ output,
154
+ scale_t* scale_out, const int hidden_size) {
155
+ const int tid = threadIdx.x;
156
+ const int stride = blockDim.x;
157
+ const int64_t token_idx = blockIdx.x;
 
158
 
159
  // Must be performed using 64-bit math to avoid integer overflow.
160
+ const scalar_t* row_in = input + token_idx * hidden_size;
161
+ int8_t* row_out = output + token_idx * hidden_size;
162
+
163
+ // calculate for absmax
164
+ float thread_max = 0.f;
165
+ for (int i = tid; i < hidden_size; i += stride) {
166
+ const auto v = fabsf(static_cast<float>(row_in[i]));
167
+ thread_max = fmaxf(thread_max, v);
168
  }
169
+ using BlockReduce = cub::BlockReduce<float, 256>;
170
+ __shared__ typename BlockReduce::TempStorage tmp;
171
+ float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
172
+ __shared__ float absmax;
 
 
173
  if (tid == 0) {
174
+ absmax = block_max;
175
+ scale_out[blockIdx.x] = absmax / 127.f;
176
  }
177
  __syncthreads();
178
 
179
+ float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
180
+
181
+ // 2. quantize
182
+ vectorize_with_alignment<16>(
183
+ row_in, row_out, hidden_size, tid, stride,
184
+ [=] __device__(int8_t& dst, const scalar_t& src) {
185
+ dst = float_to_int8_rn(static_cast<float>(src) * inv_s);
186
+ });
187
+ }
188
+
189
+ // MinMax structure to hold min and max values in one go
190
+ struct MinMax {
191
+ float min, max;
192
+
193
+ __host__ __device__ MinMax()
194
+ : min(std::numeric_limits<float>::max()),
195
+ max(std::numeric_limits<float>::lowest()) {}
196
+
197
+ __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
198
+
199
+ // add a value to the MinMax
200
+ __host__ __device__ MinMax& operator+=(float v) {
201
+ min = fminf(min, v);
202
+ max = fmaxf(max, v);
203
+ return *this;
204
+ }
205
+
206
+ // merge two MinMax objects
207
+ __host__ __device__ MinMax& operator&=(const MinMax& other) {
208
+ min = fminf(min, other.min);
209
+ max = fmaxf(max, other.max);
210
+ return *this;
211
  }
212
+ };
213
+
214
+ __host__ __device__ inline MinMax operator+(MinMax a, float v) {
215
+ return a += v;
216
+ }
217
+ __host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) {
218
+ return a &= b;
219
  }
220
 
221
+ template <typename scalar_t, typename scale_t, typename azp_t>
222
  __global__ void dynamic_scaled_int8_azp_quant_kernel(
223
+ const scalar_t* __restrict__ input, int8_t* __restrict__ output,
224
+ scale_t* scale_out, azp_t* azp_out, const int hidden_size) {
225
+ const int tid = threadIdx.x;
226
+ const int stride = blockDim.x;
227
+ const int64_t token_idx = blockIdx.x;
228
 
229
  // Must be performed using 64-bit math to avoid integer overflow.
230
+ const scalar_t* row_in = input + token_idx * hidden_size;
231
+ int8_t* row_out = output + token_idx * hidden_size;
 
 
 
 
 
 
 
 
 
232
 
233
+ // 1. calculate min & max
234
+ MinMax thread_mm;
235
+ for (int i = tid; i < hidden_size; i += stride) {
236
+ thread_mm += static_cast<float>(row_in[i]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  }
238
 
239
+ using BlockReduce = cub::BlockReduce<MinMax, 256>;
240
+ __shared__ typename BlockReduce::TempStorage tmp;
241
 
242
+ MinMax mm = BlockReduce(tmp).Reduce(
243
+ thread_mm,
244
+ [] __device__(MinMax a, const MinMax& b) {
245
+ a &= b;
246
+ return a;
247
+ },
248
+ blockDim.x);
249
 
250
+ __shared__ float scale_sh;
251
+ __shared__ azp_t azp_sh;
252
+ if (tid == 0) {
253
+ float s = (mm.max - mm.min) / 255.f;
254
+ float zp = nearbyintf(-128.f - mm.min / s); // round-to-even
255
+ scale_sh = s;
256
+ azp_sh = azp_t(zp);
257
+ scale_out[blockIdx.x] = s;
258
+ azp_out[blockIdx.x] = azp_sh;
259
  }
260
+ __syncthreads();
261
+
262
+ const float inv_s = 1.f / scale_sh;
263
+ const azp_t azp = azp_sh;
264
+
265
+ // 2. quantize
266
+ vectorize_with_alignment<16>(
267
+ row_in, row_out, hidden_size, tid, stride,
268
+ [=] __device__(int8_t& dst, const scalar_t& src) {
269
+ const auto v = static_cast<float>(src) * inv_s;
270
+ dst = int32_to_int8(float_to_int32_rn(v) + azp);
271
+ });
272
  }
273
 
274
  } // namespace vllm
 
285
  int const hidden_size = input.size(-1);
286
  int const num_tokens = input.numel() / hidden_size;
287
  dim3 const grid(num_tokens);
288
+ dim3 const block(std::min(hidden_size, 256));
289
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
290
  VLLM_DISPATCH_FLOATING_TYPES(
291
  input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
 
316
  int const hidden_size = input.size(-1);
317
  int const num_tokens = input.numel() / hidden_size;
318
  dim3 const grid(num_tokens);
319
+ dim3 const block(std::min(hidden_size, 256));
320
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
321
  VLLM_DISPATCH_FLOATING_TYPES(
322
  input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
core/math.hpp CHANGED
@@ -1,7 +1,28 @@
 
 
1
  #include <climits>
2
  #include <iostream>
3
 
4
- inline uint32_t next_pow_2(uint32_t const num) {
5
  if (num <= 1) return num;
6
  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
7
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
  #include <climits>
4
  #include <iostream>
5
 
6
+ inline constexpr uint32_t next_pow_2(uint32_t const num) {
7
  if (num <= 1) return num;
8
  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
9
+ }
10
+
11
+ template <typename A, typename B>
12
+ static inline constexpr auto div_ceil(A a, B b) {
13
+ return (a + b - 1) / b;
14
+ }
15
+
16
+ // Round a down to the previous multiple of b. The caller is responsible for making
17
+ // sure that b is non-zero
18
+ template <typename T>
19
+ inline constexpr T round_to_previous_multiple_of(T a, T b) {
20
+ return a % b == 0 ? a : (a / b) * b;
21
+ }
22
+
23
+ // Round a up to the next multiple of b. The caller is responsible for making
24
+ // sure that b is non-zero
25
+ template <typename T>
26
+ inline constexpr T round_to_next_multiple_of(T a, T b) {
27
+ return a % b == 0 ? a : ((a / b) + 1) * b;
28
+ }
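A brief usage sketch of the helpers added above (assuming `core/math.hpp` is on the include path; values are illustrative):

```cpp
#include <cassert>

#include "core/math.hpp"

int main() {
  static_assert(div_ceil(10, 4) == 3, "ceil(10 / 4)");
  assert(next_pow_2(33u) == 64u);
  assert(round_to_previous_multiple_of(37, 8) == 32);
  assert(round_to_next_multiple_of(37, 8) == 40);
  assert(round_to_next_multiple_of(40, 8) == 40);  // already a multiple
}
```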
core/registration.h DELETED
@@ -1,27 +0,0 @@
1
- #pragma once
2
-
3
- #include <Python.h>
4
-
5
- #define _CONCAT(A, B) A##B
6
- #define CONCAT(A, B) _CONCAT(A, B)
7
-
8
- #define _STRINGIFY(A) #A
9
- #define STRINGIFY(A) _STRINGIFY(A)
10
-
11
- // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
12
- // could be a macro instead of a literal token.
13
- #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
14
-
15
- // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
16
- // could be a macro instead of a literal token.
17
- #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
18
- TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
19
-
20
- // REGISTER_EXTENSION allows the shared library to be loaded and initialized
21
- // via python's import statement.
22
- #define REGISTER_EXTENSION(NAME) \
23
- PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \
24
- static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \
25
- STRINGIFY(NAME), nullptr, 0, nullptr}; \
26
- return PyModule_Create(&module); \
27
- }
core/scalar_type.hpp CHANGED
@@ -32,7 +32,7 @@ class ScalarType {
32
  signed_(signed_),
33
  bias(bias),
34
  finite_values_only(finite_values_only),
35
- nan_repr(nan_repr){};
36
 
37
  static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
38
  return ScalarType(0, size_bits - 1, true, bias);
@@ -315,6 +315,8 @@ static inline constexpr auto kS8 = ScalarType::int_(8);
315
  static inline constexpr auto kU8 = ScalarType::uint(8);
316
  static inline constexpr auto kU8B128 = ScalarType::uint(8, 128);
317
 
 
 
318
  static inline constexpr auto kFE3M2f =
319
  ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
320
  static inline constexpr auto kFE4M3fn =
@@ -332,6 +334,7 @@ static inline constexpr auto kInt8 = kS8;
332
  static inline constexpr auto kUint8 = kU8;
333
  static inline constexpr auto kUint8b128 = kU8B128;
334
 
 
335
  static inline constexpr auto kFloat6_e3m2f = kFE3M2f;
336
  static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn;
337
  static inline constexpr auto kFloat8_e5m2 = kFE5M2;
 
32
  signed_(signed_),
33
  bias(bias),
34
  finite_values_only(finite_values_only),
35
+ nan_repr(nan_repr) {};
36
 
37
  static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
38
  return ScalarType(0, size_bits - 1, true, bias);
 
315
  static inline constexpr auto kU8 = ScalarType::uint(8);
316
  static inline constexpr auto kU8B128 = ScalarType::uint(8, 128);
317
 
318
+ static inline constexpr auto kFE2M1f =
319
+ ScalarType::float_(2, 1, true, ScalarType::NAN_NONE);
320
  static inline constexpr auto kFE3M2f =
321
  ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
322
  static inline constexpr auto kFE4M3fn =
 
334
  static inline constexpr auto kUint8 = kU8;
335
  static inline constexpr auto kUint8b128 = kU8B128;
336
 
337
+ static inline constexpr auto kFloat4_e2m1f = kFE2M1f;
338
  static inline constexpr auto kFloat6_e3m2f = kFE3M2f;
339
  static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn;
340
  static inline constexpr auto kFloat8_e5m2 = kFE5M2;
cutlass_extensions/common.hpp CHANGED
@@ -15,21 +15,48 @@
15
  cutlassGetStatusString(error)); \
16
  }
17
 
18
- /**
19
- * Panic wrapper for unwinding CUDA runtime errors
20
- */
21
- #define CUDA_CHECK(status) \
22
- { \
23
- cudaError_t error = status; \
24
- TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
25
- }
26
-
27
  inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
28
  int max_shared_mem_per_block_opt_in = 0;
29
  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
30
- cudaDevAttrMaxSharedMemoryPerBlockOptin,
31
- device);
32
  return max_shared_mem_per_block_opt_in;
33
  }
34
 
35
  int32_t get_sm_version_num();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  cutlassGetStatusString(error)); \
16
  }
17
 
 
 
 
 
 
 
 
 
 
18
  inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
19
  int max_shared_mem_per_block_opt_in = 0;
20
  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
21
+ cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
 
22
  return max_shared_mem_per_block_opt_in;
23
  }
24
 
25
  int32_t get_sm_version_num();
26
+
27
+ /**
28
+ * A wrapper for a kernel that is used to guard against compilation on
29
+ * architectures that will never use the kernel. The purpose of this is to
30
+ * reduce the size of the compiled binary.
31
+ * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
32
+ * into code that will be executed on the device where it is defined.
33
+ */
34
+ template <typename Kernel>
35
+ struct enable_sm90_or_later : Kernel {
36
+ template <typename... Args>
37
+ CUTLASS_DEVICE void operator()(Args&&... args) {
38
+ #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
39
+ Kernel::operator()(std::forward<Args>(args)...);
40
+ #endif
41
+ }
42
+ };
43
+
44
+ template <typename Kernel>
45
+ struct enable_sm90_only : Kernel {
46
+ template <typename... Args>
47
+ CUTLASS_DEVICE void operator()(Args&&... args) {
48
+ #if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
49
+ Kernel::operator()(std::forward<Args>(args)...);
50
+ #endif
51
+ }
52
+ };
53
+
54
+ template <typename Kernel>
55
+ struct enable_sm100_only : Kernel {
56
+ template <typename... Args>
57
+ CUTLASS_DEVICE void operator()(Args&&... args) {
58
+ #if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
59
+ Kernel::operator()(std::forward<Args>(args)...);
60
+ #endif
61
+ }
62
+ };
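The wrappers above keep device code out of cubins for architectures that will never run it. A standalone sketch of the same pattern in plain CUDA (the `Body` functor and the SM80 threshold are hypothetical; the real code wraps CUTLASS `GemmUniversal` kernels as shown above):

```cpp
#include <cstdio>
#include <utility>

#include <cuda_runtime.h>

struct Body {
  __device__ void operator()(int x) const { printf("guarded body: %d\n", x); }
};

template <typename Kernel>
struct enable_sm80_or_later_demo : Kernel {
  template <typename... Args>
  __device__ void operator()(Args&&... args) const {
    // Device passes for older archs compile this to an empty body, so the
    // wrapped kernel is only instantiated where it can actually run.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    Kernel::operator()(std::forward<Args>(args)...);
#endif
  }
};

__global__ void demo_kernel(int x) { enable_sm80_or_later_demo<Body>{}(x); }

int main() {
  demo_kernel<<<1, 1>>>(42);
  cudaDeviceSynchronize();
}
```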
cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp CHANGED
@@ -122,8 +122,8 @@ struct ScaledEpilogue
122
  auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
123
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
124
 
125
- typename EVTCompute0::Arguments evt0_args{b_args};
126
- return ArgumentType{a_args, evt0_args};
127
  }
128
  };
129
 
@@ -167,8 +167,8 @@ struct ScaledEpilogueBias
167
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
168
  auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
169
 
170
- typename EVTCompute0::Arguments evt0_args{b_args};
171
- return ArgumentType{a_args, evt0_args, bias_args};
172
  }
173
  };
174
 
@@ -230,9 +230,10 @@ struct ScaledEpilogueBiasAzp
230
  auto azp_adj_args =
231
  SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
232
 
233
- typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
234
- typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
235
- return ArgumentType{a_args, evt_scale_b_args, bias_args};
 
236
  }
237
  };
238
 
@@ -309,11 +310,12 @@ struct ScaledEpilogueBiasAzpToken
309
  auto azp_adj_args =
310
  SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
311
 
312
- typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
313
- typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
314
- typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
315
- return ArgumentType{a_args, evt_scale_b_args, bias_args};
 
316
  }
317
  };
318
 
319
- }; // namespace vllm::c2x
 
122
  auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
123
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
124
 
125
+ typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
126
+ return ArgumentType{a_args, evt0_args, {}};
127
  }
128
  };
129
 
 
167
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
168
  auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
169
 
170
+ typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
171
+ return ArgumentType{a_args, evt0_args, bias_args, {}};
172
  }
173
  };
174
 
 
230
  auto azp_adj_args =
231
  SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
232
 
233
+ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
234
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{
235
+ b_args, evt_azp_args, {}};
236
+ return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
237
  }
238
  };
239
 
 
310
  auto azp_adj_args =
311
  SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
312
 
313
+ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
314
+ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
315
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{
316
+ b_args, evt_acc_args, {}};
317
+ return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
318
  }
319
  };
320
 
321
+ }; // namespace vllm::c2x
cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp CHANGED
@@ -1,6 +1,7 @@
1
  #pragma once
2
 
3
  #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
 
4
 
5
  /*
6
  This file defines custom epilogues for fusing channel scales, token scales,
@@ -16,36 +17,68 @@ namespace vllm::c3x {
16
 
17
  using namespace cute;
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  /*
20
  * This class provides the common load descriptors for the
21
  * ScaledEpilogue[...] classes
22
  */
23
- template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
24
  struct ScaledEpilogueBase {
25
  protected:
26
  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
27
 
28
  template <typename T>
29
  using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
30
- 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
31
- Stride<Int<1>, Int<0>, Int<0>>>;
32
 
33
  template <typename T>
34
  using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
35
- 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
36
- Stride<Int<0>, Int<1>, Int<0>>>;
37
 
38
  // Don't want to support nullptr by default
39
  template <typename T, bool EnableNullPtr = false>
40
  using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
41
- 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
42
- Stride<Int<1>, Int<0>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
43
 
44
  // Don't want to support nullptr by default
45
  template <typename T, bool EnableNullPtr = false>
46
  using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
47
- 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
48
- Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
 
 
 
 
 
 
 
 
 
 
49
 
50
  // This utility function constructs the arguments for the load descriptors
51
  // from a tensor. It can handle both row and column, as well as row/column or
@@ -74,6 +107,14 @@ struct ScaledEpilogueBase {
74
  std::is_same_v<Descriptor, RowLoad<T, true>>);
75
  return Arguments{data_ptr};
76
  }
 
 
 
 
 
 
 
 
77
  };
78
 
79
  /*
@@ -92,11 +133,11 @@ struct ScaledEpilogueBase {
92
  the A and B operands respectively. These scales may be either per-tensor or
93
  per row or column.
94
  */
95
- template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
96
  struct ScaledEpilogue
97
- : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
98
  private:
99
- using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
100
  using Accum = typename SUPER::Accum;
101
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
102
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@@ -122,8 +163,8 @@ struct ScaledEpilogue
122
  auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
123
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
124
 
125
- typename EVTCompute0::Arguments evt0_args{b_args};
126
- return ArgumentType{a_args, evt0_args};
127
  }
128
  };
129
 
@@ -136,11 +177,11 @@ struct ScaledEpilogue
136
  * The bias tensor must be per-output channel.
137
  * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
138
  */
139
- template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
140
  struct ScaledEpilogueBias
141
- : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
142
  private:
143
- using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
144
  using Accum = typename SUPER::Accum;
145
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
146
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@@ -169,8 +210,51 @@ struct ScaledEpilogueBias
169
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
170
  auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
171
 
172
- typename EVTCompute0::Arguments evt0_args{b_args};
173
- return ArgumentType{a_args, evt0_args, bias_args};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  }
175
  };
176
 
@@ -182,11 +266,11 @@ struct ScaledEpilogueBias
182
  *
183
  * This epilogue also supports bias, which remains per-channel.
184
  */
185
- template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
186
  struct ScaledEpilogueBiasAzp
187
- : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
188
  private:
189
- using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
190
  using Accum = typename SUPER::Accum;
191
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
192
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@@ -230,9 +314,10 @@ struct ScaledEpilogueBiasAzp
230
  auto azp_adj_args =
231
  SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
232
 
233
- typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
234
- typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
235
- return ArgumentType{a_args, evt_scale_b_args, bias_args};
 
236
  }
237
  };
238
 
@@ -246,11 +331,11 @@ struct ScaledEpilogueBiasAzp
246
  *
247
  * This epilogue also supports bias, which remains per-channel.
248
  */
249
- template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
250
  struct ScaledEpilogueBiasAzpToken
251
- : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
252
  private:
253
- using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
254
  using Accum = typename SUPER::Accum;
255
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
256
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@@ -307,11 +392,59 @@ struct ScaledEpilogueBiasAzpToken
307
  auto azp_adj_args =
308
  SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
309
 
310
- typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
311
- typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
312
- typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
313
- return ArgumentType{a_args, evt_scale_b_args, bias_args};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  }
315
  };
316
 
317
- }; // namespace vllm::c3x
 
1
  #pragma once
2
 
3
  #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
4
+ #include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp"
5
 
6
  /*
7
  This file defines custom epilogues for fusing channel scales, token scales,
 
17
 
18
  using namespace cute;
19
 
20
+ template <typename T>
21
+ struct identity {
22
+ CUTLASS_HOST_DEVICE
23
+ T operator()(T lhs) const { return lhs; }
24
+ };
25
+
26
+ template <typename ElementAcc, typename ElementD, typename TileShape>
27
+ struct TrivialEpilogue {
28
+ private:
29
+ using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
30
+ using Compute = cutlass::epilogue::fusion::Sm90Compute<
31
+ cutlass::epilogue::thread::Identity, ElementD, ElementAcc,
32
+ cutlass::FloatRoundStyle::round_to_nearest>;
33
+
34
+ public:
35
+ using EVTCompute = cutlass::epilogue::fusion::Sm90EVT<Compute, Accum>;
36
+ using ArgumentType = typename EVTCompute::Arguments;
37
+
38
+ template <typename... Args>
39
+ static ArgumentType prepare_args(Args... args) {
40
+ return {};
41
+ }
42
+ };
43
+
44
  /*
45
  * This class provides the common load descriptors for the
46
  * ScaledEpilogue[...] classes
47
  */
48
+ template <typename ElementAcc, typename ElementD, typename TileShape>
49
  struct ScaledEpilogueBase {
50
  protected:
51
  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
52
 
53
  template <typename T>
54
  using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
55
+ 0 /*Stages*/, TileShape, T, Stride<Int<1>, Int<0>, Int<0>>>;
 
56
 
57
  template <typename T>
58
  using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
59
+ 0 /*Stages*/, TileShape, T, Stride<Int<0>, Int<1>, Int<0>>>;
 
60
 
61
  // Don't want to support nullptr by default
62
  template <typename T, bool EnableNullPtr = false>
63
  using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
64
+ 0 /*Stages*/, TileShape, T, T, Stride<Int<1>, Int<0>, Int<0>>,
65
+ 128 / sizeof_bits_v<T>, EnableNullPtr>;
66
 
67
  // Don't want to support nullptr by default
68
  template <typename T, bool EnableNullPtr = false>
69
  using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
70
+ 0 /*Stages*/, TileShape, T, T, Stride<Int<0>, Int<1>, Int<0>>,
71
+ 128 / sizeof_bits_v<T>, EnableNullPtr>;
72
+
73
+ template <typename T>
74
+ using ColOrScalarLoadArray =
75
+ cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray<
76
+ 0 /*Stages*/, TileShape, T, Stride<Int<1>, Int<0>, Int<0>>>;
77
+
78
+ template <typename T>
79
+ using RowOrScalarLoadArray =
80
+ cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray<
81
+ 0 /*Stages*/, TileShape, T, Stride<Int<0>, Int<1>, Int<0>>>;
82
 
83
  // This utility function constructs the arguments for the load descriptors
84
  // from a tensor. It can handle both row and column, as well as row/column or
 
107
  std::is_same_v<Descriptor, RowLoad<T, true>>);
108
  return Arguments{data_ptr};
109
  }
110
+
111
+ template <typename Descriptor, typename T>
112
+ static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) {
113
+ using Arguments = typename Descriptor::Arguments;
114
+ static_assert(std::is_same_v<Descriptor, ColOrScalarLoadArray<T>> ||
115
+ std::is_same_v<Descriptor, RowOrScalarLoadArray<T>>);
116
+ return Arguments{data_ptr, do_broadcast};
117
+ }
118
  };
119
 
120
  /*
 
133
  the A and B operands respectively. These scales may be either per-tensor or
134
  per row or column.
135
  */
136
+ template <typename ElementAcc, typename ElementD, typename TileShape>
137
  struct ScaledEpilogue
138
+ : private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
139
  private:
140
+ using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
141
  using Accum = typename SUPER::Accum;
142
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
143
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
 
163
  auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
164
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
165
 
166
+ typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
167
+ return ArgumentType{a_args, evt0_args, {}};
168
  }
169
  };
170
 
 
177
  * The bias tensor must be per-output channel.
178
  * ScaleA and ScaleB can be per-tensor or per-token/per-channel.
179
  */
180
+ template <typename ElementAcc, typename ElementD, typename TileShape>
181
  struct ScaledEpilogueBias
182
+ : private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
183
  private:
184
+ using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
185
  using Accum = typename SUPER::Accum;
186
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
187
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
 
210
  auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
211
  auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
212
 
213
+ typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
214
+ return ArgumentType{a_args, evt0_args, bias_args, {}};
215
+ }
216
+ };
217
+
218
+ /*
219
+ * This epilogue performs the same operation as ScaledEpilogueBias, but the
220
+ * bias is a column vector instead of a row vector. Useful e.g. if we are
221
+ * computing a GEMM via C^T += B^T A^T. This happens in the 2:4 sparse kernels.
222
+ */
223
+ template <typename ElementAcc, typename ElementD, typename TileShape>
224
+ struct ScaledEpilogueColumnBias
225
+ : private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
226
+ private:
227
+ using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
228
+ using Accum = typename SUPER::Accum;
229
+ using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
230
+ using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
231
+ using Bias = typename SUPER::template ColLoad<ElementD>;
232
+
233
+ using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
234
+ cutlass::multiplies, float, float,
235
+ cutlass::FloatRoundStyle::round_to_nearest>;
236
+
237
+ using EVTCompute0 =
238
+ cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
239
+
240
+ using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
241
+ cutlass::multiply_add, ElementD, float,
242
+ cutlass::FloatRoundStyle::round_to_nearest>;
243
+
244
+ public:
245
+ using EVTCompute =
246
+ cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
247
+
248
+ using ArgumentType = typename EVTCompute::Arguments;
249
+ static ArgumentType prepare_args(torch::Tensor const& a_scales,
250
+ torch::Tensor const& b_scales,
251
+ torch::Tensor const& bias) {
252
+ auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
253
+ auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
254
+ auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
255
+
256
+ typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
257
+ return ArgumentType{a_args, evt0_args, bias_args, {}};
258
  }
259
  };
260
 
 
266
  *
267
  * This epilogue also supports bias, which remains per-channel.
268
  */
269
+ template <typename ElementAcc, typename ElementD, typename TileShape>
270
  struct ScaledEpilogueBiasAzp
271
+ : private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
272
  private:
273
+ using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
274
  using Accum = typename SUPER::Accum;
275
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
276
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
 
314
  auto azp_adj_args =
315
  SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
316
 
317
+ typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
318
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{
319
+ b_args, evt_azp_args, {}};
320
+ return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
321
  }
322
  };
323
 
 
331
  *
332
  * This epilogue also supports bias, which remains per-channel.
333
  */
334
+ template <typename ElementAcc, typename ElementD, typename TileShape>
335
  struct ScaledEpilogueBiasAzpToken
336
+ : private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
337
  private:
338
+ using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
339
  using Accum = typename SUPER::Accum;
340
  using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
341
  using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
 
392
  auto azp_adj_args =
393
  SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
394
 
395
+ typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
396
+ typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
397
+ typename EVTComputeScaleB::Arguments evt_scale_b_args{
398
+ b_args, evt_acc_args, {}};
399
+ return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
400
+ }
401
+ };
402
+
403
+ /*
404
+ This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers
405
+ to arrays containing different scales used in group gemm. The number of
406
+ pointers in ScaleA and the number of pointers in ScaleB are equal to the
407
+ group size.
408
+ */
409
+ template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
410
+ struct ScaledEpilogueArray
411
+ : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
412
+ private:
413
+ using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
414
+ using Accum = typename SUPER::Accum;
415
+ using ScaleA = typename SUPER::template ColOrScalarLoadArray<float>;
416
+ using ScaleB = typename SUPER::template RowOrScalarLoadArray<float>;
417
+
418
+ using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
419
+ cutlass::multiplies, float, float,
420
+ cutlass::FloatRoundStyle::round_to_nearest>;
421
+
422
+ using EVTCompute0 =
423
+ cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
424
+
425
+ using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
426
+ cutlass::multiplies, ElementD, float,
427
+ cutlass::FloatRoundStyle::round_to_nearest>;
428
+
429
+ public:
430
+ using EVTCompute =
431
+ cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
432
+ using ArgumentType = typename EVTCompute::Arguments;
433
+
434
+ using ScaleAArray = typename SUPER::template ColOrScalarLoadArray<float>;
435
+ using ScaleBArray = typename SUPER::template RowOrScalarLoadArray<float>;
436
+
437
+ static ArgumentType prepare_args(float const* const* a_scales_ptr,
438
+ float const* const* b_scales_ptr,
439
+ bool a_col_broadcast, bool b_row_broadcast) {
440
+ auto a_args = SUPER::template args_from_tensor<ScaleAArray, float>(
441
+ a_scales_ptr, a_col_broadcast);
442
+ auto b_args = SUPER::template args_from_tensor<ScaleBArray, float>(
443
+ b_scales_ptr, b_row_broadcast);
444
+
445
+ typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
446
+ return ArgumentType{a_args, evt0_args, {}};
447
  }
448
  };
449
 
450
+ }; // namespace vllm::c3x
cutlass_w8a8/Epilogues.md CHANGED
@@ -1,17 +1,19 @@
1
  # CUTLASS Epilogues
2
 
3
  ## Introduction
4
- This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs.
 
5
 
6
  Currently, we only support symmetric quantization for weights,
7
  and symmetric and asymmetric quantization for activations.
8
  Both can be quantized per-tensor or per-channel (weights) / per-token (activations).
9
 
10
  There are 4 epilogues:
11
- 1. ScaledEpilogue: symmetric quantization for activations, no bias.
12
- 1. ScaledEpilogueBias: symmetric quantization for activations, supports bias.
13
- 1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias.
14
- 1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias.
 
15
 
16
  We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size.
17
  Instead, if no bias is passed, the epilogue will use 0 as the bias.
@@ -26,12 +28,15 @@ If $` \widehat X `$ is the quantized $` X `$, our matrices become the following
26
  ```math
27
  A = s_a (\widehat A - J_a z_a)
28
  ```
 
29
  ```math
30
  B = s_b \widehat B
31
  ```
 
32
  ```math
33
  D = A B + C
34
  ```
 
35
  ```math
36
  D = s_a s_b \widehat D + C
37
  ```
@@ -48,9 +53,11 @@ Expanding further, we can calculate $` \widehat D `$ as follows:
48
  ```math
49
  A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B
50
  ```
 
51
  ```math
52
  A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right)
53
  ```
 
54
  ```math
55
  \widehat D = \widehat A \widehat B - z_a J_a \widehat B
56
  ```
@@ -61,16 +68,19 @@ Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of
61
 
62
  ## Epilogues
63
 
64
- ### ScaledEpilogue
 
65
  This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$.
66
  The output of the GEMM is:
67
 
68
  ```math
69
  \widehat D = \widehat A \widehat B
70
  ```
 
71
  ```math
72
  D = s_a s_b \widehat D
73
  ```
 
74
  ```math
75
  D = s_a s_b \widehat A \widehat B
76
  ```
@@ -79,44 +89,51 @@ Epilogue parameters:
79
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
80
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
81
 
82
- ### ScaledEpilogueBias
 
83
  This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$.
84
  The output of the GEMM is:
85
 
86
  ```math
87
  \widehat D = \widehat A \widehat B
88
  ```
 
89
  ```math
90
  D = s_a s_b \widehat D + C
91
  ```
 
92
  ```math
93
  D = s_a s_b \widehat A \widehat B + C
94
  ```
95
 
96
-
97
  Epilogue parameters:
 
98
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
99
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
100
  - `bias` is the bias, which is always per-channel (row-vector).
101
 
102
- ### ScaledEpilogueAzp
 
103
  This epilogue computes the asymmetric per-tensor quantization for activations with bias.
104
  The output of the GEMM is:
105
 
106
  ```math
107
  \widehat D = \widehat A \widehat B - z_a J_a \widehat B
108
  ```
 
109
  ```math
110
  D = s_a s_b \widehat D + C
111
  ```
 
112
  ```math
113
  D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C
114
  ```
115
 
116
- Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$.
117
  That is precomputed and stored in `azp_with_adj` as a row-vector.
118
 
119
  Epilogue parameters:
 
120
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
121
  - Generally this will be per-tensor as the zero-points are per-tensor.
122
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
@@ -125,13 +142,15 @@ Epilogue parameters:
125
 
126
  To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel.
127
 
128
- ### ScaledEpilogueAzpPerToken
 
129
  This epilogue computes the asymmetric per-token quantization for activations with bias.
130
 
131
  The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector.
132
  That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$.
133
 
134
  Epilogue parameters:
 
135
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
136
  - Generally this will be per-token as the zero-points are per-token.
137
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
@@ -142,6 +161,7 @@ Epilogue parameters:
142
  To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel.
143
 
144
  The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM):
145
- ```
 
146
  out = scale_a * scale_b * (Dq - azp_adj * azp) + bias
147
  ```
 
1
  # CUTLASS Epilogues
2
 
3
  ## Introduction
4
+
5
+ This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs.
6
 
7
  Currently, we only support symmetric quantization for weights,
8
  and symmetric and asymmetric quantization for activations.
9
  Both can be quantized per-tensor or per-channel (weights) / per-token (activations).
10
 
11
  There are 4 epilogues:
12
+
13
+ 1. `ScaledEpilogue`: symmetric quantization for activations, no bias.
14
+ 1. `ScaledEpilogueBias`: symmetric quantization for activations, supports bias.
15
+ 1. `ScaledEpilogueAzp`: asymmetric per-tensor quantization for activations, supports bias.
16
+ 1. `ScaledEpilogueAzpPerToken`: asymmetric per-token quantization for activations, supports bias.
17
 
18
  We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size.
19
  Instead, if no bias is passed, the epilogue will use 0 as the bias.
 
28
  ```math
29
  A = s_a (\widehat A - J_a z_a)
30
  ```
31
+
32
  ```math
33
  B = s_b \widehat B
34
  ```
35
+
36
  ```math
37
  D = A B + C
38
  ```
39
+
40
  ```math
41
  D = s_a s_b \widehat D + C
42
  ```
 
53
  ```math
54
  A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B
55
  ```
56
+
57
  ```math
58
  A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right)
59
  ```
60
+
61
  ```math
62
  \widehat D = \widehat A \widehat B - z_a J_a \widehat B
63
  ```
 
68
 
69
  ## Epilogues
70
 
71
+ ### `ScaledEpilogue`
72
+
73
  This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$.
74
  The output of the GEMM is:
75
 
76
  ```math
77
  \widehat D = \widehat A \widehat B
78
  ```
79
+
80
  ```math
81
  D = s_a s_b \widehat D
82
  ```
83
+
84
  ```math
85
  D = s_a s_b \widehat A \widehat B
86
  ```
 
89
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
90
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
91
 
92
+ ### `ScaledEpilogueBias`
93
+
94
  This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$.
95
  The output of the GEMM is:
96
 
97
  ```math
98
  \widehat D = \widehat A \widehat B
99
  ```
100
+
101
  ```math
102
  D = s_a s_b \widehat D + C
103
  ```
104
+
105
  ```math
106
  D = s_a s_b \widehat A \widehat B + C
107
  ```
108
 
 
109
  Epilogue parameters:
110
+
111
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
112
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
113
  - `bias` is the bias, which is always per-channel (row-vector).
114
 
115
+ ### `ScaledEpilogueAzp`
116
+
117
  This epilogue computes the asymmetric per-tensor quantization for activations with bias.
118
  The output of the GEMM is:
119
 
120
  ```math
121
  \widehat D = \widehat A \widehat B - z_a J_a \widehat B
122
  ```
123
+
124
  ```math
125
  D = s_a s_b \widehat D + C
126
  ```
127
+
128
  ```math
129
  D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C
130
  ```
131
 
132
+ Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 \widehat B `$.
133
  That is precomputed and stored in `azp_with_adj` as a row-vector.
134
 
135
  Epilogue parameters:
136
+
137
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
138
  - Generally this will be per-tensor as the zero-points are per-tensor.
139
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 
142
 
143
  To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel.
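As a small worked illustration (toy numbers, not from the source): with a per-tensor zero point $` z_a = 3 `$ and a column of $` \widehat B `$ whose entries sum to 10, the corresponding entry of the precomputed row-vector is

```math
\mathrm{azp\_with\_adj}_j = z_a \left( \mathbf 1 \widehat B \right)_j = 3 \cdot 10 = 30
```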
144
 
145
+ ### `ScaledEpilogueAzpPerToken`
146
+
147
  This epilogue computes the asymmetric per-token quantization for activations with bias.
148
 
149
  The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector.
150
  That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$.
151
 
152
  Epilogue parameters:
153
+
154
  - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
155
  - Generally this will be per-token as the zero-points are per-token.
156
  - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 
161
  To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel.
162
 
163
  The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM):
164
+
165
+ ```math
166
  out = scale_a * scale_b * (Dq - azp_adj * azp) + bias
167
  ```
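A hedged numeric check of this formula for a single output element, with invented values $` s_a = 0.1 `$, $` s_b = 0.2 `$, $` Dq = 100 `$, per-token $` azp = 3 `$, precomputed $` azp\_adj = 10 `$, and $` bias = 0.5 `$:

```math
out = 0.1 \cdot 0.2 \cdot (100 - 10 \cdot 3) + 0.5 = 0.02 \cdot 70 + 0.5 = 1.9
```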
cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu ADDED
@@ -0,0 +1,23 @@
1
+ #include "scaled_mm_kernels.hpp"
2
+ #include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh"
3
+ #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
4
+
5
+ namespace vllm {
6
+
7
+ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
8
+ torch::Tensor const& a,
9
+ torch::Tensor const& b,
10
+ torch::Tensor const& a_scales,
11
+ torch::Tensor const& b_scales) {
12
+ if (out.dtype() == torch::kBFloat16) {
13
+ cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
14
+ out, a, b, a_scales, b_scales);
15
+
16
+ } else {
17
+ TORCH_CHECK(out.dtype() == torch::kFloat16);
18
+ cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::half_t>(
19
+ out, a, b, a_scales, b_scales);
20
+ }
21
+ }
22
+
23
+ } // namespace vllm
cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh ADDED
@@ -0,0 +1,279 @@
1
+ #pragma once
2
+
3
+ #include "cuda_utils.h"
4
+ #include "cutlass/cutlass.h"
5
+ #include "cutlass/numeric_types.h"
6
+
7
+ #include "cute/tensor.hpp"
8
+ #include "cutlass/tensor_ref.h"
9
+ #include "cutlass/gemm/dispatch_policy.hpp"
10
+ #include "cutlass/gemm/collective/collective_builder.hpp"
11
+ #include "cutlass/gemm/device/gemm_universal_adapter.h"
12
+ #include "cutlass/gemm/kernel/gemm_universal.hpp"
13
+ #include "cutlass/gemm/kernel/tile_scheduler_params.h"
14
+ #include "cutlass/epilogue/dispatch_policy.hpp"
15
+ #include "cutlass/epilogue/collective/collective_builder.hpp"
16
+
17
+ #include "cutlass_extensions/gemm/dispatch_policy.hpp"
18
+ #include "cutlass_extensions/gemm/collective/collective_builder.hpp"
19
+
20
+ #include "cutlass_gemm_caller.cuh"
21
+
22
+ namespace vllm {
23
+
24
+ using namespace cute;
25
+
26
+ // clang-format off
27
+ template <class OutType, int ScaleGranularityM,
28
+ int ScaleGranularityN, int ScaleGranularityK,
29
+ class MmaTileShape, class ClusterShape,
30
+ class EpilogueScheduler, class MainloopScheduler,
31
+ bool swap_ab_ = false>
32
+ struct cutlass_3x_gemm_fp8_blockwise {
33
+ static constexpr bool swap_ab = swap_ab_;
34
+ using ElementAB = cutlass::float_e4m3_t;
35
+
36
+ using ElementA = ElementAB;
37
+ using LayoutA = cutlass::layout::RowMajor;
38
+ using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
39
+ static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
40
+
41
+ using ElementB = ElementAB;
42
+ using LayoutB = cutlass::layout::ColumnMajor;
43
+ using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
44
+ static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
45
+
46
+ using ElementD = OutType;
47
+ using LayoutD = cutlass::layout::RowMajor;
48
+ using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
49
+ static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
50
+
51
+ using ElementC = void; // TODO: support bias
52
+ using LayoutC = LayoutD;
53
+ using LayoutC_Transpose = LayoutD_Transpose;
54
+ static constexpr int AlignmentC = AlignmentD;
55
+
56
+ using ElementAccumulator = float;
57
+ using ElementCompute = float;
58
+ using ElementBlockScale = float;
59
+
60
+ using ScaleConfig = conditional_t<swap_ab,
61
+ cutlass::detail::Sm100BlockwiseScaleConfig<
62
+ ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
63
+ cute::UMMA::Major::K, cute::UMMA::Major::MN>,
64
+ cutlass::detail::Sm100BlockwiseScaleConfig<
65
+ ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
66
+ cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
67
+
68
+ // layout_SFA and layout_SFB cannot be swapped since they are deduced.
69
+ using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
70
+ using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
71
+
72
+ using ArchTag = cutlass::arch::Sm100;
73
+ using OperatorClass = cutlass::arch::OpClassTensorOp;
74
+
75
+ static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
76
+ using ElementScalar = float;
77
+ using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
78
+ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
79
+ ArchTag,
80
+ OperatorClass,
81
+ MmaTileShape,
82
+ ClusterShape,
83
+ cutlass::epilogue::collective::EpilogueTileAuto,
84
+ ElementAccumulator,
85
+ ElementCompute,
86
+ ElementC,
87
+ conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
88
+ AlignmentC,
89
+ ElementD,
90
+ conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
91
+ AlignmentD,
92
+ EpilogueScheduler,
93
+ DefaultOperation
94
+ >::CollectiveOp;
95
+
96
+ using StageCountType = cutlass::gemm::collective::StageCountAuto;
97
+ using CollectiveMainloop = conditional_t<swap_ab,
98
+ typename cutlass::gemm::collective::CollectiveBuilder<
99
+ ArchTag,
100
+ OperatorClass,
101
+ ElementB,
102
+ cute::tuple<LayoutB_Transpose, LayoutSFA>,
103
+ AlignmentB,
104
+ ElementA,
105
+ cute::tuple<LayoutA_Transpose, LayoutSFB>,
106
+ AlignmentA,
107
+ ElementAccumulator,
108
+ MmaTileShape,
109
+ ClusterShape,
110
+ cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
111
+ MainloopScheduler
112
+ >::CollectiveOp,
113
+ typename cutlass::gemm::collective::CollectiveBuilder<
114
+ ArchTag,
115
+ OperatorClass,
116
+ ElementA,
117
+ cute::tuple<LayoutA, LayoutSFA>,
118
+ AlignmentA,
119
+ ElementB,
120
+ cute::tuple<LayoutB, LayoutSFB>,
121
+ AlignmentB,
122
+ ElementAccumulator,
123
+ MmaTileShape,
124
+ ClusterShape,
125
+ cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
126
+ MainloopScheduler
127
+ >::CollectiveOp>;
128
+
129
+ using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
130
+ Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
131
+
132
+ struct GemmKernel : public KernelType {};
133
+ };
134
+
135
+ template <typename Gemm>
136
+ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
137
+ torch::Tensor const& b,
138
+ torch::Tensor const& a_scales,
139
+ torch::Tensor const& b_scales) {
140
+ static constexpr bool swap_ab = Gemm::swap_ab;
141
+ using GemmKernel = typename Gemm::GemmKernel;
142
+ using StrideA = typename Gemm::GemmKernel::StrideA;
143
+ using StrideB = typename Gemm::GemmKernel::StrideB;
144
+ using StrideD = typename Gemm::GemmKernel::StrideD;
145
+ using StrideC = typename Gemm::GemmKernel::StrideC;
146
+ using LayoutSFA = typename Gemm::LayoutSFA;
147
+ using LayoutSFB = typename Gemm::LayoutSFB;
148
+ using ScaleConfig = typename Gemm::ScaleConfig;
149
+
150
+ using ElementAB = typename Gemm::ElementAB;
151
+ using ElementD = typename Gemm::ElementD;
152
+
153
+ int32_t m = a.size(0), n = b.size(1), k = a.size(1);
154
+
155
+ StrideA a_stride;
156
+ StrideB b_stride;
157
+ StrideC c_stride;
158
+ a_stride =
159
+ cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
160
+ b_stride =
161
+ cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
162
+ c_stride =
163
+ cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
164
+
165
+ LayoutSFA layout_SFA = swap_ab ?
166
+ ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
167
+ ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
168
+ LayoutSFB layout_SFB = swap_ab ?
169
+ ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
170
+ ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
171
+
172
+ auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
173
+ auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
174
+ auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
175
+ auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
176
+
177
+ auto mainloop_args = [&](){
178
+ // layout_SFA and layout_SFB cannot be swapped since they are deduced.
179
+ if (swap_ab) {
180
+ return typename GemmKernel::MainloopArguments{
181
+ b_ptr, b_stride, a_ptr, a_stride,
182
+ b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
183
+ };
184
+ }
185
+ else {
186
+ return typename GemmKernel::MainloopArguments{
187
+ a_ptr, a_stride, b_ptr, b_stride,
188
+ a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
189
+ };
190
+ }
191
+ }();
192
+ auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
193
+
194
+ auto c_ptr = static_cast<ElementD*>(out.data_ptr());
195
+ typename GemmKernel::EpilogueArguments epilogue_args{
196
+ {}, c_ptr, c_stride, c_ptr, c_stride};
197
+ c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
198
+ epilogue_args);
199
+ }
200
+
201
+ template <typename OutType>
202
+ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
203
+ torch::Tensor const& a,
204
+ torch::Tensor const& b,
205
+ torch::Tensor const& a_scales,
206
+ torch::Tensor const& b_scales) {
207
+ int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
208
+ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
209
+
210
+ constexpr int TILE_K = 128;
211
+ // TODO: better heuristics
212
+ bool swap_ab = (m < 16) || (m % 4 != 0);
213
+ bool use_tma_epilogue = (m * n) % 4 == 0;
214
+ if (!swap_ab) {
215
+ constexpr int TILE_N = 128;
216
+ int tile_m = 256;
217
+ if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
218
+ tile_m = 64;
219
+ }
220
+ else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
221
+ tile_m = 128;
222
+ }
223
+ if (tile_m == 64) {
224
+ if (use_tma_epilogue) {
225
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
226
+ OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
227
+ Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
228
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
229
+ out, a, b, a_scales, b_scales);
230
+ } else {
231
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
232
+ OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
233
+ Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
234
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
235
+ out, a, b, a_scales, b_scales);
236
+ }
237
+ } else if (tile_m == 128) {
238
+ if (use_tma_epilogue) {
239
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
240
+ OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
241
+ Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
242
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
243
+ out, a, b, a_scales, b_scales);
244
+ } else {
245
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
246
+ OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
247
+ Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
248
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
249
+ out, a, b, a_scales, b_scales);
250
+ }
251
+ } else { // tile_m == 256
252
+ if (use_tma_epilogue) {
253
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
254
+ OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
255
+ Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
256
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
257
+ out, a, b, a_scales, b_scales);
258
+ } else {
259
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
260
+ OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
261
+ Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
262
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
263
+ out, a, b, a_scales, b_scales);
264
+ }
265
+ }
266
+ } else {
267
+ // TODO: Test more tile N configs
268
+ constexpr int TILE_M = 128;
269
+ constexpr int TILE_N = 16;
270
+ // TMA epilogue isn't compatible with Swap A/B
271
+ cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
272
+ OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
273
+ Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
274
+ cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
275
+ out, a, b, a_scales, b_scales);
276
+ }
277
+ }
278
+
279
+ } // namespace vllm
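For reference, the tile-selection heuristic implemented above can be exercised on the host without building the kernels. The sketch below mirrors only the decision logic (swap_ab, use_tma_epilogue, tile_m); ceil_div and pick_blockwise_config are illustrative helper names, not part of these sources.

#include <cstdint>

inline int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

struct BlockwiseConfig {
  bool swap_ab;           // small or oddly-shaped M uses the swapped A/B path
  bool use_tma_epilogue;  // TMA epilogue requires (m * n) % 4 == 0
  int tile_m;             // 64, 128 or 256; only meaningful when !swap_ab
};

inline BlockwiseConfig pick_blockwise_config(int64_t m, int64_t n, int sms) {
  constexpr int TILE_N = 128;
  BlockwiseConfig cfg;
  cfg.swap_ab = (m < 16) || (m % 4 != 0);
  cfg.use_tma_epilogue = (m * n) % 4 == 0;
  cfg.tile_m = 256;  // default: 2-SM cluster
  if (ceil_div(n, TILE_N) * ceil_div(m, 64) <= sms) {
    cfg.tile_m = 64;   // the whole problem fits in one wave of 64-row tiles
  } else if (ceil_div(n, TILE_N) * ceil_div(m, 128) <= sms) {
    cfg.tile_m = 128;
  }
  return cfg;
}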
cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu CHANGED
@@ -1,4 +1,3 @@
1
-
2
  #include "scaled_mm_kernels.hpp"
3
  #include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh"
4
  #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
@@ -21,4 +20,4 @@ void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out,
21
  }
22
  }
23
 
24
- } // namespace vllm
 
 
1
  #include "scaled_mm_kernels.hpp"
2
  #include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh"
3
  #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
20
  }
21
  }
22
 
23
+ } // namespace vllm
cutlass_w8a8/c3x/scaled_mm_helper.hpp ADDED
@@ -0,0 +1,75 @@
1
+ #include <torch/all.h>
2
+ #include "cuda_utils.h"
3
+ #include "cutlass_extensions/common.hpp"
4
+
5
+ template <typename Fp8Func, typename Int8Func, typename BlockwiseFunc>
6
+ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
7
+ torch::Tensor const& b, torch::Tensor const& a_scales,
8
+ torch::Tensor const& b_scales,
9
+ std::optional<torch::Tensor> const& bias,
10
+ Fp8Func fp8_func, Int8Func int8_func,
11
+ BlockwiseFunc blockwise_func) {
12
+ TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
13
+ TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
14
+
15
+ int M = a.size(0), N = b.size(1), K = a.size(1);
16
+
17
+ if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
18
+ (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) {
19
+ // Standard per-tensor/per-token/per-channel scaling
20
+ TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
21
+ if (a.dtype() == torch::kFloat8_e4m3fn) {
22
+ fp8_func(c, a, b, a_scales, b_scales, bias);
23
+ } else {
24
+ TORCH_CHECK(a.dtype() == torch::kInt8);
25
+ if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
26
+ int8_func(c, a, b, a_scales, b_scales, bias);
27
+ } else {
28
+ TORCH_CHECK(false, "Int8 not supported for this architecture");
29
+ }
30
+ }
31
+ } else {
32
+ TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor.");
33
+ TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor.");
34
+ int32_t version_num = get_sm_version_num();
35
+ if (version_num >= 100) {
36
+ TORCH_CHECK(
37
+ a.size(0) == a_scales.size(0) &&
38
+ cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1),
39
+ "a_scale_group_shape must be [1, 128].");
40
+ TORCH_CHECK(
41
+ cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) &&
42
+ cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1),
43
+ "b_scale_group_shape must be [128, 128].");
44
+ } else {
45
+ // TODO: Remove this after using cutlass sm90 blockwise scaling gemm
46
+ // kernel, or introducing ceil_div to the load_init() of mainloop.
47
+ using GroupShape = std::array<int64_t, 2>;
48
+ auto make_group_shape = [](torch::Tensor const& x,
49
+ torch::Tensor const& s) -> GroupShape {
50
+ TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
51
+ return {cuda_utils::ceil_div(x.size(0), s.size(0)),
52
+ cuda_utils::ceil_div(x.size(1), s.size(1))};
53
+ };
54
+
55
+ GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
56
+ GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
57
+
58
+ // 1x128 per-token group scales for activations
59
+ // 128x128 blockwise scales for weights
60
+ TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
61
+ b_scale_group_shape == GroupShape{128, 128} &&
62
+ a.dtype() == torch::kFloat8_e4m3fn &&
63
+ b.dtype() == torch::kFloat8_e4m3fn),
64
+ "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
65
+ "a_scale_group_shape must be [1, 128]. Got: [",
66
+ a_scale_group_shape[0], ", ", a_scale_group_shape[1],
67
+ "]\n"
68
+ "b_scale_group_shape must be [128, 128]. Got: [",
69
+ b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
70
+ }
71
+
72
+ TORCH_CHECK(!bias, "Bias is not yet supported for blockwise scaled_mm");
73
+ blockwise_func(c, a, b, a_scales, b_scales);
74
+ }
75
+ }
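The int8 slot in dispatch_scaled_mm is compiled out when a caller passes a plain nullptr (as the SM100 entry point later in this diff does), because the deduced Int8Func is then std::nullptr_t. A minimal standalone sketch of that pattern, with illustrative function names:

#include <cstdio>
#include <type_traits>

template <typename Int8Func>
void call_int8_or_fail(Int8Func int8_func) {
  if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
    int8_func();  // only instantiated when a real callable is passed
  } else {
    std::printf("Int8 not supported for this architecture\n");
  }
}

int main() {
  call_int8_or_fail([] { std::printf("running int8 kernel\n"); });
  call_int8_or_fail(nullptr);  // fine: the call is discarded by if constexpr
}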
cutlass_w8a8/c3x/scaled_mm_kernels.hpp CHANGED
@@ -36,4 +36,9 @@ void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
36
  torch::Tensor const& b_scales,
37
  std::optional<torch::Tensor> const& bias);
38
 
 
 
 
 
 
39
  } // namespace vllm
 
36
  torch::Tensor const& b_scales,
37
  std::optional<torch::Tensor> const& bias);
38
 
39
+ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
40
+ torch::Tensor const& a,
41
+ torch::Tensor const& b,
42
+ torch::Tensor const& a_scales,
43
+ torch::Tensor const& b_scales);
44
  } // namespace vllm
cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh CHANGED
@@ -15,16 +15,59 @@ using c3x::cutlass_gemm_caller;
15
  template <typename InType, typename OutType,
16
  template <typename, typename, typename> typename Epilogue>
17
  struct sm100_fp8_config_default {
 
18
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
19
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
20
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
21
- using TileShape = Shape<_256, _128, _64>;
22
  using ClusterShape = Shape<_2, _2, _1>;
23
  using Cutlass3xGemm =
24
  cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
25
  KernelSchedule, EpilogueSchedule>;
26
  };
27
 
28
  template <typename InType, typename OutType,
29
  template <typename, typename, typename> typename Epilogue,
30
  typename... EpilogueArgs>
@@ -39,8 +82,34 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
39
  using Cutlass3xGemmDefault =
40
  typename sm100_fp8_config_default<InType, OutType,
41
  Epilogue>::Cutlass3xGemm;
42
- return cutlass_gemm_caller<Cutlass3xGemmDefault>(
43
- out, a, b, std::forward<EpilogueArgs>(args)...);
44
  }
45
 
46
  template <template <typename, typename, typename> typename Epilogue,
 
15
  template <typename InType, typename OutType,
16
  template <typename, typename, typename> typename Epilogue>
17
  struct sm100_fp8_config_default {
18
+ // M in (256, inf)
19
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
20
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
21
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
22
+ using TileShape = Shape<_256, _128, _128>;
23
  using ClusterShape = Shape<_2, _2, _1>;
24
  using Cutlass3xGemm =
25
  cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
26
  KernelSchedule, EpilogueSchedule>;
27
  };
28
 
29
+ template <typename InType, typename OutType,
30
+ template <typename, typename, typename> typename Epilogue>
31
+ struct sm100_fp8_config_M256 {
32
+ // M in (64, 256]
33
+ static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
34
+ using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
35
+ using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
36
+ using TileShape = Shape<_128, _128, _128>;
37
+ using ClusterShape = Shape<_2, _1, _1>;
38
+ using Cutlass3xGemm =
39
+ cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
40
+ KernelSchedule, EpilogueSchedule>;
41
+ };
42
+
43
+ template <typename InType, typename OutType,
44
+ template <typename, typename, typename> typename Epilogue>
45
+ struct sm100_fp8_config_M64 {
46
+ // M in (16, 64]
47
+ static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
48
+ using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
49
+ using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
50
+ using TileShape = Shape<_64, _64, _128>;
51
+ using ClusterShape = Shape<_1, _1, _1>;
52
+ using Cutlass3xGemm =
53
+ cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
54
+ KernelSchedule, EpilogueSchedule>;
55
+ };
56
+
57
+ template <typename InType, typename OutType,
58
+ template <typename, typename, typename> typename Epilogue>
59
+ struct sm100_fp8_config_M16 {
60
+ // M in [1, 16]
61
+ static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
62
+ using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
63
+ using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
64
+ using TileShape = Shape<_64, _64, _128>;
65
+ using ClusterShape = Shape<_1, _4, _1>;
66
+ using Cutlass3xGemm =
67
+ cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
68
+ KernelSchedule, EpilogueSchedule>;
69
+ };
70
+
71
  template <typename InType, typename OutType,
72
  template <typename, typename, typename> typename Epilogue,
73
  typename... EpilogueArgs>
 
82
  using Cutlass3xGemmDefault =
83
  typename sm100_fp8_config_default<InType, OutType,
84
  Epilogue>::Cutlass3xGemm;
85
+ using Cutlass3xGemmM16 =
86
+ typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
87
+ using Cutlass3xGemmM64 =
88
+ typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
89
+ using Cutlass3xGemmM256 =
90
+ typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
91
+
92
+ uint32_t const m = a.size(0);
93
+ uint32_t const mp2 =
94
+ std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2
95
+
96
+ if (mp2 <= 16) {
97
+ // m in [1, 16]
98
+ return cutlass_gemm_caller<Cutlass3xGemmM16>(
99
+ out, a, b, std::forward<EpilogueArgs>(args)...);
100
+ } else if (mp2 <= 64) {
101
+ // m in (16, 64]
102
+ return cutlass_gemm_caller<Cutlass3xGemmM64>(
103
+ out, a, b, std::forward<EpilogueArgs>(args)...);
104
+ } else if (mp2 <= 256) {
105
+ // m in (64, 256]
106
+ return cutlass_gemm_caller<Cutlass3xGemmM256>(
107
+ out, a, b, std::forward<EpilogueArgs>(args)...);
108
+ } else {
109
+ // m in (256, inf)
110
+ return cutlass_gemm_caller<Cutlass3xGemmDefault>(
111
+ out, a, b, std::forward<EpilogueArgs>(args)...);
112
+ }
113
  }
114
 
115
  template <template <typename, typename, typename> typename Epilogue,
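The dispatch above buckets GEMMs by rounding M up to a power of two, clamped to 16. A host-only sketch of that mapping; next_pow_2 matches the helper from the removed cutlass_w8a8/common.hpp below, and the bucket labels summarize the four SM100 FP8 configurations added above.

#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>

inline uint32_t next_pow_2(uint32_t const num) {
  if (num <= 1) return num;
  // GCC/Clang builtin, as in the original helper
  return 1u << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}

inline const char* sm100_fp8_bucket(uint32_t m) {
  uint32_t const mp2 = std::max(16u, next_pow_2(m));
  if (mp2 <= 16) return "M16:     tile 64x64x128,   cluster 1x4x1";
  if (mp2 <= 64) return "M64:     tile 64x64x128,   cluster 1x1x1";
  if (mp2 <= 256) return "M256:    tile 128x128x128, cluster 2x1x1";
  return "default: tile 256x128x128, cluster 2x2x1";
}

int main() {
  for (uint32_t m : {1u, 16u, 17u, 64u, 200u, 4096u})
    std::printf("m=%u -> %s\n", m, sm100_fp8_bucket(m));
}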
cutlass_w8a8/common.hpp DELETED
@@ -1,27 +0,0 @@
1
- #pragma once
2
-
3
- #include "cutlass/cutlass.h"
4
- #include <climits>
5
-
6
- /**
7
- * Helper function for checking CUTLASS errors
8
- */
9
- #define CUTLASS_CHECK(status) \
10
- { \
11
- TORCH_CHECK(status == cutlass::Status::kSuccess, \
12
- cutlassGetStatusString(status)) \
13
- }
14
-
15
- inline uint32_t next_pow_2(uint32_t const num) {
16
- if (num <= 1) return num;
17
- return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
18
- }
19
-
20
- inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
21
- int max_shared_mem_per_block_opt_in = 0;
22
- cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
23
- cudaDevAttrMaxSharedMemoryPerBlockOptin,
24
- device);
25
- return max_shared_mem_per_block_opt_in;
26
- }
27
-
cutlass_w8a8/scaled_mm_c2x.cuh CHANGED
@@ -103,14 +103,19 @@ struct cutlass_2x_gemm {
103
 
104
  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
105
 
 
 
 
 
 
106
  // clang-format off
107
  using RowMajor = typename cutlass::layout::RowMajor;
108
  using ColumnMajor = typename cutlass::layout::ColumnMajor;
109
  using KernelType =
110
  ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
111
- ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
112
- ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
113
- float, cutlass::layout::RowMajor, 4,
114
  ElementAcc, float, cutlass::arch::OpClassTensorOp,
115
  Arch,
116
  TileShape, WarpShape, InstructionShape,
 
103
 
104
  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
105
 
106
+ // These are the minimum alignments needed for the kernels to compile
107
+ static constexpr int AlignmentAB =
108
+ 128 / cutlass::sizeof_bits<ElementAB>::value;
109
+ static constexpr int AlignmentCD = 4;
110
+
111
  // clang-format off
112
  using RowMajor = typename cutlass::layout::RowMajor;
113
  using ColumnMajor = typename cutlass::layout::ColumnMajor;
114
  using KernelType =
115
  ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
116
+ ElementAB, RowMajor, cutlass::ComplexTransform::kNone, AlignmentAB,
117
+ ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, AlignmentAB,
118
+ float, cutlass::layout::RowMajor, AlignmentCD,
119
  ElementAcc, float, cutlass::arch::OpClassTensorOp,
120
  Arch,
121
  TileShape, WarpShape, InstructionShape,
cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh CHANGED
@@ -336,7 +336,7 @@ inline void cutlass_gemm_sm89_fp8_dispatch(torch::Tensor& out,
336
 
337
  uint32_t const m = a.size(0);
338
  uint32_t const mp2 =
339
- std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2
340
 
341
  if (mp2 <= 16) {
342
  // M in [1, 16]
 
336
 
337
  uint32_t const m = a.size(0);
338
  uint32_t const mp2 =
339
+ std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2
340
 
341
  if (mp2 <= 16) {
342
  // M in [1, 16]
cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh CHANGED
@@ -321,7 +321,7 @@ inline void cutlass_gemm_sm89_int8_dispatch(torch::Tensor& out,
321
 
322
  uint32_t const m = a.size(0);
323
  uint32_t const mp2 =
324
- std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2
325
 
326
  if (mp2 <= 16) {
327
  // M in [1, 16]
 
321
 
322
  uint32_t const m = a.size(0);
323
  uint32_t const mp2 =
324
+ std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2
325
 
326
  if (mp2 <= 16) {
327
  // M in [1, 16]
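The only change in these two SM89 dispatchers is lowering the clamp from 32 to 16: with std::max(32u, next_pow_2(m)) the `mp2 <= 16` branch was unreachable and small-M inputs fell through to the M32 configuration. A quick host-side check of that claim, using next_pow_2 as defined in the removed cutlass_w8a8/common.hpp above:

#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>

inline uint32_t next_pow_2(uint32_t const num) {
  if (num <= 1) return num;
  return 1u << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}

int main() {
  for (uint32_t m = 1; m <= 16; ++m) {
    assert(std::max(32u, next_pow_2(m)) > 16);   // old clamp: M16 branch never taken
    assert(std::max(16u, next_pow_2(m)) <= 16);  // new clamp: M16 branch reachable
  }
}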
cutlass_w8a8/scaled_mm_c3x.cu DELETED
@@ -1,87 +0,0 @@
1
- #include <cudaTypedefs.h>
2
-
3
- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
4
-
5
- #include "scaled_mm_c3x_sm90_fp8_dispatch.cuh"
6
- #include "scaled_mm_c3x_sm90_int8_dispatch.cuh"
7
-
8
- #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
9
- using namespace vllm;
10
-
11
- /*
12
- This file defines quantized GEMM operations using the CUTLASS 3.x API, for
13
- NVIDIA GPUs with sm90a (Hopper) or later.
14
- */
15
-
16
- template <template <typename, typename, typename> typename Epilogue,
17
- typename... EpilogueArgs>
18
- void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
19
- torch::Tensor const& b,
20
- EpilogueArgs&&... epilogue_args) {
21
- if (a.dtype() == torch::kInt8) {
22
- TORCH_CHECK(b.dtype() == torch::kInt8);
23
-
24
- if (out.dtype() == torch::kBFloat16) {
25
- return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
26
- Epilogue>(
27
- out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
28
- } else {
29
- TORCH_CHECK(out.dtype() == torch::kFloat16);
30
- return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
31
- out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
32
- }
33
- } else {
34
- TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
35
- TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
36
-
37
- if (out.dtype() == torch::kBFloat16) {
38
- return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
39
- cutlass::bfloat16_t, Epilogue>(
40
- out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
41
- } else {
42
- TORCH_CHECK(out.dtype() == torch::kFloat16);
43
- return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
44
- cutlass::half_t, Epilogue>(
45
- out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
46
- }
47
- }
48
- }
49
-
50
- void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
51
- torch::Tensor const& b,
52
- torch::Tensor const& a_scales,
53
- torch::Tensor const& b_scales,
54
- std::optional<torch::Tensor> const& bias) {
55
- TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
56
- TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
57
- if (bias) {
58
- TORCH_CHECK(bias->dtype() == c.dtype(),
59
- "currently bias dtype must match output dtype ", c.dtype());
60
- return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBias>(
61
- c, a, b, a_scales, b_scales, *bias);
62
- } else {
63
- return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogue>(
64
- c, a, b, a_scales, b_scales);
65
- }
66
- }
67
-
68
- void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
69
- torch::Tensor const& b,
70
- torch::Tensor const& a_scales,
71
- torch::Tensor const& b_scales,
72
- torch::Tensor const& azp_adj,
73
- std::optional<torch::Tensor> const& azp,
74
- std::optional<torch::Tensor> const& bias) {
75
- TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
76
- TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
77
-
78
- if (azp) {
79
- return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBiasAzpToken>(
80
- out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
81
- } else {
82
- return cutlass_scaled_mm_sm90_epilogue<c3x::ScaledEpilogueBiasAzp>(
83
- out, a, b, a_scales, b_scales, azp_adj, bias);
84
- }
85
- }
86
-
87
- #endif
cutlass_w8a8/scaled_mm_c3x.cuh DELETED
@@ -1,160 +0,0 @@
1
- #pragma once
2
-
3
- // clang-format will break include orders
4
- // clang-format off
5
- #include <torch/all.h>
6
-
7
- #include <ATen/cuda/CUDAContext.h>
8
-
9
- #include "cutlass/cutlass.h"
10
-
11
- #include "cute/tensor.hpp"
12
- #include "cute/atom/mma_atom.hpp"
13
- #include "cutlass/numeric_types.h"
14
-
15
- #include "cutlass/gemm/device/gemm_universal_adapter.h"
16
- #include "cutlass/gemm/kernel/gemm_universal.hpp"
17
- #include "cutlass/epilogue/collective/collective_builder.hpp"
18
- #include "cutlass/gemm/collective/collective_builder.hpp"
19
-
20
- #include "core/math.hpp"
21
- #include "cutlass_extensions/common.hpp"
22
- // clang-format on
23
-
24
- /*
25
- Epilogues defined in,
26
- csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp,
27
- must contain a public type named EVTCompute of type Sm90EVT, as well as a
28
- static prepare_args function that constructs an EVTCompute::Arguments struct.
29
- */
30
-
31
- using namespace cute;
32
-
33
- namespace vllm {
34
-
35
- // A wrapper for the GEMM kernel that is used to guard against compilation on
36
- // architectures that will never use the kernel. The purpose of this is to
37
- // reduce the size of the compiled binary.
38
- // __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
39
- // into code that will be executed on the device where it is defined.
40
- template <typename Kernel>
41
- struct enable_sm90_or_later : Kernel {
42
- template <typename... Args>
43
- CUTLASS_DEVICE void operator()(Args&&... args) {
44
- #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
45
- Kernel::operator()(std::forward<Args>(args)...);
46
- #endif
47
- }
48
- };
49
-
50
- template <typename ElementAB_, typename ElementD_,
51
- template <typename, typename, typename> typename Epilogue_,
52
- typename TileShape, typename ClusterShape, typename KernelSchedule,
53
- typename EpilogueSchedule>
54
- struct cutlass_3x_gemm {
55
- using ElementAB = ElementAB_;
56
- using ElementD = ElementD_;
57
- using ElementAcc =
58
- typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
59
- float>::type;
60
-
61
- using EpilogueDescriptor =
62
- cutlass::epilogue::collective::detail::EpilogueDescriptor<
63
- TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
64
- ElementD, EpilogueSchedule>;
65
-
66
- using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
67
-
68
- using StrideD = Stride<int64_t, Int<1>, Int<0>>;
69
- using ElementC = void;
70
- using StrideC = StrideD;
71
-
72
- using EVTCompute = typename Epilogue::EVTCompute;
73
-
74
- using CollectiveEpilogue =
75
- typename cutlass::epilogue::collective::CollectiveBuilder<
76
- cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
77
- ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
78
- ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
79
- EpilogueSchedule, EVTCompute>::CollectiveOp;
80
-
81
- static constexpr size_t CEStorageSize =
82
- sizeof(typename CollectiveEpilogue::SharedStorage);
83
- using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
84
- static_cast<int>(CEStorageSize)>;
85
-
86
- // clang-format off
87
- using CollectiveMainloop =
88
- typename cutlass::gemm::collective::CollectiveBuilder<
89
- cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
90
- ElementAB, cutlass::layout::RowMajor, 16,
91
- ElementAB, cutlass::layout::ColumnMajor, 16,
92
- ElementAcc, TileShape, ClusterShape,
93
- Stages,
94
- KernelSchedule>::CollectiveOp;
95
- // clang-format on
96
-
97
- using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
98
- cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
99
- cutlass::gemm::PersistentScheduler>>;
100
-
101
- struct GemmKernel : public KernelType {};
102
- };
103
-
104
- template <typename Gemm, typename... EpilogueArgs>
105
- void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
106
- torch::Tensor const& b,
107
- EpilogueArgs&&... epilogue_params) {
108
- using ElementAB = typename Gemm::ElementAB;
109
- using ElementD = typename Gemm::ElementD;
110
-
111
- int32_t m = a.size(0);
112
- int32_t n = b.size(1);
113
- int32_t k = a.size(1);
114
-
115
- int64_t lda = a.stride(0);
116
- int64_t ldb = b.stride(1);
117
- int64_t ldc = out.stride(0);
118
-
119
- using StrideA = Stride<int64_t, Int<1>, int64_t>;
120
- using StrideB = Stride<int64_t, Int<1>, int64_t>;
121
- using StrideC = typename Gemm::StrideC;
122
-
123
- StrideA a_stride{lda, Int<1>{}, 0};
124
- StrideB b_stride{ldb, Int<1>{}, 0};
125
- StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
126
-
127
- using GemmKernel = typename Gemm::GemmKernel;
128
- typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
129
-
130
- auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
131
- auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
132
- typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
133
- b_stride};
134
-
135
- auto c_ptr = static_cast<ElementD*>(out.data_ptr());
136
- typename GemmKernel::EpilogueArguments epilogue_args{
137
- Gemm::Epilogue::prepare_args(
138
- std::forward<EpilogueArgs>(epilogue_params)...),
139
- c_ptr, c_stride, c_ptr, c_stride};
140
-
141
- typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
142
- prob_shape, mainloop_args, epilogue_args};
143
-
144
- // Launch the CUTLASS GEMM kernel.
145
- using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
146
- GemmOp gemm_op;
147
- CUTLASS_CHECK(gemm_op.can_implement(args));
148
-
149
- size_t workspace_size = gemm_op.get_workspace_size(args);
150
- auto const workspace_options =
151
- torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
152
- auto workspace = torch::empty(workspace_size, workspace_options);
153
-
154
- auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
155
-
156
- cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
157
- CUTLASS_CHECK(status);
158
- }
159
-
160
- } // namespace vllm
cutlass_w8a8/scaled_mm_c3x_sm100.cu CHANGED
@@ -1,34 +1,18 @@
1
- #include <cudaTypedefs.h>
2
  #include "c3x/scaled_mm_kernels.hpp"
3
 
4
- #include "cuda_utils.h"
5
-
6
  /*
7
  This file defines quantized GEMM operations using the CUTLASS 3.x API, for
8
  NVIDIA GPUs with sm100 (Blackwell).
9
  */
10
 
11
- #if defined CUDA_VERSION && CUDA_VERSION >= 12800
12
-
13
  void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
14
  torch::Tensor const& b,
15
  torch::Tensor const& a_scales,
16
  torch::Tensor const& b_scales,
17
  std::optional<torch::Tensor> const& bias) {
18
- TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
19
- TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
20
-
21
- int M = a.size(0), N = b.size(1), K = a.size(1);
22
- TORCH_CHECK(
23
- (a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
24
- (b_scales.numel() == 1 || b_scales.numel() == b.size(1)),
25
- "Currently, block scaled fp8 gemm is not implemented for Blackwell");
26
-
27
- // Standard per-tensor/per-token/per-channel scaling
28
- TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
29
- TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn,
30
- "Currently, only fp8 gemm is implemented for Blackwell");
31
- vllm::cutlass_scaled_mm_sm100_fp8(c, a, b, a_scales, b_scales, bias);
32
  }
33
-
34
- #endif
 
1
+ #include "c3x/scaled_mm_helper.hpp"
2
  #include "c3x/scaled_mm_kernels.hpp"
3
 
 
 
4
  /*
5
  This file defines quantized GEMM operations using the CUTLASS 3.x API, for
6
  NVIDIA GPUs with sm100 (Blackwell).
7
  */
8
 
 
 
9
  void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
10
  torch::Tensor const& b,
11
  torch::Tensor const& a_scales,
12
  torch::Tensor const& b_scales,
13
  std::optional<torch::Tensor> const& bias) {
14
+ dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
15
+ vllm::cutlass_scaled_mm_sm100_fp8,
16
+ nullptr, // int8 not supported on SM100
17
+ vllm::cutlass_scaled_mm_blockwise_sm100_fp8);
18
  }
 
 
cutlass_w8a8/scaled_mm_c3x_sm90.cu CHANGED
@@ -1,63 +1,20 @@
1
- #include <cudaTypedefs.h>
2
  #include "c3x/scaled_mm_kernels.hpp"
3
 
4
- #include "cuda_utils.h"
5
-
6
  /*
7
  This file defines quantized GEMM operations using the CUTLASS 3.x API, for
8
  NVIDIA GPUs with sm90a (Hopper).
9
  */
10
 
11
- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
12
-
13
  void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
14
  torch::Tensor const& b,
15
  torch::Tensor const& a_scales,
16
  torch::Tensor const& b_scales,
17
  std::optional<torch::Tensor> const& bias) {
18
- TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
19
- TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
20
-
21
- int M = a.size(0), N = b.size(1), K = a.size(1);
22
-
23
- if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
24
- (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) {
25
- // Standard per-tensor/per-token/per-channel scaling
26
- TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
27
- if (a.dtype() == torch::kFloat8_e4m3fn) {
28
- vllm::cutlass_scaled_mm_sm90_fp8(c, a, b, a_scales, b_scales, bias);
29
- } else {
30
- TORCH_CHECK(a.dtype() == torch::kInt8);
31
- vllm::cutlass_scaled_mm_sm90_int8(c, a, b, a_scales, b_scales, bias);
32
- }
33
- } else {
34
- using GroupShape = std::array<int64_t, 2>;
35
- auto make_group_shape = [](torch::Tensor const& x,
36
- torch::Tensor const& s) -> GroupShape {
37
- TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
38
- return {cuda_utils::ceil_div(x.size(0), s.size(0)),
39
- cuda_utils::ceil_div(x.size(1), s.size(1))};
40
- };
41
-
42
- GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
43
- GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
44
-
45
- // 1x128 per-token group scales for activations
46
- // 128x128 blockwise scales for weights
47
- TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
48
- b_scale_group_shape == GroupShape{128, 128} &&
49
- a.dtype() == torch::kFloat8_e4m3fn &&
50
- b.dtype() == torch::kFloat8_e4m3fn),
51
- "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
52
- "a_scale_group_shape must be [1, 128]. Got: [",
53
- a_scale_group_shape[0], ", ", a_scale_group_shape[1],
54
- "]\n"
55
- "b_scale_group_shape must be [128, 128]. Got: [",
56
- b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
57
- TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
58
-
59
- vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
60
- }
61
  }
62
 
63
  void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
@@ -73,5 +30,3 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
73
  vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,
74
  azp, bias);
75
  }
76
-
77
- #endif
 
1
+ #include "c3x/scaled_mm_helper.hpp"
2
  #include "c3x/scaled_mm_kernels.hpp"
3
 
 
 
4
  /*
5
  This file defines quantized GEMM operations using the CUTLASS 3.x API, for
6
  NVIDIA GPUs with sm90a (Hopper).
7
  */
8
 
 
 
9
  void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
10
  torch::Tensor const& b,
11
  torch::Tensor const& a_scales,
12
  torch::Tensor const& b_scales,
13
  std::optional<torch::Tensor> const& bias) {
14
+ dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
15
+ vllm::cutlass_scaled_mm_sm90_fp8,
16
+ vllm::cutlass_scaled_mm_sm90_int8,
17
+ vllm::cutlass_scaled_mm_blockwise_sm90_fp8);
18
  }
19
 
20
  void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
 
30
  vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,
31
  azp, bias);
32
  }
 
 
cutlass_w8a8/scaled_mm_c3x_sm90_fp8_dispatch.cuh DELETED
@@ -1,96 +0,0 @@
1
- #pragma once
2
-
3
- #include "scaled_mm_c3x.cuh"
4
-
5
- /**
6
- * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
7
- * shape.
8
- */
9
-
10
- namespace vllm {
11
-
12
- template <typename InType, typename OutType,
13
- template <typename, typename, typename> typename Epilogue>
14
- struct sm90_fp8_config_default {
15
- // M in (128, inf)
16
- static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
17
- using KernelSchedule =
18
- cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
19
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
20
- using TileShape = Shape<_128, _128, _128>;
21
- using ClusterShape = Shape<_2, _1, _1>;
22
- using Cutlass3xGemm =
23
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
24
- KernelSchedule, EpilogueSchedule>;
25
- };
26
-
27
- template <typename InType, typename OutType,
28
- template <typename, typename, typename> typename Epilogue>
29
- struct sm90_fp8_config_M128 {
30
- // M in (64, 128]
31
- static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
32
- using KernelSchedule =
33
- cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
34
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
35
- using TileShape = Shape<_64, _128, _128>;
36
- using ClusterShape = Shape<_2, _1, _1>;
37
- using Cutlass3xGemm =
38
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
39
- KernelSchedule, EpilogueSchedule>;
40
- };
41
-
42
- template <typename InType, typename OutType,
43
- template <typename, typename, typename> typename Epilogue>
44
- struct sm90_fp8_config_M64 {
45
- // M in [1, 64]
46
- static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
47
- using KernelSchedule =
48
- cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
49
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
50
- using TileShape = Shape<_64, _64, _128>;
51
- using ClusterShape = Shape<_1, _8, _1>;
52
-
53
- using Cutlass3xGemm =
54
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
55
- KernelSchedule, EpilogueSchedule>;
56
- };
57
-
58
- template <typename InType, typename OutType,
59
- template <typename, typename, typename> typename Epilogue,
60
- typename... EpilogueArgs>
61
- inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
62
- torch::Tensor const& a,
63
- torch::Tensor const& b,
64
- EpilogueArgs&&... args) {
65
- static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
66
- TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
67
- TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
68
-
69
- using Cutlass3xGemmDefault =
70
- typename sm90_fp8_config_default<InType, OutType,
71
- Epilogue>::Cutlass3xGemm;
72
- using Cutlass3xGemmM64 =
73
- typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
74
- using Cutlass3xGemmM128 =
75
- typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
76
-
77
- uint32_t const m = a.size(0);
78
- uint32_t const mp2 =
79
- std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2
80
-
81
- if (mp2 <= 64) {
82
- // m in [1, 64]
83
- return cutlass_gemm_caller<Cutlass3xGemmM64>(
84
- out, a, b, std::forward<EpilogueArgs>(args)...);
85
- } else if (mp2 <= 128) {
86
- // m in (64, 128]
87
- return cutlass_gemm_caller<Cutlass3xGemmM128>(
88
- out, a, b, std::forward<EpilogueArgs>(args)...);
89
- } else {
90
- // m in (128, inf)
91
- return cutlass_gemm_caller<Cutlass3xGemmDefault>(
92
- out, a, b, std::forward<EpilogueArgs>(args)...);
93
- }
94
- }
95
-
96
- } // namespace vllm
cutlass_w8a8/scaled_mm_c3x_sm90_int8_dispatch.cuh DELETED
@@ -1,140 +0,0 @@
1
- #pragma once
2
-
3
- #include "scaled_mm_c3x.cuh"
4
-
5
- /**
6
- * This file defines Gemm kernel configurations for SM90 (int8) based on the
7
- * Gemm shape.
8
- */
9
-
10
- namespace vllm {
11
-
12
- template <typename InType, typename OutType,
13
- template <typename, typename, typename> typename Epilogue>
14
- struct sm90_int8_config_default {
15
- // For M > 128 and any N
16
- static_assert(std::is_same<InType, int8_t>());
17
- using KernelSchedule =
18
- typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
19
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
20
- using TileShape = Shape<_128, _128, _128>;
21
- using ClusterShape = Shape<_2, _1, _1>;
22
- using Cutlass3xGemm =
23
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
24
- KernelSchedule, EpilogueSchedule>;
25
- };
26
-
27
- template <typename InType, typename OutType,
28
- template <typename, typename, typename> typename Epilogue>
29
- struct sm90_int8_config_M128 {
30
- // For M in (64, 128] and any N
31
- static_assert(std::is_same<InType, int8_t>());
32
- using KernelSchedule =
33
- typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
34
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
35
- using TileShape = Shape<_64, _128, _128>;
36
- using ClusterShape = Shape<_2, _1, _1>;
37
- using Cutlass3xGemm =
38
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
39
- KernelSchedule, EpilogueSchedule>;
40
- };
41
-
42
- template <typename InType, typename OutType,
43
- template <typename, typename, typename> typename Epilogue>
44
- struct sm90_int8_config_M64 {
45
- // For M in (32, 64] and any N
46
- static_assert(std::is_same<InType, int8_t>());
47
- using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
48
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
49
- using TileShape = Shape<_64, _64, _256>;
50
- using ClusterShape = Shape<_1, _1, _1>;
51
- using Cutlass3xGemm =
52
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
53
- KernelSchedule, EpilogueSchedule>;
54
- };
55
-
56
- template <typename InType, typename OutType,
57
- template <typename, typename, typename> typename Epilogue>
58
- struct sm90_int8_config_M32_NBig {
59
- // For M in [1, 32] and N >= 8192
60
- static_assert(std::is_same<InType, int8_t>());
61
- using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
62
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
63
- using TileShape = Shape<_64, _128, _256>;
64
- using ClusterShape = Shape<_1, _4, _1>;
65
- using Cutlass3xGemm =
66
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
67
- KernelSchedule, EpilogueSchedule>;
68
- };
69
-
70
- template <typename InType, typename OutType,
71
- template <typename, typename, typename> typename Epilogue>
72
- struct sm90_int8_config_M32_NSmall {
73
- // For M in [1, 32] and N < 8192
74
- static_assert(std::is_same<InType, int8_t>());
75
- using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
76
- using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
77
- using TileShape = Shape<_64, _64, _256>;
78
- using ClusterShape = Shape<_1, _8, _1>;
79
- using Cutlass3xGemm =
80
- cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
81
- KernelSchedule, EpilogueSchedule>;
82
- };
83
-
84
- template <typename InType, typename OutType,
85
- template <typename, typename, typename> typename Epilogue,
86
- typename... EpilogueArgs>
87
- inline void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out,
88
- torch::Tensor const& a,
89
- torch::Tensor const& b,
90
- EpilogueArgs&&... args) {
91
- static_assert(std::is_same<InType, int8_t>());
92
- TORCH_CHECK(a.dtype() == torch::kInt8);
93
- TORCH_CHECK(b.dtype() == torch::kInt8);
94
-
95
- using Cutlass3xGemmDefault =
96
- typename sm90_int8_config_default<InType, OutType,
97
- Epilogue>::Cutlass3xGemm;
98
- using Cutlass3xGemmM128 =
99
- typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
100
- using Cutlass3xGemmM64 =
101
- typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
102
- using Cutlass3xGemmM32NBig =
103
- typename sm90_int8_config_M32_NBig<InType, OutType,
104
- Epilogue>::Cutlass3xGemm;
105
- using Cutlass3xGemmM32NSmall =
106
- typename sm90_int8_config_M32_NSmall<InType, OutType,
107
- Epilogue>::Cutlass3xGemm;
108
-
109
- uint32_t const n = out.size(1);
110
- bool const is_small_n = n < 8192;
111
-
112
- uint32_t const m = a.size(0);
113
- uint32_t const mp2 =
114
- std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2
115
-
116
- if (mp2 <= 32) {
117
- // m in [1, 32]
118
- if (is_small_n) {
119
- return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
120
- out, a, b, std::forward<EpilogueArgs>(args)...);
121
- } else {
122
- return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
123
- out, a, b, std::forward<EpilogueArgs>(args)...);
124
- }
125
- } else if (mp2 <= 64) {
126
- // m in (32, 64]
127
- return cutlass_gemm_caller<Cutlass3xGemmM64>(
128
- out, a, b, std::forward<EpilogueArgs>(args)...);
129
- } else if (mp2 <= 128) {
130
- // m in (64, 128]
131
- return cutlass_gemm_caller<Cutlass3xGemmM128>(
132
- out, a, b, std::forward<EpilogueArgs>(args)...);
133
- } else {
134
- // m in (128, inf)
135
- return cutlass_gemm_caller<Cutlass3xGemmDefault>(
136
- out, a, b, std::forward<EpilogueArgs>(args)...);
137
- }
138
- }
139
-
140
- } // namespace vllm
cutlass_w8a8/scaled_mm_entry.cu CHANGED
@@ -1,3 +1,5 @@
 
 
1
  #include <cudaTypedefs.h>
2
 
3
  #include <c10/cuda/CUDAGuard.h>
@@ -23,7 +25,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
23
  torch::Tensor const& b_scales,
24
  std::optional<torch::Tensor> const& bias);
25
 
26
- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
27
  void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
28
  torch::Tensor const& b,
29
  torch::Tensor const& a_scales,
@@ -31,6 +33,14 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
31
  std::optional<torch::Tensor> const& bias);
32
  #endif
33
 
 
 
 
 
 
 
 
 
34
  void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
35
  torch::Tensor const& b,
36
  torch::Tensor const& a_scales,
@@ -55,7 +65,7 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
55
  std::optional<torch::Tensor> const& azp,
56
  std::optional<torch::Tensor> const& bias);
57
 
58
- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
59
  void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
60
  torch::Tensor const& b,
61
  torch::Tensor const& a_scales,
@@ -81,6 +91,34 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
81
  return false;
82
  }
83
 
84
  void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
85
  torch::Tensor const& b, torch::Tensor const& a_scales,
86
  torch::Tensor const& b_scales,
@@ -89,15 +127,12 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
89
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
90
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
91
  b.size(1) == c.size(1));
92
- TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
93
- TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
94
 
95
  // Check for strides and alignment
96
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major
97
  TORCH_CHECK(b.stride(0) == 1); // Column-major
98
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
99
  b.stride(1) % 16 == 0); // 16 Byte Alignment
100
- TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
101
 
102
  if (bias) {
103
  TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
@@ -106,15 +141,22 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
106
 
107
  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
108
  int32_t version_num = get_sm_version_num();
109
- // Hopper
 
 
 
 
 
 
110
 
111
  // Guard against compilation issues for sm90 kernels
112
- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
113
- if (version_num >= 90) {
 
114
  cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
115
  return;
116
  }
117
- #endif
118
 
119
  if (version_num == 89) {
120
  // Ada Lovelace
@@ -138,7 +180,7 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
138
  false,
139
  "No compiled cutlass_scaled_mm for a compute capability less than "
140
  "CUDA device capability: ",
141
- version_num);
142
  }
143
 
144
  void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
@@ -182,12 +224,12 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
182
 
183
  int32_t version_num = get_sm_version_num();
184
 
185
- #if defined CUDA_VERSION && CUDA_VERSION >= 12000
186
  if (version_num >= 90) {
187
  cutlass_scaled_mm_azp_sm90(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
188
  return;
189
  }
190
- #endif
191
 
192
  if (version_num == 89) {
193
  // Ada Lovelace
@@ -210,5 +252,5 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
210
  false,
211
  "No compiled cutlass_scaled_mm_azp for a compute capability less than "
212
  "CUDA device capability: ",
213
- version_num);
214
  }
 
1
+ #include <string>
2
+
3
  #include <cudaTypedefs.h>
4
 
5
  #include <c10/cuda/CUDAGuard.h>
 
25
  torch::Tensor const& b_scales,
26
  std::optional<torch::Tensor> const& bias);
27
 
28
+ #if __CUDACC_VER_MAJOR__ >= 12
29
  void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
30
  torch::Tensor const& b,
31
  torch::Tensor const& a_scales,
 
33
  std::optional<torch::Tensor> const& bias);
34
  #endif
35
 
36
+ #if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)
37
+ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
38
+ torch::Tensor const& b,
39
+ torch::Tensor const& a_scales,
40
+ torch::Tensor const& b_scales,
41
+ std::optional<torch::Tensor> const& bias);
42
+ #endif
43
+
44
  void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
45
  torch::Tensor const& b,
46
  torch::Tensor const& a_scales,
 
65
  std::optional<torch::Tensor> const& azp,
66
  std::optional<torch::Tensor> const& bias);
67
 
68
+ #if __CUDACC_VER_MAJOR__ >= 12
69
  void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
70
  torch::Tensor const& b,
71
  torch::Tensor const& a_scales,
 
91
  return false;
92
  }
93
 
94
+ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
95
+ // CUTLASS block-quantized FP8 kernels need at least CUDA 12.0
96
+ // and at least SM90 (Hopper)
97
+
98
+ #if defined CUDA_VERSION
99
+ if (cuda_device_capability >= 90 && cuda_device_capability < 100) {
100
+ return CUDA_VERSION >= 12000;
101
+ } else if (cuda_device_capability >= 100) {
102
+ return CUDA_VERSION >= 12080;
103
+ }
104
+ #endif
105
+
106
+ return false;
107
+ }
108
+
109
+ bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
110
+ // CUTLASS grouped FP8 kernels need at least CUDA 12.3
111
+ // and SM90 (Hopper)
112
+
113
+ #if defined CUDA_VERSION
114
+ if (cuda_device_capability == 90) {
115
+ return CUDA_VERSION >= 12030;
116
+ }
117
+ #endif
118
+
119
+ return false;
120
+ }
121
+
122
  void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
123
  torch::Tensor const& b, torch::Tensor const& a_scales,
124
  torch::Tensor const& b_scales,
 
127
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
128
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
129
  b.size(1) == c.size(1));
 
 
130
 
131
  // Check for strides and alignment
132
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major
133
  TORCH_CHECK(b.stride(0) == 1); // Column-major
134
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
135
  b.stride(1) % 16 == 0); // 16 Byte Alignment
 
136
 
137
  if (bias) {
138
  TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
 
141
 
142
  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
143
  int32_t version_num = get_sm_version_num();
144
+
145
+ #if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)
146
+ if (version_num >= 100) {
147
+ cutlass_scaled_mm_sm100(c, a, b, a_scales, b_scales, bias);
148
+ return;
149
+ }
150
+ #endif
151
 
152
  // Guard against compilation issues for sm90 kernels
153
+ #if __CUDACC_VER_MAJOR__ >= 12
154
+ if (version_num >= 90 && version_num < 100) {
155
+ // Hopper
156
  cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
157
  return;
158
  }
159
+ #endif
160
 
161
  if (version_num == 89) {
162
  // Ada Lovelace
 
180
  false,
181
  "No compiled cutlass_scaled_mm for a compute capability less than "
182
  "CUDA device capability: ",
183
+ std::to_string(version_num));
184
  }
185
 
186
  void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
 
224
 
225
  int32_t version_num = get_sm_version_num();
226
 
227
+ #if __CUDACC_VER_MAJOR__ >= 12
228
  if (version_num >= 90) {
229
  cutlass_scaled_mm_azp_sm90(c, a, b, a_scales, b_scales, azp_adj, azp, bias);
230
  return;
231
  }
232
+ #endif
233
 
234
  if (version_num == 89) {
235
  // Ada Lovelace
 
252
  false,
253
  "No compiled cutlass_scaled_mm_azp for a compute capability less than "
254
  "CUDA device capability: ",
255
+ std::to_string(version_num));
256
  }
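The new capability helpers encode a small version table: block-scaled FP8 needs SM90 with CUDA >= 12.0 or SM100 with CUDA >= 12.8, and grouped FP8 GEMM needs SM90 with CUDA >= 12.3. Below is a standalone sketch of the block-FP8 table with the toolkit version made an explicit parameter, convenient for unit tests; the function name is illustrative only.

#include <cstdint>
#include <cstdio>

bool supports_block_fp8(int64_t capability, long cuda_version) {
  if (capability >= 90 && capability < 100) return cuda_version >= 12000;  // Hopper
  if (capability >= 100) return cuda_version >= 12080;                     // Blackwell
  return false;
}

int main() {
  std::printf("sm90  + CUDA 12.0: %d\n", supports_block_fp8(90, 12000));   // 1
  std::printf("sm100 + CUDA 12.4: %d\n", supports_block_fp8(100, 12040));  // 0
  std::printf("sm100 + CUDA 12.8: %d\n", supports_block_fp8(100, 12080));  // 1
}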
dispatch_utils.h CHANGED
@@ -6,6 +6,11 @@
6
 
7
  #include <torch/all.h>
8
 
 
 
 
 
 
9
  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
10
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
11
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
@@ -14,6 +19,35 @@
14
  #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
15
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
16
 
 
17
  #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \
18
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
19
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
@@ -31,5 +65,19 @@
31
  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
32
  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
33
 
34
  #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
35
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
 
 
 
 
 
6
 
7
  #include <torch/all.h>
8
 
9
+ // Need a special dispatch case macro since we will nest the FP8 dispatch.
10
+ // Instead of the usual 'scalar_t', this names the dispatched type 'fp8_t'.
11
+ #define AT_DISPATCH_FP8_CASE(enum_type, ...) \
12
+ AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, fp8_t, __VA_ARGS__)
13
+
14
  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
15
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
16
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
 
19
  #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
20
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
21
 
22
+ // ROCm devices might use either fn or fnuz, so set up dispatch table for both.
23
+ // A host-based check at runtime will create a preferred FP8 type for ROCm
24
+ // such that the correct kernel is dispatched.
25
+ #ifdef USE_ROCM
26
+ #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \
27
+ AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
28
+ AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__)
29
+
30
+ #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \
31
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
32
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \
33
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
34
+ #else
35
+ #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \
36
+ AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
37
+
38
+ #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \
39
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
40
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
41
+ #endif
42
+
43
+ // When using this dispatch macro, the type is 'fp8_t' not 'scalar_t'.
44
+ // See AT_DISPATCH_FP8_CASE above.
45
+ #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
46
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))
47
+
48
+ #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
49
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))
50
+
51
  #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \
52
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
53
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
 
65
  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
66
  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
67
 
68
+ #define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
69
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
70
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
71
+ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
72
+ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
73
+ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
74
+ AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__) \
75
+ AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__) \
76
+ AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
77
+
78
  #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
79
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
80
+
81
+ #define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
82
+ AT_DISPATCH_SWITCH( \
83
+ TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
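Usage note (not part of the commit): the FP8 dispatch is designed to be nested inside the regular floating-point dispatch, which is how fp8/common.cu below uses it; the outer lambda binds scalar_t and the inner one binds fp8_t. A hedged sketch, with launch_quant_kernel as a hypothetical templated launcher:

#include <torch/all.h>
// Assumes this dispatch_utils.h header is included.

// Hypothetical launcher used only for illustration.
template <typename scalar_t, typename fp8_t>
void launch_quant_kernel(fp8_t* out, scalar_t const* in, int64_t n);

void quant_dispatch_example(torch::Tensor& out, torch::Tensor const& input) {
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "quant_example_scalar_type", [&] {
        // scalar_t is Float, Half, or BFloat16 here.
        VLLM_DISPATCH_FP8_TYPES(
            out.scalar_type(), "quant_example_fp8_type", [&] {
              // fp8_t is Float8_e4m3fn (and, on ROCm, also Float8_e4m3fnuz).
              launch_quant_kernel<scalar_t, fp8_t>(
                  out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
                  input.numel());
            });
      });
}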
flake.lock CHANGED
@@ -1,6 +1,21 @@
1
  {
2
  "nodes": {
3
  "flake-compat": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  "locked": {
5
  "lastModified": 1733328505,
6
  "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
@@ -33,61 +48,82 @@
33
  "type": "github"
34
  }
35
  },
36
- "kernel-builder": {
37
  "inputs": {
38
- "flake-compat": "flake-compat",
39
- "flake-utils": "flake-utils",
40
- "nixpkgs": "nixpkgs",
41
- "rocm-nix": "rocm-nix"
42
  },
43
  "locked": {
44
- "lastModified": 1744736115,
45
- "narHash": "sha256-9PPp6XHoMx9jZjwCP7XvAlc52+TmmVuCbUqwh3snuI8=",
46
- "owner": "huggingface",
47
- "repo": "kernel-builder",
48
- "rev": "319af881b27c3645dfc33128f99092c7c1176281",
49
  "type": "github"
50
  },
51
  "original": {
52
- "owner": "huggingface",
53
- "repo": "kernel-builder",
54
  "type": "github"
55
  }
56
  },
57
- "nixpkgs": {
 
 
 
 
 
58
  "locked": {
59
- "lastModified": 1743559129,
60
- "narHash": "sha256-7gpAWsENV3tY2HmeHYQ2MoQxGpys+jQWnkS/BHAMXVk=",
61
- "owner": "nixos",
62
- "repo": "nixpkgs",
63
- "rev": "adae22bea8bcc0aa2fd6e8732044660fb7755f5e",
64
  "type": "github"
65
  },
66
  "original": {
67
- "owner": "nixos",
68
- "ref": "nixos-unstable-small",
69
- "repo": "nixpkgs",
70
  "type": "github"
71
  }
72
  },
73
- "rocm-nix": {
74
  "inputs": {
 
 
 
75
  "nixpkgs": [
76
  "kernel-builder",
 
77
  "nixpkgs"
78
  ]
79
  },
80
  "locked": {
81
- "lastModified": 1743085847,
82
- "narHash": "sha256-uWG29p+nhZmGRV1LffWwRGjwtPIXeu1F0YTQbXgB+GU=",
83
  "owner": "huggingface",
84
- "repo": "rocm-nix",
85
- "rev": "245cdc9bfb4bfafa818711c5f5e0b889afe1ba39",
86
  "type": "github"
87
  },
88
  "original": {
89
  "owner": "huggingface",
90
- "repo": "rocm-nix",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  "type": "github"
92
  }
93
  },
@@ -110,6 +146,21 @@
110
  "repo": "default",
111
  "type": "github"
112
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  }
114
  },
115
  "root": "root",
 
1
  {
2
  "nodes": {
3
  "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1747046372,
6
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-compat_2": {
19
  "locked": {
20
  "lastModified": 1733328505,
21
  "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
 
48
  "type": "github"
49
  }
50
  },
51
+ "flake-utils_2": {
52
  "inputs": {
53
+ "systems": "systems_2"
 
 
 
54
  },
55
  "locked": {
56
+ "lastModified": 1731533236,
57
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
58
+ "owner": "numtide",
59
+ "repo": "flake-utils",
60
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
61
  "type": "github"
62
  },
63
  "original": {
64
+ "owner": "numtide",
65
+ "repo": "flake-utils",
66
  "type": "github"
67
  }
68
  },
69
+ "hf-nix": {
70
+ "inputs": {
71
+ "flake-compat": "flake-compat_2",
72
+ "flake-utils": "flake-utils_2",
73
+ "nixpkgs": "nixpkgs"
74
+ },
75
  "locked": {
76
+ "lastModified": 1750234878,
77
+ "narHash": "sha256-q9DRC9zdpzUf88qqg1qbhP1qgJbE2cMtn8oUmosuyT8=",
78
+ "owner": "huggingface",
79
+ "repo": "hf-nix",
80
+ "rev": "c7132f90763d756da3e77da62e01be0a4546dc57",
81
  "type": "github"
82
  },
83
  "original": {
84
+ "owner": "huggingface",
85
+ "repo": "hf-nix",
 
86
  "type": "github"
87
  }
88
  },
89
+ "kernel-builder": {
90
  "inputs": {
91
+ "flake-compat": "flake-compat",
92
+ "flake-utils": "flake-utils",
93
+ "hf-nix": "hf-nix",
94
  "nixpkgs": [
95
  "kernel-builder",
96
+ "hf-nix",
97
  "nixpkgs"
98
  ]
99
  },
100
  "locked": {
101
+ "lastModified": 1751014803,
102
+ "narHash": "sha256-9Xfq2k3uPfB602NwQF+zAY2GQZiKUN1G7Q6XiDCUR8Y=",
103
  "owner": "huggingface",
104
+ "repo": "kernel-builder",
105
+ "rev": "bbc4e712ff2046e217818e97de2201e2b996756e",
106
  "type": "github"
107
  },
108
  "original": {
109
  "owner": "huggingface",
110
+ "repo": "kernel-builder",
111
+ "type": "github"
112
+ }
113
+ },
114
+ "nixpkgs": {
115
+ "locked": {
116
+ "lastModified": 1747820358,
117
+ "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
118
+ "owner": "danieldk",
119
+ "repo": "nixpkgs",
120
+ "rev": "d3c1681180717528068082103bf323147de6ab0b",
121
+ "type": "github"
122
+ },
123
+ "original": {
124
+ "owner": "danieldk",
125
+ "ref": "cudatoolkit-12.9-kernel-builder",
126
+ "repo": "nixpkgs",
127
  "type": "github"
128
  }
129
  },
 
146
  "repo": "default",
147
  "type": "github"
148
  }
149
+ },
150
+ "systems_2": {
151
+ "locked": {
152
+ "lastModified": 1681028828,
153
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
154
+ "owner": "nix-systems",
155
+ "repo": "default",
156
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
157
+ "type": "github"
158
+ },
159
+ "original": {
160
+ "owner": "nix-systems",
161
+ "repo": "default",
162
+ "type": "github"
163
+ }
164
  }
165
  },
166
  "root": "root",
fp8/amd/hip_float8.h DELETED
@@ -1,137 +0,0 @@
1
- #pragma once
2
-
3
- #ifdef __HIPCC__
4
- #include <hip/hip_runtime.h>
5
- #else
6
- #include <type_traits>
7
- #include <stdint.h>
8
- #include <math.h>
9
- #include <iostream>
10
- #endif
11
-
12
- #include "hip_float8_impl.h"
13
-
14
- struct alignas(1) hip_fp8 {
15
- struct from_bits_t {};
16
- HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() {
17
- return from_bits_t();
18
- }
19
- uint8_t data;
20
-
21
- hip_fp8() = default;
22
- HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default;
23
- HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete;
24
- explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t)
25
- : data(v) {}
26
-
27
- #ifdef __HIP__MI300__
28
- // NOTE: ON-DEVICE... always optimal bias
29
- explicit HIP_FP8_DEVICE hip_fp8(float v)
30
- : data(hip_fp8_impl::to_fp8_from_fp32(v)) {}
31
-
32
- explicit HIP_FP8_DEVICE hip_fp8(_Float16 v)
33
- : hip_fp8(static_cast<float>(v)) {}
34
-
35
- // Host only implementation using s/w simulation
36
- explicit HIP_FP8_HOST
37
- #else // __HIP__MI300__
38
- // both Host and DEVICE for non-MI300 using s/w simulation
39
- explicit HIP_FP8_HOST_DEVICE
40
- #endif // __HIP__MI300__
41
- hip_fp8(float v) {
42
- data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/,
43
- true /*clip*/>(v);
44
- }
45
-
46
- explicit HIP_FP8_HOST_DEVICE hip_fp8(double v)
47
- : hip_fp8(static_cast<float>(v)) {}
48
-
49
- #ifdef __HIP__MI300__
50
- // upcast using device specific intrinsic
51
- explicit inline HIP_FP8_DEVICE operator float() const {
52
- float fval;
53
- uint32_t i32val = static_cast<uint32_t>(data);
54
-
55
- // upcast
56
- asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0"
57
- : "=v"(fval)
58
- : "v"(i32val));
59
-
60
- return fval;
61
- }
62
-
63
- explicit inline HIP_FP8_HOST operator float() const
64
- #else // __HIP__MI300__
65
- explicit inline HIP_FP8_HOST_DEVICE operator float() const
66
- #endif // __HIP__MI300__
67
- {
68
- return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(
69
- data);
70
- }
71
- };
72
-
73
- namespace std {
74
- inline hip_fp8 sin(hip_fp8 a) { return hip_fp8(sinf(float(a))); }
75
- inline hip_fp8 cos(hip_fp8 a) { return hip_fp8(cosf(float(a))); }
76
- HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) { return a; }
77
- } // namespace std
78
-
79
- // Special operator overloading
80
- inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) {
81
- return os << float(f8);
82
- }
83
-
84
- // all + operator overloading with mixed types
85
- // mixed types, always converts to f32, does computation in f32, and returns
86
- // float
87
- inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) {
88
- return (fa + float(b));
89
- }
90
-
91
- inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) {
92
- return (float(a) + fb);
93
- }
94
-
95
- inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) {
96
- return hip_fp8(float(a) + float(b));
97
- }
98
-
99
- inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) {
100
- return a = hip_fp8(float(a) + float(b));
101
- }
102
-
103
- // overloading multiplication, always returns float,
104
- inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) {
105
- return float(a) * float(b);
106
- }
107
-
108
- inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) {
109
- return (a * float(b));
110
- }
111
-
112
- inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) {
113
- return (float(a) * b);
114
- }
115
-
116
- inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) {
117
- return ((float)a * float(b));
118
- }
119
-
120
- inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) {
121
- return ((float)a * float(b));
122
- }
123
-
124
- // overloading for compare
125
- inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) {
126
- return (a.data == b.data);
127
- }
128
- inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) {
129
- return (a.data != b.data);
130
- }
131
-
132
- inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) {
133
- return static_cast<float>(a) >= static_cast<float>(b);
134
- }
135
- inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) {
136
- return static_cast<float>(a) > static_cast<float>(b);
137
- }
 
 
fp8/amd/hip_float8_impl.h DELETED
@@ -1,316 +0,0 @@
1
- #pragma once
2
-
3
- #if defined(__HIPCC__) && \
4
- (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
5
- #define __HIP__MI300__
6
- #endif
7
-
8
- #ifdef __HIPCC__
9
- #define HIP_FP8_HOST_DEVICE __host__ __device__
10
- #define HIP_FP8_HOST __host__
11
- #define HIP_FP8_DEVICE __device__
12
- #else
13
- #define HIP_FP8_HOST_DEVICE
14
- #define HIP_FP8_HOST
15
- #define HIP_FP8_DEVICE
16
- #endif
17
-
18
- namespace hip_fp8_impl {
19
-
20
- #ifdef __HIP__MI300__
21
- HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) {
22
- uint8_t i8data;
23
- union {
24
- float fval;
25
- uint32_t i32val;
26
- uint8_t i8val[4]; // NOTE: not endian independent
27
- } val;
28
-
29
- uint32_t ival = 0;
30
- val.fval = v;
31
-
32
- if ((val.i32val & 0x7F800000) !=
33
- 0x7F800000) { /// propagate NAN/INF, no clipping
34
- val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
35
- }
36
-
37
- ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival,
38
- false); // false -> WORD0
39
- val.i32val = ival;
40
- i8data = val.i8val[0];
41
-
42
- return i8data;
43
- }
44
- #endif // __HIP__MI300__
45
-
46
- HIP_FP8_HOST inline int clz(uint32_t x) { return __builtin_clz(x); }
47
- #if defined(__HIPCC__) || defined(__CUDA_ARCH__)
48
- HIP_FP8_DEVICE inline int clz(uint32_t x) { return __clz(x); }
49
- #endif
50
-
51
- template <int we, int wm, typename T, bool negative_zero_nan, bool clip>
52
- HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false,
53
- uint32_t rng = 0) {
54
- #ifdef __HIPCC__
55
- constexpr bool is_half = std::is_same<T, _Float16>::value;
56
- #else
57
- constexpr bool is_half = false;
58
- #endif
59
- constexpr bool is_float = std::is_same<T, float>::value;
60
- static_assert(wm + we == 7, "wm+we==7");
61
- static_assert(is_half || is_float, "Only half and float can be cast to f8");
62
-
63
- const int mfmt = (sizeof(T) == 4) ? 23 : 10;
64
- uint32_t x;
65
- if (sizeof(T) == 4) {
66
- x = reinterpret_cast<uint32_t&>(_x);
67
- } else {
68
- x = reinterpret_cast<uint16_t&>(_x);
69
- }
70
-
71
- uint32_t head, mantissa;
72
- int exponent, bias;
73
- uint32_t sign;
74
-
75
- if (sizeof(T) == 4) {
76
- head = x & 0xFF800000;
77
- mantissa = x & 0x7FFFFF;
78
- exponent = (head >> 23) & 0xFF;
79
- sign = head >> 31;
80
- bias = 127;
81
- } else {
82
- head = x & 0xFC00;
83
- mantissa = x & 0x3FF;
84
- exponent = (head >> 10) & 0x1F;
85
- sign = head >> 15;
86
- bias = 15;
87
- }
88
-
89
- uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
90
-
91
- // Deal with inf and NaNs
92
- if (negative_zero_nan) {
93
- if (sizeof(T) == 4) {
94
- if ((x & 0x7F800000) == 0x7F800000) {
95
- return 0x80;
96
- }
97
- } else {
98
- // if(__hisinf(x) || __hisnan(x))
99
- if ((x & 0x7C00) == 0x7C00) {
100
- return 0x80;
101
- }
102
- }
103
- } else {
104
- if (sizeof(T) == 4) {
105
- if ((x & 0x7F800000) == 0x7F800000) {
106
- return signed_inf + (mantissa != 0 ? 1 : 0);
107
- }
108
- } else {
109
- if ((x & 0x7C00) == 0x7C00) {
110
- return signed_inf + (mantissa != 0 ? 1 : 0);
111
- }
112
- }
113
- }
114
- if (x == 0) {
115
- return 0;
116
- }
117
-
118
- // First need to check if it is normal or denorm as there is a difference of
119
- // implicit 1 Then need to adjust the exponent to align with the F8 exponent,
120
- // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng
121
- // to mantissa and truncate. And for RNE, no need to add rng. Then probably
122
- // need to check whether there is carry and adjust exponent and mantissa again
123
-
124
- // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
125
- // bits
126
- const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
127
- const int f8_denormal_act_exponent =
128
- 1 - f8_bias; // actual exponent of f8 denormal
129
- // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
130
- // f8_exponent is the converted f8 exponent with bias encoding
131
- // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
132
- // the difference needs to be adjusted and mantissa shifted
133
- int act_exponent, f8_exponent, exponent_diff;
134
-
135
- if (exponent == 0) { // fp32/fp16 is in denormal.
136
- /* fp32 denormal is below 2^-127 so it is usually not a concern here, we
137
- mostly concern fp16 here. In this case, f8 is usually in denormal. But there
138
- could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
139
- exponent bias 16. It means that there are some numbers in fp16 denormal but they
140
- are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
141
- where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
142
- (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1 */
143
- act_exponent = exponent - bias + 1;
144
- exponent_diff =
145
- f8_denormal_act_exponent -
146
- act_exponent; // actual exponent is exponent-bias+1 as it is denormal
147
- } else { // fp32/fp16 is normal with implicit 1
148
- act_exponent = exponent - bias;
149
- if (act_exponent <= f8_denormal_act_exponent) {
150
- /* This is the case where fp32/fp16 is normal but it is in f8 denormal
151
- range. For example fp8 nanoo mode, denormal exponent is -7, but if the
152
- fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1,
153
- Therefore it needs to be adjust to -6 and mantissa shift right by 1.
154
- So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
155
- exponent_diff = f8_denormal_act_exponent - act_exponent;
156
- } else { // both fp32/fp16 and f8 are in normal range
157
- exponent_diff = 0; // exponent_diff=0 does not mean there is no
158
- // difference for this case, act_exponent could be
159
- // larger. Just that it does not need shift mantissa
160
- }
161
- mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
162
- }
163
-
164
- bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) ==
165
- static_cast<uint32_t>(1 << (mfmt - wm + exponent_diff - 1));
166
- /* This part is a bit tricky. The judgment of whether it is a tie needs to be
167
- done before we shift right as shift right could rip off some residual part
168
- and make something not midpoint look like midpoint. For example, the fp16
169
- number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after
170
- shift right by 4 bits, it would look like midpoint.
171
- */
172
-
173
- if (exponent_diff > 0) {
174
- mantissa >>= exponent_diff;
175
- } else if (exponent_diff == -1) {
176
- mantissa <<= -exponent_diff;
177
- }
178
- bool implicit_one = mantissa & (1 << mfmt);
179
- // if there is no implicit 1, it means the f8 is denormal and need to adjust
180
- // to denorm exponent
181
- f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ +
182
- f8_bias - (implicit_one ? 0 : 1);
183
-
184
- // Now we have the exponent and mantissa adjusted
185
- uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
186
- bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit
187
- // that is not truncated is 1
188
- mantissa +=
189
- (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) &
190
- drop_mask;
191
-
192
- // Now we deal with overflow
193
- if (f8_exponent == 0) {
194
- if ((1 << mfmt) & mantissa) {
195
- f8_exponent = 1; // denormal overflow to become normal, promote exponent
196
- }
197
- } else {
198
- if ((1 << (mfmt + 1)) & mantissa) {
199
- mantissa >>= 1;
200
- f8_exponent++;
201
- }
202
- }
203
-
204
- mantissa >>= (mfmt - wm);
205
-
206
- // above range: quantize to maximum possible float of the same sign
207
- const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
208
- if (f8_exponent > max_exp) {
209
- if (clip) {
210
- mantissa = (1 << wm) - 1;
211
- f8_exponent = max_exp;
212
- } else {
213
- return signed_inf;
214
- }
215
- }
216
-
217
- if (f8_exponent == 0 && mantissa == 0) {
218
- return negative_zero_nan ? 0 : (sign << 7);
219
- }
220
- mantissa &= (1 << wm) - 1;
221
- return (sign << 7) | (f8_exponent << wm) | mantissa;
222
- }
223
-
224
- template <int we, int wm, typename T = float, bool negative_zero_nan = true>
225
- inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) {
226
- #ifdef __HIPCC__
227
- constexpr bool is_half = std::is_same<T, _Float16>::value;
228
- #else
229
- constexpr bool is_half = false;
230
- #endif
231
- constexpr bool is_float = std::is_same<T, float>::value;
232
- static_assert(is_half || is_float, "only half and float are supported");
233
-
234
- constexpr int weo = is_half ? 5 : 8;
235
- constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
236
-
237
- T fInf, fNegInf, fNaN, fNeg0;
238
-
239
- #ifdef __HIPCC__
240
- if (is_half) {
241
- const uint16_t ihInf = 0x7C00;
242
- const uint16_t ihNegInf = 0xFC00;
243
- const uint16_t ihNaN = 0x7C01;
244
- const uint16_t ihNeg0 = 0x8000;
245
- fInf = reinterpret_cast<const _Float16&>(ihInf);
246
- fNegInf = reinterpret_cast<const _Float16&>(ihNegInf);
247
- fNaN = reinterpret_cast<const _Float16&>(ihNaN);
248
- fNeg0 = reinterpret_cast<const _Float16&>(ihNeg0);
249
- } else
250
- #endif
251
- if (is_float) {
252
- const uint32_t ifInf = 0x7F800000;
253
- const uint32_t ifNegInf = 0xFF800000;
254
- const uint32_t ifNaN = 0x7F800001;
255
- const uint32_t ifNeg0 = 0x80000000;
256
- fInf = reinterpret_cast<const float&>(ifInf);
257
- fNegInf = reinterpret_cast<const float&>(ifNegInf);
258
- fNaN = reinterpret_cast<const float&>(ifNaN);
259
- fNeg0 = reinterpret_cast<const float&>(ifNeg0);
260
- }
261
-
262
- if (x == 0) {
263
- return 0;
264
- }
265
-
266
- uint32_t sign = x >> 7;
267
- uint32_t mantissa = x & ((1 << wm) - 1);
268
- int exponent = (x & 0x7F) >> wm;
269
- if (negative_zero_nan) {
270
- if (x == 0x80) {
271
- return fNaN;
272
- }
273
- } else {
274
- if (x == 0x80) {
275
- return fNeg0;
276
- }
277
- if (exponent == ((1 << we) - 1)) {
278
- return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
279
- }
280
- }
281
- typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
282
- if (we == 5 && is_half && !negative_zero_nan) {
283
- retval = x << 8;
284
- return reinterpret_cast<const T&>(retval);
285
- }
286
-
287
- const int exp_low_cutoff =
288
- (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
289
-
290
- // subnormal input
291
- if (exponent == 0) {
292
- // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
293
- int sh = 1 + clz(mantissa) - (32 - wm);
294
- mantissa <<= sh;
295
- exponent += 1 - sh;
296
- mantissa &= ((1 << wm) - 1);
297
- }
298
- exponent += exp_low_cutoff - 1;
299
- mantissa <<= wmo - wm;
300
-
301
- // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
302
- if (exponent <= 0) {
303
- mantissa |= 1 << wmo;
304
- mantissa >>= 1 - exponent;
305
- exponent = 0;
306
- }
307
-
308
- if (sizeof(T) == 2) {
309
- retval = (sign << 15) | (exponent << 10) | mantissa;
310
- } else {
311
- retval = (sign << 31) | (exponent << 23) | mantissa;
312
- }
313
- return reinterpret_cast<const T&>(retval);
314
- }
315
-
316
- } // namespace hip_fp8_impl
 
 
fp8/amd/quant_utils.cuh CHANGED
@@ -1,13 +1,11 @@
1
  #pragma once
2
- #include "hip_float8.h"
3
 
4
  #include <hip/hip_fp16.h>
5
  #include <hip/hip_bf16.h>
6
  #include <hip/hip_bfloat16.h>
7
 
8
- #include "../../../attention/dtype_fp8.cuh"
9
- #include "../../../attention/dtype_float32.cuh"
10
- #include "../../../attention/dtype_bfloat16.cuh"
11
 
12
  namespace vllm {
13
  #ifdef USE_ROCM
@@ -15,6 +13,40 @@ namespace vllm {
15
  namespace fp8 {
16
  #ifdef ENABLE_FP8
17
 
 
 
18
  template <typename Tout, typename Tin>
19
  __inline__ __device__ Tout vec_conversion(const Tin& x) {
20
  return x;
@@ -26,40 +58,31 @@ __inline__ __device__ Tout scaled_vec_conversion(const Tin& x,
26
  return x;
27
  }
28
 
 
 
 
 
 
 
 
 
29
  // fp8 -> half
30
  template <>
31
  __inline__ __device__ uint16_t
32
  vec_conversion<uint16_t, uint8_t>(const uint8_t& a) {
33
- hip_fp8 f8{a, hip_fp8::from_bits()};
34
- __half_raw res;
35
- res.data = static_cast<float>(f8);
36
- return res.x;
37
  }
38
 
39
  // fp8x2 -> half2
40
  template <>
41
  __inline__ __device__ uint32_t
42
  vec_conversion<uint32_t, uint16_t>(const uint16_t& a) {
43
- #if defined(__HIP__MI300__) && \
44
- defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
45
- const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
46
  union {
47
  __half2_raw h2r;
48
  uint32_t ui32;
49
  } tmp;
50
- tmp.h2r.x.data = f2[0];
51
- tmp.h2r.y.data = f2[1];
52
  return tmp.ui32;
53
- #else
54
- union {
55
- uint16_t u16[2];
56
- uint32_t u32;
57
- } tmp;
58
-
59
- tmp.u16[0] = vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a));
60
- tmp.u16[1] = vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a >> 8U));
61
- return tmp.u32;
62
- #endif
63
  }
64
 
65
  // fp8x4 -> half2x2
@@ -92,9 +115,9 @@ using __nv_bfloat16 = __hip_bfloat16;
92
  template <>
93
  __inline__ __device__ __nv_bfloat16
94
  vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) {
95
- hip_fp8 f8{a, hip_fp8::from_bits()};
96
- float f{f8};
97
- return __float2bfloat16(f);
98
  }
99
 
100
  using __nv_bfloat162 = __hip_bfloat162;
@@ -136,27 +159,18 @@ __inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, uint2>(const uint2& a) {
136
  // fp8 -> float
137
  template <>
138
  __inline__ __device__ float vec_conversion<float, uint8_t>(const uint8_t& a) {
139
- hip_fp8 fp8{a, hip_fp8::from_bits()};
140
- return static_cast<float>(fp8);
 
141
  }
142
 
143
  // fp8x2 -> float2
144
  template <>
145
  __inline__ __device__ float2
146
  vec_conversion<float2, uint16_t>(const uint16_t& a) {
147
- #if defined(__HIP__MI300__) && \
148
- defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
149
- float2 res;
150
- const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
151
- res.x = f2[0];
152
- res.y = f2[1];
153
- return res;
154
- #else
155
- float2 res;
156
- res.x = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a));
157
- res.y = vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U));
158
- return res;
159
- #endif
160
  }
161
 
162
  // fp8x4 -> float4
@@ -169,6 +183,15 @@ vec_conversion<Float4_, uint32_t>(const uint32_t& a) {
169
  return res;
170
  }
171
 
 
 
 
 
 
 
 
 
 
172
  // fp8x8 -> float8
173
  template <>
174
  __inline__ __device__ Float8_ vec_conversion<Float8_, uint2>(const uint2& a) {
@@ -189,33 +212,36 @@ __inline__ __device__ uint8_t
189
  vec_conversion<uint8_t, uint16_t>(const uint16_t& a) {
190
  __half_raw tmp;
191
  tmp.x = a;
 
 
 
192
 
193
- hip_fp8 f8{static_cast<float>(tmp.data)};
194
- return f8.data;
 
 
 
 
 
 
 
 
195
  }
196
 
197
  // bf16 -> fp8
198
  template <>
199
  __inline__ __device__ uint8_t
200
  vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a) {
201
- hip_fp8 res{__bfloat162float(a)};
202
- return res.data;
 
203
  }
204
 
205
  // float -> fp8
206
  template <>
207
  __inline__ __device__ uint8_t vec_conversion<uint8_t, float>(const float& a) {
208
- hip_fp8 f8(a);
209
- return f8.data;
210
- }
211
-
212
- // fp8x4 -> float4
213
- template <>
214
- __inline__ __device__ float4
215
- vec_conversion<float4, uint32_t>(const uint32_t& a) {
216
- Float4_ tmp = vec_conversion<Float4_, uint32_t>(a);
217
- float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
218
- return res;
219
  }
220
 
221
  // float2 -> half2
@@ -307,90 +333,22 @@ vec_conversion<bf16_8_t, Float8_>(const Float8_& a) {
307
 
308
  */
309
 
310
- // fp8 -> half
311
- template <>
312
- __inline__ __device__ uint16_t
313
- scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, const float scale) {
314
- hip_fp8 f8{a, hip_fp8::from_bits()};
315
- __half_raw res;
316
- res.data = static_cast<float>(f8) * scale;
317
- return res.x;
318
- }
319
-
320
- // fp8x2 -> half2
321
- template <>
322
- __inline__ __device__ uint32_t scaled_vec_conversion<uint32_t, uint16_t>(
323
- const uint16_t& a, const float scale) {
324
- #if defined(__HIP__MI300__) && \
325
- defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
326
- const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
327
- union {
328
- __half2_raw h2r;
329
- uint32_t ui32;
330
- } tmp;
331
- tmp.h2r.x.data = f2[0] * scale;
332
- tmp.h2r.y.data = f2[1] * scale;
333
- return tmp.ui32;
334
- #else
335
- union {
336
- uint16_t u16[2];
337
- uint32_t u32;
338
- } tmp;
339
-
340
- tmp.u16[0] =
341
- scaled_vec_conversion<uint16_t, uint8_t>(static_cast<uint8_t>(a), scale);
342
- tmp.u16[1] = scaled_vec_conversion<uint16_t, uint8_t>(
343
- static_cast<uint8_t>(a >> 8U), scale);
344
- return tmp.u32;
345
- #endif
346
- }
347
-
348
- // fp8x4 -> half2x2
349
- template <>
350
- __inline__ __device__ uint2
351
- scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, const float scale) {
352
- union {
353
- uint2 u32x2;
354
- uint32_t u32[2];
355
- } tmp;
356
- tmp.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)a, scale);
357
- tmp.u32[1] =
358
- scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)(a >> 16U), scale);
359
- return tmp.u32x2;
360
- }
361
-
362
- // fp8x8 -> half2x4
363
- template <>
364
- __inline__ __device__ uint4
365
- scaled_vec_conversion<uint4, uint2>(const uint2& a, const float scale) {
366
- union {
367
- uint4 u64x2;
368
- uint2 u64[2];
369
- } tmp;
370
- tmp.u64[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
371
- tmp.u64[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
372
- return tmp.u64x2;
373
- }
374
-
375
  using __nv_bfloat16 = __hip_bfloat16;
376
 
377
  // fp8 -> __nv_bfloat16
378
  template <>
379
  __inline__ __device__ __nv_bfloat16
380
- scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a,
381
- const float scale) {
382
- hip_fp8 f8{a, hip_fp8::from_bits()};
383
- float f{f8};
384
- return __float2bfloat16(f * scale);
385
  }
386
 
387
- using __nv_bfloat162 = __hip_bfloat162;
388
-
389
  // fp8x2 -> __nv_bfloat162
390
  template <>
391
  __inline__ __device__ __nv_bfloat162
392
  scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,
393
- const float scale) {
394
  __nv_bfloat162 res;
395
  res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale);
396
  res.y =
@@ -400,8 +358,8 @@ scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,
400
 
401
  // fp8x4 -> bf16_4_t
402
  template <>
403
- __inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(
404
- const uint32_t& a, const float scale) {
405
  bf16_4_t res;
406
  res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale);
407
  res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U),
@@ -412,7 +370,7 @@ __inline__ __device__ bf16_4_t scaled_vec_conversion<bf16_4_t, uint32_t>(
412
  // fp8x8 -> bf16_8_t
413
  template <>
414
  __inline__ __device__ bf16_8_t
415
- scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale) {
416
  bf16_4_t tmp1, tmp2;
417
  tmp1 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.x, scale);
418
  tmp2 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.y, scale);
@@ -427,29 +385,19 @@ scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, const float scale) {
427
  // fp8 -> float
428
  template <>
429
  __inline__ __device__ float scaled_vec_conversion<float, uint8_t>(
430
- const uint8_t& a, const float scale) {
431
- hip_fp8 fp8{a, hip_fp8::from_bits()};
432
- return static_cast<float>(fp8) * scale;
 
433
  }
434
 
435
  // fp8x2 -> float2
436
  template <>
437
  __inline__ __device__ float2
438
- scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, const float scale) {
439
- #if defined(__HIP__MI300__) && \
440
- defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__)
441
- float2 res;
442
- const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0);
443
- res.x = f2[0] * scale;
444
- res.y = f2[1] * scale;
445
- return res;
446
- #else
447
- float2 res;
448
- res.x = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a), scale);
449
- res.y = scaled_vec_conversion<float, uint8_t>(static_cast<uint8_t>(a >> 8U),
450
- scale);
451
- return res;
452
- #endif
453
  }
454
 
455
  // fp8x4 -> float4
@@ -462,10 +410,18 @@ scaled_vec_conversion<Float4_, uint32_t>(const uint32_t& a, const float scale) {
462
  return res;
463
  }
464
 
 
 
 
 
 
 
 
 
465
  // fp8x8 -> float8
466
  template <>
467
  __inline__ __device__ Float8_
468
- scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale) {
469
  Float4_ tmp1, tmp2;
470
  tmp1 = scaled_vec_conversion<Float4_, uint32_t>(a.x, scale);
471
  tmp2 = scaled_vec_conversion<Float4_, uint32_t>(a.y, scale);
@@ -477,44 +433,182 @@ scaled_vec_conversion<Float8_, uint2>(const uint2& a, const float scale) {
477
  return res;
478
  }
479
 
480
- /* Quantize(HP / scale) => FP8 */
 
 
 
 
 
 
 
481
 
482
- // TODO(Hai): vectorized to add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
  // half -> fp8
485
  template <>
486
  __inline__ __device__ uint8_t
487
- scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, const float scale) {
488
  __half_raw tmp;
489
  tmp.x = a;
 
 
 
 
490
 
491
- hip_fp8 f8{static_cast<float>(tmp.data) / scale};
492
- return f8.data;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  }
494
 
495
  // bf16 -> fp8
496
  template <>
497
  __inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, __nv_bfloat16>(
498
- const __nv_bfloat16& a, const float scale) {
499
- hip_fp8 res{__bfloat162float(a) / scale};
500
- return res.data;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  }
502
 
503
  // float -> fp8
504
  template <>
505
  __inline__ __device__ uint8_t
506
- scaled_vec_conversion<uint8_t, float>(const float& a, const float scale) {
507
- hip_fp8 f8(a / scale);
508
- return f8.data;
509
  }
510
 
511
- // fp8x4 -> float4
512
  template <>
513
- __inline__ __device__ float4
514
- scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, const float scale) {
515
- Float4_ tmp = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
516
- float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
517
- return res;
 
 
 
 
 
 
 
 
 
 
 
 
518
  }
519
  #endif // ENABLE_FP8
520
 
 
1
  #pragma once
2
+ #include <hip/hip_fp8.h>
3
 
4
  #include <hip/hip_fp16.h>
5
  #include <hip/hip_bf16.h>
6
  #include <hip/hip_bfloat16.h>
7
 
8
+ #include "../../attention/attention_dtypes.h"
 
 
9
 
10
  namespace vllm {
11
  #ifdef USE_ROCM
 
13
  namespace fp8 {
14
  #ifdef ENABLE_FP8
15
 
16
+ // Use hardware cvt instruction for fp8 on rocm
17
+ template <typename fp8_type>
18
+ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
19
+ return {};
20
+ }
21
+
22
+ // __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
23
+ // HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
24
+ // its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
25
+ // on ROCm instantiates both OCP and FNUZ kernels, we need to replace
26
+ // the new HW cvt with something reasonable that doesn't rely on the
27
+ // ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
28
+ template <>
29
+ __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
30
+ #if HIP_FP8_TYPE_OCP
31
+ return c10::Float8_e4m3fn(
32
+ __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
33
+ __hip_fp8_e4m3::__default_interpret),
34
+ c10::Float8_e4m3fn::from_bits());
35
+ #else
36
+ // Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
37
+ // HW cvt above is faster when it is available (ROCm 6.3 or newer).
38
+ return static_cast<c10::Float8_e4m3fn>(r);
39
+ #endif
40
+ }
41
+
42
+ template <>
43
+ __device__ __forceinline__ c10::Float8_e4m3fnuz cvt_c10(float const r) {
44
+ return c10::Float8_e4m3fnuz(
45
+ __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3_fnuz::__default_saturation,
46
+ __hip_fp8_e4m3_fnuz::__default_interpret),
47
+ c10::Float8_e4m3fnuz::from_bits());
48
+ }
49
+
50
  template <typename Tout, typename Tin>
51
  __inline__ __device__ Tout vec_conversion(const Tin& x) {
52
  return x;
 
58
  return x;
59
  }
60
 
61
+ #if HIP_FP8_TYPE_OCP
62
+ using fp8_type = __hip_fp8_e4m3;
63
+ using fp8x2_type = __hip_fp8x2_e4m3;
64
+ #else
65
+ using fp8_type = __hip_fp8_e4m3_fnuz;
66
+ using fp8x2_type = __hip_fp8x2_e4m3_fnuz;
67
+ #endif
68
+
69
  // fp8 -> half
70
  template <>
71
  __inline__ __device__ uint16_t
72
  vec_conversion<uint16_t, uint8_t>(const uint8_t& a) {
73
+ return __hip_cvt_fp8_to_halfraw(a, fp8_type::__default_interpret).x;
 
 
 
74
  }
75
 
76
  // fp8x2 -> half2
77
  template <>
78
  __inline__ __device__ uint32_t
79
  vec_conversion<uint32_t, uint16_t>(const uint16_t& a) {
 
 
 
80
  union {
81
  __half2_raw h2r;
82
  uint32_t ui32;
83
  } tmp;
84
+ tmp.h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
 
85
  return tmp.ui32;
 
 
 
 
 
 
 
 
 
 
86
  }
87
 
88
  // fp8x4 -> half2x2
 
115
  template <>
116
  __inline__ __device__ __nv_bfloat16
117
  vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) {
118
+ fp8_type f8;
119
+ f8.__x = a;
120
+ return __float2bfloat16(static_cast<float>(f8));
121
  }
122
 
123
  using __nv_bfloat162 = __hip_bfloat162;
 
159
  // fp8 -> float
160
  template <>
161
  __inline__ __device__ float vec_conversion<float, uint8_t>(const uint8_t& a) {
162
+ fp8_type f8;
163
+ f8.__x = a;
164
+ return static_cast<float>(f8);
165
  }
166
 
167
  // fp8x2 -> float2
168
  template <>
169
  __inline__ __device__ float2
170
  vec_conversion<float2, uint16_t>(const uint16_t& a) {
171
+ fp8x2_type f8x2;
172
+ f8x2.__x = a;
173
+ return static_cast<float2>(f8x2);
 
 
 
 
 
 
 
 
 
 
174
  }
175
 
176
  // fp8x4 -> float4
 
183
  return res;
184
  }
185
 
186
+ // fp8x4 -> float4
187
+ template <>
188
+ __inline__ __device__ float4
189
+ vec_conversion<float4, uint32_t>(const uint32_t& a) {
190
+ Float4_ tmp = vec_conversion<Float4_, uint32_t>(a);
191
+ float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y);
192
+ return res;
193
+ }
194
+
195
  // fp8x8 -> float8
196
  template <>
197
  __inline__ __device__ Float8_ vec_conversion<Float8_, uint2>(const uint2& a) {
 
212
  vec_conversion<uint8_t, uint16_t>(const uint16_t& a) {
213
  __half_raw tmp;
214
  tmp.x = a;
215
+ return __hip_cvt_halfraw_to_fp8(tmp, fp8_type::__default_saturation,
216
+ fp8_type::__default_interpret);
217
+ }
218
 
219
+ template <>
220
+ __inline__ __device__ uint16_t
221
+ vec_conversion<uint16_t, uint32_t>(const uint32_t& a) {
222
+ union {
223
+ uint32_t ui32;
224
+ __half2_raw h2r;
225
+ } tmp;
226
+ tmp.ui32 = a;
227
+ return __hip_cvt_halfraw2_to_fp8x2(tmp.h2r, fp8_type::__default_saturation,
228
+ fp8_type::__default_interpret);
229
  }
230
 
231
  // bf16 -> fp8
232
  template <>
233
  __inline__ __device__ uint8_t
234
  vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a) {
235
+ return __hip_cvt_float_to_fp8(__bfloat162float(a),
236
+ fp8_type::__default_saturation,
237
+ fp8_type::__default_interpret);
238
  }
239
 
240
  // float -> fp8
241
  template <>
242
  __inline__ __device__ uint8_t vec_conversion<uint8_t, float>(const float& a) {
243
+ return __hip_cvt_float_to_fp8(a, fp8_type::__default_saturation,
244
+ fp8_type::__default_interpret);
 
 
 
 
 
 
 
 
 
245
  }
246
 
247
  // float2 -> half2
 
333
 
334
  */
335
 
 
 
336
  using __nv_bfloat16 = __hip_bfloat16;
337
 
338
  // fp8 -> __nv_bfloat16
339
  template <>
340
  __inline__ __device__ __nv_bfloat16
341
+ scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, float scale) {
342
+ fp8_type f8;
343
+ f8.__x = a;
344
+ return __float2bfloat16(static_cast<float>(f8) * scale);
 
345
  }
346
 
 
 
347
  // fp8x2 -> __nv_bfloat162
348
  template <>
349
  __inline__ __device__ __nv_bfloat162
350
  scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a,
351
+ float scale) {
352
  __nv_bfloat162 res;
353
  res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale);
354
  res.y =
 
358
 
359
  // fp8x4 -> bf16_4_t
360
  template <>
361
+ __inline__ __device__ bf16_4_t
362
+ scaled_vec_conversion<bf16_4_t, uint32_t>(const uint32_t& a, float scale) {
363
  bf16_4_t res;
364
  res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale);
365
  res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U),
 
370
  // fp8x8 -> bf16_8_t
371
  template <>
372
  __inline__ __device__ bf16_8_t
373
+ scaled_vec_conversion<bf16_8_t, uint2>(const uint2& a, float scale) {
374
  bf16_4_t tmp1, tmp2;
375
  tmp1 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.x, scale);
376
  tmp2 = scaled_vec_conversion<bf16_4_t, uint32_t>(a.y, scale);
 
385
  // fp8 -> float
386
  template <>
387
  __inline__ __device__ float scaled_vec_conversion<float, uint8_t>(
388
+ const uint8_t& a, float scale) {
389
+ fp8_type f8;
390
+ f8.__x = a;
391
+ return static_cast<float>(f8) * scale;
392
  }
393
 
394
  // fp8x2 -> float2
395
  template <>
396
  __inline__ __device__ float2
397
+ scaled_vec_conversion<float2, uint16_t>(const uint16_t& a, float scale) {
398
+ fp8x2_type f8x2;
399
+ f8x2.__x = a;
400
+ return static_cast<float2>(f8x2) * scale;
 
 
 
 
 
 
 
 
 
 
 
401
  }
402
 
403
  // fp8x4 -> float4
 
410
  return res;
411
  }
412
 
413
+ // fp8x4 -> float4
414
+ template <>
415
+ __inline__ __device__ float4
416
+ scaled_vec_conversion<float4, uint32_t>(const uint32_t& a, float scale) {
417
+ Float4_ res = scaled_vec_conversion<Float4_, uint32_t>(a, scale);
418
+ return {res.x.x, res.x.y, res.y.x, res.y.y};
419
+ }
420
+
421
  // fp8x8 -> float8
422
  template <>
423
  __inline__ __device__ Float8_
424
+ scaled_vec_conversion<Float8_, uint2>(const uint2& a, float scale) {
425
  Float4_ tmp1, tmp2;
426
  tmp1 = scaled_vec_conversion<Float4_, uint32_t>(a.x, scale);
427
  tmp2 = scaled_vec_conversion<Float4_, uint32_t>(a.y, scale);
 
433
  return res;
434
  }
435
 
436
+ // fp8 -> half
437
+ template <>
438
+ __inline__ __device__ uint16_t
439
+ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
440
+ __half_raw res;
441
+ res.data = scaled_vec_conversion<float, uint8_t>(a, scale);
442
+ return res.x;
443
+ }
444
 
445
+ // fp8x2 -> half2
446
+ template <>
447
+ __inline__ __device__ uint32_t
448
+ scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
449
+ union {
450
+ __half2_raw h2r;
451
+ uint32_t ui32;
452
+ } tmp;
453
+ tmp.h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
454
+ tmp.h2r.x.data *= scale;
455
+ tmp.h2r.y.data *= scale;
456
+ return tmp.ui32;
457
+ }
458
+
459
+ // fp8x4 -> half2x2
460
+ template <>
461
+ __inline__ __device__ uint2
462
+ scaled_vec_conversion<uint2, uint32_t>(const uint32_t& a, float scale) {
463
+ union {
464
+ uint2 u32x2;
465
+ uint32_t u32[2];
466
+ } tmp;
467
+ tmp.u32[0] = scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)a, scale);
468
+ tmp.u32[1] =
469
+ scaled_vec_conversion<uint32_t, uint16_t>((uint16_t)(a >> 16U), scale);
470
+ return tmp.u32x2;
471
+ }
472
+
473
+ // fp8x8 -> half2x4
474
+ template <>
475
+ __inline__ __device__ uint4 scaled_vec_conversion<uint4, uint2>(const uint2& a,
476
+ float scale) {
477
+ union {
478
+ uint4 u64x2;
479
+ uint2 u64[2];
480
+ } tmp;
481
+ tmp.u64[0] = scaled_vec_conversion<uint2, uint32_t>(a.x, scale);
482
+ tmp.u64[1] = scaled_vec_conversion<uint2, uint32_t>(a.y, scale);
483
+ return tmp.u64x2;
484
+ }
485
 
486
  // half -> fp8
487
  template <>
488
  __inline__ __device__ uint8_t
489
+ scaled_vec_conversion<uint8_t, uint16_t>(const uint16_t& a, float scale) {
490
  __half_raw tmp;
491
  tmp.x = a;
492
+ tmp.data /= scale;
493
+ return __hip_cvt_halfraw_to_fp8(tmp, fp8_type::__default_saturation,
494
+ fp8_type::__default_interpret);
495
+ }
496
 
497
+ // halfx2 -> fp8x2
498
+ template <>
499
+ __inline__ __device__ uint16_t
500
+ scaled_vec_conversion<uint16_t, uint32_t>(const uint32_t& a, float scale) {
501
+ union {
502
+ uint32_t ui32;
503
+ __half2_raw h2r;
504
+ } tmp;
505
+ tmp.ui32 = a;
506
+ tmp.h2r.x.data /= scale;
507
+ tmp.h2r.y.data /= scale;
508
+ return __hip_cvt_halfraw2_to_fp8x2(tmp.h2r, fp8_type::__default_saturation,
509
+ fp8_type::__default_interpret);
510
+ }
511
+
512
+ // half2x2 -> fp8x4
513
+ template <>
514
+ __inline__ __device__ uint32_t
515
+ scaled_vec_conversion<uint32_t, uint2>(const uint2& a, float scale) {
516
+ union {
517
+ uint16_t ui16[2];
518
+ uint32_t ui32;
519
+ } tmp;
520
+ tmp.ui16[0] = scaled_vec_conversion<uint16_t, uint32_t>(a.x, scale);
521
+ tmp.ui16[1] = scaled_vec_conversion<uint16_t, uint32_t>(a.y, scale);
522
+ return tmp.ui32;
523
+ }
524
+
525
+ // half2x4 -> fp8x8
526
+ template <>
527
+ __inline__ __device__ uint2 scaled_vec_conversion<uint2, uint4>(const uint4& a,
528
+ float scale) {
529
+ union {
530
+ uint2 ui2[2];
531
+ uint4 ui4;
532
+ } tmp;
533
+ tmp.ui4 = a;
534
+ uint2 res;
535
+ res.x = scaled_vec_conversion<uint32_t, uint2>(tmp.ui2[0], scale);
536
+ res.y = scaled_vec_conversion<uint32_t, uint2>(tmp.ui2[1], scale);
537
+ return res;
538
  }
539
 
540
  // bf16 -> fp8
541
  template <>
542
  __inline__ __device__ uint8_t scaled_vec_conversion<uint8_t, __nv_bfloat16>(
543
+ const __nv_bfloat16& a, float scale) {
544
+ return __hip_cvt_float_to_fp8(__bfloat162float(a) / scale,
545
+ fp8_type::__default_saturation,
546
+ fp8_type::__default_interpret);
547
+ }
548
+
549
+ // bf16x2 -> fp8x2
550
+ template <>
551
+ __inline__ __device__ uint16_t scaled_vec_conversion<uint16_t, __nv_bfloat162>(
552
+ const __nv_bfloat162& a, float scale) {
553
+ union {
554
+ uint8_t ui8[2];
555
+ uint16_t ui16;
556
+ } tmp;
557
+ tmp.ui8[0] = scaled_vec_conversion<uint8_t, __nv_bfloat16>(a.x, scale);
558
+ tmp.ui8[1] = scaled_vec_conversion<uint8_t, __nv_bfloat16>(a.y, scale);
559
+ return tmp.ui16;
560
+ }
561
+
562
+ // bf16x4 -> fp8x4
563
+ template <>
564
+ __inline__ __device__ uint32_t
565
+ scaled_vec_conversion<uint32_t, bf16_4_t>(const bf16_4_t& a, float scale) {
566
+ union {
567
+ uint16_t ui16[2];
568
+ uint32_t ui32;
569
+ } tmp;
570
+ tmp.ui16[0] = scaled_vec_conversion<uint16_t, __nv_bfloat162>(a.x, scale);
571
+ tmp.ui16[1] = scaled_vec_conversion<uint16_t, __nv_bfloat162>(a.y, scale);
572
+ return tmp.ui32;
573
+ }
574
+
575
+ // bf16x8 -> fp8x8
576
+ template <>
577
+ __inline__ __device__ uint2
578
+ scaled_vec_conversion<uint2, bf16_8_t>(const bf16_8_t& a, float scale) {
579
+ uint2 res;
580
+ res.x = scaled_vec_conversion<uint32_t, bf16_4_t>({a.x, a.y}, scale);
581
+ res.y = scaled_vec_conversion<uint32_t, bf16_4_t>({a.z, a.w}, scale);
582
+ return res;
583
  }
584
 
585
  // float -> fp8
586
  template <>
587
  __inline__ __device__ uint8_t
588
+ scaled_vec_conversion<uint8_t, float>(const float& a, float scale) {
589
+ return __hip_cvt_float_to_fp8(a / scale, fp8_type::__default_saturation,
590
+ fp8_type::__default_interpret);
591
  }
592
 
593
+ // floatx2 -> fp8x2
594
  template <>
595
+ __inline__ __device__ uint16_t
596
+ scaled_vec_conversion<uint16_t, float2>(const float2& a, float scale) {
597
+ return __hip_cvt_float2_to_fp8x2(a / scale, fp8_type::__default_saturation,
598
+ fp8_type::__default_interpret);
599
+ }
600
+
601
+ // floatx4 -> fp8x4
602
+ template <>
603
+ __inline__ __device__ uint32_t
604
+ scaled_vec_conversion<uint32_t, float4>(const float4& a, float scale) {
605
+ union {
606
+ uint16_t ui16[2];
607
+ uint32_t ui32;
608
+ } tmp;
609
+ tmp.ui16[0] = scaled_vec_conversion<uint16_t, float2>({a.x, a.y}, scale);
610
+ tmp.ui16[1] = scaled_vec_conversion<uint16_t, float2>({a.z, a.w}, scale);
611
+ return tmp.ui32;
612
  }
613
  #endif // ENABLE_FP8
614
 
fp8/common.cu CHANGED
@@ -11,8 +11,8 @@
11
 
12
  namespace vllm {
13
 
14
- template <typename scalar_t>
15
- __global__ void scaled_fp8_quant_kernel(FP8_TYPE* __restrict__ out,
16
  const scalar_t* __restrict__ input,
17
  const float* __restrict__ scale,
18
  int64_t num_elems) {
@@ -25,24 +25,22 @@ __global__ void scaled_fp8_quant_kernel(FP8_TYPE* __restrict__ out,
25
  out, input, inverted_scale, num_elems, tid, blockDim.x * gridDim.x);
26
  }
27
 
28
- template <typename scalar_t>
29
  __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
30
- FP8_TYPE* __restrict__ out, float* __restrict__ scale,
31
  scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
32
  const int hidden_size) {
33
- float const min_scaling_factor = 1.0f / (FP8_E4M3_MAX * 512.f);
34
-
35
  int const tid = threadIdx.x;
36
  int const token_idx = blockIdx.x;
37
 
38
  // Use int64 to avoid overflowing an int32 when calculating this offset
39
  int64_t offset = static_cast<int64_t>(token_idx) * hidden_size;
40
  scalar_t const* __restrict__ token_input = &input[offset];
41
- FP8_TYPE* __restrict__ token_output = &out[offset];
42
 
43
  // For vectorization, token_input and token_output pointers need to be
44
- // aligned at 8-byte and 4-byte addresses respectively.
45
- bool const can_vectorize = hidden_size % 4 == 0;
46
 
47
  float absmax_val = 0.0f;
48
  if (can_vectorize) {
@@ -50,23 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
50
  } else {
51
  for (int i = tid; i < hidden_size; i += blockDim.x) {
52
  float const x = static_cast<float>(token_input[i]);
53
- absmax_val = max(absmax_val, fabs(x));
54
  }
55
  }
56
 
57
- using BlockReduce = cub::BlockReduce<float, 1024>;
58
  __shared__ typename BlockReduce::TempStorage reduceStorage;
59
  float const block_absmax_val_maybe =
60
  BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
61
  __shared__ float token_scale;
62
  if (tid == 0) {
63
  if (scale_ub) {
64
- token_scale = min(block_absmax_val_maybe, *scale_ub);
65
  } else {
66
  token_scale = block_absmax_val_maybe;
67
  }
68
  // token scale computation
69
- token_scale = max(token_scale / FP8_E4M3_MAX, min_scaling_factor);
 
70
  scale[token_idx] = token_scale;
71
  }
72
  __syncthreads();
@@ -77,7 +76,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
77
  token_output, token_input, token_scale, hidden_size, tid, blockDim.x);
78
  } else {
79
  for (int i = tid; i < hidden_size; i += blockDim.x) {
80
- token_output[i] = scaled_fp8_conversion<false>(
81
  static_cast<float>(token_input[i]), token_scale);
82
  }
83
  }
@@ -89,17 +88,22 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
89
  torch::Tensor const& input, // [..., d]
90
  torch::Tensor const& scale) // [1]
91
  {
92
- int64_t num_tokens = input.numel() / input.size(-1);
93
- int64_t num_elems = input.numel();
94
- dim3 grid(num_tokens);
95
- dim3 block(1024);
 
96
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
97
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
98
  VLLM_DISPATCH_FLOATING_TYPES(
99
- input.scalar_type(), "scaled_fp8_quant_kernel", [&] {
100
- vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
101
- out.data_ptr<FP8_TYPE>(), input.data_ptr<scalar_t>(),
102
- scale.data_ptr<float>(), num_elems);
 
 
 
 
103
  });
104
  }
105
 
@@ -107,19 +111,26 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
107
  torch::Tensor const& input, // [..., d]
108
  torch::Tensor& scale) // [1]
109
  {
110
- int64_t num_tokens = input.numel() / input.size(-1);
111
- int64_t num_elems = input.numel();
112
- dim3 grid(num_tokens);
113
- dim3 block(1024);
 
114
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
115
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
116
  VLLM_DISPATCH_FLOATING_TYPES(
117
- input.scalar_type(), "scaled_fp8_quant_kernel", [&] {
118
- vllm::segmented_max_reduction<scalar_t><<<grid, block, 0, stream>>>(
119
- scale.data_ptr<float>(), input.data_ptr<scalar_t>(), num_elems);
120
- vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
121
- out.data_ptr<FP8_TYPE>(), input.data_ptr<scalar_t>(),
122
- scale.data_ptr<float>(), num_elems);
 
 
 
 
 
 
123
  });
124
  }
125
 
@@ -132,18 +143,25 @@ void dynamic_per_token_scaled_fp8_quant(
132
 
133
  int const hidden_size = input.size(-1);
134
  int const num_tokens = input.numel() / hidden_size;
 
135
  dim3 const grid(num_tokens);
136
- dim3 const block(std::min(hidden_size, 1024));
137
 
138
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
139
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
140
  VLLM_DISPATCH_FLOATING_TYPES(
141
- input.scalar_type(), "dynamic_per_token_scaled_fp8_quant_kernel", [&] {
142
- vllm::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t>
143
- <<<grid, block, 0, stream>>>(
144
- out.data_ptr<FP8_TYPE>(), scales.data_ptr<float>(),
145
- input.data_ptr<scalar_t>(),
146
- scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
147
- hidden_size);
 
 
 
 
 
 
148
  });
149
  }
 
11
 
12
  namespace vllm {
13
 
14
+ template <typename scalar_t, typename fp8_type>
15
+ __global__ void scaled_fp8_quant_kernel(fp8_type* __restrict__ out,
16
  const scalar_t* __restrict__ input,
17
  const float* __restrict__ scale,
18
  int64_t num_elems) {
 
25
  out, input, inverted_scale, num_elems, tid, blockDim.x * gridDim.x);
26
  }
27
 
28
+ template <typename scalar_t, typename fp8_type>
29
  __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
30
+ fp8_type* __restrict__ out, float* __restrict__ scale,
31
  scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
32
  const int hidden_size) {
 
 
33
  int const tid = threadIdx.x;
34
  int const token_idx = blockIdx.x;
35
 
36
  // Use int64 to avoid overflowing an int32 when calculating this offset
37
  int64_t offset = static_cast<int64_t>(token_idx) * hidden_size;
38
  scalar_t const* __restrict__ token_input = &input[offset];
39
+ fp8_type* __restrict__ token_output = &out[offset];
40
 
41
  // For vectorization, token_input and token_output pointers need to be
42
+ // aligned at 32-byte and 16-byte addresses respectively.
43
+ bool const can_vectorize = hidden_size % 16 == 0;
44
 
45
  float absmax_val = 0.0f;
46
  if (can_vectorize) {
 
48
  } else {
49
  for (int i = tid; i < hidden_size; i += blockDim.x) {
50
  float const x = static_cast<float>(token_input[i]);
51
+ absmax_val = fmaxf(absmax_val, fabsf(x));
52
  }
53
  }
54
 
55
+ using BlockReduce = cub::BlockReduce<float, 256>;
56
  __shared__ typename BlockReduce::TempStorage reduceStorage;
57
  float const block_absmax_val_maybe =
58
  BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
59
  __shared__ float token_scale;
60
  if (tid == 0) {
61
  if (scale_ub) {
62
+ token_scale = fminf(block_absmax_val_maybe, *scale_ub);
63
  } else {
64
  token_scale = block_absmax_val_maybe;
65
  }
66
  // token scale computation
67
+ token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
68
+ min_scaling_factor<fp8_type>::val());
69
  scale[token_idx] = token_scale;
70
  }
71
  __syncthreads();
 
76
  token_output, token_input, token_scale, hidden_size, tid, blockDim.x);
77
  } else {
78
  for (int i = tid; i < hidden_size; i += blockDim.x) {
79
+ token_output[i] = scaled_fp8_conversion<false, fp8_type>(
80
  static_cast<float>(token_input[i]), token_scale);
81
  }
82
  }
 
88
  torch::Tensor const& input, // [..., d]
89
  torch::Tensor const& scale) // [1]
90
  {
91
+ int const block_size = 256;
92
+ int const num_tokens = input.numel() / input.size(-1);
93
+ int const num_elems = input.numel();
94
+ dim3 const grid(num_tokens);
95
+ dim3 const block(block_size);
96
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
97
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
98
  VLLM_DISPATCH_FLOATING_TYPES(
99
+ input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] {
100
+ VLLM_DISPATCH_FP8_TYPES(
101
+ out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] {
102
+ vllm::scaled_fp8_quant_kernel<scalar_t, fp8_t>
103
+ <<<grid, block, 0, stream>>>(
104
+ out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
105
+ scale.data_ptr<float>(), num_elems);
106
+ });
107
  });
108
  }
109
 
 
111
  torch::Tensor const& input, // [..., d]
112
  torch::Tensor& scale) // [1]
113
  {
114
+ int const block_size = 256;
115
+ int const num_tokens = input.numel() / input.size(-1);
116
+ int const num_elems = input.numel();
117
+ dim3 const grid(num_tokens);
118
+ dim3 const block(block_size);
119
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
120
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
121
  VLLM_DISPATCH_FLOATING_TYPES(
122
+ input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] {
123
+ VLLM_DISPATCH_FP8_TYPES(
124
+ out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] {
125
+ vllm::segmented_max_reduction<scalar_t, fp8_t>
126
+ <<<grid, block, 0, stream>>>(scale.data_ptr<float>(),
127
+ input.data_ptr<scalar_t>(),
128
+ num_elems);
129
+ vllm::scaled_fp8_quant_kernel<scalar_t, fp8_t>
130
+ <<<grid, block, 0, stream>>>(
131
+ out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
132
+ scale.data_ptr<float>(), num_elems);
133
+ });
134
  });
135
  }
136
 
 
143
 
144
  int const hidden_size = input.size(-1);
145
  int const num_tokens = input.numel() / hidden_size;
146
+ int const block_size = 256;
147
  dim3 const grid(num_tokens);
148
+ dim3 const block(std::min(hidden_size, block_size));
149
 
150
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
151
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
152
  VLLM_DISPATCH_FLOATING_TYPES(
153
+ input.scalar_type(),
154
+ "dynamic_per_token_scaled_fp8_quant_kernel_scalar_type", [&] {
155
+ VLLM_DISPATCH_FP8_TYPES(
156
+ out.scalar_type(),
157
+ "dynamic_per_token_scaled_fp8_quant_kernel_fp8_type", [&] {
158
+ vllm::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
159
+ <<<grid, block, 0, stream>>>(
160
+ out.data_ptr<fp8_t>(), scales.data_ptr<float>(),
161
+ input.data_ptr<scalar_t>(),
162
+ scale_ub.has_value() ? scale_ub->data_ptr<float>()
163
+ : nullptr,
164
+ hidden_size);
165
+ });
166
  });
167
  }
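
For readers following the per-token path above: the kernel computes each token's absmax, optionally clamps it with scale_ub, divides by the FP8 maximum, applies a minimum-scaling-factor floor, and then quantizes every element with that per-token scale. Below is a minimal host-side C++ sketch of the same arithmetic, assuming the OCP E4M3 maximum of 448 and an illustrative 1/(448*512) floor; neither constant is taken from this diff.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  constexpr float FP8_MAX = 448.0f;                   // OCP E4M3 finite max (assumed)
  constexpr float MIN_SCALE = 1.0f / (FP8_MAX * 512); // illustrative lower bound

  std::vector<float> token = {0.02f, -1.5f, 3.25f, -0.75f};  // one token's activations
  float scale_ub = 2.0f;                                     // optional upper bound

  // Per-token absmax, optionally clamped by scale_ub, then divided by FP8_MAX.
  float absmax = 0.0f;
  for (float x : token) absmax = std::fmax(absmax, std::fabs(x));
  float token_scale = std::fmax(std::fmin(absmax, scale_ub) / FP8_MAX, MIN_SCALE);

  // Quantize: divide by the scale and saturate into the representable FP8 range.
  for (float x : token) {
    float q = std::fmax(-FP8_MAX, std::fmin(x / token_scale, FP8_MAX));
    std::printf("%f -> %f (scale %g)\n", x, q, token_scale);
  }
  return 0;
}
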
fp8/common.cuh CHANGED
@@ -1,24 +1,27 @@
1
  #pragma once
2
 
3
  #include "vectorization.cuh"
 
4
 
5
  #include <cmath>
6
- #include <c10/core/ScalarType.h>
7
 
 
 
 
 
 
 
 
 
8
  #ifndef USE_ROCM
9
- #include <c10/util/Float8_e4m3fn.h>
10
- using FP8_TYPE = c10::Float8_e4m3fn;
11
- C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX =
12
- std::numeric_limits<FP8_TYPE>::max();
13
  #else
14
- #include <c10/util/Float8_e4m3fnuz.h>
15
- #include "amd/hip_float8.h"
16
- using FP8_TYPE = c10::Float8_e4m3fnuz;
17
- // Using the default max value from pytorch (240.0) will cause accuracy
18
- // issue when running dynamic quantization. Here use 224.0f for rocm.
19
- constexpr auto FP8_E4M3_MAX = 224.0f;
20
  #endif
21
- constexpr static auto kFp8Type = c10::CppTypeToScalarType<FP8_TYPE>::value;
22
 
23
  namespace vllm {
24
 
@@ -32,8 +35,8 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
32
  return old;
33
  }
34
 
35
- template <bool is_scale_inverted>
36
- __device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val,
37
  float const scale) {
38
  float x = 0.0f;
39
  if constexpr (is_scale_inverted) {
@@ -42,13 +45,13 @@ __device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val,
42
  x = val / scale;
43
  }
44
 
45
- float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX));
 
46
  #ifndef USE_ROCM
47
- return static_cast<c10::Float8_e4m3fn>(r);
48
  #else
49
  // Use hardware cvt instruction for fp8 on rocm
50
- return c10::Float8_e4m3fnuz(hip_fp8(r).data,
51
- c10::Float8_e4m3fnuz::from_bits());
52
  #endif
53
  }
54
 
@@ -58,11 +61,11 @@ __device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val,
58
  // So to get the right answer, *scale needs to be initialized to
59
  // a value <= 0.0 and we need to wait for all thread blocks to
60
  // finish before consuming *scale.
61
- template <typename scalar_t>
62
  __global__ void segmented_max_reduction(float* __restrict__ scale,
63
  const scalar_t* __restrict__ input,
64
  int64_t num_elems) {
65
- __shared__ float cache[1024];
66
  int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
67
 
68
  // First store maximum for all values processed by
@@ -70,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
70
  scalar_t tmp = 0.0;
71
  while (i < num_elems) {
72
  float x = static_cast<float>(input[i]);
73
- tmp = max(tmp, fabs(x));
74
  i += blockDim.x * gridDim.x;
75
  }
76
  cache[threadIdx.x] = tmp;
@@ -89,7 +92,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
89
  // Finally, since cache[0] contains the maximum for this thread block,
90
  // atomically write the max to the target location
91
  if (threadIdx.x == 0) {
92
- atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX);
93
  }
94
  }
95
 
@@ -97,62 +100,64 @@ template <typename scalar_t>
97
  __device__ float thread_max_vec(scalar_t const* __restrict__ input,
98
  int64_t const num_elems, int const tid,
99
  int const step) {
 
 
100
  // Vectorized input/output to better utilize memory bandwidth.
101
- vec4_t<scalar_t> const* vectorized_in =
102
- reinterpret_cast<vec4_t<scalar_t> const*>(input);
103
 
104
- int64_t const num_vec_elems = num_elems >> 2;
 
105
  float absmax_val = 0.0f;
106
 
107
- #pragma unroll 4
108
  for (int64_t i = tid; i < num_vec_elems; i += step) {
109
- vec4_t<scalar_t> in_vec = vectorized_in[i];
110
- absmax_val = max(absmax_val, fabs(in_vec.x));
111
- absmax_val = max(absmax_val, fabs(in_vec.y));
112
- absmax_val = max(absmax_val, fabs(in_vec.z));
113
- absmax_val = max(absmax_val, fabs(in_vec.w));
114
  }
115
 
116
- // Handle the remaining elements if num_elems is not divisible by 4
117
- for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
118
- absmax_val = max(absmax_val, fabs(input[i]));
119
  }
120
 
121
  return absmax_val;
122
  }
123
 
124
- template <typename scalar_t, bool is_scale_inverted>
125
- __device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out,
126
  scalar_t const* __restrict__ input,
127
  float const scale,
128
  int64_t const num_elems,
129
  int const tid, int const step) {
130
- using float8x4_t = q8x4_t<FP8_TYPE>;
 
 
131
  // Vectorized input/output to better utilize memory bandwidth.
132
- auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
133
- auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);
134
 
135
- int64_t const num_vec_elems = num_elems >> 2;
 
136
 
137
- #pragma unroll 4
138
  for (int64_t i = tid; i < num_vec_elems; i += step) {
139
- vec4_t<scalar_t> in_vec = vectorized_in[i];
140
- float8x4_t out_vec;
141
-
142
- out_vec.x = scaled_fp8_conversion<is_scale_inverted>(
143
- static_cast<float>(in_vec.x), scale);
144
- out_vec.y = scaled_fp8_conversion<is_scale_inverted>(
145
- static_cast<float>(in_vec.y), scale);
146
- out_vec.z = scaled_fp8_conversion<is_scale_inverted>(
147
- static_cast<float>(in_vec.z), scale);
148
- out_vec.w = scaled_fp8_conversion<is_scale_inverted>(
149
- static_cast<float>(in_vec.w), scale);
150
  vectorized_out[i] = out_vec;
151
  }
152
 
153
- // Handle the remaining elements if num_elems is not divisible by 4
154
- for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
155
- out[i] = scaled_fp8_conversion<is_scale_inverted>(
156
  static_cast<float>(input[i]), scale);
157
  }
158
  }
 
1
  #pragma once
2
 
3
  #include "vectorization.cuh"
4
+ #include "utils.cuh"
5
 
6
  #include <cmath>
 
7
 
8
+ #ifdef USE_ROCM
9
+ #include "amd/quant_utils.cuh"
10
+ #endif
11
+
12
+ // Returns true when the platform uses the OCP FP8 format (E4M3FN).
13
+ // On CUDA this is always true; on ROCm it checks the device
14
+ // architecture (gfx94* devices use the FNUZ format instead).
15
+ static bool is_fp8_ocp() {
16
  #ifndef USE_ROCM
17
+ return true;
 
 
 
18
  #else
19
+ auto dprops = at::cuda::getCurrentDeviceProperties();
20
+ std::string device_arch = dprops->gcnArchName;
21
+ size_t substring = device_arch.find("gfx94");
22
+ return substring == std::string::npos;
 
 
23
  #endif
24
+ }
25
 
26
  namespace vllm {
27
 
 
35
  return old;
36
  }
37
 
38
+ template <bool is_scale_inverted, typename fp8_type>
39
+ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
40
  float const scale) {
41
  float x = 0.0f;
42
  if constexpr (is_scale_inverted) {
 
45
  x = val / scale;
46
  }
47
 
48
+ float r =
49
+ fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
50
  #ifndef USE_ROCM
51
+ return static_cast<fp8_type>(r);
52
  #else
53
  // Use hardware cvt instruction for fp8 on rocm
54
+ return fp8::cvt_c10<fp8_type>(r);
 
55
  #endif
56
  }
57
 
 
61
  // So to get the right answer, *scale needs to be initialized to
62
  // a value <= 0.0 and we need to wait for all thread blocks to
63
  // finish before consuming *scale.
64
+ template <typename scalar_t, typename fp8_type>
65
  __global__ void segmented_max_reduction(float* __restrict__ scale,
66
  const scalar_t* __restrict__ input,
67
  int64_t num_elems) {
68
+ __shared__ float cache[256];
69
  int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
70
 
71
  // First store maximum for all values processed by
 
73
  scalar_t tmp = 0.0;
74
  while (i < num_elems) {
75
  float x = static_cast<float>(input[i]);
76
+ tmp = fmaxf(tmp, fabsf(x));
77
  i += blockDim.x * gridDim.x;
78
  }
79
  cache[threadIdx.x] = tmp;
 
92
  // Finally, since cache[0] contains the maximum for this thread block,
93
  // atomically write the max to the target location
94
  if (threadIdx.x == 0) {
95
+ atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>);
96
  }
97
  }
98
 
 
100
  __device__ float thread_max_vec(scalar_t const* __restrict__ input,
101
  int64_t const num_elems, int const tid,
102
  int const step) {
103
+ constexpr size_t VEC_SIZE = 16;
104
+ using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
105
  // Vectorized input/output to better utilize memory bandwidth.
106
+ auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
 
107
 
108
+ // num_elems / VEC_SIZE (which is 16)
109
+ int64_t const num_vec_elems = num_elems >> 4;
110
  float absmax_val = 0.0f;
111
 
112
+ #pragma unroll
113
  for (int64_t i = tid; i < num_vec_elems; i += step) {
114
+ scalarxN_t in_vec = vectorized_in[i];
115
+ #pragma unroll
116
+ for (int j = 0; j < VEC_SIZE; ++j) {
117
+ absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
118
+ }
119
  }
120
 
121
+ // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
122
+ for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
123
+ absmax_val = fmaxf(absmax_val, fabsf(input[i]));
124
  }
125
 
126
  return absmax_val;
127
  }
128
 
129
+ template <typename scalar_t, bool is_scale_inverted, typename fp8_type>
130
+ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
131
  scalar_t const* __restrict__ input,
132
  float const scale,
133
  int64_t const num_elems,
134
  int const tid, int const step) {
135
+ constexpr size_t VEC_SIZE = 16;
136
+ using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
137
+ using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
138
  // Vectorized input/output to better utilize memory bandwidth.
139
+ auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
140
+ auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
141
 
142
+ // num_elems / VEC_SIZE (which is 16)
143
+ int64_t const num_vec_elems = num_elems >> 4;
144
 
145
+ #pragma unroll
146
  for (int64_t i = tid; i < num_vec_elems; i += step) {
147
+ scalarxN_t in_vec = vectorized_in[i];
148
+ float8xN_t out_vec;
149
+
150
+ #pragma unroll
151
+ for (int j = 0; j < VEC_SIZE; ++j) {
152
+ out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
153
+ static_cast<float>(in_vec.val[j]), scale);
154
+ }
 
 
 
155
  vectorized_out[i] = out_vec;
156
  }
157
 
158
+ // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
159
+ for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
160
+ out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
161
  static_cast<float>(input[i]), scale);
162
  }
163
  }
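
The scaled_fp8_conversion template above supports two calling conventions: with is_scale_inverted the caller passes 1/scale and the value is multiplied, otherwise it is divided by scale; in both cases the result is clamped to the representable FP8 range before the narrowing cast. The following is a small host-side C++ sketch of that contract, using plain float in place of the fp8_type cast and assuming the OCP E4M3 limit of 448.

#include <cassert>
#include <cmath>

// Host-side stand-in for scaled_fp8_conversion<is_scale_inverted, fp8_type>.
// Returns the clamped value that would then be cast to the FP8 type.
template <bool is_scale_inverted>
float scaled_fp8_conversion_ref(float val, float scale, float fp8_max = 448.0f) {
  float x = is_scale_inverted ? val * scale   // scale already holds 1/s
                              : val / scale;  // scale holds s itself
  return std::fmax(-fp8_max, std::fmin(x, fp8_max));
}

int main() {
  float s = 0.125f;
  // Both call styles must agree: dividing by s equals multiplying by 1/s.
  assert(scaled_fp8_conversion_ref<false>(3.0f, s) ==
         scaled_fp8_conversion_ref<true>(3.0f, 1.0f / s));
  // Out-of-range values saturate at the FP8 maximum instead of overflowing.
  assert(scaled_fp8_conversion_ref<false>(1.0e6f, s) == 448.0f);
  return 0;
}
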
fp8/nvidia/quant_utils.cuh CHANGED
@@ -1,6 +1,6 @@
1
  #pragma once
2
 
3
- #include "../../../attention/attention_dtypes.h"
4
  #include <assert.h>
5
  #include <float.h>
6
  #include <stdint.h>
 
1
  #pragma once
2
 
3
+ #include "../../attention/attention_dtypes.h"
4
  #include <assert.h>
5
  #include <float.h>
6
  #include <stdint.h>
gptq_marlin/awq_marlin_repack.cu CHANGED
@@ -12,7 +12,7 @@ __global__ void awq_marlin_repack_kernel(
12
  int n_tiles = size_n / tile_n_size;
13
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
14
 
15
- int start_k_tile = blockIdx.x * block_k_tiles;
16
  if (start_k_tile >= k_tiles) {
17
  return;
18
  }
@@ -49,8 +49,8 @@ __global__ void awq_marlin_repack_kernel(
49
  int4* sh_ptr = sh + stage_size * pipe;
50
 
51
  if (threadIdx.x < stage_size) {
52
- int k_id = threadIdx.x / stage_n_threads;
53
- int n_id = threadIdx.x % stage_n_threads;
54
 
55
  int first_k = k_tile_id * tile_k_size;
56
 
@@ -68,8 +68,8 @@ __global__ void awq_marlin_repack_kernel(
68
  return;
69
  }
70
 
71
- int warp_id = threadIdx.x / 32;
72
- int th_id = threadIdx.x % 32;
73
 
74
  if (warp_id >= 4) {
75
  return;
 
12
  int n_tiles = size_n / tile_n_size;
13
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
14
 
15
+ auto start_k_tile = blockIdx.x * block_k_tiles;
16
  if (start_k_tile >= k_tiles) {
17
  return;
18
  }
 
49
  int4* sh_ptr = sh + stage_size * pipe;
50
 
51
  if (threadIdx.x < stage_size) {
52
+ auto k_id = threadIdx.x / stage_n_threads;
53
+ auto n_id = threadIdx.x % stage_n_threads;
54
 
55
  int first_k = k_tile_id * tile_k_size;
56
 
 
68
  return;
69
  }
70
 
71
+ auto warp_id = threadIdx.x / 32;
72
+ auto th_id = threadIdx.x % 32;
73
 
74
  if (warp_id >= 4) {
75
  return;
gptq_marlin/dequant.h ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Fast Dequantization (Converting INT4/INT8/FP4/FP8 to FP16/BF16)
3
+
4
+ The process of fast dequantization can be summarized as a combination
5
+ of bitwise operations and floating-point computations:
6
+
7
+ weight =>(bit_op / bitwise operations)=>
8
+ f16_value =>(flop / floating-point computation)=>
9
+ dequantized_weight
10
+
11
+ Since the dequantized weights typically require subtracting the zero point and
12
+ applying a scale factor, the floating-point computation step can be fused with
13
+ the zero-point subtraction and scaling operations.
14
+
15
+ The following are the parts that need to be modified for the fused operation
16
+ of zero-point subtraction and scaling.
17
+
18
+ ## INT4 => FP16/BF16 or INT8 => FP16
19
+
20
+ The floating-point computation is `__hsub2`
21
+
22
+ If there are zero points:
23
+
24
+ flop(bit_op(weight)) - flop(bit_op(zp))
25
+ = sub(bit_op(weight), bias) - sub(bit_op(zp), bias)
26
+ = bit_op(weight) - bit_op(zp)
27
+
28
+ so we don't need additional modification.
29
+
30
+ If there are float zero points:
31
+
32
+ flop(bit_op(weight)) - fzp
33
+ = sub(bit_op(weight), bias) - fzp
34
+ = bit_op(weight) - (fzp + bias)
35
+
36
+ where the `fzp + bias` can be computed at weight loading. But this
37
+ may cause accuracy issues, so we should not use this in most cases.
38
+
39
+ If there are no zero points:
40
+
41
+ scale(flop(bit_op(weight)))
42
+ = scale(sub(bit_op(weight), bias))
43
+ = scale(bit_op(weight)) - scale(bias)
44
+ = fma(bit_op(weight), scale_factor, scale(bias))
45
+
46
+ where the `scale(bias)` can be cached. But this may cause accuracy issues,
47
+ so we should not use this in most cases.
48
+
49
+
50
+ ## INT8 => BF16
51
+
52
+ INT8 => BF16 is a special case: it uses byte_perm instead of flop.
53
+ We cannot fuse byte_perm with scaling.
54
+
55
+
56
+ ## FP4/FP8 => FP16/BF16
57
+
58
+ scale(flop(bit_op(weight)))
59
+ = scale(mul(bit_op(weight), multiplier))
60
+ = mul(bit_op(weight), scale_factor * multiplier)
61
+
62
+ where `scale_factor * multiplier` can be computed at weight loading.
63
+
64
+ */
65
+
66
+ #include "marlin_dtypes.cuh"
67
+
68
+ namespace MARLIN_NAMESPACE_NAME {
69
+
70
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
71
+ // Lookup-table based 3-input logical operation; explicitly used for
72
+ // dequantization as the compiler does not seem to automatically recognize it in
73
+ // all cases.
74
+ template <int lut>
75
+ __device__ inline int lop3(int a, int b, int c) {
76
+ int res;
77
+ asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
78
+ : "=r"(res)
79
+ : "r"(a), "r"(b), "r"(c), "n"(lut));
80
+ return res;
81
+ }
82
+
83
+ // Constructs destination register by taking bytes from 2 sources (based on
84
+ // mask)
85
+ template <int start_byte, int mask>
86
+ __device__ inline uint32_t prmt(uint32_t a) {
87
+ uint32_t res;
88
+ asm volatile("prmt.b32 %0, %1, %2, %3;\n"
89
+ : "=r"(res)
90
+ : "r"(a), "n"(start_byte), "n"(mask));
91
+ return res;
92
+ }
93
+
94
+ template <typename scalar_t2, vllm::ScalarTypeId w_type_id,
95
+ bool skip_flop = false>
96
+ __device__ inline void dequant(int q, scalar_t2* frag_b);
97
+
98
+ //
99
+ // Efficiently dequantize 4bit values packed in an int32 value into a full
100
+ // B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
101
+ // with some small changes:
102
+ // - FP16:
103
+ // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
104
+ // - BF16:
105
+ // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
106
+ //
107
+ template <>
108
+ __device__ inline void dequant<half2, vllm::kU4B8.id(), true>(int q,
109
+ half2* frag_b) {
110
+ const int MASK = 0x000f000f;
111
+ const int EX = 0x64006400;
112
+ // Guarantee that the `(a & b) | c` operations are LOP3s.
113
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
114
+ q >>= 4;
115
+ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
116
+
117
+ frag_b[0] = *reinterpret_cast<half2*>(&lo);
118
+ frag_b[1] = *reinterpret_cast<half2*>(&hi);
119
+ }
120
+
121
+ template <>
122
+ __device__ inline void dequant<half2, vllm::kU4B8.id(), false>(int q,
123
+ half2* frag_b) {
124
+ const int LO = 0x000f000f;
125
+ const int HI = 0x00f000f0;
126
+ const int EX = 0x64006400;
127
+ // Guarantee that the `(a & b) | c` operations are LOP3s.
128
+ // clang-format off
129
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
130
+ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
131
+ // clang-format on
132
+ // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
133
+ // directly into `SUB` and `ADD`.
134
+ const int SUB = 0x64086408;
135
+ const int MUL = 0x2c002c00;
136
+ const int ADD = 0xd480d480;
137
+ frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
138
+ *reinterpret_cast<const half2*>(&SUB));
139
+ frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
140
+ *reinterpret_cast<const half2*>(&MUL),
141
+ *reinterpret_cast<const half2*>(&ADD));
142
+ }
143
+
144
+ template <>
145
+ __device__ inline void dequant<half2, vllm::kU4.id(), true>(int q,
146
+ half2* frag_b) {
147
+ dequant<half2, vllm::kU4B8.id(), true>(q, frag_b);
148
+ }
149
+
150
+ template <>
151
+ __device__ inline void dequant<half2, vllm::kU4.id(), false>(int q,
152
+ half2* frag_b) {
153
+ const int LO = 0x000f000f;
154
+ const int HI = 0x00f000f0;
155
+ const int EX = 0x64006400;
156
+ // Guarantee that the `(a & b) | c` operations are LOP3s.
157
+ // clang-format off
158
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
159
+ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
160
+ // clang-format on
161
+ // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
162
+ // directly into `SUB` and `ADD`.
163
+ const int SUB = 0x64006400;
164
+ const int MUL = 0x2c002c00;
165
+ const int ADD = 0xd400d400;
166
+ frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
167
+ *reinterpret_cast<const half2*>(&SUB));
168
+ frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
169
+ *reinterpret_cast<const half2*>(&MUL),
170
+ *reinterpret_cast<const half2*>(&ADD));
171
+ }
172
+
173
+ template <>
174
+ __device__ inline void dequant<nv_bfloat162, vllm::kU4B8.id(), true>(
175
+ int q, nv_bfloat162* frag_b) {
176
+ static constexpr uint32_t MASK = 0x000f000f;
177
+ static constexpr uint32_t EX = 0x43004300;
178
+
179
+ // Guarantee that the `(a & b) | c` operations are LOP3s.
180
+ // clang-format off
181
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
182
+ q >>= 4;
183
+ int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
184
+ // clang-format on
185
+
186
+ frag_b[0] = *reinterpret_cast<nv_bfloat162*>(&lo);
187
+ frag_b[1] = *reinterpret_cast<nv_bfloat162*>(&hi);
188
+ }
189
+
190
+ template <>
191
+ __device__ inline void dequant<nv_bfloat162, vllm::kU4B8.id(), false>(
192
+ int q, nv_bfloat162* frag_b) {
193
+ dequant<nv_bfloat162, vllm::kU4B8.id(), true>(q, frag_b);
194
+
195
+ static constexpr uint32_t SUB = 0x43084308;
196
+
197
+ frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const nv_bfloat162*>(&SUB));
198
+ frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const nv_bfloat162*>(&SUB));
199
+ }
200
+
201
+ template <>
202
+ __device__ inline void dequant<nv_bfloat162, vllm::kU4.id(), true>(
203
+ int q, nv_bfloat162* frag_b) {
204
+ dequant<nv_bfloat162, vllm::kU4B8.id(), true>(q, frag_b);
205
+ }
206
+
207
+ template <>
208
+ __device__ inline void dequant<nv_bfloat162, vllm::kU4.id(), false>(
209
+ int q, nv_bfloat162* frag_b) {
210
+ dequant<nv_bfloat162, vllm::kU4.id(), true>(q, frag_b);
211
+
212
+ static constexpr uint32_t SUB = 0x43004300;
213
+
214
+ frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const nv_bfloat162*>(&SUB));
215
+ frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const nv_bfloat162*>(&SUB));
216
+ }
217
+
218
+ //
219
+ // Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
220
+ // bf16 Reference:
221
+ // - FP16:
222
+ // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
223
+ // - BF16:
224
+ // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
225
+ //
226
+ template <>
227
+ __device__ inline void dequant<half2, vllm::kU8B128.id(), true>(int q,
228
+ half2* frag_b) {
229
+ static constexpr uint32_t mask_for_elt_01 = 0x5250;
230
+ static constexpr uint32_t mask_for_elt_23 = 0x5351;
231
+ static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
232
+
233
+ uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
234
+ uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
235
+
236
+ frag_b[0] = *reinterpret_cast<half2*>(&lo);
237
+ frag_b[1] = *reinterpret_cast<half2*>(&hi);
238
+ }
239
+
240
+ template <>
241
+ __device__ inline void dequant<half2, vllm::kU8B128.id(), false>(
242
+ int q, half2* frag_b) {
243
+ dequant<half2, vllm::kU8B128.id(), true>(q, frag_b);
244
+
245
+ static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
246
+ frag_b[0] = __hsub2(frag_b[0],
247
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
248
+ frag_b[1] = __hsub2(frag_b[1],
249
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
250
+ }
251
+
252
+ template <>
253
+ __device__ inline void dequant<half2, vllm::kU8.id(), true>(int q,
254
+ half2* frag_b) {
255
+ dequant<half2, vllm::kU8B128.id(), true>(q, frag_b);
256
+ }
257
+
258
+ template <>
259
+ __device__ inline void dequant<half2, vllm::kU8.id(), false>(int q,
260
+ half2* frag_b) {
261
+ dequant<half2, vllm::kU8.id(), true>(q, frag_b);
262
+
263
+ static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
264
+ frag_b[0] = __hsub2(frag_b[0],
265
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
266
+ frag_b[1] = __hsub2(frag_b[1],
267
+ *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
268
+ }
269
+
270
+ template <>
271
+ __device__ inline void dequant<nv_bfloat162, vllm::kU8B128.id(), false>(
272
+ int q, nv_bfloat162* frag_b) {
273
+ float fp32_intermediates[4];
274
+ uint32_t* fp32_intermediates_casted =
275
+ reinterpret_cast<uint32_t*>(fp32_intermediates);
276
+
277
+ static constexpr uint32_t fp32_base = 0x4B000000;
278
+ fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
279
+ fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
280
+ fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
281
+ fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
282
+
283
+ fp32_intermediates[0] -= 8388736.f;
284
+ fp32_intermediates[1] -= 8388736.f;
285
+ fp32_intermediates[2] -= 8388736.f;
286
+ fp32_intermediates[3] -= 8388736.f;
287
+
288
+ uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
289
+ bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
290
+ fp32_intermediates_casted[1], 0x7632);
291
+ bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
292
+ fp32_intermediates_casted[3], 0x7632);
293
+ }
294
+
295
+ template <>
296
+ __device__ inline void dequant<nv_bfloat162, vllm::kU8.id(), false>(
297
+ int q, nv_bfloat162* frag_b) {
298
+ float fp32_intermediates[4];
299
+ uint32_t* fp32_intermediates_casted =
300
+ reinterpret_cast<uint32_t*>(fp32_intermediates);
301
+
302
+ static constexpr uint32_t fp32_base = 0x4B000000;
303
+ fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
304
+ fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
305
+ fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
306
+ fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
307
+
308
+ fp32_intermediates[0] -= 8388608.f;
309
+ fp32_intermediates[1] -= 8388608.f;
310
+ fp32_intermediates[2] -= 8388608.f;
311
+ fp32_intermediates[3] -= 8388608.f;
312
+
313
+ uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
314
+ bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
315
+ fp32_intermediates_casted[1], 0x7632);
316
+ bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
317
+ fp32_intermediates_casted[3], 0x7632);
318
+ }
319
+
320
+ template <>
321
+ __device__ inline void dequant<half2, vllm::kFE4M3fn.id(), true>(
322
+ int q, half2* frag_b) {
323
+ // Constants for FP8 (E4M3) and FP16 formats
324
+ constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5;
325
+ constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT;
326
+ constexpr int MASK = 0x7F007F00;
327
+
328
+ // Extract and shift FP8 values to FP16 format
329
+ int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
330
+ q <<= 8;
331
+ int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
332
+
333
+ // Note: reverse indexing is intentional because weights are permuted
334
+ frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
335
+ frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
336
+ }
337
+
338
+ template <>
339
+ __device__ inline void dequant<half2, vllm::kFE4M3fn.id(), false>(
340
+ int q, half2* frag_b) {
341
+ dequant<half2, vllm::kFE4M3fn.id(), true>(q, frag_b);
342
+
343
+ // Constants for FP8 (E4M3) and FP16 formats
344
+ constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5;
345
+
346
+ // Construct and apply exponent bias
347
+ constexpr int BIAS_OFFSET =
348
+ (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
349
+ const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
350
+
351
+ // Convert to half2 and apply bias
352
+ frag_b[1] = __hmul2(frag_b[1], bias_reg);
353
+ frag_b[0] = __hmul2(frag_b[0], bias_reg);
354
+ }
355
+
356
+ template <>
357
+ __device__ inline void dequant<nv_bfloat162, vllm::kFE4M3fn.id(), true>(
358
+ int q, nv_bfloat162* frag_b) {
359
+ // Constants for FP8 (E4M3) and BF16 formats
360
+ constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
361
+ constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
362
+
363
+ constexpr int MASK = 0x7F007F00;
364
+
365
+ // Extract and shift FP8 values to BF16 format
366
+ int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
367
+ q <<= 8;
368
+ int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
369
+
370
+ // Note: reverse indexing is intentional because weights are permuted
371
+ frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
372
+ frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
373
+ }
374
+
375
+ template <>
376
+ __device__ inline void dequant<nv_bfloat162, vllm::kFE4M3fn.id(), false>(
377
+ int q, nv_bfloat162* frag_b) {
378
+ dequant<nv_bfloat162, vllm::kFE4M3fn.id(), true>(q, frag_b);
379
+
380
+ // Constants for FP8 (E4M3) and BF16 formats
381
+ constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
382
+
383
+ // Construct and apply exponent bias
384
+ constexpr int BIAS_OFFSET =
385
+ (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
386
+ // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
387
+ // position
388
+ constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
389
+ const nv_bfloat162 bias_reg =
390
+ __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
391
+
392
+ // Convert to bfloat162 and apply bias
393
+ frag_b[1] = __hmul2(frag_b[1], bias_reg);
394
+ frag_b[0] = __hmul2(frag_b[0], bias_reg);
395
+ }
396
+
397
+ template <>
398
+ __device__ inline void dequant<half2, vllm::kFE2M1f.id(), true>(int q,
399
+ half2* frag_b) {
400
+ // Constants for FP4 (E2M1) and FP16 formats
401
+ constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5;
402
+ constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP4_EXPONENT;
403
+ constexpr int MASK = 0x70007000;
404
+
405
+ // Extract and shift FP4 values to FP16 format
406
+ int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
407
+ q <<= 4;
408
+ int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
409
+
410
+ // Note: reverse indexing is intentional because weights are permuted
411
+ frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
412
+ frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
413
+ }
414
+
415
+ template <>
416
+ __device__ inline void dequant<half2, vllm::kFE2M1f.id(), false>(
417
+ int q, half2* frag_b) {
418
+ dequant<half2, vllm::kFE2M1f.id(), true>(q, frag_b);
419
+
420
+ // Constants for FP4 (E2M1) and FP16 formats
421
+ constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5;
422
+
423
+ // Construct and apply exponent bias
424
+ constexpr int BIAS_OFFSET =
425
+ (1 << (FP16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1));
426
+ const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
427
+
428
+ // Convert to half2 and apply bias
429
+ frag_b[1] = __hmul2(frag_b[1], bias_reg);
430
+ frag_b[0] = __hmul2(frag_b[0], bias_reg);
431
+ }
432
+
433
+ template <>
434
+ __device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), true>(
435
+ int q, nv_bfloat162* frag_b) {
436
+ // Constants for FP4 (E2M1) and FP16 formats
437
+ constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8;
438
+ constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP4_EXPONENT;
439
+ constexpr int MASK = 0x70007000;
440
+
441
+ // Extract and shift FP4 values to FP16 format
442
+ int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
443
+ q <<= 4;
444
+ int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
445
+
446
+ // Note: reverse indexing is intentional because weights are permuted
447
+ frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
448
+ frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
449
+ }
450
+
451
+ template <>
452
+ __device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), false>(
453
+ int q, nv_bfloat162* frag_b) {
454
+ dequant<nv_bfloat162, vllm::kFE2M1f.id(), true>(q, frag_b);
455
+
456
+ // Constants for FP4 (E2M1) and BF16 formats
457
+ constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8;
458
+
459
+ // Construct and apply exponent bias
460
+ constexpr int BIAS_OFFSET =
461
+ (1 << (BF16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1));
462
+ // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
463
+ // position
464
+ constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
465
+ const nv_bfloat162 bias_reg =
466
+ __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
467
+
468
+ // Convert to bfloat162 and apply bias
469
+ frag_b[1] = __hmul2(frag_b[1], bias_reg);
470
+ frag_b[0] = __hmul2(frag_b[0], bias_reg);
471
+ }
472
+
473
+ template <typename scalar_t2>
474
+ __device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b);
475
+
476
+ template <>
477
+ __device__ inline void dequant_fp8_scales<half2>(int q, half2* frag_b) {
478
+ int Out1 = (q & 0xFF00FF00) >> 1;
479
+ ;
480
+ q <<= 8;
481
+ int Out2 = (q & 0xFF00FF00) >> 1;
482
+
483
+ // Note: reverse indexing is intentional because weights are permuted
484
+ frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
485
+ frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
486
+ };
487
+
488
+ template <>
489
+ __device__ inline void dequant_fp8_scales<nv_bfloat162>(int q,
490
+ nv_bfloat162* frag_b) {
491
+ constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
492
+ constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
493
+ constexpr int MASK = 0x7F007F00;
494
+
495
+ // Extract and shift FP8 values to BF16 format
496
+ int Out1 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT);
497
+ q <<= 8;
498
+ int Out2 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT);
499
+
500
+ // Note: reverse indexing is intentional because weights are permuted
501
+ frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
502
+ frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
503
+ }
504
+
505
+ #endif
506
+
507
+ } // namespace MARLIN_NAMESPACE_NAME
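
The header comment in dequant.h describes the magic-bias trick the fp16 specializations rely on: each 4-bit nibble is OR-ed into the mantissa of an fp16 value whose exponent field encodes 1024 (0x6400), so the half decodes to 1024 + v, and a single __hsub2 against 0x6408 (1032.0) yields v - 8 for the kU4B8 case. Below is a host-side C++ sketch that emulates the fp16 decoding to check this identity; the half_bits_to_float helper is an illustration, not part of the kernels.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode a normal (non-subnormal) IEEE fp16 bit pattern to float; enough for
// the 0x64xx range used by the magic-bias trick.
static float half_bits_to_float(uint16_t h) {
  int sign = (h >> 15) & 0x1;
  int exp = (h >> 10) & 0x1F;
  int mant = h & 0x3FF;
  float f = std::ldexp(1.0f + mant / 1024.0f, exp - 15);
  return sign ? -f : f;
}

int main() {
  const uint16_t EX = 0x6400;   // fp16 1024.0: the nibble lands in the mantissa
  const uint16_t SUB = 0x6408;  // fp16 1032.0: removes the bias and the zero point 8

  for (int v = 0; v < 16; ++v) {
    uint16_t packed = EX | static_cast<uint16_t>(v);                       // bit_op(weight)
    float dequant = half_bits_to_float(packed) - half_bits_to_float(SUB);  // flop
    assert(dequant == static_cast<float>(v - 8));                          // kU4B8: signed int4 value
  }
  std::puts("magic-bias INT4 -> FP16 identity holds for all 16 nibbles");
  return 0;
}
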
gptq_marlin/generate_kernels.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ import glob
4
+ import itertools
5
+ import os
6
+ import subprocess
7
+
8
+ import jinja2
9
+
10
+ FILE_HEAD = """
11
+ // auto generated by generate.py
12
+ // clang-format off
13
+
14
+ #include "kernel.h"
15
+ #include "marlin_template.h"
16
+
17
+ namespace MARLIN_NAMESPACE_NAME {
18
+ """.strip()
19
+
20
+ TEMPLATE = ("template __global__ void Marlin<"
21
+ "{{scalar_t}}, "
22
+ "{{w_type_id}}, "
23
+ "{{threads}}, "
24
+ "{{thread_m_blocks}}, "
25
+ "{{thread_n_blocks}}, "
26
+ "{{thread_k_blocks}}, "
27
+ "{{'true' if m_block_size_8 else 'false'}}, "
28
+ "{{stages}}, "
29
+ "{{group_blocks}}, "
30
+ "{{'true' if is_zp_float else 'false'}}>"
31
+ "( MARLIN_KERNEL_PARAMS );")
32
+
33
+ # int8 with zero point case (vllm::kU8) is also supported,
34
+ # but we don't add it, to reduce wheel size.
35
+ SCALAR_TYPES = [
36
+ "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
37
+ "vllm::kFE2M1f"
38
+ ]
39
+ THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
40
+ (128, 64, 128)]
41
+
42
+ THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
43
+ # group_blocks:
44
+ # = 0 : act order case
45
+ # = -1 : channelwise quantization
46
+ # > 0 : group_size=16*group_blocks
47
+ GROUP_BLOCKS = [0, 1, -1, 2, 4, 8]
48
+ DTYPES = ["fp16", "bf16"]
49
+
50
+
51
+ def remove_old_kernels():
52
+ for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
53
+ subprocess.call(["rm", "-f", filename])
54
+
55
+
56
+ def generate_new_kernels():
57
+ for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
58
+ all_template_str_list = []
59
+
60
+ for group_blocks, m_blocks, thread_configs in itertools.product(
61
+ GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
62
+
63
+ # act order case only support gptq-int4 and gptq-int8
64
+ if group_blocks == 0 and scalar_type not in [
65
+ "vllm::kU4B8", "vllm::kU8B128"
66
+ ]:
67
+ continue
68
+ if thread_configs[2] == 256:
69
+ # for small batch (m_blocks == 1), we only need (128, 128, 256)
70
+ # for large batch (m_blocks > 1), we only need (64, 256, 256)
71
+ if m_blocks <= 1 and thread_configs[0] != 128:
72
+ continue
73
+ if m_blocks > 1 and thread_configs[0] != 64:
74
+ continue
75
+
76
+ # we only support channelwise quantization and group_size == 128
77
+ # for fp8
78
+ if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
79
+ continue
80
+ # nvfp4 only supports group_size == 16
81
+ if scalar_type == "vllm::kFE2M1f" and group_blocks != 1:
82
+ continue
83
+ # other quantization methods don't support group_size = 16
84
+ if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
85
+ continue
86
+
87
+ k_blocks = thread_configs[0] // 16
88
+ n_blocks = thread_configs[1] // 16
89
+ threads = thread_configs[2]
90
+
91
+ c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
92
+
93
+ is_zp_float_list = [False]
94
+ if dtype == "fp16" and scalar_type == "vllm::kU4" and \
95
+ group_blocks == 4:
96
+ # HQQ (is_zp_float = true) only supports
97
+ # 4bit quantization and fp16
98
+ is_zp_float_list.append(True)
99
+
100
+ for is_zp_float in is_zp_float_list:
101
+ template_str = jinja2.Template(TEMPLATE).render(
102
+ scalar_t=c_dtype,
103
+ w_type_id=scalar_type + ".id()",
104
+ threads=threads,
105
+ thread_m_blocks=max(m_blocks, 1),
106
+ thread_n_blocks=n_blocks,
107
+ thread_k_blocks=k_blocks,
108
+ m_block_size_8=m_blocks == 0.5,
109
+ stages="pipe_stages",
110
+ group_blocks=group_blocks,
111
+ is_zp_float=is_zp_float,
112
+ )
113
+
114
+ all_template_str_list.append(template_str)
115
+
116
+ file_content = FILE_HEAD + "\n\n"
117
+ file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
118
+ filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
119
+
120
+ with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
121
+ f.write(file_content)
122
+
123
+
124
+ if __name__ == "__main__":
125
+ remove_old_kernels()
126
+ generate_new_kernels()
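
The generator encodes the group size in group_blocks exactly as its comment says: 0 for the act-order case, -1 for channelwise quantization, and group_size / 16 otherwise. A hypothetical C++ helper mirroring that mapping is sketched below for readers wiring up a launcher; the function name is illustrative and does not appear in this diff.

#include <cassert>

// Hypothetical helper mirroring the generator's encoding:
//   0  -> act-order (g_idx) case
//  -1  -> channelwise quantization
//  n>0 -> group_size == 16 * n
constexpr int group_size_to_group_blocks(int group_size, bool has_act_order) {
  if (has_act_order) return 0;
  if (group_size == -1) return -1;  // channelwise
  return group_size / 16;
}

int main() {
  assert(group_size_to_group_blocks(128, false) == 8);   // fp8 / gptq group_size 128
  assert(group_size_to_group_blocks(16, false) == 1);    // nvfp4 only supports 16
  assert(group_size_to_group_blocks(-1, false) == -1);   // channelwise
  assert(group_size_to_group_blocks(128, true) == 0);    // act-order
  return 0;
}
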
gptq_marlin/gptq_marlin.cu CHANGED
The diff for this file is too large to render. See raw diff
 
gptq_marlin/gptq_marlin_repack.cu CHANGED
@@ -13,7 +13,7 @@ __global__ void gptq_marlin_repack_kernel(
13
  int n_tiles = size_n / tile_n_size;
14
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
15
 
16
- int start_k_tile = blockIdx.x * block_k_tiles;
17
  if (start_k_tile >= k_tiles) {
18
  return;
19
  }
@@ -69,8 +69,8 @@ __global__ void gptq_marlin_repack_kernel(
69
 
70
  if constexpr (has_perm) {
71
  if (threadIdx.x < stage_size) {
72
- int k_id = threadIdx.x / stage_n_threads;
73
- int n_id = threadIdx.x % stage_n_threads;
74
 
75
  uint32_t const* sh_perm_int_ptr =
76
  reinterpret_cast<uint32_t const*>(sh_perm_ptr);
@@ -86,8 +86,8 @@ __global__ void gptq_marlin_repack_kernel(
86
 
87
  } else {
88
  if (threadIdx.x < stage_size) {
89
- int k_id = threadIdx.x / stage_n_threads;
90
- int n_id = threadIdx.x % stage_n_threads;
91
 
92
  int first_k = k_tile_id * tile_k_size;
93
  int first_k_packed = first_k / pack_factor;
@@ -107,8 +107,8 @@ __global__ void gptq_marlin_repack_kernel(
107
  return;
108
  }
109
 
110
- int warp_id = threadIdx.x / 32;
111
- int th_id = threadIdx.x % 32;
112
 
113
  if (warp_id >= 4) {
114
  return;
@@ -330,4 +330,3 @@ torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
330
  {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
331
  options);
332
  }
333
-
 
13
  int n_tiles = size_n / tile_n_size;
14
  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
15
 
16
+ auto start_k_tile = blockIdx.x * block_k_tiles;
17
  if (start_k_tile >= k_tiles) {
18
  return;
19
  }
 
69
 
70
  if constexpr (has_perm) {
71
  if (threadIdx.x < stage_size) {
72
+ auto k_id = threadIdx.x / stage_n_threads;
73
+ auto n_id = threadIdx.x % stage_n_threads;
74
 
75
  uint32_t const* sh_perm_int_ptr =
76
  reinterpret_cast<uint32_t const*>(sh_perm_ptr);
 
86
 
87
  } else {
88
  if (threadIdx.x < stage_size) {
89
+ auto k_id = threadIdx.x / stage_n_threads;
90
+ auto n_id = threadIdx.x % stage_n_threads;
91
 
92
  int first_k = k_tile_id * tile_k_size;
93
  int first_k_packed = first_k / pack_factor;
 
107
  return;
108
  }
109
 
110
+ auto warp_id = threadIdx.x / 32;
111
+ auto th_id = threadIdx.x % 32;
112
 
113
  if (warp_id >= 4) {
114
  return;
 
330
  {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
331
  options);
332
  }
 
gptq_marlin/kernel.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #ifndef MARLIN_NAMESPACE_NAME
3
+ #define MARLIN_NAMESPACE_NAME marlin
4
+ #endif
5
+
6
+ #include "marlin.cuh"
7
+ #include "marlin_dtypes.cuh"
8
+ #include "core/scalar_type.hpp"
9
+
10
+ #define MARLIN_KERNEL_PARAMS \
11
+ const int4 *__restrict__ A, const int4 *__restrict__ B, \
12
+ int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
13
+ const int4 *__restrict__ scales_ptr, \
14
+ const uint16_t *__restrict__ scale2_ptr, \
15
+ const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
16
+ int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
17
+ bool use_atomic_add, bool use_fp32_reduce, int max_shared_mem
18
+
19
+ namespace MARLIN_NAMESPACE_NAME {
20
+ template <typename scalar_t, // compute dtype, half or nv_bfloat16
21
+ const vllm::ScalarTypeId w_type_id, // weight ScalarType id
22
+ const int threads, // number of threads in a threadblock
23
+ const int thread_m_blocks, // number of 16x16 blocks in the m
24
+ // dimension (batchsize) of the
25
+ // threadblock
26
+ const int thread_n_blocks, // same for n dimension (output)
27
+ const int thread_k_blocks, // same for k dimension (reduction)
28
+ const bool m_block_size_8, // whether m_block_size == 8
29
+ // only works when thread_m_blocks == 1
30
+ const int stages, // number of stages for the async global->shared
31
+ // fetch pipeline
32
+ const int group_blocks, // number of consecutive 16x16 blocks
33
+ // with a separate quantization scale
34
+ const bool is_zp_float // is zero point of float16 type?
35
+ >
36
+ __global__ void Marlin(MARLIN_KERNEL_PARAMS);
37
+
38
+ }
gptq_marlin/kernel_bf16_kfe2m1f.cu ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // auto generated by generate.py
2
+ // clang-format off
3
+
4
+ #include "kernel.h"
5
+ #include "marlin_template.h"
6
+
7
+ namespace MARLIN_NAMESPACE_NAME {
8
+
9
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 256, 1, 8, 8, true, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
10
+
11
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 1, 8, 4, true, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
12
+
13
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 1, 4, 8, true, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
14
+
15
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 256, 1, 8, 8, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
16
+
17
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 1, 8, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
18
+
19
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 1, 4, 8, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
20
+
21
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 256, 2, 16, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
22
+
23
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 2, 8, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
24
+
25
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 2, 4, 8, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
26
+
27
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 256, 3, 16, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
28
+
29
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 3, 8, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
30
+
31
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 3, 4, 8, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
32
+
33
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 256, 4, 16, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
34
+
35
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 4, 8, 4, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
36
+
37
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE2M1f.id(), 128, 4, 4, 8, false, pipe_stages, 1, false>( MARLIN_KERNEL_PARAMS );
38
+
39
+ }
gptq_marlin/kernel_bf16_kfe4m3fn.cu ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // auto generated by generate.py
2
+ // clang-format off
3
+
4
+ #include "kernel.h"
5
+ #include "marlin_template.h"
6
+
7
+ namespace MARLIN_NAMESPACE_NAME {
8
+
9
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 1, 8, 8, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
10
+
11
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 8, 4, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
12
+
13
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 4, 8, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
14
+
15
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 1, 8, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
16
+
17
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
18
+
19
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
20
+
21
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 2, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
22
+
23
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 2, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
24
+
25
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 2, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
26
+
27
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 3, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
28
+
29
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 3, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
30
+
31
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 3, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
32
+
33
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 4, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
34
+
35
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 4, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
36
+
37
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 4, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
38
+
39
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 1, 8, 8, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
40
+
41
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 8, 4, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
42
+
43
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 4, 8, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
44
+
45
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 1, 8, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
46
+
47
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
48
+
49
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 1, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
50
+
51
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 2, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
52
+
53
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 2, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
54
+
55
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 2, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
56
+
57
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 3, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
58
+
59
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 3, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
60
+
61
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 3, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
62
+
63
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 256, 4, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
64
+
65
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 4, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
66
+
67
+ template __global__ void Marlin<nv_bfloat16, vllm::kFE4M3fn.id(), 128, 4, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
68
+
69
+ }
gptq_marlin/kernel_bf16_ku4.cu ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // auto generated by generate.py
2
+ // clang-format off
3
+
4
+ #include "kernel.h"
5
+ #include "marlin_template.h"
6
+
7
+ namespace MARLIN_NAMESPACE_NAME {
8
+
9
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
10
+
11
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
12
+
13
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
14
+
15
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
16
+
17
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
18
+
19
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
20
+
21
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 2, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
22
+
23
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
24
+
25
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
26
+
27
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 3, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
28
+
29
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
30
+
31
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
32
+
33
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 4, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
34
+
35
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
36
+
37
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
38
+
39
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, true, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
40
+
41
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, true, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
42
+
43
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, true, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
44
+
45
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
46
+
47
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 2, 16, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 3, 16, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 4, 16, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, true, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, true, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, true, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 2, 16, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 3, 16, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 4, 16, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 1, 8, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 1, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 2, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 2, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 3, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 3, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 256, 4, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4.id(), 128, 4, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ }
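
Each of these generated files contains nothing but explicit instantiations of the Marlin kernel template for one dtype/quantization-type pair, so reviewing them mostly means checking that the instantiation set is complete and duplicate-free. The helper below is an illustrative sketch only (it is not part of this repository); it assumes the one-instantiation-per-line layout used in these generated files and is pointed, as an example, at the kernel_bf16_ku4b8.cu file added next.

# Illustrative helper (not from the repository): extract the template
# argument lists from a generated kernel_*.cu file so the instantiation
# set can be counted or checked for duplicates.
import re
from collections import Counter

# Matches "Marlin<...>( MARLIN_KERNEL_PARAMS );" on a single line.
PATTERN = re.compile(r"Marlin<([^>]+)>\(\s*MARLIN_KERNEL_PARAMS\s*\);")

def list_instantiations(path):
    with open(path) as f:
        text = f.read()
    return [tuple(arg.strip() for arg in m.group(1).split(","))
            for m in PATTERN.finditer(text)]

if __name__ == "__main__":
    configs = list_instantiations("gptq_marlin/kernel_bf16_ku4b8.cu")
    dupes = [c for c, n in Counter(configs).items() if n > 1]
    print(f"{len(configs)} instantiations, {len(dupes)} duplicates")
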
gptq_marlin/kernel_bf16_ku4b8.cu ADDED
@@ -0,0 +1,159 @@
+ // auto generated by generate.py
+ // clang-format off
+
+ #include "kernel.h"
+ #include "marlin_template.h"
+
+ namespace MARLIN_NAMESPACE_NAME {
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, true, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 4, 8, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 4, 8, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 4, 8, false, pipe_stages, 0, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, true, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 4, 8, false, pipe_stages, -1, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, true, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 4, 8, false, pipe_stages, 2, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, true, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 4, 8, false, pipe_stages, 4, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, true, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 1, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 2, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 3, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ template __global__ void Marlin<nv_bfloat16, vllm::kU4B8.id(), 128, 4, 4, 8, false, pipe_stages, 8, false>( MARLIN_KERNEL_PARAMS );
+
+ }
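
The kernel_bf16_ku4b8.cu listing above repeats the same fifteen thread/tile configurations once per group-blocks value (0, -1, 2, 4, 8), consistent with its `// auto generated by generate.py` header; the kU4 file above it uses the same configurations with one fewer group value. Below is a minimal, illustrative sketch of a generator that would reproduce this single file's layout; the table, the names THREAD_CONFIGS, GROUP_BLOCKS, and emit, and the single-file scope are assumptions made for this sketch and are not taken from the repository's actual generator script.

# Illustrative sketch only: emit a Marlin instantiation file in the layout
# seen above. The real generator covers multiple dtypes and quant types.

# (threads, arg2, arg3, arg4, boolean flag) tuples, in the order they
# appear for each group-blocks value in the file above.
THREAD_CONFIGS = [
    (256, 1, 8, 8, "true"), (128, 1, 8, 4, "true"), (128, 1, 4, 8, "true"),
    (256, 1, 8, 8, "false"), (128, 1, 8, 4, "false"), (128, 1, 4, 8, "false"),
    (256, 2, 16, 4, "false"), (128, 2, 8, 4, "false"), (128, 2, 4, 8, "false"),
    (256, 3, 16, 4, "false"), (128, 3, 8, 4, "false"), (128, 3, 4, 8, "false"),
    (256, 4, 16, 4, "false"), (128, 4, 8, 4, "false"), (128, 4, 4, 8, "false"),
]
GROUP_BLOCKS = [0, -1, 2, 4, 8]  # group-blocks values seen in the kU4B8 file

TEMPLATE = (
    "template __global__ void Marlin<{dtype}, {qtype}.id(), "
    "{threads}, {m}, {n}, {k}, {flag}, pipe_stages, {group}, false>"
    "( MARLIN_KERNEL_PARAMS );"
)

def emit(dtype, qtype):
    # File header followed by one instantiation per (group, config) pair.
    lines = ["// auto generated by generate.py", "// clang-format off", "",
             '#include "kernel.h"', '#include "marlin_template.h"', "",
             "namespace MARLIN_NAMESPACE_NAME {", ""]
    for group in GROUP_BLOCKS:
        for threads, m, n, k, flag in THREAD_CONFIGS:
            lines.append(TEMPLATE.format(dtype=dtype, qtype=qtype,
                                         threads=threads, m=m, n=n, k=k,
                                         flag=flag, group=group))
            lines.append("")
    lines.append("}")
    return "\n".join(lines)

if __name__ == "__main__":
    print(emit("nv_bfloat16", "vllm::kU4B8"))
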