EricB (HF Staff) committed
Commit 0cd49c5 · 1 Parent(s): 957a885

Add fp8 support

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.metallib filter=lfs diff=lfs merge=lfs -text
build.toml CHANGED
@@ -13,6 +13,8 @@ src = [
   "paged-attention-metal/attention/paged_attention.metal",
   "paged-attention-metal/cache/copy_blocks.metal",
   "paged-attention-metal/cache/reshape_and_cache.metal",
+  "paged-attention-metal/convert_fp8.metal",
+  "paged-attention-metal/float8.metal",
   "paged-attention-metal/utils.metal",
   "paged-attention-metal/paged_attention.mm",
   "paged-attention-metal/cache.mm",
paged-attention-metal/attention/paged_attention.metal CHANGED
@@ -1,6 +1,7 @@
 // Updated from MLX commit has f70764a
 
 #include "../utils.metal"
+#include "../float8.metal"
 #include <metal_simdgroup>
 #include <metal_stdlib>
 
@@ -529,6 +530,154 @@ inline void from_float(thread Half8_ &dst, Float8_ src) {
   dst.y = y;
 }
 
+// ========================================== FP8 (uchar) vector data types.
+
+// 8-lane uchar vector – Metal only provides up to uchar4, so build our own.
+struct Uchar8_ {
+  uchar4 x;
+  uchar4 y;
+};
+
+// Vec specialisations so Vec<uchar, N>::Type resolves correctly.
+template <> struct Vec<uchar, 1> {
+  using Type = uchar;
+};
+template <> struct Vec<uchar, 2> {
+  using Type = uchar2;
+};
+template <> struct Vec<uchar, 4> {
+  using Type = uchar4;
+};
+template <> struct Vec<uchar, 8> {
+  using Type = Uchar8_;
+};
+
+// General case: not uchar
+template <typename T> inline constexpr bool is_uchar() { return false; }
+
+// Specialization: T is uchar
+template <> inline constexpr bool is_uchar<uchar>() { return true; }
+
+// Generic fallback – will fail to compile if a required specialisation is
+// missing.
+template <typename Vec, typename Quant_vec>
+inline Vec fp8_convert(const thread Quant_vec &, float scale) {
+  static_assert(sizeof(Vec) == 0, "Missing fp8_convert specialisation");
+}
+
+// ========================================== FP8 → float/half/bfloat
+inline float __dequant_single(uchar v, float scale) {
+  return fp8_e4m3_to_float(v) * scale;
+}
+
+// ---- 1-lane ----
+template <>
+inline float fp8_convert<float, uchar>(const thread uchar &in, float scale) {
+  return __dequant_single(in, scale);
+}
+template <>
+inline half fp8_convert<half, uchar>(const thread uchar &in, float scale) {
+  return half(__dequant_single(in, scale));
+}
+template <>
+inline bfloat16_t fp8_convert<bfloat16_t, uchar>(const thread uchar &in,
+                                                 float scale) {
+  return bfloat16_t(__dequant_single(in, scale));
+}
+
+// ---- 2-lane ----
+template <>
+inline float2 fp8_convert<float2, uchar2>(const thread uchar2 &in,
+                                          float scale) {
+  return float2(__dequant_single(in.x, scale), __dequant_single(in.y, scale));
+}
+template <>
+inline half2 fp8_convert<half2, uchar2>(const thread uchar2 &in, float scale) {
+  half2 out;
+  out.x = half(__dequant_single(in.x, scale));
+  out.y = half(__dequant_single(in.y, scale));
+  return out;
+}
+template <>
+inline Bfloat2_ fp8_convert<Bfloat2_, uchar2>(const thread uchar2 &in,
+                                              float scale) {
+  Bfloat2_ out;
+  out.x = bfloat16_t(__dequant_single(in.x, scale));
+  out.y = bfloat16_t(__dequant_single(in.y, scale));
+  return out;
+}
+
+// ---- 4-lane ----
+template <>
+inline float4 fp8_convert<float4, uchar4>(const thread uchar4 &in,
+                                          float scale) {
+  return float4(__dequant_single(in.x, scale), __dequant_single(in.y, scale),
+                __dequant_single(in.z, scale), __dequant_single(in.w, scale));
+}
+template <>
+inline half4 fp8_convert<half4, uchar4>(const thread uchar4 &in, float scale) {
+  half4 out;
+  out.x = half(__dequant_single(in.x, scale));
+  out.y = half(__dequant_single(in.y, scale));
+  out.z = half(__dequant_single(in.z, scale));
+  out.w = half(__dequant_single(in.w, scale));
+  return out;
+}
+template <>
+inline Bfloat4_ fp8_convert<Bfloat4_, uchar4>(const thread uchar4 &in,
+                                              float scale) {
+  Bfloat4_ out;
+  out.x.x = bfloat16_t(__dequant_single(in.x, scale));
+  out.x.y = bfloat16_t(__dequant_single(in.y, scale));
+  out.y.x = bfloat16_t(__dequant_single(in.z, scale));
+  out.y.y = bfloat16_t(__dequant_single(in.w, scale));
+  return out;
+}
+
+// ---- 8-lane ----
+template <>
+inline Float8_ fp8_convert<Float8_, Uchar8_>(const thread Uchar8_ &in,
+                                             float scale) {
+  Float8_ out;
+  out.x =
+      float4(__dequant_single(in.x.x, scale), __dequant_single(in.x.y, scale),
+             __dequant_single(in.x.z, scale), __dequant_single(in.x.w, scale));
+  out.y =
+      float4(__dequant_single(in.y.x, scale), __dequant_single(in.y.y, scale),
+             __dequant_single(in.y.z, scale), __dequant_single(in.y.w, scale));
+  return out;
+}
+template <>
+inline Half8_ fp8_convert<Half8_, Uchar8_>(const thread Uchar8_ &in,
+                                           float scale) {
+  Half8_ out;
+  out.x = half4(half(__dequant_single(in.x.x, scale)),
+                half(__dequant_single(in.x.y, scale)),
+                half(__dequant_single(in.x.z, scale)),
+                half(__dequant_single(in.x.w, scale)));
+  out.y = half4(half(__dequant_single(in.y.x, scale)),
+                half(__dequant_single(in.y.y, scale)),
+                half(__dequant_single(in.y.z, scale)),
+                half(__dequant_single(in.y.w, scale)));
+  return out;
+}
+template <>
+inline Bfloat8_ fp8_convert<Bfloat8_, Uchar8_>(const thread Uchar8_ &in,
+                                               float scale) {
+  Bfloat8_ out;
+  // first 4
+  out.x.x.x = bfloat16_t(__dequant_single(in.x.x, scale));
+  out.x.x.y = bfloat16_t(__dequant_single(in.x.y, scale));
+  out.x.y.x = bfloat16_t(__dequant_single(in.x.z, scale));
+  out.x.y.y = bfloat16_t(__dequant_single(in.x.w, scale));
+  // second 4
+  out.y.x.x = bfloat16_t(__dequant_single(in.y.x, scale));
+  out.y.x.y = bfloat16_t(__dequant_single(in.y.y, scale));
+  out.y.y.x = bfloat16_t(__dequant_single(in.y.z, scale));
+  out.y.y.y = bfloat16_t(__dequant_single(in.y.w, scale));
+  return out;
+}
+
 // ========================================== Dot product utilities
 
 // TODO(EricLBuehler): optimize with vectorization
@@ -602,8 +751,9 @@ inline float block_sum(threadgroup float *red_smem, float sum, uint simd_tid,
 
 constant bool use_partitioning [[function_constant(10)]];
 constant bool use_alibi [[function_constant(20)]];
+constant bool use_fp8_scales [[function_constant(30)]];
 
-template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
+template <typename T, typename CACHE_T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
           int NUM_SIMD_LANES, int PARTITION_SIZE = 0>
 [[kernel]] void paged_attention(
     device float *exp_sums
@@ -615,22 +765,26 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
     device T *out
     [[buffer(2)]], // [num_seqs, num_heads, max_num_partitions, head_size]
     device const T *q [[buffer(3)]], // [num_seqs, num_heads, head_size]
-    device const T *k_cache
+    device const CACHE_T *k_cache
     [[buffer(4)]], // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-    device const T *v_cache
+    device const CACHE_T *v_cache
    [[buffer(5)]], // [num_blocks, num_kv_heads, head_size, block_size]
-    const constant int &num_kv_heads [[buffer(6)]], // [num_heads]
-    const constant float &scale [[buffer(7)]],
-    const constant float &softcapping [[buffer(8)]],
+    const device float *__restrict__ k_scale
+    [[buffer(6)]], // [1] - only used when use_fp8_scales
+    const device float *__restrict__ v_scale
+    [[buffer(7)]], // [1] - only used when use_fp8_scales
+    const constant int &num_kv_heads [[buffer(8)]], // [num_heads]
+    const constant float &scale [[buffer(9)]],
+    const constant float &softcapping [[buffer(10)]],
     device const uint32_t *block_tables
-    [[buffer(9)]], // [num_seqs, max_num_blocks_per_seq]
-    device const uint32_t *context_lens [[buffer(10)]], // [num_seqs]
-    const constant int &max_num_blocks_per_seq [[buffer(11)]],
+    [[buffer(11)]], // [num_seqs, max_num_blocks_per_seq]
+    device const uint32_t *context_lens [[buffer(12)]], // [num_seqs]
+    const constant int &max_num_blocks_per_seq [[buffer(13)]],
     device const float *alibi_slopes
-    [[buffer(12)]], // [num_heads] - only used when use_alibi
-    const constant int &q_stride [[buffer(13)]],
-    const constant int &kv_block_stride [[buffer(14)]],
-    const constant int &kv_head_stride [[buffer(15)]],
+    [[buffer(14)]], // [num_heads] - only used when use_alibi
+    const constant int &q_stride [[buffer(15)]],
+    const constant int &kv_block_stride [[buffer(16)]],
+    const constant int &kv_head_stride [[buffer(17)]],
     threadgroup char *shared_mem [[threadgroup(0)]],
     uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]],
     uint3 threadgroups_per_grid [[threadgroups_per_grid]],
@@ -690,6 +844,7 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
   constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(T)), 1);
   using K_vec = typename Vec<T, VEC_SIZE>::Type;
   using Q_vec = typename Vec<T, VEC_SIZE>::Type;
+  using Quant_vec = typename Vec<CACHE_T, VEC_SIZE>::Type;
 
   constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
   constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
@@ -720,7 +875,7 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
 
   // x == THREAD_GROUP_SIZE * VEC_SIZE
   // Each thread group fetches x elements from the key at a time.
-  constexpr int x = 16 / sizeof(T);
+  constexpr int x = 16 / sizeof(CACHE_T);
   float qk_max = -FLT_MAX;
 
   // Iterate over the key blocks.
@@ -750,14 +905,23 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
 
 #pragma unroll
    for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
-      const device T *k_ptr =
+      const device CACHE_T *k_ptr =
          k_cache + physical_block_number * kv_block_stride +
          kv_head_idx * kv_head_stride + physical_block_offset * x;
      const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
      const int offset1 = (vec_idx * VEC_SIZE) / x;
      const int offset2 = (vec_idx * VEC_SIZE) % x;
-      k_vecs[j] = *reinterpret_cast<const device K_vec *>(
-          k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+
+      if constexpr (is_uchar<CACHE_T>()) {
+        // FP8 support
+        Quant_vec k_vec_quant = *reinterpret_cast<const device Quant_vec *>(
+            k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+        k_vecs[j] = fp8_convert<K_vec, Quant_vec>(k_vec_quant, *k_scale);
+      } else {
+        // Non-FP8 default
+        k_vecs[j] = *reinterpret_cast<const device K_vec *>(
+            k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+      }
    }
 
    // Compute dot product.
@@ -844,6 +1008,7 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
   using V_vec = typename Vec<T, V_VEC_SIZE>::Type;
   using L_vec = typename Vec<T, V_VEC_SIZE>::Type;
   using Float_L_vec = typename FloatVec<L_vec>::Type;
+  using V_quant_vec = typename Vec<CACHE_T, V_VEC_SIZE>::Type;
 
   constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
   constexpr int NUM_ROWS_PER_ITER = NUM_SIMD_LANES / NUM_V_VECS_PER_ROW;
@@ -872,8 +1037,8 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
        logits + token_idx - start_token_idx);
    from_float(logits_vec, logits_float_vec);
 
-    const device T *v_ptr = v_cache + physical_block_number * kv_block_stride +
-                            kv_head_idx * kv_head_stride;
+    const device CACHE_T *v_ptr = v_cache + physical_block_number * kv_block_stride +
+                                  kv_head_idx * kv_head_stride;
 #pragma unroll
    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -883,7 +1048,18 @@ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
      // we should explicitly zero out the values since they may contain NaNs.
      // See
      // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
-      V_vec v_vec = *reinterpret_cast<const device V_vec *>(v_ptr + offset);
+      V_vec v_vec;
+
+      if constexpr (is_uchar<CACHE_T>()) {
+        // FP8 support
+        V_quant_vec v_quant_vec =
+            *reinterpret_cast<const device V_quant_vec *>(v_ptr + offset);
+        v_vec = fp8_convert<V_vec, V_quant_vec>(v_quant_vec, *v_scale);
+      } else {
+        // Non-FP8 default
+        v_vec = *reinterpret_cast<const device V_vec *>(v_ptr + offset);
+      }
+
      if (block_idx == num_context_blocks - 1) {
        thread T *v_vec_ptr = reinterpret_cast<thread T *>(&v_vec);
 #pragma unroll
@@ -1073,36 +1249,38 @@ template <typename T, int HEAD_SIZE, int NUM_THREADS, int NUM_SIMD_LANES,
   }
 }
 
-#define instantiate_paged_attention_inner( \
-    type, head_size, block_size, num_threads, num_simd_lanes, partition_size) \
-  template \
-      [[host_name("paged_attention_" #type "_hs" #head_size "_bs" #block_size \
-                   "_nt" #num_threads "_nsl" #num_simd_lanes \
-                   "_ps" #partition_size)]] [[kernel]] void \
-      paged_attention<type, head_size, block_size, num_threads, \
-                      num_simd_lanes, partition_size>( \
-          device float *exp_sums [[buffer(0)]], \
-          device float *max_logits [[buffer(1)]], \
-          device type *out [[buffer(2)]], device const type *q [[buffer(3)]], \
-          device const type *k_cache [[buffer(4)]], \
-          device const type *v_cache [[buffer(5)]], \
-          const constant int &num_kv_heads [[buffer(6)]], \
-          const constant float &scale [[buffer(7)]], \
-          const constant float &softcapping [[buffer(8)]], \
-          device const uint32_t *block_tables [[buffer(9)]], \
-          device const uint32_t *context_lens [[buffer(10)]], \
-          const constant int &max_num_blocks_per_seq [[buffer(11)]], \
-          device const float *alibi_slopes [[buffer(12)]], \
-          const constant int &q_stride [[buffer(13)]], \
-          const constant int &kv_block_stride [[buffer(14)]], \
-          const constant int &kv_head_stride [[buffer(15)]], \
-          threadgroup char *shared_mem [[threadgroup(0)]], \
-          uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], \
-          uint3 threadgroups_per_grid [[threadgroups_per_grid]], \
-          uint3 thread_position_in_threadgroup \
-          [[thread_position_in_threadgroup]], \
-          uint simd_tid [[simdgroup_index_in_threadgroup]], \
-          uint simd_lid [[thread_index_in_simdgroup]]);
+#define instantiate_paged_attention_inner(type, cache_type, head_size, \
+                                          block_size, num_threads, \
+                                          num_simd_lanes, partition_size) \
+  template [[host_name("paged_attention_" #type "_cache_" #cache_type \
+                       "_hs" #head_size "_bs" #block_size "_nt" #num_threads \
+                       "_nsl" #num_simd_lanes \
+                       "_ps" #partition_size)]] [[kernel]] void \
+  paged_attention<type, cache_type, head_size, block_size, num_threads, \
+                  num_simd_lanes, partition_size>( \
+      device float *exp_sums [[buffer(0)]], \
+      device float *max_logits [[buffer(1)]], \
+      device type *out [[buffer(2)]], device const type *q [[buffer(3)]], \
+      device const cache_type *k_cache [[buffer(4)]], \
+      device const cache_type *v_cache [[buffer(5)]], \
+      const device float *__restrict__ k_scale [[buffer(6)]], \
+      const device float *__restrict__ v_scale [[buffer(7)]], \
+      const constant int &num_kv_heads [[buffer(8)]], \
+      const constant float &scale [[buffer(9)]], \
+      const constant float &softcapping [[buffer(10)]], \
+      device const uint32_t *block_tables [[buffer(11)]], \
+      device const uint32_t *context_lens [[buffer(12)]], \
+      const constant int &max_num_blocks_per_seq [[buffer(13)]], \
+      device const float *alibi_slopes [[buffer(14)]], \
+      const constant int &q_stride [[buffer(15)]], \
+      const constant int &kv_block_stride [[buffer(16)]], \
+      const constant int &kv_head_stride [[buffer(17)]], \
+      threadgroup char *shared_mem [[threadgroup(0)]], \
+      uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], \
+      uint3 threadgroups_per_grid [[threadgroups_per_grid]], \
+      uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], \
+      uint simd_tid [[simdgroup_index_in_threadgroup]], \
+      uint simd_lid [[thread_index_in_simdgroup]]);
 
 #define instantiate_paged_attention_v2_reduce_inner( \
     type, head_size, num_threads, num_simd_lanes, partition_size) \
@@ -1125,26 +1303,35 @@ template <typename T, int HEAD_SIZE, int NUM_THREADS, int NUM_SIMD_LANES,
       uint simd_tid [[simdgroup_index_in_threadgroup]], \
       uint simd_lid [[thread_index_in_simdgroup]]);
 
-#define instantiate_paged_attention_heads(type, block_size, num_threads, \
-                                          num_simd_lanes, partition_size) \
-  instantiate_paged_attention_inner(type, 32, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 64, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 80, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 96, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 112, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 120, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 128, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 192, block_size, num_threads, \
-                                    num_simd_lanes, partition_size); \
-  instantiate_paged_attention_inner(type, 256, block_size, num_threads, \
-                                    num_simd_lanes, partition_size);
+#define instantiate_paged_attention_heads( \
+    type, cache_type, block_size, num_threads, num_simd_lanes, partition_size) \
+  instantiate_paged_attention_inner(type, cache_type, 32, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 64, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 80, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 96, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 112, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 120, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 128, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 192, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size); \
+  instantiate_paged_attention_inner(type, cache_type, 256, block_size, \
+                                    num_threads, num_simd_lanes, \
+                                    partition_size);
 
 #define instantiate_paged_attention_v2_reduce_heads( \
     type, num_threads, num_simd_lanes, partition_size) \
@@ -1167,30 +1354,48 @@ template <typename T, int HEAD_SIZE, int NUM_THREADS, int NUM_SIMD_LANES,
   instantiate_paged_attention_v2_reduce_inner(type, 256, num_threads, \
                                               num_simd_lanes, partition_size);
 
-#define instantiate_paged_attention_block_size(type, num_threads, \
+#define instantiate_paged_attention_block_size(type, cache_type, num_threads, \
                                                num_simd_lanes, partition_size) \
-  instantiate_paged_attention_heads(type, 8, num_threads, num_simd_lanes, \
-                                    partition_size); \
-  instantiate_paged_attention_heads(type, 16, num_threads, num_simd_lanes, \
-                                    partition_size); \
-  instantiate_paged_attention_heads(type, 32, num_threads, num_simd_lanes, \
-                                    partition_size);
+  instantiate_paged_attention_heads(type, cache_type, 8, num_threads, \
+                                    num_simd_lanes, partition_size); \
+  instantiate_paged_attention_heads(type, cache_type, 16, num_threads, \
+                                    num_simd_lanes, partition_size); \
+  instantiate_paged_attention_heads(type, cache_type, 32, num_threads, \
+                                    num_simd_lanes, partition_size);
 
 // TODO: tune num_threads = 256
 // NOTE: partition_size = 0
-#define instantiate_paged_attention_v1(type, num_simd_lanes) \
-  instantiate_paged_attention_block_size(type, 256, num_simd_lanes, 0);
+#define instantiate_paged_attention_v1(type, cache_type, num_simd_lanes) \
+  instantiate_paged_attention_block_size(type, cache_type, 256, \
+                                         num_simd_lanes, 0);
 
 // TODO: tune num_threads = 256
 // NOTE: partition_size = 512
-#define instantiate_paged_attention_v2(type, num_simd_lanes) \
-  instantiate_paged_attention_block_size(type, 256, num_simd_lanes, 512); \
+#define instantiate_paged_attention_v2(type, cache_type, num_simd_lanes) \
+  instantiate_paged_attention_block_size(type, cache_type, 256, \
+                                         num_simd_lanes, 512);
+
+// TODO: tune num_threads = 256
+// NOTE: partition_size = 512
+#define instantiate_paged_attention_v2_reduce(type, num_simd_lanes) \
   instantiate_paged_attention_v2_reduce_heads(type, 256, num_simd_lanes, 512);
 
-instantiate_paged_attention_v1(float, 32);
-instantiate_paged_attention_v1(bfloat16_t, 32);
-instantiate_paged_attention_v1(half, 32);
+instantiate_paged_attention_v1(float, float, 32);
+instantiate_paged_attention_v1(bfloat16_t, bfloat16_t, 32);
+instantiate_paged_attention_v1(half, half, 32);
+
+instantiate_paged_attention_v1(float, uchar, 32);
+instantiate_paged_attention_v1(bfloat16_t, uchar, 32);
+instantiate_paged_attention_v1(half, uchar, 32);
+
+instantiate_paged_attention_v2_reduce(float, 32);
+instantiate_paged_attention_v2_reduce(bfloat16_t, 32);
+instantiate_paged_attention_v2_reduce(half, 32);
+
+instantiate_paged_attention_v2(float, float, 32);
+instantiate_paged_attention_v2(bfloat16_t, bfloat16_t, 32);
+instantiate_paged_attention_v2(half, half, 32);
 
-instantiate_paged_attention_v2(float, 32);
-instantiate_paged_attention_v2(bfloat16_t, 32);
-instantiate_paged_attention_v2(half, 32);
+instantiate_paged_attention_v2(float, uchar, 32);
+instantiate_paged_attention_v2(bfloat16_t, uchar, 32);
+instantiate_paged_attention_v2(half, uchar, 32);
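Note on kernel naming: the updated instantiate_paged_attention_inner macro now encodes both the compute dtype and the cache dtype in the [[host_name]] string. A minimal host-side sketch of that naming scheme (illustrative only; this helper is not part of the commit):

#include <cstdio>
#include <string>

// Mirrors the "paged_attention_<type>_cache_<cache_type>_hs.._bs.._nt.._nsl.._ps.."
// pattern stamped out by the instantiation macro above.
static std::string paged_attention_kernel_name(const std::string &type,
                                               const std::string &cache_type,
                                               int head_size, int block_size,
                                               int num_threads, int num_simd_lanes,
                                               int partition_size) {
  return "paged_attention_" + type + "_cache_" + cache_type + "_hs" +
         std::to_string(head_size) + "_bs" + std::to_string(block_size) + "_nt" +
         std::to_string(num_threads) + "_nsl" + std::to_string(num_simd_lanes) +
         "_ps" + std::to_string(partition_size);
}

int main() {
  // One of the v1 fp8 variants instantiated at the bottom of the file:
  std::printf("%s\n",
              paged_attention_kernel_name("half", "uchar", 128, 16, 256, 32, 0).c_str());
  // -> paged_attention_half_cache_uchar_hs128_bs16_nt256_nsl32_ps0
}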
paged-attention-metal/cache.mm CHANGED
@@ -147,6 +147,9 @@ void copy_blocks(const std::vector<torch::Tensor> &key_caches,
   case torch::kBFloat16:
    kernName = @"copy_blocks_bfloat16_t";
    break;
+  case torch::kUInt8:
+    kernName = @"copy_blocks_uchar";
+    break;
   default:
    TORCH_CHECK(false, "Unsupported dtype for copy_blocks");
  }
@@ -214,6 +217,16 @@ void reshape_and_cache(
    const std::string &kv_cache_dtype, torch::Tensor &k_scale,
    torch::Tensor &v_scale) {
 
+  // Determine cache dtype and FP8 usage
+  torch::ScalarType cache_dtype = key_cache.scalar_type();
+  bool use_fp8_scales = (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3");
+  if (use_fp8_scales) {
+    TORCH_CHECK(cache_dtype == torch::kUInt8, "FP8 cache requires UInt8 tensor type");
+    TORCH_CHECK(k_scale.numel() == 1 && v_scale.numel() == 1, "FP8 scales must be scalars");
+    TORCH_CHECK(k_scale.scalar_type() == torch::kFloat32 && v_scale.scalar_type() == torch::kFloat32,
+                "FP8 scales must be float32");
+  }
+
   TORCH_CHECK(key.device().is_mps() && value.device().is_mps() &&
                  key_cache.device().is_mps() && value_cache.device().is_mps(),
              "All tensors must be on MPS device");
@@ -256,22 +269,51 @@ void reshape_and_cache(
  }
 
   NSString *kernName = nil;
+  std::string kv_dtype_str, cache_dtype_str;
+
+  // Get KV dtype string
   switch (key.scalar_type()) {
   case torch::kFloat:
-    kernName = @"reshape_and_cache_float";
+    kv_dtype_str = "float";
    break;
   case torch::kHalf:
-    kernName = @"reshape_and_cache_half";
+    kv_dtype_str = "half";
    break;
   case torch::kBFloat16:
-    kernName = @"reshape_and_cache_bfloat16_t";
+    kv_dtype_str = "bfloat16_t";
    break;
   default:
    TORCH_CHECK(false, "Unsupported dtype for reshape_and_cache");
  }
-
-  id<MTLFunction> fn = [lib newFunctionWithName:kernName];
-  TORCH_CHECK(fn, "Missing Metal kernel function: ", kernName.UTF8String);
+
+  // Get cache dtype string
+  switch (cache_dtype) {
+  case torch::kFloat:
+    cache_dtype_str = "float";
+    break;
+  case torch::kHalf:
+    cache_dtype_str = "half";
+    break;
+  case torch::kBFloat16:
+    cache_dtype_str = "bfloat16_t";
+    break;
+  case torch::kUInt8:
+    cache_dtype_str = "uchar";
+    break;
+  default:
+    TORCH_CHECK(false, "Unsupported cache dtype for reshape_and_cache");
+  }
+
+  std::string kernName_str = "reshape_and_cache_kv_" + kv_dtype_str + "_cache_" + cache_dtype_str;
+  kernName = [NSString stringWithUTF8String:kernName_str.c_str()];
+
+  // Create function constants for FP8 support
+  MTLFunctionConstantValues *constants = [[MTLFunctionConstantValues alloc] init];
+  [constants setConstantValue:&use_fp8_scales type:MTLDataTypeBool atIndex:10];
+
+  id<MTLFunction> fn = [lib newFunctionWithName:kernName constantValues:constants error:&error];
+  TORCH_CHECK(fn, "Missing Metal kernel function: ", kernName.UTF8String,
+              error ? [NSString stringWithFormat:@": %@", error.localizedDescription].UTF8String : "");
 
   id<MTLComputePipelineState> pso =
      [device newComputePipelineStateWithFunction:fn error:&error];
@@ -305,46 +347,59 @@ void reshape_and_cache(
                           options:MTLResourceStorageModeShared];
    [enc setBuffer:slotMappingBuf offset:0 atIndex:4];
 
+    // k_scale and v_scale buffers (for FP8)
+    if (use_fp8_scales) {
+      [enc setBuffer:getMTLBufferStorage(k_scale)
+              offset:k_scale.storage_offset() * k_scale.element_size()
+             atIndex:5];
+      [enc setBuffer:getMTLBufferStorage(v_scale)
+              offset:v_scale.storage_offset() * v_scale.element_size()
+             atIndex:6];
+    } else {
+      // For non-FP8, we still need to increment buffer indices
+      // The Metal kernel expects buffers at indices 5 and 6 even if unused
+    }
+
    // Set parameters as individual buffers (matching mistralrs pattern)
    id<MTLBuffer> keyStrideBuf =
        [device newBufferWithBytes:&key_stride
                            length:sizeof(int32_t)
                           options:MTLResourceStorageModeShared];
-    [enc setBuffer:keyStrideBuf offset:0 atIndex:5];
+    [enc setBuffer:keyStrideBuf offset:0 atIndex:7];
 
    id<MTLBuffer> valueStrideBuf =
        [device newBufferWithBytes:&value_stride
                            length:sizeof(int32_t)
                           options:MTLResourceStorageModeShared];
-    [enc setBuffer:valueStrideBuf offset:0 atIndex:6];
+    [enc setBuffer:valueStrideBuf offset:0 atIndex:8];
 
    const int32_t num_heads_i32 = static_cast<int32_t>(num_heads);
    id<MTLBuffer> numHeadsBuf =
        [device newBufferWithBytes:&num_heads_i32
                            length:sizeof(int32_t)
                           options:MTLResourceStorageModeShared];
-    [enc setBuffer:numHeadsBuf offset:0 atIndex:7];
+    [enc setBuffer:numHeadsBuf offset:0 atIndex:9];
 
    const int32_t head_size_i32 = static_cast<int32_t>(head_size);
    id<MTLBuffer> headSizeBuf =
        [device newBufferWithBytes:&head_size_i32
                            length:sizeof(int32_t)
                           options:MTLResourceStorageModeShared];
-    [enc setBuffer:headSizeBuf offset:0 atIndex:8];
+    [enc setBuffer:headSizeBuf offset:0 atIndex:10];
 
    const int32_t block_size_i32 = static_cast<int32_t>(block_size);
    id<MTLBuffer> blockSizeBuf =
        [device newBufferWithBytes:&block_size_i32
                            length:sizeof(int32_t)
                           options:MTLResourceStorageModeShared];
-    [enc setBuffer:blockSizeBuf offset:0 atIndex:9];
+    [enc setBuffer:blockSizeBuf offset:0 atIndex:11];
 
    const int32_t x_i32 = static_cast<int32_t>(x);
    id<MTLBuffer> xBuf =
        [device newBufferWithBytes:&x_i32
                            length:sizeof(int32_t)
                           options:MTLResourceStorageModeShared];
-    [enc setBuffer:xBuf offset:0 atIndex:10];
+    [enc setBuffer:xBuf offset:0 atIndex:12];
 
    const uint64_t threads_per_threadgroup =
        std::min<uint64_t>(512, num_heads * head_size);
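Because k_scale and v_scale now occupy buffer indices 5 and 6, every parameter buffer after the slot mapping shifts up by two. A sketch of the index layout the host encoder and reshape_and_cache kernel now agree on (enum names are illustrative; the indices come from the diff):

// Buffer indices shared by cache.mm and reshape_and_cache.metal after this change.
enum ReshapeAndCacheBufferIndex {
  kKey = 0,
  kValue = 1,
  kKeyCache = 2,
  kValueCache = 3,
  kSlotMapping = 4,
  kKScale = 5,  // read only when the use_fp8_scales function constant is set
  kVScale = 6,  // read only when the use_fp8_scales function constant is set
  kKeyStride = 7,
  kValueStride = 8,
  kNumHeads = 9,
  kHeadSize = 10,
  kBlockSize = 11,
  kX = 12,
};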
paged-attention-metal/cache/copy_blocks.metal CHANGED
@@ -48,3 +48,4 @@ template <typename T>
 instantiate_copy_blocks(float);
 instantiate_copy_blocks(bfloat16_t);
 instantiate_copy_blocks(half);
+instantiate_copy_blocks(uchar);
paged-attention-metal/cache/reshape_and_cache.metal CHANGED
@@ -1,23 +1,56 @@
 #include "../utils.metal"
+#include "../float8.metal"
 #include <metal_stdlib>
 
 using namespace metal;
 
-template <typename T>
+template <typename KV_T, typename CACHE_T>
+inline CACHE_T to_cache(KV_T v) = delete;
+
+template <> inline uchar to_cache<float, uchar>(float v) {
+  return float_to_fp8_e4m3(v);
+}
+
+template <> inline uchar to_cache<bfloat16_t, uchar>(bfloat16_t v) {
+  return float_to_fp8_e4m3((float)v);
+}
+
+template <> inline uchar to_cache<half, uchar>(half v) {
+  return float_to_fp8_e4m3((float)v);
+}
+
+template <> inline float to_cache<float, float>(float v) { return v; }
+
+template <> inline bfloat16_t to_cache<bfloat16_t, bfloat16_t>(bfloat16_t v) {
+  return v;
+}
+
+template <> inline half to_cache<half, half>(half v) { return v; }
+
+constant bool use_fp8_scales [[function_constant(10)]];
+
+template <typename KV_T, typename CACHE_T>
 [[kernel]] void reshape_and_cache(
-    const device T *__restrict__ key
+    const device KV_T *__restrict__ key
    [[buffer(0)]], // [num_tokens, num_heads, head_size]
-    const device T *__restrict__ value
+    const device KV_T *__restrict__ value
    [[buffer(1)]], // [num_tokens, num_heads, head_size]
-    device T *__restrict__ key_cache
+    device CACHE_T *__restrict__ key_cache
    [[buffer(2)]], // [num_blocks, num_heads, head_size/x, block_size, x]
-    device T *__restrict__ value_cache
+    device CACHE_T *__restrict__ value_cache
    [[buffer(3)]], // [num_blocks, num_heads, head_size, block_size]
    const device int64_t *__restrict__ slot_mapping
    [[buffer(4)]], // [num_tokens]
-    device const int &key_stride, device const int &value_stride,
-    device const int &num_heads, device const int &head_size,
-    device const int &block_size, device const int &x,
+    const device float *__restrict__ k_scale
+    [[buffer(5)]], // [1] - only used when use_fp8_scales
+    const device float *__restrict__ v_scale
+    [[buffer(6)]], // [1] - only used when use_fp8_scales
+    device const int &key_stride [[buffer(7)]],
+    device const int &value_stride [[buffer(8)]],
+    device const int &num_heads [[buffer(9)]],
+    device const int &head_size [[buffer(10)]],
+    device const int &block_size [[buffer(11)]],
+    device const int &x [[buffer(12)]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threads_per_threadgroup [[threads_per_threadgroup]]) {
@@ -49,29 +82,47 @@ template <typename T>
        block_idx * num_heads * head_size * block_size +
        head_idx * head_size * block_size + head_offset * block_size +
        block_offset;
-    key_cache[tgt_key_idx] = key[src_key_idx];
-    value_cache[tgt_value_idx] = value[src_value_idx];
+
+    if (use_fp8_scales) {
+      key_cache[tgt_key_idx] =
+          to_cache<KV_T, CACHE_T>(KV_T((float)key[src_key_idx] / *k_scale));
+      value_cache[tgt_value_idx] =
+          to_cache<KV_T, CACHE_T>(KV_T((float)value[src_value_idx] / *v_scale));
+    } else {
+      key_cache[tgt_key_idx] = to_cache<KV_T, CACHE_T>(key[src_key_idx]);
+      value_cache[tgt_value_idx] = to_cache<KV_T, CACHE_T>(value[src_value_idx]);
+    }
  }
 }
 
-#define instantiate_reshape_and_cache(type) \
-  template [[host_name("reshape_and_cache_" #type)]] [[kernel]] void \
-  reshape_and_cache<type>( \
-      const device type *__restrict__ key [[buffer(0)]], \
-      const device type *__restrict__ value [[buffer(1)]], \
-      device type *__restrict__ key_cache [[buffer(2)]], \
-      device type *__restrict__ value_cache [[buffer(3)]], \
+#define instantiate_reshape_and_cache(kv_type, cache_type) \
+  template [[host_name("reshape_and_cache_kv_" #kv_type \
+                       "_cache_" #cache_type)]] [[kernel]] void \
+  reshape_and_cache<kv_type, cache_type>( \
+      const device kv_type *__restrict__ key [[buffer(0)]], \
+      const device kv_type *__restrict__ value [[buffer(1)]], \
+      device cache_type *__restrict__ key_cache [[buffer(2)]], \
+      device cache_type *__restrict__ value_cache [[buffer(3)]], \
      const device int64_t *__restrict__ slot_mapping [[buffer(4)]], \
-      device const int &key_stride, device const int &value_stride, \
-      device const int &num_heads, device const int &head_size, \
-      device const int &block_size, device const int &x, \
+      const device float *__restrict__ k_scale [[buffer(5)]], \
+      const device float *__restrict__ v_scale [[buffer(6)]], \
+      device const int &key_stride [[buffer(7)]], \
+      device const int &value_stride [[buffer(8)]], \
+      device const int &num_heads [[buffer(9)]], \
+      device const int &head_size [[buffer(10)]], \
+      device const int &block_size [[buffer(11)]], \
+      device const int &x [[buffer(12)]], \
      uint gid [[threadgroup_position_in_grid]], \
      uint tid [[thread_position_in_threadgroup]], \
      uint threads_per_threadgroup [[threads_per_threadgroup]]);
 
-instantiate_reshape_and_cache(float);
-instantiate_reshape_and_cache(bfloat16_t);
-instantiate_reshape_and_cache(half);
+instantiate_reshape_and_cache(float, float);
+instantiate_reshape_and_cache(bfloat16_t, bfloat16_t);
+instantiate_reshape_and_cache(half, half);
+
+instantiate_reshape_and_cache(float, uchar);
+instantiate_reshape_and_cache(bfloat16_t, uchar);
+instantiate_reshape_and_cache(half, uchar);
 
 // Flash version with different cache layout: [num_blocks, block_size,
 // num_heads, head_size]
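The write path above divides each value by the per-tensor scale before encoding to E4M3, and the attention kernel multiplies by the same scale after decoding, so a cached entry comes back as roughly (key / k_scale) rounded to fp8, times k_scale. A minimal host-side round-trip check of that convention (a sketch only: the decode mirrors fp8_e4m3_to_float from float8.metal, and the encoder is a brute-force nearest-code search used purely as a test oracle, not the kernel's rounding path):

#include <cmath>
#include <cstdint>
#include <cstdio>

static float decode_e4m3(uint8_t v) {
  const int s = v >> 7, exp = (v >> 3) & 0xF, man = v & 0x7;
  float val;
  if (exp == 0)
    val = std::ldexp(man / 8.0f, -6);            // zero / subnormal
  else if (exp == 0xF)
    val = man ? NAN : INFINITY;                  // as treated in float8.metal
  else
    val = std::ldexp(1.0f + man / 8.0f, exp - 7);
  return s ? -val : val;
}

static uint8_t encode_e4m3_nearest(float x) {
  uint8_t best = 0;
  float best_err = INFINITY;
  for (int c = 0; c < 256; ++c) {
    const float d = decode_e4m3(static_cast<uint8_t>(c));
    if (!std::isfinite(d)) continue;             // skip NaN / Inf codes
    const float err = std::fabs(d - x);
    if (err < best_err) { best_err = err; best = static_cast<uint8_t>(c); }
  }
  return best;
}

int main() {
  const float k_scale = 0.05f, key = 1.37f;
  const uint8_t stored = encode_e4m3_nearest(key / k_scale);  // what reshape_and_cache writes
  const float restored = decode_e4m3(stored) * k_scale;       // what paged_attention reads back
  std::printf("key=%.4f restored=%.4f\n", key, restored);     // equal up to fp8 rounding
}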
paged-attention-metal/convert_fp8.metal ADDED
@@ -0,0 +1,77 @@
+#include "float8.metal"
+#include "utils.metal"
+#include <metal_stdlib>
+
+using namespace metal;
+
+// Convert between different precision formats for cache tensors
+// This kernel handles conversions like float->fp8, fp8->float, etc.
+
+template <typename SRC_T, typename DST_T>
+[[kernel]] void convert_fp8_kernel(
+    const device SRC_T *__restrict__ src [[buffer(0)]],
+    device DST_T *__restrict__ dst [[buffer(1)]],
+    const device float &scale [[buffer(2)]],
+    const device uint32_t &num_elements [[buffer(3)]],
+    uint gid [[thread_position_in_grid]]) {
+
+  if (gid >= num_elements) {
+    return;
+  }
+
+  // Load source value
+  SRC_T src_val = src[gid];
+
+  // Convert based on source and destination types
+  if constexpr (is_same_v<SRC_T, uchar> && !is_same_v<DST_T, uchar>) {
+    // FP8 -> higher precision (dequantization)
+    float fp32_val = fp8_e4m3_to_float(src_val) * scale;
+    dst[gid] = static_cast<DST_T>(fp32_val);
+  } else if constexpr (!is_same_v<SRC_T, uchar> && is_same_v<DST_T, uchar>) {
+    // Higher precision -> FP8 (quantization)
+    float fp32_val = static_cast<float>(src_val) / scale;
+    dst[gid] = float_to_fp8_e4m3(fp32_val);
+  } else if constexpr (is_same_v<SRC_T, uchar> && is_same_v<DST_T, uchar>) {
+    // FP8 -> FP8 (with rescaling)
+    float fp32_val = fp8_e4m3_to_float(src_val) * scale;
+    dst[gid] = float_to_fp8_e4m3(fp32_val);
+  } else {
+    // Regular precision -> regular precision (with scaling)
+    float fp32_val = static_cast<float>(src_val) * scale;
+    dst[gid] = static_cast<DST_T>(fp32_val);
+  }
+}
+
+// Instantiate all required combinations
+#define INSTANTIATE_CONVERT_FP8(src_type, dst_type) \
+  template [[host_name("convert_fp8_" #src_type "_to_" #dst_type)]] \
+  [[kernel]] void convert_fp8_kernel<src_type, dst_type>( \
+      const device src_type *__restrict__ src [[buffer(0)]], \
+      device dst_type *__restrict__ dst [[buffer(1)]], \
+      const device float &scale [[buffer(2)]], \
+      const device uint32_t &num_elements [[buffer(3)]], \
+      uint gid [[thread_position_in_grid]]);
+
+// FP8 to other formats (dequantization)
+INSTANTIATE_CONVERT_FP8(uchar, float);
+INSTANTIATE_CONVERT_FP8(uchar, half);
+INSTANTIATE_CONVERT_FP8(uchar, bfloat16_t);
+
+// Other formats to FP8 (quantization)
+INSTANTIATE_CONVERT_FP8(float, uchar);
+INSTANTIATE_CONVERT_FP8(half, uchar);
+INSTANTIATE_CONVERT_FP8(bfloat16_t, uchar);
+
+// FP8 to FP8 (rescaling)
+INSTANTIATE_CONVERT_FP8(uchar, uchar);
+
+// Regular precision conversions with scaling
+INSTANTIATE_CONVERT_FP8(float, float);
+INSTANTIATE_CONVERT_FP8(float, half);
+INSTANTIATE_CONVERT_FP8(float, bfloat16_t);
+INSTANTIATE_CONVERT_FP8(half, float);
+INSTANTIATE_CONVERT_FP8(half, half);
+INSTANTIATE_CONVERT_FP8(half, bfloat16_t);
+INSTANTIATE_CONVERT_FP8(bfloat16_t, float);
+INSTANTIATE_CONVERT_FP8(bfloat16_t, half);
+INSTANTIATE_CONVERT_FP8(bfloat16_t, bfloat16_t);
paged-attention-metal/convert_fp8.mm CHANGED
@@ -1,3 +1,5 @@
+#include <ATen/mps/MPSDevice.h>
+#include <ATen/mps/MPSStream.h>
 #include <torch/torch.h>
 
 #import <Foundation/Foundation.h>
@@ -24,7 +26,113 @@ static std::string getModuleDirectory() {
   return ".";
 }
 
+// Helper function to get conversion kernel name
+static std::string getConvertKernelName(torch::ScalarType src_dtype, torch::ScalarType dst_dtype) {
+  std::string src_str, dst_str;
+
+  auto dtype_to_string = [](torch::ScalarType dtype) -> std::string {
+    switch (dtype) {
+    case torch::kFloat: return "float";
+    case torch::kHalf: return "half";
+    case torch::kBFloat16: return "bfloat16_t";
+    case torch::kUInt8: return "uchar";
+    default:
+      TORCH_CHECK(false, "Unsupported dtype for convert_fp8: ", dtype);
+    }
+  };
+
+  src_str = dtype_to_string(src_dtype);
+  dst_str = dtype_to_string(dst_dtype);
+
+  return "convert_fp8_" + src_str + "_to_" + dst_str;
+}
+
 void convert_fp8(torch::Tensor &dst_cache, torch::Tensor &src_cache,
                 const double scale, const std::string &kv_cache_dtype) {
-  TORCH_CHECK(false, "fp8 is not supported on Metal.");
+  // Validate input tensors
+  TORCH_CHECK(src_cache.device().is_mps() && dst_cache.device().is_mps(),
+              "Both tensors must be on MPS device");
+  TORCH_CHECK(src_cache.device() == dst_cache.device(),
+              "Source and destination tensors must be on the same device");
+  TORCH_CHECK(src_cache.numel() == dst_cache.numel(),
+              "Source and destination tensors must have the same number of elements");
+  TORCH_CHECK(src_cache.is_contiguous() && dst_cache.is_contiguous(),
+              "Both tensors must be contiguous");
+
+  const uint32_t num_elements = static_cast<uint32_t>(src_cache.numel());
+  if (num_elements == 0) {
+    return; // Nothing to convert
+  }
+
+  // Determine conversion kernel name
+  std::string kernel_name = getConvertKernelName(src_cache.scalar_type(), dst_cache.scalar_type());
+
+  @autoreleasepool {
+    at::mps::MPSStream *stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+
+    id<MTLDevice> device = stream->device();
+    id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
+    TORCH_CHECK(cmdBuf, "Failed to get command buffer");
+
+    // Load Metal library
+    std::string moduleDir = getModuleDirectory();
+    std::string metallibPath = moduleDir + "/" + METALLIB_PATH;
+    NSString *metallibPathStr = [NSString stringWithUTF8String:metallibPath.c_str()];
+    NSURL *metallibURL = [NSURL fileURLWithPath:metallibPathStr];
+    NSError *error = nil;
+    id<MTLLibrary> lib = [device newLibraryWithURL:metallibURL error:&error];
+    TORCH_CHECK(lib, "Failed to load Metal library at ", metallibPath, ": ",
+                error ? error.localizedDescription.UTF8String : "unknown error");
+
+    // Create kernel function
+    NSString *kernelNameStr = [NSString stringWithUTF8String:kernel_name.c_str()];
+    id<MTLFunction> fn = [lib newFunctionWithName:kernelNameStr];
+    TORCH_CHECK(fn, "Failed to find Metal kernel function: ", kernel_name);
+
+    id<MTLComputePipelineState> pso = [device newComputePipelineStateWithFunction:fn error:&error];
+    TORCH_CHECK(pso, "Failed to create compute pipeline state: ",
+                error ? error.localizedDescription.UTF8String : "unknown error");
+
+    dispatch_queue_t q = stream->queue();
+    dispatch_sync(q, ^{
+      id<MTLComputeCommandEncoder> enc = [cmdBuf computeCommandEncoder];
+      TORCH_CHECK(enc, "Failed to create compute encoder");
+
+      [enc setComputePipelineState:pso];
+
+      // Set buffers
+      [enc setBuffer:getMTLBufferStorage(src_cache)
+              offset:src_cache.storage_offset() * src_cache.element_size()
+             atIndex:0];
+      [enc setBuffer:getMTLBufferStorage(dst_cache)
+              offset:dst_cache.storage_offset() * dst_cache.element_size()
+             atIndex:1];
+
+      // Set scale parameter
+      float scale_f32 = static_cast<float>(scale);
+      id<MTLBuffer> scaleBuf = [device newBufferWithBytes:&scale_f32
+                                                   length:sizeof(float)
+                                                  options:MTLResourceStorageModeShared];
+      [enc setBuffer:scaleBuf offset:0 atIndex:2];
+
+      // Set num_elements parameter
+      id<MTLBuffer> numElementsBuf = [device newBufferWithBytes:&num_elements
+                                                         length:sizeof(uint32_t)
+                                                        options:MTLResourceStorageModeShared];
+      [enc setBuffer:numElementsBuf offset:0 atIndex:3];
+
+      // Dispatch threads
+      const uint32_t threads_per_threadgroup = std::min<uint32_t>(1024, num_elements);
+      const uint32_t threadgroups = (num_elements + threads_per_threadgroup - 1) / threads_per_threadgroup;
+
+      MTLSize threadsPerThreadgroup = MTLSizeMake(threads_per_threadgroup, 1, 1);
+      MTLSize threadgroupsPerGrid = MTLSizeMake(threadgroups, 1, 1);
+
+      [enc dispatchThreadgroups:threadgroupsPerGrid threadsPerThreadgroup:threadsPerThreadgroup];
+      [enc endEncoding];
+    });
+
+    stream->synchronize(at::mps::SyncType::COMMIT);
+  }
 }
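Taken together, convert_fp8 now picks a convert_fp8_<src>_to_<dst> pipeline from the two tensor dtypes and applies the scale in the direction implied by the types (multiply when dequantizing from uchar, divide when quantizing to uchar). A hypothetical call site as a sketch only (assumes the extension's convert_fp8 declaration is visible and an MPS device is available):

#include <torch/torch.h>

// Declared by the extension (see convert_fp8.mm above).
void convert_fp8(torch::Tensor &dst_cache, torch::Tensor &src_cache,
                 const double scale, const std::string &kv_cache_dtype);

void quantize_kv_cache_example() {
  auto mps = torch::TensorOptions().device(torch::kMPS);
  torch::Tensor src = torch::randn({4096}, mps.dtype(torch::kHalf));
  torch::Tensor dst = torch::empty({4096}, mps.dtype(torch::kUInt8));
  // Dispatches "convert_fp8_half_to_uchar"; each element is stored as
  // float_to_fp8_e4m3(src[i] / scale).
  convert_fp8(dst, src, /*scale=*/0.05, /*kv_cache_dtype=*/"fp8");
}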
paged-attention-metal/float8.metal ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <metal_stdlib>
2
+ using namespace metal;
3
+
4
+ // Helpers ------------------------------------------------------------
5
+ static inline uint as_bits(float x) { return as_type<uint>(x); }
6
+ static inline float from_bits(uint b) { return as_type<float>(b); }
7
+
8
+ // -------------------------------------------------------------------
9
+ // FP8 E4M3 (bias = 7)
10
+ // -------------------------------------------------------------------
11
+ inline float fp8_e4m3_to_float(uchar v) {
12
+ const uint s = v >> 7;
13
+ const uint exp = (v >> 3) & 0xF;
14
+ const uint man = v & 0x7;
15
+
16
+ if (exp == 0) { // zero / sub-normal
17
+ if (man == 0)
18
+ return s ? -0.f : 0.f;
19
+ const float m = float(man) / 8.f; // already scaled by 2^-3
20
+ float val = ldexp(m, 1 - 7); // 2^(1-bias) = 2^-6
21
+ return s ? -val : val;
22
+ }
23
+
24
+ if (exp == 0xF) { // Inf / NaN (E4M3FN keeps only NaN)
25
+ if (man != 0)
26
+ return NAN;
27
+ return s ? -INFINITY : INFINITY;
28
+ }
29
+
30
+ const float m = 1.f + float(man) / 8.f;
31
+ float val = ldexp(m, int(exp) - 7);
32
+ return s ? -val : val;
33
+ }
34
+
35
+ // -------------------------------------------------------------------
36
+ // FP8 E5M2 (bias = 15)
37
+ // -------------------------------------------------------------------
38
+ inline float fp8_e5m2_to_float(uchar v) {
39
+ const uint s = v >> 7;
40
+ const uint exp = (v >> 2) & 0x1F;
41
+ const uint man = v & 0x3;
42
+
43
+ if (exp == 0) {
44
+ if (man == 0)
45
+ return s ? -0.f : 0.f;
46
+ const float m = float(man) / 4.f;
47
+ float val = ldexp(m, 1 - 15); // 2^(1-bias) = 2^-14
48
+ return s ? -val : val;
49
+ }
50
+
51
+ if (exp == 0x1F) {
52
+ if (man != 0)
53
+ return NAN;
54
+ return s ? -INFINITY : INFINITY;
55
+ }
56
+
57
+ const float m = 1.f + float(man) / 4.f;
58
+ float val = ldexp(m, int(exp) - 15);
59
+ return s ? -val : val;
60
+ }
61
+
62
+ // -------------------------------------------------------------------
63
+ // Encoding helpers (round-to-nearest-even, gradual under-flow, sat-to-∞)
64
+ // -------------------------------------------------------------------
65
+ namespace detail {
66
+ template <int EXP_BITS, int MAN_BITS, int BIAS>
67
+ inline uchar fp32_to_fp8(float f) {
68
+ const uint bits = as_bits(f);
69
+ const uint s = bits >> 31;
70
+ const uint abs = bits & 0x7FFFFFFF;
71
+
72
+ // NaN propagates, Inf saturates
73
+ if (abs >= 0x7F800000u) {
74
+ return uchar((s << 7) | (((1u << EXP_BITS) - 1u) << MAN_BITS) |
75
+ (abs != 0x7F800000u));
76
+ }
77
+
78
+ int e = int((abs >> 23) & 0xFF) - 127; // unbiased exponent
79
+ uint m = abs & 0x7FFFFFu; // 23-bit mantissa
80
+ const int EXP_MAX = (1 << EXP_BITS) - 2; // last finite exponent
81
+
82
+ // ---------- Normal path -------------------------------------------------
83
+ int e_fp8 = e + BIAS;
84
+ if (e_fp8 >= 1 && e_fp8 <= EXP_MAX) {
85
+ // round-to-nearest-even
86
+ const int shift = 23 - MAN_BITS;
87
+ uint mant = m >> shift;
88
+ const uint lsb = mant & 1u;
89
+ const uint round = (m >> (shift - 1)) & 1u;
90
+ const uint sticky = (m & ((1u << (shift - 1)) - 1u)) != 0u;
91
+ mant += (round & (sticky | lsb));
92
+ if (mant >> MAN_BITS) { // mantissa overflow
93
+ mant = 0;
94
+ ++e_fp8;
95
+ if (e_fp8 > EXP_MAX)
96
+ return uchar((s << 7) | (((1u << EXP_BITS) - 1u) << MAN_BITS)); // ∞
97
+ }
98
+ return uchar((s << 7) | (uint(e_fp8) << MAN_BITS) |
99
+ (mant & ((1u << MAN_BITS) - 1u)));
100
+ }
101
+
102
+ // ---------- Sub-normal / under-flow ------------------------------------
103
+ if (e_fp8 < 1 - MAN_BITS) // too small -> ±0
104
+ return uchar(s << 7);
105
+
106
+ // shift so that exponent becomes 1
107
+ int rshift = (1 - e_fp8) + (23 - MAN_BITS);
108
+ uint mant = (0x800000u | m); // implicit 1
109
+ uint rounded = (mant + (1u << (rshift - 1))) >> rshift;
110
+ if (rounded == 0)
111
+ return uchar(s << 7); // rounds to zero
112
+
113
+ return uchar((s << 7) | (rounded & ((1u << MAN_BITS) - 1u)));
114
+ }
115
+ } // namespace detail
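
To make the rounding path above concrete, here is one hand-worked E4M3 encoding under this scheme (an illustration, not part of the commit):

$$
3.2 = 1.6 \cdot 2^{1} \;\Rightarrow\; e_{\text{fp8}} = 1 + 7 = 8,\qquad 0.6 \cdot 8 = 4.8 \xrightarrow{\text{RNE}} 5
$$
$$
\text{byte} = 0\,1000\,101_2 = \mathtt{0x45} \;\Rightarrow\; \left(1 + \tfrac{5}{8}\right) \cdot 2^{8-7} = 3.25
$$

The round bit is set and the sticky bits are non-zero, so the mantissa rounds up from 4 to 5 regardless of the even/odd check.
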
116
+
117
+ inline uchar float_to_fp8_e4m3(float f) {
118
+ return detail::fp32_to_fp8<4, 3, 7>(f);
119
+ }
120
+ inline uchar float_to_fp8_e5m2(float f) {
121
+ return detail::fp32_to_fp8<5, 2, 15>(f);
122
+ }
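
When debugging these kernels it can help to have a host-side mirror of the decode path. Below is a minimal pure-Python sketch of the E4M3 decoder above (illustrative only; it is not part of this commit, and the function name is just for the example):

```python
# Minimal pure-Python mirror of fp8_e4m3_to_float above, useful as a
# host-side cross-check when debugging the Metal kernels.
import math


def fp8_e4m3_to_float(v: int) -> float:
    s = (v >> 7) & 0x1
    exp = (v >> 3) & 0xF
    man = v & 0x7
    sign = -1.0 if s else 1.0
    if exp == 0:                      # zero / subnormal
        return sign * math.ldexp(man / 8.0, 1 - 7)
    if exp == 0xF:                    # Inf / NaN, mirroring the Metal code
        return math.nan if man else sign * math.inf
    return sign * math.ldexp(1.0 + man / 8.0, exp - 7)


if __name__ == "__main__":
    # 0x48 -> +4.0, 0xC8 -> -4.0, 0x01 -> smallest subnormal 2**-9
    for byte in (0x48, 0xC8, 0x01):
        print(hex(byte), fp8_e4m3_to_float(byte))
```
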
paged-attention-metal/paged_attention.mm CHANGED
@@ -28,7 +28,9 @@ static std::string getModuleDirectory() {
28
 
29
  // Helper function to get kernel name based on dtype and parameters
30
  static std::string getKernelName(const std::string &base_name,
31
- torch::ScalarType dtype, int head_size,
 
 
32
  int block_size, int num_threads,
33
  int num_simd_lanes, int partition_size = 0) {
34
  std::string dtype_str;
@@ -46,8 +48,26 @@ static std::string getKernelName(const std::string &base_name,
46
  TORCH_CHECK(false, "Unsupported dtype for paged attention: ", dtype);
47
  }
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  std::string kernel_name =
50
- base_name + "_" + dtype_str + "_hs" + std::to_string(head_size) + "_bs" +
51
  std::to_string(block_size) + "_nt" + std::to_string(num_threads) +
52
  "_nsl" + std::to_string(num_simd_lanes);
53
 
@@ -106,12 +126,19 @@ void paged_attention_v1(
106
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
107
 
108
  // Validate block sparse is not supported yet
109
- // TODO: support blocksparse, k/v scale.
110
  TORCH_CHECK(
111
  !is_block_sparse,
112
  "Block sparse attention is not yet supported in Metal implementation");
113
- if (kv_cache_dtype != "auto") {
114
- TORCH_CHECK(false, "fp8 is not supported on Metal.");
 
 
 
 
 
 
 
115
  }
116
 
117
  // Validate input tensors
@@ -147,7 +174,7 @@ void paged_attention_v1(
147
 
148
  // Get kernel name - v1 kernels have partition_size=0 in their name
149
  std::string kernel_name =
150
- getKernelName("paged_attention", query.scalar_type(), head_size,
151
  block_size, num_threads, num_simd_lanes, partition_size);
152
 
153
  @autoreleasepool {
@@ -174,6 +201,7 @@ void paged_attention_v1(
174
  type:MTLDataTypeBool
175
  atIndex:10];
176
  [constants setConstantValue:&use_alibi type:MTLDataTypeBool atIndex:20];
 
177
 
178
  NSString *kernelNameStr =
179
  [NSString stringWithUTF8String:kernel_name.c_str()];
@@ -233,6 +261,18 @@ void paged_attention_v1(
233
  offset:value_cache.storage_offset() * value_cache.element_size()
234
  atIndex:buffer_idx++];
235
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  // num_kv_heads
237
  int32_t num_kv_heads_i32 = static_cast<int32_t>(num_kv_heads);
238
  [enc setBytes:&num_kv_heads_i32
@@ -324,13 +364,20 @@ void paged_attention_v2(
324
  const int64_t blocksparse_head_sliding_step) {
325
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
326
 
327
- // TODO: support blocksparse, k/v scale.
328
  // Validate block sparse is not supported yet
329
  TORCH_CHECK(
330
  !is_block_sparse,
331
  "Block sparse attention is not yet supported in Metal implementation");
332
- if (kv_cache_dtype != "auto") {
333
- TORCH_CHECK(false, "fp8 is not supported on Metal.");
 
 
 
 
 
 
 
334
  }
335
 
336
  // Validate input tensors
@@ -365,7 +412,7 @@ void paged_attention_v2(
365
 
366
  // Get kernel names
367
  std::string kernel_name =
368
- getKernelName("paged_attention", query.scalar_type(), head_size,
369
  block_size, num_threads, num_simd_lanes, partition_size);
370
  // Reduce kernel doesn't have block_size in its name
371
  std::string reduce_kernel_name = "paged_attention_v2_reduce";
@@ -427,6 +474,9 @@ void paged_attention_v2(
427
  [mainConstants setConstantValue:&use_alibi
428
  type:MTLDataTypeBool
429
  atIndex:20];
 
 
 
430
 
431
  NSString *kernelNameStr =
432
  [NSString stringWithUTF8String:kernel_name.c_str()];
@@ -485,6 +535,18 @@ void paged_attention_v2(
485
  offset:value_cache.storage_offset() * value_cache.element_size()
486
  atIndex:buffer_idx++];
487
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  // num_kv_heads
489
  int32_t num_kv_heads_i32 = static_cast<int32_t>(num_kv_heads);
490
  [enc setBytes:&num_kv_heads_i32
 
28
 
29
  // Helper function to get kernel name based on dtype and parameters
30
  static std::string getKernelName(const std::string &base_name,
31
+ torch::ScalarType dtype,
32
+ torch::ScalarType cache_dtype,
33
+ int head_size,
34
  int block_size, int num_threads,
35
  int num_simd_lanes, int partition_size = 0) {
36
  std::string dtype_str;
 
48
  TORCH_CHECK(false, "Unsupported dtype for paged attention: ", dtype);
49
  }
50
 
51
+ std::string cache_dtype_str;
52
+ switch (cache_dtype) {
53
+ case torch::kFloat:
54
+ cache_dtype_str = "float";
55
+ break;
56
+ case torch::kHalf:
57
+ cache_dtype_str = "half";
58
+ break;
59
+ case torch::kBFloat16:
60
+ cache_dtype_str = "bfloat16_t";
61
+ break;
62
+ case torch::kUInt8:
63
+ cache_dtype_str = "uchar";
64
+ break;
65
+ default:
66
+ TORCH_CHECK(false, "Unsupported cache dtype for paged attention: ", cache_dtype);
67
+ }
68
+
69
  std::string kernel_name =
70
+ base_name + "_" + dtype_str + "_cache_" + cache_dtype_str + "_hs" + std::to_string(head_size) + "_bs" +
71
  std::to_string(block_size) + "_nt" + std::to_string(num_threads) +
72
  "_nsl" + std::to_string(num_simd_lanes);
73
 
 
126
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
127
 
128
  // Validate block sparse is not supported yet
129
+ // TODO: support blocksparse.
130
  TORCH_CHECK(
131
  !is_block_sparse,
132
  "Block sparse attention is not yet supported in Metal implementation");
133
+
134
+ // Determine cache dtype based on kv_cache_dtype
135
+ torch::ScalarType cache_dtype = key_cache.scalar_type();
136
+ bool use_fp8_scales = (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3");
137
+ if (use_fp8_scales) {
138
+ TORCH_CHECK(cache_dtype == torch::kUInt8, "FP8 cache requires UInt8 tensor type");
139
+ TORCH_CHECK(k_scale.numel() == 1 && v_scale.numel() == 1, "FP8 scales must be scalars");
140
+ TORCH_CHECK(k_scale.scalar_type() == torch::kFloat32 && v_scale.scalar_type() == torch::kFloat32,
141
+ "FP8 scales must be float32");
142
  }
143
 
144
  // Validate input tensors
 
174
 
175
  // Get kernel name - v1 kernels have partition_size=0 in their name
176
  std::string kernel_name =
177
+ getKernelName("paged_attention", query.scalar_type(), cache_dtype, head_size,
178
  block_size, num_threads, num_simd_lanes, partition_size);
179
 
180
  @autoreleasepool {
 
201
  type:MTLDataTypeBool
202
  atIndex:10];
203
  [constants setConstantValue:&use_alibi type:MTLDataTypeBool atIndex:20];
204
+ [constants setConstantValue:&use_fp8_scales type:MTLDataTypeBool atIndex:30];
205
 
206
  NSString *kernelNameStr =
207
  [NSString stringWithUTF8String:kernel_name.c_str()];
 
261
  offset:value_cache.storage_offset() * value_cache.element_size()
262
  atIndex:buffer_idx++];
263
 
264
+ // k_scale and v_scale (for FP8)
265
+ if (use_fp8_scales) {
266
+ [enc setBuffer:getMTLBufferStorage(k_scale)
267
+ offset:k_scale.storage_offset() * k_scale.element_size()
268
+ atIndex:buffer_idx++];
269
+ [enc setBuffer:getMTLBufferStorage(v_scale)
270
+ offset:v_scale.storage_offset() * v_scale.element_size()
271
+ atIndex:buffer_idx++];
272
+ } else {
273
+ buffer_idx += 2; // Skip k_scale and v_scale buffer slots
274
+ }
275
+
276
  // num_kv_heads
277
  int32_t num_kv_heads_i32 = static_cast<int32_t>(num_kv_heads);
278
  [enc setBytes:&num_kv_heads_i32
 
364
  const int64_t blocksparse_head_sliding_step) {
365
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
366
 
367
+ // TODO: support blocksparse.
368
  // Validate block sparse is not supported yet
369
  TORCH_CHECK(
370
  !is_block_sparse,
371
  "Block sparse attention is not yet supported in Metal implementation");
372
+
373
+ // Determine cache dtype based on kv_cache_dtype
374
+ torch::ScalarType cache_dtype = key_cache.scalar_type();
375
+ bool use_fp8_scales = (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3");
376
+ if (use_fp8_scales) {
377
+ TORCH_CHECK(cache_dtype == torch::kUInt8, "FP8 cache requires UInt8 tensor type");
378
+ TORCH_CHECK(k_scale.numel() == 1 && v_scale.numel() == 1, "FP8 scales must be scalars");
379
+ TORCH_CHECK(k_scale.scalar_type() == torch::kFloat32 && v_scale.scalar_type() == torch::kFloat32,
380
+ "FP8 scales must be float32");
381
  }
382
 
383
  // Validate input tensors
 
412
 
413
  // Get kernel names
414
  std::string kernel_name =
415
+ getKernelName("paged_attention", query.scalar_type(), cache_dtype, head_size,
416
  block_size, num_threads, num_simd_lanes, partition_size);
417
  // Reduce kernel doesn't have block_size in its name
418
  std::string reduce_kernel_name = "paged_attention_v2_reduce";
 
474
  [mainConstants setConstantValue:&use_alibi
475
  type:MTLDataTypeBool
476
  atIndex:20];
477
+ [mainConstants setConstantValue:&use_fp8_scales
478
+ type:MTLDataTypeBool
479
+ atIndex:30];
480
 
481
  NSString *kernelNameStr =
482
  [NSString stringWithUTF8String:kernel_name.c_str()];
 
535
  offset:value_cache.storage_offset() * value_cache.element_size()
536
  atIndex:buffer_idx++];
537
 
538
+ // k_scale and v_scale (for FP8)
539
+ if (use_fp8_scales) {
540
+ [enc setBuffer:getMTLBufferStorage(k_scale)
541
+ offset:k_scale.storage_offset() * k_scale.element_size()
542
+ atIndex:buffer_idx++];
543
+ [enc setBuffer:getMTLBufferStorage(v_scale)
544
+ offset:v_scale.storage_offset() * v_scale.element_size()
545
+ atIndex:buffer_idx++];
546
+ } else {
547
+ buffer_idx += 2; // Skip k_scale and v_scale buffer slots
548
+ }
549
+
550
  // num_kv_heads
551
  int32_t num_kv_heads_i32 = static_cast<int32_t>(num_kv_heads);
552
  [enc setBytes:&num_kv_heads_i32
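
Since the specialization name now also encodes the cache dtype, a quick way to see which Metal function a given configuration resolves to is to mirror the string construction on the host. A rough Python sketch of the naming scheme follows (hypothetical helper; the partition-size handling done elsewhere in the .mm file is omitted, and the parameter values are just typical choices for illustration):

```python
# Rough mirror of getKernelName's naming scheme after this change
# (illustrative only; the real logic lives in paged_attention.mm).
def kernel_name(base, dtype, cache_dtype, head_size, block_size,
                num_threads, num_simd_lanes):
    return (f"{base}_{dtype}_cache_{cache_dtype}_hs{head_size}"
            f"_bs{block_size}_nt{num_threads}_nsl{num_simd_lanes}")

# e.g. a bfloat16 model with an FP8 (uchar) KV cache:
print(kernel_name("paged_attention", "bfloat16_t", "uchar",
                  128, 16, 256, 32))
# paged_attention_bfloat16_t_cache_uchar_hs128_bs16_nt256_nsl32
```
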
tests/kernels/test_attention.py CHANGED
@@ -34,7 +34,7 @@ HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
34
  BLOCK_SIZES = [16, 32]
35
  USE_ALIBI = [False, True]
36
  if current_platform.is_mps():
37
- KV_CACHE_DTYPE = ["auto"]
38
  else:
39
  KV_CACHE_DTYPE = ["auto", "fp8"]
40
  SEEDS = [0]
 
34
  BLOCK_SIZES = [16, 32]
35
  USE_ALIBI = [False, True]
36
  if current_platform.is_mps():
37
+ KV_CACHE_DTYPE = ["auto", "fp8"]
38
  else:
39
  KV_CACHE_DTYPE = ["auto", "fp8"]
40
  SEEDS = [0]
tests/kernels/test_cache.py CHANGED
@@ -8,7 +8,7 @@ from paged_attention.platforms import current_platform
8
 
9
  from .utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
10
 
11
- COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
12
  DTYPES = [torch.half, torch.bfloat16, torch.float]
13
  NUM_TOKENS = [42] # Arbitrary values for testing
14
  NUM_LAYERS = [1] # Arbitrary values for testing
@@ -28,7 +28,7 @@ else:
28
  DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
29
 
30
  if current_platform.is_mps():
31
- KV_CACHE_DTYPE = ["auto"]
32
  else:
33
  KV_CACHE_DTYPE = ["auto", "fp8"]
34
 
@@ -226,10 +226,10 @@ def test_reshape_and_cache(
226
 
227
  if kv_cache_dtype == "fp8":
228
  torch.testing.assert_close(
229
- result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1
230
  )
231
  torch.testing.assert_close(
232
- result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1
233
  )
234
  else:
235
  torch.testing.assert_close(key_cache, cloned_key_cache)
@@ -258,6 +258,9 @@ def test_reshape_and_cache_flash(
258
  device: str,
259
  kv_cache_dtype: str,
260
  ) -> None:
 
 
 
261
  current_platform.seed_everything(seed)
262
  torch.set_default_device(device)
263
 
@@ -346,10 +349,10 @@ def test_reshape_and_cache_flash(
346
 
347
  if kv_cache_dtype == "fp8":
348
  torch.testing.assert_close(
349
- result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1
350
  )
351
  torch.testing.assert_close(
352
- result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1
353
  )
354
  else:
355
  torch.testing.assert_close(key_cache, cloned_key_cache)
@@ -387,8 +390,8 @@ def test_swap_blocks(
387
 
388
  current_platform.seed_everything(seed)
389
 
390
- src_device = device if direction[0] == "cuda" else "cpu"
391
- dst_device = device if direction[1] == "cuda" else "cpu"
392
 
393
  src_blocks = random.sample(range(num_blocks), num_mappings)
394
  # For the same device, mapping must not overlap
@@ -474,8 +477,6 @@ def test_fp8_e4m3_conversion(
474
  seed: int,
475
  device: str,
476
  ) -> None:
477
- if current_platform.is_mps():
478
- pytest.skip()
479
  current_platform.seed_everything(seed)
480
 
481
  low = -224.0
@@ -490,4 +491,60 @@ def test_fp8_e4m3_conversion(
490
  converted_cache = torch.empty_like(cache)
491
  ops.convert_fp8(converted_cache, cache_fp8)
492
 
493
- torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
8
 
9
  from .utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
10
 
11
+ COPYING_DIRECTION = [("gpu", "cpu"), ("gpu", "gpu"), ("cpu", "gpu")]
12
  DTYPES = [torch.half, torch.bfloat16, torch.float]
13
  NUM_TOKENS = [42] # Arbitrary values for testing
14
  NUM_LAYERS = [1] # Arbitrary values for testing
 
28
  DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
29
 
30
  if current_platform.is_mps():
31
+ KV_CACHE_DTYPE = ["auto", "fp8"]
32
  else:
33
  KV_CACHE_DTYPE = ["auto", "fp8"]
34
 
 
226
 
227
  if kv_cache_dtype == "fp8":
228
  torch.testing.assert_close(
229
+ result_key_cache, cloned_key_cache, atol=0.02, rtol=0.2
230
  )
231
  torch.testing.assert_close(
232
+ result_value_cache, cloned_value_cache, atol=0.02, rtol=0.2
233
  )
234
  else:
235
  torch.testing.assert_close(key_cache, cloned_key_cache)
 
258
  device: str,
259
  kv_cache_dtype: str,
260
  ) -> None:
261
+ # Flash variant doesn't support FP8 on MPS devices yet
262
+ if current_platform.is_mps() and kv_cache_dtype == "fp8":
263
+ pytest.skip("reshape_and_cache_flash doesn't support FP8 on MPS")
264
  current_platform.seed_everything(seed)
265
  torch.set_default_device(device)
266
 
 
349
 
350
  if kv_cache_dtype == "fp8":
351
  torch.testing.assert_close(
352
+ result_key_cache, cloned_key_cache, atol=0.02, rtol=0.2
353
  )
354
  torch.testing.assert_close(
355
+ result_value_cache, cloned_value_cache, atol=0.02, rtol=0.2
356
  )
357
  else:
358
  torch.testing.assert_close(key_cache, cloned_key_cache)
 
390
 
391
  current_platform.seed_everything(seed)
392
 
393
+ src_device = device if direction[0] == "gpu" else "cpu"
394
+ dst_device = device if direction[1] == "gpu" else "cpu"
395
 
396
  src_blocks = random.sample(range(num_blocks), num_mappings)
397
  # For the same device, mapping must not overlap
 
477
  seed: int,
478
  device: str,
479
  ) -> None:
 
 
480
  current_platform.seed_everything(seed)
481
 
482
  low = -224.0
 
491
  converted_cache = torch.empty_like(cache)
492
  ops.convert_fp8(converted_cache, cache_fp8)
493
 
494
+ torch.testing.assert_close(cache, converted_cache, atol=0.02, rtol=0.2)
495
+
496
+
497
+ @pytest.mark.parametrize("src_dtype", [torch.float, torch.half, torch.bfloat16, torch.uint8])
498
+ @pytest.mark.parametrize("dst_dtype", [torch.float, torch.half, torch.bfloat16, torch.uint8])
499
+ @pytest.mark.parametrize("scale", [1.0, 0.5, 2.0, 0.1])
500
+ @pytest.mark.parametrize("device", DEVICES)
501
+ @torch.inference_mode()
502
+ def test_convert_fp8_comprehensive(
503
+ src_dtype: torch.dtype,
504
+ dst_dtype: torch.dtype,
505
+ scale: float,
506
+ device: str,
507
+ ) -> None:
508
+ """Test comprehensive FP8 conversion between all supported types"""
509
+ if current_platform.is_mps() and device != "mps:0":
510
+ pytest.skip()
511
+ if not current_platform.is_mps() and device == "mps:0":
512
+ pytest.skip()
513
+
514
+ current_platform.seed_everything(0)
515
+ torch.set_default_device(device)
516
+
517
+ # Create test tensor with reasonable values for FP8 range
518
+ shape = (32, 8, 16, 16) # Small tensor for fast testing
519
+ if src_dtype == torch.uint8:
520
+ # Create FP8 data by converting from float
521
+ src_float = torch.randn(shape, dtype=torch.float, device=device) * 0.1
522
+ src_cache = torch.empty(shape, dtype=torch.uint8, device=device)
523
+ ops.convert_fp8(src_cache, src_float, 1.0, "fp8")
524
+ else:
525
+ # Create source data in range suitable for FP8 conversion
526
+ src_cache = torch.randn(shape, dtype=src_dtype, device=device) * 0.1
527
+
528
+ # Perform conversion
529
+ dst_cache = torch.empty_like(src_cache, dtype=dst_dtype, device=device)
530
+ ops.convert_fp8(dst_cache, src_cache, scale, "fp8")
531
+
532
+ # Verify the tensor was modified (not all zeros)
533
+ assert not torch.allclose(dst_cache.float(), torch.zeros_like(dst_cache.float()))
534
+
535
+ # For round-trip tests (same type), verify approximate equality
536
+ if src_dtype == dst_dtype and scale == 1.0:
537
+ if src_dtype == torch.uint8:
538
+ # FP8 -> FP8 should be identity with scale=1.0
539
+ torch.testing.assert_close(src_cache, dst_cache)
540
+ else:
541
+ # Non-FP8 -> Non-FP8 should be identity with scale=1.0
542
+ torch.testing.assert_close(src_cache, dst_cache, atol=1e-6, rtol=1e-5)
543
+
544
+ # For FP8 round-trip tests (float -> FP8 -> float), verify reasonable approximation
545
+ if src_dtype != torch.uint8 and dst_dtype == torch.uint8 and scale == 1.0:
546
+ # Convert back to verify round-trip accuracy
547
+ roundtrip = torch.empty_like(src_cache, dtype=src_dtype, device=device)
548
+ ops.convert_fp8(roundtrip, dst_cache, 1.0, "fp8")
549
+ # FP8 has limited precision, so use relaxed tolerances
550
+ torch.testing.assert_close(src_cache, roundtrip, atol=0.02, rtol=0.2)
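
Condensed, the round trip the new test exercises looks like the sketch below (assuming the same ops module the tests import; the device, shapes, and import path are illustrative):

```python
import torch
from paged_attention import ops  # assumed import path, as in the tests

x = torch.randn(32, 8, 16, 16, dtype=torch.float, device="mps") * 0.1

# float -> FP8 (stored as uint8) -> float
x_fp8 = torch.empty_like(x, dtype=torch.uint8)
ops.convert_fp8(x_fp8, x, 1.0, "fp8")

x_back = torch.empty_like(x)
ops.convert_fp8(x_back, x_fp8, 1.0, "fp8")

# E4M3 keeps only ~2 significant digits, hence the relaxed tolerances
torch.testing.assert_close(x, x_back, atol=0.02, rtol=0.2)
```
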