Eric Buehler commited on
Commit
05b1349
·
1 Parent(s): 6d6d594

Add metal kernels

Browse files
README.md CHANGED
@@ -1,3 +1,12 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - kernel
5
+ ---
6
+
7
+ ![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/paged-attention)
8
+
9
+
10
+ ## attention
11
+
12
+ Paged attention kernels from [vLLM](https://github.com/vllm-project/).
build.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ name = "paged_attention"
3
+
4
+ [torch]
5
+ src = [
6
+ "torch-ext/torch_binding.cpp",
7
+ "torch-ext/torch_binding.h"
8
+ ]
9
+
10
+ [kernel.activation_metal]
11
+ backend = "metal"
12
+ src = [
13
+ "paged-attention-metal/attention/paged_attention.metal",
14
+ "paged-attention-metal/cache/copy_blocks.metal",
15
+ "paged-attention-metal/cache/reshape_and_cache.metal",
16
+ "paged-attention-metal/utils.metal",
17
+ "paged-attention-metal/paged_attention.mm",
18
+ ]
19
+ depends = [ "torch" ]
flake.lock ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1733328505,
6
+ "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-utils": {
19
+ "inputs": {
20
+ "systems": "systems"
21
+ },
22
+ "locked": {
23
+ "lastModified": 1731533236,
24
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
25
+ "owner": "numtide",
26
+ "repo": "flake-utils",
27
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
28
+ "type": "github"
29
+ },
30
+ "original": {
31
+ "owner": "numtide",
32
+ "repo": "flake-utils",
33
+ "type": "github"
34
+ }
35
+ },
36
+ "kernel-builder": {
37
+ "inputs": {
38
+ "flake-compat": "flake-compat",
39
+ "flake-utils": "flake-utils",
40
+ "nixpkgs": "nixpkgs",
41
+ "rocm-nix": "rocm-nix"
42
+ },
43
+ "locked": {
44
+ "lastModified": 1744976941,
45
+ "narHash": "sha256-+csrhVaT6Mj2j1FM7P2BDITvf1Xwj2AKdMm0IKZK340=",
46
+ "owner": "huggingface",
47
+ "repo": "kernel-builder",
48
+ "rev": "0a278c2e9aaf6003a4ec6fe35c7158624762de5a",
49
+ "type": "github"
50
+ },
51
+ "original": {
52
+ "owner": "huggingface",
53
+ "repo": "kernel-builder",
54
+ "type": "github"
55
+ }
56
+ },
57
+ "nixpkgs": {
58
+ "locked": {
59
+ "lastModified": 1743559129,
60
+ "narHash": "sha256-7gpAWsENV3tY2HmeHYQ2MoQxGpys+jQWnkS/BHAMXVk=",
61
+ "owner": "nixos",
62
+ "repo": "nixpkgs",
63
+ "rev": "adae22bea8bcc0aa2fd6e8732044660fb7755f5e",
64
+ "type": "github"
65
+ },
66
+ "original": {
67
+ "owner": "nixos",
68
+ "ref": "nixos-unstable-small",
69
+ "repo": "nixpkgs",
70
+ "type": "github"
71
+ }
72
+ },
73
+ "rocm-nix": {
74
+ "inputs": {
75
+ "nixpkgs": [
76
+ "kernel-builder",
77
+ "nixpkgs"
78
+ ]
79
+ },
80
+ "locked": {
81
+ "lastModified": 1743085847,
82
+ "narHash": "sha256-uWG29p+nhZmGRV1LffWwRGjwtPIXeu1F0YTQbXgB+GU=",
83
+ "owner": "huggingface",
84
+ "repo": "rocm-nix",
85
+ "rev": "245cdc9bfb4bfafa818711c5f5e0b889afe1ba39",
86
+ "type": "github"
87
+ },
88
+ "original": {
89
+ "owner": "huggingface",
90
+ "repo": "rocm-nix",
91
+ "type": "github"
92
+ }
93
+ },
94
+ "root": {
95
+ "inputs": {
96
+ "kernel-builder": "kernel-builder"
97
+ }
98
+ },
99
+ "systems": {
100
+ "locked": {
101
+ "lastModified": 1681028828,
102
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
103
+ "owner": "nix-systems",
104
+ "repo": "default",
105
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
106
+ "type": "github"
107
+ },
108
+ "original": {
109
+ "owner": "nix-systems",
110
+ "repo": "default",
111
+ "type": "github"
112
+ }
113
+ }
114
+ },
115
+ "root": "root",
116
+ "version": 7
117
+ }
flake.nix ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ description = "Flake for attention kernels";
3
+
4
+ inputs = {
5
+ kernel-builder.url = "github:huggingface/kernel-builder";
6
+ };
7
+
8
+ outputs =
9
+ {
10
+ self,
11
+ kernel-builder,
12
+ }:
13
+ kernel-builder.lib.genFlakeOutputs {
14
+ path = ./.;
15
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
16
+ };
17
+ }
paged-attention-metal/attention/pagedattention.metal ADDED
@@ -0,0 +1,1187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Updated from MLX commit has f70764a
2
+
3
+ #include "utils.metal"
4
+ #include <metal_simdgroup>
5
+ #include <metal_stdlib>
6
+
7
+ using namespace metal;
8
+
9
+ // ========================================== Generic vector types
10
+
11
+ // A vector type to store Q, K, V elements.
12
+ template <typename T, int VEC_SIZE> struct Vec {};
13
+
14
+ // A vector type to store FP32 accumulators.
15
+ template <typename T> struct FloatVec {};
16
+
17
+ // Template vector operations.
18
+ template <typename Acc, typename A, typename B> inline Acc mul(A a, B b);
19
+
20
+ template <typename T> inline float sum(T v);
21
+
22
+ template <typename T> inline float dot(T a, T b) {
23
+ return sum(mul<T, T, T>(a, b));
24
+ }
25
+
26
+ template <typename A, typename T> inline float dot(T a, T b) {
27
+ return sum(mul<A, T, T>(a, b));
28
+ }
29
+
30
+ // FP32 vector data types.
31
+ struct Float8_ {
32
+ float4 x;
33
+ float4 y;
34
+ };
35
+
36
+ template <> struct Vec<float, 1> {
37
+ using Type = float;
38
+ };
39
+ template <> struct Vec<float, 2> {
40
+ using Type = float2;
41
+ };
42
+ template <> struct Vec<float, 4> {
43
+ using Type = float4;
44
+ };
45
+ template <> struct Vec<float, 8> {
46
+ using Type = Float8_;
47
+ };
48
+
49
+ template <> struct FloatVec<float> {
50
+ using Type = float;
51
+ };
52
+ template <> struct FloatVec<float2> {
53
+ using Type = float2;
54
+ };
55
+ template <> struct FloatVec<float4> {
56
+ using Type = float4;
57
+ };
58
+ template <> struct FloatVec<Float8_> {
59
+ using Type = Float8_;
60
+ };
61
+
62
+ template <> inline float mul(float a, float b) { return a * b; }
63
+
64
+ template <> inline float2 mul(float2 a, float2 b) { return a * b; }
65
+
66
+ template <> inline float4 mul(float4 a, float4 b) { return a * b; }
67
+
68
+ template <> inline Float8_ mul(Float8_ a, Float8_ b) {
69
+ Float8_ c;
70
+ c.x = a.x * b.x;
71
+ c.y = a.y * b.y;
72
+ return c;
73
+ }
74
+
75
+ template <> inline float sum(float a) { return a; }
76
+
77
+ template <> inline float sum(float2 a) { return a.x + a.y; }
78
+
79
+ template <> inline float sum(float4 a) { return a.x + a.y + a.z + a.w; }
80
+
81
+ template <> inline float sum(Float8_ a) { return sum(a.x) + sum(a.y); }
82
+
83
+ inline Float8_ fma(Float8_ a, Float8_ b, Float8_ c) {
84
+ Float8_ res;
85
+ res.x = fma(a.x, b.x, c.x);
86
+ res.y = fma(a.y, b.y, c.y);
87
+ return res;
88
+ }
89
+
90
+ inline void from_float(thread float &dst, float src) { dst = src; }
91
+ inline void from_float(thread float2 &dst, float2 src) { dst = src; }
92
+ inline void from_float(thread float4 &dst, float4 src) { dst = src; }
93
+ inline void from_float(thread Float8_ &dst, Float8_ src) { dst = src; }
94
+
95
+ // BF16 vector data types.
96
+ // #if defined(__HAVE_BFLOAT__)
97
+
98
+ // struct Bfloat8_ {
99
+ // bfloat4 x;
100
+ // bfloat4 y;
101
+ // };
102
+
103
+ // template<>
104
+ // struct Vec<bfloat, 1> {
105
+ // using Type = bfloat;
106
+ // };
107
+ // template<>
108
+ // struct Vec<bfloat, 2> {
109
+ // using Type = bfloat2;
110
+ // };
111
+ // template<>
112
+ // struct Vec<bfloat, 4> {
113
+ // using Type = bfloat4;
114
+ // };
115
+ // template<>
116
+ // struct Vec<bfloat, 8> {
117
+ // using Type = Bfloat8_;
118
+ // };
119
+
120
+ // template<>
121
+ // struct FloatVec<bfloat> {
122
+ // using Type = float;
123
+ // };
124
+ // template<>
125
+ // struct FloatVec<bfloat2> {
126
+ // using Type = float2;
127
+ // };
128
+ // template<>
129
+ // struct FloatVec<bfloat4> {
130
+ // using Type = float4;
131
+ // };
132
+ // template<>
133
+ // struct FloatVec<Bfloat8_> {
134
+ // using Type = Float8_;
135
+ // };
136
+
137
+ // template<>
138
+ // inline float mul(bfloat a, bfloat b) {
139
+ // return (float)a * (float)b;
140
+ // }
141
+ // template<>
142
+ // inline bfloat mul(bfloat a, bfloat b) {
143
+ // return a*b;
144
+ // }
145
+
146
+ // template<>
147
+ // inline float2 mul(bfloat2 a, bfloat2 b) {
148
+ // return (float2)a * (float2)b;
149
+ // }
150
+ // template<>
151
+ // inline bfloat2 mul(bfloat2 a, bfloat2 b) {
152
+ // return a * b;
153
+ // }
154
+
155
+ // template<>
156
+ // inline float4 mul(bfloat4 a, bfloat4 b) {
157
+ // return (float4)a * (float4)b;
158
+ // }
159
+ // template<>
160
+ // inline bfloat4 mul(bfloat4 a, bfloat4 b) {
161
+ // return a * b;
162
+ // }
163
+
164
+ // template<>
165
+ // inline Float8_ mul(Bfloat8_ a, Bfloat8_ b) {
166
+ // Float8_ c;
167
+ // c.x = mul<float4, bfloat4, bfloat4>(a.x, b.x);
168
+ // c.y = mul<float4, bfloat4, bfloat4>(a.y, b.y);
169
+ // return c;
170
+ // }
171
+ // template<>
172
+ // inline Bfloat8_ mul(Bfloat8_ a, Bfloat8_ b) {
173
+ // Bfloat8_ c;
174
+ // c.x = mul<bfloat4, bfloat4, bfloat4>(a.x, b.x);
175
+ // c.y = mul<bfloat4, bfloat4, bfloat4>(a.y, b.y);
176
+ // return c;
177
+ // }
178
+
179
+ // template<>
180
+ // inline float sum(bfloat a) {
181
+ // return (float)a;
182
+ // }
183
+
184
+ // template<>
185
+ // inline float sum(bfloat2 a) {
186
+ // return (float)a.x + (float)a.y;
187
+ // }
188
+
189
+ // template<>
190
+ // inline float sum(bfloat4 a) {
191
+ // return sum(a.x) + sum(a.y);
192
+ // }
193
+
194
+ // template<>
195
+ // inline float sum(Bfloat8_ a) {
196
+ // return sum(a.x) + sum(a.y);
197
+ // }
198
+
199
+ // inline float fma(bfloat a, bfloat b, float c) {
200
+ // return (float)a * (float)b + c;
201
+ // }
202
+
203
+ // inline float2 fma(bfloat2 a, bfloat2 b, float2 c) {
204
+ // return (float2)a * (float2)b + c;
205
+ // }
206
+
207
+ // inline float4 fma(bfloat4 a, bfloat4 b, float4 c) {
208
+ // return (float4)a * (float4)b + c;
209
+ // }
210
+
211
+ // inline Float8_ fma(Bfloat8_ a, Bfloat8_ b, Float8_ c) {
212
+ // Float8_ res;
213
+ // res.x = fma((float4)a.x, (float4)b.x, (float4)c.x);
214
+ // res.y = fma((float4)a.y, (float4)b.y, (float4)c.y);
215
+ // return res;
216
+ // }
217
+ // inline Bfloat8_ fma(Bfloat8_ a, Bfloat8_ b, Bfloat8_ c) {
218
+ // Bfloat8_ res;
219
+ // res.x = (bfloat4)fma((float4)a.x, (float4)b.x, (float4)c.x);
220
+ // res.y = (bfloat4)fma((float4)a.y, (float4)b.x, (float4)c.y);
221
+ // return c;
222
+ // }
223
+
224
+ // inline void from_float(thread bfloat& dst, float src) {
225
+ // dst = static_cast<bfloat>(src);
226
+ // }
227
+ // inline void from_float(thread bfloat2& dst, float2 src) {
228
+ // dst.x = static_cast<bfloat>(src.x);
229
+ // dst.y = static_cast<bfloat>(src.y);
230
+ // }
231
+ // inline void from_float(thread bfloat4& dst, float4 src) {
232
+ // dst.x = static_cast<bfloat>(src.x);
233
+ // dst.y = static_cast<bfloat>(src.y);
234
+ // dst.z = static_cast<bfloat>(src.z);
235
+ // dst.w = static_cast<bfloat>(src.w);
236
+ // }
237
+ // inline void from_float(thread Bfloat8_& dst, Float8_ src) {
238
+ // bfloat4 x;
239
+ // bfloat4 y;
240
+ // from_float(x, src.x);
241
+ // from_float(y, src.y);
242
+ // dst.x = x;
243
+ // dst.y = y;
244
+ // }
245
+
246
+ // #else
247
+
248
+ struct Bfloat2_ {
249
+ bfloat16_t x;
250
+ bfloat16_t y;
251
+ };
252
+
253
+ struct Bfloat4_ {
254
+ Bfloat2_ x;
255
+ Bfloat2_ y;
256
+ };
257
+
258
+ struct Bfloat8_ {
259
+ Bfloat4_ x;
260
+ Bfloat4_ y;
261
+ };
262
+
263
+ template <> struct Vec<bfloat16_t, 1> {
264
+ using Type = bfloat16_t;
265
+ };
266
+ template <> struct Vec<bfloat16_t, 2> {
267
+ using Type = Bfloat2_;
268
+ };
269
+ template <> struct Vec<bfloat16_t, 4> {
270
+ using Type = Bfloat4_;
271
+ };
272
+ template <> struct Vec<bfloat16_t, 8> {
273
+ using Type = Bfloat8_;
274
+ };
275
+
276
+ template <> struct FloatVec<bfloat16_t> {
277
+ using Type = float;
278
+ };
279
+ template <> struct FloatVec<Bfloat2_> {
280
+ using Type = float2;
281
+ };
282
+ template <> struct FloatVec<Bfloat4_> {
283
+ using Type = float4;
284
+ };
285
+ template <> struct FloatVec<Bfloat8_> {
286
+ using Type = Float8_;
287
+ };
288
+
289
+ template <> inline float mul(bfloat16_t a, bfloat16_t b) {
290
+ return (float)a * (float)b;
291
+ }
292
+ template <> inline bfloat16_t mul(bfloat16_t a, bfloat16_t b) { return a * b; }
293
+
294
+ template <> inline float2 mul(Bfloat2_ a, Bfloat2_ b) {
295
+ float2 a_f((float)a.x, (float)a.y);
296
+ float2 b_f((float)b.x, (float)b.y);
297
+ return a_f * b_f;
298
+ }
299
+ template <> inline Bfloat2_ mul(Bfloat2_ a, Bfloat2_ b) {
300
+ Bfloat2_ c;
301
+ c.x = a.x * b.x;
302
+ c.y = a.y * b.y;
303
+ return c;
304
+ }
305
+
306
+ template <> inline float4 mul(Bfloat4_ a, Bfloat4_ b) {
307
+ float2 x = mul<float2, Bfloat2_, Bfloat2_>(a.x, b.x);
308
+ float2 y = mul<float2, Bfloat2_, Bfloat2_>(a.y, b.y);
309
+ float4 c;
310
+ c.x = x.x;
311
+ c.y = x.y;
312
+ c.z = y.x;
313
+ c.w = y.y;
314
+ return c;
315
+ }
316
+ template <> inline Bfloat4_ mul(Bfloat4_ a, Bfloat4_ b) {
317
+ Bfloat4_ c;
318
+ c.x = mul<Bfloat2_, Bfloat2_, Bfloat2_>(a.x, b.x);
319
+ c.y = mul<Bfloat2_, Bfloat2_, Bfloat2_>(a.y, b.y);
320
+ return c;
321
+ }
322
+
323
+ template <> inline Float8_ mul(Bfloat8_ a, Bfloat8_ b) {
324
+ Float8_ c;
325
+ c.x = mul<float4, Bfloat4_, Bfloat4_>(a.x, b.x);
326
+ c.y = mul<float4, Bfloat4_, Bfloat4_>(a.y, b.y);
327
+ return c;
328
+ }
329
+ template <> inline Bfloat8_ mul(Bfloat8_ a, Bfloat8_ b) {
330
+ Bfloat8_ c;
331
+ c.x = mul<Bfloat4_, Bfloat4_, Bfloat4_>(a.x, b.x);
332
+ c.y = mul<Bfloat4_, Bfloat4_, Bfloat4_>(a.y, b.y);
333
+ return c;
334
+ }
335
+
336
+ template <> inline float sum(bfloat16_t a) { return (float)a; }
337
+
338
+ template <> inline float sum(Bfloat2_ a) { return (float)a.x + (float)a.y; }
339
+
340
+ template <> inline float sum(Bfloat4_ a) { return sum(a.x) + sum(a.y); }
341
+
342
+ template <> inline float sum(Bfloat8_ a) { return sum(a.x) + sum(a.y); }
343
+
344
+ inline float fma(bfloat16_t a, bfloat16_t b, float c) {
345
+ return (float)a * (float)b + c;
346
+ }
347
+ inline bfloat16_t fma(bfloat16_t a, bfloat16_t b, bfloat16_t c) {
348
+ return a * b + c;
349
+ }
350
+
351
+ inline float2 fma(Bfloat2_ a, Bfloat2_ b, float2 c) {
352
+ float2 a_f((float)a.x, (float)a.y);
353
+ float2 b_f((float)b.x, (float)b.y);
354
+ return a_f * b_f + c;
355
+ }
356
+ inline Bfloat2_ fma(Bfloat2_ a, Bfloat2_ b, Bfloat2_ c) {
357
+ Bfloat2_ res;
358
+ res.x = a.x * b.x + c.x;
359
+ res.y = a.y * b.y + c.y;
360
+ return res;
361
+ }
362
+
363
+ inline float4 fma(Bfloat4_ a, Bfloat4_ b, float4 c) {
364
+ float4 res;
365
+ res.x = fma(a.x.x, b.x.x, c.x);
366
+ res.y = fma(a.x.y, b.x.y, c.y);
367
+ res.z = fma(a.y.x, b.y.x, c.z);
368
+ res.w = fma(a.y.y, b.y.y, c.w);
369
+ return res;
370
+ }
371
+ inline Bfloat4_ fma(Bfloat4_ a, Bfloat4_ b, Bfloat4_ c) {
372
+ Bfloat4_ res;
373
+ res.x = fma(a.x, b.x, c.x);
374
+ res.y = fma(a.y, b.y, c.y);
375
+ return res;
376
+ }
377
+
378
+ inline Float8_ fma(Bfloat8_ a, Bfloat8_ b, Float8_ c) {
379
+ float4 x = fma(a.x, b.x, c.x);
380
+ float4 y = fma(a.y, b.y, c.y);
381
+ Float8_ res;
382
+ res.x = x;
383
+ res.y = y;
384
+ return res;
385
+ }
386
+ inline Bfloat8_ fma(Bfloat8_ a, Bfloat8_ b, Bfloat8_ c) {
387
+ Bfloat8_ res;
388
+ res.x = fma(a.x, b.x, c.x);
389
+ res.y = fma(a.y, b.y, c.y);
390
+ return res;
391
+ }
392
+
393
+ inline void from_float(thread bfloat16_t &dst, float src) {
394
+ dst = static_cast<bfloat16_t>(src);
395
+ }
396
+ inline void from_float(thread Bfloat2_ &dst, float2 src) {
397
+ dst.x = static_cast<bfloat16_t>(src.x);
398
+ dst.y = static_cast<bfloat16_t>(src.y);
399
+ }
400
+ inline void from_float(thread Bfloat4_ &dst, float4 src) {
401
+ dst.x.x = static_cast<bfloat16_t>(src.x);
402
+ dst.x.y = static_cast<bfloat16_t>(src.y);
403
+ dst.y.x = static_cast<bfloat16_t>(src.z);
404
+ dst.y.y = static_cast<bfloat16_t>(src.w);
405
+ }
406
+ inline void from_float(thread Bfloat8_ &dst, Float8_ src) {
407
+ Bfloat4_ x;
408
+ Bfloat4_ y;
409
+ from_float(x, src.x);
410
+ from_float(y, src.y);
411
+ dst.x = x;
412
+ dst.y = y;
413
+ }
414
+
415
+ // #endif
416
+
417
+ // FP16 vector data types.
418
+ struct Half8_ {
419
+ half4 x;
420
+ half4 y;
421
+ };
422
+
423
+ template <> struct Vec<half, 1> {
424
+ using Type = half;
425
+ };
426
+ template <> struct Vec<half, 2> {
427
+ using Type = half2;
428
+ };
429
+ template <> struct Vec<half, 4> {
430
+ using Type = half4;
431
+ };
432
+ template <> struct Vec<half, 8> {
433
+ using Type = Half8_;
434
+ };
435
+
436
+ template <> struct FloatVec<half> {
437
+ using Type = float;
438
+ };
439
+ template <> struct FloatVec<half2> {
440
+ using Type = float2;
441
+ };
442
+ template <> struct FloatVec<half4> {
443
+ using Type = float4;
444
+ };
445
+ template <> struct FloatVec<Half8_> {
446
+ using Type = Float8_;
447
+ };
448
+
449
+ template <> inline float mul(half a, half b) { return (float)a * (float)b; }
450
+ template <> inline half mul(half a, half b) { return a * b; }
451
+
452
+ template <> inline float2 mul(half2 a, half2 b) {
453
+ return (float2)a * (float2)b;
454
+ }
455
+ template <> inline half2 mul(half2 a, half2 b) { return a * b; }
456
+
457
+ template <> inline float4 mul(half4 a, half4 b) {
458
+ return (float4)a * (float4)b;
459
+ }
460
+ template <> inline half4 mul(half4 a, half4 b) { return a * b; }
461
+
462
+ template <> inline Float8_ mul(Half8_ a, Half8_ b) {
463
+ float4 x = mul<float4, half4, half4>(a.x, b.x);
464
+ float4 y = mul<float4, half4, half4>(a.y, b.y);
465
+ Float8_ c;
466
+ c.x = x;
467
+ c.y = y;
468
+ return c;
469
+ }
470
+ template <> inline Half8_ mul(Half8_ a, Half8_ b) {
471
+ Half8_ c;
472
+ c.x = mul<half4, half4, half4>(a.x, b.x);
473
+ c.y = mul<half4, half4, half4>(a.y, b.y);
474
+ return c;
475
+ }
476
+
477
+ template <> inline float sum(half a) { return (float)a; }
478
+
479
+ template <> inline float sum(half2 a) { return (float)a.x + (float)a.y; }
480
+
481
+ template <> inline float sum(half4 a) { return a.x + a.y + a.z + a.w; }
482
+
483
+ template <> inline float sum(Half8_ a) { return sum(a.x) + sum(a.y); }
484
+
485
+ inline float fma(half a, half b, float c) { return (float)a * (float)b + c; }
486
+
487
+ inline float2 fma(half2 a, half2 b, float2 c) {
488
+ return (float2)a * (float2)b + c;
489
+ }
490
+
491
+ inline float4 fma(half4 a, half4 b, float4 c) {
492
+ return (float4)a * (float4)b + c;
493
+ }
494
+
495
+ inline Float8_ fma(Half8_ a, Half8_ b, Float8_ c) {
496
+ float4 x = fma(a.x, b.x, c.x);
497
+ float4 y = fma(a.y, b.y, c.y);
498
+ Float8_ res;
499
+ res.x = x;
500
+ res.y = y;
501
+ return res;
502
+ }
503
+ inline Half8_ fma(Half8_ a, Half8_ b, Half8_ c) {
504
+ Half8_ res;
505
+ res.x = fma(a.x, b.x, c.x);
506
+ res.y = fma(a.y, b.y, c.y);
507
+ return res;
508
+ }
509
+
510
+ inline void from_float(thread half &dst, float src) {
511
+ dst = static_cast<half>(src);
512
+ }
513
+ inline void from_float(thread half2 &dst, float2 src) {
514
+ dst.x = static_cast<half>(src.x);
515
+ dst.y = static_cast<half>(src.y);
516
+ }
517
+ inline void from_float(thread half4 &dst, float4 src) {
518
+ dst.x = static_cast<half>(src.x);
519
+ dst.y = static_cast<half>(src.y);
520
+ dst.z = static_cast<half>(src.z);
521
+ dst.w = static_cast<half>(src.w);
522
+ }
523
+ inline void from_float(thread Half8_ &dst, Float8_ src) {
524
+ half4 x;
525
+ half4 y;
526
+ from_float(x, src.x);
527
+ from_float(y, src.y);
528
+ dst.x = x;
529
+ dst.y = y;
530
+ }
531
+
532
+ // ========================================== Dot product utilities
533
+
534
+ // TODO(EricLBuehler): optimize with vectorization
535
+ template <int THREAD_GROUP_SIZE, typename Vec, int N>
536
+ inline float qk_dot_(const threadgroup Vec (&q)[N], const thread Vec (&k)[N]) {
537
+ // Compute the parallel products for Q*K^T (treat vector lanes separately).
538
+ using A_vec = typename FloatVec<Vec>::Type;
539
+ A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
540
+ #pragma unroll
541
+ for (int ii = 1; ii < N; ++ii) {
542
+ qk_vec = fma(q[ii], k[ii], qk_vec);
543
+ }
544
+
545
+ // Finalize the reduction across lanes.
546
+ float qk = sum(qk_vec);
547
+ #pragma unroll
548
+ for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
549
+ qk += simd_shuffle_xor(qk, mask);
550
+ }
551
+ return qk;
552
+ }
553
+
554
+ template <typename T, int THREAD_GROUP_SIZE> struct Qk_dot {
555
+ template <typename Vec, int N>
556
+ static inline float dot(const threadgroup Vec (&q)[N],
557
+ const thread Vec (&k)[N]) {
558
+ return qk_dot_<THREAD_GROUP_SIZE>(q, k);
559
+ }
560
+ };
561
+
562
+ // ========================================== Block sum utility
563
+
564
+ // Utility function for attention softmax.
565
+ template <int NUM_WARPS, int NUM_SIMD_LANES>
566
+ inline float block_sum(threadgroup float *red_smem, float sum, uint simd_tid,
567
+ uint simd_lid) {
568
+ // Compute the sum per simdgroup.
569
+ #pragma unroll
570
+ for (int mask = NUM_SIMD_LANES / 2; mask >= 1; mask /= 2) {
571
+ sum += simd_shuffle_xor(sum, mask);
572
+ }
573
+
574
+ // Simd leaders store the data to shared memory.
575
+ if (simd_lid == 0) {
576
+ red_smem[simd_tid] = sum;
577
+ }
578
+
579
+ // Make sure the data is in shared memory.
580
+ threadgroup_barrier(mem_flags::mem_threadgroup);
581
+
582
+ // The warps compute the final sums.
583
+ if (simd_lid < NUM_WARPS) {
584
+ sum = red_smem[simd_lid];
585
+ }
586
+
587
+ // Parallel reduction inside the simd group.
588
+ #pragma unroll
589
+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
590
+ sum += simd_shuffle_xor(sum, mask);
591
+ }
592
+
593
+ // Broadcast to other threads.
594
+ return simd_shuffle(sum, 0);
595
+ }
596
+
597
+ // ========================================== Paged Attention kernel
598
+
599
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
600
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
601
+ #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
602
+
603
+ constant bool use_partitioning [[function_constant(10)]];
604
+ constant bool use_alibi [[function_constant(20)]];
605
+
606
+ template <typename T, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
607
+ int NUM_SIMD_LANES, int PARTITION_SIZE = 0>
608
+ [[kernel]] void paged_attention(
609
+ device float *exp_sums
610
+ [[buffer(0), function_constant(use_partitioning)]], // [num_seqs, num_heads,
611
+ // max_num_partitions]
612
+ device float *max_logits
613
+ [[buffer(1), function_constant(use_partitioning)]], // [num_seqs, num_heads,
614
+ // max_num_partitions]
615
+ device T *out
616
+ [[buffer(2)]], // [num_seqs, num_heads, max_num_partitions, head_size]
617
+ device const T *q [[buffer(3)]], // [num_seqs, num_heads, head_size]
618
+ device const T *k_cache
619
+ [[buffer(4)]], // [num_blocks, num_kv_heads, head_size/x, block_size, x]
620
+ device const T *v_cache
621
+ [[buffer(5)]], // [num_blocks, num_kv_heads, head_size, block_size]
622
+ const constant int &num_kv_heads [[buffer(6)]], // [num_heads]
623
+ const constant float &scale [[buffer(7)]],
624
+ const constant float &softcapping [[buffer(8)]],
625
+ device const uint32_t *block_tables
626
+ [[buffer(9)]], // [num_seqs, max_num_blocks_per_seq]
627
+ device const uint32_t *context_lens [[buffer(10)]], // [num_seqs]
628
+ const constant int &max_num_blocks_per_seq [[buffer(11)]],
629
+ device const float *alibi_slopes
630
+ [[buffer(12), function_constant(use_alibi)]], // [num_heads]
631
+ const constant int &q_stride [[buffer(13)]],
632
+ const constant int &kv_block_stride [[buffer(14)]],
633
+ const constant int &kv_head_stride [[buffer(15)]],
634
+ threadgroup char *shared_mem [[threadgroup(0)]],
635
+ uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]],
636
+ uint3 threadgroups_per_grid [[threadgroups_per_grid]],
637
+ uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]],
638
+ uint simd_tid [[simdgroup_index_in_threadgroup]],
639
+ uint simd_lid [[thread_index_in_simdgroup]]) {
640
+ const int seq_idx = threadgroup_position_in_grid.y;
641
+ const int partition_idx = threadgroup_position_in_grid.z;
642
+ const int max_num_partitions = threadgroups_per_grid.z;
643
+ const int thread_idx = thread_position_in_threadgroup.x;
644
+ constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
645
+ const uint32_t context_len = context_lens[seq_idx];
646
+ if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
647
+ // No work to do. Terminate the thread block.
648
+ return;
649
+ }
650
+
651
+ const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
652
+ const int num_blocks_per_partition =
653
+ USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
654
+
655
+ // [start_block_idx, end_block_idx) is the range of blocks to process.
656
+ const int start_block_idx =
657
+ USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
658
+ const int end_block_idx =
659
+ MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
660
+ const int num_blocks = end_block_idx - start_block_idx;
661
+
662
+ // [start_token_idx, end_token_idx) is the range of tokens to process.
663
+ const int start_token_idx = start_block_idx * BLOCK_SIZE;
664
+ const int end_token_idx =
665
+ MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
666
+ const int num_tokens = end_token_idx - start_token_idx;
667
+
668
+ constexpr int THREAD_GROUP_SIZE = MAX(NUM_SIMD_LANES / BLOCK_SIZE, 1);
669
+ constexpr int NUM_THREAD_GROUPS =
670
+ NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE
671
+ // divides NUM_THREADS
672
+ assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
673
+ constexpr int NUM_TOKENS_PER_THREAD_GROUP =
674
+ DIVIDE_ROUND_UP(BLOCK_SIZE, NUM_SIMD_LANES);
675
+ constexpr int NUM_WARPS = NUM_THREADS / NUM_SIMD_LANES;
676
+ const int warp_idx = simd_tid;
677
+ const int lane = simd_lid;
678
+
679
+ const int head_idx = threadgroup_position_in_grid.x;
680
+ const int num_heads = threadgroups_per_grid.x;
681
+ const int num_queries_per_kv = num_heads / num_kv_heads;
682
+ const int kv_head_idx = head_idx / num_queries_per_kv;
683
+ const float alibi_slope = !use_alibi ? 0.f : alibi_slopes[head_idx];
684
+
685
+ // A vector type to store a part of a key or a query.
686
+ // The vector size is configured in such a way that the threads in a thread
687
+ // group fetch or compute 16 bytes at a time. For example, if the size of a
688
+ // thread group is 4 and the data type is half, then the vector size is 16 /
689
+ // (4 * sizeof(half)) == 2.
690
+ constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(T)), 1);
691
+ using K_vec = typename Vec<T, VEC_SIZE>::Type;
692
+ using Q_vec = typename Vec<T, VEC_SIZE>::Type;
693
+
694
+ constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
695
+ constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
696
+
697
+ const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE;
698
+ const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE;
699
+
700
+ // Load the query to registers.
701
+ // Each thread in a thread group has a different part of the query.
702
+ // For example, if the thread group size is 4, then the first thread in the
703
+ // group has 0, 4, 8, ... th vectors of the query, and the second thread has
704
+ // 1, 5, 9, ... th vectors of the query, and so on.
705
+ const device T *q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
706
+ threadgroup Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
707
+ #pragma unroll
708
+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD;
709
+ i += NUM_THREAD_GROUPS) {
710
+ const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
711
+ q_vecs[thread_group_offset][i] =
712
+ *reinterpret_cast<const device Q_vec *>(q_ptr + vec_idx * VEC_SIZE);
713
+ }
714
+ threadgroup_barrier(mem_flags::mem_threadgroup);
715
+
716
+ // Use fp32 on softmax logits for better accuracy
717
+ threadgroup float *logits = reinterpret_cast<threadgroup float *>(shared_mem);
718
+ // Workspace for reduction
719
+ threadgroup float red_smem[2 * NUM_WARPS];
720
+
721
+ // x == THREAD_GROUP_SIZE * VEC_SIZE
722
+ // Each thread group fetches x elements from the key at a time.
723
+ constexpr int x = 16 / sizeof(T);
724
+ float qk_max = -FLT_MAX;
725
+
726
+ // Iterate over the key blocks.
727
+ // Each warp fetches a block of keys for each iteration.
728
+ // Each thread group in a warp fetches a key from the block, and computes
729
+ // dot product with the query.
730
+ const device uint32_t *block_table =
731
+ block_tables + seq_idx * max_num_blocks_per_seq;
732
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
733
+ block_idx += NUM_WARPS) {
734
+ // NOTE: The block number is stored in int32. However, we cast it to int64
735
+ // because int32 can lead to overflow when this variable is multiplied by
736
+ // large numbers (e.g., kv_block_stride).
737
+ const int64_t physical_block_number =
738
+ static_cast<int64_t>(block_table[block_idx]);
739
+
740
+ // Load a key to registers.
741
+ // Each thread in a thread group has a different part of the key.
742
+ // For example, if the thread group size is 4, then the first thread in the
743
+ // group has 0, 4, 8, ... th vectors of the key, and the second thread has
744
+ // 1, 5, 9, ... th vectors of the key, and so on.
745
+ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
746
+ const int physical_block_offset =
747
+ (thread_group_idx + i * NUM_SIMD_LANES) % BLOCK_SIZE;
748
+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
749
+ K_vec k_vecs[NUM_VECS_PER_THREAD];
750
+
751
+ #pragma unroll
752
+ for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
753
+ const device T *k_ptr =
754
+ k_cache + physical_block_number * kv_block_stride +
755
+ kv_head_idx * kv_head_stride + physical_block_offset * x;
756
+ const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
757
+ const int offset1 = (vec_idx * VEC_SIZE) / x;
758
+ const int offset2 = (vec_idx * VEC_SIZE) % x;
759
+ k_vecs[j] = *reinterpret_cast<const device K_vec *>(
760
+ k_ptr + offset1 * BLOCK_SIZE * x + offset2);
761
+ }
762
+
763
+ // Compute dot product.
764
+ // This includes a reduction across the threads in the same thread group.
765
+ float qk = scale * Qk_dot<T, THREAD_GROUP_SIZE>::dot(
766
+ q_vecs[thread_group_offset], k_vecs);
767
+
768
+ // Apply softcapping
769
+ if (softcapping != 1.0) {
770
+ qk = precise::tanh(qk / softcapping) * softcapping;
771
+ }
772
+
773
+ // Add the ALiBi bias if slopes are given.
774
+ qk +=
775
+ (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
776
+
777
+ if (thread_group_offset == 0) {
778
+ // Store the partial reductions to shared memory.
779
+ // NOTE: It is required to zero out the masked logits.
780
+ const bool mask = token_idx >= context_len;
781
+ logits[token_idx - start_token_idx] = mask ? 0.f : qk;
782
+ // Update the max value.
783
+ qk_max = mask ? qk_max : max(qk_max, qk);
784
+ }
785
+ }
786
+ }
787
+
788
+ // Perform reduction across the threads in the same warp to get the
789
+ // max qk value for each "warp" (not across the thread block yet).
790
+ // The 0-th thread of each thread group already has its max qk value.
791
+ #pragma unroll
792
+ for (int mask = NUM_SIMD_LANES / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
793
+ qk_max = max(qk_max, simd_shuffle_xor(qk_max, mask));
794
+ }
795
+ if (lane == 0) {
796
+ red_smem[warp_idx] = qk_max;
797
+ }
798
+ threadgroup_barrier(mem_flags::mem_threadgroup);
799
+
800
+ // Get the max qk value for the sequence.
801
+ qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
802
+ #pragma unroll
803
+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
804
+ qk_max = max(qk_max, simd_shuffle_xor(qk_max, mask));
805
+ }
806
+ // Broadcast the max qk value to all threads.
807
+ qk_max = simd_shuffle(qk_max, 0);
808
+
809
+ // Get the sum of the exp values.
810
+ float exp_sum = 0.f;
811
+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
812
+ float val = exp(logits[i] - qk_max);
813
+ logits[i] = val;
814
+ exp_sum += val;
815
+ }
816
+ exp_sum = block_sum<NUM_WARPS, NUM_SIMD_LANES>(&red_smem[NUM_WARPS], exp_sum,
817
+ simd_tid, simd_lid);
818
+
819
+ // Compute softmax.
820
+ const float inv_sum = divide(1.f, exp_sum + 1e-6f);
821
+ for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
822
+ logits[i] *= inv_sum;
823
+ }
824
+ threadgroup_barrier(mem_flags::mem_threadgroup);
825
+
826
+ // If partitioning is enabled, store the max logit and exp_sum.
827
+ if (USE_PARTITIONING && thread_idx == 0 && use_partitioning) {
828
+ device float *max_logits_ptr =
829
+ max_logits + seq_idx * num_heads * max_num_partitions +
830
+ head_idx * max_num_partitions + partition_idx;
831
+ *max_logits_ptr = qk_max;
832
+ device float *exp_sums_ptr = exp_sums +
833
+ seq_idx * num_heads * max_num_partitions +
834
+ head_idx * max_num_partitions + partition_idx;
835
+ *exp_sums_ptr = exp_sum;
836
+ }
837
+
838
+ // Each thread will fetch 16 bytes from the value cache at a time.
839
+ constexpr int V_VEC_SIZE = MIN(16 / sizeof(T), BLOCK_SIZE);
840
+ using V_vec = typename Vec<T, V_VEC_SIZE>::Type;
841
+ using L_vec = typename Vec<T, V_VEC_SIZE>::Type;
842
+ using Float_L_vec = typename FloatVec<L_vec>::Type;
843
+
844
+ constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
845
+ constexpr int NUM_ROWS_PER_ITER = NUM_SIMD_LANES / NUM_V_VECS_PER_ROW;
846
+ constexpr int NUM_ROWS_PER_THREAD =
847
+ DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
848
+
849
+ // NOTE: We use FP32 for the accumulator for better accuracy.
850
+ float accs[NUM_ROWS_PER_THREAD];
851
+ #pragma unroll
852
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
853
+ accs[i] = 0.f;
854
+ }
855
+
856
+ T zero_value = 0;
857
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
858
+ block_idx += NUM_WARPS) {
859
+ // NOTE: The block number is stored in int32. However, we cast it to int64
860
+ // because int32 can lead to overflow when this variable is multiplied by
861
+ // large numbers (e.g., kv_block_stride).
862
+ const int64_t physical_block_number =
863
+ static_cast<int64_t>(block_table[block_idx]);
864
+ const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
865
+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
866
+ L_vec logits_vec;
867
+ Float_L_vec logits_float_vec = *reinterpret_cast<threadgroup Float_L_vec *>(
868
+ logits + token_idx - start_token_idx);
869
+ from_float(logits_vec, logits_float_vec);
870
+
871
+ const device T *v_ptr = v_cache + physical_block_number * kv_block_stride +
872
+ kv_head_idx * kv_head_stride;
873
+ #pragma unroll
874
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
875
+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
876
+ if (row_idx < HEAD_SIZE) {
877
+ const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
878
+ // NOTE: When v_vec contains the tokens that are out of the context,
879
+ // we should explicitly zero out the values since they may contain NaNs.
880
+ // See
881
+ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
882
+ V_vec v_vec = *reinterpret_cast<const device V_vec *>(v_ptr + offset);
883
+ if (block_idx == num_context_blocks - 1) {
884
+ thread T *v_vec_ptr = reinterpret_cast<thread T *>(&v_vec);
885
+ #pragma unroll
886
+ for (int j = 0; j < V_VEC_SIZE; j++) {
887
+ v_vec_ptr[j] =
888
+ token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
889
+ }
890
+ }
891
+ accs[i] += dot(logits_vec, v_vec);
892
+ }
893
+ }
894
+ }
895
+
896
+ // Perform reduction within each warp.
897
+ #pragma unroll
898
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
899
+ float acc = accs[i];
900
+ #pragma unroll
901
+ for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
902
+ acc += simd_shuffle_xor(acc, mask);
903
+ }
904
+ accs[i] = acc;
905
+ }
906
+
907
+ // NOTE: A barrier is required because the shared memory space for logits
908
+ // is reused for the output.
909
+ threadgroup_barrier(mem_flags::mem_threadgroup);
910
+
911
+ // Perform reduction across warps.
912
+ threadgroup float *out_smem =
913
+ reinterpret_cast<threadgroup float *>(shared_mem);
914
+ #pragma unroll
915
+ for (int i = NUM_WARPS; i > 1; i /= 2) {
916
+ int mid = i / 2;
917
+ // Upper warps write to shared memory.
918
+ if (warp_idx >= mid && warp_idx < i) {
919
+ threadgroup float *dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
920
+ #pragma unroll
921
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
922
+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
923
+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
924
+ dst[row_idx] = accs[i];
925
+ }
926
+ }
927
+ }
928
+ threadgroup_barrier(mem_flags::mem_threadgroup);
929
+
930
+ // Lower warps update the output.
931
+ if (warp_idx < mid) {
932
+ const threadgroup float *src = &out_smem[warp_idx * HEAD_SIZE];
933
+ #pragma unroll
934
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
935
+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
936
+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
937
+ accs[i] += src[row_idx];
938
+ }
939
+ }
940
+ }
941
+ threadgroup_barrier(mem_flags::mem_threadgroup);
942
+ }
943
+
944
+ // Write the final output.
945
+ if (warp_idx == 0) {
946
+ device T *out_ptr =
947
+ out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
948
+ head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
949
+ #pragma unroll
950
+ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
951
+ const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
952
+ if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
953
+ *(out_ptr + row_idx) = T(accs[i]);
954
+ }
955
+ }
956
+ }
957
+ }
958
+
959
+ template <typename T, int HEAD_SIZE, int NUM_THREADS, int NUM_SIMD_LANES,
960
+ int PARTITION_SIZE = 0>
961
+ [[kernel]] void paged_attention_v2_reduce(
962
+ device T *out [[buffer(0)]], const device float *exp_sums [[buffer(1)]],
963
+ const device float *max_logits [[buffer(2)]],
964
+ const device T *tmp_out [[buffer(3)]],
965
+ device uint32_t *context_lens [[buffer(4)]],
966
+ const constant int &max_num_partitions [[buffer(5)]],
967
+ threadgroup char *shared_mem [[threadgroup(0)]],
968
+ uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]],
969
+ uint3 threadgroups_per_grid [[threadgroups_per_grid]],
970
+ uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]],
971
+ uint3 threads_per_threadgroup [[threads_per_threadgroup]],
972
+ uint simd_tid [[simdgroup_index_in_threadgroup]],
973
+ uint simd_lid [[thread_index_in_simdgroup]]) {
974
+ const int num_heads = threadgroups_per_grid.x;
975
+ const int head_idx = threadgroup_position_in_grid.x;
976
+ const int seq_idx = threadgroup_position_in_grid.y;
977
+ const uint32_t context_len = context_lens[seq_idx];
978
+ const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
979
+ if (num_partitions == 1) {
980
+ // No need to reduce. Only copy tmp_out to out.
981
+ device T *out_ptr =
982
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
983
+ const device T *tmp_out_ptr =
984
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
985
+ head_idx * max_num_partitions * HEAD_SIZE;
986
+ for (int i = thread_position_in_threadgroup.x; i < HEAD_SIZE;
987
+ i += threads_per_threadgroup.x) {
988
+ out_ptr[i] = tmp_out_ptr[i];
989
+ }
990
+ // Terminate the thread block.
991
+ return;
992
+ }
993
+
994
+ constexpr int NUM_WARPS = NUM_THREADS / NUM_SIMD_LANES;
995
+ const int warp_idx = simd_tid;
996
+ const int lane = simd_lid;
997
+
998
+ // Workspace for reduction.
999
+ threadgroup float red_smem[2 * NUM_WARPS];
1000
+
1001
+ // Load max logits to shared memory.
1002
+ threadgroup float *shared_max_logits =
1003
+ reinterpret_cast<threadgroup float *>(shared_mem);
1004
+ const device float *max_logits_ptr =
1005
+ max_logits + seq_idx * num_heads * max_num_partitions +
1006
+ head_idx * max_num_partitions;
1007
+ float max_logit = -FLT_MAX;
1008
+ for (int i = thread_position_in_threadgroup.x; i < num_partitions;
1009
+ i += threads_per_threadgroup.x) {
1010
+ const float l = max_logits_ptr[i];
1011
+ shared_max_logits[i] = l;
1012
+ max_logit = max(max_logit, l);
1013
+ }
1014
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1015
+
1016
+ // Get the global max logit.
1017
+ // Reduce within the warp.
1018
+ #pragma unroll
1019
+ for (int mask = NUM_SIMD_LANES / 2; mask >= 1; mask /= 2) {
1020
+ max_logit = max(max_logit, simd_shuffle_xor(max_logit, mask));
1021
+ }
1022
+ if (lane == 0) {
1023
+ red_smem[warp_idx] = max_logit;
1024
+ }
1025
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1026
+ // Reduce across warps.
1027
+ max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
1028
+ #pragma unroll
1029
+ for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
1030
+ max_logit = max(max_logit, simd_shuffle_xor(max_logit, mask));
1031
+ }
1032
+ // Broadcast the max value to all threads.
1033
+ max_logit = simd_shuffle(max_logit, 0);
1034
+
1035
+ // Load rescaled exp sums to shared memory.
1036
+ threadgroup float *shared_exp_sums = reinterpret_cast<threadgroup float *>(
1037
+ shared_mem + sizeof(float) * num_partitions);
1038
+ const device float *exp_sums_ptr = exp_sums +
1039
+ seq_idx * num_heads * max_num_partitions +
1040
+ head_idx * max_num_partitions;
1041
+ float global_exp_sum = 0.0f;
1042
+ for (int i = thread_position_in_threadgroup.x; i < num_partitions;
1043
+ i += threads_per_threadgroup.x) {
1044
+ float l = shared_max_logits[i];
1045
+ float rescaled_exp_sum = exp_sums_ptr[i] * exp(l - max_logit);
1046
+ global_exp_sum += rescaled_exp_sum;
1047
+ shared_exp_sums[i] = rescaled_exp_sum;
1048
+ }
1049
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1050
+ global_exp_sum = block_sum<NUM_WARPS, NUM_SIMD_LANES>(
1051
+ &red_smem[NUM_WARPS], global_exp_sum, simd_tid, simd_lid);
1052
+ const float inv_global_exp_sum = divide(1.0f, global_exp_sum + 1e-6f);
1053
+
1054
+ // Aggregate tmp_out to out.
1055
+ const device T *tmp_out_ptr =
1056
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
1057
+ head_idx * max_num_partitions * HEAD_SIZE;
1058
+ device T *out_ptr =
1059
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
1060
+ #pragma unroll
1061
+ for (int i = thread_position_in_threadgroup.x; i < HEAD_SIZE;
1062
+ i += NUM_THREADS) {
1063
+ float acc = 0.0f;
1064
+ for (int j = 0; j < num_partitions; ++j) {
1065
+ acc += float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] *
1066
+ inv_global_exp_sum;
1067
+ }
1068
+ out_ptr[i] = T(acc);
1069
+ }
1070
+ }
1071
+
1072
+ #define instantiate_paged_attention_inner( \
1073
+ type, head_size, block_size, num_threads, num_simd_lanes, partition_size) \
1074
+ template \
1075
+ [[host_name("paged_attention_" #type "_hs" #head_size "_bs" #block_size \
1076
+ "_nt" #num_threads "_nsl" #num_simd_lanes \
1077
+ "_ps" #partition_size)]] [[kernel]] void \
1078
+ paged_attention<type, head_size, block_size, num_threads, \
1079
+ num_simd_lanes, partition_size>( \
1080
+ device float *exp_sums \
1081
+ [[buffer(0), function_constant(use_partitioning)]], \
1082
+ device float *max_logits \
1083
+ [[buffer(1), function_constant(use_partitioning)]], \
1084
+ device type *out [[buffer(2)]], device const type *q [[buffer(3)]], \
1085
+ device const type *k_cache [[buffer(4)]], \
1086
+ device const type *v_cache [[buffer(5)]], \
1087
+ const constant int &num_kv_heads [[buffer(6)]], \
1088
+ const constant float &scale [[buffer(7)]], \
1089
+ const constant float &softcapping [[buffer(8)]], \
1090
+ device const uint32_t *block_tables [[buffer(9)]], \
1091
+ device const uint32_t *context_lens [[buffer(10)]], \
1092
+ const constant int &max_num_blocks_per_seq [[buffer(11)]], \
1093
+ device const float *alibi_slopes \
1094
+ [[buffer(12), function_constant(use_alibi)]], \
1095
+ const constant int &q_stride [[buffer(13)]], \
1096
+ const constant int &kv_block_stride [[buffer(14)]], \
1097
+ const constant int &kv_head_stride [[buffer(15)]], \
1098
+ threadgroup char *shared_mem [[threadgroup(0)]], \
1099
+ uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], \
1100
+ uint3 threadgroups_per_grid [[threadgroups_per_grid]], \
1101
+ uint3 thread_position_in_threadgroup \
1102
+ [[thread_position_in_threadgroup]], \
1103
+ uint simd_tid [[simdgroup_index_in_threadgroup]], \
1104
+ uint simd_lid [[thread_index_in_simdgroup]]);
1105
+
1106
+ #define instantiate_paged_attention_v2_reduce_inner( \
1107
+ type, head_size, num_threads, num_simd_lanes, partition_size) \
1108
+ template [[host_name("paged_attention_v2_reduce_" #type "_hs" #head_size \
1109
+ "_nt" #num_threads "_nsl" #num_simd_lanes \
1110
+ "_ps" #partition_size)]] [[kernel]] void \
1111
+ paged_attention_v2_reduce<type, head_size, num_threads, num_simd_lanes, \
1112
+ partition_size>( \
1113
+ device type * out [[buffer(0)]], \
1114
+ const device float *exp_sums [[buffer(1)]], \
1115
+ const device float *max_logits [[buffer(2)]], \
1116
+ const device type *tmp_out [[buffer(3)]], \
1117
+ device uint32_t *context_lens [[buffer(4)]], \
1118
+ const constant int &max_num_partitions [[buffer(5)]], \
1119
+ threadgroup char *shared_mem [[threadgroup(0)]], \
1120
+ uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], \
1121
+ uint3 threadgroups_per_grid [[threadgroups_per_grid]], \
1122
+ uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], \
1123
+ uint3 threads_per_threadgroup [[threads_per_threadgroup]], \
1124
+ uint simd_tid [[simdgroup_index_in_threadgroup]], \
1125
+ uint simd_lid [[thread_index_in_simdgroup]]);
1126
+
1127
+ #define instantiate_paged_attention_heads(type, block_size, num_threads, \
1128
+ num_simd_lanes, partition_size) \
1129
+ instantiate_paged_attention_inner(type, 64, block_size, num_threads, \
1130
+ num_simd_lanes, partition_size); \
1131
+ instantiate_paged_attention_inner(type, 80, block_size, num_threads, \
1132
+ num_simd_lanes, partition_size); \
1133
+ instantiate_paged_attention_inner(type, 96, block_size, num_threads, \
1134
+ num_simd_lanes, partition_size); \
1135
+ instantiate_paged_attention_inner(type, 112, block_size, num_threads, \
1136
+ num_simd_lanes, partition_size); \
1137
+ instantiate_paged_attention_inner(type, 128, block_size, num_threads, \
1138
+ num_simd_lanes, partition_size); \
1139
+ instantiate_paged_attention_inner(type, 192, block_size, num_threads, \
1140
+ num_simd_lanes, partition_size); \
1141
+ instantiate_paged_attention_inner(type, 256, block_size, num_threads, \
1142
+ num_simd_lanes, partition_size);
1143
+
1144
+ #define instantiate_paged_attention_v2_reduce_heads( \
1145
+ type, num_threads, num_simd_lanes, partition_size) \
1146
+ instantiate_paged_attention_v2_reduce_inner(type, 64, num_threads, \
1147
+ num_simd_lanes, partition_size); \
1148
+ instantiate_paged_attention_v2_reduce_inner(type, 80, num_threads, \
1149
+ num_simd_lanes, partition_size); \
1150
+ instantiate_paged_attention_v2_reduce_inner(type, 96, num_threads, \
1151
+ num_simd_lanes, partition_size); \
1152
+ instantiate_paged_attention_v2_reduce_inner(type, 112, num_threads, \
1153
+ num_simd_lanes, partition_size); \
1154
+ instantiate_paged_attention_v2_reduce_inner(type, 128, num_threads, \
1155
+ num_simd_lanes, partition_size); \
1156
+ instantiate_paged_attention_v2_reduce_inner(type, 192, num_threads, \
1157
+ num_simd_lanes, partition_size); \
1158
+ instantiate_paged_attention_v2_reduce_inner(type, 256, num_threads, \
1159
+ num_simd_lanes, partition_size);
1160
+
1161
+ #define instantiate_paged_attention_block_size(type, num_threads, \
1162
+ num_simd_lanes, partition_size) \
1163
+ instantiate_paged_attention_heads(type, 8, num_threads, num_simd_lanes, \
1164
+ partition_size); \
1165
+ instantiate_paged_attention_heads(type, 16, num_threads, num_simd_lanes, \
1166
+ partition_size); \
1167
+ instantiate_paged_attention_heads(type, 32, num_threads, num_simd_lanes, \
1168
+ partition_size);
1169
+
1170
+ // TODO: tune num_threads = 256
1171
+ // NOTE: partition_size = 0
1172
+ #define instantiate_paged_attention_v1(type, num_simd_lanes) \
1173
+ instantiate_paged_attention_block_size(type, 256, num_simd_lanes, 0);
1174
+
1175
+ // TODO: tune num_threads = 256
1176
+ // NOTE: partition_size = 512
1177
+ #define instantiate_paged_attention_v2(type, num_simd_lanes) \
1178
+ instantiate_paged_attention_block_size(type, 256, num_simd_lanes, 512); \
1179
+ instantiate_paged_attention_v2_reduce_heads(type, 256, num_simd_lanes, 512);
1180
+
1181
+ instantiate_paged_attention_v1(float, 32);
1182
+ instantiate_paged_attention_v1(bfloat16_t, 32);
1183
+ instantiate_paged_attention_v1(half, 32);
1184
+
1185
+ instantiate_paged_attention_v2(float, 32);
1186
+ instantiate_paged_attention_v2(bfloat16_t, 32);
1187
+ instantiate_paged_attention_v2(half, 32);
paged-attention-metal/cache/copy_blocks.metal ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "utils.metal"
2
+ #include <metal_stdlib>
3
+
4
+ using namespace metal;
5
+
6
+ template <typename T>
7
+ [[kernel]] void copy_blocks(device T *key_cache [[buffer(0)]],
8
+ device T *value_cache [[buffer(1)]],
9
+ const device int64_t *block_mapping [[buffer(2)]],
10
+ device const int &numel_per_block,
11
+ uint gid [[thread_position_in_grid]],
12
+ uint tid [[thread_position_in_threadgroup]],
13
+ uint threads_per_threadgroup
14
+ [[threads_per_threadgroup]]) {
15
+ const int pair_idx = gid;
16
+
17
+ int64_t src_block_number = block_mapping[2 * pair_idx];
18
+ int64_t dst_block_number = block_mapping[2 * pair_idx + 1];
19
+
20
+ const int64_t src_block_offset = src_block_number * numel_per_block;
21
+ const int64_t dst_block_offset = dst_block_number * numel_per_block;
22
+
23
+ // Copy key cache blocks
24
+ for (int i = tid; i < numel_per_block; i += threads_per_threadgroup) {
25
+ int64_t src_offset = src_block_offset + i;
26
+ int64_t dst_offset = dst_block_offset + i;
27
+ key_cache[dst_offset] = key_cache[src_offset];
28
+ }
29
+
30
+ // Copy value cache blocks
31
+ for (int i = tid; i < numel_per_block; i += threads_per_threadgroup) {
32
+ int64_t src_offset = src_block_offset + i;
33
+ int64_t dst_offset = dst_block_offset + i;
34
+ value_cache[dst_offset] = value_cache[src_offset];
35
+ }
36
+ }
37
+
38
+ #define instantiate_copy_blocks(type) \
39
+ template [[host_name("copy_blocks_" #type)]] [[kernel]] void \
40
+ copy_blocks<type>(device type * key_cache_ptrs [[buffer(0)]], \
41
+ device type * value_cache_ptrs [[buffer(1)]], \
42
+ const device int64_t *block_mapping [[buffer(2)]], \
43
+ device const int &numel_per_block, \
44
+ uint gid [[thread_position_in_grid]], \
45
+ uint tid [[thread_position_in_threadgroup]], \
46
+ uint threads_per_threadgroup [[threads_per_threadgroup]]);
47
+
48
+ instantiate_copy_blocks(float);
49
+ instantiate_copy_blocks(bfloat16_t);
50
+ instantiate_copy_blocks(half);
paged-attention-metal/cache/reshape_and_cache.metal ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "utils.metal"
2
+ #include <metal_stdlib>
3
+
4
+ using namespace metal;
5
+
6
+ template <typename T>
7
+ [[kernel]] void reshape_and_cache(
8
+ const device T *__restrict__ key
9
+ [[buffer(0)]], // [num_tokens, num_heads, head_size]
10
+ const device T *__restrict__ value
11
+ [[buffer(1)]], // [num_tokens, num_heads, head_size]
12
+ device T *__restrict__ key_cache
13
+ [[buffer(2)]], // [num_blocks, num_heads, head_size/x, block_size, x]
14
+ device T *__restrict__ value_cache
15
+ [[buffer(3)]], // [num_blocks, num_heads, head_size, block_size]
16
+ const device int64_t *__restrict__ slot_mapping
17
+ [[buffer(4)]], // [num_tokens]
18
+ device const int &key_stride, device const int &value_stride,
19
+ device const int &num_heads, device const int &head_size,
20
+ device const int &block_size, device const int &x,
21
+ uint gid [[threadgroup_position_in_grid]],
22
+ uint tid [[thread_position_in_threadgroup]],
23
+ uint threads_per_threadgroup [[threads_per_threadgroup]]) {
24
+ const int64_t token_idx = gid;
25
+ const int64_t slot_idx = slot_mapping[token_idx];
26
+ if (slot_idx < 0) {
27
+ // Padding token that should be ignored.
28
+ return;
29
+ }
30
+
31
+ const int64_t block_idx = slot_idx / block_size;
32
+ const int64_t block_offset = slot_idx % block_size;
33
+
34
+ const int n = num_heads * head_size;
35
+ for (int i = tid; i < n; i += threads_per_threadgroup) {
36
+ const int64_t src_key_idx = token_idx * key_stride + i;
37
+ const int64_t src_value_idx = token_idx * value_stride + i;
38
+
39
+ const int head_idx = i / head_size;
40
+ const int head_offset = i % head_size;
41
+ const int x_idx = head_offset / x;
42
+ const int x_offset = head_offset % x;
43
+
44
+ const int64_t tgt_key_idx =
45
+ block_idx * num_heads * (head_size / x) * block_size * x +
46
+ head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
47
+ block_offset * x + x_offset;
48
+ const int64_t tgt_value_idx =
49
+ block_idx * num_heads * head_size * block_size +
50
+ head_idx * head_size * block_size + head_offset * block_size +
51
+ block_offset;
52
+ key_cache[tgt_key_idx] = key[src_key_idx];
53
+ value_cache[tgt_value_idx] = value[src_value_idx];
54
+ }
55
+ }
56
+
57
+ #define instantiate_reshape_and_cache(type) \
58
+ template [[host_name("reshape_and_cache_" #type)]] [[kernel]] void \
59
+ reshape_and_cache<type>( \
60
+ const device type *__restrict__ key [[buffer(0)]], \
61
+ const device type *__restrict__ value [[buffer(1)]], \
62
+ device type *__restrict__ key_cache [[buffer(2)]], \
63
+ device type *__restrict__ value_cache [[buffer(3)]], \
64
+ const device int64_t *__restrict__ slot_mapping [[buffer(4)]], \
65
+ device const int &key_stride, device const int &value_stride, \
66
+ device const int &num_heads, device const int &head_size, \
67
+ device const int &block_size, device const int &x, \
68
+ uint gid [[threadgroup_position_in_grid]], \
69
+ uint tid [[thread_position_in_threadgroup]], \
70
+ uint threads_per_threadgroup [[threads_per_threadgroup]]);
71
+
72
+ instantiate_reshape_and_cache(float);
73
+ instantiate_reshape_and_cache(bfloat16_t);
74
+ instantiate_reshape_and_cache(half);
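
The index arithmetic above packs keys as `[num_blocks, num_heads, head_size/x, block_size, x]` and values as `[num_blocks, num_heads, head_size, block_size]`. A rough PyTorch sketch of the same scatter, useful when reading the kernel (illustrative only; tensor names are placeholders):

```python
import torch


def reshape_and_cache_reference(key, value, key_cache, value_cache, slot_mapping):
    # key, value:  [num_tokens, num_heads, head_size]
    # key_cache:   [num_blocks, num_heads, head_size // x, block_size, x]
    # value_cache: [num_blocks, num_heads, head_size, block_size]
    block_size = value_cache.shape[3]
    x = key_cache.shape[4]
    num_heads = key.shape[1]
    for token_idx, slot_idx in enumerate(slot_mapping.tolist()):
        if slot_idx < 0:
            # Padding token that should be ignored (same check as the kernel).
            continue
        block_idx, block_offset = divmod(slot_idx, block_size)
        # Keys are stored split along head_size into chunks of x elements.
        key_cache[block_idx, :, :, block_offset, :] = key[token_idx].reshape(
            num_heads, -1, x
        )
        # Values keep head_size contiguous, with block_offset as the last axis.
        value_cache[block_idx, :, :, block_offset] = value[token_idx]
```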
paged-attention-metal/paged_attention.mm ADDED
@@ -0,0 +1,117 @@
1
+ #include <torch/torch.h>
2
+
3
+ #import <Foundation/Foundation.h>
4
+ #import <Metal/Metal.h>
5
+ #include <string>
6
+
7
+ char const *CUSTOM_KERNEL = R"(
8
+ #include <metal_stdlib>
9
+ using namespace metal;
10
+ kernel void relu_forward_kernel_float(device const float *inA [[buffer(0)]],
11
+ device float *outC [[buffer(1)]],
12
+ uint index [[thread_position_in_grid]]) {
13
+ // Explicitly write to output
14
+ outC[index] = max(0.0f, inA[index]);
15
+ }
16
+ kernel void relu_forward_kernel_half(device const half *inA [[buffer(0)]],
17
+ device half *outC [[buffer(1)]],
18
+ uint index [[thread_position_in_grid]]) {
19
+ // Explicitly write to output
20
+ outC[index] = max(static_cast<half>(0.0), inA[index]);
21
+ }
22
+ )";
23
+
24
+ static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor &tensor) {
25
+ return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
26
+ }
27
+
28
+ torch::Tensor &dispatchReluKernel(torch::Tensor const &input,
29
+ torch::Tensor &output) {
30
+ @autoreleasepool {
31
+ id<MTLDevice> device = MTLCreateSystemDefaultDevice();
32
+ NSError *error = nil;
33
+
34
+ int numThreads = input.numel();
35
+
36
+ id<MTLLibrary> customKernelLibrary = [device
37
+ newLibraryWithSource:[NSString stringWithUTF8String:CUSTOM_KERNEL]
38
+ options:nil
39
+ error:&error];
40
+ TORCH_CHECK(customKernelLibrary,
41
+ "Failed to create custom kernel library, error: ",
42
+ error.localizedDescription.UTF8String);
43
+
44
+ std::string kernel_name =
45
+ std::string("relu_forward_kernel_") +
46
+ (input.scalar_type() == torch::kFloat ? "float" : "half");
47
+ id<MTLFunction> customReluFunction = [customKernelLibrary
48
+ newFunctionWithName:[NSString
49
+ stringWithUTF8String:kernel_name.c_str()]];
50
+ TORCH_CHECK(customReluFunction,
51
+ "Failed to create function state object for ",
52
+ kernel_name.c_str());
53
+
54
+ id<MTLComputePipelineState> reluPSO =
55
+ [device newComputePipelineStateWithFunction:customReluFunction
56
+ error:&error];
57
+ TORCH_CHECK(reluPSO, error.localizedDescription.UTF8String);
58
+
59
+ id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer();
60
+ TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference");
61
+
62
+ dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue();
63
+
64
+ dispatch_sync(serialQueue, ^() {
65
+ id<MTLComputeCommandEncoder> computeEncoder =
66
+ [commandBuffer computeCommandEncoder];
67
+ TORCH_CHECK(computeEncoder, "Failed to create compute command encoder");
68
+
69
+ [computeEncoder setComputePipelineState:reluPSO];
70
+ [computeEncoder setBuffer:getMTLBufferStorage(input)
71
+ offset:input.storage_offset() * input.element_size()
72
+ atIndex:0];
73
+ [computeEncoder setBuffer:getMTLBufferStorage(output)
74
+ offset:output.storage_offset() * output.element_size()
75
+ atIndex:1];
76
+
77
+ MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
78
+
79
+ NSUInteger threadGroupSize = reluPSO.maxTotalThreadsPerThreadgroup;
80
+ if (threadGroupSize > numThreads) {
81
+ threadGroupSize = numThreads;
82
+ }
83
+ MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1);
84
+
85
+ [computeEncoder dispatchThreads:gridSize
86
+ threadsPerThreadgroup:threadgroupSize];
87
+
88
+ [computeEncoder endEncoding];
89
+
90
+ torch::mps::commit();
91
+ });
92
+ }
93
+
94
+ return output;
95
+ }
96
+
97
+ void relu(torch::Tensor &out, const torch::Tensor &input) {
98
+ TORCH_CHECK(input.device().is_mps(), "input must be a MPS tensor");
99
+ TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
100
+ TORCH_CHECK(input.scalar_type() == torch::kFloat ||
101
+ input.scalar_type() == torch::kHalf,
102
+ "Unsupported data type: ", input.scalar_type());
103
+
104
+ TORCH_CHECK(input.sizes() == out.sizes(),
105
+ "Tensors must have the same shape. Got input shape: ",
106
+ input.sizes(), " and output shape: ", out.sizes());
107
+
108
+ TORCH_CHECK(input.scalar_type() == out.scalar_type(),
109
+ "Tensors must have the same data type. Got input dtype: ",
110
+ input.scalar_type(), " and output dtype: ", out.scalar_type());
111
+
112
+ TORCH_CHECK(input.device() == out.device(),
113
+ "Tensors must be on the same device. Got input device: ",
114
+ input.device(), " and output device: ", out.device());
115
+
116
+ dispatchReluKernel(input, out);
117
+ }
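
Assuming the Torch extension declared in `build.toml` is importable as `paged_attention` and exposes the `relu(out, input)` entry point above (the Python binding name is an assumption here), a minimal MPS usage sketch might look like:

```python
import torch

import paged_attention as ops  # assumed import name for the built extension

x = torch.randn(1024, device="mps", dtype=torch.float32)
out = torch.empty_like(x)
ops.relu(out, x)  # fills `out` with max(0, x) via the Metal kernel above
torch.testing.assert_close(out, torch.relu(x))
```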
paged-attention-metal/utils.metal ADDED
File without changes
tests/kernels/__init__.py ADDED
File without changes
tests/kernels/allclose_default.py ADDED
@@ -0,0 +1,14 @@
1
+ import torch
2
+
3
+ # Reference default values of atol and rtol are from
4
+ # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
5
+ default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
6
+ default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6}
7
+
8
+
9
+ def get_default_atol(output) -> float:
10
+ return default_atol[output.dtype]
11
+
12
+
13
+ def get_default_rtol(output) -> float:
14
+ return default_rtol[output.dtype]
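
A minimal usage sketch of these helpers (import path assumed for illustration): pick dtype-dependent tolerances when comparing a kernel output against a reference, as `test_attention.py` does on ROCm.

```python
import torch

from tests.kernels.allclose_default import get_default_atol, get_default_rtol

output = torch.randn(8, 16, dtype=torch.float16)
ref_output = output.clone()
torch.testing.assert_close(
    output,
    ref_output,
    atol=get_default_atol(output),
    rtol=get_default_rtol(output),
)
```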
tests/kernels/conftest.py ADDED
@@ -0,0 +1,158 @@
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import paged_attention as ops
4
+ import pytest
5
+ import torch
6
+
7
+
8
+ @pytest.fixture()
9
+ def kv_cache_factory():
10
+ return create_kv_caches_with_random
11
+
12
+
13
+ @pytest.fixture()
14
+ def kv_cache_factory_flashinfer():
15
+ return create_kv_caches_with_random_flash
16
+
17
+
18
+ STR_DTYPE_TO_TORCH_DTYPE = {
19
+ "half": torch.half,
20
+ "bfloat16": torch.bfloat16,
21
+ "float": torch.float,
22
+ "fp8": torch.uint8,
23
+ "fp8_e4m3": torch.uint8,
24
+ "fp8_e5m2": torch.uint8,
25
+ }
26
+
27
+
28
+ def create_kv_caches_with_random(
29
+ num_blocks: int,
30
+ block_size: int,
31
+ num_layers: int,
32
+ num_heads: int,
33
+ head_size: int,
34
+ cache_dtype: Optional[Union[str, torch.dtype]],
35
+ model_dtype: Optional[Union[str, torch.dtype]] = None,
36
+ seed: int = 0,
37
+ device: Optional[str] = "cuda",
38
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
39
+
40
+ if cache_dtype == "fp8" and head_size % 16:
41
+ raise ValueError(
42
+ f"Does not support key cache of type fp8 with head_size {head_size}"
43
+ )
44
+ from paged_attention.platforms import current_platform
45
+
46
+ current_platform.seed_everything(seed)
47
+
48
+ torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
49
+
50
+ scale = head_size**-0.5
51
+ x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
52
+ key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
53
+ key_caches: List[torch.Tensor] = []
54
+ for _ in range(num_layers):
55
+ key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device)
56
+ if cache_dtype in ["auto", "half", "bfloat16", "float"]:
57
+ key_cache.uniform_(-scale, scale)
58
+ elif cache_dtype == "fp8":
59
+ _generate_random_fp8(key_cache, -scale, scale)
60
+ else:
61
+ raise ValueError(f"Does not support key cache of type {cache_dtype}")
62
+ key_caches.append(key_cache)
63
+
64
+ value_cache_shape = (num_blocks, num_heads, head_size, block_size)
65
+ value_caches: List[torch.Tensor] = []
66
+ for _ in range(num_layers):
67
+ value_cache = torch.empty(
68
+ size=value_cache_shape, dtype=torch_dtype, device=device
69
+ )
70
+ if cache_dtype in ["auto", "half", "bfloat16", "float"]:
71
+ value_cache.uniform_(-scale, scale)
72
+ elif cache_dtype == "fp8":
73
+ _generate_random_fp8(value_cache, -scale, scale)
74
+ else:
75
+ raise ValueError(f"Does not support value cache of type {cache_dtype}")
76
+ value_caches.append(value_cache)
77
+ return key_caches, value_caches
78
+
79
+
80
+ def create_kv_caches_with_random_flash(
81
+ num_blocks: int,
82
+ block_size: int,
83
+ num_layers: int,
84
+ num_heads: int,
85
+ head_size: int,
86
+ cache_dtype: Optional[Union[str, torch.dtype]],
87
+ model_dtype: Optional[Union[str, torch.dtype]] = None,
88
+ seed: int = 0,
89
+ device: Optional[str] = "cuda",
90
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
91
+ from paged_attention.platforms import current_platform
92
+
93
+ current_platform.seed_everything(seed)
94
+
95
+ torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
96
+ key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
97
+ scale = head_size**-0.5
98
+
99
+ key_caches: List[torch.Tensor] = []
100
+ value_caches: List[torch.Tensor] = []
101
+
102
+ for _ in range(num_layers):
103
+ key_value_cache = torch.empty(
104
+ size=key_value_cache_shape, dtype=torch_dtype, device=device
105
+ )
106
+ if cache_dtype in ["auto", "half", "bfloat16", "float"]:
107
+ key_value_cache.uniform_(-scale, scale)
108
+ elif cache_dtype == "fp8":
109
+ _generate_random_fp8(key_value_cache, -scale, scale)
110
+ else:
111
+ raise ValueError(f"Does not support key cache of type {cache_dtype}")
112
+ key_caches.append(key_value_cache[:, 0])
113
+ value_caches.append(key_value_cache[:, 1])
114
+ return key_caches, value_caches
115
+
116
+
117
+ def get_kv_cache_torch_dtype(
118
+ cache_dtype: Optional[Union[str, torch.dtype]],
119
+ model_dtype: Optional[Union[str, torch.dtype]] = None,
120
+ ) -> torch.dtype:
121
+ if isinstance(cache_dtype, str):
122
+ if cache_dtype == "auto":
123
+ if isinstance(model_dtype, str):
124
+ torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
125
+ elif isinstance(model_dtype, torch.dtype):
126
+ torch_dtype = model_dtype
127
+ else:
128
+ raise ValueError(f"Invalid model dtype: {model_dtype}")
129
+ elif cache_dtype in ["half", "bfloat16", "float"]:
130
+ torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
131
+ elif cache_dtype == "fp8":
132
+ torch_dtype = torch.uint8
133
+ else:
134
+ raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
135
+ elif isinstance(cache_dtype, torch.dtype):
136
+ torch_dtype = cache_dtype
137
+ else:
138
+ raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
139
+ return torch_dtype
140
+
141
+
142
+ def _generate_random_fp8(
143
+ tensor: torch.Tensor,
144
+ low: float,
145
+ high: float,
146
+ ) -> None:
147
+ # NOTE(zhaoyang): Due to the NaN and Inf representations in the fp8 data types,
148
+ # Inf or NaN values may occur if we directly use torch.randint
149
+ # to generate random fp8 data.
150
+ # For example, s.11111.00 in fp8e5m2 format represents Inf.
151
+ # | E4M3 | E5M2
152
+ # -----|-------------|-------------------
153
+ # Inf | N/A | s.11111.00
154
+ # NaN | s.1111.111 | s.11111.{01,10,11}
155
+ tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
156
+ tensor_tmp.uniform_(low, high)
157
+ ops.convert_fp8(tensor, tensor_tmp)
158
+ del tensor_tmp
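
To make the cache shapes concrete, here is an illustrative sketch of what `create_kv_caches_with_random` returns for fp16 (import path and CUDA availability assumed):

```python
import torch

from tests.kernels.conftest import create_kv_caches_with_random  # path assumed

key_caches, value_caches = create_kv_caches_with_random(
    num_blocks=128,
    block_size=16,
    num_layers=1,
    num_heads=8,
    head_size=64,
    cache_dtype="auto",
    model_dtype=torch.float16,
    device="cuda",
)
# For fp16, x = 16 bytes / 2 bytes per element = 8, so:
assert key_caches[0].shape == (128, 8, 64 // 8, 16, 8)
assert value_caches[0].shape == (128, 8, 64, 16)
```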
tests/kernels/test_attention.py ADDED
@@ -0,0 +1,418 @@
1
+ import random
2
+ from typing import List, Optional, Tuple
3
+
4
+ import paged_attention as ops
5
+ import pytest
6
+ import torch
7
+ from paged_attention.platforms import current_platform
8
+
9
+ from .allclose_default import get_default_atol, get_default_rtol
10
+ from .utils import get_max_shared_memory_bytes, opcheck
11
+
12
+ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
13
+ # This will change depending on the compute capability.
14
+ # Subtract 512 as a buffer.
15
+ MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
16
+ # There may not be enough gpu memory due to large NUM_BLOCKS.
17
+ # Reduce NUM_BLOCKS when it happens.
18
+ NUM_BLOCKS = 4321 # Arbitrary values for testing
19
+ PARTITION_SIZE = 512
20
+ # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
21
+ DTYPES = (
22
+ [torch.half, torch.bfloat16, torch.float]
23
+ if not current_platform.is_rocm()
24
+ else [torch.half, torch.bfloat16]
25
+ )
26
+ NUM_GEN_SEQS = [7] # Arbitrary values for testing
27
+ NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
28
+ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
29
+
30
+ # This should be kept in sync with get_supported_head_sizes() in
31
+ # vllm.attention.ops.paged_attn.PagedAttention
32
+ HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
33
+
34
+ BLOCK_SIZES = [16, 32]
35
+ USE_ALIBI = [False, True]
36
+ KV_CACHE_DTYPE = ["auto", "fp8"]
37
+ SEEDS = [0]
38
+ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
39
+
40
+
41
+ def ref_masked_attention(
42
+ query: torch.Tensor,
43
+ key: torch.Tensor,
44
+ value: torch.Tensor,
45
+ scale: float,
46
+ attn_mask: Optional[torch.Tensor] = None,
47
+ ) -> torch.Tensor:
48
+ attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
49
+ if attn_mask is not None:
50
+ attn_weights = attn_weights + attn_mask.float()
51
+ attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
52
+ out = torch.einsum("hqk,khd->qhd", attn_weights, value)
53
+ return out
54
+
55
+
56
+ def ref_single_query_cached_kv_attention(
57
+ output: torch.Tensor,
58
+ query: torch.Tensor,
59
+ num_queries_per_kv: int,
60
+ key_cache: torch.Tensor,
61
+ value_cache: torch.Tensor,
62
+ block_tables: torch.Tensor,
63
+ seq_lens: torch.Tensor,
64
+ scale: float,
65
+ alibi_slopes: Optional[torch.Tensor],
66
+ ) -> None:
67
+ num_query_heads = query.shape[1]
68
+ num_kv_heads = value_cache.shape[1]
69
+ head_size = value_cache.shape[2]
70
+ block_size = value_cache.shape[3]
71
+ num_seqs = query.shape[0]
72
+
73
+ block_tables_lst = block_tables.cpu().tolist()
74
+ seq_lens_lst = seq_lens.cpu().tolist()
75
+ for i in range(num_seqs):
76
+ q = query[i].unsqueeze(0)
77
+ block_table = block_tables_lst[i]
78
+ seq_len = int(seq_lens_lst[i])
79
+
80
+ keys_lst: List[torch.Tensor] = []
81
+ values_lst: List[torch.Tensor] = []
82
+ for j in range(seq_len):
83
+ block_number = int(block_table[j // block_size])
84
+ block_offset = j % block_size
85
+
86
+ k = key_cache[block_number, :, :, block_offset, :]
87
+ k = k.reshape(num_kv_heads, head_size)
88
+ keys_lst.append(k)
89
+
90
+ v = value_cache[block_number, :, :, block_offset]
91
+ values_lst.append(v)
92
+ keys = torch.stack(keys_lst, dim=0)
93
+ values = torch.stack(values_lst, dim=0)
94
+ if num_queries_per_kv > 1:
95
+ # Handle MQA and GQA
96
+ keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
97
+ values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
98
+
99
+ alibi_bias = None
100
+ if alibi_slopes is not None:
101
+ # Create the ALiBi bias used in the paged attention kernel.
102
+ position_ids = torch.arange(seq_len).int()
103
+ alibi_bias = (position_ids - seq_len + 1).float()
104
+ alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(1, 1, -1)
105
+
106
+ out = ref_masked_attention(q, keys, values, scale, alibi_bias)
107
+ out = out.view(num_query_heads, head_size)
108
+ output[i].copy_(out, non_blocking=True)
109
+
110
+
111
+ @pytest.mark.parametrize(
112
+ "version", ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]
113
+ )
114
+ @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
115
+ @pytest.mark.parametrize("num_heads", NUM_HEADS)
116
+ @pytest.mark.parametrize("head_size", HEAD_SIZES)
117
+ @pytest.mark.parametrize("use_alibi", USE_ALIBI)
118
+ @pytest.mark.parametrize("block_size", BLOCK_SIZES)
119
+ @pytest.mark.parametrize("dtype", DTYPES)
120
+ @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
121
+ @pytest.mark.parametrize("seed", SEEDS)
122
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
123
+ def test_paged_attention(
124
+ kv_cache_factory,
125
+ version: str,
126
+ num_seqs: int,
127
+ num_heads: Tuple[int, int],
128
+ head_size: int,
129
+ use_alibi: bool,
130
+ block_size: int,
131
+ dtype: torch.dtype,
132
+ kv_cache_dtype: str,
133
+ seed: int,
134
+ device: str,
135
+ ) -> None:
136
+ if (kv_cache_dtype == "fp8" and head_size % 16) or (
137
+ version == "rocm" and head_size not in (64, 128)
138
+ ):
139
+ pytest.skip()
140
+
141
+ current_platform.seed_everything(seed)
142
+ torch.set_default_device(device)
143
+ scale = float(1.0 / (head_size**0.5))
144
+ num_query_heads, num_kv_heads = num_heads
145
+ query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
146
+ query.uniform_(-scale, scale)
147
+
148
+ assert num_query_heads % num_kv_heads == 0
149
+ num_queries_per_kv = num_query_heads // num_kv_heads
150
+ alibi_slopes = None
151
+ if use_alibi:
152
+ alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
153
+
154
+ seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
155
+ seq_lens[-1] = MAX_SEQ_LEN
156
+ max_seq_len = max(seq_lens)
157
+ seq_lens = torch.tensor(seq_lens, dtype=torch.int)
158
+
159
+ # Create the block tables.
160
+ max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
161
+ block_tables_lst: List[List[int]] = []
162
+ for _ in range(num_seqs):
163
+ block_table = [
164
+ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq)
165
+ ]
166
+ block_tables_lst.append(block_table)
167
+
168
+ block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
169
+
170
+ # Create the KV caches.
171
+ key_caches, value_caches = kv_cache_factory(
172
+ NUM_BLOCKS,
173
+ block_size,
174
+ 1,
175
+ num_kv_heads,
176
+ head_size,
177
+ kv_cache_dtype,
178
+ dtype,
179
+ seed,
180
+ device,
181
+ )
182
+ key_cache, value_cache = key_caches[0], value_caches[0]
183
+
184
+ # Using default kv_scale
185
+ k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
186
+
187
+ # Call the paged attention kernel.
188
+ output = torch.empty_like(query)
189
+ if version == "v1":
190
+ ops.paged_attention_v1(
191
+ output,
192
+ query,
193
+ key_cache,
194
+ value_cache,
195
+ num_kv_heads,
196
+ scale,
197
+ block_tables,
198
+ seq_lens,
199
+ block_size,
200
+ max_seq_len,
201
+ alibi_slopes,
202
+ kv_cache_dtype,
203
+ k_scale,
204
+ v_scale,
205
+ )
206
+
207
+ opcheck(
208
+ ops.ops.paged_attention_v1,
209
+ (
210
+ output,
211
+ query,
212
+ key_cache,
213
+ value_cache,
214
+ num_kv_heads,
215
+ scale,
216
+ block_tables,
217
+ seq_lens,
218
+ block_size,
219
+ max_seq_len,
220
+ alibi_slopes,
221
+ kv_cache_dtype,
222
+ k_scale,
223
+ v_scale,
224
+ 0,
225
+ 0,
226
+ 0,
227
+ 64,
228
+ 0,
229
+ ),
230
+ cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
231
+ )
232
+
233
+ elif version in ("v2", "rocm"):
234
+ num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
235
+ assert PARTITION_SIZE % block_size == 0
236
+ num_seqs, num_heads, head_size = output.shape
237
+ tmp_output = torch.empty(
238
+ size=(num_seqs, num_heads, num_partitions, head_size),
239
+ dtype=output.dtype,
240
+ )
241
+ exp_sums = torch.empty(
242
+ size=(num_seqs, num_heads, num_partitions),
243
+ dtype=torch.float32,
244
+ )
245
+ max_logits = torch.empty_like(exp_sums)
246
+ if version == "v2":
247
+ ops.paged_attention_v2(
248
+ output,
249
+ exp_sums,
250
+ max_logits,
251
+ tmp_output,
252
+ query,
253
+ key_cache,
254
+ value_cache,
255
+ num_kv_heads,
256
+ scale,
257
+ block_tables,
258
+ seq_lens,
259
+ block_size,
260
+ max_seq_len,
261
+ alibi_slopes,
262
+ kv_cache_dtype,
263
+ k_scale,
264
+ v_scale,
265
+ )
266
+
267
+ opcheck(
268
+ ops.ops.paged_attention_v2,
269
+ (
270
+ output,
271
+ exp_sums,
272
+ max_logits,
273
+ tmp_output,
274
+ query,
275
+ key_cache,
276
+ value_cache,
277
+ num_kv_heads,
278
+ scale,
279
+ block_tables,
280
+ seq_lens,
281
+ block_size,
282
+ max_seq_len,
283
+ alibi_slopes,
284
+ kv_cache_dtype,
285
+ k_scale,
286
+ v_scale,
287
+ 0,
288
+ 0,
289
+ 0,
290
+ 64,
291
+ 0,
292
+ ),
293
+ cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
294
+ )
295
+
296
+ else:
297
+ ops.paged_attention_rocm(
298
+ output,
299
+ exp_sums,
300
+ max_logits,
301
+ tmp_output,
302
+ query,
303
+ key_cache,
304
+ value_cache,
305
+ num_kv_heads,
306
+ scale,
307
+ block_tables,
308
+ seq_lens,
309
+ block_size,
310
+ max_seq_len,
311
+ alibi_slopes,
312
+ kv_cache_dtype,
313
+ k_scale,
314
+ v_scale,
315
+ )
316
+
317
+ opcheck(
318
+ torch.ops._rocm_C.paged_attention,
319
+ (
320
+ output,
321
+ exp_sums,
322
+ max_logits,
323
+ tmp_output,
324
+ query,
325
+ key_cache,
326
+ value_cache,
327
+ num_kv_heads,
328
+ scale,
329
+ block_tables,
330
+ seq_lens,
331
+ block_size,
332
+ max_seq_len,
333
+ alibi_slopes,
334
+ kv_cache_dtype,
335
+ k_scale,
336
+ v_scale,
337
+ ),
338
+ cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
339
+ )
340
+
341
+ else:
342
+ raise AssertionError(f"Unknown version: {version}")
343
+
344
+ # Run the reference implementation.
345
+ if kv_cache_dtype == "fp8":
346
+ # Convert cache data back to dtype.
347
+ x = 16 // torch.tensor([], dtype=dtype).element_size()
348
+ key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
349
+ dequantized_key_cache = torch.empty(
350
+ size=key_cache_shape, dtype=dtype, device=device
351
+ )
352
+ ops.convert_fp8(dequantized_key_cache, key_cache)
353
+ key_cache = dequantized_key_cache
354
+
355
+ value_cache_shape = value_cache.shape
356
+ dequantized_value_cache = torch.empty(
357
+ size=value_cache_shape, dtype=dtype, device=device
358
+ )
359
+ ops.convert_fp8(dequantized_value_cache, value_cache)
360
+ value_cache = dequantized_value_cache
361
+
362
+ ref_output = torch.empty_like(query)
363
+ ref_single_query_cached_kv_attention(
364
+ ref_output,
365
+ query,
366
+ num_queries_per_kv,
367
+ key_cache,
368
+ value_cache,
369
+ block_tables,
370
+ seq_lens,
371
+ scale,
372
+ alibi_slopes,
373
+ )
374
+
375
+ # NOTE(woosuk): Due to the kernel-level differences in the two
376
+ # implementations, there is a small numerical difference in the two
377
+ # outputs. Thus, we use a relaxed tolerance for the test.
378
+ atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
379
+ rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
380
+
381
+ # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
382
+ # so we use a relaxed tolerance for the test.
383
+ atol, rtol = 1e-3, 1e-5
384
+ if kv_cache_dtype == "fp8":
385
+ atol, rtol = 1e-2, 1e-5
386
+ torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
387
+
388
+
389
+ def ref_multi_query_kv_attention(
390
+ cu_seq_lens: List[int],
391
+ query: torch.Tensor,
392
+ key: torch.Tensor,
393
+ value: torch.Tensor,
394
+ scale: float,
395
+ dtype: torch.dtype,
396
+ ) -> torch.Tensor:
397
+ num_seqs = len(cu_seq_lens) - 1
398
+ ref_outputs: List[torch.Tensor] = []
399
+ for i in range(num_seqs):
400
+ start_idx = cu_seq_lens[i]
401
+ end_idx = cu_seq_lens[i + 1]
402
+ seq_len = end_idx - start_idx
403
+
404
+ # Create attention mask.
405
+ attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1)
406
+ attn_mask = attn_mask * torch.finfo(dtype).min
407
+ attn_mask = attn_mask.to(dtype=dtype)
408
+
409
+ ref_output = ref_masked_attention(
410
+ query[start_idx:end_idx],
411
+ key[start_idx:end_idx],
412
+ value[start_idx:end_idx],
413
+ scale,
414
+ attn_mask=attn_mask,
415
+ )
416
+ ref_outputs.append(ref_output)
417
+
418
+ return torch.cat(ref_outputs, dim=0)
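
The reference implementation above resolves each token position through the block table before gathering keys and values; a tiny sketch of that lookup (illustrative only):

```python
def lookup(block_table, j, block_size):
    # Which physical block holds token j, and where inside that block it sits.
    block_number = block_table[j // block_size]
    block_offset = j % block_size
    return block_number, block_offset


# With block_size=16, token 37 of a sequence whose block table is [5, 9, 2]
# lives in physical block 2, at offset 5 inside that block.
assert lookup([5, 9, 2], 37, 16) == (2, 5)
```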
tests/kernels/test_cache.py ADDED
@@ -0,0 +1,486 @@
1
+ import random
2
+ from typing import List, Tuple
3
+
4
+ import paged_attention as ops
5
+ import pytest
6
+ import torch
7
+ from paged_attention.platforms import current_platform
8
+
9
+ from .utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
10
+
11
+ COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
12
+ DTYPES = [torch.half, torch.bfloat16, torch.float]
13
+ NUM_TOKENS = [42] # Arbitrary values for testing
14
+ NUM_LAYERS = [1] # Arbitrary values for testing
15
+ NUM_HEADS = [8] # Arbitrary values for testing
16
+ HEAD_SIZES = [64, 80, 120, 256]
17
+ BLOCK_SIZES = [8, 16, 32]
18
+
19
+ # Arbitrary values for testing
20
+ # don't make it too large. e.g. [1024, 36000] will OOM
21
+ NUM_BLOCKS = [1024, 10000]
22
+
23
+ NUM_MAPPINGS = [256] # Arbitrary values for testing
24
+ SEEDS = [0]
25
+ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
26
+
27
+ # We assume fp8 is always enabled for testing.
28
+ KV_CACHE_DTYPE = ["auto", "fp8"]
29
+
30
+
31
+ @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
32
+ @pytest.mark.parametrize("num_layers", NUM_LAYERS)
33
+ @pytest.mark.parametrize("num_heads", NUM_HEADS)
34
+ @pytest.mark.parametrize("head_size", HEAD_SIZES)
35
+ @pytest.mark.parametrize("block_size", BLOCK_SIZES)
36
+ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
37
+ @pytest.mark.parametrize("dtype", DTYPES)
38
+ @pytest.mark.parametrize("seed", SEEDS)
39
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
40
+ @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
41
+ @torch.inference_mode()
42
+ def test_copy_blocks(
43
+ kv_cache_factory,
44
+ num_mappings: int,
45
+ num_layers: int,
46
+ num_heads: int,
47
+ head_size: int,
48
+ block_size: int,
49
+ num_blocks: int,
50
+ dtype: torch.dtype,
51
+ seed: int,
52
+ kv_cache_dtype: str,
53
+ device: str,
54
+ ) -> None:
55
+ if kv_cache_dtype == "fp8" and head_size % 16:
56
+ pytest.skip()
57
+ current_platform.seed_everything(seed)
58
+ torch.set_default_device(device)
59
+ # Generate random block mappings where each source block is mapped to two
60
+ # destination blocks.
61
+ assert 2 * num_mappings <= num_blocks
62
+ src_blocks = random.sample(range(num_blocks), num_mappings)
63
+ remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
64
+ dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
65
+ block_mapping: List[Tuple[int, int]] = []
66
+ for i in range(num_mappings):
67
+ src = src_blocks[i]
68
+ dst1 = dst_blocks[2 * i]
69
+ dst2 = dst_blocks[2 * i + 1]
70
+ block_mapping.append((src, dst1))
71
+ block_mapping.append((src, dst2))
72
+
73
+ # Create the KV caches.
74
+ key_caches, value_caches = kv_cache_factory(
75
+ num_blocks,
76
+ block_size,
77
+ num_layers,
78
+ num_heads,
79
+ head_size,
80
+ kv_cache_dtype,
81
+ dtype,
82
+ seed,
83
+ device,
84
+ )
85
+
86
+ # Clone the KV caches.
87
+ cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
88
+ cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
89
+
90
+ # Call the copy blocks kernel.
91
+ block_mapping_tensor = torch.tensor(
92
+ block_mapping, dtype=torch.int64, device=device
93
+ ).view(-1, 2)
94
+
95
+ opcheck(
96
+ ops.ops.copy_blocks,
97
+ (key_caches, value_caches, block_mapping_tensor),
98
+ test_utils=DEFAULT_OPCHECK_TEST_UTILS,
99
+ cond=(head_size == HEAD_SIZES[0]),
100
+ )
101
+ ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
102
+
103
+ # Run the reference implementation.
104
+ for src, dst in block_mapping:
105
+ for cloned_key_cache in cloned_key_caches:
106
+ cloned_key_cache[dst].copy_(cloned_key_cache[src])
107
+ for cloned_value_cache in cloned_value_caches:
108
+ cloned_value_cache[dst].copy_(cloned_value_cache[src])
109
+
110
+ # Compare the results.
111
+ for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
112
+ torch.testing.assert_close(key_cache, cloned_key_cache)
113
+ for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
114
+ torch.testing.assert_close(value_cache, cloned_value_cache)
115
+
116
+
117
+ @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
118
+ @pytest.mark.parametrize("num_heads", NUM_HEADS)
119
+ @pytest.mark.parametrize("head_size", HEAD_SIZES)
120
+ @pytest.mark.parametrize("block_size", BLOCK_SIZES)
121
+ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
122
+ @pytest.mark.parametrize("dtype", DTYPES)
123
+ @pytest.mark.parametrize("seed", SEEDS)
124
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
125
+ @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
126
+ @torch.inference_mode()
127
+ def test_reshape_and_cache(
128
+ kv_cache_factory,
129
+ num_tokens: int,
130
+ num_heads: int,
131
+ head_size: int,
132
+ block_size: int,
133
+ num_blocks: int,
134
+ dtype: torch.dtype,
135
+ seed: int,
136
+ device: str,
137
+ kv_cache_dtype: str,
138
+ ) -> None:
139
+ if kv_cache_dtype == "fp8" and head_size % 16:
140
+ pytest.skip()
141
+ current_platform.seed_everything(seed)
142
+ torch.set_default_device(device)
143
+ # Create a random slot mapping.
144
+ num_slots = block_size * num_blocks
145
+ slot_mapping_lst = random.sample(range(num_slots), num_tokens)
146
+ slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long)
147
+
148
+ qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
149
+ _, key, value = qkv.unbind(dim=1)
150
+
151
+ # Create the KV caches.
152
+ key_caches, value_caches = kv_cache_factory(
153
+ num_blocks,
154
+ block_size,
155
+ 1,
156
+ num_heads,
157
+ head_size,
158
+ kv_cache_dtype,
159
+ dtype,
160
+ seed,
161
+ device,
162
+ )
163
+ key_cache, value_cache = key_caches[0], value_caches[0]
164
+
165
+ # Clone the KV caches.
166
+ if kv_cache_dtype == "fp8":
167
+ cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
168
+ ops.convert_fp8(cloned_key_cache, key_cache)
169
+ cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
170
+ ops.convert_fp8(cloned_value_cache, value_cache)
171
+ else:
172
+ cloned_key_cache = key_cache.clone()
173
+ cloned_value_cache = value_cache.clone()
174
+
175
+ # Using default kv_scale
176
+ k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
177
+
178
+ # Call the reshape_and_cache kernel.
179
+ opcheck(
180
+ ops.ops.reshape_and_cache,
181
+ (
182
+ key,
183
+ value,
184
+ key_cache,
185
+ value_cache,
186
+ slot_mapping,
187
+ kv_cache_dtype,
188
+ k_scale,
189
+ v_scale,
190
+ ),
191
+ cond=(head_size == HEAD_SIZES[0]),
192
+ )
193
+ ops.reshape_and_cache(
194
+ key,
195
+ value,
196
+ key_cache,
197
+ value_cache,
198
+ slot_mapping,
199
+ kv_cache_dtype,
200
+ k_scale,
201
+ v_scale,
202
+ )
203
+
204
+ if kv_cache_dtype == "fp8":
205
+ result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
206
+ ops.convert_fp8(result_key_cache, key_cache)
207
+ result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
208
+ ops.convert_fp8(result_value_cache, value_cache)
209
+
210
+ # Run the reference implementation.
211
+ reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
212
+ block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
213
+ block_indicies_lst = block_indicies.cpu().tolist()
214
+ block_offsets = slot_mapping % block_size
215
+ block_offsets_lst = block_offsets.cpu().tolist()
216
+ for i in range(num_tokens):
217
+ block_idx = block_indicies_lst[i]
218
+ block_offset = block_offsets_lst[i]
219
+ cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
220
+ cloned_value_cache[block_idx, :, :, block_offset] = value[i]
221
+
222
+ if kv_cache_dtype == "fp8":
223
+ torch.testing.assert_close(
224
+ result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1
225
+ )
226
+ torch.testing.assert_close(
227
+ result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1
228
+ )
229
+ else:
230
+ torch.testing.assert_close(key_cache, cloned_key_cache)
231
+ torch.testing.assert_close(value_cache, cloned_value_cache)
232
+
233
+
234
+ @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
235
+ @pytest.mark.parametrize("num_heads", NUM_HEADS)
236
+ @pytest.mark.parametrize("head_size", HEAD_SIZES)
237
+ @pytest.mark.parametrize("block_size", BLOCK_SIZES)
238
+ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
239
+ @pytest.mark.parametrize("dtype", DTYPES)
240
+ @pytest.mark.parametrize("seed", SEEDS)
241
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
242
+ @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
243
+ @torch.inference_mode()
244
+ def test_reshape_and_cache_flash(
245
+ kv_cache_factory_flashinfer,
246
+ num_tokens: int,
247
+ num_heads: int,
248
+ head_size: int,
249
+ block_size: int,
250
+ num_blocks: int,
251
+ dtype: torch.dtype,
252
+ seed: int,
253
+ device: str,
254
+ kv_cache_dtype: str,
255
+ ) -> None:
256
+ current_platform.seed_everything(seed)
257
+ torch.set_default_device(device)
258
+
259
+ # Create a random slot mapping.
260
+ num_slots = block_size * num_blocks
261
+ slot_mapping_lst = random.sample(range(num_slots), num_tokens)
262
+ slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
263
+
264
+ qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype, device=device)
265
+ _, key, value = qkv.unbind(dim=1)
266
+
267
+ # Create the KV caches.
268
+ key_caches, value_caches = kv_cache_factory_flashinfer(
269
+ num_blocks,
270
+ block_size,
271
+ 1,
272
+ num_heads,
273
+ head_size,
274
+ kv_cache_dtype,
275
+ dtype,
276
+ device=device,
277
+ )
278
+ key_cache, value_cache = key_caches[0].contiguous(), value_caches[0].contiguous()
279
+ del key_caches
280
+ del value_caches
281
+
282
+ k_scale = (key.amax() / 256.0).to(torch.float32)
283
+ v_scale = (value.amax() / 256.0).to(torch.float32)
284
+
285
+ # Clone the KV caches.
286
+ if kv_cache_dtype == "fp8":
287
+ cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
288
+ ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype)
289
+ cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
290
+ ops.convert_fp8(cloned_value_cache, value_cache, v_scale, kv_cache_dtype)
291
+ else:
292
+ cloned_key_cache = key_cache.clone()
293
+ cloned_value_cache = value_cache.clone()
294
+
295
+ # Call the reshape_and_cache kernel.
296
+ opcheck(
297
+ ops.ops.reshape_and_cache_flash,
298
+ (
299
+ key,
300
+ value,
301
+ key_cache,
302
+ value_cache,
303
+ slot_mapping,
304
+ kv_cache_dtype,
305
+ k_scale,
306
+ v_scale,
307
+ ),
308
+ cond=(head_size == HEAD_SIZES[0]),
309
+ )
310
+ ops.reshape_and_cache_flash(
311
+ key,
312
+ value,
313
+ key_cache,
314
+ value_cache,
315
+ slot_mapping,
316
+ kv_cache_dtype,
317
+ k_scale,
318
+ v_scale,
319
+ )
320
+
321
+ if kv_cache_dtype == "fp8":
322
+ result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
323
+ ops.convert_fp8(
324
+ result_key_cache, key_cache, k_scale.item(), kv_dtype=kv_cache_dtype
325
+ )
326
+ result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
327
+ ops.convert_fp8(
328
+ result_value_cache, value_cache, v_scale.item(), kv_dtype=kv_cache_dtype
329
+ )
330
+
331
+ # Run the reference implementation.
332
+ block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
333
+ block_indicies_lst = block_indicies.cpu().tolist()
334
+ block_offsets = slot_mapping % block_size
335
+ block_offsets_lst = block_offsets.cpu().tolist()
336
+ for i in range(num_tokens):
337
+ block_idx = block_indicies_lst[i]
338
+ block_offset = block_offsets_lst[i]
339
+ cloned_key_cache[block_idx, block_offset, :, :] = key[i]
340
+ cloned_value_cache[block_idx, block_offset, :, :] = value[i]
341
+
342
+ if kv_cache_dtype == "fp8":
343
+ torch.testing.assert_close(
344
+ result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1
345
+ )
346
+ torch.testing.assert_close(
347
+ result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1
348
+ )
349
+ else:
350
+ torch.testing.assert_close(key_cache, cloned_key_cache)
351
+ torch.testing.assert_close(value_cache, cloned_value_cache)
352
+
353
+
354
+ @pytest.mark.parametrize("direction", COPYING_DIRECTION)
355
+ @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
356
+ @pytest.mark.parametrize("num_heads", NUM_HEADS)
357
+ @pytest.mark.parametrize("head_size", HEAD_SIZES)
358
+ @pytest.mark.parametrize("block_size", BLOCK_SIZES)
359
+ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
360
+ @pytest.mark.parametrize("dtype", DTYPES)
361
+ @pytest.mark.parametrize("seed", SEEDS)
362
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
363
+ @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
364
+ @torch.inference_mode()
365
+ def test_swap_blocks(
366
+ kv_cache_factory,
367
+ direction: Tuple[str, str],
368
+ num_mappings: int,
369
+ num_heads: int,
370
+ head_size: int,
371
+ block_size: int,
372
+ num_blocks: int,
373
+ dtype: torch.dtype,
374
+ seed: int,
375
+ device: str,
376
+ kv_cache_dtype: str,
377
+ ) -> None:
378
+ if kv_cache_dtype == "fp8" and "cpu" in direction:
379
+ pytest.skip()
380
+ if kv_cache_dtype == "fp8" and head_size % 16:
381
+ pytest.skip()
382
+
383
+ current_platform.seed_everything(seed)
384
+
385
+ src_device = device if direction[0] == "cuda" else "cpu"
386
+ dst_device = device if direction[1] == "cuda" else "cpu"
387
+
388
+ src_blocks = random.sample(range(num_blocks), num_mappings)
389
+ # For the same device, mappings must not overlap
390
+ if src_device == dst_device:
391
+ remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
392
+ dst_blocks = random.sample(remaining_blocks, num_mappings)
393
+ else:
394
+ dst_blocks = random.sample(range(num_blocks), num_mappings)
395
+
396
+ block_mapping = list(zip(src_blocks, dst_blocks))
397
+ block_mapping_tensor = torch.tensor(
398
+ block_mapping, dtype=torch.int64, device="cpu"
399
+ ).view(-1, 2)
400
+
401
+ # Create the KV caches on the first device.
402
+ src_key_caches, src_value_caches = kv_cache_factory(
403
+ num_blocks,
404
+ block_size,
405
+ 1,
406
+ num_heads,
407
+ head_size,
408
+ kv_cache_dtype,
409
+ dtype,
410
+ seed,
411
+ src_device,
412
+ )
413
+
414
+ # Create the KV caches on the second device.
415
+ dist_key_caches, dist_value_caches = kv_cache_factory(
416
+ num_blocks,
417
+ block_size,
418
+ 1,
419
+ num_heads,
420
+ head_size,
421
+ kv_cache_dtype,
422
+ dtype,
423
+ seed,
424
+ dst_device,
425
+ )
426
+
427
+ src_key_caches_clone = src_key_caches[0].clone()
428
+ src_value_caches_clone = src_value_caches[0].clone()
429
+
430
+ # Call the swap_blocks kernel.
431
+ do_opcheck = head_size == HEAD_SIZES[0]
432
+ opcheck(
433
+ ops.ops.swap_blocks,
434
+ (src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
435
+ cond=do_opcheck,
436
+ )
437
+ opcheck(
438
+ ops.ops.swap_blocks,
439
+ (src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
440
+ cond=do_opcheck,
441
+ )
442
+
443
+ ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping_tensor)
444
+ ops.swap_blocks(src_value_caches[0], dist_value_caches[0], block_mapping_tensor)
445
+
446
+ for src, dst in block_mapping:
447
+ torch.testing.assert_close(
448
+ src_key_caches_clone[src].cpu(), dist_key_caches[0][dst].cpu()
449
+ )
450
+ torch.testing.assert_close(
451
+ src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu()
452
+ )
453
+
454
+
455
+ @pytest.mark.parametrize("num_heads", NUM_HEADS)
456
+ @pytest.mark.parametrize("head_size", HEAD_SIZES)
457
+ @pytest.mark.parametrize("block_size", BLOCK_SIZES)
458
+ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
459
+ @pytest.mark.parametrize("dtype", DTYPES)
460
+ @pytest.mark.parametrize("seed", SEEDS)
461
+ @pytest.mark.parametrize("device", CUDA_DEVICES)
462
+ @torch.inference_mode()
463
+ def test_fp8_e4m3_conversion(
464
+ num_heads: int,
465
+ head_size: int,
466
+ block_size: int,
467
+ num_blocks: int,
468
+ dtype: torch.dtype,
469
+ seed: int,
470
+ device: str,
471
+ ) -> None:
472
+ current_platform.seed_everything(seed)
473
+
474
+ low = -224.0
475
+ high = 224.0
476
+ shape = (num_blocks, num_heads, head_size, block_size)
477
+ cache = torch.empty(shape, dtype=dtype, device=device)
478
+ cache.uniform_(low, high)
479
+
480
+ cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
481
+ ops.convert_fp8(cache_fp8, cache)
482
+
483
+ converted_cache = torch.empty_like(cache)
484
+ ops.convert_fp8(converted_cache, cache_fp8)
485
+
486
+ torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
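
The swap test above only checks per-block equality after the call, so the intended semantics can be sketched as a plain block-wise copy (the real op also handles CPU-GPU transfers); illustrative only:

```python
import torch


def swap_blocks_reference(
    src_cache: torch.Tensor,
    dst_cache: torch.Tensor,
    block_mapping: torch.Tensor,  # [num_pairs, 2] of (src, dst) block indices
) -> None:
    for src, dst in block_mapping.tolist():
        dst_cache[dst].copy_(src_cache[src])
```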
tests/kernels/utils.py ADDED
@@ -0,0 +1,92 @@
1
+ """Kernel test utils"""
2
+
3
+ import itertools
4
+ import random
5
+ import unittest
6
+ from functools import lru_cache
7
+ from numbers import Number
8
+ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
9
+
10
+ import pytest
11
+ import torch
12
+ from torch._prims_common import TensorLikeType
13
+
14
+ # For now, disable "test_aot_dispatch_dynamic" since there are some
15
+ # bugs related to this test in PyTorch 2.4.
16
+ DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
17
+ "test_schema",
18
+ "test_autograd_registration",
19
+ "test_faketensor",
20
+ )
21
+
22
+ ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = (
23
+ "test_schema",
24
+ "test_autograd_registration",
25
+ "test_faketensor",
26
+ "test_aot_dispatch_dynamic",
27
+ )
28
+
29
+
30
+ # Copied/modified from torch._refs.__init__.py
31
+ def fp8_allclose(
32
+ a: TensorLikeType,
33
+ b: TensorLikeType,
34
+ rtol: float = 1e-05,
35
+ atol: float = 1e-08,
36
+ equal_nan: bool = False,
37
+ ) -> bool:
38
+ """
39
+ Reference implementation of torch.allclose
40
+ """
41
+ torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol)
42
+
43
+ return bool(
44
+ torch.all(
45
+ torch.isclose(
46
+ a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan
47
+ )
48
+ ).item()
49
+ )
50
+
51
+
52
+ def compute_max_diff(output, output_ref):
53
+ return torch.mean(torch.abs(output - output_ref)) / torch.mean(
54
+ torch.abs(output_ref)
55
+ )
56
+
57
+
58
+ # A special version of op check that has a restricted default set of test_utils
59
+ # and a patched version of allclose that supports fp8 types.
60
+ def opcheck(
61
+ op: Union[
62
+ torch._ops.OpOverload,
63
+ torch._ops.OpOverloadPacket,
64
+ torch._library.custom_ops.CustomOpDef,
65
+ ],
66
+ args: Tuple[Any, ...],
67
+ kwargs: Optional[Dict[str, Any]] = None,
68
+ *,
69
+ test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
70
+ raise_exception: bool = True,
71
+ cond: bool = True
72
+ ) -> Dict[str, str]:
73
+ with unittest.mock.patch("torch.allclose", new=fp8_allclose):
74
+ return (
75
+ torch.library.opcheck(
76
+ op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception
77
+ )
78
+ if cond
79
+ else {}
80
+ )
81
+
82
+
83
+ @lru_cache(maxsize=None)
84
+ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
85
+ """Returns the maximum shared memory per thread block in bytes."""
86
+ from paged_attention import ops
87
+
88
+ max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu)
89
+ # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
90
+ # will fail
91
+ assert max_shared_mem > 0, "max_shared_mem can not be zero"
92
+ return int(max_shared_mem)
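
A small usage sketch of `fp8_allclose` (import path assumed): it follows the `torch.allclose` contract but compares in float64, and it is what `opcheck` patches in place of `torch.allclose`.

```python
import torch

from tests.kernels.utils import fp8_allclose  # path assumed

a = torch.tensor([1.0, 2.0, float("nan")])
b = torch.tensor([1.0, 2.0, float("nan")])
assert fp8_allclose(a, b, equal_nan=True)  # NaNs treated as equal
assert not fp8_allclose(a, b + 1.0)        # values differ -> False
```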