Erland committed on
Commit bec1e88 · verified · 1 parent: 7fdd671

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff.
Files changed (50)
  1. fla/__init__.py +110 -0
  2. fla/__pycache__/__init__.cpython-311.pyc +0 -0
  3. fla/__pycache__/utils.cpython-311.pyc +0 -0
  4. fla/ops/__init__.py +45 -0
  5. fla/ops/__pycache__/__init__.cpython-311.pyc +0 -0
  6. fla/ops/generalized_delta_rule/__pycache__/__init__.cpython-311.pyc +0 -0
  7. fla/ops/gla/__init__.py +11 -0
  8. fla/ops/gla/fused_recurrent.py +113 -0
  9. fla/ops/gsa/__init__.py +9 -0
  10. fla/ops/gsa/__pycache__/__init__.cpython-311.pyc +0 -0
  11. fla/ops/gsa/__pycache__/chunk.cpython-311.pyc +0 -0
  12. fla/ops/gsa/__pycache__/fused_recurrent.cpython-311.pyc +0 -0
  13. fla/ops/gsa/chunk.py +1264 -0
  14. fla/ops/gsa/fused_recurrent.py +564 -0
  15. fla/ops/gsa/naive.py +68 -0
  16. fla/ops/hgrn/__init__.py +9 -0
  17. fla/ops/hgrn/__pycache__/__init__.cpython-311.pyc +0 -0
  18. fla/ops/hgrn/__pycache__/chunk.cpython-311.pyc +0 -0
  19. fla/ops/hgrn/__pycache__/fused_recurrent.cpython-311.pyc +0 -0
  20. fla/ops/hgrn/fused_recurrent.py +308 -0
  21. fla/ops/hgrn/naive.py +63 -0
  22. fla/ops/lightning_attn/__pycache__/__init__.cpython-311.pyc +0 -0
  23. fla/ops/lightning_attn/__pycache__/chunk.cpython-311.pyc +0 -0
  24. fla/ops/lightning_attn/chunk.py +74 -0
  25. fla/ops/lightning_attn/fused_recurrent.py +75 -0
  26. fla/ops/linear_attn/__init__.py +11 -0
  27. fla/ops/linear_attn/__pycache__/__init__.cpython-311.pyc +0 -0
  28. fla/ops/linear_attn/__pycache__/chunk.cpython-311.pyc +0 -0
  29. fla/ops/linear_attn/__pycache__/fused_chunk.cpython-311.pyc +0 -0
  30. fla/ops/linear_attn/__pycache__/utils.cpython-311.pyc +0 -0
  31. fla/ops/linear_attn/fused_chunk.py +318 -0
  32. fla/ops/nsa/__init__.py +9 -0
  33. fla/ops/nsa/__pycache__/naive.cpython-311.pyc +0 -0
  34. fla/ops/nsa/__pycache__/parallel.cpython-311.pyc +0 -0
  35. fla/ops/nsa/__pycache__/utils.cpython-311.pyc +0 -0
  36. fla/ops/nsa/naive.py +94 -0
  37. fla/ops/rebased/__pycache__/__init__.cpython-311.pyc +0 -0
  38. fla/ops/rebased/parallel.py +466 -0
  39. fla/ops/retention/__init__.py +13 -0
  40. fla/ops/retention/__pycache__/chunk.cpython-311.pyc +0 -0
  41. fla/ops/retention/__pycache__/parallel.cpython-311.pyc +0 -0
  42. fla/ops/retention/chunk.py +72 -0
  43. fla/ops/retention/fused_recurrent.py +42 -0
  44. fla/ops/retention/naive.py +15 -0
  45. fla/ops/rwkv4/__init__.py +7 -0
  46. fla/ops/rwkv4/fused_recurrent.py +476 -0
  47. fla/ops/rwkv6/__init__.py +9 -0
  48. fla/ops/rwkv6/__pycache__/__init__.cpython-311.pyc +0 -0
  49. fla/ops/rwkv6/__pycache__/chunk.cpython-311.pyc +0 -0
  50. fla/ops/rwkv6/__pycache__/fused_recurrent.cpython-311.pyc +0 -0
fla/__init__.py ADDED
@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-

from fla.layers import (
    ABCAttention,
    Attention,
    BasedLinearAttention,
    BitAttention,
    DeltaNet,
    GatedDeltaNet,
    GatedDeltaProduct,
    GatedLinearAttention,
    GatedSlotAttention,
    HGRN2Attention,
    HGRNAttention,
    LightNetAttention,
    LinearAttention,
    MultiScaleRetention,
    NativeSparseAttention,
    ReBasedLinearAttention,
    RWKV6Attention,
    RWKV7Attention
)
from fla.models import (
    ABCForCausalLM,
    ABCModel,
    BitNetForCausalLM,
    BitNetModel,
    DeltaNetForCausalLM,
    DeltaNetModel,
    GatedDeltaNetForCausalLM,
    GatedDeltaNetModel,
    GatedDeltaProductForCausalLM,
    GatedDeltaProductModel,
    GLAForCausalLM,
    GLAModel,
    GSAForCausalLM,
    GSAModel,
    HGRN2ForCausalLM,
    HGRN2Model,
    HGRNForCausalLM,
    LightNetForCausalLM,
    LightNetModel,
    LinearAttentionForCausalLM,
    LinearAttentionModel,
    NSAForCausalLM,
    NSAModel,
    RetNetForCausalLM,
    RetNetModel,
    RWKV6ForCausalLM,
    RWKV6Model,
    RWKV7ForCausalLM,
    RWKV7Model,
    TransformerForCausalLM,
    TransformerModel
)

__all__ = [
    'ABCAttention',
    'Attention',
    'BasedLinearAttention',
    'BitAttention',
    'DeltaNet',
    'GatedDeltaNet',
    'GatedDeltaProduct',
    'GatedLinearAttention',
    'GatedSlotAttention',
    'HGRNAttention',
    'HGRN2Attention',
    'LightNetAttention',
    'LinearAttention',
    'MultiScaleRetention',
    'NativeSparseAttention',
    'ReBasedLinearAttention',
    'RWKV6Attention',
    'RWKV7Attention',
    'ABCForCausalLM',
    'ABCModel',
    'BitNetForCausalLM',
    'BitNetModel',
    'DeltaNetForCausalLM',
    'DeltaNetModel',
    'GatedDeltaNetForCausalLM',
    'GatedDeltaNetModel',
    'GatedDeltaProductForCausalLM',
    'GatedDeltaProductModel',
    'GLAForCausalLM',
    'GLAModel',
    'GSAForCausalLM',
    'GSAModel',
    'HGRNForCausalLM',
    'HGRNModel',
    'HGRN2ForCausalLM',
    'HGRN2Model',
    'LightNetForCausalLM',
    'LightNetModel',
    'LinearAttentionForCausalLM',
    'LinearAttentionModel',
    'NSAForCausalLM',
    'NSAModel',
    'RetNetForCausalLM',
    'RetNetModel',
    'RWKV6ForCausalLM',
    'RWKV6Model',
    'RWKV7ForCausalLM',
    'RWKV7Model',
    'TransformerForCausalLM',
    'TransformerModel',
]

__version__ = '0.1.2'
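For orientation, a minimal usage sketch of the top-level re-exports added above (an assumption for illustration: the package is installed and importable as `fla`; all names are taken from the `__init__.py` in this commit):

# Minimal sketch: the package root re-exports layer and model classes.
import fla

print(fla.__version__)  # '0.1.2' per the __init__.py above

from fla import GatedLinearAttention, RWKV7Attention
from fla.layers import GatedLinearAttention as GLAFromLayers

# Both import paths refer to the same class object.
assert GatedLinearAttention is GLAFromLayers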
fla/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.33 kB).
 
fla/__pycache__/utils.cpython-311.pyc ADDED
Binary file (13.8 kB).
 
fla/ops/__init__.py ADDED
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-

from .abc import chunk_abc
from .attn import parallel_attn
from .based import fused_chunk_based, parallel_based
from .delta_rule import chunk_delta_rule, fused_chunk_delta_rule, fused_recurrent_delta_rule
from .forgetting_attn import parallel_forgetting_attn
from .gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
from .generalized_delta_rule import (
    chunk_dplr_delta_rule,
    chunk_iplr_delta_rule,
    fused_recurrent_dplr_delta_rule,
    fused_recurrent_iplr_delta_rule
)
from .gla import chunk_gla, fused_chunk_gla, fused_recurrent_gla
from .gsa import chunk_gsa, fused_recurrent_gsa
from .hgrn import fused_recurrent_hgrn
from .lightning_attn import chunk_lightning_attn, fused_recurrent_lightning_attn
from .linear_attn import chunk_linear_attn, fused_chunk_linear_attn, fused_recurrent_linear_attn
from .nsa import parallel_nsa
from .retention import chunk_retention, fused_chunk_retention, fused_recurrent_retention, parallel_retention
from .rwkv6 import chunk_rwkv6, fused_recurrent_rwkv6
from .rwkv7 import chunk_rwkv7, fused_recurrent_rwkv7
from .simple_gla import chunk_simple_gla, fused_recurrent_simple_gla, parallel_simple_gla

__all__ = [
    'chunk_abc',
    'parallel_attn',
    'fused_chunk_based', 'parallel_based',
    'chunk_delta_rule', 'fused_chunk_delta_rule', 'fused_recurrent_delta_rule',
    'parallel_forgetting_attn',
    'chunk_gated_delta_rule', 'fused_recurrent_gated_delta_rule',
    'chunk_dplr_delta_rule', 'chunk_iplr_delta_rule',
    'fused_recurrent_dplr_delta_rule', 'fused_recurrent_iplr_delta_rule',
    'chunk_gla', 'fused_chunk_gla', 'fused_recurrent_gla',
    'chunk_gsa', 'fused_recurrent_gsa',
    'fused_recurrent_hgrn',
    'chunk_lightning_attn', 'fused_recurrent_lightning_attn',
    'chunk_linear_attn', 'fused_chunk_linear_attn', 'fused_recurrent_linear_attn',
    'parallel_nsa',
    'chunk_retention', 'fused_chunk_retention', 'fused_recurrent_retention', 'parallel_retention',
    'chunk_rwkv6', 'fused_recurrent_rwkv6',
    'chunk_rwkv7', 'fused_recurrent_rwkv7',
    'chunk_simple_gla', 'fused_recurrent_simple_gla', 'parallel_simple_gla',
]
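As a quick illustration, a sketch of calling one of the kernels re-exported above; the signature and shape conventions follow the `fused_recurrent_gla` docstring included later in this commit (assumptions: a CUDA device is available and the sizes are illustrative only):

# Sketch: invoking a kernel re-exported by fla.ops (illustrative sizes, CUDA assumed).
import torch
import torch.nn.functional as F

from fla.ops import fused_recurrent_gla

B, T, H, K, V = 2, 128, 4, 64, 64
q = torch.randn(B, T, H, K, device='cuda')
k = torch.randn(B, T, H, K, device='cuda')
v = torch.randn(B, T, H, V, device='cuda')
# log-sigmoid keeps the forget gates negative, as in the docstring examples
gk = F.logsigmoid(torch.randn(B, T, H, K, device='cuda'))

o, ht = fused_recurrent_gla(q, k, v, gk, output_final_state=True, head_first=False)
print(o.shape, ht.shape)  # [B, T, H, V] and [B, H, K, V]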
fla/ops/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.29 kB).
 
fla/ops/generalized_delta_rule/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (448 Bytes).
 
fla/ops/gla/__init__.py ADDED
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

from .chunk import chunk_gla
from .fused_chunk import fused_chunk_gla
from .fused_recurrent import fused_recurrent_gla

__all__ = [
    'chunk_gla',
    'fused_chunk_gla',
    'fused_recurrent_gla'
]
fla/ops/gla/fused_recurrent.py ADDED
@@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2024, Songlin Yang, Yu Zhang

from typing import Optional, Tuple

import torch

from fla.ops.common.fused_recurrent import fused_recurrent


def fused_recurrent_gla(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    gk: Optional[torch.Tensor] = None,
    gv: Optional[torch.Tensor] = None,
    scale: Optional[int] = None,
    initial_state: Optional[torch.Tensor] = None,
    output_final_state: bool = False,
    reverse: bool = False,
    cu_seqlens: Optional[torch.LongTensor] = None,
    head_first: bool = True
) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""
    Args:
        q (torch.Tensor):
            queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
        k (torch.Tensor):
            keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
        v (torch.Tensor):
            values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
        gk (torch.Tensor):
            Forget gates of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]` applied to keys.
        gv (torch.Tensor):
            Forget gates of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]` applied to values.
        scale (Optional[int]):
            Scale factor for the attention scores.
            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
        initial_state (Optional[torch.Tensor]):
            Initial state of shape `[N, H, K, V]` for `N` input sequences.
            For equal-length input sequences, `N` equals the batch size `B`.
            Default: `None`.
        output_final_state (Optional[bool]):
            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
        reverse (Optional[bool]):
            If `True`, process the state passing in reverse order. Default: `False`.
        cu_seqlens (torch.LongTensor):
            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
            consistent with the FlashAttention API.
        head_first (Optional[bool]):
            Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
            Default: `True`.

    Returns:
        o (torch.Tensor):
            Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
        final_state (torch.Tensor):
            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.

    Examples::
        >>> import torch
        >>> import torch.nn.functional as F
        >>> from einops import rearrange
        >>> from fla.ops.gla import fused_recurrent_gla
        # inputs with equal lengths
        >>> B, T, H, K, V = 4, 2048, 4, 512, 512
        >>> q = torch.randn(B, T, H, K, device='cuda')
        >>> k = torch.randn(B, T, H, K, device='cuda')
        >>> v = torch.randn(B, T, H, V, device='cuda')
        >>> g = F.logsigmoid(torch.randn(B, T, H, K, device='cuda'))
        >>> h0 = torch.randn(B, H, K, V, device='cuda')
        >>> o, ht = fused_recurrent_gla(q, k, v, g,
                                        initial_state=h0,
                                        output_final_state=True,
                                        head_first=False)
        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
        >>> q, k, v, g = map(lambda x: rearrange(x, 'b t h d -> 1 (b t) h d'), (q, k, v, g))
        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
        >>> o_var, ht_var = fused_recurrent_gla(q, k, v, g,
                                                initial_state=h0,
                                                output_final_state=True,
                                                cu_seqlens=cu_seqlens,
                                                head_first=False)
        >>> assert o.allclose(o_var.view(o.shape))
        >>> assert ht.allclose(ht_var)
    """
    if cu_seqlens is not None:
        if q.shape[0] != 1:
            raise ValueError(f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
                             f"Please flatten variable-length inputs before processing.")
        if head_first:
            raise RuntimeError("Sequences with variable lengths are not supported for head-first mode")
        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
            raise ValueError(f"The number of initial states is expected to be equal to the number of input sequences, "
                             f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}.")
    if scale is None:
        scale = k.shape[-1] ** -0.5
    o, final_state = fused_recurrent(
        q=q,
        k=k,
        v=v,
        g=None,
        gk=gk,
        gv=gv,
        scale=scale,
        initial_state=initial_state,
        output_final_state=output_final_state,
        reverse=reverse,
        cu_seqlens=cu_seqlens,
        head_first=head_first
    )
    return o, final_state
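The docstring above only touches on the variable-length path; the sketch below spells out how `cu_seqlens` is typically built from per-sequence lengths before the call (the lengths and head/feature sizes here are hypothetical, and a CUDA device is assumed):

# Sketch: variable-length inputs for fused_recurrent_gla (hypothetical sizes).
import torch
import torch.nn.functional as F

from fla.ops.gla import fused_recurrent_gla

lengths = [5, 7, 4]          # three sequences of different lengths
T = sum(lengths)             # all tokens packed into a single batch of size 1
H, K, V = 4, 64, 64

# cu_seqlens holds the cumulative boundaries [0, 5, 12, 16], shape [N + 1]
cu = [0]
for n in lengths:
    cu.append(cu[-1] + n)
cu_seqlens = torch.tensor(cu, dtype=torch.long, device='cuda')

q = torch.randn(1, T, H, K, device='cuda')
k = torch.randn(1, T, H, K, device='cuda')
v = torch.randn(1, T, H, V, device='cuda')
gk = F.logsigmoid(torch.randn(1, T, H, K, device='cuda'))

# head_first must be False for variable-length inputs, per the checks above
o, ht = fused_recurrent_gla(q, k, v, gk,
                            cu_seqlens=cu_seqlens,
                            output_final_state=True,
                            head_first=False)
print(ht.shape)  # [len(lengths), H, K, V]: one final state per packed sequence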
fla/ops/gsa/__init__.py ADDED
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-

from .chunk import chunk_gsa
from .fused_recurrent import fused_recurrent_gsa

__all__ = [
    'chunk_gsa',
    'fused_recurrent_gsa'
]
fla/ops/gsa/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (319 Bytes).
 
fla/ops/gsa/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (70.3 kB).
 
fla/ops/gsa/__pycache__/fused_recurrent.cpython-311.pyc ADDED
Binary file (26.6 kB).
 
fla/ops/gsa/chunk.py ADDED
@@ -0,0 +1,1264 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+ from einops import reduce
10
+
11
+ from fla.ops.common.chunk_h import chunk_bwd_dh, chunk_fwd_h
12
+ from fla.ops.gla.chunk import chunk_gla_bwd, chunk_gla_fwd
13
+ from fla.ops.utils import chunk_local_cumsum, softmax_bwd, softmax_fwd
14
+ from fla.ops.utils.op import exp, safe_exp
15
+ from fla.utils import input_guard
16
+
17
+
18
+ @triton.heuristics({
19
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
20
+ })
21
+ @triton.autotune(
22
+ configs=[
23
+ triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
24
+ for BK in [32, 64]
25
+ for BV in [32, 64]
26
+ for num_warps in [2, 4, 8]
27
+ for num_stages in [2, 3, 4]
28
+ ],
29
+ key=['BT']
30
+ )
31
+ @triton.jit(do_not_specialize=['T'])
32
+ def chunk_gsa_fwd_k_kernel_inter(
33
+ q,
34
+ k,
35
+ h,
36
+ g,
37
+ o,
38
+ A,
39
+ offsets,
40
+ indices,
41
+ scale,
42
+ T,
43
+ HQ: tl.constexpr,
44
+ H: tl.constexpr,
45
+ K: tl.constexpr,
46
+ V: tl.constexpr,
47
+ BT: tl.constexpr,
48
+ BK: tl.constexpr,
49
+ BV: tl.constexpr,
50
+ NG: tl.constexpr,
51
+ USE_OFFSETS: tl.constexpr,
52
+ HEAD_FIRST: tl.constexpr
53
+ ):
54
+ i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
55
+ i_bg = i_bh // NG
56
+ i_b, i_hq = i_bh // HQ, i_bh % HQ
57
+ i_h = i_hq // NG
58
+ if USE_OFFSETS:
59
+ i_tg = i_t
60
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
61
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
62
+ T = eos - bos
63
+ NT = tl.cdiv(T, BT)
64
+ else:
65
+ NT = tl.cdiv(T, BT)
66
+ i_tg = i_b * NT + i_t
67
+ bos, eos = i_b * T, i_b * T + T
68
+
69
+ o_i = tl.arange(0, BT)
70
+ m_s = o_i[:, None] >= o_i[None, :]
71
+
72
+ b_o = tl.zeros([BT, BV], dtype=tl.float32)
73
+ b_A = tl.zeros([BT, BT], dtype=tl.float32)
74
+ for i_k in range(tl.cdiv(K, BK)):
75
+ if HEAD_FIRST:
76
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
77
+ p_k = tl.make_block_ptr(k + i_bg * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
78
+ p_h = tl.make_block_ptr(h + (i_bg * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
79
+ else:
80
+ p_q = tl.make_block_ptr(q + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
81
+ p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
82
+ p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
83
+
84
+ # [BT, BK]
85
+ b_q = tl.load(p_q, boundary_check=(0, 1))
86
+ b_q = (b_q * scale).to(b_q.dtype)
87
+ # [BK, BT]
88
+ b_k = tl.load(p_k, boundary_check=(0, 1))
89
+ # [BK, BV]
90
+ b_h = tl.load(p_h, boundary_check=(0, 1))
91
+ # [BT, BV]
92
+ b_o += tl.dot(b_q, b_h)
93
+ # [BT, BT]
94
+ b_A += tl.dot(b_q, b_k)
95
+ if HEAD_FIRST:
96
+ p_g = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
97
+ p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
98
+ p_A = tl.make_block_ptr(A + i_bh * T*BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
99
+ else:
100
+ p_g = tl.make_block_ptr(g + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
101
+ p_o = tl.make_block_ptr(o + (bos * HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
102
+ p_A = tl.make_block_ptr(A + (bos * HQ + i_hq) * BT, (T, BT), (HQ*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
103
+ # [BT, BV]
104
+ b_g = tl.load(p_g, boundary_check=(0, 1))
105
+ b_o = b_o * exp(b_g)
106
+ tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
107
+
108
+ # [BT, BT]
109
+ b_A = tl.where(m_s, b_A, 0.)
110
+ if i_v == 0:
111
+ tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
112
+
113
+
114
+ @triton.heuristics({
115
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
116
+ })
117
+ @triton.jit(do_not_specialize=['T'])
118
+ def chunk_gsa_fwd_k_kernel_intra(
119
+ v,
120
+ g,
121
+ o,
122
+ A,
123
+ offsets,
124
+ indices,
125
+ T,
126
+ HQ: tl.constexpr,
127
+ H: tl.constexpr,
128
+ V: tl.constexpr,
129
+ BT: tl.constexpr,
130
+ BC: tl.constexpr,
131
+ BV: tl.constexpr,
132
+ NC: tl.constexpr,
133
+ NG: tl.constexpr,
134
+ USE_OFFSETS: tl.constexpr,
135
+ HEAD_FIRST: tl.constexpr
136
+ ):
137
+ i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
138
+ i_bg = i_bh // NG
139
+ i_b, i_hq = i_bh // HQ, i_bh % HQ
140
+ i_h = i_hq // NG
141
+ i_t, i_i = i_c // NC, i_c % NC
142
+ if USE_OFFSETS:
143
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
144
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
145
+ T = eos - bos
146
+ else:
147
+ bos, eos = i_b * T, i_b * T + T
148
+
149
+ o_v = i_v * BV + tl.arange(0, BV)
150
+ m_v = o_v < V
151
+
152
+ if i_t * BT + i_i * BC > T:
153
+ return
154
+
155
+ if HEAD_FIRST:
156
+ p_g = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
157
+ p_gn = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + min(i_t * BT + i_i * BC, T) * V + o_v, BV), BV)
158
+ else:
159
+ p_g = tl.make_block_ptr(g + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
160
+ p_gn = g + (bos + min(i_t * BT + i_i * BC, T)) * H*V + i_h * V + o_v
161
+ # [BV,]
162
+ b_gn = tl.load(p_gn, mask=m_v, other=0)
163
+ # [BC, BV]
164
+ b_o = tl.zeros([BC, BV], dtype=tl.float32)
165
+ for i_j in range(0, i_i):
166
+ if HEAD_FIRST:
167
+ p_A = tl.make_block_ptr(A + i_bh * T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
168
+ p_v = tl.make_block_ptr(v + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
169
+ p_gv = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
170
+ else:
171
+ p_A = tl.make_block_ptr(A + (bos*HQ+i_hq) * BT, (T, BT), (HQ*BT, 1), (i_t*BT+i_i*BC, i_j * BC), (BC, BC), (1, 0))
172
+ p_v = tl.make_block_ptr(v + (bos*H+i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
173
+ p_gv = tl.make_block_ptr(g + (bos*H+i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
174
+ # [BC, BV]
175
+ b_v = tl.load(p_v, boundary_check=(0, 1))
176
+ b_gv = tl.load(p_gv, boundary_check=(0, 1))
177
+ b_vg = (b_v * exp(b_gn[None, :] - b_gv)).to(b_v.dtype)
178
+ # [BC, BC]
179
+ b_A = tl.load(p_A, boundary_check=(0, 1))
180
+ b_o += tl.dot(b_A, b_vg)
181
+ # [BC, BV]
182
+ b_g = tl.load(p_g, boundary_check=(0, 1))
183
+ b_o *= exp(b_g - b_gn[None, :])
184
+
185
+ o_i = tl.arange(0, BC)
186
+ if HEAD_FIRST:
187
+ o_A = i_bh * T*BT + (i_t * BT + i_i * BC + tl.arange(0, BC)) * BT + i_i * BC
188
+ else:
189
+ o_A = (bos + i_t * BT + i_i * BC + tl.arange(0, BC)) * HQ*BT + i_hq * BT + i_i * BC
190
+ m_A = (i_t * BT + i_i * BC + tl.arange(0, BC)) < T
191
+ for j in range(0, min(BC, T - i_t * BT - i_i * BC)):
192
+ if HEAD_FIRST:
193
+ p_v = tl.max_contiguous(tl.multiple_of(v + i_bg * T*V + (i_t * BT + i_i * BC + j) * V + o_v, BV), BV)
194
+ p_gv = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + (i_t * BT + i_i * BC + j) * V + o_v, BV), BV)
195
+ else:
196
+ p_v = v + (bos + i_t * BT + i_i * BC + j) * H*V + i_h * V + o_v
197
+ p_gv = g + (bos + i_t * BT + i_i * BC + j) * H*V + i_h * V + o_v
198
+ # [BC,]
199
+ b_A = tl.load(A + o_A + j, mask=m_A, other=0)
200
+ # [BV,]
201
+ b_v = tl.load(p_v, mask=m_v, other=0).to(tl.float32)
202
+ b_gv = tl.load(p_gv, mask=m_v, other=0).to(tl.float32)
203
+ # [BC, BV]
204
+ b_vg = b_v[None, :] * exp(b_g - b_gv[None, :])
205
+ # avoid 0 * inf = inf
206
+ b_o += tl.where(o_i[:, None] >= j, b_A[:, None] * b_vg, 0.)
207
+ if HEAD_FIRST:
208
+ p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
209
+ else:
210
+ p_o = tl.make_block_ptr(o + (bos*HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
211
+ b_o += tl.load(p_o, boundary_check=(0, 1))
212
+ tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
213
+
214
+
215
+ @triton.heuristics({
216
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
217
+ })
218
+ @triton.autotune(
219
+ configs=[
220
+ triton.Config({}, num_warps=num_warps)
221
+ for num_warps in [2, 4, 8]
222
+ ],
223
+ key=["BT"]
224
+ )
225
+ @triton.jit(do_not_specialize=['T'])
226
+ def chunk_gsa_bwd_k_kernel_dA(
227
+ v,
228
+ g,
229
+ do,
230
+ dA,
231
+ indices,
232
+ offsets,
233
+ scale,
234
+ T,
235
+ B: tl.constexpr,
236
+ HQ: tl.constexpr,
237
+ H: tl.constexpr,
238
+ V: tl.constexpr,
239
+ BT: tl.constexpr,
240
+ BC: tl.constexpr,
241
+ BV: tl.constexpr,
242
+ NC: tl.constexpr,
243
+ NG: tl.constexpr,
244
+ USE_OFFSETS: tl.constexpr,
245
+ HEAD_FIRST: tl.constexpr
246
+ ):
247
+ i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
248
+ i_bg = i_bh // NG
249
+ i_b, i_hq = i_bh // HQ, i_bh % HQ
250
+ i_h = i_hq // NG
251
+ i_t, i_i, i_j = i_c // (NC * NC), (i_c % (NC * NC)) // NC, (i_c % (NC * NC)) % NC
252
+ if USE_OFFSETS:
253
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
254
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
255
+ all = T
256
+ T = eos - bos
257
+ else:
258
+ bos, eos = i_b * T, i_b * T + T
259
+ all = B * T
260
+
261
+ o_v = i_v * BV + tl.arange(0, BV)
262
+ m_v = o_v < V
263
+
264
+ if i_t * BT + i_i * BC > T:
265
+ return
266
+
267
+ if HEAD_FIRST:
268
+ p_dA = tl.make_block_ptr(dA+(i_v*B*H+i_bh)*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
269
+ else:
270
+ p_dA = tl.make_block_ptr(dA+((i_v*all+bos)*HQ+i_hq)*BT, (T, BT), (HQ*BT, 1), (i_t*BT+i_i*BC, i_j*BC), (BC, BC), (1, 0))
271
+
272
+ # [BC, BC]
273
+ b_dA = tl.zeros([BC, BC], dtype=tl.float32)
274
+ if i_i > i_j:
275
+ if HEAD_FIRST:
276
+ p_v = tl.make_block_ptr(v + i_bg * T*V, (V, T), (1, V), (i_v * BV, i_t * BT + i_j * BC), (BV, BC), (0, 1))
277
+ p_gv = tl.make_block_ptr(g + i_bg * T*V, (V, T), (1, V), (i_v * BV, i_t * BT + i_j * BC), (BV, BC), (0, 1))
278
+ p_gn = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + (i_t * BT + i_i * BC) * V + o_v, BV), BV)
279
+ p_g = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
280
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
281
+ else:
282
+ p_v = tl.make_block_ptr(v + (bos*H+i_h) * V, (V, T), (1, H*V), (i_v * BV, i_t*BT + i_j*BC), (BV, BC), (0, 1))
283
+ p_gv = tl.make_block_ptr(g + (bos*H+i_h) * V, (V, T), (1, H*V), (i_v * BV, i_t*BT + i_j*BC), (BV, BC), (0, 1))
284
+ p_gn = g + (bos + i_t*BT + i_i*BC) * H*V + i_h * V + o_v
285
+ p_g = tl.make_block_ptr(g + (bos*H+i_h) * V, (T, V), (H*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
286
+ p_do = tl.make_block_ptr(do + (bos*HQ+i_hq) * V, (T, V), (HQ*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
287
+ # [BV,]
288
+ b_gn = tl.load(p_gn, mask=m_v, other=0.)
289
+ # [BC, BV]
290
+ b_g = tl.load(p_g, boundary_check=(0, 1))
291
+ b_do = tl.load(p_do, boundary_check=(0, 1))
292
+ b_do = (b_do * exp(b_g - b_gn[None, :]) * scale).to(b_do.dtype)
293
+ # [BV, BC]
294
+ b_v = tl.load(p_v, boundary_check=(0, 1))
295
+ b_gv = tl.load(p_gv, boundary_check=(0, 1))
296
+ b_vg = (b_v * exp(b_gn[:, None] - b_gv)).to(b_v.dtype)
297
+ # [BC, BC]
298
+ b_dA = tl.dot(b_do, b_vg)
299
+ elif i_i == i_j:
300
+ if HEAD_FIRST:
301
+ p_g = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
302
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
303
+ p_v = tl.max_contiguous(tl.multiple_of(v + i_bg * T*V + (i_t * BT + i_j * BC) * V + o_v, BV), BV)
304
+ p_gv = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + (i_t * BT + i_j * BC) * V + o_v, BV), BV)
305
+ else:
306
+ p_g = tl.make_block_ptr(g + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
307
+ p_do = tl.make_block_ptr(do + (bos*HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
308
+ p_v = v + (bos + i_t*BT + i_j*BC) * H*V + i_h * V + o_v
309
+ p_gv = g + (bos + i_t*BT + i_j*BC) * H*V + i_h * V + o_v
310
+ # [BC, BV]
311
+ b_g = tl.load(p_g, boundary_check=(0, 1))
312
+ b_do = tl.load(p_do, boundary_check=(0, 1)) * scale
313
+ m_v = o_v < V
314
+
315
+ o_i = tl.arange(0, BC)
316
+ # [BC, BC]
317
+ m_dA = o_i[:, None] >= o_i[None, :]
318
+ for j in range(0, min(BC, T - i_t * BT - i_j * BC)):
319
+ # [BV,]
320
+ b_v = tl.load(p_v, mask=m_v, other=0).to(tl.float32)
321
+ b_gv = tl.load(p_gv, mask=m_v, other=0).to(tl.float32)
322
+ # [BC,]
323
+ b_dAj = tl.sum(b_do * b_v[None, :] * exp(b_g - b_gv[None, :]), 1)
324
+ b_dA = tl.where((o_i == j)[None, :], b_dAj[:, None], b_dA)
325
+
326
+ p_v += (1 if HEAD_FIRST else H) * V
327
+ p_gv += (1 if HEAD_FIRST else H) * V
328
+ b_dA = tl.where(m_dA, b_dA, 0.)
329
+ tl.store(p_dA, b_dA.to(dA.dtype.element_ty), boundary_check=(0, 1))
330
+
331
+
332
+ @triton.heuristics({
333
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
334
+ })
335
+ @triton.autotune(
336
+ configs=[
337
+ triton.Config({}, num_warps=num_warps, num_stages=num_stages)
338
+ for num_warps in [2, 4]
339
+ for num_stages in [2, 3, 4]
340
+ ],
341
+ key=['BT']
342
+ )
343
+ @triton.jit(do_not_specialize=['T'])
344
+ def chunk_gsa_bwd_k_kernel_dqkvg(
345
+ q,
346
+ k,
347
+ v,
348
+ h,
349
+ g,
350
+ A,
351
+ do,
352
+ dh,
353
+ dq,
354
+ dk,
355
+ dv,
356
+ dg,
357
+ dgv,
358
+ dA,
359
+ offsets,
360
+ indices,
361
+ scale,
362
+ T,
363
+ B: tl.constexpr,
364
+ HQ: tl.constexpr,
365
+ H: tl.constexpr,
366
+ K: tl.constexpr,
367
+ V: tl.constexpr,
368
+ BT: tl.constexpr,
369
+ BK: tl.constexpr,
370
+ BV: tl.constexpr,
371
+ NG: tl.constexpr,
372
+ USE_OFFSETS: tl.constexpr,
373
+ HEAD_FIRST: tl.constexpr
374
+ ):
375
+ i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
376
+ i_bg = i_bh // NG
377
+ i_b, i_hq = i_bh // HQ, i_bh % HQ
378
+ i_h = i_hq // NG
379
+ if USE_OFFSETS:
380
+ i_tg = i_t
381
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
382
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
383
+ all = T
384
+ T = eos - bos
385
+ NT = tl.cdiv(T, BT)
386
+ else:
387
+ NT = tl.cdiv(T, BT)
388
+ i_tg = i_b * NT + i_t
389
+ bos, eos = i_b * T, i_b * T + T
390
+ all = B * T
391
+
392
+ o_i = tl.arange(0, BT)
393
+ o_t = min(i_t * BT + BT, T)
394
+ m_s = o_i[:, None] >= o_i[None, :]
395
+
396
+ if HEAD_FIRST:
397
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
398
+ p_k = tl.make_block_ptr(k + i_bg * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
399
+ p_A = tl.make_block_ptr(A + (i_k*B*H+i_bh) * T*BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
400
+ else:
401
+ p_q = tl.make_block_ptr(q + (bos*HQ+i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
402
+ p_k = tl.make_block_ptr(k + (bos*H+i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
403
+ p_A = tl.make_block_ptr(A + ((i_k*all+bos)*HQ+i_hq)*BT, (T, BT), (HQ*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
404
+
405
+ # [BT, BK]
406
+ b_q = tl.load(p_q, boundary_check=(0, 1))
407
+ b_k = tl.load(p_k, boundary_check=(0, 1))
408
+ # [BT, BT]
409
+ b_A = tl.dot((b_q * scale).to(b_q.dtype), tl.trans(b_k))
410
+ b_A = tl.where(m_s, b_A, 0.)
411
+ tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
412
+
413
+ b_dq = tl.zeros([BT, BK], dtype=tl.float32)
414
+ b_dk = tl.zeros([BT, BK], dtype=tl.float32)
415
+ for i_v in range(tl.cdiv(V, BV)):
416
+ o_v = i_v * BV + tl.arange(0, BV)
417
+ if HEAD_FIRST:
418
+ p_v = tl.make_block_ptr(v + i_bg * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
419
+ p_g = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
420
+ p_gn = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + (o_t - 1) * V + o_v, BV), BV)
421
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
422
+ p_dv = tl.make_block_ptr(dv + (i_k*B*H+i_bh) * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
423
+ p_dg = tl.make_block_ptr(dg + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
424
+ p_dgv = tl.make_block_ptr(dgv + (i_k*B*H+i_bh) * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
425
+ p_h = tl.make_block_ptr(h + i_bg * NT*K*V + i_t * K*V, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1))
426
+ p_dh = tl.make_block_ptr(dh + i_bh * NT*K*V + i_t * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
427
+ else:
428
+ p_v = tl.make_block_ptr(v + (bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
429
+ p_g = tl.make_block_ptr(g + (bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
430
+ p_gn = g + (bos + o_t - 1) * H*V + i_h * V + o_v
431
+ p_do = tl.make_block_ptr(do + (bos*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
432
+ p_dv = tl.make_block_ptr(dv + ((i_k*all+bos)*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
433
+ p_dg = tl.make_block_ptr(dg + (bos*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
434
+ p_dgv = tl.make_block_ptr(dgv+((i_k*all+bos)*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
435
+ p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K*V, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1))
436
+ p_dh = tl.make_block_ptr(dh + (i_tg * HQ + i_hq) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
437
+ m_v = o_v < V
438
+
439
+ # [BV,]
440
+ b_gn = tl.load(p_gn, mask=m_v, other=0)
441
+ # [BT, BV]
442
+ b_v = tl.load(p_v, boundary_check=(0, 1))
443
+ b_g = tl.load(p_g, boundary_check=(0, 1))
444
+ b_gv = exp(b_gn[None, :] - b_g)
445
+ # [BV, BK]
446
+ b_h = tl.load(p_h, boundary_check=(0, 1))
447
+ # [BT, BV]
448
+ b_do = tl.load(p_do, boundary_check=(0, 1))
449
+ b_do = (b_do * exp(b_g) * scale).to(b_do.dtype)
450
+ # [BK, BV]
451
+ b_dh = tl.load(p_dh, boundary_check=(0, 1))
452
+ # [BV]
453
+ b_dg = tl.sum(tl.trans(b_h) * b_dh, 0) * exp(b_gn)
454
+
455
+ b_dh = b_dh.to(b_k.dtype)
456
+ # [BT, BK]
457
+ b_dq += tl.dot(b_do, b_h.to(b_k.dtype))
458
+ b_dk += tl.dot((b_v * b_gv).to(b_v.dtype), tl.trans(b_dh))
459
+ # [BT, BV]
460
+ b_dv = tl.dot(b_k, b_dh) * b_gv
461
+ # [BV]
462
+ b_dg += tl.sum(b_dv * b_v, 0)
463
+
464
+ if i_k == 0:
465
+ b_dgv = tl.load(p_dg, boundary_check=(0, 1)) + b_dg[None, :]
466
+ else:
467
+ b_dgv = tl.zeros([BT, BV], dtype=tl.float32) + b_dg[None, :]
468
+
469
+ tl.store(p_dgv, b_dgv.to(p_dgv.dtype.element_ty), boundary_check=(0, 1))
470
+ tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
471
+ if HEAD_FIRST:
472
+ p_dA = tl.make_block_ptr(dA + i_bh * T*BT, (T, BT, ), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
473
+ p_dq = tl.make_block_ptr(dq + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
474
+ p_dk = tl.make_block_ptr(dk + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
475
+ else:
476
+ p_dA = tl.make_block_ptr(dA + (bos*HQ + i_hq) * BT, (T, BT), (HQ*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
477
+ p_dq = tl.make_block_ptr(dq + (bos*HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
478
+ p_dk = tl.make_block_ptr(dk + (bos*HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
479
+ # [BT, BT]
480
+ b_dA = tl.load(p_dA, boundary_check=(0, 1))
481
+ # [BT, BK]
482
+ b_dq += tl.dot(b_dA, b_k)
483
+ b_dk += tl.dot(tl.trans(b_dA).to(b_k.dtype), b_q)
484
+
485
+ tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1))
486
+ tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
487
+
488
+
489
+ @triton.heuristics({
490
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
491
+ })
492
+ @triton.jit(do_not_specialize=['T'])
493
+ def chunk_gsa_bwd_k_kernel_intra_dvg(
494
+ v,
495
+ g,
496
+ o,
497
+ A,
498
+ do,
499
+ dv,
500
+ dg,
501
+ offsets,
502
+ indices,
503
+ T,
504
+ HQ: tl.constexpr,
505
+ H: tl.constexpr,
506
+ V: tl.constexpr,
507
+ BT: tl.constexpr,
508
+ BC: tl.constexpr,
509
+ BV: tl.constexpr,
510
+ NC: tl.constexpr,
511
+ NG: tl.constexpr,
512
+ USE_OFFSETS: tl.constexpr,
513
+ HEAD_FIRST: tl.constexpr
514
+ ):
515
+ i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
516
+ i_bg = i_bh // NG
517
+ i_b, i_hq = i_bh // HQ, i_bh % HQ
518
+ i_h = i_hq // NG
519
+ i_t, i_i = i_c // NC, i_c % NC
520
+ if USE_OFFSETS:
521
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
522
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
523
+ T = eos - bos
524
+ else:
525
+ bos, eos = i_b * T, i_b * T + T
526
+
527
+ o_v = i_v * BV + tl.arange(0, BV)
528
+ m_v = o_v < V
529
+
530
+ if i_t * BT + i_i * BC > T:
531
+ return
532
+
533
+ if HEAD_FIRST:
534
+ p_gv = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
535
+ p_gn = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + (min(i_t * BT + i_i * BC + BC, T) - 1) * V + o_v, BV), BV)
536
+ else:
537
+ p_gv = tl.make_block_ptr(g + (bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
538
+ p_gn = g + (bos + min(i_t * BT + i_i * BC + BC, T)-1)*H*V + i_h*V + o_v
539
+ # [BV,]
540
+ b_gn = tl.load(p_gn, mask=m_v, other=0)
541
+ # [BC, BV]
542
+ b_gv = tl.load(p_gv, boundary_check=(0, 1))
543
+ b_dv = tl.zeros([BC, BV], dtype=tl.float32)
544
+ for i_j in range(i_i + 1, NC):
545
+ if HEAD_FIRST:
546
+ p_g = tl.make_block_ptr(g + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
547
+ p_A = tl.make_block_ptr(A + i_bh * T*BT, (BT, T), (1, BT), (i_i * BC, i_t * BT + i_j * BC), (BC, BC), (0, 1))
548
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
549
+ else:
550
+ p_g = tl.make_block_ptr(g + (bos*H+i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0))
551
+ p_A = tl.make_block_ptr(A + (bos*HQ+i_hq) * BT, (BT, T), (1, HQ*BT), (i_i*BC, i_t*BT + i_j*BC), (BC, BC), (0, 1))
552
+ p_do = tl.make_block_ptr(do + (bos*HQ+i_hq) * V, (T, V), (HQ*V, 1), (i_t*BT + i_j*BC, i_v*BV), (BC, BV), (1, 0))
553
+ # [BC, BV]
554
+ b_g = tl.load(p_g, boundary_check=(0, 1))
555
+ b_do = tl.load(p_do, boundary_check=(0, 1)) * safe_exp(b_g - b_gn[None, :])
556
+ # [BC, BC]
557
+ b_A = tl.load(p_A, boundary_check=(0, 1))
558
+ # [BC, BV]
559
+ b_dv += tl.dot(b_A, b_do.to(b_A.dtype))
560
+ b_dv *= exp(b_gn[None, :] - b_gv)
561
+
562
+ o_i = tl.arange(0, BC)
563
+ o_c = i_i * BC + tl.arange(0, BC)
564
+
565
+ if HEAD_FIRST:
566
+ p_g = tl.max_contiguous(tl.multiple_of(g + i_bg * T*V + (i_t * BT + i_i * BC) * V + o_v, BV), BV)
567
+ p_A = tl.max_contiguous(tl.multiple_of(A + i_bh * T*BT + (i_t * BT + i_i * BC) * BT + o_c, BC), BC)
568
+ p_do = tl.max_contiguous(tl.multiple_of(do + i_bh * T*V + (i_t * BT + i_i * BC) * V + o_v, BV), BV)
569
+ else:
570
+ p_g = g + (bos + i_t * BT + i_i * BC) * H*V + i_h * V + o_v
571
+ p_A = A + (bos + i_t*BT + i_i*BC) * HQ*BT + i_hq * BT + o_c
572
+ p_do = do + (bos + i_t*BT + i_i*BC) * HQ*V + i_hq * V + o_v
573
+
574
+ for j in range(0, min(BC, T - i_t * BT - i_i * BC)):
575
+ # [BC,]
576
+ b_A = tl.load(p_A)
577
+ # [BV,]
578
+ b_g = tl.load(p_g, mask=m_v, other=0)
579
+ b_do = tl.load(p_do, mask=m_v, other=0)
580
+ # [BC, BV]
581
+ m_i = o_i[:, None] <= j
582
+ b_dv += tl.where(m_i, exp(b_g[None, :] - b_gv) * b_A[:, None] * b_do[None, :], 0.)
583
+
584
+ p_g += (1 if HEAD_FIRST else H) * V
585
+ p_A += (1 if HEAD_FIRST else HQ) * BT
586
+ p_do += (1 if HEAD_FIRST else HQ) * V
587
+ if HEAD_FIRST:
588
+ p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
589
+ p_v = tl.make_block_ptr(v + i_bg * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
590
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
591
+ p_dv = tl.make_block_ptr(dv + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
592
+ p_dg = tl.make_block_ptr(dg + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0))
593
+ else:
594
+ p_o = tl.make_block_ptr(o + (bos*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
595
+ p_v = tl.make_block_ptr(v + (bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
596
+ p_do = tl.make_block_ptr(do + (bos*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
597
+ p_dv = tl.make_block_ptr(dv + (bos*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
598
+ p_dg = tl.make_block_ptr(dg + (bos*HQ+i_hq)*V, (T, V), (HQ*V, 1), (i_t*BT + i_i*BC, i_v*BV), (BC, BV), (1, 0))
599
+
600
+ b_o = tl.load(p_o, boundary_check=(0, 1)).to(tl.float32)
601
+ b_v = tl.load(p_v, boundary_check=(0, 1)).to(tl.float32)
602
+ b_do = tl.load(p_do, boundary_check=(0, 1)).to(tl.float32)
603
+ b_dv = b_dv + tl.load(p_dv, boundary_check=(0, 1)).to(tl.float32)
604
+ b_dg = b_o * b_do - b_v * b_dv
605
+ tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
606
+ tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0, 1))
607
+
608
+
609
+ def chunk_gsa_fwd_v(
610
+ q: torch.Tensor,
611
+ k: torch.Tensor,
612
+ v: torch.Tensor,
613
+ g: torch.Tensor,
614
+ scale: float = 1.,
615
+ initial_state: Optional[torch.Tensor] = None,
616
+ output_final_state: bool = False,
617
+ offsets: Optional[torch.LongTensor] = None,
618
+ indices: Optional[torch.LongTensor] = None,
619
+ head_first: bool = True,
620
+ chunk_size: int = 64
621
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
622
+ _, A, h, ht, o = chunk_gla_fwd(
623
+ q=q,
624
+ k=k,
625
+ v=v,
626
+ g=None,
627
+ g_cumsum=g,
628
+ scale=scale,
629
+ initial_state=initial_state,
630
+ output_final_state=output_final_state,
631
+ offsets=offsets,
632
+ indices=indices,
633
+ head_first=head_first,
634
+ chunk_size=chunk_size
635
+ )
636
+ return A, h, ht, o
637
+
638
+
639
+ def chunk_gsa_fwd_k(
640
+ q: torch.Tensor,
641
+ k: torch.Tensor,
642
+ v: torch.Tensor,
643
+ g: torch.Tensor,
644
+ h0: Optional[torch.Tensor] = None,
645
+ output_final_state: bool = False,
646
+ scale: float = 1.,
647
+ offsets: Optional[torch.LongTensor] = None,
648
+ indices: Optional[torch.LongTensor] = None,
649
+ head_first: bool = True,
650
+ chunk_size: int = 64
651
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
652
+ if head_first:
653
+ B, H, T, K, V = *k.shape, v.shape[-1]
654
+ else:
655
+ B, T, H, K, V = *k.shape, v.shape[-1]
656
+ BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
657
+ BC = min(16, BT)
658
+ BV = min(64, triton.next_power_of_2(V))
659
+ HQ = q.shape[1] if head_first else q.shape[2]
660
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
661
+ NC = triton.cdiv(BT, BC)
662
+ NG = HQ // H
663
+
664
+ h, ht = chunk_fwd_h(
665
+ k=k,
666
+ v=v,
667
+ g=None,
668
+ gk=None,
669
+ gv=g,
670
+ h0=h0,
671
+ output_final_state=output_final_state,
672
+ offsets=offsets,
673
+ head_first=head_first,
674
+ chunk_size=BT,
675
+ states_in_fp32=False
676
+ )
677
+ o = v.new_empty(B, *((HQ, T) if head_first else (T, HQ)), V)
678
+ A = q.new_empty(B, *((HQ, T) if head_first else (T, HQ)), BT)
679
+ def grid(meta): return (triton.cdiv(V, meta['BV']), NT, B * HQ)
680
+ chunk_gsa_fwd_k_kernel_inter[grid](
681
+ q,
682
+ k,
683
+ h,
684
+ g,
685
+ o,
686
+ A,
687
+ offsets=offsets,
688
+ indices=indices,
689
+ scale=scale,
690
+ T=T,
691
+ HQ=HQ,
692
+ H=H,
693
+ K=K,
694
+ V=V,
695
+ BT=BT,
696
+ NG=NG,
697
+ HEAD_FIRST=head_first
698
+ )
699
+
700
+ def grid(meta): return (triton.cdiv(V, meta['BV']), NT * NC, B * HQ)
701
+ chunk_gsa_fwd_k_kernel_intra[grid](
702
+ v,
703
+ g,
704
+ o,
705
+ A,
706
+ offsets=offsets,
707
+ indices=indices,
708
+ T=T,
709
+ HQ=HQ,
710
+ H=H,
711
+ V=V,
712
+ BT=BT,
713
+ BC=BC,
714
+ BV=BV,
715
+ NC=NC,
716
+ NG=NG,
717
+ HEAD_FIRST=head_first,
718
+ num_warps=4,
719
+ num_stages=2
720
+ )
721
+ return A, h, ht, o
722
+
723
+
724
+ def chunk_gsa_bwd_v(
725
+ q: torch.Tensor,
726
+ k: torch.Tensor,
727
+ v: torch.Tensor,
728
+ g: torch.Tensor,
729
+ h0: torch.Tensor,
730
+ h: torch.Tensor,
731
+ A: torch.Tensor,
732
+ do: torch.Tensor,
733
+ dht: torch.Tensor,
734
+ dg: torch.Tensor,
735
+ scale: float = 1.,
736
+ offsets: Optional[torch.LongTensor] = None,
737
+ indices: Optional[torch.LongTensor] = None,
738
+ head_first: bool = True,
739
+ chunk_size: int = 64
740
+ ):
741
+ dq, dk, dv, dg, dh0 = chunk_gla_bwd(
742
+ q=q,
743
+ k=k,
744
+ v=v,
745
+ g=None,
746
+ g_cumsum=g,
747
+ scale=scale,
748
+ initial_state=h0,
749
+ h=h,
750
+ A=A,
751
+ do=do,
752
+ dht=dht,
753
+ offsets=offsets,
754
+ indices=indices,
755
+ head_first=head_first,
756
+ chunk_size=chunk_size
757
+ )
758
+ return dq, dk, dv, dg, dh0
759
+
760
+
761
+ def chunk_gsa_bwd_k(
762
+ q: torch.Tensor,
763
+ k: torch.Tensor,
764
+ v: torch.Tensor,
765
+ g: torch.Tensor,
766
+ h: torch.Tensor,
767
+ h0: torch.Tensor,
768
+ o: torch.Tensor,
769
+ do: torch.Tensor,
770
+ dht: torch.Tensor,
771
+ dg: torch.Tensor,
772
+ scale: float = 1.,
773
+ offsets: Optional[torch.LongTensor] = None,
774
+ indices: Optional[torch.LongTensor] = None,
775
+ head_first: bool = True,
776
+ chunk_size: int = 64
777
+ ):
778
+ if head_first:
779
+ B, H, T, K, V = *k.shape, v.shape[-1]
780
+ else:
781
+ B, T, H, K, V = *k.shape, v.shape[-1]
782
+ BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
783
+ BC = min(16, BT)
784
+ BK = min(64, triton.next_power_of_2(K))
785
+ BV = min(64, triton.next_power_of_2(V))
786
+ HQ = q.shape[1] if head_first else q.shape[2]
787
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
788
+ NC = triton.cdiv(BT, BC)
789
+ NK = triton.cdiv(K, BK)
790
+ NV = triton.cdiv(V, BV)
791
+ NG = HQ // H
792
+
793
+ if h is None:
794
+ h, _ = chunk_fwd_h(
795
+ k=k,
796
+ v=v,
797
+ g=None,
798
+ gk=None,
799
+ gv=g,
800
+ h0=h0,
801
+ output_final_state=False,
802
+ offsets=offsets,
803
+ head_first=head_first,
804
+ chunk_size=BT,
805
+ states_in_fp32=False
806
+ )
807
+ dh, dh0 = chunk_bwd_dh(
808
+ q=q,
809
+ k=k,
810
+ v=v,
811
+ g=None,
812
+ gk=None,
813
+ gv=g,
814
+ do=do,
815
+ h0=h0,
816
+ dht=dht,
817
+ scale=scale,
818
+ offsets=offsets,
819
+ head_first=head_first,
820
+ chunk_size=BT,
821
+ states_in_fp32=True
822
+ )
823
+ dA = q.new_empty(NV, B, *((HQ, T) if head_first else (T, HQ)), BT)
824
+ grid = (NV, NT * NC * NC, B * HQ)
825
+ chunk_gsa_bwd_k_kernel_dA[grid](
826
+ v,
827
+ g,
828
+ do,
829
+ dA,
830
+ offsets=offsets,
831
+ indices=indices,
832
+ scale=scale,
833
+ T=T,
834
+ B=B,
835
+ HQ=HQ,
836
+ H=H,
837
+ V=V,
838
+ BT=BT,
839
+ BC=BC,
840
+ BV=BV,
841
+ NC=NC,
842
+ NG=NG,
843
+ HEAD_FIRST=head_first
844
+ )
845
+ dA = dA.sum(0, dtype=dA.dtype)
846
+
847
+ A = do.new_empty(NK, B, *((HQ, T) if head_first else (T, HQ)), BT)
848
+ dq = torch.empty_like(q)
849
+ dk = k.new_empty(B, *((HQ, T) if head_first else (T, HQ)), K)
850
+ dv = v.new_empty(NK, B, *((HQ, T) if head_first else (T, HQ)), V)
851
+ dgv = g.new_empty(NK, B, *((HQ, T) if head_first else (T, HQ)), V, dtype=torch.float)
852
+ grid = (NK, NT, B * HQ)
853
+ chunk_gsa_bwd_k_kernel_dqkvg[grid](
854
+ q,
855
+ k,
856
+ v,
857
+ h,
858
+ g,
859
+ A,
860
+ do,
861
+ dh,
862
+ dq,
863
+ dk,
864
+ dv,
865
+ dg,
866
+ dgv,
867
+ dA,
868
+ offsets=offsets,
869
+ indices=indices,
870
+ scale=scale,
871
+ T=T,
872
+ B=B,
873
+ HQ=HQ,
874
+ H=H,
875
+ K=K,
876
+ V=V,
877
+ BT=BT,
878
+ BK=BK,
879
+ BV=BV,
880
+ NG=NG,
881
+ HEAD_FIRST=head_first
882
+ )
883
+ A = A.sum(0, dtype=A.dtype)
884
+ dv = dv.sum(0, dtype=dv.dtype)
885
+ dgv = dgv.sum(0, dtype=dgv.dtype)
886
+
887
+ def grid(meta): return (triton.cdiv(V, meta['BV']), NT * NC, B * HQ)
888
+ chunk_gsa_bwd_k_kernel_intra_dvg[grid](
889
+ v,
890
+ g,
891
+ o,
892
+ A,
893
+ do,
894
+ dv,
895
+ dg,
896
+ offsets=offsets,
897
+ indices=indices,
898
+ T=T,
899
+ HQ=HQ,
900
+ H=H,
901
+ V=V,
902
+ BT=BT,
903
+ BC=BC,
904
+ BV=BV,
905
+ NC=NC,
906
+ NG=NG,
907
+ HEAD_FIRST=head_first,
908
+ num_warps=4,
909
+ num_stages=2
910
+ )
911
+ dg = dgv.add_(chunk_local_cumsum(dg, chunk_size=BT, reverse=True, offsets=offsets, indices=indices, head_first=head_first))
912
+
913
+ return dq, dk, dv, dg, dh0
914
+
915
+
916
+ def chunk_gsa_fwd(
917
+ q: torch.Tensor,
918
+ k: torch.Tensor,
919
+ v: torch.Tensor,
920
+ s: torch.Tensor,
921
+ g: torch.Tensor,
922
+ initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
923
+ output_final_state: bool = False,
924
+ scale: float = 1.,
925
+ offsets: Optional[torch.LongTensor] = None,
926
+ indices: Optional[torch.LongTensor] = None,
927
+ head_first: bool = True,
928
+ chunk_size: int = 64
929
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
930
+ hk0, hv0 = None, None
931
+ if initial_state is not None:
932
+ hk0, hv0 = initial_state
933
+ Ak, hk, hkt, ok = chunk_gsa_fwd_k(
934
+ q=q,
935
+ k=k,
936
+ v=s,
937
+ g=g,
938
+ h0=hk0,
939
+ output_final_state=output_final_state,
940
+ scale=scale,
941
+ offsets=offsets,
942
+ indices=indices,
943
+ head_first=head_first,
944
+ chunk_size=chunk_size
945
+ )
946
+
947
+ # p is kept in fp32 for safe softmax backward
948
+ p = softmax_fwd(ok, dtype=torch.float)
949
+
950
+ qv = p.to(q.dtype)
951
+ Av, hv, hvt, ov = chunk_gsa_fwd_v(
952
+ q=qv,
953
+ k=s,
954
+ v=v,
955
+ g=g,
956
+ scale=1.,
957
+ initial_state=hv0,
958
+ output_final_state=output_final_state,
959
+ offsets=offsets,
960
+ indices=indices,
961
+ head_first=head_first,
962
+ chunk_size=chunk_size
963
+ )
964
+ return Ak, hk, hkt, ok, p, Av, hv, hvt, ov
965
+
966
+
967
+ def chunk_gsa_bwd(
968
+ q: torch.Tensor,
969
+ k: torch.Tensor,
970
+ v: torch.Tensor,
971
+ s: torch.Tensor,
972
+ g: torch.Tensor,
973
+ ok: torch.Tensor,
974
+ p: torch.Tensor,
975
+ A: Tuple[torch.Tensor, torch.Tensor],
976
+ h: Tuple[torch.Tensor, torch.Tensor],
977
+ initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]],
978
+ scale: float,
979
+ do: torch.Tensor,
980
+ dht: Tuple[torch.Tensor, torch.Tensor],
981
+ offsets: Optional[torch.LongTensor] = None,
982
+ indices: Optional[torch.LongTensor] = None,
983
+ head_first: bool = True,
984
+ chunk_size: int = 64
985
+ ):
986
+ hk0, hv0 = None, None
987
+ if initial_state is not None:
988
+ hk0, hv0 = initial_state
989
+
990
+ _, Av = A
991
+ hk, hv = h
992
+ dhkt, dhvt = dht
993
+
994
+ qv = p.to(q.dtype)
995
+ dqv, dsv, dv, dg, dhv0 = chunk_gsa_bwd_v(
996
+ q=qv,
997
+ k=s,
998
+ v=v,
999
+ g=g,
1000
+ h0=hv0,
1001
+ h=hv,
1002
+ A=Av,
1003
+ do=do,
1004
+ dht=dhvt,
1005
+ dg=None,
1006
+ scale=1.,
1007
+ offsets=offsets,
1008
+ indices=indices,
1009
+ head_first=head_first,
1010
+ chunk_size=chunk_size
1011
+ )
1012
+
1013
+ # softmax gradient, equivalent to:
1014
+ # dok = qv * (dqv - (qv * dqv).sum(-1, True))
1015
+ dok = softmax_bwd(p, dqv, dtype=ok.dtype)
1016
+
1017
+ dq, dk, dsk, dg, dhk0 = chunk_gsa_bwd_k(
1018
+ q=q,
1019
+ k=k,
1020
+ v=s,
1021
+ g=g,
1022
+ h0=hk0,
1023
+ h=hk,
1024
+ o=ok,
1025
+ do=dok,
1026
+ dht=dhkt,
1027
+ dg=dg,
1028
+ scale=scale,
1029
+ offsets=offsets,
1030
+ indices=indices,
1031
+ head_first=head_first,
1032
+ chunk_size=chunk_size
1033
+ )
1034
+
1035
+ ds = dsv.add_(dsk)
1036
+ if q.shape[1] != k.shape[1]:
1037
+ dk, dv, ds, dg = map(lambda x: reduce(x, 'b (h g) ... -> b h ...', 'sum', h=k.shape[1]), (dk, dv, ds, dg))
1038
+ dg = dg.to(s.dtype)
1039
+ return dq, dk, dv, ds, dg, dhk0, dhv0
1040
+
1041
+
1042
+ class ChunkGSAFunction(torch.autograd.Function):
1043
+
1044
+ @staticmethod
1045
+ @input_guard
1046
+ def forward(
1047
+ ctx,
1048
+ q: torch.Tensor,
1049
+ k: torch.Tensor,
1050
+ v: torch.Tensor,
1051
+ s: torch.Tensor,
1052
+ g: torch.Tensor,
1053
+ scale: float,
1054
+ hk0: Optional[torch.Tensor],
1055
+ hv0: Optional[torch.Tensor],
1056
+ output_final_state: bool,
1057
+ checkpoint_level: int,
1058
+ offsets: Optional[torch.LongTensor],
1059
+ head_first: bool = True
1060
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1061
+ T = q.shape[2] if head_first else q.shape[1]
1062
+ chunk_size = min(64, max(16, triton.next_power_of_2(T)))
1063
+
1064
+ # 2-d indices denoting the offsets of chunks in each sequence
1065
+ # for example, if the passed `offsets` is [0, 100, 356] and `chunk_size` is 64,
1066
+ # then there are 2 and 4 chunks in the 1st and 2nd sequences respectively, and `indices` will be
1067
+ # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]]
1068
+ indices = None
1069
+ if offsets is not None:
1070
+ indices = torch.cat([torch.arange(n) for n in triton.cdiv(offsets[1:] - offsets[:-1], chunk_size).tolist()])
1071
+ indices = torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(offsets)
1072
+ g_org, g = g, chunk_local_cumsum(g, chunk_size, offsets=offsets, indices=indices, head_first=head_first)
1073
+ Ak, hk, hkt, ok, p, Av, hv, hvt, ov = chunk_gsa_fwd(
1074
+ q=q,
1075
+ k=k,
1076
+ v=v,
1077
+ s=s,
1078
+ g=g,
1079
+ initial_state=(hk0, hv0),
1080
+ output_final_state=output_final_state,
1081
+ scale=scale,
1082
+ offsets=offsets,
1083
+ indices=indices,
1084
+ head_first=head_first,
1085
+ chunk_size=chunk_size
1086
+ )
1087
+
1088
+ if checkpoint_level >= 1:
1089
+ del g
1090
+ g = g_org
1091
+ if checkpoint_level > 1:
1092
+ del hk
1093
+ del hv
1094
+ hk, hv = None, None
1095
+ else:
1096
+ hk0, hv0 = None, None
1097
+
1098
+ ctx.save_for_backward(q, k, v, s, g, ok, p, Av, hk0, hv0, hk, hv)
1099
+ ctx.checkpoint_level = checkpoint_level
1100
+ ctx.scale = scale
1101
+ ctx.offsets = offsets
1102
+ ctx.indices = indices
1103
+ ctx.head_first = head_first
1104
+ ctx.chunk_size = chunk_size
1105
+ return ov, hkt, hvt
1106
+
1107
+ @staticmethod
1108
+ @input_guard
1109
+ def backward(ctx, dov, dhkt=None, dhvt=None):
1110
+ q, k, v, s, g, ok, p, Av, hk0, hv0, hk, hv = ctx.saved_tensors
1111
+ scale = ctx.scale
1112
+ offsets = ctx.offsets
1113
+ indices = ctx.indices
1114
+ head_first = ctx.head_first
1115
+ chunk_size = ctx.chunk_size
1116
+
1117
+ if ctx.checkpoint_level >= 1:
1118
+ g = chunk_local_cumsum(g, chunk_size, offsets=offsets, indices=indices, head_first=head_first)
1119
+ dq, dk, dv, ds, dg, dhk0, dhv0 = chunk_gsa_bwd(
1120
+ q=q,
1121
+ k=k,
1122
+ v=v,
1123
+ s=s,
1124
+ g=g,
1125
+ ok=ok,
1126
+ p=p,
1127
+ A=(None, Av),
1128
+ h=(hk, hv),
1129
+ initial_state=(hk0, hv0),
1130
+ scale=scale,
1131
+ do=dov,
1132
+ dht=(dhkt, dhvt),
1133
+ offsets=offsets,
1134
+ indices=indices,
1135
+ head_first=head_first,
1136
+ chunk_size=chunk_size
1137
+ )
1138
+ return dq, dk, dv, ds, dg, None, dhk0, dhv0, None, None, None, None
1139
+
1140
+
1141
+ @torch.compiler.disable
1142
+ def chunk_gsa(
1143
+ q: torch.Tensor,
1144
+ k: torch.Tensor,
1145
+ v: torch.Tensor,
1146
+ s: torch.Tensor,
1147
+ g: Optional[torch.Tensor] = None,
1148
+ scale: Optional[int] = None,
1149
+ initial_state: Optional[Tuple[torch.Tensor]] = None,
1150
+ output_final_state: Optional[bool] = False,
1151
+ checkpoint_level: Optional[int] = 2,
1152
+ cu_seqlens: Optional[torch.LongTensor] = None,
1153
+ head_first: Optional[bool] = True
1154
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1155
+ r"""
1156
+ Args:
1157
+ q (torch.Tensor):
1158
+ queries of shape `[B, HQ, T, K]` if `head_first=True` else `[B, T, HQ, K]`.
1159
+ k (torch.Tensor):
1160
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
1161
+ GQA is performed if `H` is not equal to `HQ`.
1162
+ v (torch.Tensor):
1163
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
1164
+ s (torch.Tensor):
1165
+ slot representations of shape `[B, H, T, M]` if `head_first=True` else `[B, T, H, M]`.
1166
+ g (torch.Tensor):
1167
+ Forget gates of shape `[B, H, T, M]` if `head_first=True` else `[B, T, H, M]`, applied to keys.
1168
+ If not provided, this function is equivalent to vanilla ABC.
1169
+ scale (Optional[int]):
1170
+ Scale factor for attention scores.
1171
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
1172
+ initial_state (Optional[Tuple[torch.Tensor]]):
1173
+ Initial state tuple having tensors of shape `[N, H, K, M]` and `[N, H, M, V]` for `N` input sequences.
1174
+ For equal-length input sequences, `N` equals the batch size `B`.
1175
+ Default: `None`.
1176
+ output_final_state (Optional[bool]):
1177
+ Whether to output the final state tuple, having tensors of shape `[N, H, K, M]` and `[N, H, M, V]`.
1178
+ Default: `False`.
1179
+ checkpoint_level (Optional[int]):
1180
+ Checkpointing level; higher values save more memory at the cost of more recomputation during the backward pass.
1181
+ Default: `2`:
1182
+ - Level `0`: no memory saved, no recomputation.
1183
+ - Level `1`: recompute the fp32 cumulative values during backward.
1184
+ - Level `2`: recompute the fp32 cumulative values and forward hidden states during backward.
1185
+ cu_seqlens (torch.LongTensor):
1186
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
1187
+ consistent with the FlashAttention API.
1188
+ head_first (Optional[bool]):
1189
+ Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
1190
+ Default: `True`.
1191
+
1192
+ Returns:
1193
+ o (torch.Tensor):
1194
+ Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
1195
+ final_state (Tuple[torch.Tensor]):
1196
+ Final state tuple having tensors of shape `[N, H, K, M]` and `[N, H, M, V]` if `output_final_state=True`.
1197
+ `None` otherwise.
1198
+
1199
+ Examples::
1200
+ >>> import torch
1201
+ >>> import torch.nn.functional as F
1202
+ >>> from einops import rearrange
1203
+ >>> from fla.ops.gsa import chunk_gsa
1204
+ # inputs with equal lengths
1205
+ >>> B, T, H, K, V, M = 4, 2048, 4, 512, 512, 64
1206
+ >>> q = torch.randn(B, T, H, K, device='cuda')
1207
+ >>> k = torch.randn(B, T, H, K, device='cuda')
1208
+ >>> v = torch.randn(B, T, H, V, device='cuda')
1209
+ >>> s = torch.randn(B, T, H, M, device='cuda')
1210
+ >>> g = F.logsigmoid(torch.randn(B, T, H, M, device='cuda'))
1211
+ >>> h0 = (torch.randn(B, H, K, M, device='cuda'), torch.randn(B, H, M, V, device='cuda'))
1212
+ >>> o, (hk, hv) = chunk_gsa(q, k, v, s, g,
1213
+ initial_state=h0,
1214
+ output_final_state=True,
1215
+ head_first=False)
1216
+ # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
1217
+ >>> q, k, v, s, g = map(lambda x: rearrange(x, 'b t h d -> 1 (b t) h d'), (q, k, v, s, g))
1218
+ # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
1219
+ >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
1220
+ >>> o_var, (hk_var, hv_var) = chunk_gsa(q, k, v, s, g,
1221
+ initial_state=h0,
1222
+ output_final_state=True,
1223
+ cu_seqlens=cu_seqlens,
1224
+ head_first=False)
1225
+ >>> assert o.allclose(o_var.view(o.shape))
1226
+ >>> assert hk.allclose(hk_var)
1227
+ >>> assert hv.allclose(hv_var)
1228
+ """
1229
+ if cu_seqlens is not None:
1230
+ if q.shape[0] != 1:
1231
+ raise ValueError(f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`. "
1232
+ f"Please flatten variable-length inputs before processing.")
1233
+ if head_first:
1234
+ raise RuntimeError("Sequences with variable lengths are not supported for head-first mode")
1235
+ if initial_state is not None and initial_state[0].shape[0] != len(cu_seqlens) - 1:
1236
+ raise ValueError(f"The number of initial states is expected to be equal to the number of input sequences, "
1237
+ f"i.e., {len(cu_seqlens) - 1} rather than {initial_state[0].shape[0]}.")
1238
+ assert checkpoint_level in [0, 1, 2]
1239
+ if g is None:
1240
+ # TODO: these 3 steps take a huge amount of time and ought to be optimized
1241
+ z = s.float().logcumsumexp(2)
1242
+ g = torch.cat((z[:, :, :1], z[:, :, :-1]), 2) - z
1243
+ s = torch.exp(s - z).to(k.dtype)
1244
+ if scale is None:
1245
+ scale = q.shape[-1] ** -0.5
1246
+
1247
+ hk0, hv0 = None, None
1248
+ if initial_state is not None:
1249
+ hk0, hv0 = initial_state
1250
+ o, *final_state = ChunkGSAFunction.apply(
1251
+ q,
1252
+ k,
1253
+ v,
1254
+ s,
1255
+ g,
1256
+ scale,
1257
+ hk0,
1258
+ hv0,
1259
+ output_final_state,
1260
+ checkpoint_level,
1261
+ cu_seqlens,
1262
+ head_first
1263
+ )
1264
+ return o, final_state
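
When `g` is not provided, `chunk_gsa` builds the gates itself via the `logcumsumexp` decomposition shown above. A minimal PyTorch sketch of that construction (not part of the diff; shapes are illustrative), with a sanity check of the identity `exp(g_t) + exp(s_t - z_t) = 1` for `t >= 1`:

import torch

B, H, T, M = 2, 4, 32, 8
s = torch.randn(B, H, T, M)

z = s.float().logcumsumexp(2)                      # z_t = log(sum_{i<=t} exp(s_i))
g = torch.cat((z[:, :, :1], z[:, :, :-1]), 2) - z  # g_t = z_{t-1} - z_t <= 0, with g_0 = 0
s_hat = torch.exp(s - z)                           # normalized slot scores in (0, 1]

# (s_hat, g) drive the gated recurrence h_t = exp(g_t) * h_{t-1} + k_t * s_hat_t;
# since exp(g_t) = Z_{t-1} / Z_t and s_hat_t = exp(s_t) / Z_t, they sum to 1 for t >= 1.
assert torch.allclose(torch.exp(g[:, :, 1:]) + s_hat[:, :, 1:],
                      torch.ones(B, H, T - 1, M), atol=1e-5)
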
fla/ops/gsa/fused_recurrent.py ADDED
@@ -0,0 +1,564 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2024, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.ops.common.fused_recurrent import fused_recurrent_bwd_kernel, fused_recurrent_fwd_kernel
11
+ from fla.ops.utils import chunk_global_cumsum
12
+ from fla.ops.utils.op import exp
13
+ from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard
14
+
15
+
16
+ @triton.jit
17
+ def fused_recurrent_gsa_inference_kernel(
18
+ q,
19
+ k,
20
+ v,
21
+ s,
22
+ g,
23
+ o,
24
+ hk0,
25
+ hv0,
26
+ hkt,
27
+ hvt,
28
+ scale,
29
+ K: tl.constexpr,
30
+ V: tl.constexpr,
31
+ M: tl.constexpr,
32
+ BK: tl.constexpr,
33
+ BV: tl.constexpr,
34
+ NG: tl.constexpr
35
+ ):
36
+ i_bh = tl.program_id(0)
37
+ i_bg = i_bh // NG
38
+
39
+ b_s = tl.load(s + i_bg * M + tl.arange(0, M)).to(tl.float32)
40
+ b_g = tl.load(g + i_bg * M + tl.arange(0, M)).to(tl.float32)
41
+ b_g = exp(b_g)
42
+
43
+ b_ok = tl.zeros([M], dtype=tl.float32)
44
+ for i_k in range(tl.cdiv(K, BK)):
45
+ o_k = i_k * BK + tl.arange(0, BK)
46
+
47
+ p_hk0 = hk0 + i_bg * K * M + (o_k[None, :]) * M + tl.arange(0, M)[:, None]
48
+ # [BK,]
49
+ mask_k = o_k < K
50
+ # [M, BK]
51
+ mask_hk = (tl.arange(0, M) < M)[:, None] & mask_k[None, :]
52
+ # [M, BK]
53
+ b_hk = tl.load(p_hk0, mask=mask_hk, other=0.).to(tl.float32)
54
+ # [BK,]
55
+ b_q = tl.load(q + i_bh * K + o_k, mask=mask_k, other=0.).to(tl.float32) * scale
56
+ b_k = tl.load(k + i_bg * K + o_k, mask=mask_k, other=0.).to(tl.float32)
57
+ b_hk = b_hk * b_g[:, None] + b_k[None, :] * b_s[:, None]
58
+ b_ok += tl.sum(b_hk * b_q[None, :], axis=1)
59
+
60
+ if i_bh % NG == 0:
61
+ p_hkt = hkt + i_bg * K * M + o_k[None, :] * M + tl.arange(0, M)[:, None]
62
+ tl.store(p_hkt, b_hk.to(p_hkt.dtype.element_ty), mask=mask_hk)
63
+
64
+ b_qv = tl.softmax(b_ok)
65
+ for i_v in range(tl.cdiv(V, BV)):
66
+ o_v = i_v * BV + tl.arange(0, BV)
67
+
68
+ p_hv0 = hv0 + i_bg * M * V + tl.arange(0, M)[None, :] * V + o_v[:, None]
69
+ # [BV,]
70
+ mask_v = o_v < V
71
+ # [BV, M]
72
+ mask_hv = mask_v[:, None] & (tl.arange(0, M) < M)[None, :]
73
+ # [BV, M]
74
+ b_hv = tl.load(p_hv0, mask=mask_hv, other=0).to(tl.float32)
75
+ # [BV,]
76
+ b_v = tl.load(v + i_bg * V + o_v, mask=mask_v, other=0).to(tl.float32)
77
+ b_hv = b_hv * b_g[None, :] + b_s[None, :] * b_v[:, None]
78
+ b_ov = tl.sum(b_hv * b_qv[None, :], axis=1)
79
+
80
+ tl.store(o + i_bh * V + o_v, b_ov.to(o.dtype.element_ty), mask=mask_v)
81
+
82
+ if i_bh % NG == 0:
83
+ p_hvt = hvt + i_bg * M * V + tl.arange(0, M)[None, :] * V + o_v[:, None]
84
+ tl.store(p_hvt, b_hv.to(p_hvt.dtype.element_ty), mask=mask_hv)
85
+
86
+
87
+ def fused_recurrent_gsa_inference(
88
+ q: torch.Tensor,
89
+ k: torch.Tensor,
90
+ v: torch.Tensor,
91
+ s: torch.Tensor,
92
+ g: torch.Tensor,
93
+ initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
94
+ output_final_state: bool = False,
95
+ scale: float = 1.,
96
+ head_first: bool = True
97
+ ) -> torch.Tensor:
98
+ if head_first:
99
+ B, H, T, K, V, M = *k.shape, v.shape[-1], s.shape[-1]
100
+ else:
101
+ B, T, H, K, V, M = *k.shape, v.shape[-1], s.shape[-1]
102
+ HQ = q.shape[1] if head_first else q.shape[2]
103
+ BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
104
+ NG = HQ // H
105
+
106
+ if initial_state != (None, None) and initial_state is not None:
107
+ hk0, hv0 = initial_state
108
+ else:
109
+ hk0, hv0 = q.new_zeros(B, H, K, M, dtype=torch.float), q.new_zeros(B, H, M, V, dtype=torch.float)
110
+
111
+ hkt, hvt = None, None
112
+ if output_final_state:
113
+ if NG == 1:
114
+ hkt, hvt = hk0, hv0
115
+ else:
116
+ hkt, hvt = q.new_empty(B, H, K, M, dtype=torch.float), q.new_empty(B, H, M, V, dtype=torch.float)
117
+
118
+ o = v.new_empty(B, HQ, T, V) if head_first else v.new_empty(B, T, HQ, V)
119
+ grid = (B * HQ,)
120
+ fused_recurrent_gsa_inference_kernel[grid](
121
+ q,
122
+ k,
123
+ v,
124
+ s,
125
+ g,
126
+ o,
127
+ hk0,
128
+ hv0,
129
+ hkt,
130
+ hvt,
131
+ scale=scale,
132
+ K=K,
133
+ V=V,
134
+ M=M,
135
+ BK=BK,
136
+ BV=BV,
137
+ NG=NG
138
+ )
139
+ return o, (hkt, hvt)
140
+
141
+
142
+ def fused_recurrent_gsa_fwd(
143
+ q: torch.Tensor,
144
+ k: torch.Tensor,
145
+ v: torch.Tensor,
146
+ s: torch.Tensor,
147
+ g: torch.Tensor,
148
+ initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
149
+ output_final_state: bool = False,
150
+ scale: float = 1.,
151
+ reverse: bool = False,
152
+ offsets: Optional[torch.LongTensor] = None,
153
+ head_first: bool = True
154
+ ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
155
+ if head_first:
156
+ B, H, T, K, V, M = *k.shape, v.shape[-1], s.shape[-1]
157
+ else:
158
+ B, T, H, K, V, M = *k.shape, v.shape[-1], s.shape[-1]
159
+ N = B if offsets is None else len(offsets) - 1
160
+ HQ = q.shape[1] if head_first else q.shape[2]
161
+ if HQ != H:
162
+ raise ValueError("GQA not supported yet.")
163
+
164
+ BK, BV, BM = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64), min(M, 64)
165
+ NK, NV, NM = triton.cdiv(K, BK), triton.cdiv(V, BV), triton.cdiv(M, BM)
166
+
167
+ hk0, hv0 = None, None
168
+ if initial_state != (None, None) and initial_state is not None:
169
+ hk0, hv0 = initial_state
170
+ hkt, hvt = None, None
171
+ if output_final_state:
172
+ hkt, hvt = q.new_empty(N, H, K, M, dtype=torch.float), q.new_empty(N, H, M, V, dtype=torch.float)
173
+
174
+ ok = q.new_empty(NK, *s.shape, dtype=torch.float)
175
+ gk, gv = None, g
176
+ grid = (NM, NK, N * H)
177
+ fused_recurrent_fwd_kernel[grid](
178
+ q=q,
179
+ k=k,
180
+ v=s,
181
+ g=None,
182
+ gk=gk,
183
+ gv=gv,
184
+ o=ok,
185
+ h0=hk0,
186
+ ht=hkt,
187
+ offsets=offsets,
188
+ scale=scale,
189
+ B=B,
190
+ T=T,
191
+ H=H,
192
+ K=K,
193
+ V=M,
194
+ BK=BK,
195
+ BV=BM,
196
+ USE_G=False,
197
+ USE_GK=False,
198
+ USE_GV=True,
199
+ REVERSE=reverse,
200
+ HEAD_FIRST=head_first
201
+ )
202
+ ok = ok.sum(0)
203
+
204
+ qv = ok.softmax(-1, dtype=torch.float)
205
+ ov = q.new_empty(NM, *v.shape, dtype=torch.float)
206
+ gk, gv = g, None
207
+ grid = (NV, NM, N * H)
208
+ fused_recurrent_fwd_kernel[grid](
209
+ q=qv,
210
+ k=s,
211
+ v=v,
212
+ g=None,
213
+ gk=gk,
214
+ gv=gv,
215
+ o=ov,
216
+ h0=hv0,
217
+ ht=hvt,
218
+ offsets=offsets,
219
+ scale=1.,
220
+ B=B,
221
+ T=T,
222
+ H=H,
223
+ K=M,
224
+ V=V,
225
+ BK=BM,
226
+ BV=BV,
227
+ USE_G=False,
228
+ USE_GK=True,
229
+ USE_GV=False,
230
+ REVERSE=reverse,
231
+ HEAD_FIRST=head_first
232
+ )
233
+ ov = ov.sum(0)
234
+ return ok, hkt, qv, ov, hvt
235
+
236
+
237
+ def fused_recurrent_gsa_bwd(
238
+ q: torch.Tensor,
239
+ k: torch.Tensor,
240
+ v: torch.Tensor,
241
+ s: torch.Tensor,
242
+ g: torch.Tensor,
243
+ qv: torch.Tensor,
244
+ hk0: Optional[torch.Tensor] = None,
245
+ hv0: Optional[torch.Tensor] = None,
246
+ ok: Optional[torch.Tensor] = None,
247
+ do: Optional[torch.Tensor] = None,
248
+ dhkt: Optional[torch.Tensor] = None,
249
+ dhvt: Optional[torch.Tensor] = None,
250
+ scale: float = 1.,
251
+ reverse: bool = False,
252
+ offsets: Optional[torch.LongTensor] = None,
253
+ head_first: bool = True
254
+ ) -> Tuple[torch.Tensor]:
255
+ if head_first:
256
+ B, H, T, K, V, M = *q.shape, v.shape[-1], s.shape[-1]
257
+ else:
258
+ B, T, H, K, V, M = *q.shape, v.shape[-1], s.shape[-1]
259
+ N = B if offsets is None else len(offsets) - 1
260
+
261
+ BK, BV, BM = min(K, 64), min(V, 64), min(M, 64)
262
+ NK, NV, NM = triton.cdiv(K, BK), triton.cdiv(V, BV), triton.cdiv(M, BM)
263
+
264
+ if head_first:
265
+ dqv = q.new_empty(NV, B, H, T, M, dtype=torch.float)
266
+ dsv = q.new_empty(NV, B, H, T, M, dtype=torch.float)
267
+ dv = q.new_empty(NM, B, H, T, V, dtype=torch.float)
268
+ else:
269
+ dqv = q.new_empty(NV, B, T, H, M, dtype=torch.float)
270
+ dsv = q.new_empty(NV, B, T, H, M, dtype=torch.float)
271
+ dv = q.new_empty(NM, B, T, H, V, dtype=torch.float)
272
+ dhk0 = torch.empty_like(hk0) if hk0 is not None else None
273
+ dhv0 = torch.empty_like(hv0) if hv0 is not None else None
274
+
275
+ gk, gv = g, None
276
+ grid = (NV, NM, N * H)
277
+ fused_recurrent_bwd_kernel[grid](
278
+ q=qv,
279
+ k=s,
280
+ v=v,
281
+ g=None,
282
+ gk=gk,
283
+ gv=gv,
284
+ h0=hv0,
285
+ do=do,
286
+ dq=dqv,
287
+ dk=dsv,
288
+ dv=dv,
289
+ dht=dhvt,
290
+ dh0=dhv0,
291
+ offsets=offsets,
292
+ scale=1.,
293
+ B=B,
294
+ T=T,
295
+ H=H,
296
+ K=M,
297
+ V=V,
298
+ BK=BM,
299
+ BV=BV,
300
+ USE_G=False,
301
+ USE_GK=True,
302
+ USE_GV=False,
303
+ REVERSE=reverse,
304
+ HEAD_FIRST=head_first
305
+ )
306
+ dqv = dqv.sum(0)
307
+ dsv = dsv.sum(0)
308
+ dv = dv.sum(0)
309
+ dgk = chunk_global_cumsum(dqv * qv.float() - dsv * s.float(),
310
+ reverse=not reverse,
311
+ offsets=offsets,
312
+ head_first=head_first)
313
+
314
+ dok = qv * (dqv - (qv * dqv).sum(-1, True))
315
+ if head_first:
316
+ dq = q.new_empty(NM, B, H, T, K, dtype=torch.float)
317
+ dk = q.new_empty(NM, B, H, T, K, dtype=torch.float)
318
+ dsk = q.new_empty(NK, B, H, T, M, dtype=torch.float)
319
+ else:
320
+ dq = q.new_empty(NM, B, T, H, K, dtype=torch.float)
321
+ dk = q.new_empty(NM, B, T, H, K, dtype=torch.float)
322
+ dsk = q.new_empty(NK, B, T, H, M, dtype=torch.float)
323
+ gk, gv = None, g
324
+ grid = (NM, NK, N * H)
325
+ fused_recurrent_bwd_kernel[grid](
326
+ q=q,
327
+ k=k,
328
+ v=s,
329
+ g=None,
330
+ gk=gk,
331
+ gv=gv,
332
+ h0=hk0,
333
+ do=dok,
334
+ dq=dq,
335
+ dk=dk,
336
+ dv=dsk,
337
+ dht=dhkt,
338
+ dh0=dhk0,
339
+ offsets=offsets,
340
+ scale=scale,
341
+ B=B,
342
+ T=T,
343
+ H=H,
344
+ K=K,
345
+ V=M,
346
+ BK=BK,
347
+ BV=BM,
348
+ USE_G=False,
349
+ USE_GK=False,
350
+ USE_GV=True,
351
+ REVERSE=reverse,
352
+ HEAD_FIRST=head_first
353
+ )
354
+ dq = dq.sum(0)
355
+ dk = dk.sum(0)
356
+ dsk = dsk.sum(0)
357
+
358
+ dgv = chunk_global_cumsum(dok.float() * ok.float() - dsk * s.float(),
359
+ reverse=not reverse,
360
+ offsets=offsets,
361
+ head_first=head_first)
362
+
363
+ ds = dsk.add_(dsv)
364
+ dg = dgk.add_(dgv)
365
+
366
+ return dq, dk, dv, ds, dg, dhk0, dhv0
367
+
368
+
369
+ class FusedRecurrentGSAFunction(torch.autograd.Function):
370
+
371
+ @staticmethod
372
+ @input_guard
373
+ @autocast_custom_fwd
374
+ def forward(
375
+ ctx,
376
+ q: torch.Tensor,
377
+ k: torch.Tensor,
378
+ v: torch.Tensor,
379
+ s: torch.Tensor,
380
+ g: torch.Tensor,
381
+ scale: Optional[float] = None,
382
+ hk0: Optional[torch.Tensor] = None,
383
+ hv0: Optional[torch.Tensor] = None,
384
+ output_final_state: bool = False,
385
+ reverse: bool = False,
386
+ offsets: Optional[torch.LongTensor] = None,
387
+ head_first: bool = True
388
+ ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
389
+ T = q.shape[2] if head_first else q.shape[1]
390
+ if T == 1 and not q.requires_grad:
391
+ o, (hkt, hvt) = fused_recurrent_gsa_inference(
392
+ q=q,
393
+ k=k,
394
+ v=v,
395
+ s=s,
396
+ g=g,
397
+ initial_state=(hk0, hv0),
398
+ output_final_state=output_final_state,
399
+ scale=scale,
400
+ head_first=head_first
401
+ )
402
+ return o, hkt, hvt
403
+ ok, hkt, qv, ov, hvt = fused_recurrent_gsa_fwd(
404
+ q=q,
405
+ k=k,
406
+ v=v,
407
+ s=s,
408
+ g=g,
409
+ initial_state=(hk0, hv0),
410
+ output_final_state=output_final_state,
411
+ scale=scale,
412
+ reverse=reverse,
413
+ offsets=offsets,
414
+ head_first=head_first
415
+ )
416
+ ctx.save_for_backward(q, k, v, s, g, qv, hk0, hv0, ok)
417
+ ctx.scale = scale
418
+ ctx.reverse = reverse
419
+ ctx.offsets = offsets
420
+ ctx.head_first = head_first
421
+ return ov.to(q.dtype), hkt, hvt
422
+
423
+ @staticmethod
424
+ @input_guard
425
+ @autocast_custom_bwd
426
+ def backward(ctx, do, dhkt=None, dhvt=None):
427
+ q, k, v, s, g, qv, hk0, hv0, ok = ctx.saved_tensors
428
+ scale = ctx.scale
429
+ reverse = ctx.reverse
430
+ offsets = ctx.offsets
431
+ head_first = ctx.head_first
432
+
433
+ # passing final-state gradients together with gates is not supported yet
434
+ if dhkt is not None or dhvt is not None:
435
+ if g is not None:
436
+ assert g.requires_grad is False, "Cannot load final state gradient and use gates at the same time"
437
+ dq, dk, dv, ds, dg, dhk0, dhv0 = fused_recurrent_gsa_bwd(
438
+ q=q,
439
+ k=k,
440
+ v=v,
441
+ s=s,
442
+ g=g,
443
+ qv=qv,
444
+ hk0=hk0,
445
+ hv0=hv0,
446
+ ok=ok,
447
+ do=do,
448
+ dhkt=dhkt,
449
+ dhvt=dhvt,
450
+ scale=scale,
451
+ reverse=reverse,
452
+ offsets=offsets,
453
+ head_first=head_first
454
+ )
455
+ return dq.to(q), dk.to(k), dv.to(v), ds.to(s), dg.to(g), None, dhk0, dhv0, None, None, None, None
456
+
457
+
458
+ def fused_recurrent_gsa(
459
+ q: torch.Tensor,
460
+ k: torch.Tensor,
461
+ v: torch.Tensor,
462
+ s: torch.Tensor,
463
+ g: Optional[torch.Tensor] = None,
464
+ scale: Optional[int] = None,
465
+ initial_state: Optional[Tuple[torch.Tensor]] = None,
466
+ output_final_state: Optional[bool] = False,
467
+ reverse: Optional[bool] = False,
468
+ cu_seqlens: Optional[torch.LongTensor] = None,
469
+ head_first: bool = True
470
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
471
+ r"""
472
+ Args:
473
+ q (torch.Tensor):
474
+ queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
475
+ k (torch.Tensor):
476
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
477
+ v (torch.Tensor):
478
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
479
+ s (torch.Tensor):
480
+ slot representations of shape `[B, H, T, M]` if `head_first=True` else `[B, T, H, M]`.
481
+ g (torch.Tensor):
482
+ Forget gates of shape `[B, H, T, M]` if `head_first=True` else `[B, T, H, M]`, applied to keys.
483
+ scale (Optional[int]):
484
+ Scale factor for the attention scores.
485
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
486
+ initial_state (Optional[Tuple[torch.Tensor]]):
487
+ Initial state tuple having tensors of shape `[N, H, K, M]` and `[N, H, M, V]` for `N` input sequences.
488
+ For equal-length input sequences, `N` equals the batch size `B`.
489
+ Default: `None`.
490
+ output_final_state (Optional[bool]):
491
+ Whether to output the final state tuple, having tensors of shape `[N, H, K, M]` and `[N, H, M, V]`.
492
+ Default: `False`.
493
+ reverse (Optional[bool]):
494
+ If `True`, process the state passing in reverse order. Default: `False`.
495
+ cu_seqlens (torch.LongTensor):
496
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
497
+ consistent with the FlashAttention API.
498
+ head_first (Optional[bool]):
499
+ Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
500
+ Default: `True`.
501
+
502
+ Returns:
503
+ o (torch.Tensor):
504
+ Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
505
+ final_state (Tuple[torch.Tensor]):
506
+ Final state tuple having tensors of shape `[N, H, K, M]` and `[N, H, M, V]`.
507
+
508
+ Examples::
509
+ >>> import torch
510
+ >>> import torch.nn.functional as F
511
+ >>> from einops import rearrange
512
+ >>> from fla.ops.gsa import fused_recurrent_gsa
513
+ # inputs with equal lengths
514
+ >>> B, T, H, K, V, M = 4, 2048, 4, 512, 512, 64
515
+ >>> q = torch.randn(B, T, H, K, device='cuda')
516
+ >>> k = torch.randn(B, T, H, K, device='cuda')
517
+ >>> v = torch.randn(B, T, H, V, device='cuda')
518
+ >>> s = torch.randn(B, T, H, M, device='cuda')
519
+ >>> g = F.logsigmoid(torch.randn(B, T, H, M, device='cuda'))
520
+ >>> h0 = (torch.randn(B, H, K, M, device='cuda'), torch.randn(B, H, M, V, device='cuda'))
521
+ >>> o, (hk, hv) = fused_recurrent_gsa(q, k, v, s, g,
522
+ initial_state=h0,
523
+ output_final_state=True,
524
+ head_first=False)
525
+ # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
526
+ >>> q, k, v, s, g = map(lambda x: rearrange(x, 'b t h d -> 1 (b t) h d'), (q, k, v, s, g))
527
+ # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
528
+ >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
529
+ >>> o_var, (hk_var, hv_var) = fused_recurrent_gsa(q, k, v, s, g,
530
+ initial_state=h0,
531
+ output_final_state=True,
532
+ cu_seqlens=cu_seqlens,
533
+ head_first=False)
534
+ >>> assert o.allclose(o_var.view(o.shape))
535
+ >>> assert hk.allclose(hk_var)
536
+ >>> assert hv.allclose(hv_var)
537
+ """
538
+ if cu_seqlens is not None:
539
+ if q.shape[0] != 1:
540
+ raise ValueError(f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`. "
541
+ f"Please flatten variable-length inputs before processing.")
542
+ if head_first:
543
+ raise RuntimeError("Sequences with variable lengths are not supported for head-first mode")
544
+ if initial_state is not None and initial_state[0].shape[0] != len(cu_seqlens) - 1:
545
+ raise ValueError(f"The number of initial states is expected to be equal to the number of input sequences, "
546
+ f"i.e., {len(cu_seqlens) - 1} rather than {initial_state[0].shape[0]}.")
547
+ if scale is None:
548
+ scale = k.shape[-1] ** -0.5
549
+ if initial_state is None:
550
+ initial_state = (None, None)
551
+ o, *final_state = FusedRecurrentGSAFunction.apply(
552
+ q,
553
+ k,
554
+ v,
555
+ s,
556
+ g,
557
+ scale,
558
+ *initial_state,
559
+ output_final_state,
560
+ reverse,
561
+ cu_seqlens,
562
+ head_first
563
+ )
564
+ return o, final_state
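
For autoregressive decoding, the `T == 1` branch above dispatches to the single-step inference kernel. A hedged usage sketch (assumes a CUDA device and the default head-first `[B, H, T, *]` layout; the shapes below are illustrative):

import torch
import torch.nn.functional as F
from fla.ops.gsa import fused_recurrent_gsa

B, H, K, V, M = 2, 4, 64, 64, 16
# running state: ([B, H, K, M], [B, H, M, V])
state = (torch.zeros(B, H, K, M, device='cuda'),
         torch.zeros(B, H, M, V, device='cuda'))
with torch.no_grad():
    for _ in range(16):  # decode 16 tokens one step at a time
        q = torch.randn(B, H, 1, K, device='cuda')
        k = torch.randn(B, H, 1, K, device='cuda')
        v = torch.randn(B, H, 1, V, device='cuda')
        s = torch.randn(B, H, 1, M, device='cuda')
        g = F.logsigmoid(torch.randn(B, H, 1, M, device='cuda'))
        o, state = fused_recurrent_gsa(
            q, k, v, s, g, initial_state=state, output_final_state=True
        )
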
fla/ops/gsa/naive.py ADDED
@@ -0,0 +1,68 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from einops import repeat
7
+
8
+
9
+ def naive_recurrent_gsa(
10
+ q: torch.Tensor,
11
+ k: torch.Tensor,
12
+ v: torch.Tensor,
13
+ s: torch.Tensor,
14
+ g: Optional[torch.Tensor] = None,
15
+ scale: Optional[int] = None,
16
+ initial_state: Optional[torch.Tensor] = None,
17
+ output_final_state: Optional[bool] = False
18
+ ) -> torch.Tensor:
19
+ dtype = q.dtype
20
+
21
+ NG = q.shape[1]//k.shape[1]
22
+ # [batch_size, n_heads, seq_len, n_slots]
23
+ if g is None:
24
+ z = s.float().logcumsumexp(2)
25
+ g = torch.cat((z[:, :, :1], z[:, :, :-1]), 2) - z
26
+ s = torch.exp(s - z)
27
+ q, k, v, s, g = map(lambda x: x.float(), (q, k, v, s, g))
28
+ k, v, s, g = map(lambda x: repeat(x, 'b h t d -> b (h g) t d', g=NG), (k, v, s, g))
29
+ if initial_state is not None:
30
+ initial_state = tuple(map(lambda x: repeat(x, 'b h k v -> b (h g) k v', g=NG), initial_state))
31
+
32
+ B, H, T, K, V, M = *q.shape, v.shape[-1], s.shape[-1]
33
+
34
+ hk = torch.zeros(B, H, K, M, dtype=torch.float, device=q.device)
35
+ ok = torch.zeros_like(s)
36
+
37
+ if scale is None:
38
+ scale = q.shape[-1] ** -0.5
39
+
40
+ final_state = None
41
+ if initial_state is not None:
42
+ hk += initial_state[0]
43
+
44
+ for i in range(T):
45
+ q_i = q[:, :, i] * scale
46
+ k_i = k[:, :, i]
47
+ v_i = s[:, :, i]
48
+ g_i = g[:, :, i].exp()
49
+ hk = hk * g_i[..., None, :] + k_i[..., None] * v_i[..., None, :]
50
+ ok[:, :, i] = (q_i[..., None] * hk).sum(-2)
51
+
52
+ qv = ok.softmax(-1)
53
+ hv = torch.zeros(B, H, M, V, dtype=torch.float, device=q.device)
54
+ ov = torch.zeros_like(v)
55
+ if initial_state is not None:
56
+ hv += initial_state[1]
57
+
58
+ for i in range(T):
59
+ q_i = qv[:, :, i]
60
+ k_i = s[:, :, i]
61
+ v_i = v[:, :, i]
62
+ g_i = g[:, :, i].exp()
63
+ hv = hv * g_i[..., :, None] + k_i[..., None] * v_i[..., None, :]
64
+ ov[:, :, i] = (q_i[..., None] * hv).sum(-2)
65
+
66
+ if output_final_state:
67
+ final_state = (hk.view(B, -1, NG, K, M)[:, :, 0], hv.view(B, -1, NG, M, V)[:, :, 0])
68
+ return ov.to(dtype), final_state
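
The reference above is mainly useful for testing the Triton kernels. A hedged consistency check against `fused_recurrent_gsa` (assumes a CUDA device; small shapes chosen only for illustration):

import torch
import torch.nn.functional as F
from fla.ops.gsa import fused_recurrent_gsa
from fla.ops.gsa.naive import naive_recurrent_gsa

B, H, T, K, V, M = 2, 2, 64, 32, 32, 16
q = torch.randn(B, H, T, K, device='cuda')
k = torch.randn(B, H, T, K, device='cuda')
v = torch.randn(B, H, T, V, device='cuda')
s = torch.randn(B, H, T, M, device='cuda')
g = F.logsigmoid(torch.randn(B, H, T, M, device='cuda'))

ref, _ = naive_recurrent_gsa(q, k, v, s, g)
tri, _ = fused_recurrent_gsa(q, k, v, s, g)  # head_first=True by default
assert ref.allclose(tri, atol=1e-3)
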
fla/ops/hgrn/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .chunk import chunk_hgrn
4
+ from .fused_recurrent import fused_recurrent_hgrn
5
+
6
+ __all__ = [
7
+ 'chunk_hgrn',
8
+ 'fused_recurrent_hgrn'
9
+ ]
fla/ops/hgrn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (322 Bytes). View file
 
fla/ops/hgrn/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (16.9 kB). View file
 
fla/ops/hgrn/__pycache__/fused_recurrent.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
fla/ops/hgrn/fused_recurrent.py ADDED
@@ -0,0 +1,308 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.ops.utils.op import exp
11
+ from fla.utils import input_guard
12
+
13
+
14
+ @triton.heuristics({
15
+ 'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
16
+ 'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
17
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
18
+ })
19
+ @triton.autotune(
20
+ configs=[
21
+ triton.Config({'BD': BD}, num_warps=num_warps)
22
+ for BD in [32, 64, 128]
23
+ for num_warps in [1, 2, 4, 8]
24
+ ],
25
+ key=['D']
26
+ )
27
+ @triton.jit(do_not_specialize=['T'])
28
+ def fused_recurrent_hgrn_fwd_kernel(
29
+ x,
30
+ g,
31
+ o,
32
+ h0,
33
+ ht,
34
+ offsets,
35
+ T,
36
+ D: tl.constexpr,
37
+ BD: tl.constexpr,
38
+ USE_INITIAL_STATE: tl.constexpr,
39
+ STORE_FINAL_STATE: tl.constexpr,
40
+ USE_OFFSETS: tl.constexpr
41
+ ):
42
+ i_d, i_n = tl.program_id(0), tl.program_id(1)
43
+ if USE_OFFSETS:
44
+ bos, eos = tl.load(offsets + i_n).to(tl.int64), tl.load(offsets + i_n + 1).to(tl.int64)
45
+ T = eos - bos
46
+ else:
47
+ bos, eos = i_n * T, i_n * T + T
48
+
49
+ o_d = i_d * BD + tl.arange(0, BD)
50
+ mask = o_d < D
51
+
52
+ p_x = x + bos * D + o_d
53
+ p_g = g + bos * D + o_d
54
+ p_o = o + bos * D + o_d
55
+
56
+ b_h = tl.zeros([BD], dtype=tl.float32)
57
+ if USE_INITIAL_STATE:
58
+ p_h0 = h0 + i_n * D + o_d
59
+ b_h += tl.load(p_h0, mask=mask, other=0).to(tl.float32)
60
+ for _ in range(0, T):
61
+ b_x = tl.load(p_x, mask=mask, other=0).to(tl.float32)
62
+ b_g = tl.load(p_g, mask=mask, other=0).to(tl.float32)
63
+ b_h = exp(b_g) * b_h + b_x
64
+ tl.store(p_o, b_h.to(p_o.dtype.element_ty), mask=mask)
65
+
66
+ p_x += D
67
+ p_g += D
68
+ p_o += D
69
+
70
+ if STORE_FINAL_STATE:
71
+ p_ht = ht + i_n * D + o_d
72
+ tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask)
73
+
74
+
75
+ @triton.heuristics({
76
+ 'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
77
+ 'USE_FINAL_STATE_GRADIENT': lambda args: args['dht'] is not None,
78
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
79
+ })
80
+ @triton.autotune(
81
+ configs=[
82
+ triton.Config({'BD': BD}, num_warps=num_warps)
83
+ for BD in [32, 64, 128]
84
+ for num_warps in [1, 2, 4, 8]
85
+ ],
86
+ key=['D']
87
+ )
88
+ @triton.jit(do_not_specialize=['T'])
89
+ def fused_recurrent_hgrn_bwd_kernel(
90
+ g,
91
+ o,
92
+ h0,
93
+ dx,
94
+ dg,
95
+ do,
96
+ dht,
97
+ dh0,
98
+ offsets,
99
+ T,
100
+ D: tl.constexpr,
101
+ BD: tl.constexpr,
102
+ USE_INITIAL_STATE: tl.constexpr,
103
+ USE_FINAL_STATE_GRADIENT: tl.constexpr,
104
+ USE_OFFSETS: tl.constexpr
105
+ ):
106
+ i_d, i_n = tl.program_id(0), tl.program_id(1)
107
+ if USE_OFFSETS:
108
+ bos, eos = tl.load(offsets + i_n).to(tl.int64), tl.load(offsets + i_n + 1).to(tl.int64)
109
+ T = eos - bos
110
+ else:
111
+ bos, eos = i_n * T, i_n * T + T
112
+
113
+ o_d = i_d * BD + tl.arange(0, BD)
114
+ mask = o_d < D
115
+
116
+ p_g = g + (bos + T - 1) * D + o_d
117
+ p_o = o + (bos + T - 2) * D + o_d
118
+ p_dx = dx + (bos + T - 1) * D + o_d
119
+ p_dg = dg + (bos + T - 1) * D + o_d
120
+ p_do = do + (bos + T - 1) * D + o_d
121
+
122
+ b_dh = tl.zeros([BD], dtype=tl.float32)
123
+ if USE_FINAL_STATE_GRADIENT:
124
+ p_dht = dht + i_n * D + o_d
125
+ b_dh += tl.load(p_dht, mask=mask, other=0).to(tl.float32)
126
+
127
+ for i in range(T - 1, -1, -1):
128
+ b_g = tl.load(p_g, mask=mask, other=0).to(tl.float32)
129
+ b_do = tl.load(p_do, mask=mask, other=0).to(tl.float32)
130
+ if i > 0:
131
+ b_o = tl.load(p_o, mask=mask, other=0).to(tl.float32)
132
+ elif USE_INITIAL_STATE:
133
+ b_o = tl.load(h0 + i_n * D + o_d, mask=mask, other=0).to(tl.float32)
134
+ else:
135
+ b_o = tl.zeros([BD], dtype=tl.float32)
136
+
137
+ b_dh = b_dh + b_do
138
+ b_dx = b_dh
139
+ b_dh = b_dh * exp(b_g)
140
+ b_dg = b_dh * b_o
141
+ tl.store(p_dx, b_dx.to(p_dx.dtype.element_ty), mask=mask)
142
+ tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), mask=mask)
143
+
144
+ p_g -= D
145
+ p_o -= D
146
+ p_dx -= D
147
+ p_dg -= D
148
+ p_do -= D
149
+
150
+ if USE_INITIAL_STATE:
151
+ p_dh0 = dh0 + i_n * D + o_d
152
+ tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), mask=mask)
153
+
154
+
155
+ def fused_recurrent_hgrn_fwd(
156
+ x: torch.Tensor,
157
+ g: torch.Tensor,
158
+ initial_state: torch.Tensor = None,
159
+ output_final_state: bool = False,
160
+ offsets: Optional[torch.LongTensor] = None,
161
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
162
+ B, T, D = x.shape
163
+ N = B if offsets is None else len(offsets) - 1
164
+
165
+ o = torch.empty_like(x)
166
+ final_state = x.new_empty(N, D) if output_final_state else None
167
+
168
+ def grid(meta): return (triton.cdiv(D, meta['BD']), N)
169
+ fused_recurrent_hgrn_fwd_kernel[grid](
170
+ x=x,
171
+ g=g,
172
+ o=o,
173
+ h0=initial_state,
174
+ ht=final_state,
175
+ offsets=offsets,
176
+ T=T,
177
+ D=D
178
+ )
179
+ return o, final_state
180
+
181
+
182
+ def fused_recurrent_hgrn_bwd(
183
+ g: torch.Tensor,
184
+ o: torch.Tensor,
185
+ do: torch.Tensor,
186
+ dht: torch.Tensor = None,
187
+ initial_state: torch.Tensor = None,
188
+ offsets: Optional[torch.LongTensor] = None
189
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
190
+ B, T, D = do.shape
191
+ N = B if offsets is None else len(offsets) - 1
192
+
193
+ dx = torch.empty_like(o, dtype=torch.float)
194
+ dg = torch.empty_like(g, dtype=torch.float)
195
+ dh0 = torch.empty_like(initial_state, dtype=torch.float) if initial_state is not None else None
196
+ def grid(meta): return (triton.cdiv(D, meta['BD']), N)
197
+ fused_recurrent_hgrn_bwd_kernel[grid](
198
+ g=g,
199
+ o=o,
200
+ h0=initial_state,
201
+ dx=dx,
202
+ dg=dg,
203
+ do=do,
204
+ dht=dht,
205
+ dh0=dh0,
206
+ offsets=offsets,
207
+ T=T,
208
+ D=D
209
+ )
210
+ return dx, dg, dh0
211
+
212
+
213
+ class FusedRecurrentHGRNFunction(torch.autograd.Function):
214
+
215
+ @staticmethod
216
+ @input_guard
217
+ def forward(
218
+ ctx,
219
+ x: torch.Tensor,
220
+ g: torch.Tensor,
221
+ initial_state: torch.Tensor = None,
222
+ output_final_state: bool = False,
223
+ offsets: Optional[torch.LongTensor] = None
224
+ ):
225
+ o, ht = fused_recurrent_hgrn_fwd(
226
+ x=x,
227
+ g=g,
228
+ initial_state=initial_state,
229
+ output_final_state=output_final_state,
230
+ offsets=offsets
231
+ )
232
+ ctx.save_for_backward(g, o, initial_state)
233
+ ctx.offsets = offsets
234
+ return o, ht
235
+
236
+ @staticmethod
237
+ @input_guard
238
+ def backward(ctx, do, dht=None):
239
+ g, o, initial_state = ctx.saved_tensors
240
+ offsets = ctx.offsets
241
+
242
+ dx, dg, dh0 = fused_recurrent_hgrn_bwd(
243
+ g=g,
244
+ o=o,
245
+ do=do,
246
+ dht=dht,
247
+ initial_state=initial_state,
248
+ offsets=offsets
249
+ )
250
+ return dx, dg, dh0, None, None
251
+
252
+
253
+ @torch.compiler.disable
254
+ def fused_recurrent_hgrn(
255
+ x: torch.Tensor,
256
+ g: torch.Tensor,
257
+ initial_state: torch.Tensor = None,
258
+ output_final_state: bool = False,
259
+ cu_seqlens: Optional[torch.LongTensor] = None,
260
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
261
+ r"""
262
+ Args:
263
+ x (torch.Tensor):
264
+ inputs of shape `[B, T, D]`.
265
+ g (torch.Tensor):
266
+ Forget gates of shape `[B, T, D]`.
267
+ initial_state (Optional[torch.Tensor]):
268
+ Initial state of shape `[N, D]` for `N` input sequences.
269
+ For equal-length input sequences, `N` equals the batch size `B`.
270
+ Default: `None`.
271
+ output_final_state (Optional[bool]):
272
+ Whether to output the final state of shape `[N, D]`. Default: `False`.
273
+ cu_seqlens (torch.LongTensor):
274
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
275
+ consistent with the FlashAttention API.
276
+
277
+ Returns:
278
+ o (torch.Tensor):
279
+ Outputs of shape `[B, T, D]`.
280
+ final_state (torch.Tensor):
281
+ Final state of shape `[N, D]` if `output_final_state=True` else `None`.
282
+
283
+ Examples::
284
+ >>> import torch
285
+ >>> import torch.nn.functional as F
286
+ >>> from einops import rearrange
287
+ >>> from fla.ops.hgrn import fused_recurrent_hgrn
288
+ # inputs with equal lengths
289
+ >>> B, T, D = 4, 2048, 512
290
+ >>> x = torch.randn(B, T, D, device='cuda')
291
+ >>> g = F.logsigmoid(torch.randn(B, T, D, device='cuda'))
292
+ >>> h0 = torch.randn(B, D, device='cuda')
293
+ >>> o, ht = fused_recurrent_hgrn(x, g, initial_state=h0, output_final_state=True)
294
+ # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
295
+ >>> x, g = map(lambda x: rearrange(x, 'b t d -> 1 (b t) d'), (x, g))
296
+ # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
297
+ >>> cu_seqlens = x.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
298
+ >>> o_var, ht_var = fused_recurrent_hgrn(x, g, initial_state=h0, output_final_state=True, cu_seqlens=cu_seqlens)
299
+ >>> assert o.allclose(o_var.view(o.shape))
300
+ >>> assert ht.allclose(ht_var)
301
+ """
302
+ return FusedRecurrentHGRNFunction.apply(
303
+ x,
304
+ g,
305
+ initial_state,
306
+ output_final_state,
307
+ cu_seqlens
308
+ )
fla/ops/hgrn/naive.py ADDED
@@ -0,0 +1,63 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+
7
+
8
+ def naive_recurrent_hgrn(
9
+ x: torch.Tensor,
10
+ g: torch.Tensor,
11
+ initial_state: Optional[torch.Tensor] = None,
12
+ output_final_state: Optional[bool] = False
13
+ ) -> torch.Tensor:
14
+ dtype = x.dtype
15
+ x, g = map(lambda i: i.float(), (x, g))
16
+ B, T, D = x.shape
17
+
18
+ h = torch.zeros(B, D, dtype=torch.float, device=x.device)
19
+ o = torch.zeros_like(x)
20
+
21
+ final_state = None
22
+ if initial_state is not None:
23
+ h += initial_state
24
+
25
+ for i in range(T):
26
+ h = g[:, i].exp() * h + x[:, i]
27
+ o[:, i] = h
28
+
29
+ if output_final_state:
30
+ final_state = h
31
+ return o.to(dtype), final_state
32
+
33
+
34
+ def naive_chunk_hgrn(
35
+ x: torch.Tensor,
36
+ g: torch.Tensor,
37
+ initial_state: Optional[torch.Tensor] = None,
38
+ output_final_state: Optional[bool] = False,
39
+ chunk_size: int = 64
40
+ ) -> torch.Tensor:
41
+ dtype = x.dtype
42
+ x, g = map(lambda i: i.float(), (x, g))
43
+ B, T, D = x.shape
44
+
45
+ gc = g.view(B, -1, chunk_size, D).cumsum(-2).view_as(g)  # within-chunk cumulative gates
46
+ h = torch.zeros(B, D, dtype=torch.float, device=x.device)
47
+ o = torch.zeros_like(x)
48
+
49
+ final_state = None
50
+ if initial_state is not None:
51
+ h += initial_state
52
+
53
+ for i in range(0, T, chunk_size):
54
+ hp = h
55
+ h = torch.zeros(B, D, dtype=torch.float, device=x.device)
56
+ for j in range(i, i + chunk_size):
57
+ h = g[:, j].exp() * h + x[:, j]
58
+ o[:, j] = hp * gc[:, j].exp() + h
59
+ h = o[:, j].clone()
60
+
61
+ if output_final_state:
62
+ final_state = h
63
+ return o.to(dtype), final_state
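
A hedged cross-check of the Triton kernel against the recurrent reference above (assumes a CUDA device; shapes are illustrative):

import torch
import torch.nn.functional as F
from fla.ops.hgrn import fused_recurrent_hgrn
from fla.ops.hgrn.naive import naive_recurrent_hgrn

B, T, D = 2, 128, 64
x = torch.randn(B, T, D, device='cuda')
g = F.logsigmoid(torch.randn(B, T, D, device='cuda'))
h0 = torch.randn(B, D, device='cuda')

ref, ref_ht = naive_recurrent_hgrn(x, g, initial_state=h0, output_final_state=True)
tri, tri_ht = fused_recurrent_hgrn(x, g, initial_state=h0, output_final_state=True)
assert ref.allclose(tri, atol=1e-4) and ref_ht.allclose(tri_ht, atol=1e-4)
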
fla/ops/lightning_attn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (352 Bytes). View file
 
fla/ops/lightning_attn/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (3.8 kB). View file
 
fla/ops/lightning_attn/chunk.py ADDED
@@ -0,0 +1,74 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+
8
+ from fla.ops.simple_gla.chunk import chunk_simple_gla
9
+
10
+
11
+ @torch.compiler.disable
12
+ def chunk_lightning_attn(
13
+ q: torch.Tensor,
14
+ k: torch.Tensor,
15
+ v: torch.Tensor,
16
+ layer_idx: int,
17
+ num_layers: int,
18
+ scale: Optional[float] = None,
19
+ initial_state: Optional[torch.Tensor] = None,
20
+ output_final_state: bool = False,
21
+ cu_seqlens: Optional[torch.LongTensor] = None,
22
+ head_first: bool = True
23
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
24
+ r"""
25
+ Args:
26
+ q (torch.Tensor):
27
+ queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
28
+ k (torch.Tensor):
29
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
30
+ v (torch.Tensor):
31
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
32
+ layer_idx (int):
33
+ The index of the current layer.
34
+ num_layers (int):
35
+ The total number of layers. Both `layer_idx` and `num_layers` are used to compute the decay factor.
36
+ scale (Optional[int]):
37
+ Scale factor for the attention scores.
38
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
39
+ initial_state (Optional[torch.Tensor]):
40
+ Initial state of shape `[N, H, K, V]` for `N` input sequences.
41
+ For equal-length input sequences, `N` equals the batch size `B`.
42
+ Default: `None`.
43
+ output_final_state (Optional[bool]):
44
+ Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
45
+ cu_seqlens (torch.LongTensor):
46
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
47
+ consistent with the FlashAttention API.
48
+ head_first (Optional[bool]):
49
+ Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
50
+ Default: `True`.
51
+
52
+ Returns:
53
+ o (torch.Tensor):
54
+ Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
55
+ final_state (torch.Tensor):
56
+ Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
57
+ """
58
+ H = q.shape[1] if head_first else q.shape[2]
59
+ s = -(8 / H * (1 - layer_idx / num_layers)) * q.new_tensor(range(H), dtype=torch.float)
60
+ if head_first:
61
+ g = s[None, :, None].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
62
+ else:
63
+ g = s[None, None, :].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
64
+ return chunk_simple_gla(
65
+ q=q,
66
+ k=k,
67
+ v=v,
68
+ scale=scale,
69
+ g=g,
70
+ initial_state=initial_state,
71
+ output_final_state=output_final_state,
72
+ head_first=head_first,
73
+ cu_seqlens=cu_seqlens
74
+ )
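
The wrapper above only builds a per-head, time-constant log-decay `g` and delegates the actual computation to `chunk_simple_gla`. A small sketch of the decay rates it constructs (illustrative values; not part of the diff):

import torch

H, layer_idx, num_layers = 8, 0, 24
s = -(8 / H * (1 - layer_idx / num_layers)) * torch.arange(H, dtype=torch.float)
decay = s.exp()  # per-step decay factor of each head: head 0 keeps everything (1.0), later heads forget faster
print(decay)     # tensor([1.0000, 0.3679, 0.1353, ...])
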
fla/ops/lightning_attn/fused_recurrent.py ADDED
@@ -0,0 +1,75 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+
8
+ from fla.ops.simple_gla.fused_recurrent import fused_recurrent_simple_gla
9
+
10
+
11
+ def fused_recurrent_lightning_attn(
12
+ q: torch.Tensor,
13
+ k: torch.Tensor,
14
+ v: torch.Tensor,
15
+ layer_idx: int,
16
+ num_layers: int,
17
+ scale: Optional[float] = None,
18
+ initial_state: Optional[torch.Tensor] = None,
19
+ output_final_state: bool = False,
20
+ reverse: bool = False,
21
+ cu_seqlens: Optional[torch.LongTensor] = None,
22
+ head_first: bool = True
23
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
24
+ r"""
25
+ Args:
26
+ q (torch.Tensor):
27
+ queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
28
+ k (torch.Tensor):
29
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
30
+ v (torch.Tensor):
31
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
32
+ layer_idx (int):
33
+ The index of the current layer.
34
+ num_layers (int):
35
+ The total number of layers. Both `layer_idx` and `num_layers` are used to compute the decay factor.
36
+ scale (Optional[int]):
37
+ Scale factor for the attention scores.
38
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
39
+ initial_state (Optional[torch.Tensor]):
40
+ Initial state of shape `[N, H, K, V]` for `N` input sequences.
41
+ For equal-length input sequences, `N` equals the batch size `B`.
42
+ Default: `None`.
43
+ output_final_state (Optional[bool]):
44
+ Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
45
+ cu_seqlens (torch.LongTensor):
46
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
47
+ consistent with the FlashAttention API.
48
+ head_first (Optional[bool]):
49
+ Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
50
+ Default: `True`.
51
+
52
+ Returns:
53
+ o (torch.Tensor):
54
+ Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
55
+ final_state (torch.Tensor):
56
+ Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
57
+ """
58
+ H = q.shape[1] if head_first else q.shape[2]
59
+ s = -(8 / H * (1 - layer_idx / num_layers)) * q.new_tensor(range(H), dtype=torch.float)
60
+ if head_first:
61
+ g = s[None, :, None].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
62
+ else:
63
+ g = s[None, None, :].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
64
+ return fused_recurrent_simple_gla(
65
+ q=q,
66
+ k=k,
67
+ v=v,
68
+ g=g,
69
+ scale=scale,
70
+ initial_state=initial_state,
71
+ output_final_state=output_final_state,
72
+ reverse=reverse,
73
+ cu_seqlens=cu_seqlens,
74
+ head_first=head_first
75
+ )
fla/ops/linear_attn/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .chunk import chunk_linear_attn
4
+ from .fused_chunk import fused_chunk_linear_attn
5
+ from .fused_recurrent import fused_recurrent_linear_attn
6
+
7
+ __all__ = [
8
+ 'chunk_linear_attn',
9
+ 'fused_chunk_linear_attn',
10
+ 'fused_recurrent_linear_attn'
11
+ ]
fla/ops/linear_attn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (427 Bytes). View file
 
fla/ops/linear_attn/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (2.62 kB). View file
 
fla/ops/linear_attn/__pycache__/fused_chunk.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
fla/ops/linear_attn/__pycache__/utils.cpython-311.pyc ADDED
Binary file (583 Bytes). View file
 
fla/ops/linear_attn/fused_chunk.py ADDED
@@ -0,0 +1,318 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+ from packaging import version
10
+
11
+ from fla.ops.linear_attn.utils import normalize_output
12
+ from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard
13
+
14
+
15
+ @triton.jit
16
+ def fused_chunk_linear_attn_fwd_kernel(
17
+ q, # query [B, H, T, K]
18
+ k, # key [B, H, T, K]
19
+ v, # value [B, H, T, V]
20
+ o, # output [B, H, T, V]
21
+ h0,
22
+ ht,
23
+ scale,
24
+ B, # batch size
25
+ H, # H
26
+ T, # T
27
+ K: tl.constexpr, # K
28
+ V: tl.constexpr, # V
29
+ BT: tl.constexpr, # BLOCK SIZE along the sequence dimension, a.k.a. chunk size
30
+ BK: tl.constexpr, # BLOCK SIZE along the K dimension
31
+ BV: tl.constexpr, # BLOCK SIZE along the V dimension
32
+ USE_INITIAL_STATE: tl.constexpr,
33
+ STORE_FINAL_STATE: tl.constexpr,
34
+ CHECK: tl.constexpr
35
+ ):
36
+ # indices
37
+ i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
38
+
39
+ o_i = tl.arange(0, BT)
40
+
41
+ # [BT, BT]
42
+ m_s = o_i[:, None] >= o_i[None, :]
43
+ # [BK, BV]
44
+ b_h = tl.zeros([BK, BV], dtype=tl.float32)
45
+
46
+ # make block pointers
47
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (0, i_k * BK), (BT, BK), (1, 0))
48
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, 0), (BK, BT), (0, 1))
49
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (0, i_v * BV), (BT, BV), (1, 0))
50
+ p_o = tl.make_block_ptr(o + (i_bh+i_k*B*H) * T*V, (T, V), (V, 1), (0, i_v * BV), (BT, BV), (1, 0))
51
+
52
+ if USE_INITIAL_STATE:
53
+ p_h0 = tl.make_block_ptr(h0 + i_bh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
54
+ b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)
55
+
56
+ for i in range(0, tl.cdiv(T, BT)):
57
+ # [BT, BK]
58
+ b_q = tl.load(p_q, boundary_check=(0, 1))
59
+ b_q = (b_q * scale).to(b_q.dtype)
60
+ # [BK, BT]
61
+ b_k = tl.load(p_k, boundary_check=(0, 1))
62
+ # [BT, BV]
63
+ b_v = tl.load(p_v, boundary_check=(0, 1))
64
+
65
+ # [BT, BT]
66
+ b_s = tl.dot(b_q, b_k, allow_tf32=False)
67
+ b_s = tl.where(m_s, b_s, 0)
68
+ # [BT, BV]
69
+ b_o = tl.dot(b_s.to(b_q.dtype), b_v, allow_tf32=False)
70
+ if CHECK and i == 0:
71
+ b_o += tl.dot(b_q, b_h.to(b_q.dtype), allow_tf32=False)
72
+ b_h = b_h + tl.dot(b_k, b_v, allow_tf32=False)
73
+ else:
74
+ b_o += tl.dot(b_q, b_h.to(b_q.dtype), allow_tf32=False)
75
+ b_h = b_h + tl.dot(b_k, b_v, allow_tf32=False)
76
+ tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
77
+ p_q = tl.advance(p_q, (BT, 0))
78
+ p_k = tl.advance(p_k, (0, BT))
79
+ p_v = tl.advance(p_v, (BT, 0))
80
+ p_o = tl.advance(p_o, (BT, 0))
81
+
82
+ if STORE_FINAL_STATE:
83
+ p_ht = tl.make_block_ptr(ht + i_bh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
84
+ tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
85
+
86
+
87
+ @triton.jit
88
+ def fused_chunk_linear_attn_bwd_kernel(
89
+ q, # query [B, H, T, K]
90
+ k, # key [B, H, T, V]
91
+ v, # value [B, H, T, V]
92
+ do, # gradient of output [B, H, T, V]
93
+ dq, # gradient of query [NV, B, H, T, K]
94
+ dk, # gradient of key [NV, B, H, T, K]
95
+ dv, # gradient of value [NK, B, H, T, V]
96
+ h0, # initial state of the chunk [B, H, K, V]
97
+ scale, # K ** -0.5
98
+ B, # B
99
+ H, # H
100
+ T, # T
101
+ K: tl.constexpr, # K
102
+ V: tl.constexpr, # V
103
+ BT: tl.constexpr, # BLOCK SIZE along the sequence dimension, a.k.a. chunk size
104
+ BK: tl.constexpr, # BLOCK SIZE along the K dimension
105
+ BV: tl.constexpr, # BLOCK SIZE along the V dimension
106
+ USE_INITIAL_STATE: tl.constexpr,
107
+ CHECK: tl.constexpr
108
+ ):
109
+ i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
110
+ o_i = tl.arange(0, BT)
111
+
112
+ m_s = o_i[:, None] >= o_i[None, :]
113
+ # [BV, BK]
114
+ b_h = tl.zeros([BV, BK], dtype=tl.float32)
115
+ if USE_INITIAL_STATE:
116
+ p_h = tl.make_block_ptr(h0 + i_bh * K * V, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1))
117
+ b_h = tl.load(p_h, boundary_check=(0, 1)).to(tl.float32)
118
+
119
+ for i in range(0, tl.cdiv(T, BT)):
120
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i * BT, i_k * BK), (BT, BK), (1, 0))
121
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v * BV, i * BT), (BV, BT), (0, 1))
122
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i * BT, i_v * BV), (BT, BV), (1, 0))
123
+ p_dq = tl.make_block_ptr(dq + (i_bh + i_v*B*H) * T*K, (T, K), (K, 1), (i*BT, i_k*BK), (BT, BK), (1, 0))
124
+
125
+ # [BT, BK]
126
+ b_k = tl.load(p_k, boundary_check=(0, 1))
127
+ # [V, BT]
128
+ b_v = tl.load(p_v, boundary_check=(0, 1))
129
+ # [BT, V]
130
+ b_do = tl.load(p_do, boundary_check=(0, 1))
131
+
132
+ # [BT, BT]
133
+ b_ds = tl.dot(b_do, b_v, allow_tf32=False)
134
+ b_ds = tl.where(m_s, b_ds, 0)
135
+ # [BT, BK]
136
+ b_dq = tl.dot(b_ds.to(b_k.dtype), b_k, allow_tf32=False)
137
+ # [BV, BK]
138
+ if CHECK and i == 0:
139
+ b_dq += tl.dot(b_do, b_h.to(b_do.dtype), allow_tf32=False)
140
+ b_h = b_h + tl.dot(b_v, b_k, allow_tf32=False)
141
+ else:
142
+ b_dq += tl.dot(b_do, b_h.to(b_do.dtype), allow_tf32=False)
143
+ b_h = b_h + tl.dot(b_v, b_k, allow_tf32=False)
144
+ b_dq *= scale
145
+ tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1))
146
+
147
+ # sync threads
148
+ b_h = None
149
+ tl.debug_barrier()
150
+ # [BK, BV]
151
+ b_dh = tl.zeros([BK, BV], dtype=tl.float32)
152
+ m_s = o_i[:, None] <= o_i[None, :]
153
+ for i in range(1, tl.cdiv(T, BT) + 1):
154
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k * BK, T - i * BT), (BK, BT), (0, 1))
155
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (T - i * BT, i_k * BK), (BT, BK), (1, 0))
156
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (T - i * BT, i_v * BV), (BT, BV), (1, 0))
157
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (T - i * BT, i_v * BV), (BT, BV), (1, 0))
158
+ p_dk = tl.make_block_ptr(dk + (i_bh+i_v*B*H) * T*K, (T, K), (K, 1), (T - i*BT, i_k*BK), (BT, BK), (1, 0))
159
+ p_dv = tl.make_block_ptr(dv + (i_bh+i_k*B*H) * T*V, (T, V), (V, 1), (T - i*BT, i_v*BV), (BT, BV), (1, 0))
160
+ # [BK, BT]
161
+ b_q = tl.load(p_q, boundary_check=(0, 1))
162
+ b_q = (b_q * scale).to(b_q.dtype)
163
+ # [BT, BK]
164
+ b_k = tl.load(p_k, boundary_check=(0, 1))
165
+ # [BT, BV]
166
+ b_v = tl.load(p_v, boundary_check=(0, 1))
167
+ b_do = tl.load(p_do, boundary_check=(0, 1))
168
+
169
+ # [BT, BT]
170
+ b_s = tl.dot(b_k, b_q, allow_tf32=False)
171
+ b_s = tl.where(m_s, b_s, 0).to(b_q.dtype)
172
+ # [BT, BT]
173
+ b_ds = tl.dot(b_v, tl.trans(b_do), allow_tf32=False)
174
+ b_ds = tl.where(m_s, b_ds, 0).to(b_q.dtype)
175
+ # [BT, BK]
176
+ b_dk = tl.dot(b_ds, tl.trans(b_q), allow_tf32=False)
177
+ # [BT, BV]
178
+ b_dv = tl.dot(b_s, b_do, allow_tf32=False)
179
+ if CHECK and i == 1:
180
+ b_dk += tl.dot(b_v, tl.trans(b_dh).to(b_v.dtype), allow_tf32=False)
181
+ b_dv += tl.dot(b_k, b_dh.to(b_k.dtype), allow_tf32=False)
182
+ b_dh += tl.dot(b_q, b_do, allow_tf32=False)
183
+ else:
184
+ b_dk += tl.dot(b_v, tl.trans(b_dh).to(b_v.dtype), allow_tf32=False)
185
+ b_dv += tl.dot(b_k, b_dh.to(b_k.dtype), allow_tf32=False)
186
+ b_dh += tl.dot(b_q, b_do, allow_tf32=False)
187
+
188
+ tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
189
+ tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
190
+
191
+
192
+ class FusedChunkLinearAttentionFunction(torch.autograd.Function):
193
+
194
+ @staticmethod
195
+ @input_guard
196
+ @autocast_custom_fwd
197
+ def forward(ctx, q, k, v, scale, initial_state, output_final_state):
198
+ B, H, T, K, V = *k.shape, v.shape[-1]
199
+ BT = 64
200
+ BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
201
+ NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
202
+ num_warps = 4
203
+ num_stages = 1
204
+
205
+ o = q.new_empty(NK, B, H, T, V)
206
+ final_state = q.new_empty(B, H, K, V, dtype=torch.float) if output_final_state else None
207
+ # the bug still exists even for Triton 2.2 on H100 GPUs
208
+ # so we always enable initial checks
209
+ CHECK = True
210
+ if version.parse(triton.__version__) < version.parse('2.2.0'):
211
+ import warnings
212
+ warnings.warn(
213
+ "Triton<2.2.0 detected for running this kernel, "
214
+ "which is known to have some weird compiler issues (refer to https://github.com/openai/triton/issues/2852) "
215
+ "that lead to significant precision loss. "
216
+ "We've added some initial condition checks to resolve this, at the cost of some speed. "
217
+ "For optimal performance, it is recommended to install Triton>=2.2.0 (if possible)."
218
+ )
219
+ CHECK = True
220
+
221
+ grid = (NV, NK, B * H)
222
+ fused_chunk_linear_attn_fwd_kernel[grid](
223
+ q, k, v, o, initial_state, final_state,
224
+ scale,
225
+ B=B, H=H, T=T, K=K, V=V, BT=BT, BK=BK, BV=BV,
226
+ USE_INITIAL_STATE=initial_state is not None,
227
+ STORE_FINAL_STATE=output_final_state,
228
+ CHECK=CHECK,
229
+ num_warps=num_warps,
230
+ num_stages=num_stages
231
+ )
232
+ o = o.sum(0) if NK > 1 else o[0]
233
+
234
+ ctx.save_for_backward(q, k, v, initial_state)
235
+ ctx.scale = scale
236
+ ctx.CHECK = CHECK
237
+ return o.to(q.dtype), final_state
238
+
239
+ @staticmethod
240
+ @input_guard
241
+ @autocast_custom_bwd
242
+ def backward(ctx, do, dht=None):
243
+ q, k, v, initial_state = ctx.saved_tensors
244
+ B, H, T, K, V = *k.shape, v.shape[-1]
245
+ scale = ctx.scale
246
+
247
+ BT = 64
248
+ BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
249
+ NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
250
+ num_warps = 4
251
+ num_stages = 1
252
+
253
+ dq = q.new_empty(NV, B, H, T, K)
254
+ dk = q.new_empty(NV, B, H, T, K)
255
+ dv = q.new_empty(NK, B, H, T, V)
256
+ grid = (NV, NK, B * H)
257
+
258
+ fused_chunk_linear_attn_bwd_kernel[grid](
259
+ q, k, v, do, dq, dk, dv, initial_state,
260
+ scale,
261
+ B=B, H=H, T=T, K=K, V=V, BT=BT, BK=BK, BV=BV,
262
+ USE_INITIAL_STATE=initial_state is not None,
263
+ CHECK=ctx.CHECK,
264
+ num_warps=num_warps,
265
+ num_stages=num_stages
266
+ )
267
+ dq = dq.sum(0)
268
+ dk = dk.sum(0)
269
+ dv = dv.sum(0)
270
+ return dq.to(q.dtype), dk.to(k.dtype), dv.to(v.dtype), None, None, None
271
+
272
+
273
+ def fused_chunk_linear_attn(
274
+ q: torch.Tensor,
275
+ k: torch.Tensor,
276
+ v: torch.Tensor,
277
+ scale: Optional[float] = None,
278
+ initial_state: torch.Tensor = None,
279
+ output_final_state: bool = False,
280
+ normalize: bool = True,
281
+ head_first: bool = True
282
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
283
+ r"""
284
+ Args:
285
+ q (torch.Tensor):
286
+ queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`
287
+ k (torch.Tensor):
288
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`
289
+ v (torch.Tensor):
290
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`
291
+ scale (Optional[int]):
292
+ Scale factor for linear attention scores.
293
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
294
+ initial_state (Optional[torch.Tensor]):
295
+ Initial state of shape `[B, H, K, V]`. Default: `None`.
296
+ output_final_state (Optional[bool]):
297
+ Whether to output the final state of shape `[B, H, K, V]`. Default: `False`.
298
+ normalize (bool):
299
+ Whether to normalize the output. Default: `True`.
300
+ head_first (Optional[bool]):
301
+ Whether the inputs are in the head-first format. Default: `True`.
302
+
303
+ Returns:
304
+ o (torch.Tensor):
305
+ Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`
306
+ final_state (torch.Tensor):
307
+ Final state of shape `[B, H, K, V]` if `output_final_state=True` else `None`
308
+ """
309
+ if scale is None:
310
+ scale = q.shape[-1] ** -0.5
311
+ if not head_first:
312
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
313
+ o, final_state = FusedChunkLinearAttentionFunction.apply(q, k, v, scale, initial_state, output_final_state)
314
+ if normalize:
315
+ o = normalize_output(q * scale, k, o)
316
+ if not head_first:
317
+ o = o.transpose(1, 2)
318
+ return o, final_state
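
A minimal usage sketch for the wrapper above (not part of the commit): it assumes `fused_chunk_linear_attn` is re-exported from `fla.ops.linear_attn`, that a CUDA device is available for the Triton kernels, and that the shapes below are purely illustrative.

import torch
from fla.ops.linear_attn import fused_chunk_linear_attn  # assumed re-export; otherwise import from .fused_chunk

B, H, T, K, V = 2, 4, 1024, 64, 64
q = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)
k = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)
v = torch.randn(B, H, T, V, device='cuda', dtype=torch.bfloat16)
# head-first inputs; returns the (optionally normalized) output and the final [B, H, K, V] state
o, final_state = fused_chunk_linear_attn(q, k, v, output_final_state=True)
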
fla/ops/nsa/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .naive import naive_nsa
4
+ from .parallel import parallel_nsa
5
+
6
+ __all__ = [
7
+ 'naive_nsa',
8
+ 'parallel_nsa'
9
+ ]
fla/ops/nsa/__pycache__/naive.cpython-311.pyc ADDED
Binary file (6.28 kB). View file
 
fla/ops/nsa/__pycache__/parallel.cpython-311.pyc ADDED
Binary file (70.1 kB). View file
 
fla/ops/nsa/__pycache__/utils.cpython-311.pyc ADDED
Binary file (4.99 kB). View file
 
fla/ops/nsa/naive.py ADDED
@@ -0,0 +1,94 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from einops import rearrange, repeat
8
+
9
+
10
+ def naive_nsa(
11
+ q: torch.Tensor,
12
+ k: torch.Tensor,
13
+ v: torch.Tensor,
14
+ indices: torch.LongTensor,
15
+ block_size: int = 64,
16
+ scale: Optional[float] = None,
17
+ head_first: bool = False,
18
+ cu_seqlens: Optional[torch.LongTensor] = None
19
+ ) -> torch.Tensor:
20
+ r"""
21
+ Args:
22
+ q (torch.Tensor):
23
+ queries of shape `[B, HQ, T, K]` if `head_first=True` else `[B, T, HQ, K]`.
24
+ k (torch.Tensor):
25
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
26
+ GQA is enforced here. The ratio of query heads (HQ) to key/value heads (H) must be a power of 2 and >=16.
27
+ v (torch.Tensor):
28
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
29
+ indices (torch.LongTensor):
30
+ Block indices of shape `[B, H, T, S]` if `head_first=True` else `[B, T, H, S]`.
31
+ `S` is the number of selected blocks for each query token, which is set to 16 in the paper.
32
+ block_size (int):
33
+ Selected block size. Default: 64.
34
+ scale (Optional[float]):
35
+ Scale factor for attention scores.
36
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
37
+ head_first (Optional[bool]):
38
+ Whether the inputs are in the head-first format. Default: `False`.
39
+ cu_seqlens (torch.LongTensor):
40
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
41
+ consistent with the FlashAttention API.
42
+
43
+ Returns:
44
+ o (torch.Tensor):
45
+ Outputs of shape `[B, HQ, T, V]` if `head_first=True` else `[B, T, HQ, V]`.
46
+ """
47
+ if scale is None:
48
+ scale = k.shape[-1] ** -0.5
49
+ if cu_seqlens is not None:
50
+ if head_first:
51
+ raise RuntimeError("Sequences with variable lengths are not supported for head-first mode")
52
+ if head_first:
53
+ q, k, v, indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), (q, k, v, indices))
54
+
55
+ dtype = q.dtype
56
+ G = q.shape[2] // k.shape[2]
57
+ BS = block_size
58
+ k, v, indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, indices))
59
+ q, k, v = map(lambda x: x.float(), (q, k, v))
60
+
61
+ o = torch.zeros_like(v)
62
+ varlen = True
63
+ if cu_seqlens is None:
64
+ varlen = False
65
+ B, T = q.shape[:2]
66
+ cu_seqlens = torch.cat([indices.new_tensor(range(0, B*T, T)), indices.new_tensor([B*T])])
67
+
68
+ for i in range(len(cu_seqlens) - 1):
69
+ if not varlen:
70
+ q_b, k_b, v_b, i_b = q[i], k[i], v[i], indices[i]
71
+ else:
72
+ T = cu_seqlens[i+1] - cu_seqlens[i]
73
+ q_b, k_b, v_b, i_b = map(lambda x: x[0][cu_seqlens[i]:cu_seqlens[i+1]], (q, k, v, indices))
74
+
75
+ i_b = i_b.unsqueeze(-1) * BS + i_b.new_tensor(range(BS))
76
+ # [T, S*BS, HQ]
77
+ i_b = i_b.view(T, indices.shape[2], -1).transpose(1, 2)
78
+ for i_q in range(T):
79
+ # [HQ, D]
80
+ q_i = q_b[i_q] * scale
81
+ # [S*BS, HQ]
82
+ i_i = i_b[i_q]
83
+ # [S*BS, HQ, -1]
84
+ k_i, v_i = map(lambda x: x.gather(0, i_i.clamp(0, T-1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b))
85
+ # [S*BS, HQ]
86
+ attn = torch.einsum('h d, n h d -> n h', q_i, k_i).masked_fill(i_i > i_q, float('-inf')).softmax(0)
87
+ if not varlen:
88
+ o[i, i_q] = torch.einsum('n h, n h v -> h v', attn, v_i)
89
+ else:
90
+ o[0][cu_seqlens[i]+i_q] = torch.einsum('n h, n h v -> h v', attn, v_i)
91
+
92
+ if head_first:
93
+ o = rearrange(o, 'b t h d -> b h t d')
94
+ return o.to(dtype)
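
An illustrative call of the reference implementation above (not part of the commit). The sizes and the random block indices are assumptions; the first slot is pinned to each token's own block so every query attends to at least one valid (non-future) position.

import torch
from fla.ops.nsa import naive_nsa

B, T, H, HQ, K, V, S, BS = 1, 256, 1, 16, 64, 64, 4, 64
q = torch.randn(B, T, HQ, K)
k = torch.randn(B, T, H, K)
v = torch.randn(B, T, H, V)
indices = torch.randint(0, T // BS, (B, T, H, S))
# ensure the block containing each query token is always among the selected blocks
indices[:, :, :, 0] = torch.arange(T).view(1, T, 1) // BS
o = naive_nsa(q, k, v, indices, block_size=BS)  # -> [B, T, HQ, V]
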
fla/ops/rebased/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (248 Bytes). View file
 
fla/ops/rebased/parallel.py ADDED
@@ -0,0 +1,466 @@
1
+
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
4
+
5
+ import torch
6
+ import triton
7
+ import triton.language as tl
8
+
9
+ from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard
10
+
11
+ # Rebased: Linear Transformers with Learnable Kernel Functions are Better In-Context Models
12
+ # https://github.com/corl-team/rebased/blob/main/flash_linear_attention/fla/ops/triton/rebased_fast/parallel.py
13
+
14
+
15
+ @triton.jit(do_not_specialize=['T'])
16
+ def parallel_rebased_fwd_kernel(
17
+ q,
18
+ k,
19
+ v,
20
+ o,
21
+ z,
22
+ scale,
23
+ T,
24
+ B: tl.constexpr,
25
+ H: tl.constexpr,
26
+ K: tl.constexpr,
27
+ V: tl.constexpr,
28
+ BTL: tl.constexpr,
29
+ BTS: tl.constexpr,
30
+ BK: tl.constexpr,
31
+ BV: tl.constexpr,
32
+ ):
33
+ # i_c: chunk index. used for sequence parallelism
34
+ i_kv, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
35
+ NV = tl.cdiv(V, BV)
36
+ i_k = i_kv // (NV)
37
+ i_v = i_kv % (NV)
38
+
39
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0))
40
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k*BK, 0), (BK, BTS), (0, 1))
41
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (0, i_v*BV), (BTS, BV), (1, 0))
42
+
43
+ # [BQ, BD] block Q, in the shared memory throughout the whole kernel
44
+ b_q = tl.load(p_q, boundary_check=(0, 1))
45
+ b_q = (b_q * scale).to(b_q.dtype)
46
+ b_o = tl.zeros([BTL, BV], dtype=tl.float32)
47
+ b_z = tl.zeros([BTL], dtype=tl.float32)
48
+
49
+ # Q block and K block have no overlap
50
+ # no need for mask, thereby saving flops
51
+ for _ in range(0, i_c*BTL, BTS):
52
+ # [BK, BTS]
53
+ b_k = tl.load(p_k, boundary_check=(0, 1))
54
+
55
+ # [BTS, BV]
56
+ b_v = tl.load(p_v, boundary_check=(0, 1))
57
+ # [BTL, BTS]
58
+ b_s = tl.dot(b_q, (b_k), allow_tf32=False)
59
+ b_s = b_s * b_s
60
+ b_z += tl.sum(b_s, axis=1)
61
+
62
+ # [BQ, BD]
63
+ b_o = b_o + tl.dot(b_s.to(b_v.dtype), b_v, allow_tf32=False)
64
+ p_k = tl.advance(p_k, (0, BTS))
65
+ p_v = tl.advance(p_v, (BTS, 0))
66
+
67
+ # # rescale interchunk output
68
+ tl.debug_barrier()
69
+ o_q = tl.arange(0, BTL)
70
+ # # sync threads, easy for compiler to optimize
71
+ # tl.debug_barrier()
72
+
73
+ o_k = tl.arange(0, BTS)
74
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k*BK, i_c*BTL), (BK, BTS), (0, 1))
75
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTS, BV), (1, 0))
76
+ # Q block and K block have overlap. masks required
77
+ for _ in range(i_c*BTL, (i_c + 1) * BTL, BTS):
78
+ # [BK, BTS]
79
+ b_k = tl.load(p_k, boundary_check=(0, 1))
80
+ # [BTS, BV]
81
+ b_v = tl.load(p_v, boundary_check=(0, 1))
82
+ # [BTL, BTS]
83
+ m_s = o_q[:, None] >= o_k[None, :]
84
+ b_s = tl.dot(b_q, b_k, allow_tf32=False)
85
+ b_s = b_s * b_s
86
+ b_s = tl.where(m_s, b_s, 0)
87
+ b_z += tl.sum(b_s, axis=1)
88
+ # [BTL, BV]
89
+ b_o += tl.dot(b_s.to(b_q.dtype), b_v, allow_tf32=False)
90
+ p_k = tl.advance(p_k, (0, BTS))
91
+ p_v = tl.advance(p_v, (BTS, 0))
92
+ o_k += BTS
93
+
94
+ p_o = tl.make_block_ptr(o + (i_bh + B * H * i_k) * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTL, BV), (1, 0))
95
+ p_z = z + (i_bh + B * H * i_k) * T + i_c*BTL + tl.arange(0, BTL)
96
+ tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
97
+ tl.store(p_z, b_z.to(p_z.dtype.element_ty), mask=((i_c*BTL + tl.arange(0, BTL)) < T))
98
+
99
+
100
+ @triton.jit(do_not_specialize=['T'])
101
+ def _parallel_rebased_bwd_dq(
102
+ i_bh,
103
+ i_c,
104
+ i_k,
105
+ i_v,
106
+ i_h,
107
+ q,
108
+ k,
109
+ v,
110
+ do,
111
+ dz,
112
+ dq,
113
+ scale,
114
+ T,
115
+ B: tl.constexpr,
116
+ H: tl.constexpr,
117
+ K: tl.constexpr,
118
+ V: tl.constexpr,
119
+ BTL: tl.constexpr,
120
+ BTS: tl.constexpr,
121
+ BK: tl.constexpr,
122
+ BV: tl.constexpr
123
+ ):
124
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTL, BV), (1, 0))
125
+ p_q = tl.make_block_ptr(q + (i_bh) * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0))
126
+ b_q = tl.load(p_q, boundary_check=(0, 1))
127
+ b_do = tl.load(p_do, boundary_check=(0, 1)).to(b_q.dtype)
128
+ b_q = (b_q * scale).to(b_q.dtype)
129
+ b_dq = tl.zeros([BTL, BK], dtype=tl.float32)
130
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (0, i_k*BK), (BTS, BK), (1, 0))
131
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v*BV, 0), (BV, BTS), (0, 1))
132
+ p_dz = dz + i_bh * T + i_c*BTL + tl.arange(0, BTL)
133
+ b_dz = tl.load(p_dz, mask=(i_c*BTL + tl.arange(0, BTL)) < T)
134
+
135
+ for _ in range(0, i_c*BTL, BTS):
136
+ # [BTS, BK]
137
+ b_k = tl.load(p_k, boundary_check=(0, 1))
138
+ # [BV, BTS]
139
+ b_v = tl.load(p_v, boundary_check=(0, 1))
140
+ # [BTL, BTS]
141
+ b_ds = tl.dot(b_do, b_v, allow_tf32=False)
142
+ if i_v == 0:
143
+ b_ds += b_dz[:, None]
144
+ else:
145
+ b_ds = b_ds
146
+ b_s = tl.dot(b_q, tl.trans(b_k), allow_tf32=False)
147
+ # [BQ, BD]
148
+ b_dq += tl.dot((2 * b_ds * b_s).to(b_v.dtype), b_k, allow_tf32=False)
149
+ p_k = tl.advance(p_k, (BTS, 0))
150
+ p_v = tl.advance(p_v, (0, BTS))
151
+
152
+ b_dq *= scale
153
+ o_q = tl.arange(0, BTL)
154
+ o_k = tl.arange(0, BTS)
155
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTS, BK), (1, 0))
156
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v*BV, i_c*BTL), (BV, BTS), (0, 1))
157
+ # Q block and K block have overlap. masks required
158
+ for _ in range(i_c*BTL, (i_c + 1) * BTL, BTS):
159
+ # [BTS, BK]
160
+ b_k = tl.load(p_k, boundary_check=(0, 1))
161
+ # [BV, BTS]
162
+ b_v = tl.load(p_v, boundary_check=(0, 1))
163
+ # [BTL, BTS]
164
+ m_s = o_q[:, None] >= o_k[None, :]
165
+ b_ds = tl.dot(b_do, b_v, allow_tf32=False)
166
+ if i_v == 0:
167
+ b_ds += b_dz[:, None]
168
+ else:
169
+ b_ds = b_ds
170
+ b_ds = tl.where(m_s, b_ds, 0) * scale
171
+ b_s = tl.dot(b_q, tl.trans(b_k), allow_tf32=False)
172
+ b_s = tl.where(m_s, b_s, 0)
173
+ # [BTL, BK]
174
+ b_dq += tl.dot((2 * b_ds * b_s).to(b_k.dtype),
175
+ b_k, allow_tf32=False)
176
+ p_k = tl.advance(p_k, (BTS, 0))
177
+ p_v = tl.advance(p_v, (0, BTS))
178
+ o_k += BTS
179
+ p_dq = tl.make_block_ptr(dq + (i_bh + B * H * i_v) * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0))
180
+ tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1))
181
+ return
182
+
183
+
184
+ @triton.jit(do_not_specialize=['T'])
185
+ def _parallel_rebased_bwd_dkv(
186
+ i_bh,
187
+ i_c,
188
+ i_k,
189
+ i_v,
190
+ i_h,
191
+ q,
192
+ k,
193
+ v,
194
+ do,
195
+ dz,
196
+ dk,
197
+ dv,
198
+ scale,
199
+ T,
200
+ B: tl.constexpr,
201
+ H: tl.constexpr,
202
+ K: tl.constexpr,
203
+ V: tl.constexpr,
204
+ BTL: tl.constexpr,
205
+ BTS: tl.constexpr,
206
+ BK: tl.constexpr,
207
+ BV: tl.constexpr,
208
+ ):
209
+ # compute dk dv
210
+ p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0))
211
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTL, BV), (1, 0))
212
+ b_k, b_v = tl.load(p_k, boundary_check=(0, 1)), tl.load(p_v, boundary_check=(0, 1))
213
+ b_dk, b_dv = tl.zeros([BTL, BK], dtype=tl.float32), tl.zeros(
214
+ [BTL, BV], dtype=tl.float32)
215
+
216
+ for i in range((tl.cdiv(T, BTS) * BTS)-BTS, (i_c + 1) * BTL - BTS, -BTS):
217
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k*BK, i), (BK, BTS), (0, 1))
218
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (V, T), (1, V), (i_v*BV, i), (BV, BTS), (0, 1))
219
+ p_dz = dz + i_bh * T + i + tl.arange(0, BTS)
220
+ # [BK, BTS]
221
+ b_q = tl.load(p_q, boundary_check=(0, 1))
222
+ # [BV, BTS]
223
+ b_do = tl.load(p_do, boundary_check=(0, 1)).to(b_q.dtype)
224
+ b_dz = tl.load(p_dz, mask=(i + tl.arange(0, BTS)) < T)
225
+ # [BTL, BTS]
226
+ b_s = tl.dot(b_k.to(b_q.dtype), b_q, allow_tf32=False) * scale
227
+ b_s2 = b_s * b_s
228
+ b_dv += tl.dot(b_s2.to(b_q.dtype), tl.trans(b_do), allow_tf32=False)
229
+ b_ds = tl.dot(b_v, b_do, allow_tf32=False) * scale
230
+ if i_v == 0:
231
+ b_ds += b_dz[None, :] * scale
232
+ else:
233
+ b_ds = b_ds
234
+ b_dk += tl.dot((2 * b_ds * b_s).to(b_q.dtype), tl.trans(b_q), allow_tf32=False)
235
+
236
+ tl.debug_barrier()
237
+ o_q, o_k = tl.arange(0, BTS), tl.arange(0, BTL)
238
+ for i in range(i_c*BTL, (i_c+1)*BTL, BTS):
239
+ p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k*BK, i), (BK, BTS), (0, 1))
240
+ p_do = tl.make_block_ptr(do + i_bh * T*V, (V, T), (1, V), (i_v*BV, i), (BV, BTS), (0, 1))
241
+ p_dz = dz + i_bh * T + i + tl.arange(0, BTS)
242
+ b_q = tl.load(p_q, boundary_check=(0, 1)) # [BD, BQ]
243
+ b_do = tl.load(p_do, boundary_check=(0, 1)).to(b_q.dtype)
244
+ b_dz = tl.load(p_dz, mask=(i + tl.arange(0, BTS)) < T)
245
+ # [BK, BQ]
246
+ m_s = o_k[:, None] <= o_q[None, :]
247
+ b_s = tl.dot(b_k, b_q, allow_tf32=False) * scale
248
+ b_s2 = b_s * b_s
249
+ b_s = tl.where(m_s, b_s, 0)
250
+ b_s2 = tl.where(m_s, b_s2, 0)
251
+
252
+ b_ds = tl.dot(b_v, b_do, allow_tf32=False)
253
+ if i_v == 0:
254
+ b_ds += b_dz[None, :]
255
+ else:
256
+ b_ds = b_ds
257
+ b_ds = tl.where(m_s, b_ds, 0) * scale
258
+ # [BK, BD]
259
+ b_dv += tl.dot(b_s2.to(b_q.dtype), tl.trans(b_do), allow_tf32=False)
260
+ b_dk += tl.dot((2 * b_ds * b_s).to(b_q.dtype), tl.trans(b_q), allow_tf32=False)
261
+ o_q += BTS
262
+
263
+ p_dk = tl.make_block_ptr(dk + (i_bh + B * H * i_v) * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0))
264
+ p_dv = tl.make_block_ptr(dv + (i_bh + B * H * i_k) * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTL, BV), (1, 0))
265
+ tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
266
+ tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
267
+ return
268
+
269
+
270
+ @triton.jit(do_not_specialize=['T'])
271
+ def parallel_rebased_bwd_kernel(
272
+ q,
273
+ k,
274
+ v,
275
+ do,
276
+ dz,
277
+ dq,
278
+ dk,
279
+ dv,
280
+ scale,
281
+ T,
282
+ B: tl.constexpr,
283
+ H: tl.constexpr,
284
+ K: tl.constexpr,
285
+ V: tl.constexpr,
286
+ BTL: tl.constexpr,
287
+ BTS: tl.constexpr,
288
+ BK: tl.constexpr,
289
+ BV: tl.constexpr
290
+ ):
291
+ i_kv, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
292
+ NV = tl.cdiv(V, BV)
293
+ i_k = i_kv // (NV)
294
+ i_v = i_kv % (NV)
295
+ i_h = i_bh % H
296
+ _parallel_rebased_bwd_dq(
297
+ i_bh,
298
+ i_c,
299
+ i_k,
300
+ i_v,
301
+ i_h,
302
+ q,
303
+ k,
304
+ v,
305
+ do,
306
+ dz,
307
+ dq,
308
+ scale,
309
+ B=B,
310
+ H=H,
311
+ T=T,
312
+ K=K,
313
+ V=V,
314
+ BTL=BTL,
315
+ BTS=BTS,
316
+ BK=BK,
317
+ BV=BV
318
+ )
319
+ tl.debug_barrier()
320
+ _parallel_rebased_bwd_dkv(
321
+ i_bh,
322
+ i_c,
323
+ i_k,
324
+ i_v,
325
+ i_h,
326
+ q,
327
+ k,
328
+ v,
329
+ do,
330
+ dz,
331
+ dk,
332
+ dv,
333
+ scale,
334
+ B=B,
335
+ H=H,
336
+ T=T,
337
+ K=K,
338
+ V=V,
339
+ BTL=BTL,
340
+ BTS=BTS,
341
+ BK=BK,
342
+ BV=BV
343
+ )
344
+
345
+
346
+ class ParallelBasedFunction(torch.autograd.Function):
347
+
348
+ @staticmethod
349
+ @input_guard
350
+ @autocast_custom_fwd
351
+ def forward(ctx, q, k, v, scale):
352
+ BTL, BTS = 128, 32
353
+ assert BTL % BTS == 0
354
+ # assert q.shape[-1] % 16 == 0
355
+ BK = min(128, triton.next_power_of_2(k.shape[-1]))
356
+ BV = min(128, triton.next_power_of_2(v.shape[-1]))
357
+ BK, BV = max(BK, 16), max(BV, 16)
358
+ B, H, T, K, V = *k.shape, v.shape[-1]
359
+ num_stages = 2
360
+ num_warps = 4
361
+ NK = triton.cdiv(K, BK)
362
+ NV = triton.cdiv(V, BV)
363
+ grid = (NK * NV, triton.cdiv(T, BTL), B * H)
364
+
365
+ assert NK == 1, "will encounter some synchronization issue if not."
366
+
367
+ o = torch.empty(NK, B, H, T, V, device=q.device)
368
+ z = torch.empty(NK, B, H, T, device=q.device)
369
+ parallel_rebased_fwd_kernel[grid](
370
+ q,
371
+ k,
372
+ v,
373
+ o,
374
+ z,
375
+ scale,
376
+ T=T,
377
+ B=B,
378
+ H=H,
379
+ K=K,
380
+ V=V,
381
+ BTL=BTL,
382
+ BTS=BTS,
383
+ BK=BK,
384
+ BV=BV,
385
+ num_warps=num_warps,
386
+ num_stages=num_stages
387
+ )
388
+ ctx.save_for_backward(q, k, v)
389
+ ctx.scale = scale
390
+ return o.sum(0).to(q.dtype), z.sum(0).to(q.dtype)
391
+
392
+ @staticmethod
393
+ @input_guard
394
+ @autocast_custom_bwd
395
+ def backward(ctx, do, dz):
396
+ q, k, v = ctx.saved_tensors
397
+ scale = ctx.scale
398
+ BTL, BTS = 64, 32
399
+ assert BTL % BTS == 0
400
+ BK = min(128, triton.next_power_of_2(k.shape[-1]))
401
+ BV = min(128, triton.next_power_of_2(v.shape[-1]))
402
+ BK, BV = max(BK, 16), max(BV, 16)
403
+ B, H, T, K, V = *k.shape, v.shape[-1]
404
+ num_stages = 2
405
+ num_warps = 4
406
+ NK = triton.cdiv(K, BK)
407
+ NV = triton.cdiv(V, BV)
408
+ grid = (NK * NV, triton.cdiv(T, BTL), B * H)
409
+
410
+ assert NK == 1, "will encounter some synchronization issue if not"
411
+
412
+ dq = torch.empty(NV, B, H, T, K, dtype=q.dtype, device=q.device)
413
+ dk = torch.empty(NV, B, H, T, K, dtype=q.dtype, device=q.device)
414
+ dv = torch.empty(NK, B, H, T, V, dtype=q.dtype, device=q.device)
415
+
416
+ parallel_rebased_bwd_kernel[grid](
417
+ q,
418
+ k,
419
+ v,
420
+ do,
421
+ dz,
422
+ dq,
423
+ dk,
424
+ dv,
425
+ scale,
426
+ T=T,
427
+ B=B,
428
+ H=H,
429
+ K=K,
430
+ V=V,
431
+ BTL=BTL,
432
+ BTS=BTS,
433
+ BK=BK,
434
+ BV=BV,
435
+ num_warps=num_warps,
436
+ num_stages=num_stages
437
+ )
438
+
439
+ return dq.sum(0).to(q.dtype), dk.sum(0).to(k.dtype), dv.sum(0).to(v.dtype), None
440
+
441
+
442
+ def parallel_rebased(
443
+ q: torch.Tensor,
444
+ k: torch.Tensor,
445
+ v: torch.Tensor,
446
+ eps: float = 1e-5,
447
+ use_scale: bool = True,
448
+ use_normalize: bool = True,
449
+ return_both: bool = False,
450
+ head_first: bool = True
451
+ ):
452
+ assert q.shape[-1] <= 128, "only support feature dim up to 128"
453
+ if use_scale:
454
+ scale = q.shape[-1] ** -0.5
455
+ else:
456
+ scale = 1
457
+ if not head_first:
458
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
459
+ o, z = ParallelBasedFunction.apply(q, k, v, scale)
460
+ if return_both:
461
+ return o, z
462
+ if use_normalize:
463
+ o = o / (z[..., None] + eps)
464
+ if not head_first:
465
+ o = o.transpose(1, 2)
466
+ return o.to(q.dtype)
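
A hedged usage sketch (not part of the commit): the kernel path requires CUDA, and since `BK = min(128, next_power_of_2(K))` must cover the whole key dimension (`NK == 1`), `K` is limited to 128 as asserted above. Sizes below are illustrative.

import torch
from fla.ops.rebased.parallel import parallel_rebased

B, H, T, K, V = 2, 4, 512, 16, 64
q = torch.randn(B, H, T, K, device='cuda')
k = torch.randn(B, H, T, K, device='cuda')
v = torch.randn(B, H, T, V, device='cuda')
# squared dot-product feature map; the accumulated z term normalizes the output when use_normalize=True
o = parallel_rebased(q, k, v, use_scale=True, use_normalize=True)
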
fla/ops/retention/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .chunk import chunk_retention
4
+ from .fused_chunk import fused_chunk_retention
5
+ from .fused_recurrent import fused_recurrent_retention
6
+ from .parallel import parallel_retention
7
+
8
+ __all__ = [
9
+ 'chunk_retention',
10
+ 'fused_chunk_retention',
11
+ 'parallel_retention',
12
+ 'fused_recurrent_retention'
13
+ ]
fla/ops/retention/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (3.7 kB). View file
 
fla/ops/retention/__pycache__/parallel.cpython-311.pyc ADDED
Binary file (3.25 kB). View file
 
fla/ops/retention/chunk.py ADDED
@@ -0,0 +1,72 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+
8
+ from fla.ops.simple_gla.chunk import chunk_simple_gla
9
+
10
+
11
+ @torch.compiler.disable
12
+ def chunk_retention(
13
+ q: torch.Tensor,
14
+ k: torch.Tensor,
15
+ v: torch.Tensor,
16
+ scale: Optional[float] = None,
17
+ initial_state: Optional[torch.Tensor] = None,
18
+ output_final_state: bool = False,
19
+ cu_seqlens: Optional[torch.LongTensor] = None,
20
+ head_first: bool = True
21
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
22
+ r"""
23
+ Args:
24
+ q (torch.Tensor):
25
+ queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
26
+ k (torch.Tensor):
27
+ keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
28
+ v (torch.Tensor):
29
+ values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
30
+ scale (Optional[int]):
31
+ Scale factor for the attention scores.
32
+ If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
33
+ initial_state (Optional[torch.Tensor]):
34
+ Initial state of shape `[N, H, K, V]` for `N` input sequences.
35
+ For equal-length input sequences, `N` equals the batch size `B`.
36
+ Default: `None`.
37
+ output_final_state (Optional[bool]):
38
+ Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
39
+ cu_seqlens (torch.LongTensor):
40
+ Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
41
+ consistent with the FlashAttention API.
42
+ head_first (Optional[bool]):
43
+ Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
44
+ Default: `True`.
45
+
46
+ Returns:
47
+ o (torch.Tensor):
48
+ Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
49
+ final_state (torch.Tensor):
50
+ Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
51
+
52
+ """
53
+ if head_first:
54
+ n_heads = q.shape[1]
55
+ else:
56
+ n_heads = q.shape[2]
57
+ s = (1 - q.new_tensor(2., dtype=torch.float).pow(-5. - q.new_tensor(range(n_heads), dtype=torch.float))).log()
58
+ if head_first:
59
+ g = s[None, :, None].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
60
+ else:
61
+ g = s[None, None, :].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
62
+ return chunk_simple_gla(
63
+ q=q,
64
+ k=k,
65
+ v=v,
66
+ scale=scale,
67
+ g=g,
68
+ initial_state=initial_state,
69
+ output_final_state=output_final_state,
70
+ head_first=head_first,
71
+ cu_seqlens=cu_seqlens
72
+ )
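
An illustrative call (not part of the commit): the per-head retention decay is derived internally as `log(1 - 2^(-5-h))`, so only `q`/`k`/`v` are required. Shapes and dtypes are assumptions.

import torch
from fla.ops.retention import chunk_retention

B, H, T, K, V = 2, 8, 1024, 64, 64
q = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)
k = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)
v = torch.randn(B, H, T, V, device='cuda', dtype=torch.bfloat16)
o, final_state = chunk_retention(q, k, v, output_final_state=True)  # final_state: [B, H, K, V]
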
fla/ops/retention/fused_recurrent.py ADDED
@@ -0,0 +1,42 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+
8
+ from fla.ops.simple_gla.fused_recurrent import fused_recurrent_simple_gla
9
+
10
+
11
+ def fused_recurrent_retention(
12
+ q: torch.Tensor,
13
+ k: torch.Tensor,
14
+ v: torch.Tensor,
15
+ scale: Optional[float] = None,
16
+ initial_state: Optional[torch.Tensor] = None,
17
+ output_final_state: bool = False,
18
+ reverse: bool = False,
19
+ cu_seqlens: Optional[torch.LongTensor] = None,
20
+ head_first: bool = True
21
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
22
+ if head_first:
23
+ n_heads = q.shape[1]
24
+ else:
25
+ n_heads = q.shape[2]
26
+ s = (1 - q.new_tensor(2., dtype=torch.float).pow(-5. - q.new_tensor(range(n_heads), dtype=torch.float))).log()
27
+ if head_first:
28
+ g = s[None, :, None].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
29
+ else:
30
+ g = s[None, None, :].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous()
31
+ return fused_recurrent_simple_gla(
32
+ q=q,
33
+ k=k,
34
+ v=v,
35
+ g=g,
36
+ scale=scale,
37
+ initial_state=initial_state,
38
+ output_final_state=output_final_state,
39
+ reverse=reverse,
40
+ cu_seqlens=cu_seqlens,
41
+ head_first=head_first
42
+ )
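
A hedged sketch of the typical decoding pattern (not part of the commit): one token per step, threading the recurrent state back in through `initial_state`. Sizes are illustrative.

import torch
from fla.ops.retention import fused_recurrent_retention

B, H, K, V = 2, 8, 64, 64
state = None
for _ in range(4):  # four single-token decoding steps
    q = torch.randn(B, H, 1, K, device='cuda')
    k = torch.randn(B, H, 1, K, device='cuda')
    v = torch.randn(B, H, 1, V, device='cuda')
    o, state = fused_recurrent_retention(q, k, v, initial_state=state, output_final_state=True)
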
fla/ops/retention/naive.py ADDED
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+
5
+
6
+ def naive_retention(q, k, v):
7
+ orig_type = q.dtype
8
+ q, k, v = q.float(), k.float(), v.float()
9
+ _, n_heads, seq_len, d_head = q.shape
10
+ s = (1 - q.new_tensor(2., dtype=torch.float).pow(-5. - q.new_tensor(range(n_heads), dtype=torch.float))).log2()
11
+ n = q.new_tensor(range(seq_len), dtype=torch.float)
12
+ n = torch.exp2((n.unsqueeze(-1) - n) * s.view(-1, 1, 1)) * n.unsqueeze(-1).ge(n)
13
+ s = torch.einsum('bhqd,bhkd,hqk->bhqk', q * d_head ** -0.5, k, n.to(q.dtype))
14
+ o = torch.einsum('bhqk,bhkd->bhqd', s, v)
15
+ return o.to(orig_type)
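
A hedged sketch of how this naive O(T^2) reference might be used to spot-check the chunked kernel on small inputs (not part of the commit); the comparison below is illustrative rather than a project test.

import torch
from fla.ops.retention import chunk_retention
from fla.ops.retention.naive import naive_retention

B, H, T, D = 1, 4, 128, 64
q, k, v = (torch.randn(B, H, T, D, device='cuda') for _ in range(3))
ref = naive_retention(q, k, v)
out, _ = chunk_retention(q, k, v)
print((ref - out).abs().max())  # expected to be small
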
fla/ops/rwkv4/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .fused_recurrent import fused_recurrent_rwkv4
4
+
5
+ __all__ = [
6
+ 'fused_recurrent_rwkv4'
7
+ ]
fla/ops/rwkv4/fused_recurrent.py ADDED
@@ -0,0 +1,476 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2024, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Any, cast
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+ from torch import Tensor
10
+ from torch.autograd.function import Function, FunctionCtx, once_differentiable
11
+
12
+ from fla.ops.utils.op import exp
13
+
14
+
15
+ def get_block_size_c(chans: int) -> int:
16
+ if chans < 32:
17
+ return 32
18
+ if chans < 64:
19
+ return 64
20
+ return 128
21
+
22
+
23
+ @triton.jit
24
+ def fused_recurrent_rwkv4_forward_kernel(
25
+ # W
26
+ w_ptr,
27
+ w_s_c,
28
+ # U
29
+ u_ptr,
30
+ u_s_c,
31
+ # K
32
+ k_ptr,
33
+ k_s_b,
34
+ k_s_t,
35
+ k_s_c,
36
+ # V
37
+ v_ptr,
38
+ v_s_b,
39
+ v_s_t,
40
+ v_s_c,
41
+ # State
42
+ state_ptr,
43
+ state_s_b,
44
+ state_s_abe,
45
+ state_s_c,
46
+ # WKV
47
+ wkv_ptr,
48
+ wkv_s_b,
49
+ wkv_s_t,
50
+ wkv_s_c,
51
+ # Output state
52
+ state_out_ptr,
53
+ state_out_s_b,
54
+ state_out_s_abe,
55
+ state_out_s_t,
56
+ state_out_s_c,
57
+ # Params
58
+ chans,
59
+ tsz,
60
+ BLOCK_SIZE_C: tl.constexpr,
61
+ ):
62
+ # Parallelize over the batch dimension.
63
+ b_idx = tl.program_id(0)
64
+ c_idx = tl.program_id(1)
65
+
66
+ cs = (c_idx * BLOCK_SIZE_C) + tl.arange(0, BLOCK_SIZE_C)
67
+ cmask = cs < chans
68
+
69
+ # Pointers to the batch (and possibly channel) for the input tensors.
70
+ k_ptr = k_ptr + b_idx * k_s_b
71
+ v_ptr = v_ptr + b_idx * v_s_b
72
+ alpha_ptr = state_ptr + b_idx * state_s_b
73
+ beta_ptr = state_ptr + b_idx * state_s_b + state_s_abe
74
+ eps_ptr = state_ptr + b_idx * state_s_b + 2 * state_s_abe
75
+
76
+ # Pointers to the batch (and possibly channel) for the output tensors.
77
+ wkv_ptr = wkv_ptr + b_idx * wkv_s_b
78
+ alpha_out_ptr = state_out_ptr + b_idx * state_out_s_b
79
+ beta_out_ptr = state_out_ptr + b_idx * state_out_s_b + state_out_s_abe
80
+ eps_out_ptr = state_out_ptr + b_idx * state_out_s_b + 2 * state_out_s_abe
81
+
82
+ # Loads parameters.
83
+ alpha = tl.load(alpha_ptr + cs * state_s_c, mask=cmask).to(tl.float32)
84
+ beta = tl.load(beta_ptr + cs * state_s_c, mask=cmask).to(tl.float32)
85
+ eps = tl.load(eps_ptr + cs * state_s_c, mask=cmask).to(tl.float32)
86
+ w = tl.load(w_ptr + cs * w_s_c, mask=cmask).to(tl.float32)
87
+ u = tl.load(u_ptr + cs * u_s_c, mask=cmask).to(tl.float32)
88
+
89
+ for t in range(tsz):
90
+ kt = tl.load(k_ptr + t * k_s_t + cs * k_s_c, mask=cmask).to(tl.float32)
91
+ vt = tl.load(v_ptr + t * v_s_t + cs * v_s_c, mask=cmask).to(tl.float32)
92
+
93
+ ukt = u + kt
94
+ tau = tl.maximum(ukt, eps)
95
+ e1a = exp(eps - tau)
96
+ e2a = exp(ukt - tau)
97
+ wkv = (e1a * alpha + e2a * vt) / (e1a * beta + e2a)
98
+ tl.store(wkv_ptr + t * wkv_s_t + cs * wkv_s_c, wkv, mask=cmask)
99
+
100
+ w_eps = w + eps
101
+ eps = tl.maximum(w_eps, kt)
102
+ e1b = exp(w_eps - eps)
103
+ e2b = exp(kt - eps)
104
+ alpha = e1b * alpha + e2b * vt
105
+ beta = e1b * beta + e2b
106
+ tl.store(alpha_out_ptr + t * state_out_s_t + cs * state_out_s_c, alpha, mask=cmask)
107
+ tl.store(beta_out_ptr + t * state_out_s_t + cs * state_out_s_c, beta, mask=cmask)
108
+ tl.store(eps_out_ptr + t * state_out_s_t + cs * state_out_s_c, eps, mask=cmask)
109
+
110
+
111
+ def fused_recurrent_rwkv4_forward(
112
+ w: Tensor,
113
+ u: Tensor,
114
+ k: Tensor,
115
+ v: Tensor,
116
+ state: Tensor,
117
+ ) -> tuple[Tensor, Tensor]:
118
+ (bsz, tsz, chans) = k.shape
119
+
120
+ # New tensors to output.
121
+ wkvs = k.new_empty(bsz, tsz, chans)
122
+ state_out = k.new_empty(bsz, 3, tsz, chans)
123
+
124
+ # Constants.
125
+ block_size_c = get_block_size_c(chans)
126
+
127
+ def grid(meta: dict[str, Any]) -> tuple[int, ...]:
128
+ return (bsz, triton.cdiv(chans, meta["BLOCK_SIZE_C"]))
129
+
130
+ fused_recurrent_rwkv4_forward_kernel[grid](
131
+ # W
132
+ w,
133
+ w.stride(0),
134
+ # U
135
+ u,
136
+ u.stride(0),
137
+ # K
138
+ k,
139
+ k.stride(0),
140
+ k.stride(1),
141
+ k.stride(2),
142
+ # V
143
+ v,
144
+ v.stride(0),
145
+ v.stride(1),
146
+ v.stride(2),
147
+ # State
148
+ state,
149
+ state.stride(0),
150
+ state.stride(1),
151
+ state.stride(3),
152
+ # WKV
153
+ wkvs,
154
+ wkvs.stride(0),
155
+ wkvs.stride(1),
156
+ wkvs.stride(2),
157
+ # Output state
158
+ state_out,
159
+ state_out.stride(0),
160
+ state_out.stride(1),
161
+ state_out.stride(2),
162
+ state_out.stride(3),
163
+ # Params
164
+ chans,
165
+ tsz,
166
+ BLOCK_SIZE_C=block_size_c,
167
+ )
168
+
169
+ state_out = torch.cat((state, state_out), dim=2)
170
+
171
+ return wkvs, state_out
172
+
173
+
174
+ @triton.jit
175
+ def fused_recurrent_rwkv4_backward_kernel(
176
+ # W
177
+ w_ptr,
178
+ w_s_c,
179
+ # U
180
+ u_ptr,
181
+ u_s_c,
182
+ # K
183
+ k_ptr,
184
+ k_s_b,
185
+ k_s_t,
186
+ k_s_c,
187
+ # V
188
+ v_ptr,
189
+ v_s_b,
190
+ v_s_t,
191
+ v_s_c,
192
+ # State
193
+ state_ptr,
194
+ state_s_b,
195
+ state_s_abe,
196
+ state_s_t,
197
+ state_s_c,
198
+ # WKV grad
199
+ gwkv_ptr,
200
+ gwkv_s_b,
201
+ gwkv_s_t,
202
+ gwkv_s_c,
203
+ # Output state grad
204
+ gstate_out_ptr,
205
+ gstate_out_s_b,
206
+ gstate_out_s_abe,
207
+ gstate_out_s_c,
208
+ # W grad
209
+ gw_ptr,
210
+ gw_s_c,
211
+ # U grad
212
+ gu_ptr,
213
+ gu_s_c,
214
+ # K grad
215
+ gk_ptr,
216
+ gk_s_b,
217
+ gk_s_t,
218
+ gk_s_c,
219
+ # V grad
220
+ gv_ptr,
221
+ gv_s_b,
222
+ gv_s_t,
223
+ gv_s_c,
224
+ # State grad
225
+ gstate_ptr,
226
+ gstate_s_b,
227
+ gstate_s_abe,
228
+ gstate_s_c,
229
+ # Params
230
+ tsz,
231
+ chans,
232
+ BLOCK_SIZE_C: tl.constexpr,
233
+ ):
234
+ # Parallelize over the batch dimension.
235
+ b_idx = tl.program_id(0)
236
+ c_idx = tl.program_id(1)
237
+
238
+ cs = (c_idx * BLOCK_SIZE_C) + tl.arange(0, BLOCK_SIZE_C)
239
+ cmask = cs < chans
240
+
241
+ # Pointers to the batch (and possibly channel) for the input tensors.
242
+ k_ptr = k_ptr + b_idx * k_s_b
243
+ v_ptr = v_ptr + b_idx * v_s_b
244
+ alpha_ptr = state_ptr + b_idx * state_s_b
245
+ beta_ptr = state_ptr + b_idx * state_s_b + state_s_abe
246
+ eps_ptr = state_ptr + b_idx * state_s_b + 2 * state_s_abe
247
+
248
+ # Pointers to the batch (and possibly channel) for the output tensors.
249
+ gk_ptr = gk_ptr + b_idx * gk_s_b
250
+ gv_ptr = gv_ptr + b_idx * gv_s_b
251
+
252
+ # Pointers to gradients which were recieved by the function.
253
+ gwkv_ptr = gwkv_ptr + b_idx * gwkv_s_b
254
+ galpha_out_ptr = gstate_out_ptr + b_idx * gstate_out_s_b
255
+ gbeta_out_ptr = gstate_out_ptr + b_idx * gstate_out_s_b + gstate_out_s_abe
256
+ geps_out_ptr = gstate_out_ptr + b_idx * gstate_out_s_b + 2 * gstate_out_s_abe
257
+
258
+ # Loads parameters.
259
+ galpha = tl.load(galpha_out_ptr + gstate_out_s_c * cs, mask=cmask).to(tl.float32)
260
+ gbeta = tl.load(gbeta_out_ptr + gstate_out_s_c * cs, mask=cmask).to(tl.float32)
261
+ geps = tl.load(geps_out_ptr + gstate_out_s_c * cs, mask=cmask).to(tl.float32)
262
+ w = tl.load(w_ptr + w_s_c * cs, mask=cmask).to(tl.float32)
263
+ u = tl.load(u_ptr + u_s_c * cs, mask=cmask).to(tl.float32)
264
+
265
+ # Gradient accumulators.
266
+ gw = tl.zeros_like(w)
267
+ gu = tl.zeros_like(u)
268
+
269
+ alpha_prev = tl.load(alpha_ptr + tsz * state_s_t + state_s_c * cs, mask=cmask).to(tl.float32)
270
+ beta_prev = tl.load(beta_ptr + tsz * state_s_t + state_s_c * cs, mask=cmask).to(tl.float32)
271
+ eps_prev = tl.load(eps_ptr + tsz * state_s_t + state_s_c * cs, mask=cmask).to(tl.float32)
272
+
273
+ for t in range(tsz):
274
+ tc = tsz - t - 1
275
+
276
+ kt = tl.load(k_ptr + tc * k_s_t + k_s_c * cs, mask=cmask).to(tl.float32)
277
+ vt = tl.load(v_ptr + tc * v_s_t + v_s_c * cs, mask=cmask).to(tl.float32)
278
+
279
+ alpha_curr = alpha_prev
280
+ beta_curr = beta_prev
281
+ eps_curr = eps_prev
282
+
283
+ alpha_prev = tl.load(alpha_ptr + tc * state_s_t + state_s_c * cs, mask=cmask).to(tl.float32)
284
+ beta_prev = tl.load(beta_ptr + tc * state_s_t + state_s_c * cs, mask=cmask).to(tl.float32)
285
+ eps_prev = tl.load(eps_ptr + tc * state_s_t + state_s_c * cs, mask=cmask).to(tl.float32)
286
+
287
+ ukt = u + kt
288
+ tau = tl.maximum(ukt, eps_prev)
289
+ e1 = exp(eps_prev - tau)
290
+ e2 = exp(ukt - tau)
291
+
292
+ euke = exp(ukt + eps_prev - 2 * tau)
293
+
294
+ denom = e1 * beta_prev + e2
295
+ denom_sq = denom * denom
296
+
297
+ gwkvt = tl.load(gwkv_ptr + tc * gwkv_s_t + gwkv_s_c * cs, mask=cmask).to(tl.float32)
298
+
299
+ # Backpropagates wkv gradients.
300
+ guk = gwkvt * e2 * (e1 * beta_prev * vt - e1 * alpha_prev) / denom_sq
301
+ gu += guk
302
+ gk = guk
303
+ gv = gwkvt * e2 / denom
304
+
305
+ galpha_wkv = gwkvt * e1 / denom
306
+ gbeta_wkv = -gwkvt * e1 * (e2 * vt + e1 * alpha_prev) / denom_sq
307
+ geps_wkv_denom = e1 * beta_prev + e2
308
+ geps_wkv = gwkvt * euke * (alpha_prev - vt * beta_prev) / (geps_wkv_denom * geps_wkv_denom)
309
+
310
+ e1 = exp(w + eps_prev - eps_curr)
311
+ e2 = exp(kt - eps_curr)
312
+
313
+ # Backpropagates alpha gradients.
314
+ galpha_we = galpha * e1 * alpha_prev
315
+ gw += galpha_we
316
+ gk += galpha * e2 * vt
317
+ gv += galpha * e2
318
+ geps += galpha * -alpha_curr
319
+
320
+ # Backpropagates beta gradients.
321
+ gbeta_we = gbeta * e1 * beta_prev
322
+ gw += gbeta_we
323
+ gk += gbeta * e2
324
+ geps += gbeta * -beta_curr
325
+
326
+ # Backpropagates epsilon gradients.
327
+ geps_mask = w + eps_prev > kt
328
+ geps_we = tl.where(geps_mask, geps, tl.zeros_like(geps))
329
+ gw += geps_we
330
+ gk += tl.where(geps_mask, tl.zeros_like(geps), geps)
331
+
332
+ # Stores the gradients for k and v.
333
+ tl.store(gk_ptr + tc * gk_s_t + gk_s_c * cs, gk, mask=cmask)
334
+ tl.store(gv_ptr + tc * gv_s_t + gv_s_c * cs, gv, mask=cmask)
335
+
336
+ # Computes new gradients for alpha and beta.
337
+ galpha = galpha * e1 + galpha_wkv
338
+ gbeta = gbeta * e1 + gbeta_wkv
339
+ geps = galpha_we + gbeta_we + geps_we + geps_wkv
340
+
341
+ # Stores final gradients for alpha and beta.
342
+ galpha_ptr = gstate_ptr + b_idx * gstate_s_b
343
+ gbeta_ptr = gstate_ptr + b_idx * gstate_s_b + gstate_s_abe
344
+ geps_ptr = gstate_ptr + b_idx * gstate_s_b + 2 * gstate_s_abe
345
+ tl.store(galpha_ptr + gstate_s_c * cs, galpha, mask=cmask)
346
+ tl.store(gbeta_ptr + gstate_s_c * cs, gbeta, mask=cmask)
347
+ tl.store(geps_ptr + gstate_s_c * cs, geps, mask=cmask)
348
+
349
+ # Stores final gradients for w and u.
350
+ gw_temp = tl.load(gw_ptr + gw_s_c * cs, mask=cmask).to(tl.float32)
351
+ gw_temp += gw
352
+ tl.store(gw_ptr + gw_s_c * cs, gw_temp, mask=cmask)
353
+ gu_temp = tl.load(gu_ptr + gu_s_c * cs, mask=cmask).to(tl.float32)
354
+ gu_temp += gu
355
+ tl.store(gu_ptr + gu_s_c * cs, gu_temp, mask=cmask)
356
+
357
+
358
+ def fused_recurrent_rwkv4_backward(
359
+ w: Tensor,
360
+ u: Tensor,
361
+ k: Tensor,
362
+ v: Tensor,
363
+ state: Tensor,
364
+ grad_wkv: Tensor,
365
+ grad_state: Tensor,
366
+ ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
367
+ bsz, tsz, chans = k.shape
368
+
369
+ gw = torch.zeros_like(w) # New tensors to output.
370
+ gu = torch.zeros_like(u)
371
+ gk = torch.empty_like(k)
372
+ gv = torch.empty_like(v)
373
+ gstate = k.new_empty(bsz, 3, 1, chans)
374
+
375
+ block_size_c = get_block_size_c(chans) # Constants.
376
+
377
+ def grid(meta: dict[str, Any]) -> tuple[int, ...]:
378
+ return (bsz, triton.cdiv(chans, meta["BLOCK_SIZE_C"]))
379
+
380
+ fused_recurrent_rwkv4_backward_kernel[grid](
381
+ # W
382
+ w,
383
+ w.stride(0),
384
+ # U
385
+ u,
386
+ u.stride(0),
387
+ # K
388
+ k,
389
+ k.stride(0),
390
+ k.stride(1),
391
+ k.stride(2),
392
+ # V
393
+ v,
394
+ v.stride(0),
395
+ v.stride(1),
396
+ v.stride(2),
397
+ # State
398
+ state,
399
+ state.stride(0),
400
+ state.stride(1),
401
+ state.stride(2),
402
+ state.stride(3),
403
+ # WKV grad
404
+ grad_wkv,
405
+ grad_wkv.stride(0),
406
+ grad_wkv.stride(1),
407
+ grad_wkv.stride(2),
408
+ # Output state grad
409
+ grad_state,
410
+ grad_state.stride(0),
411
+ grad_state.stride(1),
412
+ grad_state.stride(3),
413
+ # W grad
414
+ gw,
415
+ gw.stride(0),
416
+ # U grad
417
+ gu,
418
+ gu.stride(0),
419
+ # K grad
420
+ gk,
421
+ gk.stride(0),
422
+ gk.stride(1),
423
+ gk.stride(2),
424
+ # V grad
425
+ gv,
426
+ gv.stride(0),
427
+ gv.stride(1),
428
+ gv.stride(2),
429
+ # State grad
430
+ gstate,
431
+ gstate.stride(0),
432
+ gstate.stride(1),
433
+ gstate.stride(3),
434
+ # Params
435
+ tsz,
436
+ chans,
437
+ BLOCK_SIZE_C=block_size_c,
438
+ )
439
+
440
+ return gw, gu, gk, gv, gstate
441
+
442
+
443
+ class FusedRecurrentRWKV4Function(Function):
444
+ @staticmethod
445
+ def forward(
446
+ ctx: FunctionCtx,
447
+ w: Tensor,
448
+ u: Tensor,
449
+ k: Tensor,
450
+ v: Tensor,
451
+ state: Tensor,
452
+ ) -> tuple[Tensor, Tensor]:
453
+ ctx.input_dtype = k.dtype
454
+
455
+ w = -torch.exp(w.float().contiguous())
456
+ if k.dtype == torch.float16:
457
+ u = u.float()
458
+ k = k.float()
459
+ v = v.float()
460
+ u = u.contiguous()
461
+ k = k.contiguous()
462
+ v = v.contiguous()
463
+ wkv, state_out = fused_recurrent_rwkv4_forward(w, u, k, v, state)
464
+ ctx.save_for_backward(w, u, k, v, state_out[:, :, :-1])
465
+ return wkv, state_out[:, :, -1:]
466
+
467
+ @staticmethod
468
+ @once_differentiable
469
+ def backward(ctx: FunctionCtx, gwkv: Tensor, gstate: Tensor) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
470
+ w, u, k, v, state = cast(tuple[Tensor, ...], ctx.saved_tensors)
471
+ gw, gu, gk, gv, gstate = fused_recurrent_rwkv4_backward(w, u, k, v, state, gwkv, gstate)
472
+ return gw, gu, gk, gv, gstate
473
+
474
+
475
+ def fused_recurrent_rwkv4(w: Tensor, u: Tensor, k: Tensor, v: Tensor, state: Tensor) -> tuple[Tensor, Tensor]:
476
+ return FusedRecurrentRWKV4Function.apply(w, u, k, v, state)
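
A hedged usage sketch (not part of the commit): the recurrent state stacks `(alpha, beta, eps)` along dim 1 with a single time slot, i.e. `[B, 3, 1, C]`. Initialising `eps` to a very negative value follows the usual RWKV convention and is an assumption here, as are the sizes.

import torch
from fla.ops.rwkv4 import fused_recurrent_rwkv4

B, T, C = 2, 16, 512
w = torch.randn(C, device='cuda')  # log-space time decay; the op applies -exp(w) internally
u = torch.randn(C, device='cuda')  # per-channel bonus for the current token
k = torch.randn(B, T, C, device='cuda')
v = torch.randn(B, T, C, device='cuda')
state = torch.zeros(B, 3, 1, C, device='cuda')
state[:, 2] = -1e38  # eps channel starts at a very large negative value
wkv, state = fused_recurrent_rwkv4(w, u, k, v, state)  # wkv: [B, T, C], state: [B, 3, 1, C]
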
fla/ops/rwkv6/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .chunk import chunk_rwkv6
4
+ from .fused_recurrent import fused_recurrent_rwkv6
5
+
6
+ __all__ = [
7
+ 'chunk_rwkv6',
8
+ 'fused_recurrent_rwkv6'
9
+ ]
fla/ops/rwkv6/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (325 Bytes). View file
 
fla/ops/rwkv6/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (83 kB). View file
 
fla/ops/rwkv6/__pycache__/fused_recurrent.cpython-311.pyc ADDED
Binary file (40.3 kB). View file