diff --git "a/modeling_blocks_rwkv7.py" "b/modeling_blocks_rwkv7.py" new file mode 100644--- /dev/null +++ "b/modeling_blocks_rwkv7.py" @@ -0,0 +1,2426 @@ +# ========== AUTO GENERATED FILE ========= +# This file is auto generated by 'hf_builder.py', do not edit this file directly +# As part of the RWKV/RWKV-block project +# ========== =================== ========= +# ---------------- +# block/kernel/rwkv7_attn_pytorch.py +# ---------------- +import torch + +# Enable tensorfloat 32 +torch.set_float32_matmul_precision('high') + +# Handles the RWKV v7 attention mechanic, in pure pytorch +def rwkv7_attn_pytorch( + r,w,k,v, kk,a, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + + ### Reference implement + # return rwkv7_attn_pytorch_ref( + # r,w,k,v, kk,a, + # BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + # xx, wkv_state_in + # ) + ### + + # # This works, but it has too much of a vram overhead + ### + # return rwkv7_attn_pytorch_v2_chunk_w_compile_break( + # r,w,k,v, kk,a, + # BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + # xx, wkv_state_in + # ) + ### + + # > per 9k chunk, per block, on a 4090 ... + # > with forward_with_reduce_compile on the timemix ... + # + # Somehow... + # The reference implement takes: 2281ms + # The chunked version takes: 389ms (chunksize 256) + + # Get the shape + B,T,HC = w.shape + + # Compute the chunks + chunk_size = 256 + chunk_count = SEQ_LEN // chunk_size + chunk_remainder = SEQ_LEN % chunk_size + + # The wkv_state_out + wkv_state_out = wkv_state_in.float() + + # # List of tensor to build + # xlist = [] + xx = xx.clone() + + # Loop over the chunks + for i in range(chunk_count): + sta = i * chunk_size + end = sta + chunk_size + + xx[:,sta:end], wkv_state_out = rwkv7_attn_pytorch_v2_chunk_w_compile_break( + # xpart, wkv_state_out = rwkv7_attn_pytorch_chunk_with_w_compile_break( + r[:,sta:end],w[:,sta:end],k[:,sta:end],v[:,sta:end], + kk[:,sta:end],a[:,sta:end], + BATCH_SIZE, chunk_size, N_HEAD, HEAD_SIZE, + # xx[:,sta:end], wkv_state_out + torch.zeros(B,chunk_size,HC, dtype=xx.dtype, device=xx.device), wkv_state_out + ) + # xlist.append(xpart) + + # Handle the remainder + if chunk_remainder > 0: + sta = chunk_count * chunk_size + end = sta + chunk_remainder + + xx[:,sta:end], wkv_state_out = rwkv7_attn_pytorch_v2_chunk_w_compile_break( + # xpart, wkv_state_out = rwkv7_attn_pytorch_chunk_with_w_compile_break( + r[:,sta:end],w[:,sta:end],k[:,sta:end],v[:,sta:end], + kk[:,sta:end],a[:,sta:end], + BATCH_SIZE, chunk_remainder, N_HEAD, HEAD_SIZE, + # xx[:,sta:end], wkv_state_out, + torch.zeros(B,chunk_remainder,HC, dtype=xx.dtype, device=xx.device), wkv_state_out, + # offset=0, chunk_size=chunk_remainder + ) + # xlist.append(xpart) + + # # Concatenate the list + # xx = torch_cat_no_compiler(xlist, dim=1) + + # Return the output + return xx, wkv_state_out.to(dtype=wkv_state_in.dtype) + +#################################################################################################### +# Working reference copy, that has been validated to be "identical" to the reference implementation +# However this has known pytorch compilation issues, hence the modified chunk wise version is used +# instead for an approximate 5x speed up +#################################################################################################### +@torch.compiler.disable() +def rwkv7_attn_pytorch_ref( + r,w,k,v, kk,a, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + ######## pure pytorch method + # See: 
https://github.com/BlinkDL/RWKV-LM/blob/d4c42b2cac10f8f3896ce153e2310dc763662b7a/RWKV-v7/rwkv_v7_demo_fast.py#L238 + ######## + vk_state = wkv_state_in.float() + for t in range(SEQ_LEN): + r_, w_, k_, v_, kk_, a_ = r[:,t], w[:,t], k[:,t], v[:,t], kk[:,t], a[:,t] + vk = v_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1) @ k_.view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE) + ab = (-kk_).view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1) @ (kk_*a_).view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE) + vk_state = (vk_state * w_.view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE).float() + vk_state @ ab.float() + vk.float()) + xx[:,t] = ((vk_state.to(dtype=xx.dtype) @ r_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1)).view(BATCH_SIZE,N_HEAD*HEAD_SIZE)) + wkv_state_out = vk_state.to(dtype=wkv_state_in.dtype) + return xx, wkv_state_out +#################################################################################################### + +#################################################################################################### +# Modified reference computation done in fp32, +# with changes made to bring the result closer to the cuda kernel +#################################################################################################### +@torch.compiler.disable() +def rwkv7_attn_pytorch_ref_fp32( + r,w,k,v, kk, iclr, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + ######## pure pytorch method (modified for fp32) + # See: https://github.com/BlinkDL/RWKV-LM/blob/d4c42b2cac10f8f3896ce153e2310dc763662b7a/RWKV-v7/rwkv_v7_demo_fast.py#L238 + ######## + w = (-w.float().exp()).exp() + + # wkv_state_in = torch.zeros(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE, dtype=torch.float,device=w.device) + vk_state = wkv_state_in.float() + + a = -kk + b = kk * iclr + + for t in range(SEQ_LEN): + r_, w_, k_, v_, a_, b_= r[:,t].float(), w[:,t].float(), k[:,t].float(), v[:,t].float(), a[:,t].float(), b[:,t].float() + # ab = (-kk_).view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1) @ (kk_*a_).view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE) + vk = v_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1) @ k_.view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE) + vk_state = (vk_state * w_.view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE).float() + vk_state @ a_.float().view(BATCH_SIZE, N_HEAD,HEAD_SIZE,1) @ b_.view(BATCH_SIZE, N_HEAD,1,HEAD_SIZE) + vk.float()) + xx[:,t] = ((vk_state @ r_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1)).view(BATCH_SIZE,N_HEAD*HEAD_SIZE)).to(dtype=xx.dtype) + + wkv_state_out = vk_state.to(dtype=wkv_state_in.dtype) + return xx, wkv_state_out +#################################################################################################### + +def rwkv7_attn_pytorch_chunk( + r,w,k,v, kk,a, + BATCH_SIZE, N_HEAD, HEAD_SIZE, + xx, wkv_state_in, + offset=0, chunk_size=16 +): + ''' + Chunked version of the RWKV7 attention, for better performance. 
+ If the chunk size is less then 128, this is generally compilable + + This is used by the triton/cuda implement, for the remaining % 16 chunks + ''' + ######## pure pytorch method + # See: https://github.com/BlinkDL/RWKV-LM/blob/d4c42b2cac10f8f3896ce153e2310dc763662b7a/RWKV-v7/rwkv_v7_demo_fast.py#L238 + ######## + vk_state = wkv_state_in.float() + for i in range(chunk_size): + t = offset + i + r_, w_, k_, v_, kk_, a_ = r[:,t], w[:,t], k[:,t], v[:,t], kk[:,t], a[:,t] + vk = v_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1) @ k_.view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE) + ab = (-kk_).view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1) @ (kk_*a_).view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE) + vk_state = (vk_state * w_.view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE).float() + vk_state @ ab.float() + vk.float()) + xx[:,t] = (vk_state.to(dtype=xx.dtype) @ r_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1)).view(BATCH_SIZE,N_HEAD*HEAD_SIZE) + wkv_state_out = vk_state.to(dtype=wkv_state_in.dtype) + return xx, wkv_state_out + + +def rwkv7_attn_pytorch_v2_chunk_w_compile_break( + r,w,k,v, kk,a, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + ''' + Chunked version of the RWKV7 attention, for better performance + ''' + full_vk_ = v.view(BATCH_SIZE,SEQ_LEN,N_HEAD, HEAD_SIZE,1) @ k.view(BATCH_SIZE,SEQ_LEN,N_HEAD, 1,HEAD_SIZE) + full_iclr_ = (kk * a).view(BATCH_SIZE,SEQ_LEN,N_HEAD,1,HEAD_SIZE) + full_ab = (-kk).view(BATCH_SIZE,SEQ_LEN,N_HEAD, HEAD_SIZE,1) @ full_iclr_ + + wkv_xx = torch.empty(BATCH_SIZE,SEQ_LEN,N_HEAD,HEAD_SIZE,HEAD_SIZE, dtype=xx.dtype, device=xx.device) + wkv_xx, wkv_state_out = rwkv7_attn_pytorch_v2_inner_w_compile_break( + r,w, + full_vk_, full_ab, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + wkv_xx, wkv_state_in + # xx, wkv_state_in + ) + + # if BATCH_SIZE != 1: + # print("BATCH_SIZE != 1 : ", BATCH_SIZE) + # if SEQ_LEN != 256: + # print("SEQ_LEN != 256 : ", SEQ_LEN) + + # xx[:,t] = ((wkv_state.to(dtype=xx.dtype) @ r_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1)).view(BATCH_SIZE,N_HEAD*HEAD_SIZE)) + xx[:] = (wkv_xx.to(dtype=xx.dtype) @ r.view(BATCH_SIZE,SEQ_LEN,N_HEAD,HEAD_SIZE,1)).view(BATCH_SIZE,SEQ_LEN,N_HEAD*HEAD_SIZE) + + return xx, wkv_state_out + +@torch.compiler.disable() +def rwkv7_attn_pytorch_v2_inner_w_compile_break( + r, w, + full_vk_, full_ab, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + ''' + Isolated sub-function with no compilation + ''' + return rwkv7_attn_pytorch_v2_inner_jit( + r, w, + full_vk_, full_ab, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in + ) + +# @torch.compile(fullgraph=True) +@torch.jit.script +def rwkv7_attn_pytorch_v2_inner_jit( + r, w, + full_vk_, full_ab, + BATCH_SIZE:int, SEQ_LEN:int, N_HEAD:int, HEAD_SIZE:int, + wkv_xx, wkv_state_in +): + ''' + Isolated sub-function with JIT + ''' + # wkv_xx = torch.zeros(BATCH_SIZE,SEQ_LEN,N_HEAD,HEAD_SIZE,HEAD_SIZE, dtype=xx.dtype,device=xx.device) + # wkv_state_in = torch.zeros(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE, dtype=torch.float,device=w.device) + wkv_state = wkv_state_in + for t in range(SEQ_LEN): + # r_ = r[:,t] + # w_ = w[:,t] + # vk = full_vk_[:,t].view(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE) + # ab = full_ab[:,t].view(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE) + + wkv_state = (wkv_state * w[:,t].view(BATCH_SIZE,N_HEAD,1,HEAD_SIZE).float() + wkv_state @ full_ab[:,t].view(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE).float() + full_vk_[:,t].view(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE).float()).clone() + wkv_xx[:,t] = wkv_state.to(dtype=w.dtype) + return wkv_xx, wkv_state + # xx[:,t] = ((wkv_state.to(dtype=xx.dtype) @ 
r_.view(BATCH_SIZE,N_HEAD,HEAD_SIZE,1)).view(BATCH_SIZE,N_HEAD*HEAD_SIZE)) + # return xx, wkv_state + + + # ---------------- + # block/kernel/rwkv7_attn_cuda.py + # ---------------- + import torch, os, time + # from .rwkv7_attn_pytorch import rwkv7_attn_pytorch_chunk + + #################################################################################################### + # Stateless reference implementation + #################################################################################################### + + def load_ref_wkv_cuda_kernel(CHUNK_LEN = 16, HEAD_SIZE = 64): + from torch.utils.cpp_extension import load + + # load_name = f"wind_backstepping_C{HEAD_SIZE}_L{CHUNK_LEN}" + load_name = "wind_backstepping" + load_file = "wkv7" + + # Check if the load_name is already loaded + if load_name in torch.ops: + return torch.ops.wind_backstepping + + # Log a warning on usage of the reference implementation + print("[WARNING] Reference CUDA kernel does not support input RWKV state, and is used only for training/validation purposes") + + # Get this script's file path, to compute the cuda path + this_file_path = os.path.dirname(os.path.abspath(__file__)) + + # # Get the device compute capability + # cuda_device = torch.cuda.current_device() + # compute_capability = torch.cuda.get_device_capability(cuda_device) + # compute_capability_str = f"{compute_capability[0]}{compute_capability[1]}" + # print("[INFO] Using compute capability:", compute_capability_str) + + # Load the kernel. There is some weird edge condition in compilation, + # where catching the exception and trying again sometimes works + flags = ['-res-usage', f'-D_C_={HEAD_SIZE}', f"-D_CHUNK_LEN_={CHUNK_LEN}", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization"] # + try: + load(name=load_name, sources=[f'{this_file_path}/cuda/{load_file}_cuda.cu', f'{this_file_path}/cuda/{load_file}_op.cpp'], is_python_module=False, verbose=True, extra_cuda_cflags=flags) + except Exception as e: + print("[WARNING] Failed to load the kernel, trying again (sometimes the compiler has a weird race condition)...") + time.sleep(2) # Somehow this works, with a minor compilation error that passes on subsequent reruns + load(name=load_name, sources=[f'{this_file_path}/cuda/{load_file}_cuda.cu', f'{this_file_path}/cuda/{load_file}_op.cpp'], is_python_module=False, verbose=True, extra_cuda_cflags=flags) + + # Return the loaded kernel + return torch.ops.wind_backstepping + + @torch.compiler.disable() + def ref_wkv_cuda_forward(w,q,k,v,z,b, y,s,sa): + torch.ops.wind_backstepping.forward(w,q,k,v,z,b, y,s,sa) + + @torch.compiler.disable() + def ref_wkv_cuda_backward(w,q,k,v,z,b, dy,s,sa, dw,dq,dk,dv,dz,db): + torch.ops.wind_backstepping.backward(w,q,k,v,z,b, dy,s,sa, dw,dq,dk,dv,dz,db) + + class RefCudaWindBackstepping(torch.autograd.Function): + @staticmethod + def forward(ctx, w,q,k,v,z,b): + CHUNK_LEN=16 + B,T,H,C = w.shape + assert T%CHUNK_LEN == 0 + assert all(i.dtype==torch.bfloat16 for i in [w,q,k,v,z,b]) + assert all(i.is_contiguous() for i in [w,q,k,v,z,b]) + y = torch.empty_like(v) + s = torch.empty(B,H,T//CHUNK_LEN,C,C, dtype=torch.float32,device=w.device) + sa = torch.empty(B,T,H,C, dtype=torch.float32,device=w.device) + ref_wkv_cuda_forward(w,q,k,v,z,b, y,s,sa) + ctx.save_for_backward(w,q,k,v,z,b,s,sa) + return y + @staticmethod + def backward(ctx, dy): + assert all(i.dtype==torch.bfloat16 for i in [dy]) + assert all(i.is_contiguous() for i in [dy]) + w,q,k,v,z,b,s,sa = ctx.saved_tensors + dw,dq,dk,dv,dz,db = [torch.empty_like(x) for x in 
[w,q,k,v,z,b]] + ref_wkv_cuda_backward(w,q,k,v,z,b, dy,s,sa, dw,dq,dk,dv,dz,db) + return dw,dq,dk,dv,dz,db + +@torch.compiler.disable() + def rwkv7_attn_cuda_ref(q,w,k,v, kk,iclr, HEAD_SIZE=64, s0=None): + # Preload the kernel + load_ref_wkv_cuda_kernel() + + # Get the shape + B,T,HC = w.shape + C = HEAD_SIZE + H = HC//C + + # Assert that the chunk is a multiple of 16 + assert T % 16 == 0, 'reference cuda, only works in multiple of 16' + + # Initialize the state, if not provided - for compatibility (THE STATE IS NOT UPDATED) + s0 = torch.zeros(B,H,C,C, dtype=torch.float,device=w.device) if s0 is None else s0 + + # Handling the cuda kernel + q,w,k,v,a,b = [i.view(B,T,H,C) for i in [q,w,k,v,(-kk),(kk*iclr)]] + + # Forward with backprop + xx = RefCudaWindBackstepping.apply(w,q,k,v,a,b) + return xx.view(B,T,HC), s0.view(B,H,C,C) + +#################################################################################################### +# State-based cuda code +#################################################################################################### + +def load_wkv_cuda_kernel(CHUNK_LEN = 16, HEAD_SIZE = 64): + from torch.utils.cpp_extension import load + + # load_name = f"wind_backstepping_C{HEAD_SIZE}_L{CHUNK_LEN}" + load_name = "state_wind_backstepping" + load_file = "state_wkv7" + + # Check if the load_name is already loaded + if load_name in torch.ops: + return torch.ops.state_wind_backstepping + + # Get this script's file path, to compute the cuda path + this_file_path = os.path.dirname(os.path.abspath(__file__)) + + # Load the kernel. There is some weird edge condition in compilation, + # where catching the exception and trying again sometimes works + flags = ['-res-usage', f'-D_C_={HEAD_SIZE}', f"-D_CHUNK_LEN_={CHUNK_LEN}", "--use_fast_math", "-O3", "-Xptxas -O3", "--extra-device-vectorization"] # + try: + load(name=load_name, sources=[f'{this_file_path}/cuda/{load_file}_cuda.cu', f'{this_file_path}/cuda/{load_file}_op.cpp'], is_python_module=False, verbose=True, extra_cuda_cflags=flags) + except Exception as e: + print("[WARNING] Failed to load the kernel, trying again (sometimes the compiler has a weird race condition)...") + time.sleep(2) # Somehow this works, with a minor compilation error that passes on subsequent reruns + load(name=load_name, sources=[f'{this_file_path}/cuda/{load_file}_cuda.cu', f'{this_file_path}/cuda/{load_file}_op.cpp'], is_python_module=False, verbose=True, extra_cuda_cflags=flags) + + # Return the loaded kernel + return torch.ops.state_wind_backstepping + +@torch.compiler.disable() +def wkv_cuda_forward(state, w,q,k,v,z,b, y,s,sa): + torch.ops.state_wind_backstepping.forward(state, w,q,k,v,z,b, y,s,sa) + +@torch.compiler.disable() +def wkv_cuda_backward(state, w,q,k,v,z,b, dy,s,sa, dw,dq,dk,dv,dz,db): + torch.ops.state_wind_backstepping.backward(state, w,q,k,v,z,b, dy,s,sa, dw,dq,dk,dv,dz,db) + +class CudaWindBackstepping(torch.autograd.Function): + @staticmethod + def forward(ctx, s0, w,q,k,v,z,b): + CHUNK_LEN=16 + B,T,H,C = w.shape + assert T%CHUNK_LEN == 0 + assert all(i.dtype==torch.bfloat16 for i in [w,q,k,v,z,b]) + assert all(i.is_contiguous() for i in [w,q,k,v,z,b]) + y = torch.empty_like(v) + s = torch.empty(B,H,T//CHUNK_LEN,C,C, dtype=torch.float32,device=w.device) + sa = torch.empty(B,T,H,C, dtype=torch.float32,device=w.device) + sOri = s0.clone() + wkv_cuda_forward(s0, w,q,k,v,z,b, y,s,sa) + ctx.save_for_backward(sOri, w,q,k,v,z,b,s,sa) + return y + @staticmethod + def backward(ctx, dy): + assert all(i.dtype==torch.bfloat16 for i in [dy]) + assert 
all(i.is_contiguous() for i in [dy]) + state,w,q,k,v,z,b,s,sa = ctx.saved_tensors + dS0,dw,dq,dk,dv,dz,db = [torch.empty_like(x) for x in [state,w,q,k,v,z,b]] + wkv_cuda_backward(state, w,q,k,v,z,b, dy,s,sa, dw,dq,dk,dv,dz,db) + return dS0,dw,dq,dk,dv,dz,db + +@torch.compiler.disable() +def rwkv7_attn_cuda(r,w,k,v, kk,iclr, HEAD_SIZE=64, s0=None): + # Preload the kernel + load_wkv_cuda_kernel() + + # Get the shape + B,T,HC = w.shape + + # Check if the chunk is multiple of 16 + chunk_remainder = T % 16 + + # Initialize the state + C = HEAD_SIZE + H = HC//C + + # Initialize the state + s0 = torch.zeros(B,H,C,C, dtype=torch.float,device=w.device) if s0 is None else s0 + sT = s0.to(dtype=torch.float) + + # Optimize the call, if chunk is multiple of 16 + if chunk_remainder == 0: + chunk_xx, chunk_sT = rwkv7_attn_cuda_chunk(r,w,k,v, kk,iclr, HEAD_SIZE, sT) + return chunk_xx, chunk_sT.to(dtype=s0.dtype) + + # Compute the number of chunks + chunks = T // 16 + si = chunks * 16 + + # Get the chunked output + chunk_xx, chunk_sT = rwkv7_attn_cuda_chunk( + r[:,:si],w[:,:si],k[:,:si],v[:,:si], kk[:,:si],iclr[:,:si], + HEAD_SIZE, s0 + ) + + # Get the remainder + remain_xx, last_sT = rwkv7_attn_pytorch_chunk( + r[:,si:],torch.exp(-torch.exp(w[:,si:])),k[:,si:],v[:,si:], kk[:,si:],iclr[:,si:], + B, H, C, + torch.zeros(B, chunk_remainder, HC, device=w.device, dtype=w.dtype), + chunk_sT, chunk_size=chunk_remainder + ) + + # Concatenate and return results + return torch.cat([chunk_xx.to(dtype=w.dtype), remain_xx.to(dtype=w.dtype)], dim=1), last_sT.to(dtype=s0.dtype) + + +def rwkv7_attn_cuda_chunk(r,w,k,v, kk,iclr, HEAD_SIZE=64, s0=None): + ''' + Triton implementation running in blocks of 16 (hardcoded requirement for the kernel) + ''' + B,T,HC = w.shape + assert T % 16 == 0, 'pure cuda, only works in multiple of 16' + C = HEAD_SIZE + H = HC//C + + # Handling the cuda kernel + a,b = -kk, (kk*iclr) + r,w,k,v,a,b = [i.view(B,T,H,C) for i in [r,w,k,v,a,b]] + + if s0 is None: + s1 = torch.zeros(B,H,C,C, dtype=torch.float,device=w.device) + else: + s1 = s0.clone() + + # Forward with backprop + xx = CudaWindBackstepping.apply(s1,w,r,k,v,a,b) + return xx.view(B,T,HC), s1.view(B,H,C,C) + + +# ---------------- +# block/kernel/rwkv7_attn_fla.py +# ---------------- +def rwkv7_attn_fla( + r,w,k,v, kk,iclr, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + from fla.ops.rwkv7.chunk import chunk_rwkv7 + + # Preprocessing the FLA + r,w,k,v,a,b = [i.view(BATCH_SIZE,SEQ_LEN,N_HEAD,-1) for i in [r,w,k,v,-kk,(kk*iclr)]] + log_w = -w.float().exp() + + # Run the FLA + output, vk_state = chunk_rwkv7(r=r, log_w=log_w, k=k, v=v, a=a, b=b, initial_state=wkv_state_in.float(), output_final_state=True) + return output, vk_state.to(dtype=wkv_state_in.dtype) + +def rwkv7_attn_fused_reccurent_fla( + r,w,k,v, kk,iclr, + BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, + xx, wkv_state_in +): + from fla.ops.rwkv7.fused_recurrent import fused_recurrent_rwkv7 + + # Preprocessing the FLA + r,w,k,v,a,b = [i.view(BATCH_SIZE,SEQ_LEN,N_HEAD,-1) for i in [r,w,k,v,-kk,(kk*iclr)]] + log_w = -w.float().exp() + + # Run the FLA + output, vk_state = fused_recurrent_rwkv7(r=r, log_w=log_w, k=k, v=v, a=a, b=b, initial_state=wkv_state_in.float(), output_final_state=True) + return output, vk_state.to(dtype=wkv_state_in.dtype) + +# ---------------- +# block/kernel/rwkv7_attn_triton.py +# ---------------- +import torch +import torch as th +import triton +import triton.language as tl + 
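# ----------------
# Usage sketch (illustrative only, not part of the generated kernels): shows the
# tensor shapes the pure-pytorch kernel entry point above expects. The B/T/H/C
# values and the random inputs are assumptions for demonstration, and the decay
# `w` is passed pre-transformed as exp(-exp(w)), matching the remainder handling
# used by the cuda/triton wrappers in this file.
# ----------------
def _rwkv7_attn_pytorch_usage_sketch():
    B, T, H, C = 1, 40, 2, 64              # batch, seq len, heads, head size (assumed)
    HC = H * C
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    r, k, v, iclr = (torch.randn(B, T, HC, device=dev) * 0.1 for _ in range(4))
    # kk is expected to be a per-head L2-normalised key variant (assumed here)
    kk = torch.nn.functional.normalize(torch.randn(B, T, H, C, device=dev), dim=-1).view(B, T, HC)
    w = torch.exp(-torch.exp(torch.randn(B, T, HC, device=dev)))   # decay values in (0,1)

    xx = torch.zeros(B, T, HC, device=dev)                              # output buffer [B,T,H*C]
    wkv_state = torch.zeros(B, H, C, C, dtype=torch.float, device=dev)  # wkv state [B,H,C,C]

    out, wkv_state_out = rwkv7_attn_pytorch(
        r, w, k, v, kk, iclr,
        B, T, H, C,
        xx, wkv_state,
    )
    return out.shape, wkv_state_out.shape   # (B, T, H*C), (B, H, C, C)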
+#################################################################################################### +# Triton specific coding (aka mostly songlin & Johan Sokrates Wind stuff) +# +# Copyright (c) 2024, Johan Sokrates Wind, licensed under MIT +# https://github.com/johanwind/wind_rwkv/blob/main/LICENSE +#################################################################################################### + +# ------------------------- +# Triton "smallhead" and "bighead" common code +# ------------------------- + +@triton.jit +def IND3(a,b,c,nb,nc): + return (a*nb+b)*nc+c +@triton.jit +def IND4(a,b,c,d,nb,nc,nd): + return ((a*nb+b)*nc+c)*nd+d +@triton.jit +def IND5(a,b,c,d,e,nb,nc,nd,ne): + return (((a*nb+b)*nc+c)*nd+d)*ne+e + +@triton.jit +def _prod(a,b): return a*b + +# inv(I-A) where A is a strictly lower triangular nxn matrix +@triton.jit +def tri_minv(A, n:tl.constexpr, prec:tl.constexpr): + i = tl.arange(0,n) + prod = (i[None,:]==i[:,None]).to(tl.float32) + for j in range(n-1): + prod += tl_dot(prec, prod, (A*((i[None,:]==j)*(i[:,None]>i[None,:]))).trans()) + return prod.trans() + +@triton.jit +def tl_dot(prec:tl.constexpr, a, b): + if prec == 'fp32': + return tl.dot(a.to(tl.float32),b.trans().to(tl.float32).trans(), allow_tf32=False) + elif prec == 'tf32': + return tl.dot(a.to(tl.float32),b.trans().to(tl.float32).trans(), allow_tf32=True) + elif prec == 'bf16': + return tl.dot(a.to(tl.bfloat16),b.trans().to(tl.bfloat16).trans(), allow_tf32=True) + else: + tl.static_assert(False) + +# ------------------------- +# Triton "smallhead" code +# ------------------------- + +@triton.jit +def fw_attn_triton(w_,q_,k_,v_,a_,b_, s0_,y_,s_,sT_, B:tl.constexpr,T:tl.constexpr,H:tl.constexpr,C:tl.constexpr,dT:tl.constexpr, prec:tl.constexpr): + bi = tl.program_id(1) + hi = tl.program_id(0) + + i = tl.arange(0,C)[None,:] + state = tl.load(s0_+IND4(bi,hi,i.trans(),i, H,C,C)).to(tl.float32) + for t0 in range(T//dT): + t = t0*dT+tl.arange(0,dT)[:,None] + sw = tl.load(w_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sq = tl.load(q_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sk = tl.load(k_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sv = tl.load(v_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sa = tl.load(a_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sb = tl.load(b_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + + w = (-sw.exp()).exp() + fw = tl.reduce(w, 0, _prod, keep_dims=True) + incl_pref = tl.cumprod(w,axis=0) + non_incl_pref = incl_pref / w + inv_incl_pref = 1 / incl_pref + + wq = sq * incl_pref + wa = sa * non_incl_pref + kwi = sk * inv_incl_pref + bwi = sb * inv_incl_pref + + mask1 = (t > t.trans()) + ab = tl_dot(prec, wa, bwi.trans()) * mask1 + ak = tl_dot(prec, wa, kwi.trans()) * mask1 + + ab_inv = tri_minv(ab, dT, prec) + + ab_u = tl_dot(prec, ak, sv) + tl_dot(prec, wa, state.trans()) + u = tl_dot(prec, ab_inv, ab_u) + mask2 = (t >= t.trans()) + qk = tl_dot(prec, wq, kwi.trans()) * mask2 + qb = tl_dot(prec, wq, bwi.trans()) * mask2 + yy = tl_dot(prec, qk, sv) + tl_dot(prec, qb, u) + tl_dot(prec, wq, state.trans()) + tl.store(y_+IND4(bi,t,hi,i, T,H,C), yy.to(tl.bfloat16)) + + tl.store(s_+IND5(bi,hi,t0,i.trans(),i, H,T//dT,C,C), state.to(tl.float32)) + state = state * fw + tl_dot(prec, sv.trans(), kwi*fw) + tl_dot(prec, u.trans(), bwi*fw) + tl.store(sT_+IND4(bi,hi,i.trans(),i, H,C,C), state.to(tl.bfloat16)) + +@triton.jit +def bw_attn_triton(w_,q_,k_,v_,a_,b_, dy_,s_,dsT_, dw_,dq_,dk_,dv_,da_,db_,ds0_, B:tl.constexpr,T:tl.constexpr,H:tl.constexpr,C:tl.constexpr,dT:tl.constexpr, prec:tl.constexpr): + bi = 
tl.program_id(1) + hi = tl.program_id(0) + + i = tl.arange(0,C)[None,:] + dstate = tl.load(dsT_+IND4(bi,hi,i.trans(),i, H,C,C)).to(tl.float32) + + for t0 in range(T//dT-1,-1,-1): + t = t0*dT+tl.arange(0,dT)[:,None] + + state = tl.load(s_+IND5(bi,hi,t0,i.trans(),i, H,T//dT,C,C)).to(tl.float32) + + sw = tl.load(w_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sq = tl.load(q_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sk = tl.load(k_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sv = tl.load(v_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sa = tl.load(a_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sb = tl.load(b_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sdy = tl.load(dy_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + + dw_fac = -sw.exp() + w = dw_fac.exp() + fw = tl.reduce(w, 0, _prod, keep_dims=True) + incl_pref = tl.cumprod(w,axis=0) + non_incl_pref = incl_pref / w + inv_incl_pref = 1 / incl_pref + + wq = sq * incl_pref + wa = sa * non_incl_pref + kwi = sk * inv_incl_pref + bwi = sb * inv_incl_pref + + mask1 = (t > t.trans()) + ab = tl_dot(prec, wa, bwi.trans()) * mask1 + ak = tl_dot(prec, wa, kwi.trans()) * mask1 + + ab_inv = tri_minv(ab, dT, prec) + + ab_u = tl_dot(prec, ak, sv) + tl_dot(prec, wa, state.trans()) + u = tl_dot(prec, ab_inv, ab_u) + mask2 = (t >= t.trans()) + qk = tl_dot(prec, wq, kwi.trans()) * mask2 + qb = tl_dot(prec, wq, bwi.trans()) * mask2 + + du = tl_dot(prec, qb.trans(), sdy) + tl_dot(prec, bwi*fw, dstate.trans()) + dab_u = tl_dot(prec, ab_inv.trans(), du) + + dv = tl_dot(prec, qk.trans(), sdy) + tl_dot(prec, kwi*fw, dstate.trans()) + tl_dot(prec, ak.trans(), dab_u) + tl.store(dv_+IND4(bi,t,hi,i, T,H,C), dv.to(tl.bfloat16)) + + dab = tl_dot(prec, tl_dot(prec, ab_inv.trans(), du), u.trans()) * mask1 + dak = tl_dot(prec, dab_u, sv.trans()) * mask1 + dab_u_state = tl_dot(prec, dab_u, state) + da = non_incl_pref * (tl_dot(prec, dab, bwi) + tl_dot(prec, dak, kwi) + dab_u_state) + tl.store(da_+IND4(bi,t,hi,i, T,H,C), da.to(tl.bfloat16)) + + dqb = tl_dot(prec, sdy, u.trans()) * mask2 + dqk = tl_dot(prec, sdy, sv.trans()) * mask2 + dy_state = tl_dot(prec, sdy, state) + dq = incl_pref * (tl_dot(prec, dqb, bwi) + tl_dot(prec, dqk, kwi) + dy_state) + tl.store(dq_+IND4(bi,t,hi,i, T,H,C), dq.to(tl.bfloat16)) + + fw_u_dstate = fw * tl_dot(prec, u, dstate) + db = inv_incl_pref * (tl_dot(prec, dab.trans(), wa) + tl_dot(prec, dqb.trans(), wq) + fw_u_dstate) + tl.store(db_+IND4(bi,t,hi,i, T,H,C), db.to(tl.bfloat16)) + + fw_v_dstate = fw * tl_dot(prec, sv, dstate) + dk = inv_incl_pref * (tl_dot(prec, dak.trans(), wa) + tl_dot(prec, dqk.trans(), wq) + fw_v_dstate) + tl.store(dk_+IND4(bi,t,hi,i, T,H,C), dk.to(tl.bfloat16)) + + dw0 = fw * tl.sum(state*dstate, axis=0,keep_dims=True) + for k in range(t0*dT,t0*dT+dT): + lmask = (tk) + A += (tl_dot(prec, dqb*lmask, bwi) + tl_dot(prec, dqk*lmask, kwi)) * wq * (t>=k) + A += (fw_v_dstate*kwi + fw_u_dstate*bwi) * (tk) + dy_state*wq * (t>=k) + dw = tl.sum(A, axis=0,keep_dims=True) + dw0 + + wk = tl.load(w_+IND4(bi,k,hi,i, T,H,C)).to(tl.float32) + dw *= -wk.exp() + tl.store(dw_+IND4(bi,k,hi,i, T,H,C), dw.to(tl.bfloat16)) + + dstate = dstate * fw + tl_dot(prec, sdy.trans(), wq) + tl_dot(prec, dab_u.trans(), wa) + tl.store(ds0_+IND4(bi,hi,i.trans(),i, H,C,C), dstate.to(tl.bfloat16)) + +class TritonRWKV7(th.autograd.Function): + @staticmethod + def forward(ctx, w,q,k,v,z,b,s0, dot_prec): + K = 16 + B,T,H,C = w.shape + s0 = th.zeros(B,H,C,C, dtype=w.dtype,device=w.device) if s0 is None else s0 + y = th.empty_like(v) + sT = th.empty_like(s0) + s = th.zeros(B,H,T//K,C,C, 
dtype=th.float32,device=w.device) + fw_attn_triton[(H,B)](w,q,k,v,z,b, s0,y,s,sT, B,T,H,C,K, dot_prec) + ctx.dot_prec = dot_prec + ctx.save_for_backward(w,q,k,v,z,b,s) + return y, sT + @staticmethod + def backward(ctx, dy, dsT): + K = 16 + w,q,k,v,z,b,s = ctx.saved_tensors + B,T,H,C = w.shape + dw,dq,dk,dv,dz,db,ds0 = [th.empty_like(x) for x in [w,q,k,v,z,b,dsT]] + bw_attn_triton[(H,B)](w,q,k,v,z,b, dy,s,dsT, dw,dq,dk,dv,dz,db,ds0, B,T,H,C,K, ctx.dot_prec) + return dw,dq,dk,dv,dz,db,ds0,None + +# ------------------------- +# Triton "bighead" code +# ------------------------- + +@triton.autotune(configs=[triton.Config({'dC': dC}, num_stages=1) for dC in [16,32,64]], key=['T','H','C','dT','prec']) +@triton.jit +def fw_attn_triton_bighead(w_,q_,k_,v_,a_,b_, s0_,y_,s_,sT_, wq_,wa_,kwi_,bwi_,fw_, B:tl.constexpr,T:tl.constexpr,H:tl.constexpr,C:tl.constexpr,dT:tl.constexpr, prec:tl.constexpr, dC:tl.constexpr): + tl.static_assert(C%dC == 0) + bi = tl.program_id(1) + hi = tl.program_id(0) + for i0 in range(0,C,dC): + i = i0+tl.arange(0,dC)[None,:] + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + state = tl.load(s0_+IND4(bi,hi,i.trans(),j, H,C,C)).to(tl.float32) + tl.store(s_+IND5(bi,hi,0,i.trans(),j, H,T//dT,C,C), state.to(tl.float32)) + + for t0 in range(T//dT): + dt = tl.arange(0,dT)[:,None] + t = t0*dT+dt + tl.debug_barrier() + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + sw = tl.load(w_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sq = tl.load(q_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sk = tl.load(k_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sa = tl.load(a_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sb = tl.load(b_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + + w = (-sw.exp()).exp() + fw = tl.reduce(w, 0, _prod, keep_dims=True) + incl_pref = tl.cumprod(w,axis=0) + non_incl_pref = incl_pref / w + inv_incl_pref = 1 / incl_pref + + wq = sq * incl_pref + wa = sa * non_incl_pref + kwi = sk * inv_incl_pref + bwi = sb * inv_incl_pref + + tl.store(wq_+IND4(bi,hi,dt,j, H,dT,C), wq.to(tl.float32)) + tl.store(wa_+IND4(bi,hi,dt,j, H,dT,C), wa.to(tl.float32)) + tl.store(kwi_+IND4(bi,hi,dt,j, H,dT,C), kwi.to(tl.float32)) + tl.store(bwi_+IND4(bi,hi,dt,j, H,dT,C), bwi.to(tl.float32)) + tl.store(fw_+IND3(bi,hi,j, H,C), fw.to(tl.float32)) + tl.debug_barrier() + + ab = tl.zeros((dT,dT), tl.float32) + ak = tl.zeros((dT,dT), tl.float32) + qb = tl.zeros((dT,dT), tl.float32) + qk = tl.zeros((dT,dT), tl.float32) + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + + wa = tl.load(wa_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + wq = tl.load(wq_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + bwi = tl.load(bwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + kwi = tl.load(kwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + + sa = tl.load(a_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sb = tl.load(b_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + + ab += tl_dot(prec, wa, bwi.trans()) + ak += tl_dot(prec, wa, kwi.trans()) + qb += tl_dot(prec, wq, bwi.trans()) + qk += tl_dot(prec, wq, kwi.trans()) + + mask1 = (t > t.trans()) + mask2 = (t >= t.trans()) + ab *= mask1 + ak *= mask1 + qb *= mask2 + qk *= mask2 + + ab_inv = tri_minv(ab, dT, prec) + + for i0 in range(0,C,dC): + i = i0+tl.arange(0,dC)[None,:] + sv = tl.load(v_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + + wa_state = tl.zeros((dT,dC), tl.float32) + wq_state = tl.zeros((dT,dC), tl.float32) + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + state = tl.load(s_+IND5(bi,hi,t0,i.trans(),j, H,T//dT,C,C)).to(tl.float32) + wa = tl.load(wa_+IND4(bi,hi,dt,j, 
H,dT,C)).to(tl.float32) + wq = tl.load(wq_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + wa_state += tl_dot(prec, wa, state.trans()) + wq_state += tl_dot(prec, wq, state.trans()) + + ab_u = tl_dot(prec, ak, sv) + wa_state + u = tl_dot(prec, ab_inv, ab_u) + yy = tl_dot(prec, qk, sv) + tl_dot(prec, qb, u) + wq_state + tl.store(y_+IND4(bi,t,hi,i, T,H,C), yy.to(tl.bfloat16)) + + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + state = tl.load(s_+IND5(bi,hi,t0,i.trans(),j, H,T//dT,C,C)).to(tl.float32) + kwi = tl.load(kwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + bwi = tl.load(bwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + fw = tl.load(fw_+IND3(bi,hi,j, H,C)) + + state = state * fw + tl_dot(prec, sv.trans(), kwi*fw) + tl_dot(prec, u.trans(), bwi*fw) + + if t0+1 < T//dT: + tl.store(s_+IND5(bi,hi,t0+1,i.trans(),j, H,T//dT,C,C), state.to(tl.float32)) + else: + tl.store(sT_+IND4(bi,hi,i.trans(),j, H,C,C), state.to(tl.bfloat16)) + + +@triton.autotune(configs=[triton.Config({'dC': dC}, num_stages=1) for dC in [16,32,64]], key=['T','H','C','dT','prec']) +@triton.jit +def bw_attn_triton_bighead(w_,q_,k_,v_,a_,b_, dy_,s_,dsT_,ds_, dw_,dq_,dk_,dv_,da_,db_,ds0_, wq_,wa_,kwi_,bwi_,fw_,u_,dab_u_, B:tl.constexpr,T:tl.constexpr,H:tl.constexpr,C:tl.constexpr,dT:tl.constexpr, prec:tl.constexpr, dC:tl.constexpr): + tl.static_assert(C%dC == 0) + bi = tl.program_id(1) + hi = tl.program_id(0) + + for i0 in range(0,C,dC): + i = i0+tl.arange(0,dC)[None,:] + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + dstate = tl.load(dsT_+IND4(bi,hi,i.trans(),j, H,C,C)).to(tl.float32) + tl.store(ds_+IND4(bi,hi,i.trans(),j, H,C,C), dstate.to(tl.float32)) + + for t0 in range(T//dT-1,-1,-1): + dt = tl.arange(0,dT)[:,None] + t = t0*dT+dt + tl.debug_barrier() + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + sw = tl.load(w_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sq = tl.load(q_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sk = tl.load(k_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sa = tl.load(a_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sb = tl.load(b_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + + w = (-sw.exp()).exp() + fw = tl.reduce(w, 0, _prod, keep_dims=True) + incl_pref = tl.cumprod(w,axis=0) + non_incl_pref = incl_pref / w + inv_incl_pref = 1 / incl_pref + + wq = sq * incl_pref + wa = sa * non_incl_pref + kwi = sk * inv_incl_pref + bwi = sb * inv_incl_pref + + tl.store(wq_+IND4(bi,hi,dt,j, H,dT,C), wq.to(tl.float32)) + tl.store(wa_+IND4(bi,hi,dt,j, H,dT,C), wa.to(tl.float32)) + tl.store(kwi_+IND4(bi,hi,dt,j, H,dT,C), kwi.to(tl.float32)) + tl.store(bwi_+IND4(bi,hi,dt,j, H,dT,C), bwi.to(tl.float32)) + tl.store(fw_+IND3(bi,hi,j, H,C), fw.to(tl.float32)) + tl.debug_barrier() + + ab = tl.zeros((dT,dT), tl.float32) + ak = tl.zeros((dT,dT), tl.float32) + qb = tl.zeros((dT,dT), tl.float32) + qk = tl.zeros((dT,dT), tl.float32) + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + + wa = tl.load(wa_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + wq = tl.load(wq_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + bwi = tl.load(bwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + kwi = tl.load(kwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + + sa = tl.load(a_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + sb = tl.load(b_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + + ab += tl_dot(prec, wa, bwi.trans()) + ak += tl_dot(prec, wa, kwi.trans()) + qb += tl_dot(prec, wq, bwi.trans()) + qk += tl_dot(prec, wq, kwi.trans()) + + mask1 = (t > t.trans()) + mask2 = (t >= t.trans()) + ab *= mask1 + ak *= mask1 + qb *= mask2 + qk *= mask2 + + ab_inv 
= tri_minv(ab, dT, prec) + + dab = tl.zeros((dT,dT), tl.float32) + dak = tl.zeros((dT,dT), tl.float32) + dqb = tl.zeros((dT,dT), tl.float32) + dqk = tl.zeros((dT,dT), tl.float32) + + tl.debug_barrier() + for i0 in range(0,C,dC): + i = i0+tl.arange(0,dC)[None,:] + wa_state = tl.zeros((dT,dC), tl.float32) + bwi_dw_dstate = tl.zeros((dT,dC), tl.float32) + kwi_dw_dstate = tl.zeros((dT,dC), tl.float32) + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + state = tl.load(s_+IND5(bi,hi,t0,i.trans(),j, H,T//dT,C,C)).to(tl.float32) + dstate = tl.load(ds_+IND4(bi,hi,i.trans(),j, H,C,C)).to(tl.float32) + wa = tl.load(wa_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + bwi = tl.load(bwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + kwi = tl.load(kwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + fw = tl.load(fw_+IND3(bi,hi,j, H,C)) + + wa_state += tl_dot(prec, wa, state.trans()) + bwi_dw_dstate += tl_dot(prec, bwi*fw, dstate.trans()) + kwi_dw_dstate += tl_dot(prec, kwi*fw, dstate.trans()) + + sv = tl.load(v_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sdy = tl.load(dy_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + + ab_u = tl_dot(prec, ak, sv) + wa_state + u = tl_dot(prec, ab_inv, ab_u) + du = tl_dot(prec, qb.trans(), sdy) + bwi_dw_dstate + dab_u = tl_dot(prec, ab_inv.trans(), du) + + tl.store(u_+IND4(bi,hi,dt,i, H,dT,C), u.to(tl.float32)) + tl.store(dab_u_+IND4(bi,hi,dt,i, H,dT,C), dab_u.to(tl.float32)) + + dv = tl_dot(prec, qk.trans(), sdy) + kwi_dw_dstate + tl_dot(prec, ak.trans(), dab_u) + tl.store(dv_+IND4(bi,t,hi,i, T,H,C), dv.to(tl.bfloat16)) + + dab += tl_dot(prec, dab_u, u.trans()) * mask1 + dak += tl_dot(prec, dab_u, sv.trans()) * mask1 + dqb += tl_dot(prec, sdy, u.trans()) * mask2 + dqk += tl_dot(prec, sdy, sv.trans()) * mask2 + tl.debug_barrier() + + for j0 in range(0,C,dC): + j = j0+tl.arange(0,dC)[None,:] + + dy_state = tl.zeros((dT,dC), tl.float32) + dab_u_state = tl.zeros((dT,dC), tl.float32) + fw_u_dstate = tl.zeros((dT,dC), tl.float32) + fw_v_dstate = tl.zeros((dT,dC), tl.float32) + state_dstate = tl.zeros((1,dC), tl.float32) + + fw = tl.load(fw_+IND3(bi,hi,j, H,C)) + wa = tl.load(wa_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + wq = tl.load(wq_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + for i0 in range(0,C,dC): + i = i0+tl.arange(0,dC)[None,:] + + u = tl.load(u_+IND4(bi,hi,dt,i, H,dT,C)).to(tl.float32) + dab_u = tl.load(dab_u_+IND4(bi,hi,dt,i, H,dT,C)).to(tl.float32) + sv = tl.load(v_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + sdy = tl.load(dy_+IND4(bi,t,hi,i, T,H,C)).to(tl.float32) + + state = tl.load(s_+IND5(bi,hi,t0,i.trans(),j, H,T//dT,C,C)).to(tl.float32) + tl.debug_barrier() + dstate = tl.load(ds_+IND4(bi,hi,i.trans(),j, H,C,C)).to(tl.float32) + tl.debug_barrier() + + dab_u_state += tl_dot(prec, dab_u, state) + fw_u_dstate += fw * tl_dot(prec, u, dstate) + fw_v_dstate += fw * tl_dot(prec, sv, dstate) + dy_state += tl_dot(prec, sdy, state) + + state_dstate += tl.sum(state*dstate, axis=0,keep_dims=True) + + dstate = dstate * fw + tl_dot(prec, sdy.trans(), wq) + tl_dot(prec, dab_u.trans(), wa) + if t0 > 0: + tl.store(ds_+IND4(bi,hi,i.trans(),j, H,C,C), dstate.to(tl.float32)) + else: + tl.store(ds0_+IND4(bi,hi,i.trans(),j, H,C,C), dstate.to(tl.bfloat16)) + + sw = tl.load(w_+IND4(bi,t,hi,j, T,H,C)).to(tl.float32) + w = (-sw.exp()).exp() + incl_pref = tl.cumprod(w,axis=0) + non_incl_pref = incl_pref / w + inv_incl_pref = 1 / incl_pref + + bwi = tl.load(bwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + kwi = tl.load(kwi_+IND4(bi,hi,dt,j, H,dT,C)).to(tl.float32) + + da = non_incl_pref * 
(tl_dot(prec, dab, bwi) + tl_dot(prec, dak, kwi) + dab_u_state) + tl.store(da_+IND4(bi,t,hi,j, T,H,C), da.to(tl.bfloat16)) + + dq = incl_pref * (tl_dot(prec, dqb, bwi) + tl_dot(prec, dqk, kwi) + dy_state) + tl.store(dq_+IND4(bi,t,hi,j, T,H,C), dq.to(tl.bfloat16)) + + db = inv_incl_pref * (tl_dot(prec, dab.trans(), wa) + tl_dot(prec, dqb.trans(), wq) + fw_u_dstate) + tl.store(db_+IND4(bi,t,hi,j, T,H,C), db.to(tl.bfloat16)) + + dk = inv_incl_pref * (tl_dot(prec, dak.trans(), wa) + tl_dot(prec, dqk.trans(), wq) + fw_v_dstate) + tl.store(dk_+IND4(bi,t,hi,j, T,H,C), dk.to(tl.bfloat16)) + + dw0 = fw * state_dstate + for k in range(t0*dT,t0*dT+dT): + lmask = (tk) + A += (tl_dot(prec, dqb*lmask, bwi) + tl_dot(prec, dqk*lmask, kwi)) * wq * (t>=k) + A += (fw_v_dstate*kwi + fw_u_dstate*bwi) * (tk) + dy_state*wq * (t>=k) + dw = tl.sum(A, axis=0,keep_dims=True) + dw0 + + wk = tl.load(w_+IND4(bi,k,hi,j, T,H,C)).to(tl.float32) + dw *= -wk.exp() + tl.store(dw_+IND4(bi,k,hi,j, T,H,C), dw.to(tl.bfloat16)) + +class TritonBigheadRWKV7(th.autograd.Function): + @staticmethod + def forward(ctx, w,q,k,v,a,b,s0, dot_prec): + K = 16 + B,T,H,C = w.shape + assert T%K == 0 + assert C%16 == 0 + s0 = th.zeros(B,H,C,C, dtype=w.dtype,device=w.device) if s0 is None else s0 + y = th.empty_like(v) + sT = th.empty_like(s0) + s = th.zeros(B,H,T//K,C,C, dtype=th.float32,device=w.device) + wq,wa,kwi,bwi = [th.empty(B,H,K,C, dtype=th.float32,device=w.device) for i in range(4)] + fw = th.empty(B,H,C, dtype=th.float32,device=w.device) + fw_attn_triton_bighead[(H,B)](w,q,k,v,a,b, s0,y,s,sT, wq,wa,kwi,bwi,fw, B,T,H,C,K, dot_prec) + ctx.dot_prec = dot_prec + ctx.save_for_backward(w,q,k,v,a,b,s) + return y, sT + @staticmethod + def backward(ctx, dy, dsT): + K = 16 + w,q,k,v,a,b,s = ctx.saved_tensors + B,T,H,C = w.shape + dw,dq,dk,dv,da,db,ds0 = [th.empty_like(x) for x in [w,q,k,v,a,b,dsT]] + fw = th.empty(B,H,C, dtype=th.float32,device=w.device) + ds = th.empty(B,H,C,C, dtype=th.float32,device=w.device) + wq,wa,kwi,bwi,u,dab_u = [th.empty(B,H,K,C, dtype=th.float32,device=w.device) for i in range(6)] + bw_attn_triton_bighead[(H,B)](w,q,k,v,a,b, dy,s,dsT,ds, dw,dq,dk,dv,da,db,ds0, wq,wa,kwi,bwi,fw,u,dab_u, B,T,H,C,K, ctx.dot_prec) + return dw,dq,dk,dv,da,db,ds0,None + +#################################################################################################### +# Start of pytorch code +#################################################################################################### + +# from .rwkv7_attn_pytorch import rwkv7_attn_pytorch_chunk + +# ------------------------- +# Pytorch "smallhead" code +# ------------------------- + +def rwkv7_attn_triton(r,w,k,v, kk,iclr, HEAD_SIZE=64, dot_prec='fp32', s0=None): + B,T,HC = w.shape + + # Check if the chunk is multiple of 16 + chunk_remainder = T % 16 + + # Optimize the call, if chunk is multiple of 16 + if chunk_remainder == 0: + return rwkv7_attn_triton_chunk(r,w,k,v, kk,iclr, HEAD_SIZE, dot_prec, s0) + + # Initialize the state + C = HEAD_SIZE + H = HC//C + s0 = th.zeros(B,H,C,C, dtype=th.float,device=w.device) if s0 is None else s0 + + # Compute the number of chunks + chunks = T // 16 + si = chunks * 16 + + # Get the chunked output + chunk_xx, chunk_sT = rwkv7_attn_triton_chunk( + r[:,:si],w[:,:si],k[:,:si],v[:,:si], kk[:,:si],iclr[:,:si], + HEAD_SIZE, dot_prec, s0 + ) + + # Get the remainder + remain_xx, last_sT = rwkv7_attn_pytorch_chunk( + r[:,si:],torch.exp(-torch.exp(w[:,si:])),k[:,si:],v[:,si:], kk[:,si:],iclr[:,si:], + B, H, C, torch.zeros(B, chunk_remainder, HC, 
device=w.device, dtype=w.dtype), + chunk_sT, chunk_size=chunk_remainder + ) + + # Concatenate and return results + return torch.cat([chunk_xx.to(dtype=w.dtype), remain_xx.to(dtype=w.dtype)], dim=1), last_sT.to(dtype=s0.dtype) + + +def rwkv7_attn_triton_chunk(r,w,k,v, kk,iclr, HEAD_SIZE=64, dot_prec='fp32', s0=None): + ''' + Triton implementation running in blocks of 16 (hardcoded requirement for the kernel) + ''' + B,T,HC = w.shape + assert T % 16 == 0, 'pure triton, only works in multiple of 16' + C = HEAD_SIZE + H = HC//C + + # Moving the triton specific operations into the chunk steps + r,w,k,v,a,b = [i.view(B,T,H,C) for i in [r,w,k,v,-kk,(kk*iclr)]] + s0 = th.zeros(B,H,C,C, dtype=th.float,device=w.device) if s0 is None else s0 + xx, sT = TritonRWKV7.apply(w,r,k,v,a,b,s0,dot_prec) + return xx.view(B,T,HC), sT + +# ------------------------- +# Pytorch "bighead" code +# ------------------------- + +def rwkv7_attn_triton_bighead(r,w,k,v, kk,iclr, HEAD_SIZE=64, dot_prec='fp32', s0=None): + B,T,HC = w.shape + + # Check if the chunk is multiple of 16 + chunk_remainder = T % 16 + + # Optimize the call, if chunk is multiple of 16 + if chunk_remainder == 0: + return rwkv7_attn_triton_bighead_chunk(r,w,k,v, kk,iclr, HEAD_SIZE, dot_prec, s0) + + # Initialize the state + C = HEAD_SIZE + H = HC//C + s0 = th.zeros(B,H,C,C, dtype=th.float,device=w.device) if s0 is None else s0 + + # Compute the number of chunks + chunks = T // 16 + si = chunks * 16 + + # Get the chunked output + chunk_xx, chunk_sT = rwkv7_attn_triton_bighead_chunk( + r[:,:si],w[:,:si],k[:,:si],v[:,:si], kk[:,:si],iclr[:,:si], + HEAD_SIZE, dot_prec, s0 + ) + + # Get the remainder + remain_xx, last_sT = rwkv7_attn_pytorch_chunk( + r[:,si:],torch.exp(-torch.exp(w[:,si:])),k[:,si:],v[:,si:], kk[:,si:],iclr[:,si:], + B, H, C, torch.zeros(B, chunk_remainder, HC, device=w.device, dtype=w.dtype), + chunk_sT, chunk_size=chunk_remainder + ) + + # Concatenate and return results + return torch.cat([chunk_xx.to(dtype=w.dtype), remain_xx.to(dtype=w.dtype)], dim=1), last_sT.to(dtype=s0.dtype) + + +def rwkv7_attn_triton_bighead_chunk(r,w,k,v, kk,iclr, HEAD_SIZE=64, dot_prec='fp32', s0=None): + ''' + Triton implementation running in blocks of 16 (hardcoded requirement for the kernel) + ''' + B,T,HC = w.shape + assert T % 16 == 0, 'pure triton, only works in multiple of 16' + C = HEAD_SIZE + H = HC//C + + # Moving the triton specific operations into the chunk steps + r,w,k,v,a,b = [i.view(B,T,H,C) for i in [r,w,k,v,-kk,(kk*iclr)]] + s0 = th.zeros(B,H,C,C, dtype=th.float,device=w.device) if s0 is None else s0 + xx, sT = TritonBigheadRWKV7.apply(w,r,k,v,a,b,s0,dot_prec) + return xx.view(B,T,HC), sT + + +# ---------------- +# block/rwkv7_block_config_map.py +# ---------------- +from dataclasses import dataclass +from typing import Optional +from typing import Union +import torch + +@dataclass +class RWKV7BlockConfigMap: + + """Configuration map for RWKV based models""" + # Key properties for the block / model + num_hidden_layers: int + hidden_size: int + + head_size: int = 64 + + # Dropout rate, should only be used in training + dropout_rate: float = 0.0 + + # Implementation backend to use + tmix_backend: str = "auto" + + # --- + # OPTIONAL PROPS + # + # Optional properties which can be derived + # or can be overwritten by the user + # --- + + # Current layer_id of the block + layer_id: Optional[int] = None + + # Device and Data type + device: Union[torch.device, str, None] = None + dtype: Union[torch.dtype, str, None] = None + + # Channel mix / FFN block 
dimension size + hidden_size_ffn: Optional[int] = None + hidden_size_att: Optional[int] = None + + # # number of heads + # n_head: Optional[int] = None + + # --- + # Initializer, with excess arg ignore + # --- + def __init__( + self, + num_hidden_layers: int, + hidden_size: int, + head_size: int = 64, + dropout_rate: float = 0.0, + tmix_backend: str = "auto", + layer_id: Optional[int] = None, + device: Union[torch.device, str, None] = None, + dtype: Union[torch.dtype, str, None] = None, + hidden_size_ffn: Optional[int] = None, + hidden_size_att: Optional[int] = None, + **kwargs + ) -> None: + ''' + Constructor for the config + ''' + self.num_hidden_layers = num_hidden_layers + self.hidden_size = hidden_size + self.head_size = head_size + self.dropout_rate = dropout_rate + self.tmix_backend = tmix_backend + self.layer_id = layer_id + self.device = device + self.dtype = dtype + self.hidden_size_ffn = hidden_size_ffn + self.hidden_size_att = hidden_size_att + + # --- + # OPTIONAL PROPS FETCHER + # --- + + def get_layer_id(self, fallback:int) -> int: + ''' + Returns the layer id + ''' + if self.layer_id is not None: + return self.layer_id + return fallback + + def get_device(self, fallback:str) -> torch.device: + ''' + Returns the device + ''' + if self.device is not None: + return torch.device(self.device) + if fallback is not None: + return torch.device(fallback) + return torch.get_default_device() + + def get_dtype(self, fallback:str) -> torch.dtype: + ''' + Returns the dtype + ''' + if self.dtype is not None: + key = self.dtype + else: + key = fallback + + # if dtype is already torch.dtype + if isinstance(key, torch.dtype): + return key + + # Get and Check if the dtype is instance of torch.dtype + ret = getattr(torch, key) + assert isinstance(ret, torch.dtype), f"Invalid dtype: {self.dtype}" + return ret + + # --- + + def get_hidden_size_att(self) -> int: + ''' + Returns the dimension of attention + ''' + if self.hidden_size_att is not None: + hidden_size_att = self.hidden_size_att + else: + hidden_size = self.hidden_size + assert hidden_size % 32 == 0, f"hidden_size must be divisible by 32" + hidden_size_att = hidden_size + assert hidden_size_att % 32 == 0, f"hidden_size_att must be divisible by 32 ({hidden_size_att})" + return hidden_size_att + + def get_hidden_size_ffn(self) -> int: + ''' + Returns the dimension of feed forward network + ''' + if self.hidden_size_ffn is not None: + hidden_size_ffn = self.hidden_size_ffn + else: + hidden_size = self.hidden_size + assert hidden_size % 32 == 0, f"hidden_size must be divisible by 32" + hidden_size_ffn = hidden_size * 4 + + assert hidden_size_ffn % 32 == 0, f"hidden_size_ffn must be divisible by 32" + return hidden_size_ffn + + # def get_n_head(self) -> int: + # ''' + # Returns the number of heads + # ''' + # if self.n_head is not None: + # n_head = self.n_head + # else: + # hidden_size_att = self.get_hidden_size_att() + # n_head = self.get_hidden_size_att() // self.head_size + # assert hidden_size_att % n_head == 0 , f"hidden_size_att must be divisible by head_size ({self.head_size})" + # + # return n_head + + # --- + # Duplicator & Normalizer + # --- + + def new_block_config_map(self, **kwargs) -> 'RWKV7BlockConfigMap': + ''' + Returns a new config map with updated values + ''' + + new_dict = {} + for key in RWKV7BlockConfigMap.__dataclass_fields__: + if key in self.__dict__: + new_dict[key] = self.__dict__[key] + new_dict.update(kwargs) + + return RWKV7BlockConfigMap(**new_dict) + + @staticmethod + def normalize(config_map: any) -> 
'RWKV7BlockConfigMap': + ''' + Converts either maps, objs or RWKV7BlockConfigMap + ''' + if isinstance(config_map, RWKV7BlockConfigMap): + return config_map + + dict_obj = None + if isinstance(config_map, dict): + dict_obj = config_map + elif hasattr(config_map, '__dict__'): + dict_obj = config_map.__dict__ + + if dict_obj is not None: + # Filter for only valeus in RWKV7BlockConfigMap + new_dict = {} + for key, value in dict_obj.items(): + if key in RWKV7BlockConfigMap.__dataclass_fields__: + new_dict[key] = value + return RWKV7BlockConfigMap(**new_dict) + + raise ValueError(f"Unsupported config_map type: {type(config_map)}") + + +# ---------------- +# block/rwkv7_channel_mix.py +# ---------------- +import torch +from torch import nn +from typing import Union +# from .rwkv7_block_config_map import RWKV7BlockConfigMap + +class RWKV7ChannelMix(torch.nn.Module): + ''' + ChannelMix block for RWKV + This is similar to transformer FFN block + ''' + + def __init__(self, configMap: Union[RWKV7BlockConfigMap, any]): + ''' + Initialize the ChannelMix block. + + Note: this does not initialize the parameter weights itself + which would depend on the `init_parameters()` method + ''' + + super().__init__() + + configMap:RWKV7BlockConfigMap = RWKV7BlockConfigMap.normalize(configMap) + self.configMap = configMap + + # Get various props + hidden_size = configMap.hidden_size + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + + # By default, hidden_size_ffn = hidden_size * 4 + hidden_size_ffn = configMap.get_hidden_size_ffn() + + # Build the various params + # --- + self.x_k = nn.Parameter(torch.empty(1, 1, hidden_size, device=device, dtype=dtype)) + self.key = nn.Linear(hidden_size, hidden_size_ffn, bias=False, device=device, dtype=dtype) + self.value = nn.Linear(hidden_size_ffn, hidden_size, bias=False, device=device, dtype=dtype) + + def init_parameters(self): + ''' + Reset the parameters of the block, to an initial state used for training a model from scratch + ''' + + # Get required props + configMap = self.configMap + hidden_size = configMap.hidden_size + num_hidden_layers = configMap.num_hidden_layers + + # Get optional props + layer_id = configMap.get_layer_id(0) + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + + # By default, hidden_size_ffn = hidden_size * 4 + hidden_size_ffn = configMap.get_hidden_size_ffn() + + # Reset the various params + # --- + with torch.no_grad(): # fancy init of time_mix + ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 + ddd = torch.ones(1, 1, hidden_size) + for i in range(hidden_size): + ddd[0, 0, i] = i / hidden_size + self.x_k = nn.Parameter( (1.0 - torch.pow(ddd, ratio_1_to_almost0**4)).to(device, dtype=dtype) ) + + self.key = nn.Linear(hidden_size, hidden_size_ffn, bias=False, device=device, dtype=dtype) + self.value = nn.Linear(hidden_size_ffn, hidden_size, bias=False, device=device, dtype=dtype) + + def forward(self, x: torch.Tensor, last_state: torch.Tensor) -> tuple[torch.Tensor,torch.Tensor]: + ''' + Forwarding channel mix given the input tokens and states. 
+ + Given: + - Incoming token embedding size of shape [batch_size, seq_len, embedding_size] + - Incoming channel mix, shift states of the various batches [batch_size, state_size] + + Returns a pair + - Output embedding of shape [batch_size, seq_len, embedding_size] + - Output channel mix, shift state of shape [batch_size, state_size] + ''' + # last_state = last_state.to(self.key.weight.device) + + ########## + ## x070 + ########## + + dxprev = torch.cat((last_state.unsqueeze(1), x[:, :-1]), dim=1) - x + xk = x + dxprev * self.x_k + k = torch.relu( self.key(xk) ) ** 2 + + return self.value(k), x[:,-1] + + @torch.compile(mode="default", fullgraph=True) + def forward_with_default_compile(self, in_x: torch.Tensor, in_state: torch.Tensor, out_x: torch.Tensor, out_state: torch.Tensor) -> tuple[torch.Tensor,torch.Tensor]: + ''' + Compiled variant of the forward function + With no new tensors being created for the output + Useful for static memory allocation optimizations during inference + ''' + out_x[:], out_state[:] = self.forward_with_reduce_compile(in_x, in_state) + return out_x, out_state + + @torch.compile(mode="reduce-overhead", fullgraph=True) + def forward_with_reduce_compile(self, in_x: torch.Tensor, in_state: torch.Tensor) -> tuple[torch.Tensor,torch.Tensor]: + ''' + Compiled variant of the forward function + ''' + return self.forward(in_x, in_state) + + def load_from_model_state_dict(self, model_state_dict: dict, layer_id:int, non_blocking:bool=True): + ''' + Given the full/partial RWKV model weights, loaded via `torch.load` + Set up the current module weights, using the layer_id + ''' + # Get the current state_dict + current_state_dict = self.state_dict() + + # Iterate over each parameter in the state_dict, and copy the matching values from the model + for n in current_state_dict: + model_key = f"blocks.{layer_id}.ffn.{n}" + if model_key not in model_state_dict: + continue + + # Copy the values from the model state_dict + try: + current_state_dict[n].copy_(model_state_dict[model_key], non_blocking=non_blocking) + except Exception as e: + print(f"[ERROR] loading: {model_key} | model shape: {current_state_dict[n].shape} | weight shape: {model_state_dict[model_key].shape}") + raise e + +# ---------------- +# block/rwkv7_time_mix.py +# ---------------- +import torch, math +from torch import nn +from torch import Tensor +from typing import Union +from torch.nn import functional as F + +# from .rwkv7_block_config_map import RWKV7BlockConfigMap + +# Check for the triton package, if a GPU is available +triton = None +if torch.cuda.is_available(): + try: + import triton + except ImportError: + triton = None +else: + print("[WARNING] Triton not available, falling back to pytorch mode by default - this is significantly slower") + +class RWKV7TimeMix(torch.nn.Module): + ''' + Time Mix block for RWKV V7 + ''' + + def __init__(self, configMap: Union[RWKV7BlockConfigMap, any]): + ''' + Initialize the TimeMix block. 
+ + Note: this does not initialize the parameter weights itself + which would depend on the `init_parameters()` method + ''' + super().__init__() + + configMap:RWKV7BlockConfigMap = RWKV7BlockConfigMap.normalize(configMap) + self.configMap = configMap + + # Get required props + hidden_size = configMap.hidden_size + # num_hidden_layers = configMap.num_hidden_layers + + # Get the layer id + layer_id = configMap.get_layer_id(0) + self.layer_id = layer_id + + # Get optional props + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + + # By default, hidden_size_ffn = hidden_size + hidden_size_att = configMap.get_hidden_size_att() + + # Assert hidden_size == hidden_size_att, until we support different hidden_size and hidden_size_att + assert hidden_size == hidden_size_att, "hidden_size should be equal to hidden_size_att (@TODO: support different hidden_size and hidden_size_att)" + + # Head size settings + head_size = configMap.head_size + self.head_size = head_size + + # Number of heads + n_head = hidden_size_att // head_size + assert hidden_size_att % head_size == 0, "hidden_size_att should be divisible by head_size" + self.n_head = n_head + + # Backend + self.tmix_backend = configMap.tmix_backend + + # Build the various params + # --- + + with torch.no_grad(): + # Note: for some data, you can reduce D_GATE_LORA or even remove this gate + def calc_lora_rank(exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + D_DECAY_LORA = calc_lora_rank(0.5, 1.8) + D_AAA_LORA = calc_lora_rank(0.5, 1.8) + D_MV_LORA = calc_lora_rank(0.5, 1.3) + D_GATE_LORA = calc_lora_rank(0.8, 0.6) + + self.x_r = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.x_w = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.x_k = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.x_v = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.x_a = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.x_g = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + + self.w0 = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.w1 = nn.Parameter(torch.empty(hidden_size, D_DECAY_LORA, device=device, dtype=dtype)) + self.w2 = nn.Parameter(torch.empty(D_DECAY_LORA, hidden_size, device=device, dtype=dtype)) + + self.a0 = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.a1 = nn.Parameter(torch.empty(hidden_size,D_AAA_LORA, device=device, dtype=dtype)) + self.a2 = nn.Parameter(torch.empty(D_AAA_LORA,hidden_size, device=device, dtype=dtype)) + + if layer_id > 0: + self.v0 = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.v1 = nn.Parameter(torch.empty(hidden_size,D_MV_LORA, device=device, dtype=dtype)) + self.v2 = nn.Parameter(torch.empty(D_MV_LORA,hidden_size, device=device, dtype=dtype)) + + self.g1 = nn.Parameter(torch.empty(hidden_size, D_GATE_LORA, device=device, dtype=dtype)) + self.g2 = nn.Parameter(torch.empty(D_GATE_LORA, hidden_size, device=device, dtype=dtype)) + + self.k_k = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.k_a = nn.Parameter(torch.empty(1,1,hidden_size, device=device, dtype=dtype)) + self.r_k = nn.Parameter(torch.empty(n_head, head_size, device=device, dtype=dtype)) + + self.receptance = nn.Linear(hidden_size, hidden_size_att, bias=False, device=device, dtype=dtype) + self.key = 
nn.Linear(hidden_size, hidden_size_att, bias=False, device=device, dtype=dtype) + self.value = nn.Linear(hidden_size, hidden_size_att, bias=False, device=device, dtype=dtype) + self.output = nn.Linear(hidden_size_att, hidden_size, bias=False, device=device, dtype=dtype) + self.ln_x = nn.GroupNorm(n_head, hidden_size_att, device=device, dtype=dtype, eps=(1e-5)*head_size) + + def init_parameters(self): + ''' + Reset the parameters of the block, to an initial state used for training a model from scratch + ''' + configMap = self.configMap + + # Get required props + hidden_size = configMap.hidden_size + num_hidden_layers = configMap.num_hidden_layers + + # Get the layer id + layer_id = self.layer_id + + # Get optional props + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + + # By default, hidden_size_ffn = hidden_size + hidden_size_att = configMap.get_hidden_size_att() + + # Assert hidden_size == hidden_size_att, until we support different hidden_size and hidden_size_att + assert hidden_size == hidden_size_att, "hidden_size should be equal to hidden_size_att (@TODO: support different hidden_size and hidden_size_att)" + + # Head size settings + head_size = self.head_size + + # Number of heads + n_head = hidden_size_att // head_size + assert hidden_size_att % head_size == 0, "hidden_size_att should be divisible by head_size" + + # Reset the various params + # --- + with torch.no_grad(): + ratio_0_to_1 = layer_id / (num_hidden_layers - 1) # 0 to 1 + ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers) # 1 to ~0 + ddd = torch.ones(1, 1, hidden_size, device=device, dtype=dtype) + for i in range(hidden_size): + ddd[0, 0, i] = i / hidden_size + + # Note: for some data, you can reduce D_GATE_LORA or even remove this gate + def calc_lora_rank(exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + D_DECAY_LORA = calc_lora_rank(0.5, 1.8) + D_AAA_LORA = calc_lora_rank(0.5, 1.8) + D_MV_LORA = calc_lora_rank(0.5, 1.3) + D_GATE_LORA = calc_lora_rank(0.8, 0.6) + + self.x_r.copy_(1.0 - torch.pow(ddd, 0.2 * ratio_1_to_almost0)) + self.x_w.copy_(1.0 - torch.pow(ddd, 0.9 * ratio_1_to_almost0)) + self.x_k.copy_(1.0 - (torch.pow(ddd, 0.9 * ratio_1_to_almost0) + 0.4 * ratio_0_to_1)) + self.x_v.copy_(1.0 - (torch.pow(ddd, 0.4 * ratio_1_to_almost0) + 0.6 * ratio_0_to_1)) + self.x_a.copy_(1.0 - torch.pow(ddd, 0.9 * ratio_1_to_almost0)) + self.x_g.copy_(1.0 - torch.pow(ddd, 0.2 * ratio_1_to_almost0)) + + def ortho_init(x, scale): + x = x.to(device) + shape = x.shape + if len(shape) == 2: + gain = math.sqrt(shape[0] / shape[1]) if shape[0] > shape[1] else 1 + nn.init.orthogonal_(x, gain=gain * scale) + elif len(shape) == 3: + gain = math.sqrt(shape[1] / shape[2]) if shape[1] > shape[2] else 1 + for i in range(shape[0]): + nn.init.orthogonal_(x[i], gain=gain * scale) + else: + assert False + return x.to(device, dtype=dtype) + + # D_DECAY_LORA = max(32, int(round( (1.8*(hidden_size**0.5)) /32)*32)) + decay_speed = torch.ones(hidden_size, device=device, dtype=dtype) + for n in range(hidden_size): + decay_speed[n] = -7 + 5 * (n / (hidden_size - 1)) ** (0.85 + 1.0 * ratio_0_to_1 ** 0.5) + + self.w0.copy_(decay_speed.reshape(1,1,hidden_size).to(device, dtype=dtype) + 0.5) # !!! 0.5 comes from F.softplus !!! 
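+            # decay_speed above spans roughly [-7, -2] across the channels, and the +0.5
+            # offset compensates for the "- 0.5" in the soft-clamp applied in forward()
+            # (w = -F.softplus(-(self.w0 + w)) - 0.5), so the effective initial log-decay
+            # stays close to decay_speed itself.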
+ self.w1.copy_(torch.zeros(hidden_size, D_DECAY_LORA, device=device, dtype=dtype)) + self.w2.copy_(ortho_init(torch.zeros(D_DECAY_LORA, hidden_size), 0.1)) + + # D_AAA_LORA = max(32, int(round( (1.8*(hidden_size**0.5)) /32)*32)) # suggestion + self.a0.copy_(torch.zeros(1,1,hidden_size, device=device, dtype=dtype)) + self.a1.copy_(torch.zeros(hidden_size, D_AAA_LORA, device=device, dtype=dtype)) + self.a2.copy_(ortho_init(torch.zeros(D_AAA_LORA, hidden_size), 0.1)) + + # D_MV_LORA = max(32, int(round( (1.3*(hidden_size**0.5)) /32)*32)) # suggestion + if layer_id > 0: + self.v0.copy_(torch.zeros(1,1,hidden_size, device=device, dtype=dtype)+1.0) + self.v1.copy_(torch.zeros(hidden_size, D_MV_LORA, device=device, dtype=dtype)) + self.v2.copy_(ortho_init(torch.zeros(D_MV_LORA, hidden_size), 0.1)) + + # D_GATE_LORA = max(32, int(round( (0.6*(hidden_size**0.8)) /32)*32)) # suggestion + # Note: for some data, you can reduce D_GATE_LORA or even remove this gate + self.g1.copy_(torch.zeros(hidden_size, D_GATE_LORA, device=device, dtype=dtype)) + self.g2.copy_(ortho_init(torch.zeros(D_GATE_LORA, hidden_size), 0.1)) + + self.k_k.copy_(torch.ones(1,1,hidden_size, device=device, dtype=dtype)*0.85) + self.k_a.copy_(torch.ones(1,1,hidden_size, device=device, dtype=dtype)) + self.r_k.copy_(torch.zeros(n_head,head_size, device=device, dtype=dtype)) + + self.receptance = nn.Linear(hidden_size, hidden_size_att, bias=False, device=device, dtype=dtype) + self.key = nn.Linear(hidden_size, hidden_size_att, bias=False, device=device, dtype=dtype) + self.value = nn.Linear(hidden_size, hidden_size_att, bias=False, device=device, dtype=dtype) + self.output = nn.Linear(hidden_size_att, hidden_size, bias=False, device=device, dtype=dtype) + self.ln_x = nn.GroupNorm(n_head, hidden_size_att, device=device, dtype=dtype, eps=(1e-5)*head_size) + + def forward(self, x:Tensor, shift_state_in:Tensor, wkv_state_in:Tensor, v_first_val:Tensor) -> tuple[Tensor,Tensor,Tensor,Tensor]: + ''' + forwarding time mix given the model weights and the input tokens and states. 
+
+        Given:
+        - Incoming token embeddings of shape [batch_size, seq_len, embedding_size]
+        - Incoming states, containing tensors of shapes:
+            [batch_size, state_size] ## Token Shift state,
+            [batch_size, n_head, head_size, head_size] ## WKV state
+        - Incoming v_first_val of shape [batch_size, seq_len, embedding_size]
+
+        Returns a tuple of:
+        - output embedding of shape [batch_size, seq_len, embedding_size]
+        - output states of shapes:
+            [batch_size, state_size] ## Token Shift state,
+            [batch_size, n_head, head_size, head_size] ## WKV state
+        - output v_first_val of shape [batch_size, seq_len, embedding_size]
+        '''
+        # Get the sizing
+        BATCH_SIZE, SEQ_LEN, IN_EMB_SIZE = x.size()
+        N_HEAD = self.n_head
+        HEAD_SIZE = self.head_size
+
+        # Ensure wkv_state_in is initialized
+        if wkv_state_in is None:
+            wkv_state_in = torch.zeros(BATCH_SIZE,N_HEAD,HEAD_SIZE,HEAD_SIZE, dtype=torch.float,device=x.device)
+        else:
+            wkv_state_in = wkv_state_in.clone()
+
+        ##########
+        ## x070
+        ##########
+
+        shift_state_out = x[:, -1]
+        dxprev = torch.cat((shift_state_in.unsqueeze(1), x[:, :-1]), dim=1) - x
+
+        xr = x + dxprev * self.x_r
+        xw = x + dxprev * self.x_w
+        xk = x + dxprev * self.x_k
+        xv = x + dxprev * self.x_v
+        xa = x + dxprev * self.x_a
+        xg = x + dxprev * self.x_g
+        xx = dxprev
+
+        r = self.receptance(xr)
+        w = torch.tanh(xw @ self.w1) @ self.w2
+        k = self.key(xk)
+        v = self.value(xv)
+        g = torch.sigmoid(xg @ self.g1) @ self.g2
+        iclr = torch.sigmoid(self.a0 + (xa @ self.a1) @ self.a2) # iclr is the "in-context learning rate" (the "a" term in the reference kernels)
+
+        kk = F.normalize((k * self.k_k).view(BATCH_SIZE,SEQ_LEN,N_HEAD,-1), dim=-1, p=2.0).view(BATCH_SIZE, SEQ_LEN, IN_EMB_SIZE)
+        k = k * (1 + (iclr-1) * self.k_a)
+
+        if self.layer_id == 0:
+            v_first_val = v # store the v of the first layer
+        else:
+            v = v + (v_first_val - v) * torch.sigmoid(self.v0 + (xv @ self.v1) @ self.v2) # add value residual
+
+        tmix_backend = self.tmix_backend.lower()
+        if tmix_backend == "auto":
+            if triton is None or self.receptance.weight.device.type == "cpu":
+                tmix_backend = "pytorch"
+            else:
+                tmix_backend = "cuda"
+
+        if tmix_backend == "pytorch_ref" or tmix_backend == "pytorch_ref_ori":
+            # Pure pytorch mode for rwkv attention
+            # from .kernel.rwkv7_attn_pytorch import rwkv7_attn_pytorch_ref
+            # Reference minimal compilation version
+            w = torch.exp(-0.606531 * torch.sigmoid((self.w0 + w).float())) # 0.606531 = exp(-0.5)
+            xx, wkv_state_out = rwkv7_attn_pytorch_ref(r, w, k, v, kk, iclr, BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, xx, wkv_state_in)
+        elif tmix_backend == "pytorch_ref_fp32":
+            # Pure pytorch mode for rwkv attention
+            # from .kernel.rwkv7_attn_pytorch import rwkv7_attn_pytorch_ref_fp32
+            # Modified to follow the same logic as the "cuda" version
+            # w = torch.exp(-0.606531 * torch.sigmoid((self.w0 + w).float())) # 0.606531 = exp(-0.5)
+            w = -F.softplus(-(self.w0 + w)) - 0.5
+            xx, wkv_state_out = rwkv7_attn_pytorch_ref_fp32(r, w, k, v, kk, iclr, BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, xx, wkv_state_in)
+        elif tmix_backend == "pytorch":
+            # Pure pytorch mode for rwkv attention
+            # from .kernel.rwkv7_attn_pytorch import rwkv7_attn_pytorch
+            # Tweaked pytorch compile variant
+            w = torch.exp(-0.606531 * torch.sigmoid((self.w0 + w).float())) # 0.606531 = exp(-0.5)
+            xx, wkv_state_out = rwkv7_attn_pytorch(r, w, k, v, kk, iclr, BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, xx, wkv_state_in)
+        elif tmix_backend == "triton":
+            if triton is None:
+                raise ValueError("Triton not available, unable to load triton kernel")
+            # from .kernel.rwkv7_attn_triton import 
rwkv7_attn_triton + w = -F.softplus(-(self.w0 + w)) - 0.5 + xx, wkv_state_out = rwkv7_attn_triton(r, w, k, v, kk, iclr, s0=wkv_state_in) + elif tmix_backend == "triton_bighead": + if triton is None: + raise ValueError("Triton not available, unable to load triton kernel") + # from .kernel.rwkv7_attn_triton import rwkv7_attn_triton_bighead + w = -F.softplus(-(self.w0 + w)) - 0.5 + xx, wkv_state_out = rwkv7_attn_triton_bighead(r, w, k, v, kk, iclr, s0=wkv_state_in) + elif tmix_backend == "cuda_ref": + # Cuda based method for rwkv attention + # from .kernel.rwkv7_attn_cuda import rwkv7_attn_cuda_ref + # Reference cuda version (no state output) + w = -F.softplus(-(self.w0 + w)) - 0.5 + xx, wkv_state_out = rwkv7_attn_cuda_ref(r, w, k, v, kk, iclr, s0=wkv_state_in) + elif tmix_backend == "cuda": + # Cuda based method for rwkv attention + # from .kernel.rwkv7_attn_cuda import rwkv7_attn_cuda + # Modified cuda version (with state output) + w = -F.softplus(-(self.w0 + w)) - 0.5 + xx, wkv_state_out = rwkv7_attn_cuda(r, w, k, v, kk, iclr, s0=wkv_state_in) + elif tmix_backend == "fla": + # FLA based method for rwkv attention + # from .kernel.rwkv7_attn_fla import rwkv7_attn_fla + # FLA runs with the softplus w + w = -F.softplus(-(self.w0 + w)) - 0.5 + xx, wkv_state_out = rwkv7_attn_fla(r, w, k, v, kk, iclr, BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, xx, wkv_state_in) + elif tmix_backend == "fla_fused" or tmix_backend == "fused_fla": + # FLA based method for rwkv attention + # from .kernel.rwkv7_attn_fla import rwkv7_attn_fused_reccurent_fla + # FLA runs with the softplus w + w = -F.softplus(-(self.w0 + w)) - 0.5 + xx, wkv_state_out = rwkv7_attn_fused_reccurent_fla(r, w, k, v, kk, iclr, BATCH_SIZE, SEQ_LEN, N_HEAD, HEAD_SIZE, xx, wkv_state_in) + else: + raise ValueError(f"Unknown tmix_backend: {tmix_backend}") + + # wkv_state_in normalization of type + if wkv_state_in is not None: + wkv_state_out = wkv_state_out.to(wkv_state_in.dtype) + + ######## cuda-based method + # wkv_state_out = wkv_state_in.clone() + # w = -F.softplus(-(self.w0 + w)) - 0.5 # soft-clamp to (-inf, -0.5) + # xx = RWKV7_OP(wkv_state_out, r, w, k, v, -kk, kk*a) + ######## cuda-based method + + xx = self.ln_x(xx.view(BATCH_SIZE * SEQ_LEN, IN_EMB_SIZE)).view(BATCH_SIZE, SEQ_LEN, IN_EMB_SIZE) + xx = xx + ((r.view(BATCH_SIZE,SEQ_LEN,N_HEAD,-1)*k.view(BATCH_SIZE,SEQ_LEN,N_HEAD,-1)*self.r_k).sum(dim=-1, keepdim=True) * v.view(BATCH_SIZE,SEQ_LEN,N_HEAD,-1)).view(BATCH_SIZE,SEQ_LEN,IN_EMB_SIZE) + xx = self.output(xx * g) + + return xx, shift_state_out, wkv_state_out, v_first_val + + @torch.compile(mode="default") + def forward_with_default_compile(self, in_x:Tensor, shift_state_in:Tensor, wkv_state_in:Tensor, v_first_val_in:Tensor, out_x:Tensor, shift_state_out:Tensor, wkv_state_out:Tensor, v_first_val_out:Tensor) -> tuple[Tensor,Tensor,Tensor,Tensor]: + ''' + Compiled varient of the forward function + With no new tensors being created for the output + Useful for static memory allocation optimizations inference + ''' + out_x[:], shift_state_out[:], wkv_state_out[:], v_first_val_out[:] = self.forward(in_x, shift_state_in, wkv_state_in, v_first_val_in) + return out_x, shift_state_out, wkv_state_out, v_first_val_out + + @torch.compile(mode="reduce-overhead") + def forward_with_reduce_compile(self, in_x:Tensor, shift_state_in:Tensor, wkv_state_in:Tensor, v_first_val:Tensor) -> tuple[Tensor,Tensor,Tensor,Tensor]: + ''' + Compiled varient of the forward function + With no input tensor being modified. 
+ Useful for reduce-overhead compile mode + ''' + return self.forward(in_x, shift_state_in, wkv_state_in, v_first_val) + + # --------------------------------- + # + # Model state handling + # + # --------------------------------- + + def load_from_model_state_dict(self, model_state_dict: dict, layer_id:int, non_blocking:bool=True): + ''' + Given the Full/partial RWKV model weights, loaded via `torch.load` + Setup the the current module weights, using the layer_id + ''' + # Get the current state_dict + current_state_dict = self.state_dict() + + # Iterate each parameter in the state_dict, and compare from the model + for n in current_state_dict: + model_key = f"blocks.{layer_id}.att.{n}" + if model_key not in model_state_dict: + continue + + # Copy the values from the state_dict + try: + current_state_dict[n].copy_(model_state_dict[model_key], non_blocking=non_blocking) + except Exception as e: + print(f"[ERROR] loading: {model_key} | model shape: {current_state_dict[n].shape} | weight shape: {model_state_dict[model_key].shape}") + raise e + +# ---------------- +# block/rwkv7_layer_block.py +# ---------------- +import torch +from torch import nn +from torch import Tensor +from typing import Union +from torch.nn import functional as F + +# from .rwkv7_block_config_map import RWKV7BlockConfigMap +# from .rwkv7_channel_mix import RWKV7ChannelMix +# from .rwkv7_time_mix import RWKV7TimeMix + +class RWKV7LayerBlock(torch.nn.Module): + ''' + layer block for RWKV V7 + ''' + + def __init__(self, configMap: Union[RWKV7BlockConfigMap, any]): + super().__init__() + + configMap:RWKV7BlockConfigMap = RWKV7BlockConfigMap.normalize(configMap) + self.configMap = configMap + + # Get required props + hidden_size = configMap.hidden_size + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + dropout_rate = configMap.dropout_rate + + # Get valid layer_id + layer_id = configMap.get_layer_id(-1) + assert layer_id >= 0, f'layer_id must be >= 0, got {layer_id}' + + # Setup the layernorms, and mixes + self.ln1 = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + self.ln2 = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + + if layer_id == 0: + self.ln0 = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + else: + self.ln0 = nn.Identity(device=device) + + # Setup the time and channel mix + self.att = RWKV7TimeMix(configMap) + self.ffn = RWKV7ChannelMix(configMap) + + # Setup droupout at block level + if dropout_rate > 0.0: + self.drop0 = nn.Dropout(p = dropout_rate,device=device) + self.drop1 = nn.Dropout(p = dropout_rate,device=device) + else: + self.drop0 = nn.Identity(device=device) + self.drop1 = nn.Identity(device=device) + + def init_parameters(self): + ''' + Reset the parameters of the block, to an initial state used for training a model from scratch + ''' + configMap = self.configMap + + # Get required props + hidden_size = configMap.hidden_size + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + dropout_rate = configMap.dropout_rate + + # Get valid layer_id + layer_id = configMap.get_layer_id(-1) + assert layer_id >= 0, f'layer_id must be >= 0, got {layer_id}' + + # Redo the Setup for the layernorms, and mixes + self.ln1 = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + self.ln2 = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + + if layer_id == 0: + self.ln0 = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + else: + self.ln0 = nn.Identity(device=device) + + # Call the sub blocks init_parameters + 
self.att.init_parameters() + self.ffn.init_parameters() + + def forward( + self, x:torch.Tensor, + last_state: tuple[torch.Tensor,torch.Tensor,torch.Tensor], + v_first:torch.Tensor + ) -> tuple[torch.Tensor,tuple[torch.Tensor,torch.Tensor,torch.Tensor],torch.Tensor]: + ''' + Forward the block given the input tokens and the last state + Last state is a tuple of the following + - time mix shift state + - time mix wkv state + - channel mix shift state + + Returns a pair of the output embedding, v_first and the next state + ''' + + # # Ensure everything is in the right device + # x = x.to(self.ln1.weight.device) + # last_state = [ s.to(self.ln1.weight.device) for s in last_state ] + + # Note, that this only applies for layer 0 + ln0_out = self.ln0(x) + + # assert self.ln1(x) is not None + # assert last_state.tmix_shift is not None + # assert last_state.tmix_wkv is not None + + att_out, tmix_shift, tmix_wkv, v_first = self.att( + self.ln1(ln0_out), + last_state[0], # tmix_shift, + last_state[1], # tmix_wkv + v_first + ) + + # x = x + att_out + x = self.drop0(ln0_out + att_out) + + ffn_out, ffn_state = self.ffn( + self.ln2(x), + last_state[2] # cmix_shift, + ) + + # x = x + ffn_out + x = self.drop1(x + ffn_out) + + # # Debugging for NaN + # layer_id = self.configMap.get_layer_id(-1) + # assert torch.isnan(att_out).sum() == 0, f'NaN detected att_out @ layer {layer_id}' + # assert torch.isnan(ffn_out).sum() == 0, f'NaN detected ffn_out @ layer {layer_id}' + # assert torch.isnan(v_first).sum() == 0, f'NaN detected v_first @ layer {layer_id}' + # assert torch.isnan(tmix_shift).sum() == 0, f'NaN detected tmix_shift @ layer {layer_id}' + # assert torch.isnan(tmix_wkv).sum() == 0, f'NaN detected tmix_wkv @ layer {layer_id}' + # assert torch.isnan(ffn_state).sum() == 0, f'NaN detected ffn_state @ layer {layer_id}' + # assert torch.isnan(x).sum() == 0, f'NaN detected block out @ layer {layer_id}' + + return x, (tmix_shift, tmix_wkv, ffn_state), v_first + + @torch.compile(mode="default") + def forward_with_default_compile( + self, + in_x:torch.Tensor, + in_state: tuple[torch.Tensor,torch.Tensor,torch.Tensor], + in_v_first:torch.Tensor, + out_x:torch.Tensor, + out_state: tuple[torch.Tensor,torch.Tensor,torch.Tensor], + out_v_first:torch.Tensor + ) -> tuple[torch.Tensor,tuple[torch.Tensor,torch.Tensor,torch.Tensor],torch.Tensor]: + ''' + Compiled varient of the forward function + With no new tensors being created for the output + Useful for static memory allocation optimizations inference + ''' + out_x[:], tmp_state, out_v_first[:] = self.forward(in_x, in_state, in_v_first) + out_state[0][:], out_state[1][:], out_state[2][:] = tmp_state + return out_x, out_state, out_v_first + + @torch.compile(mode="reduce-overhead") + def forward_with_reduce_compile(self, in_x: torch.Tensor, in_state: tuple[torch.Tensor,torch.Tensor,torch.Tensor], in_v_first:torch.Tensor) -> tuple[torch.Tensor,tuple[torch.Tensor,torch.Tensor,torch.Tensor],torch.Tensor]: + ''' + Compiled varient of the forward function + ''' + return self.forward(in_x, in_state, in_v_first) + + def load_from_model_state_dict(self, model_state_dict:dict, layer_id:int=-1, non_blocking:bool=True): + ''' + Given the Full/partial RWKV model weights, load the block weights accordingly + ''' + if layer_id <= -1: + layer_id = self.configMap.get_layer_id(-1) + assert layer_id >= 0, f'layer_id must be >= 0, got {layer_id}' + + # Get the current state_dict + current_state_dict = self.state_dict() + + # Iterate each parameter in the state_dict, and compare from the 
model + for n in current_state_dict: + model_key = f"blocks.{layer_id}.{n}" + if model_key not in model_state_dict: + continue + + # Copy the values from the state_dict + try: + current_state_dict[n].copy_(model_state_dict[model_key], non_blocking=non_blocking) + except Exception as e: + print(f"[ERROR] loading: {model_key} | model shape: {current_state_dict[n].shape} | weight shape: {model_state_dict[model_key].shape}") + raise e + +# ---------------- +# model/rwkv7_goose_config_map.py +# ---------------- +from dataclasses import dataclass +from typing import Optional +from typing import Union +import torch + +# from ..block.rwkv7_block_config_map import RWKV7BlockConfigMap + +@dataclass +class RWKV7GooseConfigMap(RWKV7BlockConfigMap): + # This is the world tokenizer size + vocab_size: int = 65536 + init_state_wkv: bool = False + + # --- + # Initializer, with excess arg ignore + # --- + def __init__( + self, + vocab_size: int = 65536, + init_state_wkv: bool = False, + **kwargs + ) -> None: + self.vocab_size = vocab_size + self.init_state_wkv = init_state_wkv + super().__init__(**kwargs) + + @staticmethod + def normalize(config_map: any) -> 'RWKV7GooseConfigMap': + ''' + Converts either maps, objs or RWKV7BlockConfigMap + ''' + if isinstance(config_map, RWKV7GooseConfigMap): + return config_map + + if isinstance(config_map, dict): + return RWKV7GooseConfigMap(**config_map) + + if hasattr(config_map, '__dict__'): + return RWKV7GooseConfigMap(**config_map.__dict__) + + raise ValueError(f"Unsupported config_map type: {type(config_map)}") + + @staticmethod + def from_model_state_dict(state_dict: dict, **kwargs) -> 'RWKV7GooseConfigMap': + ''' + Converts the state dict to the config map + ''' + + # Iterate and count the layers + num_hidden_layers = 0 + for key in state_dict.keys(): + if key.startswith('blocks.'): + idx = key.split('.')[1] + num_hidden_layers = max(num_hidden_layers, int(idx)+1) + + # Enable wkv_state + if 'init_state.0.wkv' in state_dict: + kwargs['init_state_wkv'] = True + + # Initialize the config map, with the configured values + return RWKV7GooseConfigMap( + num_hidden_layers=num_hidden_layers, + hidden_size=state_dict['emb.weight'].shape[1], + vocab_size=state_dict['emb.weight'].shape[0], + # init_state_wkv=hasattr(state_dict, 'init_state.0.wkv'), + + # n_head=state_dict['blocks.0.att.r_k'].shape[0], + head_size=state_dict['blocks.0.att.r_k'].shape[1], + + hidden_size_att=state_dict['blocks.0.att.key.weight'].shape[1], + hidden_size_ffn=state_dict['blocks.0.ffn.key.weight'].shape[0], + + **kwargs + ) + + +# ---------------- +# model/rwkv7_goose_model.py +# ---------------- +import torch +from torch import nn +from torch import Tensor +from typing import Union + +# from .rwkv7_goose_config_map import RWKV7GooseConfigMap +# from ..block.rwkv7_layer_block import RWKV7LayerBlock + +class RWKV7GooseModel(nn.Module): + ''' + RWKV7 Goose Model architecture + Simplified implementation + ''' + + def __init__(self, config: Union[RWKV7GooseConfigMap, any, None] = None): + # Initialize the base class + super().__init__() + + # Normalize the config + configMap:RWKV7GooseConfigMap = RWKV7GooseConfigMap.normalize(config) + self.configMap = configMap + + # Get the required prop + num_hidden_layers = configMap.num_hidden_layers + vocab_size = configMap.vocab_size + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + hidden_size = configMap.hidden_size + + # Embedding layer + self.emb = nn.Embedding(vocab_size, hidden_size, device=device, dtype=dtype) + + # main 
block layers + blockList = [None]*num_hidden_layers + for i in range(num_hidden_layers): + blockList[i] = RWKV7LayerBlock(configMap.new_block_config_map(layer_id=i)) + self.blocks = nn.ModuleList(blockList) + + # ln_out and head + self.ln_out = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + self.head = nn.Linear(hidden_size, vocab_size, bias=False, device=device, dtype=dtype) + + # init state tuning support + if configMap.init_state_wkv: + stateTuneList = [None]*num_hidden_layers + for i in range(num_hidden_layers): + stateTuneList[i] = nn.ParameterDict({ + "wkv": nn.Parameter(torch.zeros(hidden_size // 64, 64, 64, device=device, dtype=dtype)), + }) + self.init_state = nn.ParameterList(stateTuneList) + + def init_parameters(self): + ''' + Reset the parameters of the block, to an initial state used for training a model from scratch + ''' + + # Get the required prop + configMap = self.configMap + num_hidden_layers = configMap.num_hidden_layers + vocab_size = configMap.vocab_size + device = configMap.get_device(None) + dtype = configMap.get_dtype('bfloat16') + hidden_size = configMap.hidden_size + + # Iterate and reset the blocks + for i in range(num_hidden_layers): + self.blocks[i].init_parameters() + + # Reinit the Embedding layer + self.emb = nn.Embedding(vocab_size, hidden_size, device=device, dtype=dtype) + + # Reinit the ln_out and head + self.ln_out = nn.LayerNorm(hidden_size, device=device, dtype=dtype) + if self.head is not None: + self.head = nn.Linear(hidden_size, vocab_size, bias=False, device=device, dtype=dtype) + + # Reinit the init state tuning support + if configMap.init_state_wkv: + stateTuneList = [None]*num_hidden_layers + for i in range(num_hidden_layers): + stateTuneList[i] = nn.ParameterDict({ + "wkv": nn.Parameter(torch.zeros(hidden_size // 64, 64, 64, device=device, dtype=torch.float)), + }) + self.init_state = nn.ParameterList(stateTuneList) + + def load_from_model_state_dict(self, state_dict: dict, non_blocking:bool=True): + ''' + Given the Full/partial RWKV model weights, loaded via `torch.load` + Setup the RWKV_TimeMix model weights, using the layer_id + ''' + for i, block in enumerate(self.blocks): + block.load_from_model_state_dict(state_dict, i, non_blocking=non_blocking) + + self.ln_out.weight.data.copy_(state_dict['ln_out.weight'], non_blocking=True) + self.ln_out.bias.data.copy_(state_dict['ln_out.bias'], non_blocking=True) + self.head.weight.data.copy_(state_dict['head.weight'], non_blocking=True) + self.emb.weight.data.copy_(state_dict['emb.weight'], non_blocking=True) + + ### --- + ### + ### Init state handling + ### + ### --- + + def get_init_state(self, batch_size:int=1, skip_init_state:bool=False) -> list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]]: + ''' + Get an initialized copy of the model state, for the given batch size + ''' + # Get required configs + hidden_size = self.configMap.hidden_size + init_state_wkv = self.configMap.init_state_wkv + num_hidden_layers = self.configMap.num_hidden_layers + + # Prepare the initial state + init_state = [ None for i in range(num_hidden_layers) ] + for i in range(num_hidden_layers): + device = self.blocks[i].ln1.weight.data.device + dtype = self.blocks[i].ln1.weight.data.dtype + + # Use the saved init_state if enabled + # TODO: Consider letting the wkv_state dtype be a parameter + wkv_state = torch.zeros(batch_size, hidden_size // 64, 64, 64, device=device, dtype=torch.float) + if init_state_wkv and skip_init_state == False: + init_wkv = self.init_state[i]["wkv"] + for b in range(batch_size): + 
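+                    # Copy the learned per-layer initial wkv state into each batch entry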
wkv_state[b][:] = init_wkv + + # Prepare the state + init_state[i] = ( + torch.zeros(batch_size, hidden_size, device=device, dtype=dtype), + wkv_state, + torch.zeros(batch_size, hidden_size, device=device, dtype=dtype) + ) + return init_state + + ### --- + ### + ### Model Forward + ### + ### --- + + def forward( + self, idx:torch.Tensor, + prv_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]] = None, + ret_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]] = None, + ) -> tuple[torch.Tensor,list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]]]: + ''' + Forward the block set, given the input tokens and the last state + Last state is a list of tuple of the following + - time mix shift state + - time mix wkv state + - channel mix shift state + + Returns a pair of the output embedding and the next state + ''' + # Prepare the state, with the batch size + if prv_stateList is None: + prv_stateList = self.get_init_state(idx.shape[0]) + + # If no return state is set, let _forward_internal, set it up + if ret_stateList is None: + ret_stateList = [ None for i in range(self.configMap.num_hidden_layers) ] + return self._forward_internal(idx, prv_stateList, ret_stateList, overwrite_ret_tensor=False) + + # Forward internally + return self._forward_internal(idx, prv_stateList, ret_stateList, overwrite_ret_tensor=True) + + def _forward_block_hook(self, + block:RWKV7LayerBlock, + x_hidden_state:torch.Tensor, + prv_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]], + v_first:torch.Tensor + ) -> tuple[torch.Tensor,tuple[torch.Tensor,torch.Tensor,torch.Tensor],torch.Tensor]: + ''' + Forward block hook operation, that is easily overridable. + To implement gradient checkpointing for use in various trainers + ''' + x_hidden_state = x_hidden_state.to(block.ln1.weight.device, non_blocking=True) + return block(x_hidden_state, prv_stateList, v_first) + + def _forward_internal( + self, idx:torch.Tensor, + prv_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]], + ret_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]], + overwrite_ret_tensor:bool=False + ) -> tuple[torch.Tensor,list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]]]: + ''' + Internal forward operations, which assumes the state is already initialized + Due to the lack of safety checks, this should not be used directly + ''' + + # Lets get the embedding + idx = idx.to(self.emb.weight.device, non_blocking=True) + x_hidden_state = self.emb(idx) + v_first = None + + # Iterate the block layers, compute the x_hidden_state + if overwrite_ret_tensor: + for i, block in enumerate(self.blocks): + # x_hidden_state, last_block_state, v_first = block(x_hidden_state, prv_stateList[i], v_first) + x_hidden_state, last_block_state, v_first = self._forward_block_hook(block, x_hidden_state, prv_stateList[i], v_first) + ret_stateList[i][0][:] = last_block_state[0] + ret_stateList[i][1][:] = last_block_state[1] + ret_stateList[i][2][:] = last_block_state[2] + else: + ret_stateList = [ None for i in range( len(self.blocks) ) ] + for i, block in enumerate(self.blocks): + # x_hidden_state, ret_sublist, v_first = block(x_hidden_state, prv_stateList[i], v_first) + x_hidden_state, ret_sublist, v_first = self._forward_block_hook(block, x_hidden_state, prv_stateList[i], v_first) + ret_stateList[i] = ret_sublist + + # Final layer norm, and head + x_hidden_state = x_hidden_state.to(self.ln_out.weight.device, non_blocking=True) + x_hidden_state = self.ln_out(x_hidden_state) + x_out = self.head(x_hidden_state) + + # Return the output 
and the state list
+        return x_out, ret_stateList
+
+    @torch.compile(mode="default")
+    def forward_with_default_compile(
+        self, idx:torch.Tensor,
+        prv_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]],
+        ret_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]],
+    ) -> tuple[torch.Tensor,list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]]]:
+        '''
+        Compiled variant of the forward function
+        With no new tensors being created for the output
+        Useful for static memory allocation optimizations during inference
+        '''
+        # Forward internally
+        return self._forward_internal(idx, prv_stateList, ret_stateList, overwrite_ret_tensor=True)
+
+    @torch.compile(mode="reduce-overhead")
+    def forward_with_reduce_compile(
+        self, in_idx:torch.Tensor,
+        prv_stateList:list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]]
+    ) -> tuple[torch.Tensor,list[tuple[torch.Tensor,torch.Tensor,torch.Tensor]]]:
+        '''
+        Compiled variant of the forward function; requires the previous state to be passed in
+        '''
+        return self._forward_internal(in_idx, prv_stateList, None, overwrite_ret_tensor=False)
+
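+# ----------------
+# Usage sketch (illustrative only)
+# ----------------
+# A minimal example of how the classes above are typically wired together.
+# It assumes RWKV7GooseConfigMap accepts the keyword arguments shown (they mirror the
+# fields read in `from_model_state_dict` above); adjust values to your actual setup.
+#
+#   import torch
+#
+#   model = RWKV7GooseModel({
+#       "num_hidden_layers": 2,
+#       "hidden_size": 256,
+#       "head_size": 64,
+#       "vocab_size": 65536,
+#   })
+#   model.init_parameters()
+#
+#   tokens = torch.randint(0, 65536, (1, 16))      # [batch_size, seq_len]
+#   state  = model.get_init_state(batch_size=1)    # per-layer (tmix_shift, tmix_wkv, cmix_shift)
+#   logits, state = model.forward(tokens, state)   # logits: [1, 16, vocab_size]
+#
+#   # The returned state can be fed back in to continue the same sequences:
+#   logits, state = model.forward(torch.randint(0, 65536, (1, 16)), state)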