danieldk committed
Commit 9fc83e6 · Parent: 15057d1
build/torch-universal/triton_layer_norm/__init__.py CHANGED
@@ -1,3 +1,5 @@
-from .layer_norm import RMSNorm, layer_norm_fn, layer_norm_linear_fn, rms_norm_fn
+from .layer_norm import layer_norm_fn, layer_norm_linear_fn, rms_norm_fn
 
-__all__ = ["RMSNorm", "layer_norm_fn", "layer_norm_linear_fn", "rms_norm_fn"]
+from . import layers
+
+__all__ = ["layers", "layer_norm_fn", "layer_norm_linear_fn", "rms_norm_fn"]
build/torch-universal/triton_layer_norm/layer_norm.py CHANGED
@@ -10,7 +10,7 @@ import math
 
 import torch
 import torch.nn.functional as F
-from torch.cuda.amp import custom_fwd, custom_bwd
+from torch.amp import custom_fwd, custom_bwd
 
 import triton
 import triton.language as tl
@@ -59,9 +59,9 @@ def layer_norm_ref(
         x = x + x1
     if residual is not None:
         x = (x + residual).to(x.dtype)
-    out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to(
-        dtype
-    )
+    out = F.layer_norm(
+        x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
+    ).to(dtype)
     if weight1 is None:
         return out if not prenorm else (out, x)
     else:
@@ -115,13 +115,15 @@ def rms_norm_ref(
     if residual is not None:
         x = (x + residual).to(x.dtype)
     rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
-    out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(dtype)
+    out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
+        dtype
+    )
     if weight1 is None:
         return out if not prenorm else (out, x)
     else:
-        out1 = ((x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)).to(
-            dtype
-        )
+        out1 = (
+            (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
+        ).to(dtype)
         return (out, out1) if not prenorm else (out, out1, x)
 
 
@@ -201,7 +203,9 @@ def _layer_norm_fwd_1pass_kernel(
     if HAS_DROPOUT:
         # Compute dropout mask
         # 7 rounds is good enough, and reduces register pressure
-        keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+        keep_mask = (
+            tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+        )
         x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
         if STORE_DROPOUT_MASK:
             tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
@@ -214,7 +218,8 @@ def _layer_norm_fwd_1pass_kernel(
             # Compute dropout mask
            # 7 rounds is good enough, and reduces register pressure
             keep_mask = (
-                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
+                > dropout_p
             )
             x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
             if STORE_DROPOUT_MASK:
@@ -268,7 +273,7 @@ def _layer_norm_fwd(
     is_rms_norm=False,
     return_dropout_mask=False,
     out=None,
-    residual_out=None
+    residual_out=None,
 ):
     if residual is not None:
         residual_dtype = residual.dtype
@@ -315,14 +320,21 @@ def _layer_norm_fwd(
     ):
         if residual_out is None:
             residual_out = torch.empty(
-                M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype
+                M,
+                N,
+                device=x.device,
+                dtype=residual_dtype if residual_dtype is not None else x.dtype,
             )
         else:
             assert residual_out.shape == x.shape
             assert residual_out.stride(-1) == 1
     else:
         residual_out = None
-    mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
+    mean = (
+        torch.empty((M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
     rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
     if dropout_p > 0.0:
         seeds = torch.randint(
@@ -331,7 +343,9 @@ def _layer_norm_fwd(
     else:
         seeds = None
     if return_dropout_mask and dropout_p > 0.0:
-        dropout_mask = torch.empty(M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool)
+        dropout_mask = torch.empty(
+            M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
+        )
     else:
         dropout_mask = None
     # Less than 64KB per feature: enqueue fused kernel
@@ -401,7 +415,14 @@ def _layer_norm_fwd(
         triton.Config({}, num_warps=16),
         triton.Config({}, num_warps=32),
     ],
-    key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"],
+    key=[
+        "N",
+        "HAS_DRESIDUAL",
+        "STORE_DRESIDUAL",
+        "IS_RMS_NORM",
+        "HAS_BIAS",
+        "HAS_DROPOUT",
+    ],
 )
 # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
 # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
@@ -529,14 +550,18 @@ def _layer_norm_bwd_kernel(
         if HAS_DX1:
             if HAS_DROPOUT:
                 keep_mask = (
-                    tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+                    tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
+                    > dropout_p
                 )
                 dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
             else:
                 dx1 = dx
             tl.store(DX1 + cols, dx1, mask=mask)
         if HAS_DROPOUT:
-            keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+            keep_mask = (
+                tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
+                > dropout_p
+            )
             dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
         if HAS_ROWSCALE:
             rowscale = tl.load(ROWSCALE + row).to(tl.float32)
@@ -627,9 +652,15 @@ def _layer_norm_bwd(
         else None
     )
     dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
-    y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
+    y = (
+        torch.empty(M, N, dtype=dy.dtype, device=dy.device)
+        if recompute_output
+        else None
+    )
     if recompute_output:
-        assert weight1 is None, "recompute_output is not supported with parallel LayerNorm"
+        assert (
+            weight1 is None
+        ), "recompute_output is not supported with parallel LayerNorm"
 
     # Less than 64KB per feature: enqueue fused kernel
     MAX_FUSED_SIZE = 65536 // x.element_size()
@@ -723,7 +754,7 @@ class LayerNormFn(torch.autograd.Function):
         is_rms_norm=False,
         return_dropout_mask=False,
         out=None,
-        residual_out=None
+        residual_out=None,
     ):
         x_shape_og = x.shape
         # reshape input data into 2D tensor
@@ -759,22 +790,24 @@ class LayerNormFn(torch.autograd.Function):
             out = out.reshape(-1, out.shape[-1])
         if residual_out is not None:
             residual_out = residual_out.reshape(-1, residual_out.shape[-1])
-        y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
-            x,
-            weight,
-            bias,
-            eps,
-            residual,
-            x1,
-            weight1,
-            bias1,
-            dropout_p=dropout_p,
-            rowscale=rowscale,
-            residual_dtype=residual_dtype,
-            is_rms_norm=is_rms_norm,
-            return_dropout_mask=return_dropout_mask,
-            out=out,
-            residual_out=residual_out
+        y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
+            _layer_norm_fwd(
+                x,
+                weight,
+                bias,
+                eps,
+                residual,
+                x1,
+                weight1,
+                bias1,
+                dropout_p=dropout_p,
+                rowscale=rowscale,
+                residual_dtype=residual_dtype,
+                is_rms_norm=is_rms_norm,
+                return_dropout_mask=return_dropout_mask,
+                out=out,
+                residual_out=residual_out,
+            )
         )
         ctx.save_for_backward(
             residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
@@ -789,9 +822,15 @@ class LayerNormFn(torch.autograd.Function):
         ctx.x_dtype = x.dtype
         y = y.reshape(x_shape_og)
         y1 = y1.reshape(x_shape_og) if y1 is not None else None
-        residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None
-        dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
-        dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
+        residual_out = (
+            residual_out.reshape(x_shape_og) if residual_out is not None else None
+        )
+        dropout_mask = (
+            dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
+        )
+        dropout_mask1 = (
+            dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
+        )
         if not return_dropout_mask:
             if weight1 is None:
                 return y if not prenorm else (y, residual_out)
@@ -890,7 +929,7 @@ def layer_norm_fn(
     is_rms_norm=False,
     return_dropout_mask=False,
     out=None,
-    residual_out=None
+    residual_out=None,
 ):
     return LayerNormFn.apply(
         x,
@@ -908,7 +947,7 @@ def layer_norm_fn(
         is_rms_norm,
         return_dropout_mask,
         out,
-        residual_out
+        residual_out,
     )
 
 
@@ -927,7 +966,7 @@ def rms_norm_fn(
     residual_in_fp32=False,
     return_dropout_mask=False,
     out=None,
-    residual_out=None
+    residual_out=None,
 ):
     return LayerNormFn.apply(
         x,
@@ -945,7 +984,7 @@ def rms_norm_fn(
         True,
         return_dropout_mask,
         out,
-        residual_out
+        residual_out,
     )
 
 
@@ -981,7 +1020,7 @@ class RMSNorm(torch.nn.Module):
 
 class LayerNormLinearFn(torch.autograd.Function):
     @staticmethod
-    @custom_fwd
+    @custom_fwd(device_type="cuda")
     def forward(
         ctx,
         x,
@@ -1019,17 +1058,25 @@ class LayerNormLinearFn(torch.autograd.Function):
             norm_bias,
             eps,
             residual,
-            out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(),
+            out_dtype=(
+                None
+                if not torch.is_autocast_enabled()
+                else torch.get_autocast_gpu_dtype()
+            ),
             residual_dtype=residual_dtype,
             is_rms_norm=is_rms_norm,
         )
         y = y.reshape(x_shape_og)
-        dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
+        dtype = (
+            torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
+        )
         linear_weight = linear_weight.to(dtype)
         linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
         out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
         # We don't store y, will be recomputed in the backward pass to save memory
-        ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd)
+        ctx.save_for_backward(
+            residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
+        )
         ctx.x_shape_og = x_shape_og
         ctx.eps = eps
         ctx.is_rms_norm = is_rms_norm
@@ -1040,7 +1087,7 @@ class LayerNormLinearFn(torch.autograd.Function):
         return out if not prenorm else (out, residual_out.reshape(x_shape_og))
 
     @staticmethod
-    @custom_bwd
+    @custom_bwd(device_type="cuda")
     def backward(ctx, dout, *args):
         x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
         dout = dout.reshape(-1, dout.shape[-1])
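The substantive change above is the migration from the deprecated `torch.cuda.amp.custom_fwd`/`custom_bwd` decorators to `torch.amp.custom_fwd`/`custom_bwd`, which take an explicit `device_type`; the remaining hunks (trailing commas on `residual_out=None`, wrapped call sites, the reformatted autotune `key` list) are line-length reformatting with no behavioral change. A minimal sketch of the new decorator API on a toy autograd function (the `Scale` class is illustrative, not part of this repo):

    import torch
    from torch.amp import custom_fwd, custom_bwd  # new import path used in this commit


    class Scale(torch.autograd.Function):
        # Toy example of the torch.amp decorator style adopted above.

        @staticmethod
        @custom_fwd(device_type="cuda")  # replaces torch.cuda.amp.custom_fwd
        def forward(ctx, x, alpha):
            ctx.alpha = alpha
            return x * alpha

        @staticmethod
        @custom_bwd(device_type="cuda")  # replaces torch.cuda.amp.custom_bwd
        def backward(ctx, grad_out):
            return grad_out * ctx.alpha, None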
build/torch-universal/triton_layer_norm/layers.py ADDED
@@ -0,0 +1,4 @@
+from .layer_norm import RMSNorm
+
+
+__all__ = ["RMSNorm"]