danieldk HF Staff committed on Mar 11

Commit

e21f2b2

1 Parent(s): e428458

Add layers

Browse files

Files changed (27) hide show

build/torch25-cxx11-cu118-x86_64-linux/activation/__init__.py +14 -9
build/torch25-cxx11-cu118-x86_64-linux/activation/layers.py +65 -0
build/torch25-cxx11-cu121-x86_64-linux/activation/__init__.py +14 -9
build/torch25-cxx11-cu121-x86_64-linux/activation/layers.py +65 -0
build/torch25-cxx11-cu124-x86_64-linux/activation/__init__.py +14 -9
build/torch25-cxx11-cu124-x86_64-linux/activation/layers.py +65 -0
build/torch25-cxx98-cu118-x86_64-linux/activation/__init__.py +14 -9
build/torch25-cxx98-cu118-x86_64-linux/activation/layers.py +65 -0
build/torch25-cxx98-cu121-x86_64-linux/activation/__init__.py +14 -9
build/torch25-cxx98-cu121-x86_64-linux/activation/layers.py +65 -0
build/torch25-cxx98-cu124-x86_64-linux/activation/__init__.py +14 -9
build/torch25-cxx98-cu124-x86_64-linux/activation/layers.py +65 -0
build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py +14 -9
build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py +65 -0
build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py +14 -9
build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py +65 -0
build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py +14 -9
build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py +65 -0
build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py +14 -9
build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py +65 -0
build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py +14 -9
build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py +65 -0
build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py +14 -9
build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py +65 -0
tests/kernels/test_activation.py +30 -4
torch-ext/activation/__init__.py +14 -9
torch-ext/activation/layers.py +65 -0

build/torch25-cxx11-cu118-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch25-cxx11-cu118-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch25-cxx11-cu121-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch25-cxx11-cu121-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch25-cxx11-cu124-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch25-cxx11-cu124-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch25-cxx98-cu118-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch25-cxx98-cu118-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch25-cxx98-cu121-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch25-cxx98-cu121-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch25-cxx98-cu124-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch25-cxx98-cu124-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out

tests/kernels/test_activation.py CHANGED Viewed

@@ -71,28 +71,34 @@ def test_act_and_mul(
         torch_fn = silu_and_mul
         fn = activation.silu_and_mul
         op = activation.ops.silu_and_mul
     elif activation_name == "gelu":
         torch_fn = lambda x: gelu_and_mul(x, "none")
         fn = activation.gelu_and_mul
         op = activation.ops.gelu_and_mul
     elif activation_name == "gelu_tanh":
         torch_fn = lambda x: gelu_and_mul(x, "tanh")
         fn = activation.gelu_tanh_and_mul
         op = activation.ops.gelu_tanh_and_mul
     elif activation_name == "fatrelu":
         threshold = random.uniform(0, 1)
         torch_fn = lambda x: fatrelu_and_mul(x, threshold)
         fn = lambda out, x: activation.fatrelu_and_mul(out, x, threshold)
         op = activation.ops.fatrelu_and_mul
     out_shape = x.shape[:-1] + (x.shape[-1] // 2,)
     out = torch.empty(out_shape, dtype=x.dtype, device=x.device)
     out = fn(out, x)
     ref_out = torch_fn(x)
     # The SiLU, GELU and FatReLU implementations are equivalent to the native
     # PyTorch implementations, so we can do exact comparison.
     torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
     d = x.shape[-1] // 2
     output_shape = x.shape[:-1] + (d,)
@@ -106,9 +112,24 @@ def test_act_and_mul(
 @pytest.mark.parametrize(
     "activation_fns",
     [
-        (gelu_fast, activation.gelu_fast, activation.ops.gelu_fast),
-        (gelu_new, activation.gelu_new, activation.ops.gelu_new),
-        (gelu_quick, activation.gelu_quick, activation.ops.gelu_quick),
     ],
 )
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -128,12 +149,17 @@ def test_activation(
     torch.manual_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, d, dtype=dtype)
-    torch_fn, fn, op = activation_fns
     out = fn(torch.empty_like(x), x)
     ref_out = torch_fn(x)
     torch.testing.assert_close(
         out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
     )
     out = torch.empty_like(x)
     opcheck(op, (out, x))

         torch_fn = silu_and_mul
         fn = activation.silu_and_mul
         op = activation.ops.silu_and_mul
+        layer = activation.layers.SiluAndMul()
     elif activation_name == "gelu":
         torch_fn = lambda x: gelu_and_mul(x, "none")
         fn = activation.gelu_and_mul
         op = activation.ops.gelu_and_mul
+        layer = activation.layers.GeluAndMul()
     elif activation_name == "gelu_tanh":
         torch_fn = lambda x: gelu_and_mul(x, "tanh")
         fn = activation.gelu_tanh_and_mul
         op = activation.ops.gelu_tanh_and_mul
+        layer = activation.layers.GeluTanhAndMul()
     elif activation_name == "fatrelu":
         threshold = random.uniform(0, 1)
         torch_fn = lambda x: fatrelu_and_mul(x, threshold)
         fn = lambda out, x: activation.fatrelu_and_mul(out, x, threshold)
         op = activation.ops.fatrelu_and_mul
+        layer = activation.layers.FatreluAndMul(threshold)
     out_shape = x.shape[:-1] + (x.shape[-1] // 2,)
     out = torch.empty(out_shape, dtype=x.dtype, device=x.device)
     out = fn(out, x)
+    mod_out = layer(x)
     ref_out = torch_fn(x)
     # The SiLU, GELU and FatReLU implementations are equivalent to the native
     # PyTorch implementations, so we can do exact comparison.
     torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
+    torch.testing.assert_close(mod_out, ref_out, atol=0.0, rtol=0.0)
     d = x.shape[-1] // 2
     output_shape = x.shape[:-1] + (d,)
 @pytest.mark.parametrize(
     "activation_fns",
     [
+        (
+            gelu_fast,
+            activation.gelu_fast,
+            activation.ops.gelu_fast,
+            activation.layers.FastGELU,
+        ),
+        (
+            gelu_new,
+            activation.gelu_new,
+            activation.ops.gelu_new,
+            activation.layers.NewGELU,
+        ),
+        (
+            gelu_quick,
+            activation.gelu_quick,
+            activation.ops.gelu_quick,
+            activation.layers.QuickGELU,
+        ),
     ],
 )
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
     torch.manual_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, d, dtype=dtype)
+    torch_fn, fn, op, cls = activation_fns
+    layer = cls()
     out = fn(torch.empty_like(x), x)
+    layer_out = layer(x)
     ref_out = torch_fn(x)
     torch.testing.assert_close(
         out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
     )
+    torch.testing.assert_close(
+        out, layer_out, atol=get_default_atol(out), rtol=get_default_rtol(out)
+    )
     out = torch.empty_like(x)
     opcheck(op, (out, x))

torch-ext/activation/__init__.py CHANGED Viewed

@@ -1,15 +1,8 @@
 import torch
-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _activation
-        ops = torch.ops._activition
-    except ImportError:
-        raise e
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
@@ -45,3 +38,15 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out

 import torch
+from ._ops import ops
+from . import layers
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_quick(out, x)
     return out
+__all__ = [
+    "silu_and_mul",
+    "gelu_and_mul",
+    "gelu_tanh_and_mul",
+    "fatrelu_and_mul",
+    "gelu_fast",
+    "gelu_new",
+    "gelu_quick",
+    "layers",
+]

torch-ext/activation/layers.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class SiluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.silu_and_mul(out, x)
+        return out
+class GeluAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_and_mul(out, x)
+        return out
+class GeluTanhAndMul(nn.Module):
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.gelu_tanh_and_mul(out, x)
+        return out
+class FatreluAndMul(nn.Module):
+    def __init__(self, threshold: float = 0.0):
+        super().__init__()
+        self.threshold = threshold
+    def forward(self, x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.fatrelu_and_mul(out, x, self.threshold)
+        return out
+class FastGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_fast(out, x)
+        return out
+class NewGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_new(out, x)
+        return out
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out