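Update binned_copy kernels: swap launch-grid axes (grid is now (expert_capacity, num_experts); entry index read from program_id(0), expert index from program_id(1))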

Files changed (7)

build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/kernels.py +7 -7
build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py +7 -7
build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py +7 -7
build/torch28-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py +7 -7
build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py +7 -7
build/torch28-cxx11-cu129-x86_64-linux/megablocks/backend/kernels.py +7 -7
torch-ext/megablocks/backend/kernels.py +7 -7

build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,

build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,

build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,

build/torch28-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,

build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,

build/torch28-cxx11-cu129-x86_64-linux/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,

torch-ext/megablocks/backend/kernels.py CHANGED

@@ -352,8 +352,8 @@ def _binned_copy(
     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
@@ -416,7 +416,7 @@ def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         x,
         out,
         num_experts,
@@ -445,7 +445,7 @@ def binned_scatter(x, indices, weights, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
-    _binned_copy[(num_experts, expert_capacity)](
         out,
         x,
         num_experts,
@@ -492,8 +492,8 @@ def _binned_copy_wgrad(
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
-    expert_idx = tl.program_id(0)
-    entry_idx = tl.program_id(1)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
@@ -543,7 +543,7 @@ def binned_scatter_wgrad(x, grad, indices, bins, top_k):
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
-    _binned_copy_wgrad[(num_experts, expert_capacity)](
         x,
         grad,
         out,

     SCALE: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_b = expert_idx * expert_capacity + entry_idx
     num_experts = bins.shape[0]
     out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         x,
         out,
         num_experts,
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+    _binned_copy[(expert_capacity, num_experts)](
         out,
         x,
         num_experts,
     BLOCK_X: tl.constexpr,
 ):
     # Load our indices into the output.
+    expert_idx = tl.program_id(1)
+    entry_idx = tl.program_id(0)
     # Calculate our offset into the output.
     index_x = expert_idx * expert_capacity + entry_idx
     num_experts, expert_capacity, hidden_size = x.shape
     tokens = indices.shape[0] // top_k
     out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+    _binned_copy_wgrad[(expert_capacity, num_experts)](
         x,
         grad,
         out,