Spaces:
Runtime error
Runtime error
| import numpy as np | |
| import torch | |
| import torch.nn as nn | |
| from torch.cuda.amp import custom_bwd, custom_fwd | |
| from transformers.models.llama.modeling_llama import LlamaMLP | |
try:
    import triton
    import triton.language as tl
    from . import custom_autotune

    # code based https://github.com/fpgaminer/GPTQ-triton
    # NOTE(review): the launch site calls `fusedmatmul_248_kernel[grid](...)` and
    # reads META['BLOCK_SIZE_M'] / META['BLOCK_SIZE_N'] without passing any of the
    # constexpr tile sizes, which only works if this function is wrapped by
    # `triton.jit` plus an autotuner that supplies them (`custom_autotune` is
    # imported but unused here). No decorators are visible on either function in
    # this copy of the file -- confirm they were not lost.
    def fusedmatmul_248_kernel(a_ptr, c_ptr, b1_ptr, scales1_ptr, zeros1_ptr, g1_ptr, b2_ptr, scales2_ptr, zeros2_ptr, g2_ptr, M, N, K, bits, maxq, stride_am, stride_ak, stride_bk, stride_bn,
                               stride_cm, stride_cn, stride_scales, stride_zeros, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
        """
        Computes: C = silu(A @ B1) * (A @ B2)

        B1 and B2 are quantized weights that are unpacked and dequantized on
        the fly, so both products share a single pass over A.

        A is of shape (M, K), float16 expected
        B1/B2 are bit-packed int32 words of shape (K // (32 // bits), N); each
            word holds 32 // bits consecutive K entries of one output column
        C is of shape (M, N) and is written as float16
        g1/g2 hold one index per input feature (length K) selecting the row of
            scales/zeros to use for that feature
        scales1/scales2 have N entries per row; rows are selected through g1/g2
            and addressed with stride_scales
        zeros1/zeros2 are bit-packed words with N // (32 // bits) entries per
            row; rows are selected through g1/g2 and addressed with stride_zeros
        maxq is the bit mask / largest quantized value used when unpacking

        The same stride_bk/stride_bn, stride_scales and stride_zeros are applied
        to both weight sets, so B2's tensors must be laid out like B1's.

        Only the M dimension is masked on the load of A, and the loads of B,
        scales, zeros and g are unmasked -- presumably N and K are multiples of
        the tile sizes; TODO confirm.
        """
        # Number of quantized values packed into one 32-bit word.
        infearure_per_bits = 32 // bits

        # Map the 1-D program id onto an output tile (pid_m, pid_n), walking the
        # tiles in groups of GROUP_SIZE_M row-tiles.
        pid = tl.program_id(axis=0)
        num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
        num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
        num_pid_in_group = GROUP_SIZE_M * num_pid_n
        group_id = pid // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + (pid % group_size_m)
        pid_n = (pid % num_pid_in_group) // group_size_m

        offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
        offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
        offs_k = tl.arange(0, BLOCK_SIZE_K)
        a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
        a_mask = (offs_am[:, None] < M)
        # b_ptrs is set up such that it repeats elements along the K axis
        # infearure_per_bits times (every K entry of a packed word points at that word)
        b1_ptrs = b1_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
        b2_ptrs = b2_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
        g1_ptrs = g1_ptr + offs_k
        g2_ptrs = g2_ptr + offs_k
        # Column pointers into scales/zeros; the row offset (from g_idx) is added per K tile.
        scales1_ptrs = scales1_ptr + offs_bn[None, :]
        scales2_ptrs = scales2_ptr + offs_bn[None, :]
        zeros1_ptrs = zeros1_ptr + (offs_bn[None, :] // infearure_per_bits)
        zeros2_ptrs = zeros2_ptr + (offs_bn[None, :] // infearure_per_bits)

        # shifter is used to extract the N bits of each element in the 32-bit word from B
        shifter = (offs_k % infearure_per_bits) * bits
        # zeros are packed along the output (N) axis, so their shift depends on the column
        zeros_shifter = (offs_bn % infearure_per_bits) * bits
        accumulator1 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        accumulator2 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        for k in range(0, num_pid_k):
            g1_idx = tl.load(g1_ptrs)
            g2_idx = tl.load(g2_ptrs)

            # Fetch scales and zeros for this K tile; the row is chosen per input feature via g_idx
            scales1 = tl.load(scales1_ptrs + g1_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
            scales2 = tl.load(scales2_ptrs + g2_idx[:, None] * stride_scales)

            zeros1 = tl.load(zeros1_ptrs + g1_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
            zeros1 = (zeros1 >> zeros_shifter[None, :]) & maxq
            # Unpacked zero-points are shifted by one before use (presumably the
            # packer stores zero - 1 -- confirm against the quantizer).
            zeros1 = (zeros1 + 1)

            zeros2 = tl.load(zeros2_ptrs + g2_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
            zeros2 = (zeros2 >> zeros_shifter[None, :]) & maxq
            zeros2 = (zeros2 + 1)

            a = tl.load(a_ptrs, mask=a_mask, other=0.)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
            b1 = tl.load(b1_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
            b2 = tl.load(b2_ptrs)

            # Now we need to unpack b (which is N-bit values) into 32-bit values
            b1 = (b1 >> shifter[:, None]) & maxq  # Extract the N-bit values
            b1 = (b1 - zeros1) * scales1  # Scale and shift
            accumulator1 += tl.dot(a, b1)

            b2 = (b2 >> shifter[:, None]) & maxq
            b2 = (b2 - zeros2) * scales2
            accumulator2 += tl.dot(a, b2)

            # Advance to the next K tile. A is stepped using its K stride so the
            # advance matches how a_ptrs was built (identical when stride_ak == 1).
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b1_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
            b2_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
            g1_ptrs += BLOCK_SIZE_K
            g2_ptrs += BLOCK_SIZE_K

        # Gate activation, then the element-wise product of the two branches.
        accumulator1 = silu(accumulator1)
        c = accumulator1 * accumulator2
        c = c.to(tl.float16)
        c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
        c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
        tl.store(c_ptrs, c, mask=c_mask)

    def silu(x):
        """Return x * sigmoid(x), the activation applied to the first accumulator."""
        return x * tl.sigmoid(x)
except ImportError:
    # Only a failed import means Triton (or the local autotune helper) is
    # unavailable; any other error is a real bug and must not be reported as
    # "not installed".
    print('triton not installed.')
class QuantLlamaMLP(nn.Module):
    """
    MLP block that evaluates the gate and up projections with one fused
    Triton kernel launch and then applies the given down projection.

    The quantized tensors (qweight, scales, qzeros, g_idx) of `gate_proj` and
    `up_proj` are registered as buffers on this module under
    `gate_proj_*` / `up_proj_*`; `down_proj` is kept as-is and called on the
    fused result.
    """

    # Names of the buffers moved by fused2cuda / fused2cpu, in move order.
    _FUSED_BUFFERS = (
        'gate_proj_qweight',
        'gate_proj_scales',
        'gate_proj_qzeros',
        'gate_proj_g_idx',
        'up_proj_qweight',
        'up_proj_scales',
        'up_proj_qzeros',
        'up_proj_g_idx',
    )

    def __init__(
        self,
        gate_proj,
        down_proj,
        up_proj,
    ):
        super().__init__()

        # Register gate_proj_* first, then up_proj_*, each in the order
        # qweight, scales, qzeros, g_idx.
        for prefix, proj in (('gate_proj', gate_proj), ('up_proj', up_proj)):
            for field in ('qweight', 'scales', 'qzeros', 'g_idx'):
                self.register_buffer(f'{prefix}_{field}', getattr(proj, field))

        # Sizes and quantization settings are read from the gate projection;
        # only the final output width comes from the down projection.
        self.infeatures = gate_proj.infeatures
        self.intermediate_size = gate_proj.outfeatures
        self.outfeatures = down_proj.outfeatures
        self.bits = gate_proj.bits
        self.maxq = gate_proj.maxq
        self.down_proj = down_proj

    def forward(self, x):
        """Run the fused gate/up stage and feed its result to `down_proj`."""
        fused = self.triton_llama_mlp(x)
        return self.down_proj(fused)

    def triton_llama_mlp(self, x):
        """
        Launch the fused kernel on `x` and return a float16 tensor whose last
        dimension is `intermediate_size` and whose leading dimensions match `x`.
        """
        with torch.cuda.device(x.device):
            leading_dims = x.shape[:-1]
            out_shape = leading_dims + (self.intermediate_size, )

            # The kernel works on a 2-D (M, K) view of the input.
            x = x.reshape(-1, x.shape[-1])
            M, K = x.shape
            N = self.intermediate_size
            c = torch.empty((M, N), device='cuda', dtype=torch.float16)

            def grid(META):
                # One program per (BLOCK_SIZE_M x BLOCK_SIZE_N) output tile.
                tiles_m = triton.cdiv(M, META['BLOCK_SIZE_M'])
                tiles_n = triton.cdiv(N, META['BLOCK_SIZE_N'])
                return (tiles_m * tiles_n, )

            gate_qweight = self.gate_proj_qweight
            gate_scales = self.gate_proj_scales
            gate_qzeros = self.gate_proj_qzeros

            # Strides of the gate tensors are passed for both weight sets.
            fusedmatmul_248_kernel[grid](
                x, c,
                gate_qweight, gate_scales, gate_qzeros, self.gate_proj_g_idx,
                self.up_proj_qweight, self.up_proj_scales, self.up_proj_qzeros, self.up_proj_g_idx,
                M, N, K,
                self.bits, self.maxq,
                x.stride(0), x.stride(1),
                gate_qweight.stride(0), gate_qweight.stride(1),
                c.stride(0), c.stride(1),
                gate_scales.stride(0), gate_qzeros.stride(0),
            )
            return c.reshape(out_shape)

    def fused2cuda(self):
        """Move the eight fused gate/up buffers to CUDA (down_proj is not touched)."""
        for buffer_name in self._FUSED_BUFFERS:
            setattr(self, buffer_name, getattr(self, buffer_name).cuda())

    def fused2cpu(self):
        """Move the eight fused gate/up buffers to the CPU (down_proj is not touched)."""
        for buffer_name in self._FUSED_BUFFERS:
            setattr(self, buffer_name, getattr(self, buffer_name).cpu())
def make_fused_mlp(m, parent_name=''):
    """
    Swap every LlamaMLP found in `m` for a QuantLlamaMLP built from its
    gate_proj, down_proj and up_proj.

    If `m` itself is a LlamaMLP the replacement is returned; otherwise the
    children are processed recursively, replaced in place on their parent,
    and `m` is returned. `parent_name` is only threaded through the
    recursion as a dotted path.
    """
    if isinstance(m, LlamaMLP):
        return QuantLlamaMLP(m.gate_proj, m.down_proj, m.up_proj)

    for name, child in m.named_children():
        child_path = f"{parent_name}.{name}"
        fused = make_fused_mlp(child, parent_name=child_path)
        if not isinstance(fused, QuantLlamaMLP):
            continue
        # Re-attach under the same attribute name so the parent now uses the fused module.
        setattr(m, name, fused)
    return m
def autotune_warmup_fused(model):
    """
    Pre-tunes the quantized kernel

    Moves the fused buffers of every QuantLlamaMLP in `model` to CUDA, keeps
    the first module seen for each distinct (infeatures, intermediate_size)
    pair, and runs that module's fused kernel on random float16 inputs with
    1, 2, 4, ..., 2048 rows. Afterwards the kept modules have their fused
    buffers moved back to the CPU.

    NOTE(review): every fused MLP is moved to CUDA, but only the one kept per
    (K, N) pair is moved back to the CPU -- confirm this asymmetry is intended.
    """
    from tqdm import tqdm

    kn_values = {}
    for _, module in model.named_modules():
        if not isinstance(module, QuantLlamaMLP):
            continue

        module.fused2cuda()
        # Keep only the first module for each (K, N) shape.
        kn_values.setdefault((module.infeatures, module.intermediate_size), module)

    print(f'Found {len(kn_values)} unique fused mlp KN values.')

    print('Warming up autotune cache ...')
    with torch.no_grad():
        for exponent in tqdm(range(0, 12)):
            rows = 2**exponent  # 1, 2, 4, ..., 2048
            for (k, _), module in kn_values.items():
                a = torch.randn(rows, k, dtype=torch.float16, device='cuda')
                module.triton_llama_mlp(a)

    # Move the warmed-up modules back; no input tensor is needed for this, so
    # the unused CUDA allocation the loop used to make has been dropped.
    for module in kn_values.values():
        module.fused2cpu()
    del kn_values