Files changed (3)
  1. config.json +7 -4
  2. modeling_qwen.py +181 -201
  3. test_passkey_retrieval.py +97 -0
config.json CHANGED
@@ -7,7 +7,7 @@
     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
   },
   "attn_dropout_prob": 0.0,
-  "bf16": false,
+  "bf16": true,
   "emb_dropout_prob": 0.0,
   "fp16": false,
   "fp32": false,
@@ -30,8 +30,11 @@
   "tokenizer_class": "QWenTokenizer",
   "transformers_version": "4.32.0",
   "use_cache": true,
-  "use_dynamic_ntk": true,
-  "use_flash_attn": "auto",
+  "use_dynamic_ntk": false,
+  "use_flash_attn": false,
+  "use_rerope": true,
+  "rerope_window": 512,
+  "forward_max_length": 32768,
   "use_logn_attn": true,
   "vocab_size": 151936
-}
+}
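
The new keys are consumed by the patched modeling_qwen.py below: QWenAttention reads use_rerope, rerope_window and forward_max_length, and QWenModel asserts that use_dynamic_ntk is off and use_rerope is on. As a quick sanity check, here is a minimal sketch, assuming the patched files live in a local checkpoint directory (the path is only an example):

from transformers import AutoConfig

# Example path; point it at the directory holding this config.json and modeling_qwen.py.
config = AutoConfig.from_pretrained("/models/Qwen-7B-Chat-ReRoPE", trust_remote_code=True)

# QWenModel.__init__ in the patched modeling_qwen.py asserts these two flags,
# so it is worth checking them before loading the full model.
assert config.use_rerope is True
assert config.use_dynamic_ntk is False

# The flash-attn branch is not used by the ReRoPE code paths in this patch.
assert config.use_flash_attn is False

print(config.rerope_window, config.forward_max_length)  # 512, 32768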
modeling_qwen.py CHANGED
@@ -7,6 +7,7 @@ import copy
 import importlib
 import math
 import pathlib
+import pdb
 from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
 
 import torch
@@ -28,6 +29,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+import numpy as np
 
 try:
     from einops import rearrange
@@ -241,6 +243,7 @@ class QWenAttention(nn.Module):
 
         self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
         self.seq_length = config.seq_length
+        self.forward_max_length = config.forward_max_length
 
         self.hidden_size = config.hidden_size
         self.split_size = config.hidden_size
@@ -276,17 +279,19 @@ class QWenAttention(nn.Module):
 
         self.use_dynamic_ntk = config.use_dynamic_ntk
         self.use_logn_attn = config.use_logn_attn
+        self.use_rerope = config.use_rerope
+        self.rerope_window = config.rerope_window
+        self.causal = True
 
         logn_list = [
             math.log(i, self.seq_length) if i > self.seq_length else 1
-            for i in range(1, 32768)
+            for i in range(1, self.forward_max_length)
         ]
         logn_tensor = torch.tensor(logn_list)[None, :, None, None]
         self.register_buffer("logn_tensor", logn_tensor, persistent=False)
 
         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
         self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False
-        self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
         self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
         cache_dtype = torch.float
         if self.bf16:
@@ -296,102 +301,60 @@ class QWenAttention(nn.Module):
         self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
         self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
 
-        if config.use_cache_quantization and config.use_cache_kernel:
-            # pre check if the support files existing
-            module_root = pathlib.Path(__file__).parent
-            src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu")
-            if any(not (module_root/src).is_file() for src in src_files):
-                warnings.warn("KV cache kernel source files (.cpp and .cu) not found.")
-                self.cache_kernels = None
-            else:
-                try:
-                    from .cpp_kernels import cache_autogptq_cuda_256
-                    self.cache_kernels = cache_autogptq_cuda_256
-                except ImportError:
-                    warnings.warn("Failed to import KV cache kernels.")
-                    self.cache_kernels = None
-
-    def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
-        device = query.device
-        if self.use_cache_quantization:
-            qk, qk_scale, qk_zero = key
-            if self.use_cache_kernel and self.cache_kernels is not None:
-                shape = query.shape[:-1] + (qk.shape[-2],)
-                attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
-                self.cache_kernels.vecquant8matmul_batched_faster_old(
-                    query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
-                    qk.transpose(-1, -2).contiguous(),
-                    attn_weights,
-                    qk_scale.contiguous() if qk_scale.dtype == torch.float16 else qk_scale.to(torch.float16).contiguous(),
-                    qk_zero.contiguous() if qk_zero.dtype == torch.float16 else qk_zero.to(torch.float16).contiguous())
-                # attn_weights = attn_weights.to(query.dtype).contiguous()
-            else:
-                key = dequantize_cache_torch(qk, qk_scale, qk_zero)
-                attn_weights = torch.matmul(query, key.transpose(-1, -2))
-        else:
-            attn_weights = torch.matmul(query, key.transpose(-1, -2))
 
+    def _upcast_and_reordered_attn(
+        self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
+    ):
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+
+        attn_weights = torch.empty(
+            bsz * num_heads,
+            q_seq_len,
+            k_seq_len,
+            dtype=torch.float32,
+            device=query.device,
+        )
+
+        scale_factor = 1.0
         if self.scale_attn_weights:
-            if self.use_cache_quantization:
-                size_temp = value[0].size(-1)
-            else:
-                size_temp = value.size(-1)
-            attn_weights = attn_weights / torch.full(
-                [],
-                size_temp ** 0.5,
-                dtype=attn_weights.dtype,
-                device=attn_weights.device,
+            scale_factor /= float(value.size(-1)) ** 0.5
+
+        with autocast(enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
+                -1, dk, k_seq_len
             )
-        if self.use_cache_quantization:
-            query_length, key_length = query.size(-2), key[0].size(-2)
-        else:
-            query_length, key_length = query.size(-2), key.size(-2)
+            attn_weights = torch.baddbmm(
+                attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
+            )
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+
+        query_length, key_length = query.size(-2), key.size(-2)
         causal_mask = registered_causal_mask[
             :, :, key_length - query_length : key_length, :key_length
         ]
         mask_value = torch.finfo(attn_weights.dtype).min
-        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
+        mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
             attn_weights.device
         )
-        attn_weights = torch.where(
-            causal_mask, attn_weights.to(attn_weights.dtype), mask_value
-        )
+        attn_weights = torch.where(causal_mask, attn_weights, mask_value)
 
         if attention_mask is not None:
             attn_weights = attn_weights + attention_mask
 
-        if self.softmax_in_fp32:
-            attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
-        else:
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
-        attn_weights = attn_weights.type(query.dtype)
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError(
+                "Error with upcasting, attn_weights does not have dtype torch.float32"
+            )
+        attn_weights = attn_weights.type(value.dtype)
         attn_weights = self.attn_dropout(attn_weights)
 
         if head_mask is not None:
             attn_weights = attn_weights * head_mask
 
-        if self.use_cache_quantization:
-            qv, qv_scale, qv_zero = value
-            if self.use_cache_kernel and self.cache_kernels is not None:
-                shape = attn_weights.shape[:-1] + (query.shape[-1],)
-                attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
-                self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old(
-                    attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
-                    qv.contiguous(),  # dtype: int32
-                    attn_output,
-                    qv_scale.contiguous() if qv_scale.dtype == torch.float16 else qv_scale.to(torch.float16).contiguous(),
-                    qv_zero.contiguous() if qv_zero.dtype == torch.float16 else qv_zero.to(torch.float16).contiguous())
-                if attn_output.dtype != query.dtype:
-                    attn_output = attn_output.to(query.dtype)
-                    attn_weights = attn_weights.to(query.dtype)
-            else:
-                value = dequantize_cache_torch(qv, qv_scale, qv_zero)
-                attn_output = torch.matmul(attn_weights, value)
-        else:
-            attn_output = torch.matmul(attn_weights, value)
-
-        attn_output = attn_output.transpose(1, 2)
+        attn_output = torch.matmul(attn_weights, value)
 
         return attn_output, attn_weights
 
@@ -404,11 +367,31 @@ class QWenAttention(nn.Module):
         tensor = tensor.contiguous()
         new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
         return tensor.view(new_shape)
+
+    def rotate_half(self, x):
+        """Rotates half the hidden dims of the input."""
+        x1 = x[..., :x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2:]
+        return torch.cat((-x2, x1), dim=-1)
+
+    def apply_rotary_pos_emb_rerope(self, query, key, cos, sin, position_ids):
+        # take bsz into consideration
+        assert 1 == position_ids.shape[0]
+
+        cos = cos.squeeze(0).squeeze(1)
+        cos = cos[position_ids][:, :, None, :]  # [bs, seq_len, 1, dim] to [1, pos_len, 1, dim]
+        sin = sin.squeeze(0).squeeze(1)
+        sin = sin[position_ids][:, :, None, :]  # [bs, seq_len, 1, dim] to [1, pos_len, 1, dim]
+
+        q_embed = ((query * cos[:, -query.shape[1]:]) + (self.rotate_half(query) * sin[:, -query.shape[1]:])).to(query.dtype) if query is not None else None
+        k_embed = ((key * cos) + (self.rotate_half(key) * sin)).to(key.dtype) if key is not None else None
+        return q_embed, k_embed
 
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
         rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
+        registered_causal_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
@@ -425,116 +408,101 @@ class QWenAttention(nn.Module):
         key = self._split_heads(key, self.num_heads, self.head_dim)
         value = self._split_heads(value, self.num_heads, self.head_dim)
 
-        if rotary_pos_emb_list is not None:
-            cur_len = query.shape[1]
-            if len(rotary_pos_emb_list) == 1:
-                rotary_pos_emb = rotary_pos_emb_list[0]
-                rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
-                rotary_pos_emb = (rotary_pos_emb,) * 2
-                q_pos_emb, k_pos_emb = rotary_pos_emb
-                # Slice the pos emb for current inference
-                query = apply_rotary_pos_emb(query, q_pos_emb)
-                key = apply_rotary_pos_emb(key, k_pos_emb)
-            else:
-                query_list = []
-                key_list = []
-                for i, rotary_pos_emb in enumerate(rotary_pos_emb_list):
-                    rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
-                    rotary_pos_emb = (rotary_pos_emb,) * 2
-                    q_pos_emb, k_pos_emb = rotary_pos_emb
-                    # Slice the pos emb for current inference
-                    query_list += [apply_rotary_pos_emb(query[i:i+1, :, :], q_pos_emb)]
-                    key_list += [apply_rotary_pos_emb(key[i:i+1, :, :], k_pos_emb)]
-                query = torch.cat(query_list, dim=0)
-                key = torch.cat(key_list, dim=0)
-
-        if self.use_cache_quantization:
-            key = quantize_cache_v(key.permute(0, 2, 1, 3),
-                                   bits=8,
-                                   qmin=self.cache_qmin,
-                                   qmax=self.cache_qmax)
-            value = quantize_cache_v(value.permute(0, 2, 1, 3),
-                                     bits=8,
-                                     qmin=self.cache_qmin,
-                                     qmax=self.cache_qmax)
-
-
-        if layer_past is not None:
-            past_key, past_value = layer_past[0], layer_past[1]
-            if self.use_cache_quantization:
-                # use_cache_quantization:
-                #   present=((q_key,key_scale,key_zero_point),
-                #            (q_value,value_scale,value_zero_point))
-                key = (torch.cat((past_key[0], key[0]), dim=2),
-                       torch.cat((past_key[1], key[1]), dim=2),
-                       torch.cat((past_key[2], key[2]), dim=2))
-                value = (torch.cat((past_value[0], value[0]), dim=2),
-                         torch.cat((past_value[1], value[1]), dim=2),
-                         torch.cat((past_value[2], value[2]), dim=2))
-            else:
-                # not use_cache_quantization:
-                #   present=(key,value)
+        q_len = hidden_states.shape[1]
+        assert rotary_pos_emb_list is not None
+        assert output_attentions is False
+
+        # TODO
+        # 1. removed dynamic quantization
+        # 2. logn scaling is applied
+        # 3. about to add context rotary_emb_apply
+
+        cos, sin = rotary_pos_emb_list[0]
+        assert len(rotary_pos_emb_list) == 1
+
+        if q_len == 1:
+            # position_ids = torch.tensor([[layer_past[0].shape[1]]], dtype=torch.int64, device=query.device)
+            # query *= ((position_ids.flatten() + 1)[None, :, None, None].log() / np.log(self.train_length)).clip(1).to(query.dtype)
+
+            if layer_past is not None:
+                past_key, past_value = layer_past[0], layer_past[1]
                 key = torch.cat((past_key, key), dim=1)
                 value = torch.cat((past_value, value), dim=1)
-
-        if use_cache:
             present = (key, value)
-        else:
-            present = None
 
-        if self.use_logn_attn and not self.training:
-            if self.use_cache_quantization:
-                seq_start = key[0].size(2) - query.size(1)
-                seq_end = key[0].size(2)
-            else:
+            # position embedding
+            position_ids = torch.arange(layer_past[0].shape[1] + 1, device=query.device).unsqueeze(0)
+            position_ids = (position_ids[:, -1] - position_ids).clip(max=self.rerope_window)
+            _, key = self.apply_rotary_pos_emb_rerope(None, key, cos, -sin, position_ids)
+
+            if self.use_logn_attn:
                 seq_start = key.size(1) - query.size(1)
                 seq_end = key.size(1)
-            logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query)
-            query = query * logn_tensor.expand_as(query)
-
-        if (
-            self.use_flash_attn
-            and flash_attn_unpadded_func is not None
-            and not self.is_fp32
-            and query.is_cuda
-        ):
-            q, k, v = query, key, value
-            attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
-        else:
-            registered_causal_mask = torch.tril(
-                torch.ones((key.size(1), key.size(1)), dtype=torch.bool, device=key.device)
-            ).view(1, 1, key.size(1), key.size(1))
+                logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query)
+                query = query * logn_tensor.expand_as(query)
+            # attn
             query = query.permute(0, 2, 1, 3)
-            if not self.use_cache_quantization:
-                key = key.permute(0, 2, 1, 3)
-                value = value.permute(0, 2, 1, 3)
-            if (
-                registered_causal_mask is None
-                and self.use_flash_attn
-                and flash_attn_unpadded_func is not None
-                and not self.is_fp32
-                and not query.is_cuda
-            ):
-                raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
-
-            if not self.use_cache_quantization and SUPPORT_TORCH2:
-                causal_mask = registered_causal_mask[
-                    :, :, key.size(-2) - query.size(-2): key.size(-2), :key.size(-2)
-                ]
-                if attention_mask is not None:
-                    attention_mask = attention_mask.expand(
-                        -1, -1, causal_mask.size(2), -1
-                    ).masked_fill(~causal_mask, torch.finfo(query.dtype).min)
-                else:
-                    attention_mask = causal_mask
-                attn_output = F.scaled_dot_product_attention(
-                    query, key, value, attn_mask=attention_mask
-                ).transpose(1, 2)
-                attn_weight = None
+            key = key.permute(0, 2, 1, 3)
+            value = value.permute(0, 2, 1, 3)
+
+            causal_mask = registered_causal_mask[
+                :, :, key.size(-2) - query.size(-2): key.size(-2), :key.size(-2)
+            ]
+            if attention_mask is not None:
+                attention_mask = attention_mask.expand(
+                    -1, -1, causal_mask.size(2), -1
+                ).masked_fill(~causal_mask, torch.finfo(query.dtype).min)
             else:
-                attn_output, attn_weight = self._attn(
-                    query, key, value, registered_causal_mask, attention_mask, head_mask
-                )
+                attention_mask = causal_mask
+
+
+            attn_output = F.scaled_dot_product_attention(
+                query, key, value, attn_mask=attention_mask
+            ).transpose(1, 2)
+
+        else:
+            # prefill
+            position_ids = torch.arange(query.shape[1], device=query.device).unsqueeze(0)
+            # query *= ((position_ids.flatten() + 1)[None, :, None, None].log() / np.log(self.train_length)).clip(1).to(query.dtype)
+            present = (key, value)
+
+            if self.use_logn_attn:
+                seq_start = key.size(1) - query.size(1)
+                seq_end = key.size(1)
+                logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query)
+                query = query * logn_tensor.expand_as(query)
+
+            query_states1, key_states1 = self.apply_rotary_pos_emb_rerope(query, key, cos, sin, position_ids)
+            query_states2, _ = self.apply_rotary_pos_emb_rerope(query, None, cos, sin, position_ids * 0 + self.rerope_window)
+
+            query_states1 = query_states1.permute(0, 2, 1, 3)
+            query_states2 = query_states2.permute(0, 2, 1, 3)
+            key_states1 = key_states1.permute(0, 2, 1, 3)
+            key_states2 = key.to(key_states1.dtype).permute(0, 2, 1, 3)
+            value = value.permute(0, 2, 1, 3)
+
+            sm_scale = 1.0 / math.sqrt(self.head_dim)
+            attn_weights1 = torch.matmul(query_states1, key_states1.transpose(2, 3)) * sm_scale
+            attn_weights2 = torch.matmul(query_states2, key_states2.transpose(2, 3)) * sm_scale
+            rectified_mask = (position_ids[:, -q_len:, None] - position_ids[:, None]).abs() < self.rerope_window
+            attn_weights = torch.where(rectified_mask, attn_weights1, attn_weights2)
+
+            if self.causal:
+                tgt_len = attn_weights.shape[-1]
+                dtype = attn_weights.dtype
+                device = attn_weights.device
+                mask = torch.full((tgt_len, tgt_len),
+                                  torch.finfo(dtype).min,
+                                  device=device)
+                mask_cond = torch.arange(mask.size(-1), device=device)
+                mask.masked_fill_(
+                    mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+                mask = mask.to(dtype)
+                attn_weights = attn_weights + mask
+
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+            attn_output = torch.matmul(attn_weights, value).transpose(1, 2)
+
         context_layer = self._merge_heads(
             attn_output, self.num_heads, self.head_dim
         )
@@ -542,15 +510,6 @@ class QWenAttention(nn.Module):
         attn_output = self.c_proj(context_layer)
 
         outputs = (attn_output, present)
-        if output_attentions:
-            if (
-                self.use_flash_attn
-                and flash_attn_unpadded_func is not None
-                and not self.is_fp32
-            ):
-                raise ValueError("Cannot output attentions while using flash-attn")
-            else:
-                outputs += (attn_weight,)
 
         return outputs
 
@@ -596,6 +555,7 @@ class QWenBlock(nn.Module):
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
         rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
+        registered_causal_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -609,6 +569,7 @@ class QWenBlock(nn.Module):
         attn_outputs = self.attn(
             layernorm_output,
             rotary_pos_emb_list,
+            registered_causal_mask=registered_causal_mask,
             layer_past=layer_past,
             attention_mask=attention_mask,
             head_mask=head_mask,
@@ -682,10 +643,13 @@ class QWenModel(QWenPreTrainedModel):
         self.vocab_size = config.vocab_size
         self.num_hidden_layers = config.num_hidden_layers
         self.embed_dim = config.hidden_size
-        self.use_cache_quantization = self.config.use_cache_quantization if hasattr(self.config, 'use_cache_quantization') else False
 
         self.gradient_checkpointing = False
         self.use_dynamic_ntk = config.use_dynamic_ntk
+        assert self.use_dynamic_ntk is False
+        self.use_rerope = config.use_rerope
+        self.rerope_window = config.rerope_window
+        assert self.use_rerope is True
         self.seq_length = config.seq_length
 
         self.wte = nn.Embedding(self.vocab_size, self.embed_dim)
@@ -708,6 +672,21 @@ class QWenModel(QWenPreTrainedModel):
 
         self.use_flash_attn = config.use_flash_attn
         self.is_fp32 = not (config.bf16 or config.fp16)
+        if (
+            self.use_flash_attn
+            and flash_attn_unpadded_func is not None
+            and not self.is_fp32
+        ):
+            self.registered_causal_mask = None
+        else:
+            max_positions = config.max_position_embeddings
+            self.register_buffer(
+                "registered_causal_mask",
+                torch.tril(
+                    torch.ones((max_positions, max_positions), dtype=torch.bool)
+                ).view(1, 1, max_positions, max_positions),
+                persistent=False,
+            )
 
         self.h = nn.ModuleList(
             [
@@ -792,10 +771,7 @@ class QWenModel(QWenPreTrainedModel):
             past_length = 0
             past_key_values = tuple([None] * len(self.h))
         else:
-            if self.use_cache_quantization:
-                past_length = past_key_values[0][0][0].size(2)
-            else:
-                past_length = past_key_values[0][0].size(-2)
+            past_length = past_key_values[0][0].size(-2)
         if position_ids is None:
             position_ids = torch.arange(
                 past_length,
@@ -823,10 +799,7 @@ class QWenModel(QWenPreTrainedModel):
         kv_seq_len = hidden_states.size()[1]
         if past_key_values[0] is not None:
             # past key values[0][0] shape: bs * seq_len * head_num * dim
-            if self.use_cache_quantization:
-                kv_seq_len += past_key_values[0][0][0].shape[2]
-            else:
-                kv_seq_len += past_key_values[0][0].shape[1]
+            kv_seq_len += past_key_values[0][0].shape[1]
 
         if self.training or not self.use_dynamic_ntk:
             ntk_alpha_list = [1.0]
@@ -844,10 +817,15 @@ class QWenModel(QWenPreTrainedModel):
                 ntk_alpha = self.get_ntk_alpha(kv_seq_len)
                 ntk_alpha_list.append(ntk_alpha)
         self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
+        if kv_seq_len > 1:
+            # prefill
+            rotary_emb_seq_len = max(kv_seq_len, self.rerope_window + 1)
+        else:
+            rotary_emb_seq_len = kv_seq_len
         rotary_pos_emb_list = [
-            self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list
+            self.rotary_emb(rotary_emb_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list
         ]
-
+
         hidden_states = self.drop(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
 
@@ -879,6 +857,7 @@ class QWenModel(QWenPreTrainedModel):
                     create_custom_forward(block),
                     hidden_states,
                     rotary_pos_emb_list,
+                    self.registered_causal_mask,
                     None,
                     attention_mask,
                    head_mask[i],
@@ -890,6 +869,7 @@ class QWenModel(QWenPreTrainedModel):
                     hidden_states,
                     layer_past=layer_past,
                     rotary_pos_emb_list=rotary_pos_emb_list,
+                    registered_causal_mask=self.registered_causal_mask,
                     attention_mask=attention_mask,
                     head_mask=head_mask[i],
                     encoder_hidden_states=encoder_hidden_states,
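
For orientation, the prefill branch of QWenAttention.forward above computes two score matrices and picks between them per query/key pair: one from queries and keys rotated with their true positions, and one from queries rotated at a fixed offset of rerope_window against unrotated keys. Below is a minimal, self-contained sketch of that rectification step, with toy shapes and illustrative names (no logn scaling, no causal mask; unlike the diff, the relative-position mask is broadcast explicitly over the head dimension):

import torch

def rerope_rectified_scores(q1, k1, q2, k2, position_ids, rerope_window, head_dim):
    # q1/k1: rotary-embedded with their true positions, so scores1 carries exact relative distances.
    # q2: rotated as if each query sat exactly rerope_window positions after its key;
    # k2: left unrotated, so scores2 sees a constant relative distance of rerope_window.
    scale = 1.0 / (head_dim ** 0.5)
    scores1 = torch.matmul(q1, k1.transpose(-1, -2)) * scale  # [bsz, heads, q_len, k_len]
    scores2 = torch.matmul(q2, k2.transpose(-1, -2)) * scale
    # Inside the window keep the exact scores, outside it fall back to the clipped ones.
    rel = (position_ids[:, :, None] - position_ids[:, None, :]).abs()[:, None]  # [bsz, 1, q_len, k_len]
    return torch.where(rel < rerope_window, scores1, scores2)

On the decode path (q_len == 1) the diff gets the same effect without a second score matrix: key-side distances are clipped with (position_ids[:, -1] - position_ids).clip(max=self.rerope_window), the concatenated keys are rotated with cos and -sin at those clipped distances, and attention then goes through F.scaled_dot_product_attention against the registered causal mask.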
test_passkey_retrieval.py ADDED
@@ -0,0 +1,97 @@
+import argparse
+import random
+from numpy import random
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import pdb
+
+def parse_config():
+    parser = argparse.ArgumentParser(description='arg parser')
+    parser.add_argument('--max_tokens', type=int, default=20000, help='maximum token length for evaluation')
+    parser.add_argument('--interval', type=int, default=1000, help='interval for evaluation')
+    parser.add_argument('--num_tests', type=int, default=30, help='number of repeat testing for each length')
+
+    args = parser.parse_args()
+    return args
+
+# copy from https://github.com/dvlab-research/LongLoRA/blob/main/passkey_retrivial.py
+def generate_prompt_landmark(n_garbage=60000, seed=666):
+    """Generates a text file and inserts an passkey at a random position."""
+    rnd_state = random.get_state()
+    random.seed(seed)
+    n_garbage_prefix = random.randint(0, n_garbage)
+    n_garbage_suffix = n_garbage - n_garbage_prefix
+
+    task_description = "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there."
+    garbage = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."
+    garbage_inf = " ".join([garbage] * 5000)
+    assert len(garbage_inf) >= n_garbage
+    garbage_prefix = garbage_inf[:n_garbage_prefix]
+    garbage_suffix = garbage_inf[:n_garbage_suffix]
+    pass_key = random.randint(1, 50000)
+    information_line = f"The pass key is {pass_key}. Remember it. {pass_key} is the pass key."
+    final_question = "What is the pass key? The pass key is"
+    print('idx : {}'.format(len(task_description) + len(garbage_prefix)))
+    lines = [
+        task_description,
+        garbage_prefix,
+        information_line,
+        garbage_suffix,
+        final_question,
+    ]
+    random.set_state(rnd_state)
+    return "\n".join(lines), str(pass_key)
+
+# NTK+log on Qwen-7B tokens {'5801': 0.95, '7986': 0.9, '8805': 0.85, '9897': 0.8, '11809': 0.95, '12900': 0.78, '13993': 0.06, '14812': 0.0}
+# ReRoPE on Qwen-7B
+def main(args):
+    # Load model and tokenizer
+    tokenizer = AutoTokenizer.from_pretrained('/models/Qwen-7B-Chat-ReRoPE', trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained('/models/Qwen-7B-Chat-ReRoPE', trust_remote_code=True).eval().cuda('cuda:3')
+    # tokenizer = AutoTokenizer.from_pretrained('/models/Qwen-14B-Chat', trust_remote_code=True)
+    # model = AutoModelForCausalLM.from_pretrained('/models/Qwen-14B-Chat', trust_remote_code=True).eval().cuda('cuda:3')
+
+    all_accuries = {}
+    # This is a rough ratio to control the number of texts and tokens
+    # for val in [8000, 9000, 10000, 11000, 13000, 14000, 15000, 16000, 17000]:
+    for val in range(2000, 12000, args.interval):
+        n_garbage = int(3.75 * val // 1024 * 1024)
+        passed_tests = 0
+        total_tokens = 0
+
+        for j in range(args.num_tests):
+            prompt, pass_key = generate_prompt_landmark(n_garbage=n_garbage, seed=j)
+            response, _ = model.chat(tokenizer, prompt, history=[], top_k=1)
+            print((response, pass_key))
+            if pass_key in response:
+                passed_tests += 1
+            total_tokens += len(tokenizer(prompt).input_ids)
+        avg_tokens = total_tokens//args.num_tests
+        accuracy = passed_tests/args.num_tests
+        print("accuracy on the token length %d is %f"%(avg_tokens, accuracy))
+        all_accuries[str(avg_tokens)] = accuracy
+
+    all_accuries = {}
+    # This is a rough ratio to control the number of texts and tokens
+    # for val in [8000, 9000, 10000, 11000, 13000, 14000, 15000, 16000, 17000]:
+    for val in range(2000, 12000, args.interval):
+        n_garbage = int(3.75 * val // 1024 * 1024)
+        passed_tests = 0
+        total_tokens = 0
+
+        for j in range(args.num_tests):
+            prompt, pass_key = generate_prompt_landmark(n_garbage=n_garbage, seed=j+val)
+            response, _ = model.chat(tokenizer, prompt, history=[])
+            print((response, pass_key))
+            if pass_key in response:
+                passed_tests += 1
+            total_tokens += len(tokenizer(prompt).input_ids)
+        avg_tokens = total_tokens//args.num_tests
+        accuracy = passed_tests/args.num_tests
+        print("accuracy on the token length %d is %f"%(avg_tokens, accuracy))
+        all_accuries[str(avg_tokens)] = accuracy
+    print("accuries over tokens", all_accuries)
+
+
+if __name__ == "__main__":
+    args = parse_config()
+    main(args)
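
Assuming the checkpoint at the path hard-coded in main() exists, the benchmark can be run with, for example: python test_passkey_retrieval.py --interval 1000 --num_tests 30. Note that --max_tokens is parsed but never read in main(); the evaluated prompt lengths come from the fixed range(2000, 12000, args.interval) loops, and per-length accuracy is recorded in all_accuries keyed by the average token count of the prompts at that length.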