adds llama and mistral dropout support (#858)

* adds llama and mistral dropout support
* gracefully handle attention dropout if not available yet
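
Both patches gate dropout on `self.training` and fall back to `0.0` when the attention module does not expose `attention_dropout` yet. A minimal sketch of that behavior (the `DummyAttention` class below is purely illustrative and not part of the patch):

```python
# Minimal sketch of the dropout-rate selection both patches add.
# `DummyAttention` stands in for the patched attention module; real modules
# may not define `attention_dropout` yet, which is why getattr falls back to 0.0.
import torch.nn as nn


class DummyAttention(nn.Module):
    def __init__(self, attention_dropout=None):
        super().__init__()
        if attention_dropout is not None:
            self.attention_dropout = attention_dropout

    def dropout_rate(self):
        # same expression as the one added to flashattn_forward
        return 0.0 if not self.training else getattr(self, "attention_dropout", 0.0)


attn = DummyAttention(attention_dropout=0.1)
attn.train()
assert attn.dropout_rate() == 0.1   # dropout applied while training
attn.eval()
assert attn.dropout_rate() == 0.0   # never applied at inference

legacy = DummyAttention()            # module without attention_dropout yet
legacy.train()
assert legacy.dropout_rate() == 0.0  # graceful fallback
```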
src/axolotl/monkeypatch/llama_attn_hijack_flash.py

```diff
@@ -321,6 +321,8 @@ def flashattn_forward(
     # only on first autoregressive step q,k,v have same seqlen
     is_causal = key_states.shape == query_states.shape
 
+    dropout_rate = 0.0 if not self.training else getattr(self, "attention_dropout", 0.0)
+
     if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
         # special handling using sample packing
         qkv = torch.stack(
@@ -330,7 +332,12 @@ def flashattn_forward(
         qkv = rearrange(qkv, "b s ... -> (b s) ...")
 
         output = flash_attn_varlen_qkvpacked_func(
-            qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=None, causal=True
+            qkv,
+            cu_seqlens,
+            max_seqlen,
+            dropout_p=dropout_rate,
+            softmax_scale=None,
+            causal=True,
         )
         output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
     elif query_states.shape == key_states.shape:
@@ -353,7 +360,7 @@ def flashattn_forward(
             qkv_unpad,
             cu_seqlens_q,
             max_seqlen_q,
-            0.0,
+            dropout_p=dropout_rate,
             softmax_scale=None,
             causal=is_causal,
         )
@@ -366,6 +373,7 @@ def flashattn_forward(
             output = flash_attn_kvpacked_func(
                 query_states,
                 torch.stack([key_states, value_states], 2),
+                dropout_p=dropout_rate,
                 causal=is_causal,
             )
         else:
@@ -398,7 +406,7 @@ def flashattn_forward(
                 cu_seqlens_k,
                 max_seqlen_q,
                 max_seqlen_k,
-                0.0,
+                dropout_p=dropout_rate,
                 softmax_scale=None,
                 causal=is_causal,
             )
```
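For reference, a rough sketch of how the threaded `dropout_p` reaches flash-attn's varlen kernel in the sample-packing branch. It assumes flash-attn v2 and a CUDA device, and the tensor shapes are illustrative rather than taken from axolotl:

```python
# Hedged sketch of the sample-packing call path with the new dropout_p argument.
import torch
from flash_attn import flash_attn_varlen_qkvpacked_func

nheads, headdim = 8, 64
seqlens = [5, 3, 7]                          # three samples packed into one sequence
total = sum(seqlens)
cu_seqlens = torch.tensor([0, 5, 8, 15], dtype=torch.int32, device="cuda")
max_seqlen = max(seqlens)

# packed qkv: (total_tokens, 3, nheads, headdim), fp16 on GPU
qkv = torch.randn(total, 3, nheads, headdim, dtype=torch.float16, device="cuda")

dropout_rate = 0.1                           # would come from the expression above
output = flash_attn_varlen_qkvpacked_func(
    qkv,
    cu_seqlens,
    max_seqlen,
    dropout_p=dropout_rate,                  # the argument this PR threads through
    softmax_scale=None,
    causal=True,
)
print(output.shape)                          # (total, nheads, headdim)
```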
src/axolotl/monkeypatch/mistral_attn_hijack_flash.py

```diff
@@ -201,6 +201,8 @@ def flashattn_forward(
     # only on first autoregressive step q,k,v have same seqlen
     is_causal = key_states.shape == query_states.shape
 
+    dropout_rate = 0.0 if not self.training else getattr(self, "attention_dropout", 0.0)
+
     if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
         # special handling using sample packing
         qkv = torch.stack(
@@ -213,7 +215,7 @@ def flashattn_forward(
             qkv,
             cu_seqlens,
             max_seqlen,
-            0.0,
+            dropout_p=dropout_rate,
             softmax_scale=None,
             causal=True,
             window_size=window_size,
@@ -239,7 +241,7 @@ def flashattn_forward(
             qkv_unpad,
             cu_seqlens_q,
             max_seqlen_q,
-            0.0,
+            dropout_p=dropout_rate,
             softmax_scale=None,
             causal=is_causal,
             window_size=window_size,
@@ -253,6 +255,7 @@ def flashattn_forward(
             output = flash_attn_kvpacked_func(
                 query_states,
                 torch.stack([key_states, value_states], 2),
+                dropout_p=dropout_rate,
                 causal=is_causal,
                 window_size=window_size,
             )
@@ -286,7 +289,7 @@ def flashattn_forward(
                 cu_seqlens_k,
                 max_seqlen_q,
                 max_seqlen_k,
-                0.0,
+                dropout_p=dropout_rate,
                 softmax_scale=None,
                 causal=is_causal,
                 window_size=window_size,
```
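The mistral patch threads the same `dropout_p` through calls that also carry the sliding-window argument. A hedged sketch of the kv-packed variant (assumes a flash-attn version new enough to accept `window_size`, roughly >= 2.3, and a CUDA device; all values are illustrative, not taken from the patch):

```python
# Illustrative only: dropout_p plus the sliding-window argument on the
# kv-packed code path used when no packing/padding handling is needed.
import torch
from flash_attn import flash_attn_kvpacked_func

bsz, q_len, kv_len, nheads, headdim = 2, 16, 16, 8, 64
query_states = torch.randn(bsz, q_len, nheads, headdim, dtype=torch.float16, device="cuda")
# kv stacked on dim 2, mirroring torch.stack([key_states, value_states], 2)
kv = torch.randn(bsz, kv_len, 2, nheads, headdim, dtype=torch.float16, device="cuda")

dropout_rate = 0.1                     # from the same training/getattr expression
window_size = (4096, 4096)             # illustrative sliding-window width

output = flash_attn_kvpacked_func(
    query_states,
    kv,
    dropout_p=dropout_rate,            # new in this PR
    causal=True,
    window_size=window_size,
)
print(output.shape)                    # (bsz, q_len, nheads, headdim)
```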