Upload DogeForCausalLM
modeling_doge.py  CHANGED  (+1, -8)
@@ -864,14 +864,7 @@ class DogeModel(DogePreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool,
     ):
-
-        if attention_mask is not None and (attention_mask == 0.0).any():
-            return attention_mask
-        return None
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
+        # We have to provide attention_mask for dynamic mask computation
         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         using_static_cache = isinstance(past_key_values, StaticCache)

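For context, the removed block was the stock SDPA shortcut: when the 2D padding mask contained no zeros, the method could return early and let `torch.nn.functional.scaled_dot_product_attention` dispatch on its `is_causal` argument (and onto Flash Attention 2). Doge's dynamic mask computation consumes the attention mask directly, so after this change the model always materializes it. Below is a minimal sketch of what "always materialize" means; the helper name `build_causal_mask_sketch` is hypothetical and not taken from the repo.

import torch
from typing import Optional

def build_causal_mask_sketch(
    attention_mask: Optional[torch.Tensor],
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # Hypothetical helper: always build the 4D additive mask instead of
    # returning early, so downstream dynamic-mask code has a real tensor.
    min_value = torch.finfo(dtype).min
    # Future (upper-triangle) positions get the large negative fill value.
    causal = torch.triu(
        torch.full((sequence_length, target_length), min_value, dtype=dtype),
        diagonal=1,
    )
    batch = attention_mask.shape[0] if attention_mask is not None else 1
    causal = causal[None, None, :, :].expand(batch, 1, -1, -1).clone()
    if attention_mask is not None:
        # Fold the 2D padding mask (batch, target_length) into the 4D mask.
        padding = attention_mask[:, None, None, :].eq(0)
        causal = causal.masked_fill(padding, min_value)
    return causal

For example, `build_causal_mask_sketch(torch.tensor([[1, 1, 1, 0]]), 4, 4)` returns a `(1, 1, 4, 4)` tensor whose upper triangle and last key column hold the dtype minimum: the shape an attention kernel receives as `attn_mask` when the `is_causal` fast path is bypassed.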