JingzeShi committed (verified) · Commit a6a6403 · 1 parent: 09899c7

Upload DogeForCausalLM

Files changed (1):
  1. modeling_doge.py (+1, -8)
modeling_doge.py CHANGED
@@ -864,14 +864,7 @@ class DogeModel(DogePreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and (attention_mask == 0.0).any():
-                return attention_mask
-            return None
-
-        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-        # to infer the attention mask.
+        # We have to provide attention_mask for dynamic mask computation
         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         using_static_cache = isinstance(past_key_values, StaticCache)
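For context, here is a minimal sketch of the behavior this diff changes. All names below (build_causal_mask, causal_mask_before, causal_mask_after) are illustrative assumptions, not taken from modeling_doge.py. Before the commit, the flash_attention_2 branch returned early with either the raw 2D attention_mask or None; after it, the method always falls through to the mask-construction code (past_seen_tokens, using_static_cache, ...), so the dynamic mask computation always receives an explicit mask.

import torch

# Hypothetical sketch of the control flow before vs. after this commit;
# not the actual modeling_doge.py implementation.

def build_causal_mask(attention_mask, past_seen_tokens, seq_len):
    # Stand-in mask builder: a lower-triangular causal mask over the full
    # key/value length, combined with any 2D padding mask.
    total_len = past_seen_tokens + seq_len
    causal = torch.tril(
        torch.ones(seq_len, total_len, dtype=torch.bool),
        diagonal=past_seen_tokens,
    )
    if attention_mask is not None:
        # Broadcast the (batch, total_len) padding mask over query positions.
        causal = causal & attention_mask[:, None, :].bool()
    return causal

def causal_mask_before(attention_mask, attn_implementation, past_seen_tokens, seq_len):
    # Old behavior: flash_attention_2 short-circuited, returning the raw 2D
    # mask (if it contained padding) or None, so no full mask was ever built.
    if attn_implementation == "flash_attention_2":
        if attention_mask is not None and (attention_mask == 0.0).any():
            return attention_mask
        return None
    return build_causal_mask(attention_mask, past_seen_tokens, seq_len)

def causal_mask_after(attention_mask, attn_implementation, past_seen_tokens, seq_len):
    # New behavior: no backend special case; every path falls through to the
    # mask builder, so the dynamic mask computation always gets a mask.
    return build_causal_mask(attention_mask, past_seen_tokens, seq_len)

# Example: with flash_attention_2 and no padding, the old path returned None,
# while the new path still materializes the causal mask.
mask = torch.ones(1, 6)  # batch of 1; 4 past tokens + 2 new tokens, no padding
print(causal_mask_before(mask, "flash_attention_2", past_seen_tokens=4, seq_len=2))        # None
print(causal_mask_after(mask, "flash_attention_2", past_seen_tokens=4, seq_len=2).shape)   # (1, 2, 6)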