Update amplify.py

amplify.py CHANGED (+15 −16)
@@ -124,13 +124,13 @@ class EncoderBlock(nn.Module):

        self.ffn_dropout = nn.Dropout(config.dropout_prob)

-    def forward(self, x: torch.Tensor,
-        attn, contact = self._att_block(self.attention_norm(x),
+    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor, freqs_cis: torch.Tensor, output_attentions: bool):
+        attn, contact = self._att_block(self.attention_norm(x), attention_mask, freqs_cis, output_attentions)
        x = x + attn
        x = x + self._ff_block(self.ffn_norm(x))
        return x, contact

-    def _att_block(self, x: torch.Tensor,
+    def _att_block(self, x: torch.Tensor, attention_mask: torch.Tensor, freqs_cis: torch.Tensor, output_attentions: bool):
        batch_size, seq_len, _ = x.shape
        xq, xk, xv = self.q(x), self.k(x), self.v(x)
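For context, the block above keeps a pre-norm residual layout: each sublayer normalizes its input and its output is added back onto the stream. A minimal, self-contained sketch of that pattern in plain PyTorch follows; the module and parameter names are illustrative, not the repo's exact classes.

import torch
import torch.nn as nn

class PreNormResidualBlock(nn.Module):
    """Sketch of the x + sublayer(norm(x)) layout used by EncoderBlock.forward."""

    def __init__(self, dim: int, hidden: int, num_heads: int = 4):
        super().__init__()
        self.attention_norm = nn.LayerNorm(dim)
        self.ffn_norm = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.attention_norm(x)
        attn_out, _ = self.attn(h, h, h)     # self-attention on the normalized input
        x = x + attn_out                     # residual around attention
        x = x + self.ffn(self.ffn_norm(x))   # residual around the feed-forward block
        return x

block = PreNormResidualBlock(dim=16, hidden=64)
y = block(torch.randn(2, 5, 16))             # (batch, seq_len, dim)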
@@ -144,15 +144,15 @@ class EncoderBlock(nn.Module):
            query=xq,
            key=xk,
            value=xv,
-            attn_bias=
+            attn_bias=attention_mask,
            p=self.config.dropout_prob if self.training else 0,
        )

        _attn = None
        if output_attentions:
            _attn = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
-            if
-            _attn = _attn +
+            if attention_mask is not None:
+                _attn = _attn + attention_mask
            _attn = _attn.softmax(-1)

        return self.resid_dropout(self.wo(attn.view(batch_size, seq_len, self.config.num_attention_heads * self.d_head))), _attn
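The output_attentions branch above recomputes softmax(QKᵀ/√d + mask) explicitly, since the fused attention call that produces `attn` does not expose per-head weights. A small self-contained sketch of that explicit computation with an additive mask; the helper name and tensor sizes are illustrative only.

import torch

def explicit_attention_weights(xq, xk, attention_mask=None):
    # xq, xk: (batch, seq_len, num_heads, d_head), as in _att_block above.
    # attention_mask: additive bias broadcastable to (batch, num_heads, seq_len, seq_len),
    # with 0 for visible positions and -inf for masked ones.
    scores = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
    if attention_mask is not None:
        scores = scores + attention_mask
    return scores.softmax(-1)  # (batch, num_heads, seq_len, seq_len)

# Masked key positions receive exactly zero weight after the softmax.
q = torch.randn(1, 4, 2, 8)
k = torch.randn(1, 4, 2, 8)
bias = torch.zeros(1, 1, 4, 4)
bias[..., 2:] = float("-inf")   # hide the last two key positions
weights = explicit_attention_weights(q, k, bias)
assert torch.all(weights[..., 2:] == 0)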
@@ -203,28 +203,28 @@ class AMPLIFY(AMPLIFYPreTrainedModel):
        # Initialize weights and apply final processing
        self.post_init()

-    def forward(self,
+    def forward(self, input_ids, attention_mask=None, output_hidden_states=False, output_attentions=False, **kwargs):
        # Initialize
        hidden_states, attentions = [], []

        # Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
-        if
-
+        if attention_mask is not None and not torch.all(attention_mask == 0):
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, attention_mask.size(-1), 1)
        else:
-
+            attention_mask = None

        # RoPE
-        self.freqs_cis = self.freqs_cis.to(
-        freqs_cis = self.freqs_cis[:
+        self.freqs_cis = self.freqs_cis.to(input_ids.device, non_blocking=True)
+        freqs_cis = self.freqs_cis[: input_ids.shape[1]]

        # Embedding
-        x = self.encoder(
+        x = self.encoder(input_ids)
        if self.config.layer_norm_after_embedding:
            x = self.layer_norm_1(x)

        # Transformer encoder
        for layer in self.transformer_encoder:
-            x, attn = layer(x,
+            x, attn = layer(x, attention_mask, freqs_cis, output_attentions)
            if output_hidden_states:
                hidden_states.append(x)
            if output_attentions:
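The forward above turns a per-sequence mask of shape (Batch, Length) into a per-head additive bias of shape (Batch, Heads, Length, Length) before handing it to each encoder layer. A standalone illustration of that expand-and-repeat step; the head count and the 0 / -inf additive convention are assumptions read off the surrounding code.

import torch

num_attention_heads = 2  # assumed here; the model reads this from its config

# Additive padding mask: 0 where tokens are attended, -inf where they are padding.
attention_mask = torch.tensor([[0.0, 0.0, float("-inf"), float("-inf")]])  # (Batch=1, Length=4)

# (Batch, Length) -> (Batch, 1, 1, Length) -> (Batch, Heads, Length, Length)
expanded = attention_mask.unsqueeze(1).unsqueeze(1).repeat(
    1, num_attention_heads, attention_mask.size(-1), 1
)
print(expanded.shape)  # torch.Size([1, 2, 4, 4])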
@@ -234,5 +234,4 @@ class AMPLIFY(AMPLIFYPreTrainedModel):
        logits = self.decoder(self.layer_norm_2(x) if self.config.layer_norm_before_last_layer else x)

        # Return logits or the output of the last hidden layer
-        return MaskedLMOutput(logits=logits, hidden_states=hidden_states, attentions=attentions)
-
+        return MaskedLMOutput(logits=logits, hidden_states=hidden_states, attentions=attentions)
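The model returns its results wrapped in a MaskedLMOutput. Assuming this is the Hugging Face transformers dataclass (which the HF-style surrounding code suggests), here is a toy sketch of how the returned fields are laid out and read; all tensor shapes are illustrative only.

import torch
from transformers.modeling_outputs import MaskedLMOutput

# Dummy tensors standing in for real model outputs (shapes are illustrative).
logits = torch.randn(1, 6, 27)                              # (Batch, Length, vocab_size)
hidden_states = [torch.randn(1, 6, 320) for _ in range(2)]  # one entry per encoder layer
attentions = [torch.randn(1, 2, 6, 6) for _ in range(2)]    # (Batch, Heads, Length, Length)

out = MaskedLMOutput(logits=logits, hidden_states=hidden_states, attentions=attentions)
print(out.logits.shape)         # torch.Size([1, 6, 27])
print(len(out.hidden_states))   # 2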