Upload 2 files

model_utils.py  CHANGED  +13 -42
@@ -17,53 +17,24 @@ class Block(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.ln_1 = nn.LayerNorm(config.n_embd)
-        self.attn =
+        self.attn = nn.MultiheadAttention(config.n_embd, config.n_head, dropout=config.dropout, batch_first=True)
         self.ln_2 = nn.LayerNorm(config.n_embd)
-        self.mlp = FeedForward(config)
+        self.mlp = nn.Sequential(
+            nn.Linear(config.n_embd, 4 * config.n_embd),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(4 * config.n_embd, config.n_embd),
+            nn.Dropout(config.dropout),
+        )
 
     def forward(self, x):
-        x = x + self.attn(self.ln_1(x))
+        x = x + self._attention_block(self.ln_1(x))
         x = x + self.mlp(self.ln_2(x))
         return x
-
-
-
-
-        self.n_head = config.n_head
-        self.n_embd = config.n_embd
-        assert self.n_embd % self.n_head == 0
-
-        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
-        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
-        self.dropout = nn.Dropout(config.dropout)
-
-    def forward(self, x):
-        B, T, C = x.size()
-        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
-        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
-        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
-        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
-
-        att = (q @ k.transpose(-2, -1)) * (1.0 / torch.sqrt(torch.tensor(k.size(-1))))
-        att = F.softmax(att, dim=-1)
-        att = self.dropout(att)
-        y = att @ v
-        y = y.transpose(1, 2).contiguous().view(B, T, C)
-        return self.c_proj(y)
-
-class FeedForward(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
-        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
-        self.dropout = nn.Dropout(config.dropout)
-
-    def forward(self, x):
-        x = F.gelu(self.c_fc(x))
-        x = self.dropout(x)
-        x = self.c_proj(x)
-        x = self.dropout(x)
-        return x
+
+    def _attention_block(self, x):
+        attn_output, _ = self.attn(x, x, x)
+        return attn_output
 
 class GPT(nn.Module):
     def __init__(self, config):
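A minimal shape check for the swapped-in attention layer (a sketch, not part of the commit; the tensor sizes and dropout value below are made up for illustration). nn.MultiheadAttention built with batch_first=True takes and returns (B, T, C) tensors and applies dropout to the attention weights, matching the interface of the removed hand-rolled implementation. Since self.attn(x, x, x) passes no attn_mask, attention stays bidirectional, just like the removed code, which also applied no causal mask.

import torch
import torch.nn as nn

# Illustrative sizes only; in model_utils.py these come from `config`.
B, T, n_embd, n_head = 2, 8, 64, 4

attn = nn.MultiheadAttention(n_embd, n_head, dropout=0.1, batch_first=True)
x = torch.randn(B, T, n_embd)

# Self-attention: query, key, and value are all the same tensor.
attn_output, attn_weights = attn(x, x, x)
print(attn_output.shape)   # torch.Size([2, 8, 64]) -- same (B, T, C) layout as before
print(attn_weights.shape)  # torch.Size([2, 8, 8])  -- weights averaged over heads by default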
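And a quick smoke test of the refactored Block as a whole (a sketch assuming Block can be imported from model_utils and that, for this class, the config only needs the n_embd, n_head, and dropout fields; the values are placeholders):

from types import SimpleNamespace

import torch

from model_utils import Block

# Stand-in config exposing only the fields Block reads (assumption for this sketch).
config = SimpleNamespace(n_embd=64, n_head=4, dropout=0.1)

block = Block(config)
block.eval()  # turn off dropout so the check is deterministic

x = torch.randn(2, 8, config.n_embd)  # (B, T, C)
with torch.no_grad():
    y = block(x)

assert y.shape == x.shape  # both residual branches preserve the (B, T, C) shape
print(y.shape)             # torch.Size([2, 8, 64])

The _attention_block helper exists because nn.MultiheadAttention returns an (output, weights) tuple; unpacking it there keeps the residual addition in forward to a single tensor.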