andrewdalpino committed on

Commit 791345d · verified · 1 Parent(s): 170566e

Upload model

Files changed (2)
  1. model.py +17 -16
  2. pytorch_model.bin +1 -1
model.py CHANGED
@@ -92,6 +92,7 @@ class LightGPT(Module):
         """Instead of memorizing the activations of the forward pass, recompute them at various checkpoints."""
         self.checkpoint = partial(torch_checkpoint, use_reentrant=False)

+    @torch.no_grad()
     def resize_token_embeddings(self, num_tokens: int) -> None:
         """Resize the token embeddings to accommodate a new vocabulary size."""
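
A note on the hunk above: the new @torch.no_grad() decorator lets resize_token_embeddings overwrite rows of an existing Parameter in place without autograd objecting. A minimal standalone illustration (not the repo's code):

```python
import torch
from torch.nn import Embedding

emb = Embedding(4, 8)

# Indexed assignment into a leaf Parameter that requires grad is an error...
try:
    emb.weight[0] = torch.zeros(8)
except RuntimeError as error:
    print("without no_grad:", error)

# ...but under torch.no_grad() the same weight surgery is allowed.
with torch.no_grad():
    emb.weight[0] = torch.zeros(8)

print(torch.all(emb.weight[0] == 0).item())  # True
```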
 
@@ -105,7 +106,13 @@ class LightGPT(Module):
             :num_tokens_to_copy, :
         ]

-        self.token_embeddings = new_embeddings
+        for i in range(num_tokens_to_copy, num_tokens):
+            new_embeddings.weight[i] = torch.randn(new_embeddings.embedding_dim) / sqrt(
+                new_embeddings.embedding_dim
+            )
+
+        self.token_embeddings.weight = new_embeddings.weight
+        self.token_embeddings.num_embeddings = new_embeddings.num_embeddings

         self.output_layer.weight = self.token_embeddings.weight
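
Rather than swapping in a fresh Embedding module, the rewritten body keeps the existing rows, draws each new row from a Gaussian scaled by 1/sqrt(embedding_dim), and leaves the weight-tied output layer pointing at the same tensor. A rough sketch of that strategy as a standalone helper (the function name and sizes are made up for illustration; LightGPT does this inside the method shown above):

```python
import torch
from math import sqrt
from torch.nn import Embedding


@torch.no_grad()
def resize_embeddings(embeddings: Embedding, num_tokens: int) -> Embedding:
    """Return a resized embedding table: old rows copied, new rows ~ N(0, 1/d)."""

    new_embeddings = Embedding(num_tokens, embeddings.embedding_dim)

    num_tokens_to_copy = min(num_tokens, embeddings.num_embeddings)

    # Preserve the learned rows for tokens that already existed.
    new_embeddings.weight[:num_tokens_to_copy, :] = embeddings.weight[:num_tokens_to_copy, :]

    # Give any extra tokens a small random embedding, scaled so its norm is
    # comparable to that of the trained rows.
    for i in range(num_tokens_to_copy, num_tokens):
        new_embeddings.weight[i] = torch.randn(new_embeddings.embedding_dim) / sqrt(
            new_embeddings.embedding_dim
        )

    return new_embeddings


resized = resize_embeddings(Embedding(50257, 64), 50304)

print(resized.weight.shape)  # torch.Size([50304, 64])
```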
 
@@ -370,14 +377,13 @@ class LightGPTInstruct(Module):
         if alpha <= 0.0:
             raise ValueError(f"Alpha must be greater than 0, {alpha} given.")

-        if vocabulary_size != model.vocabulary_size:
-            model.resize_token_embeddings(vocabulary_size)
-
         for param in model.parameters():
             param.requires_grad = False

-        for i in range(vocabulary_size, model.vocabulary_size, -1):
-            model.output_layer.weight[i - 1].requires_grad = True
+        if vocabulary_size != model.vocabulary_size:
+            model.resize_token_embeddings(vocabulary_size)
+
+            model.token_embeddings.weight.requires_grad = True

         for module in model.body:
             out_features, in_features = module.attention.in_proj_weight.shape
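
This hunk reorders the setup: all parameters are frozen first, and only when the vocabulary is resized does the (weight-tied) embedding matrix become trainable again, replacing the old per-row requires_grad loop over the output layer. A toy sketch of the freeze-then-unfreeze pattern (the TinyLM class and sizes are invented for illustration):

```python
import torch
from torch.nn import Embedding, Linear, Module


class TinyLM(Module):
    """Stand-in for a weight-tied decoder; not LightGPT's architecture."""

    def __init__(self, vocabulary_size: int, embedding_dim: int):
        super().__init__()

        self.token_embeddings = Embedding(vocabulary_size, embedding_dim)
        self.output_layer = Linear(embedding_dim, vocabulary_size, bias=False)

        self.output_layer.weight = self.token_embeddings.weight  # weight tying


model = TinyLM(vocabulary_size=100, embedding_dim=16)

# Freeze the whole base model, as the hunk above does ...
for param in model.parameters():
    param.requires_grad = False

# ... then re-enable gradients only for the shared embedding/output matrix so
# that newly added special tokens can be learned during fine-tuning.
model.token_embeddings.weight.requires_grad = True

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(trainable)  # 1600: just the single tied matrix is trainable
```
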
@@ -404,19 +410,16 @@ class LightGPTInstruct(Module):
                     LoRA.from_linear(layer, rank, alpha, dropout),
                 )

-        register_parametrization(
-            model.output_layer,
-            "weight",
-            LoRA.from_linear(model.output_layer, rank, alpha, dropout),
-        )
-
         self.model = model

     @property
     def num_trainable_params(self) -> int:
         return self.model.num_trainable_params

-    def state_dict(self):
+    def token_embeddings_state_dict(self):
+        return self.model.token_embeddings.state_dict()
+
+    def lora_state_dict(self):
         return {
             name: module
             for name, module in super().state_dict().items()
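
With the tied embedding/output matrix now trained directly, the LoRA parametrization on output_layer is dropped, and checkpointing splits into token_embeddings_state_dict() and lora_state_dict(). The latter appears to filter the full state dict down to LoRA entries (the filter condition sits outside this hunk); a minimal stand-in showing that filtering pattern with torch's parametrization utilities (these classes are not LightGPT's):

```python
import torch
from torch.nn import Linear, Module, Parameter
from torch.nn.utils.parametrize import register_parametrization


class LoRADelta(Module):
    """Toy low-rank parametrization: returns weight + lora_b @ lora_a."""

    def __init__(self, out_features: int, in_features: int, rank: int):
        super().__init__()

        self.lora_a = Parameter(torch.randn(rank, in_features) / rank**0.5)
        self.lora_b = Parameter(torch.zeros(out_features, rank))

    def forward(self, weight: torch.Tensor) -> torch.Tensor:
        return weight + self.lora_b @ self.lora_a


layer = Linear(8, 8, bias=False)

register_parametrization(layer, "weight", LoRADelta(8, 8, rank=2))

# Keep only the adapter tensors, leaving the frozen base weight out of the checkpoint.
lora_state_dict = {
    name: tensor for name, tensor in layer.state_dict().items() if "lora" in name
}

print(sorted(lora_state_dict))
# ['parametrizations.weight.0.lora_a', 'parametrizations.weight.0.lora_b']
```
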
@@ -654,9 +657,7 @@ class LoRA(Module):
         if alpha <= 0.0:
             raise ValueError(f"Alpha must be greater than 0, {alpha} given.")

-        std_dev = 1.0 / sqrt(rank)
-
-        self.lora_a = Parameter(torch.randn(rank, in_features) * std_dev)
+        self.lora_a = Parameter(torch.randn(rank, in_features) / sqrt(rank))
         self.lora_b = Parameter(torch.zeros(out_features, rank))

         self.dropout = Dropout1d(p=dropout)
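
The initialization is also condensed: lora_a keeps the same N(0, 1/rank) distribution the removed std_dev variable expressed, and lora_b stays zero, so the low-rank update contributes nothing until training moves it. A quick check of that property (sizes are arbitrary):

```python
import torch
from math import sqrt
from torch.nn import Parameter

rank, in_features, out_features = 8, 32, 32

# Same one-liner as the hunk above: Gaussian entries with std 1 / sqrt(rank).
lora_a = Parameter(torch.randn(rank, in_features) / sqrt(rank))
lora_b = Parameter(torch.zeros(out_features, rank))

# The adapted weight starts exactly at the frozen base weight because the
# low-rank correction is zero at initialization.
delta = lora_b @ lora_a

print(torch.count_nonzero(delta).item())   # 0
print(round(lora_a.std().item(), 2))       # close to 1 / sqrt(8) ≈ 0.35
```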
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2716db68bb0143039012e287e16b005dc5b071d545f109fc40236d2ba2ab333
+oid sha256:32e214d9dddfc85c390ffb8ac314064074a147bacbefce772f21a04d6337eab0
 size 1414060818