andrewdalpino committed on

Commit 791345d · verified · 1 Parent(s): 170566e

Upload model

Files changed (2)
  1. model.py +17 -16
  2. pytorch_model.bin +1 -1
model.py CHANGED
@@ -92,6 +92,7 @@ class LightGPT(Module):
         """Instead of memorizing the activations of the forward pass, recompute them at various checkpoints."""
         self.checkpoint = partial(torch_checkpoint, use_reentrant=False)

+    @torch.no_grad()
     def resize_token_embeddings(self, num_tokens: int) -> None:
         """Resize the token embeddings to accommodate a new vocabulary size."""
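
A note on the hunk above: the new @torch.no_grad() decorator lets resize_token_embeddings overwrite rows of an existing Parameter in place without autograd objecting. A minimal standalone illustration (not the repo's code):

```python
import torch
from torch.nn import Embedding

emb = Embedding(4, 8)

# Indexed assignment into a leaf Parameter that requires grad is an error...
try:
    emb.weight[0] = torch.zeros(8)
except RuntimeError as error:
    print("without no_grad:", error)

# ...but under torch.no_grad() the same weight surgery is allowed.
with torch.no_grad():
    emb.weight[0] = torch.zeros(8)

print(torch.all(emb.weight[0] == 0).item())  # True
```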
 
@@ -105,7 +106,13 @@ class LightGPT(Module):
             :num_tokens_to_copy, :
         ]

-        self.token_embeddings = new_embeddings
+        for i in range(num_tokens_to_copy, num_tokens):
+            new_embeddings.weight[i] = torch.randn(new_embeddings.embedding_dim) / sqrt(
+                new_embeddings.embedding_dim
+            )
+
+        self.token_embeddings.weight = new_embeddings.weight
+        self.token_embeddings.num_embeddings = new_embeddings.num_embeddings

         self.output_layer.weight = self.token_embeddings.weight
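
Rather than swapping in a fresh Embedding module, the rewritten body keeps the existing rows, draws each new row from a Gaussian scaled by 1/sqrt(embedding_dim), and leaves the weight-tied output layer pointing at the same tensor. A rough sketch of that strategy as a standalone helper (the function name and sizes are made up for illustration; LightGPT does this inside the method shown above):

```python
import torch
from math import sqrt
from torch.nn import Embedding


@torch.no_grad()
def resize_embeddings(embeddings: Embedding, num_tokens: int) -> Embedding:
    """Return a resized embedding table: old rows copied, new rows ~ N(0, 1/d)."""

    new_embeddings = Embedding(num_tokens, embeddings.embedding_dim)

    num_tokens_to_copy = min(num_tokens, embeddings.num_embeddings)

    # Preserve the learned rows for tokens that already existed.
    new_embeddings.weight[:num_tokens_to_copy, :] = embeddings.weight[:num_tokens_to_copy, :]

    # Give any extra tokens a small random embedding, scaled so its norm is
    # comparable to that of the trained rows.
    for i in range(num_tokens_to_copy, num_tokens):
        new_embeddings.weight[i] = torch.randn(new_embeddings.embedding_dim) / sqrt(
            new_embeddings.embedding_dim
        )

    return new_embeddings


resized = resize_embeddings(Embedding(50257, 64), 50304)

print(resized.weight.shape)  # torch.Size([50304, 64])
```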
 
@@ -370,14 +377,13 @@ class LightGPTInstruct(Module):
         if alpha <= 0.0:
             raise ValueError(f"Alpha must be greater than 0, {alpha} given.")

-        if vocabulary_size != model.vocabulary_size:
-            model.resize_token_embeddings(vocabulary_size)
-
         for param in model.parameters():
             param.requires_grad = False

-        for i in range(vocabulary_size, model.vocabulary_size, -1):
-            model.output_layer.weight[i - 1].requires_grad = True
+        if vocabulary_size != model.vocabulary_size:
+            model.resize_token_embeddings(vocabulary_size)
+
+            model.token_embeddings.weight.requires_grad = True

         for module in model.body:
             out_features, in_features = module.attention.in_proj_weight.shape
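
This hunk reorders the setup: all parameters are frozen first, and only when the vocabulary is resized does the (weight-tied) embedding matrix become trainable again, replacing the old per-row requires_grad loop over the output layer. A toy sketch of the freeze-then-unfreeze pattern (the TinyLM class and sizes are invented for illustration):

```python
import torch
from torch.nn import Embedding, Linear, Module


class TinyLM(Module):
    """Stand-in for a weight-tied decoder; not LightGPT's architecture."""

    def __init__(self, vocabulary_size: int, embedding_dim: int):
        super().__init__()

        self.token_embeddings = Embedding(vocabulary_size, embedding_dim)
        self.output_layer = Linear(embedding_dim, vocabulary_size, bias=False)

        self.output_layer.weight = self.token_embeddings.weight  # weight tying


model = TinyLM(vocabulary_size=100, embedding_dim=16)

# Freeze the whole base model, as the hunk above does ...
for param in model.parameters():
    param.requires_grad = False

# ... then re-enable gradients only for the shared embedding/output matrix so
# that newly added special tokens can be learned during fine-tuning.
model.token_embeddings.weight.requires_grad = True

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(trainable)  # 1600: just the single tied matrix is trainable
```
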
@@ -404,19 +410,16 @@ class LightGPTInstruct(Module):
                     LoRA.from_linear(layer, rank, alpha, dropout),
                 )

-        register_parametrization(
-            model.output_layer,
-            "weight",
-            LoRA.from_linear(model.output_layer, rank, alpha, dropout),
-        )
-
         self.model = model

     @property
     def num_trainable_params(self) -> int:
         return self.model.num_trainable_params

-    def state_dict(self):
+    def token_embeddings_state_dict(self):
+        return self.model.token_embeddings.state_dict()
+
+    def lora_state_dict(self):
         return {
             name: module
             for name, module in super().state_dict().items()
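
With the tied embedding/output matrix now trained directly, the LoRA parametrization on output_layer is dropped, and checkpointing splits into token_embeddings_state_dict() and lora_state_dict(). The latter appears to filter the full state dict down to LoRA entries (the filter condition sits outside this hunk); a minimal stand-in showing that filtering pattern with torch's parametrization utilities (these classes are not LightGPT's):

```python
import torch
from torch.nn import Linear, Module, Parameter
from torch.nn.utils.parametrize import register_parametrization


class LoRADelta(Module):
    """Toy low-rank parametrization: returns weight + lora_b @ lora_a."""

    def __init__(self, out_features: int, in_features: int, rank: int):
        super().__init__()

        self.lora_a = Parameter(torch.randn(rank, in_features) / rank**0.5)
        self.lora_b = Parameter(torch.zeros(out_features, rank))

    def forward(self, weight: torch.Tensor) -> torch.Tensor:
        return weight + self.lora_b @ self.lora_a


layer = Linear(8, 8, bias=False)

register_parametrization(layer, "weight", LoRADelta(8, 8, rank=2))

# Keep only the adapter tensors, leaving the frozen base weight out of the checkpoint.
lora_state_dict = {
    name: tensor for name, tensor in layer.state_dict().items() if "lora" in name
}

print(sorted(lora_state_dict))
# ['parametrizations.weight.0.lora_a', 'parametrizations.weight.0.lora_b']
```
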
@@ -654,9 +657,7 @@ class LoRA(Module):
         if alpha <= 0.0:
             raise ValueError(f"Alpha must be greater than 0, {alpha} given.")

-        std_dev = 1.0 / sqrt(rank)
-
-        self.lora_a = Parameter(torch.randn(rank, in_features) * std_dev)
+        self.lora_a = Parameter(torch.randn(rank, in_features) / sqrt(rank))
         self.lora_b = Parameter(torch.zeros(out_features, rank))

         self.dropout = Dropout1d(p=dropout)
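
The initialization is also condensed: lora_a keeps the same N(0, 1/rank) distribution the removed std_dev variable expressed, and lora_b stays zero, so the low-rank update contributes nothing until training moves it. A quick check of that property (sizes are arbitrary):

```python
import torch
from math import sqrt
from torch.nn import Parameter

rank, in_features, out_features = 8, 32, 32

# Same one-liner as the hunk above: Gaussian entries with std 1 / sqrt(rank).
lora_a = Parameter(torch.randn(rank, in_features) / sqrt(rank))
lora_b = Parameter(torch.zeros(out_features, rank))

# The adapted weight starts exactly at the frozen base weight because the
# low-rank correction is zero at initialization.
delta = lora_b @ lora_a

print(torch.count_nonzero(delta).item())   # 0
print(round(lora_a.std().item(), 2))       # close to 1 / sqrt(8) ≈ 0.35
```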
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2716db68bb0143039012e287e16b005dc5b071d545f109fc40236d2ba2ab333
+oid sha256:32e214d9dddfc85c390ffb8ac314064074a147bacbefce772f21a04d6337eab0
 size 1414060818