Upload model
Files changed:
model.py +17 -16
pytorch_model.bin +1 -1
model.py
CHANGED
@@ -92,6 +92,7 @@ class LightGPT(Module):
         """Instead of memorizing the activations of the forward pass, recompute them at various checkpoints."""
         self.checkpoint = partial(torch_checkpoint, use_reentrant=False)
 
+    @torch.no_grad()
     def resize_token_embeddings(self, num_tokens: int) -> None:
         """Resize the token embeddings to accommodate a new vocabulary size."""
 
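The only change in this hunk is the @torch.no_grad() decorator on resize_token_embeddings. The method (see the next hunk) writes into parameter storage in place, which autograd would otherwise object to. A minimal standalone sketch of the failure mode the decorator avoids:

# Sketch only, not the repository's code: in-place writes to a leaf Parameter that
# requires grad raise a RuntimeError unless autograd recording is disabled.
import torch
from torch.nn import Embedding

emb = Embedding(10, 4)

try:
    emb.weight[5:] = torch.randn(5, 4)  # autograd is recording -> RuntimeError
except RuntimeError as error:
    print(error)

with torch.no_grad():
    emb.weight[5:] = torch.randn(5, 4) / 4 ** 0.5  # fine: nothing is recorded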
@@ -105,7 +106,13 @@ class LightGPT(Module):
             :num_tokens_to_copy, :
         ]
 
-
+        for i in range(num_tokens_to_copy, num_tokens):
+            new_embeddings.weight[i] = torch.randn(new_embeddings.embedding_dim) / sqrt(
+                new_embeddings.embedding_dim
+            )
+
+        self.token_embeddings.weight = new_embeddings.weight
+        self.token_embeddings.num_embeddings = new_embeddings.num_embeddings
 
         self.output_layer.weight = self.token_embeddings.weight
 
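Together with the decorator above, the added lines give brand-new vocabulary rows an explicit initialization (standard deviation 1 / sqrt(embedding_dim)) and then re-point both the embedding table and the tied output head at the new weight. A self-contained sketch of the resulting flow, using a hypothetical TinyTiedLM stand-in rather than the real LightGPT; the construction of new_embeddings above this hunk is assumed to look like the one below:

# Hypothetical stand-in for the weight-tied model; names mirror the diff.
from math import sqrt

import torch
from torch.nn import Embedding, Linear, Module


class TinyTiedLM(Module):
    def __init__(self, vocabulary_size: int, embedding_dim: int):
        super().__init__()

        self.token_embeddings = Embedding(vocabulary_size, embedding_dim)
        self.output_layer = Linear(embedding_dim, vocabulary_size, bias=False)

        self.output_layer.weight = self.token_embeddings.weight  # weight tying

    @torch.no_grad()
    def resize_token_embeddings(self, num_tokens: int) -> None:
        """Resize the token embeddings to accommodate a new vocabulary size."""

        new_embeddings = Embedding(num_tokens, self.token_embeddings.embedding_dim)

        num_tokens_to_copy = min(num_tokens, self.token_embeddings.num_embeddings)

        # Keep the rows that were already trained.
        new_embeddings.weight[:num_tokens_to_copy, :] = self.token_embeddings.weight[
            :num_tokens_to_copy, :
        ]

        # Give new rows variance 1 / embedding_dim, as in the hunk above.
        for i in range(num_tokens_to_copy, num_tokens):
            new_embeddings.weight[i] = torch.randn(new_embeddings.embedding_dim) / sqrt(
                new_embeddings.embedding_dim
            )

        self.token_embeddings.weight = new_embeddings.weight
        self.token_embeddings.num_embeddings = new_embeddings.num_embeddings

        # Re-tie the output head to the (possibly larger) embedding table.
        self.output_layer.weight = self.token_embeddings.weight


model = TinyTiedLM(vocabulary_size=8, embedding_dim=16)
model.resize_token_embeddings(12)  # grows the vocabulary, preserving the first 8 rows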
@@ -370,14 +377,13 @@ class LightGPTInstruct(Module):
         if alpha <= 0.0:
             raise ValueError(f"Alpha must be greater than 0, {alpha} given.")
 
-        if vocabulary_size != model.vocabulary_size:
-            model.resize_token_embeddings(vocabulary_size)
-
         for param in model.parameters():
             param.requires_grad = False
 
-
-        model.
+        if vocabulary_size != model.vocabulary_size:
+            model.resize_token_embeddings(vocabulary_size)
+
+            model.token_embeddings.weight.requires_grad = True
 
         for module in model.body:
             out_features, in_features = module.attention.in_proj_weight.shape
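The reordering is the substance of this hunk: the backbone is frozen first, and only afterwards is the vocabulary resized and the tied token-embedding weight explicitly marked trainable, so rows added for new tokens can actually be learned. A short usage sketch, reusing the hypothetical TinyTiedLM from the previous example in place of a full LightGPT:

# Freeze everything, then selectively re-enable the tied embedding table.
model = TinyTiedLM(vocabulary_size=8, embedding_dim=16)
vocabulary_size = 12  # e.g. the instruct tokenizer added special tokens

for param in model.parameters():
    param.requires_grad = False

if vocabulary_size != model.token_embeddings.num_embeddings:
    model.resize_token_embeddings(vocabulary_size)

    # The resize swaps in a fresh Parameter; being explicit mirrors the diff and
    # guarantees the tied embedding / output matrix trains.
    model.token_embeddings.weight.requires_grad = True

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable)  # 12 * 16 = 192: only the tied embedding table is trainable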
@@ -404,19 +410,16 @@ class LightGPTInstruct(Module):
                 LoRA.from_linear(layer, rank, alpha, dropout),
             )
 
-        register_parametrization(
-            model.output_layer,
-            "weight",
-            LoRA.from_linear(model.output_layer, rank, alpha, dropout),
-        )
-
         self.model = model
 
     @property
     def num_trainable_params(self) -> int:
         return self.model.num_trainable_params
 
-    def
+    def token_embeddings_state_dict(self):
+        return self.model.token_embeddings.state_dict()
+
+    def lora_state_dict(self):
         return {
             name: module
             for name, module in super().state_dict().items()
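With the output head tied to a now fully trainable embedding table, the separate LoRA parametrization on output_layer is dropped, and the fine-tune is exposed as two pieces of state: the token embeddings and the LoRA tensors. A hedged sketch of how those two helpers could be used to save and restore an adapter; the wrapper object, checkpoint keys, and file path are assumptions for the example, not an API documented in this repository:

# Assumes `model` is a LightGPTInstruct-like wrapper exposing the two methods
# added in the hunk above, with the wrapped base model at `model.model`.
import torch


def save_adapter(model, path: str) -> None:
    """Persist only what instruct fine-tuning changed: the (possibly resized)
    token embeddings and the LoRA tensors."""
    torch.save(
        {
            "token_embeddings": model.token_embeddings_state_dict(),
            "lora": model.lora_state_dict(),
        },
        path,
    )


def load_adapter(model, path: str) -> None:
    """Restore a saved fine-tune onto a freshly wrapped base model."""
    checkpoint = torch.load(path, weights_only=True)

    model.model.token_embeddings.load_state_dict(checkpoint["token_embeddings"])
    model.load_state_dict(checkpoint["lora"], strict=False)  # only LoRA tensors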
@@ -654,9 +657,7 @@ class LoRA(Module):
         if alpha <= 0.0:
             raise ValueError(f"Alpha must be greater than 0, {alpha} given.")
 
-
-
-        self.lora_a = Parameter(torch.randn(rank, in_features) * std_dev)
+        self.lora_a = Parameter(torch.randn(rank, in_features) / sqrt(rank))
         self.lora_b = Parameter(torch.zeros(out_features, rank))
 
         self.dropout = Dropout1d(p=dropout)
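The A matrix is now drawn directly as randn / sqrt(rank) instead of randn * std_dev, i.e. an explicit standard deviation of 1 / sqrt(rank), while B stays zero-initialized, so the low-rank update B @ A is exactly zero when fine-tuning begins. A minimal, hypothetical parametrization sketch of that idea; the forward pass, the alpha / rank scaling, and where dropout is applied are common LoRA conventions assumed here, not taken from the hunk:

from math import sqrt

import torch
from torch.nn import Dropout1d, Linear, Module, Parameter
from torch.nn.utils.parametrize import register_parametrization


class LoRASketch(Module):
    def __init__(self, in_features: int, out_features: int, rank: int, alpha: float, dropout: float):
        super().__init__()

        # A has variance 1 / rank; B starts at zero so the initial update is zero.
        self.lora_a = Parameter(torch.randn(rank, in_features) / sqrt(rank))
        self.lora_b = Parameter(torch.zeros(out_features, rank))

        self.dropout = Dropout1d(p=dropout)

        self.alpha = alpha
        self.rank = rank

    def forward(self, weight: torch.Tensor) -> torch.Tensor:
        # Frozen base weight plus a scaled low-rank update (assumed convention).
        return weight + (self.alpha / self.rank) * self.dropout(self.lora_b @ self.lora_a)


layer = Linear(16, 16)
layer.weight.requires_grad = False  # the base weight stays frozen

register_parametrization(layer, "weight", LoRASketch(16, 16, rank=4, alpha=8.0, dropout=0.1))

out = layer(torch.randn(2, 16))  # transparently uses the parametrized weight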
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:32e214d9dddfc85c390ffb8ac314064074a147bacbefce772f21a04d6337eab0
 size 1414060818