from transformers import GPT2Config


class GPTOptimConfig(GPT2Config):
    model_type = "gpt_optimized"

    def __init__(
        self,
        block_size: int = 1024,  # max sequence length
        vocab_size: int = 50257,  # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
        n_layer: int = 16,  # number of transformer layers
        n_head: int = 16,  # number of attention heads
        n_embd: int = 1024,  # embedding dimension
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
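
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): registering the
# custom config with transformers' Auto classes and instantiating it.
# `AutoConfig.register` is the standard transformers API for mapping a custom
# `model_type` string to a config class; how this file is actually consumed
# downstream is an assumption, not something the file itself confirms.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoConfig

    # Map the "gpt_optimized" model_type to this config class so AutoConfig
    # can resolve it (e.g. when loading a repo with trust_remote_code=True).
    AutoConfig.register("gpt_optimized", GPTOptimConfig)

    # Instantiate with the defaults above; any GPT2Config kwargs pass through
    # to super().__init__ before the fields here are assigned.
    config = GPTOptimConfig()
    print(config.model_type)  # "gpt_optimized"
    print(config.n_layer, config.n_head, config.n_embd)  # 16 16 1024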