Gnider commited on
Commit
0d003ec
·
verified ·
1 Parent(s): 6f6364b

Upload 8 files

Browse files
README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ru
4
+ tags:
5
+ - PyTorch
6
+ - Transformers
7
+ thumbnail: "https://github.com/sberbank-ai/ru-gpts"
8
+ ---
9
+
10
+ # rugpt3medium\_based\_on\_gpt2
11
+ The model architecture design, pretraining, and evaluation are documented in our preprint: [**A Family of Pretrained Transformer Language Models for Russian**](https://arxiv.org/abs/2309.10931).
12
+
13
+
14
+ The model was pretrained with sequence length 1024 using the Transformers library by the [SberDevices](https://sberdevices.ru/) team on 80B tokens for 3 epochs. After that, the model was finetuned with the context size of 2048 tokens.
15
+
16
+ Total training time was around 16 days on 64 GPUs.
17
+ The final perplexity on the test set is `17.4`.
18
+
19
+ # Authors
20
+ + NLP core team RnD [Telegram channel](https://t.me/nlpcoreteam):
21
+ + Dmitry Zmitrovich
22
+
23
+ # Cite us
24
+ ```
25
+ @misc{zmitrovich2023family,
26
+ title={A Family of Pretrained Transformer Language Models for Russian},
27
+ author={Dmitry Zmitrovich and Alexander Abramov and Andrey Kalmykov and Maria Tikhonova and Ekaterina Taktasheva and Danil Astafurov and Mark Baushenko and Artem Snegirev and Tatiana Shavrina and Sergey Markov and Vladislav Mikhailov and Alena Fenogenova},
28
+ year={2023},
29
+ eprint={2309.10931},
30
+ archivePrefix={arXiv},
31
+ primaryClass={cs.CL}
32
+ }
33
+ ```
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 1,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 2,
10
+ "id2label": {
11
+ "0": "LABEL_0"
12
+ },
13
+ "initializer_range": 0.02,
14
+ "label2id": {
15
+ "LABEL_0": 0
16
+ },
17
+ "layer_norm_epsilon": 1e-05,
18
+ "model_type": "gpt2",
19
+ "n_ctx": 2048,
20
+ "n_embd": 1024,
21
+ "n_head": 16,
22
+ "n_inner": null,
23
+ "n_layer": 24,
24
+ "n_positions": 2048,
25
+ "n_special": 0,
26
+ "output_past": true,
27
+ "pad_token_id": 0,
28
+ "predict_special_tokens": true,
29
+ "resid_pdrop": 0.1,
30
+ "summary_activation": null,
31
+ "summary_first_dropout": 0.1,
32
+ "summary_proj_to_labels": true,
33
+ "summary_type": "cls_index",
34
+ "summary_use_proj": true,
35
+ "use_cache": true,
36
+ "vocab_size": 50257
37
+ }
gitattributes ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b90e512e3d01d703b231a0cb160d16c025e1f1175e06c3dead5b6b1c75383f28
3
+ size 1730074771
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": "<mask>",
17
+ "pad_token": {
18
+ "content": "<pad>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "unk_token": {
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<sep>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "bos_token": "<s>",
55
+ "clean_up_tokenization_spaces": true,
56
+ "eos_token": "</s>",
57
+ "errors": "replace",
58
+ "mask_token": "<mask>",
59
+ "model_max_length": 2048,
60
+ "pad_token": "<pad>",
61
+ "padding_side": "left",
62
+ "tokenizer_class": "GPT2Tokenizer",
63
+ "truncation_side": "left",
64
+ "trust_remote_code": false,
65
+ "unk_token": "<unk>",
66
+ "sep_token": "<sep>"
67
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff