update config and vocab with new special tokens handling
Browse files- config.json +40 -37
- vocab.json +10 -5
config.json
CHANGED
@@ -1,36 +1,39 @@
|
|
1 |
{
|
2 |
-
"
|
3 |
-
"
|
4 |
"transforms": [
|
5 |
"onmt_tokenize",
|
6 |
"filtertoolong"
|
7 |
],
|
8 |
-
"
|
9 |
-
"
|
10 |
-
"skip_empty_level": "silent",
|
11 |
-
"share_vocab": true,
|
12 |
-
"n_sample": 0,
|
13 |
-
"vocab_size_multiple": 8,
|
14 |
"save_data": null,
|
|
|
15 |
"tgt_vocab": null,
|
|
|
|
|
|
|
|
|
16 |
"src_vocab_size": 128256,
|
|
|
|
|
17 |
"training": {
|
18 |
-
"accum_count": [
|
19 |
-
32
|
20 |
-
],
|
21 |
-
"batch_size": 896,
|
22 |
"accum_steps": [
|
23 |
0
|
24 |
],
|
25 |
-
"group_size": 0,
|
26 |
-
"valid_batch_size": 256,
|
27 |
"batch_size_multiple": 1,
|
28 |
-
"
|
|
|
29 |
"quant_type": "",
|
|
|
|
|
|
|
30 |
"w_bit": 0,
|
31 |
"compute_dtype": "torch.bfloat16",
|
32 |
-
"
|
33 |
-
"quant_layers": []
|
|
|
|
|
34 |
},
|
35 |
"transforms_configs": {
|
36 |
"filtertoolong": {
|
@@ -1070,42 +1073,42 @@
|
|
1070 |
}
|
1071 |
},
|
1072 |
"model": {
|
1073 |
-
"
|
1074 |
-
"
|
1075 |
-
"
|
1076 |
-
"layer_norm": "rms",
|
1077 |
"num_experts": 0,
|
1078 |
"heads": 32,
|
1079 |
-
"
|
1080 |
-
"add_qkvbias": false,
|
1081 |
-
"head_dim": null,
|
1082 |
-
"transformer_ff": 14336,
|
1083 |
-
"heads_kv": 8,
|
1084 |
"norm_eps": 1e-05,
|
1085 |
-
"
|
1086 |
"mlp_activation_fn": "gated-silu",
|
1087 |
-
"
|
1088 |
-
"shared_layer_norm": false,
|
1089 |
-
"sliding_window": 0,
|
1090 |
"layers": 32,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1091 |
"embeddings": {
|
1092 |
"n_positions": 0,
|
|
|
1093 |
"position_encoding_type": "Rotary",
|
1094 |
-
"src_word_vec_size": 4096
|
1095 |
-
"tgt_word_vec_size": 4096
|
1096 |
},
|
1097 |
"rope_config": {
|
1098 |
-
"
|
1099 |
-
"
|
1100 |
},
|
1101 |
"decoder": {
|
1102 |
"position_encoding_type": "Rotary",
|
1103 |
-
"decoder_type": "transformer_lm",
|
1104 |
"n_positions": 0,
|
|
|
1105 |
"tgt_word_vec_size": 4096,
|
1106 |
"rope_config": {
|
1107 |
-
"
|
1108 |
-
"
|
1109 |
}
|
1110 |
}
|
1111 |
},
|
|
|
1 |
{
|
2 |
+
"eos_token": "<|end_of_text|>",
|
3 |
+
"share_vocab": true,
|
4 |
"transforms": [
|
5 |
"onmt_tokenize",
|
6 |
"filtertoolong"
|
7 |
],
|
8 |
+
"data": null,
|
9 |
+
"tgt_vocab_size": 128256,
|
|
|
|
|
|
|
|
|
10 |
"save_data": null,
|
11 |
+
"vocab_size_multiple": 8,
|
12 |
"tgt_vocab": null,
|
13 |
+
"n_sample": 0,
|
14 |
+
"src_vocab": null,
|
15 |
+
"skip_empty_level": "silent",
|
16 |
+
"bos_token": "<|begin_of_text|>",
|
17 |
"src_vocab_size": 128256,
|
18 |
+
"decoder_start_token": "<|begin_of_text|>",
|
19 |
+
"pad_token": "<blank>",
|
20 |
"training": {
|
|
|
|
|
|
|
|
|
21 |
"accum_steps": [
|
22 |
0
|
23 |
],
|
|
|
|
|
24 |
"batch_size_multiple": 1,
|
25 |
+
"valid_batch_size": 256,
|
26 |
+
"batch_type": "tokens",
|
27 |
"quant_type": "",
|
28 |
+
"accum_count": [
|
29 |
+
32
|
30 |
+
],
|
31 |
"w_bit": 0,
|
32 |
"compute_dtype": "torch.bfloat16",
|
33 |
+
"normalization": "tokens",
|
34 |
+
"quant_layers": [],
|
35 |
+
"group_size": 0,
|
36 |
+
"batch_size": 896
|
37 |
},
|
38 |
"transforms_configs": {
|
39 |
"filtertoolong": {
|
|
|
1073 |
}
|
1074 |
},
|
1075 |
"model": {
|
1076 |
+
"head_dim": null,
|
1077 |
+
"sliding_window": 0,
|
1078 |
+
"architecture": "transformer_lm",
|
|
|
1079 |
"num_experts": 0,
|
1080 |
"heads": 32,
|
1081 |
+
"hidden_size": 4096,
|
|
|
|
|
|
|
|
|
1082 |
"norm_eps": 1e-05,
|
1083 |
+
"num_experts_per_tok": 0,
|
1084 |
"mlp_activation_fn": "gated-silu",
|
1085 |
+
"left_pad": true,
|
|
|
|
|
1086 |
"layers": 32,
|
1087 |
+
"layer_norm": "rms",
|
1088 |
+
"shared_layer_norm": false,
|
1089 |
+
"add_ffnbias": false,
|
1090 |
+
"heads_kv": 8,
|
1091 |
+
"add_qkvbias": false,
|
1092 |
+
"parallel_residual": false,
|
1093 |
+
"transformer_ff": 14336,
|
1094 |
"embeddings": {
|
1095 |
"n_positions": 0,
|
1096 |
+
"tgt_word_vec_size": 4096,
|
1097 |
"position_encoding_type": "Rotary",
|
1098 |
+
"src_word_vec_size": 4096
|
|
|
1099 |
},
|
1100 |
"rope_config": {
|
1101 |
+
"rotary_interleave": false,
|
1102 |
+
"rotary_theta": 500000
|
1103 |
},
|
1104 |
"decoder": {
|
1105 |
"position_encoding_type": "Rotary",
|
|
|
1106 |
"n_positions": 0,
|
1107 |
+
"decoder_type": "transformer_lm",
|
1108 |
"tgt_word_vec_size": 4096,
|
1109 |
"rope_config": {
|
1110 |
+
"rotary_interleave": false,
|
1111 |
+
"rotary_theta": 500000
|
1112 |
}
|
1113 |
}
|
1114 |
},
|
vocab.json
CHANGED
@@ -128000,7 +128000,7 @@
|
|
128000 |
"ĠвÑĭÑģокой",
|
128001 |
"ãĥ¼ãĥ¼",
|
128002 |
"éĶ¦",
|
128003 |
-
"
|
128004 |
"<|end_of_text|>",
|
128005 |
"<|reserved_special_token_0|>",
|
128006 |
"<|reserved_special_token_1|>",
|
@@ -128009,7 +128009,7 @@
|
|
128009 |
"<|start_header_id|>",
|
128010 |
"<|end_header_id|>",
|
128011 |
"<|reserved_special_token_4|>",
|
128012 |
-
"
|
128013 |
"<|reserved_special_token_5|>",
|
128014 |
"<|reserved_special_token_6|>",
|
128015 |
"<|reserved_special_token_7|>",
|
@@ -256258,7 +256258,7 @@
|
|
256258 |
"ĠвÑĭÑģокой",
|
256259 |
"ãĥ¼ãĥ¼",
|
256260 |
"éĶ¦",
|
256261 |
-
"
|
256262 |
"<|end_of_text|>",
|
256263 |
"<|reserved_special_token_0|>",
|
256264 |
"<|reserved_special_token_1|>",
|
@@ -256267,7 +256267,7 @@
|
|
256267 |
"<|start_header_id|>",
|
256268 |
"<|end_header_id|>",
|
256269 |
"<|reserved_special_token_4|>",
|
256270 |
-
"
|
256271 |
"<|reserved_special_token_5|>",
|
256272 |
"<|reserved_special_token_6|>",
|
256273 |
"<|reserved_special_token_7|>",
|
@@ -256515,5 +256515,10 @@
|
|
256515 |
"<|reserved_special_token_249|>",
|
256516 |
"<|reserved_special_token_250|>"
|
256517 |
],
|
256518 |
-
"decoder_start_token": "
|
|
|
|
|
|
|
|
|
|
|
256519 |
}
|
|
|
128000 |
"ĠвÑĭÑģокой",
|
128001 |
"ãĥ¼ãĥ¼",
|
128002 |
"éĶ¦",
|
128003 |
+
"<|begin_of_text|>",
|
128004 |
"<|end_of_text|>",
|
128005 |
"<|reserved_special_token_0|>",
|
128006 |
"<|reserved_special_token_1|>",
|
|
|
128009 |
"<|start_header_id|>",
|
128010 |
"<|end_header_id|>",
|
128011 |
"<|reserved_special_token_4|>",
|
128012 |
+
"<|eot_id|>",
|
128013 |
"<|reserved_special_token_5|>",
|
128014 |
"<|reserved_special_token_6|>",
|
128015 |
"<|reserved_special_token_7|>",
|
|
|
256258 |
"ĠвÑĭÑģокой",
|
256259 |
"ãĥ¼ãĥ¼",
|
256260 |
"éĶ¦",
|
256261 |
+
"<|begin_of_text|>",
|
256262 |
"<|end_of_text|>",
|
256263 |
"<|reserved_special_token_0|>",
|
256264 |
"<|reserved_special_token_1|>",
|
|
|
256267 |
"<|start_header_id|>",
|
256268 |
"<|end_header_id|>",
|
256269 |
"<|reserved_special_token_4|>",
|
256270 |
+
"<|eot_id|>",
|
256271 |
"<|reserved_special_token_5|>",
|
256272 |
"<|reserved_special_token_6|>",
|
256273 |
"<|reserved_special_token_7|>",
|
|
|
256515 |
"<|reserved_special_token_249|>",
|
256516 |
"<|reserved_special_token_250|>"
|
256517 |
],
|
256518 |
+
"decoder_start_token": "<|begin_of_text|>",
|
256519 |
+
"specials": {
|
256520 |
+
"bos_token": "<|begin_of_text|>",
|
256521 |
+
"eos_token": "<|end_of_text|>",
|
256522 |
+
"pad_token": "<blank>"
|
256523 |
+
}
|
256524 |
}
|