speed committed on
Commit
de76ae5
·
verified ·
1 Parent(s): 63f161c

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -4
  2. tokenizer.json +12 -12
  3. tokenizer_config.json +12 -10
special_tokens_map.json CHANGED
@@ -7,7 +7,7 @@
7
  "single_word": false
8
  },
9
  "cls_token": {
10
- "content": "<s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
@@ -21,21 +21,21 @@
21
  "single_word": false
22
  },
23
  "mask_token": {
24
- "content": "<MASK>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
  "pad_token": {
31
- "content": "<PAD>",
32
  "lstrip": false,
33
  "normalized": false,
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
  "sep_token": {
38
- "content": "</s>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "cls_token": {
10
+ "content": "<CLS|LLM-jp>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
 
21
  "single_word": false
22
  },
23
  "mask_token": {
24
+ "content": "<MASK|LLM-jp>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
  "pad_token": {
31
+ "content": "<PAD|LLM-jp>",
32
  "lstrip": false,
33
  "normalized": false,
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
  "sep_token": {
38
+ "content": "<SEP|LLM-jp>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
tokenizer.json CHANGED
@@ -32,7 +32,7 @@
32
  },
33
  {
34
  "id": 3,
35
- "content": "<MASK>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
@@ -41,7 +41,7 @@
41
  },
42
  {
43
  "id": 4,
44
- "content": "<PAD>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
@@ -50,7 +50,7 @@
50
  },
51
  {
52
  "id": 5,
53
- "content": "<CLS>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
@@ -59,7 +59,7 @@
59
  },
60
  {
61
  "id": 6,
62
- "content": "<SEP>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
@@ -68,7 +68,7 @@
68
  },
69
  {
70
  "id": 7,
71
- "content": "<EOD>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
@@ -145,8 +145,8 @@
145
  },
146
  {
147
  "SpecialToken": {
148
- "id": "<s>",
149
- "type_id": 1
150
  }
151
  }
152
  ],
@@ -213,23 +213,23 @@
213
  -127.5
214
  ],
215
  [
216
- "<MASK>",
217
  -127.5
218
  ],
219
  [
220
- "<PAD>",
221
  -127.5
222
  ],
223
  [
224
- "<CLS>",
225
  -127.5
226
  ],
227
  [
228
- "<SEP>",
229
  -127.5
230
  ],
231
  [
232
- "<EOD>",
233
  -127.5
234
  ],
235
  [
 
32
  },
33
  {
34
  "id": 3,
35
+ "content": "<MASK|LLM-jp>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
 
41
  },
42
  {
43
  "id": 4,
44
+ "content": "<PAD|LLM-jp>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
 
50
  },
51
  {
52
  "id": 5,
53
+ "content": "<CLS|LLM-jp>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
 
59
  },
60
  {
61
  "id": 6,
62
+ "content": "<SEP|LLM-jp>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
 
68
  },
69
  {
70
  "id": 7,
71
+ "content": "<EOD|LLM-jp>",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
 
145
  },
146
  {
147
  "SpecialToken": {
148
+ "id": "</s>",
149
+ "type_id": 0
150
  }
151
  }
152
  ],
 
213
  -127.5
214
  ],
215
  [
216
+ "<MASK|LLM-jp>",
217
  -127.5
218
  ],
219
  [
220
+ "<PAD|LLM-jp>",
221
  -127.5
222
  ],
223
  [
224
+ "<CLS|LLM-jp>",
225
  -127.5
226
  ],
227
  [
228
+ "<SEP|LLM-jp>",
229
  -127.5
230
  ],
231
  [
232
+ "<EOD|LLM-jp>",
233
  -127.5
234
  ],
235
  [
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<unk>",
@@ -25,7 +27,7 @@
25
  "special": true
26
  },
27
  "3": {
28
- "content": "<MASK>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
@@ -33,7 +35,7 @@
33
  "special": true
34
  },
35
  "4": {
36
- "content": "<PAD>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
@@ -41,7 +43,7 @@
41
  "special": true
42
  },
43
  "5": {
44
- "content": "<CLS>",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
@@ -49,7 +51,7 @@
49
  "special": true
50
  },
51
  "6": {
52
- "content": "<SEP>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
@@ -57,7 +59,7 @@
57
  "special": true
58
  },
59
  "7": {
60
- "content": "<EOD>",
61
  "lstrip": false,
62
  "normalized": false,
63
  "rstrip": false,
@@ -67,14 +69,14 @@
67
  },
68
  "bos_token": "<s>",
69
  "clean_up_tokenization_spaces": false,
70
- "cls_token": "<s>",
71
- "eod_token": "<EOD>",
72
  "eos_token": "</s>",
73
  "extra_ids": 0,
74
- "mask_token": "<MASK>",
75
  "model_max_length": 1000000000000000019884624838656,
76
- "pad_token": "<PAD>",
77
- "sep_token": "</s>",
78
  "sp_model_kwargs": {},
79
  "tokenizer_class": "PreTrainedTokenizerFast",
80
  "unk_token": "<unk>"
 
1
  {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
 
27
  "special": true
28
  },
29
  "3": {
30
+ "content": "<MASK|LLM-jp>",
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
 
35
  "special": true
36
  },
37
  "4": {
38
+ "content": "<PAD|LLM-jp>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
 
43
  "special": true
44
  },
45
  "5": {
46
+ "content": "<CLS|LLM-jp>",
47
  "lstrip": false,
48
  "normalized": false,
49
  "rstrip": false,
 
51
  "special": true
52
  },
53
  "6": {
54
+ "content": "<SEP|LLM-jp>",
55
  "lstrip": false,
56
  "normalized": false,
57
  "rstrip": false,
 
59
  "special": true
60
  },
61
  "7": {
62
+ "content": "<EOD|LLM-jp>",
63
  "lstrip": false,
64
  "normalized": false,
65
  "rstrip": false,
 
69
  },
70
  "bos_token": "<s>",
71
  "clean_up_tokenization_spaces": false,
72
+ "cls_token": "<CLS|LLM-jp>",
73
+ "eod_token": "</s>",
74
  "eos_token": "</s>",
75
  "extra_ids": 0,
76
+ "mask_token": "<MASK|LLM-jp>",
77
  "model_max_length": 1000000000000000019884624838656,
78
+ "pad_token": "<PAD|LLM-jp>",
79
+ "sep_token": "<SEP|LLM-jp>",
80
  "sp_model_kwargs": {},
81
  "tokenizer_class": "PreTrainedTokenizerFast",
82
  "unk_token": "<unk>"