kh4dien commited on
Commit
9b65536
·
verified ·
1 Parent(s): e668aed

Upload tokenizer.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.json +213 -0
tokenizer.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 100,
17
+ "content": "<s>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 101,
26
+ "content": "</s>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 102,
35
+ "content": "<pad>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": null,
44
+ "pre_tokenizer": {
45
+ "type": "Split",
46
+ "pattern": {
47
+ "String": " "
48
+ },
49
+ "behavior": "Isolated",
50
+ "invert": false
51
+ },
52
+ "post_processor": null,
53
+ "decoder": null,
54
+ "model": {
55
+ "type": "BPE",
56
+ "dropout": null,
57
+ "unk_token": "<unk>",
58
+ "continuing_subword_prefix": null,
59
+ "end_of_word_suffix": null,
60
+ "fuse_unk": false,
61
+ "byte_fallback": false,
62
+ "ignore_merges": false,
63
+ "vocab": {
64
+ "<unk>": 0,
65
+ " ": 1,
66
+ ".": 2,
67
+ "A": 3,
68
+ "B": 4,
69
+ "C": 5,
70
+ "D": 6,
71
+ "E": 7,
72
+ "F": 8,
73
+ "G": 9,
74
+ "H": 10,
75
+ "I": 11,
76
+ "J": 12,
77
+ "K": 13,
78
+ "L": 14,
79
+ "M": 15,
80
+ "N": 16,
81
+ "O": 17,
82
+ "P": 18,
83
+ "Q": 19,
84
+ "R": 20,
85
+ "S": 21,
86
+ "T": 22,
87
+ "U": 23,
88
+ "V": 24,
89
+ "W": 25,
90
+ "X": 26,
91
+ "Y": 27,
92
+ "Z": 28,
93
+ "a": 29,
94
+ "b": 30,
95
+ "c": 31,
96
+ "d": 32,
97
+ "e": 33,
98
+ "f": 34,
99
+ "g": 35,
100
+ "h": 36,
101
+ "i": 37,
102
+ "j": 38,
103
+ "k": 39,
104
+ "l": 40,
105
+ "m": 41,
106
+ "n": 42,
107
+ "o": 43,
108
+ "p": 44,
109
+ "q": 45,
110
+ "r": 46,
111
+ "s": 47,
112
+ "t": 48,
113
+ "u": 49,
114
+ "v": 50,
115
+ "w": 51,
116
+ "x": 52,
117
+ "y": 53,
118
+ "z": 54,
119
+ "ch": 55,
120
+ "cho": 56,
121
+ "echo": 57,
122
+ "up": 58,
123
+ "pe": 59,
124
+ "uppe": 60,
125
+ "upper": 61,
126
+ "fo": 62,
127
+ "nc": 63,
128
+ "b.": 64,
129
+ "et": 65,
130
+ "ip": 66,
131
+ "mk": 67,
132
+ "qz": 68,
133
+ "wx": 69,
134
+ "kx": 70,
135
+ "qo": 71,
136
+ "y.": 72,
137
+ "ej": 73,
138
+ "k.": 74,
139
+ "tg": 75,
140
+ "bt": 76,
141
+ "iu": 77,
142
+ "kk": 78,
143
+ "sx": 79,
144
+ "B.": 80,
145
+ "M.": 81,
146
+ "fy": 82,
147
+ "mp": 83,
148
+ "ou": 84,
149
+ "pf": 85,
150
+ "v.": 86,
151
+ "yo": 87,
152
+ "P.": 88,
153
+ "jg": 89,
154
+ "kq": 90,
155
+ "m.": 91,
156
+ "mt": 92,
157
+ "nh": 93,
158
+ "qr": 94,
159
+ "wq": 95,
160
+ "O.": 96,
161
+ "aj": 97,
162
+ "ax": 98,
163
+ "dt": 99
164
+ },
165
+ "merges": [
166
+ "c h",
167
+ "ch o",
168
+ "e cho",
169
+ "u p",
170
+ "p e",
171
+ "up pe",
172
+ "uppe r",
173
+ "f o",
174
+ "n c",
175
+ "b .",
176
+ "e t",
177
+ "i p",
178
+ "m k",
179
+ "q z",
180
+ "w x",
181
+ "k x",
182
+ "q o",
183
+ "y .",
184
+ "e j",
185
+ "k .",
186
+ "t g",
187
+ "b t",
188
+ "i u",
189
+ "k k",
190
+ "s x",
191
+ "B .",
192
+ "M .",
193
+ "f y",
194
+ "m p",
195
+ "o u",
196
+ "p f",
197
+ "v .",
198
+ "y o",
199
+ "P .",
200
+ "j g",
201
+ "k q",
202
+ "m .",
203
+ "m t",
204
+ "n h",
205
+ "q r",
206
+ "w q",
207
+ "O .",
208
+ "a j",
209
+ "a x",
210
+ "d t"
211
+ ]
212
+ }
213
+ }