Xenova HF Staff committed on
Commit
6f2427a
·
verified ·
1 Parent(s): 41a6132

Upload 2 files

Browse files
Files changed (2) hide show
  1. tokenizer.json +300 -0
  2. tokenizer_config.json +36 -0
tokenizer.json ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[S1]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": false
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[S2]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": false
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "BytePreTokenizer"
37
+ },
38
+ "post_processor": null,
39
+ "decoder": null,
40
+ "model": {
41
+ "vocab": {
42
+ "\u0000": 0,
43
+ "\u0001": 1,
44
+ "\u0002": 2,
45
+ "\u0003": 3,
46
+ "\u0004": 4,
47
+ "\u0005": 5,
48
+ "\u0006": 6,
49
+ "\u0007": 7,
50
+ "\u0008": 8,
51
+ "\u0009": 9,
52
+ "\u000a": 10,
53
+ "\u000b": 11,
54
+ "\u000c": 12,
55
+ "\u000d": 13,
56
+ "\u000e": 14,
57
+ "\u000f": 15,
58
+ "\u0010": 16,
59
+ "\u0011": 17,
60
+ "\u0012": 18,
61
+ "\u0013": 19,
62
+ "\u0014": 20,
63
+ "\u0015": 21,
64
+ "\u0016": 22,
65
+ "\u0017": 23,
66
+ "\u0018": 24,
67
+ "\u0019": 25,
68
+ "\u001a": 26,
69
+ "\u001b": 27,
70
+ "\u001c": 28,
71
+ "\u001d": 29,
72
+ "\u001e": 30,
73
+ "\u001f": 31,
74
+ "\u0020": 32,
75
+ "\u0021": 33,
76
+ "\u0022": 34,
77
+ "\u0023": 35,
78
+ "\u0024": 36,
79
+ "\u0025": 37,
80
+ "\u0026": 38,
81
+ "\u0027": 39,
82
+ "\u0028": 40,
83
+ "\u0029": 41,
84
+ "\u002a": 42,
85
+ "\u002b": 43,
86
+ "\u002c": 44,
87
+ "\u002d": 45,
88
+ "\u002e": 46,
89
+ "\u002f": 47,
90
+ "\u0030": 48,
91
+ "\u0031": 49,
92
+ "\u0032": 50,
93
+ "\u0033": 51,
94
+ "\u0034": 52,
95
+ "\u0035": 53,
96
+ "\u0036": 54,
97
+ "\u0037": 55,
98
+ "\u0038": 56,
99
+ "\u0039": 57,
100
+ "\u003a": 58,
101
+ "\u003b": 59,
102
+ "\u003c": 60,
103
+ "\u003d": 61,
104
+ "\u003e": 62,
105
+ "\u003f": 63,
106
+ "\u0040": 64,
107
+ "\u0041": 65,
108
+ "\u0042": 66,
109
+ "\u0043": 67,
110
+ "\u0044": 68,
111
+ "\u0045": 69,
112
+ "\u0046": 70,
113
+ "\u0047": 71,
114
+ "\u0048": 72,
115
+ "\u0049": 73,
116
+ "\u004a": 74,
117
+ "\u004b": 75,
118
+ "\u004c": 76,
119
+ "\u004d": 77,
120
+ "\u004e": 78,
121
+ "\u004f": 79,
122
+ "\u0050": 80,
123
+ "\u0051": 81,
124
+ "\u0052": 82,
125
+ "\u0053": 83,
126
+ "\u0054": 84,
127
+ "\u0055": 85,
128
+ "\u0056": 86,
129
+ "\u0057": 87,
130
+ "\u0058": 88,
131
+ "\u0059": 89,
132
+ "\u005a": 90,
133
+ "\u005b": 91,
134
+ "\u005c": 92,
135
+ "\u005d": 93,
136
+ "\u005e": 94,
137
+ "\u005f": 95,
138
+ "\u0060": 96,
139
+ "\u0061": 97,
140
+ "\u0062": 98,
141
+ "\u0063": 99,
142
+ "\u0064": 100,
143
+ "\u0065": 101,
144
+ "\u0066": 102,
145
+ "\u0067": 103,
146
+ "\u0068": 104,
147
+ "\u0069": 105,
148
+ "\u006a": 106,
149
+ "\u006b": 107,
150
+ "\u006c": 108,
151
+ "\u006d": 109,
152
+ "\u006e": 110,
153
+ "\u006f": 111,
154
+ "\u0070": 112,
155
+ "\u0071": 113,
156
+ "\u0072": 114,
157
+ "\u0073": 115,
158
+ "\u0074": 116,
159
+ "\u0075": 117,
160
+ "\u0076": 118,
161
+ "\u0077": 119,
162
+ "\u0078": 120,
163
+ "\u0079": 121,
164
+ "\u007a": 122,
165
+ "\u007b": 123,
166
+ "\u007c": 124,
167
+ "\u007d": 125,
168
+ "\u007e": 126,
169
+ "\u007f": 127,
170
+ "\u0080": 128,
171
+ "\u0081": 129,
172
+ "\u0082": 130,
173
+ "\u0083": 131,
174
+ "\u0084": 132,
175
+ "\u0085": 133,
176
+ "\u0086": 134,
177
+ "\u0087": 135,
178
+ "\u0088": 136,
179
+ "\u0089": 137,
180
+ "\u008a": 138,
181
+ "\u008b": 139,
182
+ "\u008c": 140,
183
+ "\u008d": 141,
184
+ "\u008e": 142,
185
+ "\u008f": 143,
186
+ "\u0090": 144,
187
+ "\u0091": 145,
188
+ "\u0092": 146,
189
+ "\u0093": 147,
190
+ "\u0094": 148,
191
+ "\u0095": 149,
192
+ "\u0096": 150,
193
+ "\u0097": 151,
194
+ "\u0098": 152,
195
+ "\u0099": 153,
196
+ "\u009a": 154,
197
+ "\u009b": 155,
198
+ "\u009c": 156,
199
+ "\u009d": 157,
200
+ "\u009e": 158,
201
+ "\u009f": 159,
202
+ "\u00a0": 160,
203
+ "\u00a1": 161,
204
+ "\u00a2": 162,
205
+ "\u00a3": 163,
206
+ "\u00a4": 164,
207
+ "\u00a5": 165,
208
+ "\u00a6": 166,
209
+ "\u00a7": 167,
210
+ "\u00a8": 168,
211
+ "\u00a9": 169,
212
+ "\u00aa": 170,
213
+ "\u00ab": 171,
214
+ "\u00ac": 172,
215
+ "\u00ad": 173,
216
+ "\u00ae": 174,
217
+ "\u00af": 175,
218
+ "\u00b0": 176,
219
+ "\u00b1": 177,
220
+ "\u00b2": 178,
221
+ "\u00b3": 179,
222
+ "\u00b4": 180,
223
+ "\u00b5": 181,
224
+ "\u00b6": 182,
225
+ "\u00b7": 183,
226
+ "\u00b8": 184,
227
+ "\u00b9": 185,
228
+ "\u00ba": 186,
229
+ "\u00bb": 187,
230
+ "\u00bc": 188,
231
+ "\u00bd": 189,
232
+ "\u00be": 190,
233
+ "\u00bf": 191,
234
+ "\u00c0": 192,
235
+ "\u00c1": 193,
236
+ "\u00c2": 194,
237
+ "\u00c3": 195,
238
+ "\u00c4": 196,
239
+ "\u00c5": 197,
240
+ "\u00c6": 198,
241
+ "\u00c7": 199,
242
+ "\u00c8": 200,
243
+ "\u00c9": 201,
244
+ "\u00ca": 202,
245
+ "\u00cb": 203,
246
+ "\u00cc": 204,
247
+ "\u00cd": 205,
248
+ "\u00ce": 206,
249
+ "\u00cf": 207,
250
+ "\u00d0": 208,
251
+ "\u00d1": 209,
252
+ "\u00d2": 210,
253
+ "\u00d3": 211,
254
+ "\u00d4": 212,
255
+ "\u00d5": 213,
256
+ "\u00d6": 214,
257
+ "\u00d7": 215,
258
+ "\u00d8": 216,
259
+ "\u00d9": 217,
260
+ "\u00da": 218,
261
+ "\u00db": 219,
262
+ "\u00dc": 220,
263
+ "\u00dd": 221,
264
+ "\u00de": 222,
265
+ "\u00df": 223,
266
+ "\u00e0": 224,
267
+ "\u00e1": 225,
268
+ "\u00e2": 226,
269
+ "\u00e3": 227,
270
+ "\u00e4": 228,
271
+ "\u00e5": 229,
272
+ "\u00e6": 230,
273
+ "\u00e7": 231,
274
+ "\u00e8": 232,
275
+ "\u00e9": 233,
276
+ "\u00ea": 234,
277
+ "\u00eb": 235,
278
+ "\u00ec": 236,
279
+ "\u00ed": 237,
280
+ "\u00ee": 238,
281
+ "\u00ef": 239,
282
+ "\u00f0": 240,
283
+ "\u00f1": 241,
284
+ "\u00f2": 242,
285
+ "\u00f3": 243,
286
+ "\u00f4": 244,
287
+ "\u00f5": 245,
288
+ "\u00f6": 246,
289
+ "\u00f7": 247,
290
+ "\u00f8": 248,
291
+ "\u00f9": 249,
292
+ "\u00fa": 250,
293
+ "\u00fb": 251,
294
+ "\u00fc": 252,
295
+ "\u00fd": 253,
296
+ "\u00fe": 254,
297
+ "\u00ff": 255
298
+ }
299
+ }
300
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[S1]",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "[S2]",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": false
26
+ }
27
+ },
28
+ "clean_up_tokenization_spaces": false,
29
+ "extra_special_tokens": {},
30
+ "max_length": 1024,
31
+ "model_max_length": 1000000000000000019884624838656,
32
+ "pad_token": "<pad>",
33
+ "processor_class": "DiaProcessor",
34
+ "tokenizer_class": "DiaTokenizer",
35
+ "unk_token": "<pad>"
36
+ }