gpt2-dzongkha-text / tokenizer.json
caffsean's picture
Upload tokenizer
0d0d50a
raw
history blame
18 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab": {
"<|endoftext|>": 0,
",": 1,
"-": 2,
":": 3,
"A": 4,
"D": 5,
"E": 6,
"G": 7,
"H": 8,
"I": 9,
"K": 10,
"M": 11,
"N": 12,
"O": 13,
"R": 14,
"Z": 15,
"a": 16,
"b": 17,
"c": 18,
"d": 19,
"e": 20,
"g": 21,
"h": 22,
"i": 23,
"j": 24,
"k": 25,
"l": 26,
"m": 27,
"n": 28,
"o": 29,
"p": 30,
"r": 31,
"s": 32,
"t": 33,
"u": 34,
"w": 35,
"y": 36,
"z": 37,
"¡": 38,
"¢": 39,
"£": 40,
"¤": 41,
"¦": 42,
"¨": 43,
"ª": 44,
"®": 45,
"±": 46,
"²": 47,
"³": 48,
"´": 49,
"¶": 50,
"º": 51,
"»": 52,
"¼": 53,
"½": 54,
"¾": 55,
"Ã": 56,
"à": 57,
"â": 58,
"Ġ": 59,
"Ģ": 60,
"ģ": 61,
"Ĥ": 62,
"Ħ": 63,
"ħ": 64,
"Ĩ": 65,
"ĩ": 66,
"ī": 67,
"ĭ": 68,
"ı": 69,
"IJ": 70,
"ij": 71,
"ĵ": 72,
"Ķ": 73,
"ķ": 74,
"ĸ": 75,
"ĺ": 76,
"Ļ": 77,
"ļ": 78,
"Ŀ": 79,
"ŀ": 80,
"Ł": 81,
"ł": 82,
"Ń": 83,
"à½": 84,
"¼ĭ": 85,
"à¼ĭ": 86,
"AN": 87,
"DZ": 88,
"ED": 89,
"GK": 90,
"HA": 91,
"IZ": 92,
"MAN": 93,
"NGK": 94,
"OMAN": 95,
"ONGK": 96,
"ROMAN": 97,
"Ġà½": 98,
"DZONGK": 99,
"IZED": 100,
"ROMANIZED": 101,
"DZONGKHA": 102,
"Ġà½Ĥ": 103,
"âĢ": 104,
"âĢĻ": 105,
"Ġà½Ĥà½": 106,
"à¾": 107,
"ོ": 108,
"ng": 109,
"ĠâĢĻ": 110,
"ུ": 111,
"à½Ħ": 112,
"ས": 113,
"à½Ĥ": 114,
"ི": 115,
"à½ĺ": 116,
"ེ": 117,
"à½ĸ": 118,
"Ġà½ij": 119,
"ོà¼ĭ": 120,
"Ġg": 121,
"Ġk": 122,
"à½ĵ": 123,
"Ġà½ģ": 124,
"ྱ": 125,
"ch": 126,
"â": 127,
"ར": 128,
"à½Ķ": 129,
"ang": 130,
"Ġt": 131,
"ལ": 132,
"à½ij": 133,
"ha": 134,
"ü": 135,
"Ġch": 136,
"ྲ": 137,
"ླ": 138,
"ུà¼ĭ": 139,
"Ġs": 140,
"Ġà½Ģ": 141,
"ིà¼ĭ": 142,
"ny": 143,
"ê": 144,
"Ġng": 145,
"Ġà½Ħ": 146,
"Ġà½Ĥས": 147,
"ö": 148,
"Ġd": 149,
"Ġz": 150,
"à½Ŀ": 151,
"à½ł": 152,
"Ġà½Ĩ": 153,
"Ġà½Ĥà½Ł": 154,
"ep": 155,
"to": 156,
"ä": 157,
"am": 158,
"Ġj": 159,
"à½Ģ": 160,
"à½ı": 161,
"Ġà½Ĥཡ": 162,
"Ġà½Ĥà½ĵ": 163,
"à½Ĥས": 164,
"Ġkha": 165,
"û": 166,
"Ġp": 167,
"Ġny": 168,
"à½ģ": 169,
"Ġà½Ĥà½ī": 170,
"Ġà½ijà½Ķ": 171,
"Ġà½ijà½Ģ": 172,
"âp": 173,
"རà½Ķ": 174,
"Ġth": 175,
"ap": 176,
"ci": 177,
"ho": 178,
"kha": 179,
"nam": 180,
"ro": 181,
"î": 182,
"ô": 183,
"à½Ĩ": 184,
"Ġà½ĩ": 185,
"Ġà½ī": 186,
"à½Ħà½ĺ": 187,
"ེà¼ĭ": 188,
"Ġà½ijà½Ħ": 189,
"Ġà½ijà½Ĥ": 190,
"âm": 191,
"Ġchu": 192,
"ྲོ": 193,
"Ġsê": 194,
"Ġnga": 195,
"ön": 196,
"Ġdr": 197,
"Ġà½Ĥà½ĵà½ĺ": 198,
"el": 199,
"em": 200,
"en": 201,
"im": 202,
"ing": 203,
"lo": 204,
"lu": 205,
"lâm": 206,
"ni": 207,
"ou": 208,
"ong": 209,
"sh": 210,
"ts": 211,
"zh": 212,
"zê": 213,
"±à¼ĭ": 214,
"Ġc": 215,
"ŀà½ĸ": 216,
"ཤ": 217,
"à½ļ": 218,
"ཱà¼ĭ": 219,
"à½ŀà½ĸ": 220,
"Ġà½IJ": 221,
"Ġà½Ĥà½ħ": 222,
"Ġà½Ĥà½ij": 223,
"Ġà½Ĥà½ŀ": 224,
"ngü": 225,
"à½ĺས": 226,
"Ġà½ijà½ĸ": 227,
"Ġga": 228,
"Ġkh": 229,
"Ġkû": 230,
"à½ĵà½ĺ": 231,
"Ġà½ģà½ĸ": 232,
"ྱོ": 233,
"ྱི": 234,
"che": 235,
"Ġtr": 236,
"à½ijà½Ĥ": 237,
"à½ijà½Ķ": 238,
"ün": 239,
"üzh": 240,
"Ġchang": 241,
"Ġchâp": 242,
"ློà¼ĭ": 243,
"ླུà¼ĭ": 244,
"Ġsep": 245,
"nyî": 246,
"Ġngâ": 247,
"Ġà½Ħà½Ĥ": 248,
"Ġdzê": 249,
"Ġzh": 250,
"Ġà½Ĩà½Ħ": 251,
"Ġà½Ĩརà½Ķ": 252,
"toto": 253,
"Ġkhap": 254,
"Ġthr": 255,
"Ġà½ijà½Ĥà½ł": 256,
"lâmche": 257,
"à½ŀà½ĸས": 258,
"Kün": 259,
"ai": 260,
"al": 261,
"ao": 262,
"au": 263,
"ach": 264,
"aci": 265,
"aro": 266,
"ats": 267,
"bj": 268,
"cä": 269,
"da": 270,
"eng": 271,
"gi": 272,
"go": 273,
"gün": 274,
"hep": 275,
"io": 276,
"iu": 277,
"khep": 278,
"lang": 279,
"lü": 280,
"lep": 281,
"leng": 282,
"lio": 283,
"me": 284,
"mi": 285,
"mkha": 286,
"ne": 287,
"nang": 288,
"nö": 289,
"nap": 290,
"op": 291,
"osh": 292,
"psh": 293,
"rd": 294,
"ri": 295,
"ram": 296,
"ring": 297,
"sang": 298,
"sel": 299,
"tal": 300,
"tram": 301,
"ung": 302,
"wo": 303,
"wang": 304,
"yâ": 305,
"yep": 306,
"yä": 307,
"ze": 308,
"Ġci": 309,
"ĠKün": 310,
"Ħས": 311,
"ıà½Ħ": 312,
"IJུ": 313,
"Ļà½Ħ": 314,
"Łà½¼à¼ĭ": 315,
"Ńà¼ĭ": 316,
"à½ħ": 317,
"à½ķ": 318,
"à½Ł": 319,
"Ġà½ı": 320,
"Ġà½Ĥཤ": 321,
"Ġà½Ĥà½Ħས": 322,
"Ġà½Ĥà½ıà½Ħ": 323,
"Ġà½Ĥà½Ļà½Ħ": 324,
"ྨ": 325,
"à¾IJུ": 326,
"à¾Łà½¼à¼ĭ": 327,
"à¾Ńà¼ĭ": 328,
"à½Ħས": 329,
"སà½Ķ": 330,
"à½Ĥà½Ķ": 331,
"à½ĺà½ł": 332,
"à½ĸà½ijà½Ĥ": 333,
"Ġgu": 334,
"Ġgou": 335,
"Ġki": 336,
"Ġkâp": 337,
"Ġkho": 338,
"Ġkou": 339,
"Ġkau": 340,
"à½ĵà½ij": 341,
"Ġà½ģà½ĺ": 342,
"Ġà½ģà½Ŀ": 343,
"ྱུ": 344,
"ྱུà¼ĭ": 345,
"ྱིà¼ĭ": 346,
"ྱཱà¼ĭ": 347,
"angkha": 348,
"angne": 349,
"angri": 350,
"Ġtang": 351,
"Ġtha": 352,
"Ġtro": 353,
"Ġting": 354,
"Ġtsang": 355,
"ལà½ĺ": 356,
"ལà½Ŀ": 357,
"à½ijà½ĺ": 358,
"ükha": 359,
"üring": 360,
"Ġchö": 361,
"Ġchep": 362,
"Ġchim": 363,
"ྲུ": 364,
"ྲེ": 365,
"ྲུà¼ĭ": 366,
"ྲà¾Ńà¼ĭ": 367,
"ླུ": 368,
"ླེ": 369,
"Ġsâp": 370,
"Ġsho": 371,
"Ġsop": 372,
"Ġstal": 373,
"Ġà½Ģà½Ŀ": 374,
"nyen": 375,
"nyong": 376,
"êm": 377,
"Ġngä": 378,
"Ġngem": 379,
"Ġngosh": 380,
"Ġà½Ħལ": 381,
"Ġà½Ħà½ĵà½ĺ": 382,
"Ġà½Ĥསརà½Ķ": 383,
"öm": 384,
"öng": 385,
"Ġden": 386,
"Ġdong": 387,
"Ġzi": 388,
"Ġzû": 389,
"Ġzî": 390,
"Ġzung": 391,
"Ġzêm": 392,
"à½łà½ļ": 393,
"Ġà½Ĩà½ijà½Ķ": 394,
"amnyong": 395,
"Ġà½Ĥཡà½Ĥ": 396,
"Ġà½Ĥཡསà½Ķ": 397,
"Ġà½Ĥà½ĵà½Ħ": 398,
"Ġà½Ĥà½ĵà½Ĥà½Ķ": 399,
"à½Ĥསà½Ķ": 400,
"à½Ĥསལ": 401,
"Ġkham": 402,
"Ġkhau": 403,
"Ġpci": 404,
"Ġpön": 405,
"Ġpao": 406,
"Ġpcä": 407,
"Ġnyel": 408,
"Ġnyim": 409,
"Ġnyaro": 410,
"Ġnyamnyong": 411,
"à½ģà½Ħ": 412,
"Ġà½ijà½Ķà½ł": 413,
"Ġà½ijà½Ģརà½Ķ": 414,
"Ġthro": 415,
"Ġthiu": 416,
"nambj": 417,
"namgün": 418,
"ôni": 419,
"ôda": 420,
"à½Ĩà½Ħ": 421,
"Ġà½īà½ĺས": 422,
"âmme": 423,
"emze": 424,
"tsho": 425,
"Ġcê": 426,
"Ġcüring": 427,
"à½ļà½Ħ": 428,
"Ġà½IJà½ĸ": 429,
"Ġà½Ĥà½ħà½ij": 430,
"Ġà½Ĥà½ijà½ĵ": 431,
"ngütram": 432,
"Ġà½ijà½ĸà½Ħ": 433,
"Ġgatoto": 434,
"Ġkhä": 435,
"Ġkhwo": 436,
"Ġtrô": 437,
"Ġtremze": 438,
"Ġchanggi": 439,
"Ġzhu": 440,
"Ġzhöm": 441,
"Ġkhapto": 442,
"Ġthrâ": 443,
"Ġthrükha": 444,
"aito": 445,
"achang": 446,
"atsha": 447,
"lenggo": 448,
"lion": 449,
"pshi": 450,
"yäp": 451,
"ĠKünsel": 452,
"Ġgourd": 453,
"Ġthapshi": 454,
"Ġtsangtoto": 455,
"Ġshong": 456,
"Ġstallion": 457,
"Ġngätsho": 458,
"Ġngoshê": 459,
"Ġzungni": 460,
"Ġpcimi": 461,
"Ġpcäu": 462,
"Ġnyela": 463,
"Ġnyimaito": 464,
"Ġnyarop": 465,
"Ġthromkha": 466,
"nambjâ": 467
},
"merges": [
"à ½",
"¼ ĭ",
"à ¼ĭ",
"A N",
"D Z",
"E D",
"G K",
"H A",
"I Z",
"M AN",
"N GK",
"O MAN",
"O NGK",
"R OMAN",
"Ġ à½",
"DZ ONGK",
"IZ ED",
"ROMAN IZED",
"DZONGK HA",
"ĠའĤ",
"â Ģ",
"âĢ Ļ",
"Ġà½Ĥ à½",
"à ¾",
"འ¼",
"n g",
"Ġ âĢĻ",
"འ´",
"འĦ",
"འ¦",
"འĤ",
"འ²",
"འĺ",
"འº",
"འĸ",
"Ġའij",
"ོ à¼ĭ",
"Ġ g",
"Ġ k",
"འĵ",
"Ġའģ",
"ྠ±",
"c h",
"Ã ¢",
"འ¢",
"འĶ",
"a ng",
"Ġ t",
"འ£",
"འij",
"h a",
"Ã ¼",
"Ġ ch",
"ྠ²",
"ྠ³",
"ུ à¼ĭ",
"Ġ s",
"ĠའĢ",
"ི à¼ĭ",
"n y",
"Ã ª",
"Ġ ng",
"ĠའĦ",
"Ġà½Ĥའ¦",
"Ã ¶",
"Ġ d",
"Ġ z",
"འĿ",
"འł",
"ĠའĨ",
"Ġà½ĤའŁ",
"e p",
"t o",
"Ã ¤",
"a m",
"Ġ j",
"འĢ",
"འı",
"Ġà½Ĥའ¡",
"Ġà½Ĥའĵ",
"à½Ĥ ས",
"Ġk ha",
"Ã »",
"Ġ p",
"Ġ ny",
"འģ",
"Ġà½Ĥའī",
"Ġà½ij à½Ķ",
"Ġà½ij à½Ģ",
"â p",
"ར à½Ķ",
"Ġt h",
"a p",
"c i",
"h o",
"k ha",
"n am",
"r o",
"Ã ®",
"Ã ´",
"འĨ",
"Ġའĩ",
"Ġའī",
"à½Ħ à½ĺ",
"ེ à¼ĭ",
"Ġà½ij à½Ħ",
"Ġà½ij à½Ĥ",
"â m",
"Ġch u",
"ྲ ོ",
"Ġs ê",
"Ġng a",
"ö n",
"Ġd r",
"Ġà½Ĥà½ĵ à½ĺ",
"e l",
"e m",
"e n",
"i m",
"i ng",
"l o",
"l u",
"l âm",
"n i",
"o u",
"o ng",
"s h",
"t s",
"z h",
"z ê",
"± à¼ĭ",
"Ġ c",
"ŀ à½ĸ",
"འ¤",
"འļ",
"འ±à¼ĭ",
"འŀà½ĸ",
"ĠའIJ",
"Ġà½Ĥའħ",
"Ġà½Ĥའij",
"Ġà½Ĥའŀ",
"ng ü",
"à½ĺ ས",
"Ġà½ij à½ĸ",
"Ġg a",
"Ġk h",
"Ġk û",
"à½ĵ à½ĺ",
"Ġà½ģ à½ĸ",
"ྱ ོ",
"ྱ ི",
"ch e",
"Ġt r",
"à½ij à½Ĥ",
"à½ij à½Ķ",
"ü n",
"ü zh",
"Ġch ang",
"Ġch âp",
"ླ ོà¼ĭ",
"ླ ུà¼ĭ",
"Ġs ep",
"ny î",
"Ġng â",
"Ġà½Ħ à½Ĥ",
"Ġd zê",
"Ġz h",
"Ġà½Ĩ à½Ħ",
"Ġà½Ĩ རà½Ķ",
"to to",
"Ġkha p",
"Ġth r",
"Ġà½ijà½Ĥ à½ł",
"lâm che",
"à½ŀà½ĸ ས",
"K ün",
"a i",
"a l",
"a o",
"a u",
"a ch",
"a ci",
"a ro",
"a ts",
"b j",
"c ä",
"d a",
"e ng",
"g i",
"g o",
"g ün",
"h ep",
"i o",
"i u",
"k hep",
"l ang",
"l ü",
"l ep",
"l eng",
"l io",
"m e",
"m i",
"m kha",
"n e",
"n ang",
"n ö",
"n ap",
"o p",
"o sh",
"p sh",
"r d",
"r i",
"r am",
"r ing",
"s ang",
"s el",
"t al",
"t ram",
"u ng",
"w o",
"w ang",
"y â",
"y ep",
"y ä",
"z e",
"Ġ ci",
"Ġ Kün",
"Ħ ས",
"ı à½Ħ",
"IJ ུ",
"Ļ à½Ħ",
"Ł ོà¼ĭ",
"Ń à¼ĭ",
"འħ",
"འķ",
"འŁ",
"Ġའı",
"Ġà½Ĥའ¤",
"Ġà½ĤའĦས",
"Ġà½Ĥའıà½Ħ",
"Ġà½ĤའĻà½Ħ",
"ྠ¨",
"ྠIJུ",
"ྠŁà½¼à¼ĭ",
"ྠŃà¼ĭ",
"à½Ħ ས",
"ས à½Ķ",
"à½Ĥ à½Ķ",
"à½ĺ à½ł",
"à½ĸ à½ijà½Ĥ",
"Ġg u",
"Ġg ou",
"Ġk i",
"Ġk âp",
"Ġk ho",
"Ġk ou",
"Ġk au",
"à½ĵ à½ij",
"Ġà½ģ à½ĺ",
"Ġà½ģ à½Ŀ",
"ྱ ུ",
"ྱ ུà¼ĭ",
"ྱ ིà¼ĭ",
"ྱ ཱà¼ĭ",
"ang kha",
"ang ne",
"ang ri",
"Ġt ang",
"Ġt ha",
"Ġt ro",
"Ġt ing",
"Ġt sang",
"ལ à½ĺ",
"ལ à½Ŀ",
"à½ij à½ĺ",
"ü kha",
"ü ring",
"Ġch ö",
"Ġch ep",
"Ġch im",
"ྲ ུ",
"ྲ ེ",
"ྲ ུà¼ĭ",
"ྲ à¾Ńà¼ĭ",
"ླ ུ",
"ླ ེ",
"Ġs âp",
"Ġs ho",
"Ġs op",
"Ġs tal",
"Ġà½Ģ à½Ŀ",
"ny en",
"ny ong",
"ê m",
"Ġng ä",
"Ġng em",
"Ġng osh",
"Ġà½Ħ ལ",
"Ġà½Ħ à½ĵà½ĺ",
"Ġà½Ĥས རà½Ķ",
"ö m",
"ö ng",
"Ġd en",
"Ġd ong",
"Ġz i",
"Ġz û",
"Ġz î",
"Ġz ung",
"Ġz êm",
"à½ł à½ļ",
"Ġà½Ĩ à½ijà½Ķ",
"am nyong",
"Ġà½Ĥཡ à½Ĥ",
"Ġà½Ĥཡ སà½Ķ",
"Ġà½Ĥà½ĵ à½Ħ",
"Ġà½Ĥà½ĵ à½Ĥà½Ķ",
"à½Ĥས à½Ķ",
"à½Ĥས ལ",
"Ġkha m",
"Ġkha u",
"Ġp ci",
"Ġp ön",
"Ġp ao",
"Ġp cä",
"Ġny el",
"Ġny im",
"Ġny aro",
"Ġny amnyong",
"à½ģ à½Ħ",
"Ġà½ijà½Ķ à½ł",
"Ġà½ijà½Ģ རà½Ķ",
"Ġth ro",
"Ġth iu",
"nam bj",
"nam gün",
"ô ni",
"ô da",
"à½Ĩ à½Ħ",
"Ġà½ī à½ĺས",
"âm me",
"em ze",
"ts ho",
"Ġc ê",
"Ġc üring",
"à½ļ à½Ħ",
"Ġà½IJ à½ĸ",
"Ġà½Ĥà½ħ à½ij",
"Ġà½Ĥà½ij à½ĵ",
"ngü tram",
"Ġà½ijà½ĸ à½Ħ",
"Ġga toto",
"Ġkh ä",
"Ġkh wo",
"Ġtr ô",
"Ġtr emze",
"Ġchang gi",
"Ġzh u",
"Ġzh öm",
"Ġkhap to",
"Ġthr â",
"Ġthr ükha",
"ai to",
"ach ang",
"ats ha",
"leng go",
"lio n",
"psh i",
"yä p",
"ĠKün sel",
"Ġgou rd",
"Ġtha pshi",
"Ġtsang toto",
"Ġsho ng",
"Ġstal lion",
"Ġngä tsho",
"Ġngosh ê",
"Ġzung ni",
"Ġpci mi",
"Ġpcä u",
"Ġnyel a",
"Ġnyim aito",
"Ġnyaro p",
"Ġthro mkha",
"nambj â"
]
}
}