|
{ |
|
"version": "1.0", |
|
"truncation": null, |
|
"padding": null, |
|
"added_tokens": [ |
|
{ |
|
"id": 0, |
|
"content": "<unk>", |
|
"single_word": false, |
|
"lstrip": false, |
|
"rstrip": false, |
|
"normalized": false, |
|
"special": true |
|
}, |
|
{ |
|
"id": 256, |
|
"content": "<pad>", |
|
"single_word": false, |
|
"lstrip": false, |
|
"rstrip": false, |
|
"normalized": false, |
|
"special": true |
|
}, |
|
{ |
|
"id": 257, |
|
"content": "<bos>", |
|
"single_word": false, |
|
"lstrip": false, |
|
"rstrip": false, |
|
"normalized": false, |
|
"special": true |
|
}, |
|
{ |
|
"id": 258, |
|
"content": "<eos>", |
|
"single_word": false, |
|
"lstrip": false, |
|
"rstrip": false, |
|
"normalized": false, |
|
"special": true |
|
} |
|
], |
|
"normalizer": { |
|
"type": "BertNormalizer", |
|
"clean_text": true, |
|
"handle_chinese_chars": true, |
|
"strip_accents": null, |
|
"lowercase": false |
|
}, |
|
"pre_tokenizer": { |
|
"type": "BertPreTokenizer" |
|
}, |
|
"post_processor": null, |
|
"decoder": { |
|
"type": "BPEDecoder", |
|
"suffix": "</w>" |
|
}, |
|
"model": { |
|
"type": "BPE", |
|
"dropout": null, |
|
"unk_token": "<unk>", |
|
"continuing_subword_prefix": null, |
|
"end_of_word_suffix": "</w>", |
|
"fuse_unk": false, |
|
"byte_fallback": false, |
|
"ignore_merges": false, |
|
"vocab": { |
|
"<unk>": 0, |
|
"A": 1, |
|
"B": 2, |
|
"C": 3, |
|
"D": 4, |
|
"E": 5, |
|
"F": 6, |
|
"G": 7, |
|
"H": 8, |
|
"I": 9, |
|
"J": 10, |
|
"K": 11, |
|
"L": 12, |
|
"M": 13, |
|
"N": 14, |
|
"O": 15, |
|
"P": 16, |
|
"Q": 17, |
|
"R": 18, |
|
"S": 19, |
|
"T": 20, |
|
"U": 21, |
|
"V": 22, |
|
"W": 23, |
|
"X": 24, |
|
"Y": 25, |
|
"Z": 26, |
|
"Ġ": 27, |
|
"Z</w>": 28, |
|
"R</w>": 29, |
|
"G</w>": 30, |
|
"N</w>": 31, |
|
"S</w>": 32, |
|
"H</w>": 33, |
|
"D</w>": 34, |
|
"M</w>": 35, |
|
"W</w>": 36, |
|
"L</w>": 37, |
|
"Y</w>": 38, |
|
"E</w>": 39, |
|
"T</w>": 40, |
|
"K</w>": 41, |
|
"V</w>": 42, |
|
"A</w>": 43, |
|
"F</w>": 44, |
|
"Q</w>": 45, |
|
"B</w>": 46, |
|
"O</w>": 47, |
|
"P</w>": 48, |
|
"I</w>": 49, |
|
"C</w>": 50, |
|
"U</w>": 51, |
|
"X</w>": 52, |
|
"J</w>": 53, |
|
"HĠ": 54, |
|
"ĠA": 55, |
|
"ĠI": 56, |
|
"ĠAHĠ": 57, |
|
"ĠE": 58, |
|
"ĠIHĠ": 59, |
|
"YĠ": 60, |
|
"RĠ": 61, |
|
"NĠ": 62, |
|
"AĠ": 63, |
|
"WĠ": 64, |
|
"EĠ": 65, |
|
"ĠAAĠ": 66, |
|
"SĠ": 67, |
|
"ĠEHĠ": 68, |
|
"ĠAEĠ": 69, |
|
"ĠR": 70, |
|
"ĠIYĠ": 71, |
|
"LĠ": 72, |
|
"ĠERĠ": 73, |
|
"HĠA": 74, |
|
"KĠ": 75, |
|
"OWĠ": 76, |
|
"ĠIY</w>": 77, |
|
"ĠEYĠ": 78, |
|
"TĠ": 79, |
|
"ĠAO": 80, |
|
"GĠ": 81, |
|
"UWĠ": 82, |
|
"ĠAHĠNĠ": 83, |
|
"ĠAOĠ": 84, |
|
"ĠIHĠN": 85, |
|
"IHĠ": 86, |
|
"MĠ": 87, |
|
"ĠAH</w>": 88, |
|
"ĠAYĠ": 89, |
|
"DĠ": 90, |
|
"SĠT": 91, |
|
"HĠE": 92, |
|
"HĠAHĠ": 93, |
|
"ĠAHĠN</w>": 94, |
|
"ĠIY": 95, |
|
"ĠER</w>": 96, |
|
"PĠ": 97, |
|
"BĠ": 98, |
|
"AHĠ": 99, |
|
"ĠIHĠNG</w>": 100, |
|
"LĠAHĠ": 101, |
|
"NĠAHĠ": 102, |
|
"ĠER": 103, |
|
"OW</w>": 104, |
|
"KĠAHĠ": 105, |
|
"ĠAAĠRĠ": 106, |
|
"HHĠA": 107, |
|
"LĠIY</w>": 108, |
|
"LĠIHĠ": 109, |
|
"TĠS</w>": 110, |
|
"HĠIHĠ": 111, |
|
"SĠIHĠ": 112, |
|
"DĠIHĠ": 113, |
|
"TĠIHĠ": 114, |
|
"ĠAOĠRĠ": 115, |
|
"ĠERĠZ</w>": 116, |
|
"SĠAHĠ": 117, |
|
"ĠIYĠZ</w>": 118, |
|
"FĠ": 119, |
|
"IN": 120, |
|
"SHĠAHĠ": 121, |
|
"TĠAHĠ": 122, |
|
"NĠZ</w>": 123, |
|
"ER": 124, |
|
"AEĠ": 125, |
|
"MĠAHĠ": 126, |
|
"ĠAEĠNĠ": 127, |
|
"HĠEHĠ": 128, |
|
"EHĠ": 129, |
|
"UHĠ": 130, |
|
"ĠRĠAHĠ": 131, |
|
"ĠAHĠNĠZ</w>": 132, |
|
"BĠAHĠ": 133, |
|
"ĠEHĠR": 134, |
|
"ĠEHĠNĠ": 135, |
|
"DĠAHĠ": 136, |
|
"ĠRĠIHĠ": 137, |
|
"HĠI": 138, |
|
"KĠAAĠ": 139, |
|
"LĠZ</w>": 140, |
|
"ĠIHĠNGĠ": 141, |
|
"NGĠ": 142, |
|
"NĠIHĠ": 143, |
|
"MĠIHĠ": 144, |
|
"AN": 145, |
|
"WĠIHĠ": 146, |
|
"ĠAWĠ": 147, |
|
"AR": 148, |
|
"ZĠ": 149, |
|
"AAĠ": 150, |
|
"SĠT</w>": 151, |
|
"YĠUWĠ": 152, |
|
"DĠZ</w>": 153, |
|
"RĠOWĠ": 154, |
|
"AHĠNĠ": 155, |
|
"SĠK": 156, |
|
"EN": 157, |
|
"OĠ": 158, |
|
"SĠP": 159, |
|
"BĠERĠ": 160, |
|
"LĠAEĠ": 161, |
|
"KĠS</w>": 162, |
|
"RĠIHĠ": 163, |
|
"IHĠNĠ": 164, |
|
"TĠR": 165, |
|
"ĠIYĠAHĠ": 166, |
|
"ĠAAĠNĠ": 167, |
|
"ON": 168, |
|
"YĠAHĠ": 169, |
|
"PĠAHĠ": 170, |
|
"VĠ": 171, |
|
"RĠAHĠ": 172, |
|
"VĠIHĠ": 173, |
|
"LĠEHĠ": 174, |
|
"KĠAEĠ": 175, |
|
"HHĠ": 176, |
|
"LĠIYĠ": 177, |
|
"OR": 178, |
|
"HĠERĠ": 179, |
|
"GĠAHĠ": 180, |
|
"MĠAEĠ": 181, |
|
"GĠR": 182, |
|
"ST": 183, |
|
"AT": 184, |
|
"ES</w>": 185, |
|
"BĠR": 186, |
|
"RĠIYĠ": 187, |
|
"BĠIHĠ": 188, |
|
"SHĠ": 189, |
|
"LĠEYĠ": 190, |
|
"PĠR": 191, |
|
"LĠAAĠ": 192, |
|
"AL": 193, |
|
"TĠIY</w>": 194, |
|
"HHĠAEĠ": 195, |
|
"SĠEHĠ": 196, |
|
"NĠAHĠS</w>": 197, |
|
"TH</w>": 198, |
|
"EL": 199, |
|
"HĠIYĠ": 200, |
|
"FĠAHĠ": 201, |
|
"LĠAYĠ": 202, |
|
"LĠD</w>": 203, |
|
"KĠW": 204, |
|
"MĠEHĠ": 205, |
|
"RE": 206, |
|
"PĠIHĠ": 207, |
|
"FĠIHĠ": 208, |
|
"SHĠAHĠN</w>": 209, |
|
"NĠIY</w>": 210, |
|
"MĠAAĠ": 211, |
|
"KĠR": 212, |
|
"VĠAHĠ": 213, |
|
"THĠ": 214, |
|
"UW</w>": 215, |
|
"OWĠZ</w>": 216, |
|
"HHĠAAĠ": 217, |
|
"CH": 218, |
|
"RĠUWĠ": 219, |
|
"OYĠ": 220, |
|
"ĠAOĠR": 221, |
|
"KĠIHĠ": 222, |
|
"HĠAEĠ": 223, |
|
"ED</w>": 224, |
|
"ZĠAHĠ": 225, |
|
"HHĠEHĠ": 226, |
|
"SĠIHĠZ</w>": 227, |
|
"DĠEHĠ": 228, |
|
"JHĠAHĠ": 229, |
|
"JHĠIHĠ": 230, |
|
"BĠAEĠ": 231, |
|
"TĠERĠ": 232, |
|
"JHĠ": 233, |
|
"OW": 234, |
|
"BĠEHĠ": 235, |
|
"SĠIYĠ": 236, |
|
"OWĠLĠ": 237, |
|
"VĠERĠ": 238, |
|
"ĠEY</w>": 239, |
|
"TĠIHĠD</w>": 240, |
|
"KĠAHĠNĠ": 241, |
|
"LE": 242, |
|
"MĠAHĠN</w>": 243, |
|
"ĠAHĠNĠT</w>": 244, |
|
"RĠEHĠ": 245, |
|
"NĠAH</w>": 246, |
|
"CHĠ": 247, |
|
"IS": 248, |
|
"UW": 249, |
|
"PĠERĠ": 250, |
|
"SĠTĠ": 251, |
|
"PĠAAĠ": 252, |
|
"TĠAHĠN</w>": 253, |
|
"LĠUWĠ": 254, |
|
"HĠAAĠ": 255 |
|
}, |
|
"merges": [ |
|
"H Ġ", |
|
"Ġ A", |
|
"Ġ I", |
|
"ĠA HĠ", |
|
"Ġ E", |
|
"ĠI HĠ", |
|
"Y Ġ", |
|
"R Ġ", |
|
"N Ġ", |
|
"A Ġ", |
|
"W Ġ", |
|
"E Ġ", |
|
"ĠA AĠ", |
|
"S Ġ", |
|
"ĠE HĠ", |
|
"ĠA EĠ", |
|
"Ġ R", |
|
"ĠI YĠ", |
|
"L Ġ", |
|
"ĠE RĠ", |
|
"HĠ A", |
|
"K Ġ", |
|
"O WĠ", |
|
"ĠI Y</w>", |
|
"ĠE YĠ", |
|
"T Ġ", |
|
"ĠA O", |
|
"G Ġ", |
|
"U WĠ", |
|
"ĠAHĠ NĠ", |
|
"ĠAO Ġ", |
|
"ĠIHĠ N", |
|
"I HĠ", |
|
"M Ġ", |
|
"ĠA H</w>", |
|
"ĠA YĠ", |
|
"D Ġ", |
|
"SĠ T", |
|
"HĠ E", |
|
"HĠA HĠ", |
|
"ĠAHĠ N</w>", |
|
"ĠI Y", |
|
"ĠE R</w>", |
|
"P Ġ", |
|
"B Ġ", |
|
"A HĠ", |
|
"ĠIHĠN G</w>", |
|
"L ĠAHĠ", |
|
"N ĠAHĠ", |
|
"ĠE R", |
|
"O W</w>", |
|
"K ĠAHĠ", |
|
"ĠAAĠ RĠ", |
|
"H HĠA", |
|
"L ĠIY</w>", |
|
"L ĠIHĠ", |
|
"TĠ S</w>", |
|
"HĠ IHĠ", |
|
"S ĠIHĠ", |
|
"D ĠIHĠ", |
|
"T ĠIHĠ", |
|
"ĠAOĠ RĠ", |
|
"ĠERĠ Z</w>", |
|
"S ĠAHĠ", |
|
"ĠIYĠ Z</w>", |
|
"F Ġ", |
|
"I N", |
|
"S HĠAHĠ", |
|
"T ĠAHĠ", |
|
"NĠ Z</w>", |
|
"E R", |
|
"A EĠ", |
|
"M ĠAHĠ", |
|
"ĠAEĠ NĠ", |
|
"HĠE HĠ", |
|
"E HĠ", |
|
"U HĠ", |
|
"ĠR ĠAHĠ", |
|
"ĠAHĠNĠ Z</w>", |
|
"B ĠAHĠ", |
|
"ĠEHĠ R", |
|
"ĠEHĠ NĠ", |
|
"D ĠAHĠ", |
|
"ĠR ĠIHĠ", |
|
"HĠ I", |
|
"K ĠAAĠ", |
|
"LĠ Z</w>", |
|
"ĠIHĠN GĠ", |
|
"N GĠ", |
|
"N ĠIHĠ", |
|
"M ĠIHĠ", |
|
"A N", |
|
"W ĠIHĠ", |
|
"ĠA WĠ", |
|
"A R", |
|
"Z Ġ", |
|
"A AĠ", |
|
"SĠ T</w>", |
|
"YĠ UWĠ", |
|
"DĠ Z</w>", |
|
"RĠ OWĠ", |
|
"AHĠ NĠ", |
|
"SĠ K", |
|
"E N", |
|
"O Ġ", |
|
"SĠ P", |
|
"B ĠERĠ", |
|
"L ĠAEĠ", |
|
"KĠ S</w>", |
|
"R ĠIHĠ", |
|
"IHĠ NĠ", |
|
"T ĠR", |
|
"ĠIY ĠAHĠ", |
|
"ĠAAĠ NĠ", |
|
"O N", |
|
"Y ĠAHĠ", |
|
"P ĠAHĠ", |
|
"V Ġ", |
|
"R ĠAHĠ", |
|
"V ĠIHĠ", |
|
"L ĠEHĠ", |
|
"K ĠAEĠ", |
|
"H HĠ", |
|
"L ĠIYĠ", |
|
"O R", |
|
"HĠE RĠ", |
|
"G ĠAHĠ", |
|
"M ĠAEĠ", |
|
"G ĠR", |
|
"S T", |
|
"A T", |
|
"E S</w>", |
|
"B ĠR", |
|
"R ĠIYĠ", |
|
"B ĠIHĠ", |
|
"S HĠ", |
|
"L ĠEYĠ", |
|
"P ĠR", |
|
"L ĠAAĠ", |
|
"A L", |
|
"T ĠIY</w>", |
|
"HHĠA EĠ", |
|
"S ĠEHĠ", |
|
"NĠAHĠ S</w>", |
|
"T H</w>", |
|
"E L", |
|
"HĠI YĠ", |
|
"F ĠAHĠ", |
|
"L ĠAYĠ", |
|
"LĠ D</w>", |
|
"KĠ W", |
|
"M ĠEHĠ", |
|
"R E", |
|
"P ĠIHĠ", |
|
"F ĠIHĠ", |
|
"SHĠAHĠ N</w>", |
|
"N ĠIY</w>", |
|
"M ĠAAĠ", |
|
"K ĠR", |
|
"V ĠAHĠ", |
|
"T HĠ", |
|
"U W</w>", |
|
"OWĠ Z</w>", |
|
"HHĠA AĠ", |
|
"C H", |
|
"RĠ UWĠ", |
|
"O YĠ", |
|
"ĠAO ĠR", |
|
"K ĠIHĠ", |
|
"HĠA EĠ", |
|
"E D</w>", |
|
"Z ĠAHĠ", |
|
"H HĠEHĠ", |
|
"SĠIHĠ Z</w>", |
|
"D ĠEHĠ", |
|
"J HĠAHĠ", |
|
"J HĠIHĠ", |
|
"B ĠAEĠ", |
|
"T ĠERĠ", |
|
"J HĠ", |
|
"O W", |
|
"B ĠEHĠ", |
|
"S ĠIYĠ", |
|
"OWĠ LĠ", |
|
"V ĠERĠ", |
|
"ĠE Y</w>", |
|
"TĠIHĠ D</w>", |
|
"K ĠAHĠNĠ", |
|
"L E", |
|
"M ĠAHĠN</w>", |
|
"ĠAHĠNĠ T</w>", |
|
"R ĠEHĠ", |
|
"N ĠAH</w>", |
|
"C HĠ", |
|
"I S", |
|
"U W", |
|
"P ĠERĠ", |
|
"SĠ TĠ", |
|
"P ĠAAĠ", |
|
"T ĠAHĠN</w>", |
|
"LĠ UWĠ", |
|
"HĠA AĠ" |
|
] |
|
} |
|
} |