{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<cls>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<mask>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "<sep>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Split",
"pattern": {
"Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])"
},
"behavior": "Isolated",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<cls>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<cls>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 1
}
}
],
"special_tokens": {
"<cls>": {
"id": "<cls>",
"ids": [
0
],
"tokens": [
"<cls>"
]
},
"<eos>": {
"id": "<eos>",
"ids": [
2
],
"tokens": [
"<eos>"
]
}
}
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": true,
"vocab": {
"<cls>": 0,
"<pad>": 1,
"<eos>": 2,
"<unk>": 3,
"<mask>": 4,
"<sep>": 5,
"c": 6,
"C": 7,
"(": 8,
")": 9,
"O": 10,
"1": 11,
"2": 12,
"=": 13,
"N": 14,
".": 15,
"n": 16,
"3": 17,
"F": 18,
"Cl": 19,
">>": 20,
"~": 21,
"-": 22,
"4": 23,
"[C@H]": 24,
"S": 25,
"[C@@H]": 26,
"[O-]": 27,
"Br": 28,
"#": 29,
"/": 30,
"[nH]": 31,
"[N+]": 32,
"s": 33,
"5": 34,
"o": 35,
"P": 36,
"[Na+]": 37,
"[Si]": 38,
"I": 39,
"[Na]": 40,
"[Pd]": 41,
"[K+]": 42,
"[K]": 43,
"[P]": 44,
"B": 45,
"[C@]": 46,
"[C@@]": 47,
"[Cl-]": 48,
"6": 49,
"[OH-]": 50,
"\\": 51,
"[N-]": 52,
"[Li]": 53,
"[H]": 54,
"[2H]": 55,
"[NH4+]": 56,
"[c-]": 57,
"[P-]": 58,
"[Cs+]": 59,
"[Li+]": 60,
"[Cs]": 61,
"[NaH]": 62,
"[H-]": 63,
"[O+]": 64,
"[BH4-]": 65,
"[Cu]": 66,
"7": 67,
"[Mg]": 68,
"[Fe+2]": 69,
"[n+]": 70,
"[Sn]": 71,
"[BH-]": 72,
"[Pd+2]": 73,
"[CH]": 74,
"[I-]": 75,
"[Br-]": 76,
"[C-]": 77,
"[Zn]": 78,
"[B-]": 79,
"[F-]": 80,
"[Al]": 81,
"[P+]": 82,
"[BH3-]": 83,
"[Fe]": 84,
"[C]": 85,
"[AlH4]": 86,
"[Ni]": 87,
"[SiH]": 88,
"8": 89,
"[Cu+2]": 90,
"[Mn]": 91,
"[AlH]": 92,
"[nH+]": 93,
"[AlH4-]": 94,
"[O-2]": 95,
"[Cr]": 96,
"[Mg+2]": 97,
"[NH3+]": 98,
"[S@]": 99,
"[Pt]": 100,
"[Al+3]": 101,
"[S@@]": 102,
"[S-]": 103,
"[Ti]": 104,
"[Zn+2]": 105,
"[PH]": 106,
"[NH2+]": 107,
"[Ru]": 108,
"[Ag+]": 109,
"[S+]": 110,
"[I+3]": 111,
"[NH+]": 112,
"[Ca+2]": 113,
"[Ag]": 114,
"9": 115,
"[Os]": 116,
"[Se]": 117,
"[SiH2]": 118,
"[Ca]": 119,
"[Ti+4]": 120,
"[Ac]": 121,
"[Cu+]": 122,
"[S]": 123,
"[Rh]": 124,
"[Cl+3]": 125,
"[cH-]": 126,
"[Zn+]": 127,
"[O]": 128,
"[Cl+]": 129,
"[SH]": 130,
"[H+]": 131,
"[Pd+]": 132,
"[se]": 133,
"[PH+]": 134,
"[I]": 135,
"[Pt+2]": 136,
"[C+]": 137,
"[Mg+]": 138,
"[Hg]": 139,
"[W]": 140,
"[SnH]": 141,
"[SiH3]": 142,
"[Fe+3]": 143,
"[NH]": 144,
"[Mo]": 145,
"[CH2+]": 146,
"%10": 147,
"[CH2-]": 148,
"[CH2]": 149,
"[n-]": 150,
"[Ce+4]": 151,
"[NH-]": 152,
"[Co]": 153,
"[I+]": 154,
"[PH2]": 155,
"[Pt+4]": 156,
"[Ce]": 157,
"[B]": 158,
"[Sn+2]": 159,
"[Ba+2]": 160,
"%11": 161,
"[Fe-3]": 162,
"[18F]": 163,
"[SH-]": 164,
"[Pb+2]": 165,
"[Os-2]": 166,
"[Zr+4]": 167,
"[N]": 168,
"[Ir]": 169,
"[Bi]": 170,
"[Ni+2]": 171,
"[P@]": 172,
"[Co+2]": 173,
"[s+]": 174,
"[As]": 175,
"[P+3]": 176,
"[Hg+2]": 177,
"[Yb+3]": 178,
"[CH-]": 179,
"[Zr+2]": 180,
"[Mn+2]": 181,
"[CH+]": 182,
"[In]": 183,
"[KH]": 184,
"[Ce+3]": 185,
"[Zr]": 186,
"[AlH2-]": 187,
"[OH2+]": 188,
"[Ti+3]": 189,
"[Rh+2]": 190,
"[Sb]": 191,
"[S-2]": 192,
"%12": 193,
"[P@@]": 194,
"[Si@H]": 195,
"[Mn+4]": 196,
"p": 197,
"[Ba]": 198,
"[NH2-]": 199,
"[Ge]": 200,
"[Pb+4]": 201,
"[Cr+3]": 202,
"[Au]": 203,
"[LiH]": 204,
"[Sc+3]": 205,
"[o+]": 206,
"[Rh-3]": 207,
"%13": 208,
"[Br]": 209,
"[Sb-]": 210,
"[S@+]": 211,
"[I+2]": 212,
"[Ar]": 213,
"[V]": 214,
"[Cu-]": 215,
"[Al-]": 216,
"[Te]": 217,
"[13c]": 218,
"[13C]": 219,
"[Cl]": 220,
"[PH4+]": 221,
"[SiH4]": 222,
"[te]": 223,
"[CH3-]": 224,
"[S@@+]": 225,
"[Rh+3]": 226,
"[SH+]": 227,
"[Bi+3]": 228,
"[Br+2]": 229,
"[La]": 230,
"[La+3]": 231,
"[Pt-2]": 232,
"[N@@]": 233,
"[PH3+]": 234,
"[N@]": 235,
"[Si+4]": 236,
"[Sr+2]": 237,
"[Al+]": 238,
"[Pb]": 239,
"[SeH]": 240,
"[Si-]": 241,
"[V+5]": 242,
"[Y+3]": 243,
"[Re]": 244,
"[Ru+]": 245,
"[Sm]": 246,
"*": 247,
"[3H]": 248,
"[NH2]": 249,
"[Ag-]": 250,
"[13CH3]": 251,
"[OH+]": 252,
"[Ru+3]": 253,
"[OH]": 254,
"[Gd+3]": 255,
"[13CH2]": 256,
"[In+3]": 257,
"[Si@@]": 258,
"[Si@]": 259,
"[Ti+2]": 260,
"[Sn+]": 261,
"[Cl+2]": 262,
"[AlH-]": 263,
"[Pd-2]": 264,
"[SnH3]": 265,
"[B+3]": 266,
"[Cu-2]": 267,
"[Nd+3]": 268,
"[Pb+3]": 269,
"[13cH]": 270,
"[Fe-4]": 271,
"[Ga]": 272,
"[Sn+4]": 273,
"[Hg+]": 274,
"[11CH3]": 275,
"[Hf]": 276,
"[Pr]": 277,
"[Y]": 278,
"[S+2]": 279,
"[Cd]": 280,
"[Cr+6]": 281,
"[Zr+3]": 282,
"[Rh+]": 283,
"[CH3]": 284,
"[N-3]": 285,
"[Hf+2]": 286,
"[Th]": 287,
"[Sb+3]": 288,
"%14": 289,
"[Cr+2]": 290,
"[Ru+2]": 291,
"[Hf+4]": 292,
"[14C]": 293,
"[Ta]": 294,
"[Tl+]": 295,
"[B+]": 296,
"[Os+4]": 297,
"[PdH2]": 298,
"[Pd-]": 299,
"[Cd+2]": 300,
"[Co+3]": 301,
"[S+4]": 302,
"[Nb+5]": 303,
"[123I]": 304,
"[c+]": 305,
"[Rb+]": 306,
"[V+2]": 307,
"[CH3+]": 308,
"[Ag+2]": 309,
"[cH+]": 310,
"[Mn+3]": 311,
"[Se-]": 312,
"[As-]": 313,
"[Eu+3]": 314,
"[SH2]": 315,
"[Sm+3]": 316,
"[IH+]": 317,
"%15": 318,
"[OH3+]": 319,
"[PH3]": 320,
"[IH2+]": 321,
"[SH2+]": 322,
"[Ir+3]": 323,
"[AlH3]": 324,
"[Sc]": 325,
"[Yb]": 326,
"[15NH2]": 327,
"[Lu]": 328,
"[sH+]": 329,
"[Gd]": 330,
"[18F-]": 331,
"[SH3+]": 332,
"[SnH4]": 333,
"[TeH]": 334,
"[Si@@H]": 335,
"[Ga+3]": 336,
"[CaH2]": 337,
"[Tl]": 338,
"[Ta+5]": 339,
"[GeH]": 340,
"[Br+]": 341,
"[Sr]": 342,
"[Tl+3]": 343,
"[Sm+2]": 344,
"[PH5]": 345,
"%16": 346,
"[N@@+]": 347,
"[Au+3]": 348,
"[C-4]": 349,
"[Nd]": 350,
"[Ti+]": 351,
"[IH]": 352,
"[N@+]": 353,
"[125I]": 354,
"[Eu]": 355,
"[Sn+3]": 356,
"[Nb]": 357,
"[Er+3]": 358,
"[123I-]": 359,
"[14c]": 360,
"%17": 361,
"[SnH2]": 362,
"[YH]": 363,
"[Sb+5]": 364,
"[Pr+3]": 365,
"[Ir+]": 366,
"[N+3]": 367,
"[AlH2]": 368,
"[19F]": 369,
"%18": 370,
"[Tb]": 371,
"[14CH]": 372,
"[Mo+4]": 373,
"[Si+]": 374,
"[BH]": 375,
"[Be]": 376,
"[Rb]": 377,
"[pH]": 378,
"%19": 379,
"%20": 380,
"[Xe]": 381,
"[Ir-]": 382,
"[Be+2]": 383,
"[C+4]": 384,
"[RuH2]": 385,
"[15NH]": 386,
"[U+2]": 387,
"[Au-]": 388,
"%21": 389,
"%22": 390,
"[Au+]": 391,
"[15n]": 392,
"[Al+2]": 393,
"[Tb+3]": 394,
"[15N]": 395,
"[V+3]": 396,
"[W+6]": 397,
"[14CH3]": 398,
"[Cr+4]": 399,
"[ClH+]": 400,
"b": 401,
"[Ti+6]": 402,
"[Nd+]": 403,
"[Zr+]": 404,
"[PH2+]": 405,
"[Fm]": 406,
"[N@H+]": 407,
"[RuH]": 408,
"[Dy+3]": 409,
"%23": 410,
"[Hf+3]": 411,
"[W+4]": 412,
"[11C]": 413,
"[13CH]": 414,
"[Er]": 415,
"[124I]": 416,
"[LaH]": 417,
"[F]": 418,
"[siH]": 419,
"[Ga+]": 420,
"[Cm]": 421,
"[GeH3]": 422,
"[IH-]": 423,
"[U+6]": 424,
"[SeH+]": 425,
"[32P]": 426,
"[SeH-]": 427,
"[Pt-]": 428,
"[Ir+2]": 429,
"[se+]": 430,
"[U]": 431,
"[F+]": 432,
"[BH2]": 433,
"[As+]": 434,
"[Cf]": 435,
"[ClH2+]": 436,
"[Ni+]": 437,
"[TeH3]": 438,
"[SbH2]": 439,
"[Ag+3]": 440,
"%24": 441,
"[18O]": 442,
"[PH4]": 443,
"[Os+2]": 444,
"[Na-]": 445,
"[Sb+2]": 446,
"[V+4]": 447,
"[Ho+3]": 448,
"[68Ga]": 449,
"[PH-]": 450,
"[Bi+2]": 451,
"[Ce+2]": 452,
"[Pd+3]": 453,
"[99Tc]": 454,
"[13C@@H]": 455,
"[Fe+6]": 456,
"[c]": 457,
"[GeH2]": 458,
"[10B]": 459,
"[Cu+3]": 460,
"[Mo+2]": 461,
"[Cr+]": 462,
"[Pd+4]": 463,
"[Dy]": 464,
"[AsH]": 465,
"[Ba+]": 466,
"[SeH2]": 467,
"[In+]": 468,
"[TeH2]": 469,
"[BrH+]": 470,
"[14cH]": 471,
"[W+]": 472,
"[13C@H]": 473,
"[AsH2]": 474,
"[In+2]": 475,
"[N+2]": 476,
"[N@@H+]": 477,
"[SbH]": 478,
"[60Co]": 479,
"[AsH4+]": 480,
"[AsH3]": 481,
"[18OH]": 482,
"[Ru-2]": 483,
"[Na-2]": 484,
"[CuH2]": 485,
"[31P]": 486,
"[Ti+5]": 487,
"[35S]": 488,
"[P@@H]": 489,
"[ArH]": 490,
"[Co+]": 491,
"[Zr-2]": 492,
"[BH2-]": 493,
"[131I]": 494,
"[SH5]": 495,
"[VH]": 496,
"[B+2]": 497,
"[Yb+2]": 498,
"[14C@H]": 499,
"[211At]": 500,
"[NH3+2]": 501,
"[IrH]": 502,
"[IrH2]": 503,
"[Rh-]": 504,
"[Cr-]": 505,
"[Sb+]": 506,
"[Ni+3]": 507,
"[TaH3]": 508,
"[Tl+2]": 509,
"[64Cu]": 510,
"[Tc]": 511,
"[Cd+]": 512,
"[1H]": 513,
"[15nH]": 514,
"[AlH2+]": 515,
"[FH+2]": 516,
"[BiH3]": 517,
"[Ru-]": 518,
"[Mo+6]": 519,
"[AsH+]": 520,
"[BaH2]": 521,
"[BaH]": 522,
"[Fe+4]": 523,
"[229Th]": 524,
"[Th+4]": 525,
"[As+3]": 526,
"[NH+3]": 527,
"[P@H]": 528,
"[Li-]": 529,
"[7NaH]": 530,
"[Bi+]": 531,
"[PtH+2]": 532,
"[p-]": 533,
"[Re+5]": 534,
"[NiH]": 535,
"[Ni-]": 536,
"[Xe+]": 537,
"[Ca+]": 538,
"[11c]": 539,
"[Rh+4]": 540,
"[AcH]": 541,
"[HeH]": 542,
"[Sc+2]": 543,
"[Mn+]": 544,
"[UH]": 545,
"[14CH2]": 546,
"[SiH4+]": 547,
"[18OH2]": 548,
"[Ac-]": 549,
"[Re+4]": 550,
"[118Sn]": 551,
"[153Sm]": 552,
"[P+2]": 553,
"[9CH]": 554,
"[9CH3]": 555,
"[Y-]": 556,
"[NiH2]": 557,
"[Si+2]": 558,
"[Mn+6]": 559,
"[ZrH2]": 560,
"[C-2]": 561,
"[Bi+5]": 562,
"[24NaH]": 563,
"[Fr]": 564,
"[15CH]": 565,
"[Se+]": 566,
"[At]": 567,
"[P-3]": 568,
"[124I-]": 569,
"[CuH2-]": 570,
"[Nb+4]": 571,
"[Nb+3]": 572,
"[MgH]": 573,
"[Ir+4]": 574,
"[67Ga+3]": 575,
"[67Ga]": 576,
"[13N]": 577,
"[15OH2]": 578,
"[2NH]": 579,
"[Ho]": 580,
"[Cn]": 581
},
"merges": []
}
}