BarcodeBERT / tokenizer.json
nioushasadjadi
Adding max_length and padding to tokenizer and encoder.
156a2ea
{
"version": "1.0",
"added_tokens": [
{
"id": 0,
"content": "[MASK]",
"special": true
},
{
"id": 1,
"content": "[UNK]",
"special": true
}
],
"pre_tokenizer": {
"type": "KmerSplitter",
"k": 4,
"stride": 4,
"max_length": 660
},
"model": {
"type": "KmerTokenizer",
"unk_token": "[UNK]",
"vocab": {
"[MASK]": 0,
"[UNK]": 1,
"AAAA": 2,
"AAAC": 3,
"AAAG": 4,
"AAAT": 5,
"AACA": 6,
"AACC": 7,
"AACG": 8,
"AACT": 9,
"AAGA": 10,
"AAGC": 11,
"AAGG": 12,
"AAGT": 13,
"AATA": 14,
"AATC": 15,
"AATG": 16,
"AATT": 17,
"ACAA": 18,
"ACAC": 19,
"ACAG": 20,
"ACAT": 21,
"ACCA": 22,
"ACCC": 23,
"ACCG": 24,
"ACCT": 25,
"ACGA": 26,
"ACGC": 27,
"ACGG": 28,
"ACGT": 29,
"ACTA": 30,
"ACTC": 31,
"ACTG": 32,
"ACTT": 33,
"AGAA": 34,
"AGAC": 35,
"AGAG": 36,
"AGAT": 37,
"AGCA": 38,
"AGCC": 39,
"AGCG": 40,
"AGCT": 41,
"AGGA": 42,
"AGGC": 43,
"AGGG": 44,
"AGGT": 45,
"AGTA": 46,
"AGTC": 47,
"AGTG": 48,
"AGTT": 49,
"ATAA": 50,
"ATAC": 51,
"ATAG": 52,
"ATAT": 53,
"ATCA": 54,
"ATCC": 55,
"ATCG": 56,
"ATCT": 57,
"ATGA": 58,
"ATGC": 59,
"ATGG": 60,
"ATGT": 61,
"ATTA": 62,
"ATTC": 63,
"ATTG": 64,
"ATTT": 65,
"CAAA": 66,
"CAAC": 67,
"CAAG": 68,
"CAAT": 69,
"CACA": 70,
"CACC": 71,
"CACG": 72,
"CACT": 73,
"CAGA": 74,
"CAGC": 75,
"CAGG": 76,
"CAGT": 77,
"CATA": 78,
"CATC": 79,
"CATG": 80,
"CATT": 81,
"CCAA": 82,
"CCAC": 83,
"CCAG": 84,
"CCAT": 85,
"CCCA": 86,
"CCCC": 87,
"CCCG": 88,
"CCCT": 89,
"CCGA": 90,
"CCGC": 91,
"CCGG": 92,
"CCGT": 93,
"CCTA": 94,
"CCTC": 95,
"CCTG": 96,
"CCTT": 97,
"CGAA": 98,
"CGAC": 99,
"CGAG": 100,
"CGAT": 101,
"CGCA": 102,
"CGCC": 103,
"CGCG": 104,
"CGCT": 105,
"CGGA": 106,
"CGGC": 107,
"CGGG": 108,
"CGGT": 109,
"CGTA": 110,
"CGTC": 111,
"CGTG": 112,
"CGTT": 113,
"CTAA": 114,
"CTAC": 115,
"CTAG": 116,
"CTAT": 117,
"CTCA": 118,
"CTCC": 119,
"CTCG": 120,
"CTCT": 121,
"CTGA": 122,
"CTGC": 123,
"CTGG": 124,
"CTGT": 125,
"CTTA": 126,
"CTTC": 127,
"CTTG": 128,
"CTTT": 129,
"GAAA": 130,
"GAAC": 131,
"GAAG": 132,
"GAAT": 133,
"GACA": 134,
"GACC": 135,
"GACG": 136,
"GACT": 137,
"GAGA": 138,
"GAGC": 139,
"GAGG": 140,
"GAGT": 141,
"GATA": 142,
"GATC": 143,
"GATG": 144,
"GATT": 145,
"GCAA": 146,
"GCAC": 147,
"GCAG": 148,
"GCAT": 149,
"GCCA": 150,
"GCCC": 151,
"GCCG": 152,
"GCCT": 153,
"GCGA": 154,
"GCGC": 155,
"GCGG": 156,
"GCGT": 157,
"GCTA": 158,
"GCTC": 159,
"GCTG": 160,
"GCTT": 161,
"GGAA": 162,
"GGAC": 163,
"GGAG": 164,
"GGAT": 165,
"GGCA": 166,
"GGCC": 167,
"GGCG": 168,
"GGCT": 169,
"GGGA": 170,
"GGGC": 171,
"GGGG": 172,
"GGGT": 173,
"GGTA": 174,
"GGTC": 175,
"GGTG": 176,
"GGTT": 177,
"GTAA": 178,
"GTAC": 179,
"GTAG": 180,
"GTAT": 181,
"GTCA": 182,
"GTCC": 183,
"GTCG": 184,
"GTCT": 185,
"GTGA": 186,
"GTGC": 187,
"GTGG": 188,
"GTGT": 189,
"GTTA": 190,
"GTTC": 191,
"GTTG": 192,
"GTTT": 193,
"TAAA": 194,
"TAAC": 195,
"TAAG": 196,
"TAAT": 197,
"TACA": 198,
"TACC": 199,
"TACG": 200,
"TACT": 201,
"TAGA": 202,
"TAGC": 203,
"TAGG": 204,
"TAGT": 205,
"TATA": 206,
"TATC": 207,
"TATG": 208,
"TATT": 209,
"TCAA": 210,
"TCAC": 211,
"TCAG": 212,
"TCAT": 213,
"TCCA": 214,
"TCCC": 215,
"TCCG": 216,
"TCCT": 217,
"TCGA": 218,
"TCGC": 219,
"TCGG": 220,
"TCGT": 221,
"TCTA": 222,
"TCTC": 223,
"TCTG": 224,
"TCTT": 225,
"TGAA": 226,
"TGAC": 227,
"TGAG": 228,
"TGAT": 229,
"TGCA": 230,
"TGCC": 231,
"TGCG": 232,
"TGCT": 233,
"TGGA": 234,
"TGGC": 235,
"TGGG": 236,
"TGGT": 237,
"TGTA": 238,
"TGTC": 239,
"TGTG": 240,
"TGTT": 241,
"TTAA": 242,
"TTAC": 243,
"TTAG": 244,
"TTAT": 245,
"TTCA": 246,
"TTCC": 247,
"TTCG": 248,
"TTCT": 249,
"TTGA": 250,
"TTGC": 251,
"TTGG": 252,
"TTGT": 253,
"TTTA": 254,
"TTTC": 255,
"TTTG": 256,
"TTTT": 257
}
}
}