Upload tokenizer

Files changed (3) hide show

special_tokens_map.json CHANGED Viewed

@@ -7,7 +7,7 @@
     "single_word": false
   },
   "cls_token": {
-    "content": "<s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -21,21 +21,21 @@
     "single_word": false
   },
   "mask_token": {
-    "content": "<MASK>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<PAD>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "</s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,

     "single_word": false
   },
   "cls_token": {
+    "content": "<CLS|LLM-jp>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
+    "content": "<MASK|LLM-jp>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
+    "content": "<PAD|LLM-jp>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
+    "content": "<SEP|LLM-jp>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,

tokenizer.json CHANGED Viewed

@@ -32,7 +32,7 @@
     },
     {
       "id": 3,
-      "content": "<MASK>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -41,7 +41,7 @@
     },
     {
       "id": 4,
-      "content": "<PAD>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -50,7 +50,7 @@
     },
     {
       "id": 5,
-      "content": "<CLS>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -59,7 +59,7 @@
     },
     {
       "id": 6,
-      "content": "<SEP>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -68,7 +68,7 @@
     },
     {
       "id": 7,
-      "content": "<EOD>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -145,8 +145,8 @@
       },
       {
         "SpecialToken": {
-          "id": "<s>",
-          "type_id": 1
         }
       }
     ],
@@ -213,23 +213,23 @@
         -127.5
       ],
       [
-        "<MASK>",
         -127.5
       ],
       [
-        "<PAD>",
         -127.5
       ],
       [
-        "<CLS>",
         -127.5
       ],
       [
-        "<SEP>",
         -127.5
       ],
       [
-        "<EOD>",
         -127.5
       ],
       [

     },
     {
       "id": 3,
+      "content": "<MASK|LLM-jp>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 4,
+      "content": "<PAD|LLM-jp>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 5,
+      "content": "<CLS|LLM-jp>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 6,
+      "content": "<SEP|LLM-jp>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 7,
+      "content": "<EOD|LLM-jp>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
       },
       {
         "SpecialToken": {
+          "id": "</s>",
+          "type_id": 0
         }
       }
     ],
         -127.5
       ],
       [
+        "<MASK|LLM-jp>",
         -127.5
       ],
       [
+        "<PAD|LLM-jp>",
         -127.5
       ],
       [
+        "<CLS|LLM-jp>",
         -127.5
       ],
       [
+        "<SEP|LLM-jp>",
         -127.5
       ],
       [
+        "<EOD|LLM-jp>",
         -127.5
       ],
       [

tokenizer_config.json CHANGED Viewed

@@ -1,4 +1,6 @@
 {
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -25,7 +27,7 @@
       "special": true
     },
     "3": {
-      "content": "<MASK>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -33,7 +35,7 @@
       "special": true
     },
     "4": {
-      "content": "<PAD>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -41,7 +43,7 @@
       "special": true
     },
     "5": {
-      "content": "<CLS>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -49,7 +51,7 @@
       "special": true
     },
     "6": {
-      "content": "<SEP>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -57,7 +59,7 @@
       "special": true
     },
     "7": {
-      "content": "<EOD>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -67,14 +69,14 @@
   },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
-  "cls_token": "<s>",
-  "eod_token": "<EOD>",
   "eos_token": "</s>",
   "extra_ids": 0,
-  "mask_token": "<MASK>",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<PAD>",
-  "sep_token": "</s>",
   "sp_model_kwargs": {},
   "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "<unk>"

 {
+  "add_bos_token": true,
+  "add_eos_token": false,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
       "special": true
     },
     "3": {
+      "content": "<MASK|LLM-jp>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "4": {
+      "content": "<PAD|LLM-jp>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "5": {
+      "content": "<CLS|LLM-jp>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "6": {
+      "content": "<SEP|LLM-jp>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "7": {
+      "content": "<EOD|LLM-jp>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
   },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
+  "cls_token": "<CLS|LLM-jp>",
+  "eod_token": "</s>",
   "eos_token": "</s>",
   "extra_ids": 0,
+  "mask_token": "<MASK|LLM-jp>",
   "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<PAD|LLM-jp>",
+  "sep_token": "<SEP|LLM-jp>",
   "sp_model_kwargs": {},
   "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "<unk>"