goabonga committed
Commit c34bcb2 · verified · 1 Parent(s): 1d0f788

Upload tokenizer files (vocab, config, README)

Files changed (3):
  1. README.md +20 -0
  2. special_tokens_map.json +6 -0
  3. tokenizer_config.json +3 -3
README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ language: en
+ tags:
+ - tokenizer
+ - pytorch
+ - streaming
+ library_name: nano
+ ---
+
+ # Nano Tokenizer
+
+ This tokenizer was trained with a Python-only pipeline (no `transformers` or `tokenizers`) on a dataset streamed from the Hugging Face Hub.
+
+ ## Usage
+
+ ```python
+ from transformers import PreTrainedTokenizerFast
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("your-username/tokenizer-name")
+ ```
+
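The README describes a Python-only pipeline trained on a streamed dataset without showing it. Below is a minimal sketch of that idea, not this repo's actual training code: the dataset id, the record cap, and the vocabulary size are all placeholder assumptions, and `datasets` stands in for whatever streaming client was actually used.

```python
# Illustrative sketch only -- not the pipeline that produced this repo.
# The dataset id, record cap, and vocab size below are placeholders.
from collections import Counter

from datasets import load_dataset

# Stream records lazily from the Hub instead of downloading the dataset.
stream = load_dataset("wikitext", "wikitext-103-raw-v1", split="train", streaming=True)

counts = Counter()
for i, record in enumerate(stream):
    counts.update(record["text"].split())  # whitespace tokenization, plain Python
    if i >= 100_000:  # bound the pass for the sketch
        break

# Reserve the first ids for the special tokens this commit declares
# in special_tokens_map.json, then fill with the most frequent words.
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]
vocab = {tok: idx for idx, tok in enumerate(specials)}
for word, _ in counts.most_common(32_000 - len(specials)):
    vocab[word] = len(vocab)
```

Reserving the special-token ids up front keeps the vocabulary consistent with the `special_tokens_map.json` added in this commit.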
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "unk_token": "<unk>",
+ "pad_token": "<pad>",
+ "bos_token": "<bos>",
+ "eos_token": "<eos>"
+ }
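These four entries are what `from_pretrained` reads to populate the tokenizer's special-token attributes. A quick sanity check, assuming the README's placeholder repo id:

```python
# Assumes the README's placeholder repo id; swap in the real one.
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("your-username/tokenizer-name")

# special_tokens_map.json fills in these attributes at load time.
assert tokenizer.unk_token == "<unk>"
assert tokenizer.pad_token == "<pad>"
assert tokenizer.bos_token == "<bos>"
assert tokenizer.eos_token == "<eos>"
```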
tokenizer_config.json CHANGED
@@ -1,8 +1,8 @@
  {
+ "model_max_length": 512,
+ "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "bos_token": "<bos>",
- "eos_token": "<eos>",
- "model_max_length": 512,
- "tokenizer_class": "PreTrainedTokenizerFast"
+ "eos_token": "<eos>"
  }
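This change only reorders the config keys (the comma placement moves with the last entry); no values change. For what `model_max_length` actually controls, a hedged example, again with the README's placeholder repo id: when truncation is requested, `transformers` clips encodings to this limit.

```python
# Placeholder repo id; behavior shown is standard `transformers` truncation.
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("your-username/tokenizer-name")
print(tokenizer.model_max_length)  # 512, read from tokenizer_config.json

# With truncation=True and no explicit max_length, encodings are
# clipped to model_max_length tokens.
ids = tokenizer("some very long text " * 400, truncation=True)["input_ids"]
assert len(ids) <= tokenizer.model_max_length
```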