NeuraCraft committed on
Commit
8928510
·
1 Parent(s): 69a2579

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,5 +1,3 @@
1
  {
2
- "<AI>": 50259,
3
- "<USER>": 50258,
4
  "[PAD]": 50257
5
  }
 
1
  {
 
 
2
  "[PAD]": 50257
3
  }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,34 +1,6 @@
1
  {
2
- "additional_special_tokens": [
3
- {
4
- "content": "<USER>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "<AI>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
- ],
18
- "bos_token": {
19
- "content": "<|endoftext|>",
20
- "lstrip": false,
21
- "normalized": true,
22
- "rstrip": false,
23
- "single_word": false
24
- },
25
- "eos_token": {
26
- "content": "<|endoftext|>",
27
- "lstrip": false,
28
- "normalized": true,
29
- "rstrip": false,
30
- "single_word": false
31
- },
32
  "pad_token": {
33
  "content": "[PAD]",
34
  "lstrip": false,
@@ -36,11 +8,5 @@
36
  "rstrip": false,
37
  "single_word": false
38
  },
39
- "unk_token": {
40
- "content": "<|endoftext|>",
41
- "lstrip": false,
42
- "normalized": true,
43
- "rstrip": false,
44
- "single_word": false
45
- }
46
  }
 
1
  {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  "pad_token": {
5
  "content": "[PAD]",
6
  "lstrip": false,
 
8
  "rstrip": false,
9
  "single_word": false
10
  },
11
+ "unk_token": "<|endoftext|>"
 
 
 
 
 
 
12
  }
tokenizer_config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "add_bos_token": false,
3
  "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
  "50256": {
@@ -17,32 +16,11 @@
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
- },
21
- "50258": {
22
- "content": "<USER>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "50259": {
30
- "content": "<AI>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
  }
37
  },
38
- "additional_special_tokens": [
39
- "<USER>",
40
- "<AI>"
41
- ],
42
  "bos_token": "<|endoftext|>",
43
  "clean_up_tokenization_spaces": false,
44
  "eos_token": "<|endoftext|>",
45
- "errors": "replace",
46
  "extra_special_tokens": {},
47
  "model_max_length": 1024,
48
  "pad_token": "[PAD]",
 
1
  {
 
2
  "add_prefix_space": false,
3
  "added_tokens_decoder": {
4
  "50256": {
 
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  }
20
  },
 
 
 
 
21
  "bos_token": "<|endoftext|>",
22
  "clean_up_tokenization_spaces": false,
23
  "eos_token": "<|endoftext|>",
 
24
  "extra_special_tokens": {},
25
  "model_max_length": 1024,
26
  "pad_token": "[PAD]",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff