nroggendorff committed
Commit 85c4894 · verified · Parent: 3c67b8d

Update train.py

Files changed (1): train.py (+2 −2)
train.py CHANGED
@@ -36,7 +36,7 @@ def create_tokenizer(training_corpus):
     tokenizer = ByteLevelBPETokenizer()
     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
     if INSTRUCT_FINETUNE_BOOL:
-        special_tokens.append("<|user|>", "<|bot|>", "<|end|>")
+        special_tokens.append(["<|user|>", "<|bot|>", "<|end|>"])
     tokenizer.train_from_iterator(
         training_corpus,
         vocab_size=VOCAB_SIZE,
@@ -50,7 +50,7 @@ def create_tokenizer(training_corpus):
 def load_tokenizer(training_corpus):
     tokenizer = AutoTokenizer.from_pretrained(OUTPUT_REPO)
     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
-    special_tokens.append("<|user|>", "<|bot|>", "<|end|>")
+    special_tokens.append(["<|user|>", "<|bot|>", "<|end|>"])
     tokenizer.train_from_iterator(
         training_corpus,
         vocab_size=VOCAB_SIZE,
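
For context on what the change does: list.append takes exactly one argument, so the removed line raised a TypeError whenever it ran. The added line is valid Python, but it appends the three chat tokens as a single nested list element rather than as three separate strings. A minimal sketch of the difference, using only plain Python list semantics (the extend variant at the end is an alternative, not what this commit does):

special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Before the commit: append accepts exactly one argument,
# so passing three raises TypeError at runtime.
# special_tokens.append("<|user|>", "<|bot|>", "<|end|>")

# After the commit: runs, but the whole list becomes one element:
special_tokens.append(["<|user|>", "<|bot|>", "<|end|>"])
print(special_tokens[-1])  # ['<|user|>', '<|bot|>', '<|end|>']

# To add each token as its own entry, extend would be the usual choice:
# special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])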