jondurbin committed a998154 · 1 Parent(s): 8933a5b

Upload folder using huggingface_hub

qwen_generation_utils.py CHANGED
@@ -128,7 +128,7 @@ def make_context(
     history = []
 
     if chat_format == "chatml":
-        im_start, im_end = "<|im_start|>", "<|im_end|>"
+        im_start, im_end = "<s>", "<|im_end|>"
         im_start_tokens = [tokenizer.im_start_id]
         im_end_tokens = [tokenizer.im_end_id]
         nl_tokens = tokenizer.encode("\n")
@@ -286,8 +286,8 @@ def decode_tokens(
     elif chat_format == "raw":
         return _decode_default(
             tokens,
-            stop_words=["<|endoftext|>"],
-            eod_words=["<|endoftext|>"],
+            stop_words=["</s>"],
+            eod_words=["</s>"],
             tokenizer=tokenizer,
             raw_text_len=raw_text_len,
             verbose=verbose,
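The net effect of this hunk is that chatml turns now open with "<s>" instead of "<|im_start|>". A minimal sketch of the resulting turn layout (illustrative only; format_turn is a hypothetical helper, and the real make_context also threads history, the system prompt, and token-level truncation):

# Sketch of the chatml turn layout after this change; format_turn is a
# hypothetical helper, not part of qwen_generation_utils.py.
def format_turn(role: str, content: str) -> str:
    im_start, im_end = "<s>", "<|im_end|>"  # "<s>" replaces "<|im_start|>"
    return f"{im_start}{role}\n{content}{im_end}\n"

prompt = (
    format_turn("system", "You are a helpful assistant.")
    + format_turn("user", "Hello!")
    + "<s>assistant\n"  # generation continues from here until "<|im_end|>"
)
print(prompt)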
special_tokens_map.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "bos_token": "<|im_start|>",
-  "eos_token": "<|endoftext|>",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
   "pad_token": "<|extra_0|>"
 }
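A quick way to confirm the remapped special tokens load as intended (a sketch assuming the updated files sit in a local checkout at ./model_dir, a placeholder path, and that transformers is installed):

from transformers import AutoTokenizer

# "./model_dir" is a placeholder for a local checkout of this repo.
tok = AutoTokenizer.from_pretrained("./model_dir", trust_remote_code=True)
print(tok.bos_token)  # expected: <s>
print(tok.eos_token)  # expected: </s>
print(tok.pad_token)  # unchanged: <|extra_0|>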
tokenization_qwen.py CHANGED
@@ -20,8 +20,8 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
 
 PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-ENDOFTEXT = "<|endoftext|>"
-IMSTART = "<|im_start|>"
+ENDOFTEXT = "</s>"
+IMSTART = "<s>"
 IMEND = "<|im_end|>"
 # as the default behavior is changed to allow special tokens in
 # regular texts, the surface forms of special tokens need to be
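The comment above is why the literal strings matter: a tiktoken encoding only recognizes a special token under the exact surface form it was registered with. A standalone sketch of that behavior, using the gpt2 base ranks instead of qwen.tiktoken and arbitrary ids for the two new tokens:

import tiktoken

base = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="demo",  # illustrative encoding; ids 50257+ are arbitrary here
    pat_str=base._pat_str,
    mergeable_ranks=base._mergeable_ranks,
    special_tokens={**base._special_tokens, "<s>": 50257, "</s>": 50258},
)
# "<s>"/"</s>" encode as single special ids only when explicitly allowed;
# by default tiktoken raises if a registered special token appears in input.
print(enc.encode("<s>hi</s>", allowed_special={"<s>", "</s>"}))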
tokenizer_config.json CHANGED
@@ -6,9 +6,9 @@
       null
     ]
   },
-  "bos_token": "<|im_start|>",
+  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "<|endoftext|>",
+  "eos_token": "</s>",
   "model_max_length": 8192,
   "pad_token": "<|extra_0|>",
   "padding_side": "left",