jondurbin committed a998154 · 1 Parent(s): 8933a5b

Upload folder using huggingface_hub

qwen_generation_utils.py CHANGED
@@ -128,7 +128,7 @@ def make_context(
     history = []
 
     if chat_format == "chatml":
-        im_start, im_end = "<|im_start|>", "<|im_end|>"
+        im_start, im_end = "<s>", "<|im_end|>"
         im_start_tokens = [tokenizer.im_start_id]
         im_end_tokens = [tokenizer.im_end_id]
         nl_tokens = tokenizer.encode("\n")
@@ -286,8 +286,8 @@ def decode_tokens(
     elif chat_format == "raw":
         return _decode_default(
             tokens,
-            stop_words=["<|endoftext|>"],
-            eod_words=["<|endoftext|>"],
+            stop_words=["</s>"],
+            eod_words=["</s>"],
             tokenizer=tokenizer,
             raw_text_len=raw_text_len,
             verbose=verbose,
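The net effect of this hunk is that chatml turns now open with "<s>" instead of "<|im_start|>". A minimal sketch of the resulting turn layout (illustrative only; format_turn is a hypothetical helper, and the real make_context also threads history, the system prompt, and token-level truncation):

# Sketch of the chatml turn layout after this change; format_turn is a
# hypothetical helper, not part of qwen_generation_utils.py.
def format_turn(role: str, content: str) -> str:
    im_start, im_end = "<s>", "<|im_end|>"  # "<s>" replaces "<|im_start|>"
    return f"{im_start}{role}\n{content}{im_end}\n"

prompt = (
    format_turn("system", "You are a helpful assistant.")
    + format_turn("user", "Hello!")
    + "<s>assistant\n"  # generation continues from here until "<|im_end|>"
)
print(prompt)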
special_tokens_map.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "bos_token": "<|im_start|>",
-  "eos_token": "<|endoftext|>",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
   "pad_token": "<|extra_0|>"
 }
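A quick way to confirm the remapped special tokens load as intended (a sketch assuming the updated files sit in a local checkout at ./model_dir, a placeholder path, and that transformers is installed):

from transformers import AutoTokenizer

# "./model_dir" is a placeholder for a local checkout of this repo.
tok = AutoTokenizer.from_pretrained("./model_dir", trust_remote_code=True)
print(tok.bos_token)  # expected: <s>
print(tok.eos_token)  # expected: </s>
print(tok.pad_token)  # unchanged: <|extra_0|>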
tokenization_qwen.py CHANGED
@@ -20,8 +20,8 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
 
 PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-ENDOFTEXT = "<|endoftext|>"
-IMSTART = "<|im_start|>"
+ENDOFTEXT = "</s>"
+IMSTART = "<s>"
 IMEND = "<|im_end|>"
 # as the default behavior is changed to allow special tokens in
 # regular texts, the surface forms of special tokens need to be
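The comment above is why the literal strings matter: a tiktoken encoding only recognizes a special token under the exact surface form it was registered with. A standalone sketch of that behavior, using the gpt2 base ranks instead of qwen.tiktoken and arbitrary ids for the two new tokens:

import tiktoken

base = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="demo",  # illustrative encoding; ids 50257+ are arbitrary here
    pat_str=base._pat_str,
    mergeable_ranks=base._mergeable_ranks,
    special_tokens={**base._special_tokens, "<s>": 50257, "</s>": 50258},
)
# "<s>"/"</s>" encode as single special ids only when explicitly allowed;
# by default tiktoken raises if a registered special token appears in input.
print(enc.encode("<s>hi</s>", allowed_special={"<s>", "</s>"}))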
tokenizer_config.json CHANGED
@@ -6,9 +6,9 @@
       null
     ]
   },
-  "bos_token": "<|im_start|>",
+  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "<|endoftext|>",
+  "eos_token": "</s>",
   "model_max_length": 8192,
   "pad_token": "<|extra_0|>",
   "padding_side": "left",