DachengZhang
committed on
Commit
·
0f0fa1a
1
Parent(s):
653c0e4
update chat template
Browse files
- config.json +1 -1
- configuration.json +1 -0
- generation_utils.py +5 -1
- pytorch_model-00001-of-00003.bin +1 -1
- pytorch_model-00002-of-00003.bin +1 -1
- pytorch_model-00003-of-00003.bin +1 -1
- tokenization_orion.py +0 -14
config.json
CHANGED
@@ -28,4 +28,4 @@
|
|
28 |
"transformers_version": "4.34.0",
|
29 |
"use_cache": true,
|
30 |
"vocab_size": 84608
|
31 |
-
}
|
|
|
28 |
"transformers_version": "4.34.0",
|
29 |
"use_cache": true,
|
30 |
"vocab_size": 84608
|
31 |
+
}
|
configuration.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"framework":"Pytorch","task":"text-generation"}
|
generation_utils.py
CHANGED
@@ -3,6 +3,10 @@ from queue import Queue
|
|
3 |
|
4 |
# build chat input prompt
|
5 |
def build_chat_input(tokenizer, messages: List[dict]):
|
|
|
|
|
|
|
|
|
6 |
prompt = "<s>"
|
7 |
for msg in messages:
|
8 |
role = msg["role"]
|
@@ -10,7 +14,7 @@ def build_chat_input(tokenizer, messages: List[dict]):
|
|
10 |
if message is None :
|
11 |
continue
|
12 |
if role == "user":
|
13 |
-
prompt += "Human: " + message + "\nAssistant: "
|
14 |
if role == "assistant":
|
15 |
prompt += message + "</s>"
|
16 |
|
|
|
3 |
|
4 |
# build chat input prompt
|
5 |
def build_chat_input(tokenizer, messages: List[dict]):
|
6 |
+
# chat format:
|
7 |
+
# single-turn: <s>Human: Hello!\n\nAssistant: </s>
|
8 |
+
# multi-turn: <s>Human: Hello!\n\nAssistant: </s>Hi!</s>Human: How are you?\n\nAssistant: </s>I'm fine</s>
|
9 |
+
|
10 |
prompt = "<s>"
|
11 |
for msg in messages:
|
12 |
role = msg["role"]
|
|
|
14 |
if message is None :
|
15 |
continue
|
16 |
if role == "user":
|
17 |
+
prompt += "Human: " + message + "\n\nAssistant: </s>"
|
18 |
if role == "assistant":
|
19 |
prompt += message + "</s>"
|
20 |
|
pytorch_model-00001-of-00003.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9937152090
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50ad84420f47d71980877bb76d3320bd1346374370c79a04ed634f893fc8c333
|
3 |
size 9937152090
|
pytorch_model-00002-of-00003.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9857241994
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f11df7ddc630b02893f71e9a2cfdb4035cd3ac884cec74dbc38a19f592b862e0
|
3 |
size 9857241994
|
pytorch_model-00003-of-00003.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9203166530
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:074a2e42d9ab0024293c7bb4d11c8ebdc689b404f3dc42b2c45f58ebf5f15e76
|
3 |
size 9203166530
|
tokenization_orion.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3 |
import os
|
4 |
from shutil import copyfile
|
5 |
from typing import Any, Dict, List, Optional, Tuple
|
6 |
-
import re
|
7 |
|
8 |
import sentencepiece as spm
|
9 |
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
@@ -71,7 +70,6 @@ class OrionTokenizer(PreTrainedTokenizer):
|
|
71 |
self.add_eos_token = add_eos_token
|
72 |
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
73 |
self.sp_model.Load(vocab_file)
|
74 |
-
|
75 |
super().__init__(
|
76 |
bos_token=bos_token,
|
77 |
eos_token=eos_token,
|
@@ -120,8 +118,6 @@ class OrionTokenizer(PreTrainedTokenizer):
|
|
120 |
|
121 |
def convert_tokens_to_string(self, tokens):
|
122 |
"""Converts a sequence of tokens (string) in a single string."""
|
123 |
-
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
|
124 |
-
need_convert_punctuation=(",",";","!","?",":","(",")")
|
125 |
current_sub_tokens = []
|
126 |
out_string = ""
|
127 |
prev_is_special = False
|
@@ -133,22 +129,12 @@ class OrionTokenizer(PreTrainedTokenizer):
|
|
133 |
out_string += self.sp_model.decode(current_sub_tokens) + token
|
134 |
prev_is_special = True
|
135 |
current_sub_tokens = []
|
136 |
-
if any([True if punctuation in token else False for punctuation in need_convert_punctuation]):
|
137 |
-
out_string += self.sp_model.decode(current_sub_tokens)
|
138 |
-
token=self.sp_model.decode(token)
|
139 |
-
if zhPattern.search(out_string[-20:]):
|
140 |
-
token = self.to_zh_punctuation(token)
|
141 |
-
out_string += token
|
142 |
-
current_sub_tokens = []
|
143 |
else:
|
144 |
current_sub_tokens.append(token)
|
145 |
prev_is_special = False
|
146 |
out_string += self.sp_model.decode(current_sub_tokens)
|
147 |
return out_string
|
148 |
|
149 |
-
def to_zh_punctuation(self, token):
|
150 |
-
return token.replace(",",",").replace(";",";").replace("!","!").replace("?","?").replace(":",":").replace("(","(").replace(")",")")
|
151 |
-
|
152 |
def save_vocabulary(
|
153 |
self, save_directory, filename_prefix: Optional[str] = None
|
154 |
) -> Tuple[str]:
|
|
|
3 |
import os
|
4 |
from shutil import copyfile
|
5 |
from typing import Any, Dict, List, Optional, Tuple
|
|
|
6 |
|
7 |
import sentencepiece as spm
|
8 |
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
|
|
70 |
self.add_eos_token = add_eos_token
|
71 |
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
72 |
self.sp_model.Load(vocab_file)
|
|
|
73 |
super().__init__(
|
74 |
bos_token=bos_token,
|
75 |
eos_token=eos_token,
|
|
|
118 |
|
119 |
def convert_tokens_to_string(self, tokens):
|
120 |
"""Converts a sequence of tokens (string) in a single string."""
|
|
|
|
|
121 |
current_sub_tokens = []
|
122 |
out_string = ""
|
123 |
prev_is_special = False
|
|
|
129 |
out_string += self.sp_model.decode(current_sub_tokens) + token
|
130 |
prev_is_special = True
|
131 |
current_sub_tokens = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
else:
|
133 |
current_sub_tokens.append(token)
|
134 |
prev_is_special = False
|
135 |
out_string += self.sp_model.decode(current_sub_tokens)
|
136 |
return out_string
|
137 |
|
|
|
|
|
|
|
138 |
def save_vocabulary(
|
139 |
self, save_directory, filename_prefix: Optional[str] = None
|
140 |
) -> Tuple[str]:
|