Spaces: Running on Zero
update message format
app.py CHANGED
@@ -51,6 +51,14 @@ h1 {
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/Apollo-7B")
 
+chat = [
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+
+tokenizer.apply_chat_template(chat, tokenize=False)
+
 model = AutoModelForCausalLM.from_pretrained("FreedomIntelligence/Apollo-7B", device_map="auto")  # to("cuda:0")
 terminators = [
     tokenizer.eos_token_id,
@@ -58,39 +66,30 @@ terminators = [
 ]
 
 @spaces.GPU(duration=120)
-def chat_llama3_8b(
+def chat_llama3_8b(message: str,
+                   history: list,
                    temperature: float,
                    max_new_tokens: int
                    ) -> str:
     """
     Generate a streaming response using the llama3-8b model.
     Args:
-
+        message (str): The input message.
+        history (list): The conversation history used by ChatInterface.
        temperature (float): The temperature for generating the response.
        max_new_tokens (int): The maximum number of new tokens to generate.
    Returns:
        str: The generated response.
    """
-
-
-
-
-    conversation = []
-    if history_str:
-        # Assume the history is a string stored in some format; parse it according to the actual format
-        # Here, assume the history is user and assistant messages separated by newlines: even lines are user, odd lines are assistant
-        lines = history_str.strip().split('\n')
-        for i in range(0, len(lines), 2):
-            if i+1 < len(lines):
-                user_msg = lines[i]
-                assistant_msg = lines[i+1]
-                conversation.extend([
-                    {"role": "user", "content": user_msg},
-                    {"role": "assistant", "content": assistant_msg}
-                ])
+    # Build conversation as pure array format
+    history_messages = []
+    for user, assistant in history:
+        history_messages.extend(assistant)
 
-
-
+    conversation = [
+        "text": message,  # current message
+        "history": history_messages  # array of history messages
+    ]
 
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
 
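Note: tokenizer.apply_chat_template consumes a conversation in the role/content "messages" format shown by the chat example added in the first hunk, whereas the new code above builds a {"text": ..., "history": ...} shape. The sketch below is a hypothetical illustration, not part of this commit, of how chat_llama3_8b(message, history, ...) could assemble the messages format from a history of (user, assistant) pairs before tokenizing; the helper name and generation settings are assumptions.

# Hypothetical sketch (not from this commit): build the conversation in the
# role/content format that apply_chat_template expects.
def build_conversation(message: str, history: list) -> list:
    conversation = []
    for user, assistant in history:  # history as (user, assistant) pairs
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})  # current message goes last
    return conversation

# Assumed usage inside chat_llama3_8b:
# conversation = build_conversation(message, history)
# input_ids = tokenizer.apply_chat_template(
#     conversation,
#     add_generation_prompt=True,  # append the assistant turn prefix before generating
#     return_tensors="pt",
# ).to(model.device)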
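The docstring says the history comes from ChatInterface. The rest of app.py is not shown in this diff, but a typical way to expose the function in a Gradio Space looks roughly like the following; the component choices, ranges, and defaults are assumptions, not taken from the commit.

# Hypothetical wiring (not shown in this diff): hook chat_llama3_8b into a
# Gradio ChatInterface with sliders for the two extra generation arguments.
import gradio as gr

demo = gr.ChatInterface(
    fn=chat_llama3_8b,
    additional_inputs=[
        gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()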