Spaces: Running on Zero
update message format
app.py CHANGED
@@ -51,6 +51,14 @@ h1 {
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/Apollo-7B")
 
+chat = [
+    {"role": "user", "content": "Hello, how are you?"},
+    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+    {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+
+tokenizer.apply_chat_template(chat, tokenize=False)
+
 model = AutoModelForCausalLM.from_pretrained("FreedomIntelligence/Apollo-7B", device_map="auto")  # to("cuda:0")
 terminators = [
     tokenizer.eos_token_id,
@@ -58,39 +66,30 @@ terminators = [
 ]
 
 @spaces.GPU(duration=120)
-def chat_llama3_8b(
+def chat_llama3_8b(message: str,
+                   history: list,
                    temperature: float,
                    max_new_tokens: int
                    ) -> str:
     """
     Generate a streaming response using the llama3-8b model.
     Args:
-
+        message (str): The input message.
+        history (list): The conversation history used by ChatInterface.
        temperature (float): The temperature for generating the response.
        max_new_tokens (int): The maximum number of new tokens to generate.
    Returns:
        str: The generated response.
    """
-
-
-
-
-    conversation = []
-    if history_str:
-        # Assume the history is a string stored in some format; parse it according to the actual format
-        # Here, assume the history is user and assistant messages separated by newlines: even lines are user, odd lines are assistant
-        lines = history_str.strip().split('\n')
-        for i in range(0, len(lines), 2):
-            if i+1 < len(lines):
-                user_msg = lines[i]
-                assistant_msg = lines[i+1]
-                conversation.extend([
-                    {"role": "user", "content": user_msg},
-                    {"role": "assistant", "content": assistant_msg}
-                ])
+    # Build conversation as pure array format
+    history_messages = []
+    for user, assistant in history:
+        history_messages.extend(assistant)
 
-
-
+    conversation = [
+        "text": message,  # current message
+        "history": history_messages  # array of history messages
+    ]
 
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
 
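Note: tokenizer.apply_chat_template consumes a conversation in the role/content "messages" format shown by the chat example added in the first hunk, whereas the new code above builds a {"text": ..., "history": ...} shape. The sketch below is a hypothetical illustration, not part of this commit, of how chat_llama3_8b(message, history, ...) could assemble the messages format from a history of (user, assistant) pairs before tokenizing; the helper name and generation settings are assumptions.

# Hypothetical sketch (not from this commit): build the conversation in the
# role/content format that apply_chat_template expects.
def build_conversation(message: str, history: list) -> list:
    conversation = []
    for user, assistant in history:  # history as (user, assistant) pairs
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})  # current message goes last
    return conversation

# Assumed usage inside chat_llama3_8b:
# conversation = build_conversation(message, history)
# input_ids = tokenizer.apply_chat_template(
#     conversation,
#     add_generation_prompt=True,  # append the assistant turn prefix before generating
#     return_tensors="pt",
# ).to(model.device)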
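The docstring says the history comes from ChatInterface. The rest of app.py is not shown in this diff, but a typical way to expose the function in a Gradio Space looks roughly like the following; the component choices, ranges, and defaults are assumptions, not taken from the commit.

# Hypothetical wiring (not shown in this diff): hook chat_llama3_8b into a
# Gradio ChatInterface with sliders for the two extra generation arguments.
import gradio as gr

demo = gr.ChatInterface(
    fn=chat_llama3_8b,
    additional_inputs=[
        gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()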