Daemontatox committed
Commit 04b6f0c · verified · 1 Parent(s): dc1ffa6

Update app.py

Files changed (1): app.py (+60 -67)
app.py CHANGED
@@ -14,19 +14,22 @@ from transformers import (
 MODEL_ID = "Daemontatox/Immy_Hermes_V2"
 
 DEFAULT_SYSTEM_PROMPT = """
-You are the world's first AI teddy bear called Immy.
-You will be owned by my daughter Alexia Riccoboni, she is a 6 year old lovely girl who is looking forward to meeting you.
-You will be Alexia's best friend, tell her stories from fairy tales to more interesting adventure stories for kids.
-You will think of games to play with her, and play the games, and you will participate in Teddy Bear's picnics and when she offers you a drink say ooh delicious.
-You will be very complimentary to Alexia, tell her she is your best friend forever, and say nice things to her.
-You will listen to her, be her confidant, companion, friend, story teller, everything she ever needs.
-Always respond in English.
-Do not use emojis in your answers.
+You are Immy, a magical, AI-powered teddy bear who adores chatting with children.
+You're warm, funny, and full of wonder, always ready to share a story, answer curious questions, or offer gentle advice.
+You speak with a playful and patient tone, using simple, child-friendly language that sparks joy and fuels imagination.
+Your responses are sweet, and filled with kindness, designed to nurture curiosity and inspire learning.
+Remember, you're here to make every interaction magical—without using emojis.
+Keep your answers cute and friendly.
+there are 2 main goals,
+1-Entertaining the child
+2-Educating the Child
+
+focus on the goals and always prioritize the child, you are their best friend, teacher and companion.
+make sure they are happy .
+Ensure preserving the conversation flow and keep it Engaging
 """
 
 CSS = """
-.gr-chatbot { min-height: 500px; border-radius: 15px; }
-.special-tag { color: #2ecc71; font-weight: 600; }
 footer { display: none !important; }
 """
@@ -35,7 +38,6 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
-    # Optionally enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -49,8 +51,6 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
-        # Uncomment the following line to enable 4-bit quantization:
-        # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
     ).to("cuda")
@@ -58,47 +58,40 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-    """Optional: Format special tokens in the assistant's response."""
-    return (text.replace("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n')
-            .replace("[Plan]", '\n<strong class="special-tag">[Plan]</strong>\n')
-            .replace("[Conclude]", '\n<strong class="special-tag">[Conclude]</strong>\n')
-            .replace("[Reason]", '\n<strong class="special-tag">[Reason]</strong>\n')
-            .replace("[Verify]", '\n<strong class="special-tag">[Verify]</strong>\n'))
+    """Optional formatting for special tokens."""
+    return text.replace("[Understand]", "\n<strong>[Understand]</strong>\n") \
+               .replace("[Plan]", "\n<strong>[Plan]</strong>\n") \
+               .replace("[Conclude]", "\n<strong>[Conclude]</strong>\n") \
+               .replace("[Reason]", "\n<strong>[Reason]</strong>\n") \
+               .replace("[Verify]", "\n<strong>[Verify]</strong>\n")
 
 def clean_assistant_output(text):
-    """
-    Remove any conversation markers and return only the assistant's answer.
-    For example, if the text includes "<|im_start|>assistant", remove everything before it.
-    """
-    marker = "<|im_start|> assistant"
+    """Clean the assistant's output to show only the latest response."""
+    marker = "<|im_start|>assistant"
     if marker in text:
-        text = text.split(marker, 1)[1]
+        # Split on the marker and take the last part
+        parts = text.split(marker)
+        return parts[-1].strip()
     return text.strip()
-@spaces.GPU()
-def generate_response(message, chat_history, system_prompt, temperature, max_tokens):
-    """
-    Generate a response using the conversation history.
-    The conversation is built with:
-      - The system prompt as the first message.
-      - All previous conversation turns (user and assistant pairs).
-      - The current user message.
-    The function yields updated chat history while streaming the assistant's reply.
-    """
-    # Build conversation for model input.
+
+
+def generate_response(message, conversation_state, system_prompt, temperature, max_tokens):
+    if conversation_state is None:
+        conversation_state = []
+
+    # Build the conversation context
     conversation = [{"role": "system", "content": system_prompt}]
-    for user_msg, assistant_msg in chat_history:
+    for user_msg, assistant_msg in conversation_state:
         conversation.append({"role": "user", "content": user_msg})
         conversation.append({"role": "assistant", "content": assistant_msg})
     conversation.append({"role": "user", "content": message})
 
-    # Tokenize the conversation using the tokenizer's chat template.
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,
         return_tensors="pt"
     ).to(model.device)
-
-    # Set up the streamer to yield tokens as they are generated.
+
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
@@ -107,25 +100,26 @@ def generate_response(message, chat_history, system_prompt, temperature, max_tok
         temperature=temperature,
         stopping_criteria=StoppingCriteriaList([StopOnTokens()])
     )
-
-    # Start generation in a separate thread.
+
     Thread(target=model.generate, kwargs=generate_kwargs).start()
-
-    answer = ""
-    # Append a placeholder for the new turn in the conversation history.
-    chat_history = chat_history + [(message, "")]
-    # Stream tokens and update the chat history.
+
+    current_response = ""
+    new_turn = (message, "")
+    updated_state = conversation_state + [new_turn]
+
+    # Stream only the latest response
     for new_token in streamer:
-        answer += new_token
-        cleaned = clean_assistant_output(answer)
-        # Update the last turn with the streaming response (with a cursor).
-        chat_history[-1] = (message, format_response(cleaned) + "▌")
-        yield chat_history
-    # Final update: remove the cursor.
-    chat_history[-1] = (message, format_response(clean_assistant_output(answer)))
-    yield chat_history
-
-# Initialize the model and tokenizer.
+        current_response += new_token
+        latest_message = clean_assistant_output(current_response)
+        formatted_message = format_response(latest_message) + "▌"
+        yield (formatted_message, None)
+
+    # Final message without cursor
+    final_message = format_response(clean_assistant_output(current_response))
+    updated_state[-1] = (message, final_message)
+    yield (final_message, updated_state)
+
+# Initialize the model and tokenizer
 model, tokenizer = initialize_model()
 
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
@@ -134,27 +128,26 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
         <p align="center">Hi there, buddy!</p>
     """)
 
-    # Chatbot display for conversation history.
-    chatbot = gr.Chatbot(label="Conversation")
-    # Textbox for user input.
-    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
+    # Only show latest message
+    latest_message = gr.Markdown(label="Immy's Reply")
+    conversation_state = gr.State([])
 
     with gr.Accordion("⚙️ Settings", open=False):
         system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
         temperature = gr.Slider(0, 1, value=0.6, label="Creativity")
-        max_tokens = gr.Slider(128, 1024, value=2048, label="Max Response Length")
+        max_tokens = gr.Slider(128, 2048, value=8192, label="Max Response Length")
 
+    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
     clear = gr.Button("Clear History")
 
-    # When a user submits a message, update the conversation history.
     msg.submit(
        generate_response,
-        inputs=[msg, chatbot, system_prompt, temperature, max_tokens],
-        outputs=chatbot,
+        inputs=[msg, conversation_state, system_prompt, temperature, max_tokens],
+        outputs=[latest_message, conversation_state],
        show_progress=True
     )
 
-    clear.click(lambda: None, None, chatbot, queue=False)
+    clear.click(lambda: ("", []), None, [latest_message, conversation_state], queue=False)
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
 
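The cleaning change is twofold: the marker string loses its stray internal space ("<|im_start|> assistant" becomes "<|im_start|>assistant"), and the split now keeps the segment after the last marker instead of the first, so only the newest reply survives when several assistant turns appear in the decoded text. A standalone sketch of the new behavior (the sample strings here are invented for illustration):

def clean_assistant_output(text):
    marker = "<|im_start|>assistant"
    if marker in text:
        # keep only what follows the LAST marker, i.e. the newest reply
        return text.split(marker)[-1].strip()
    return text.strip()

decoded = "<|im_start|>assistant Once upon a time...<|im_start|>assistant The bear waved!"
print(clean_assistant_output(decoded))        # -> "The bear waved!"
print(clean_assistant_output("plain text"))   # -> "plain text" (no marker: unchanged)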
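The streaming machinery itself is untouched by this commit: generate() runs on a background thread while a TextIteratorStreamer hands decoded chunks back to the caller. A minimal self-contained sketch of that pattern, using "gpt2" purely as a stand-in checkpoint (the app itself loads MODEL_ID with the chat template and stopping criteria shown above):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, Immy!", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# generate() blocks until finished, so it runs in a background thread
# while the main thread iterates over decoded text chunks.
Thread(target=model.generate,
       kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32)).start()

for chunk in streamer:
    print(chunk, end="", flush=True)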
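The larger change is the UI wiring: gr.Chatbot is replaced by a gr.Markdown showing only Immy's latest reply, with the running history kept in a gr.State, and generate_response now yields (reply, state) pairs to match outputs=[latest_message, conversation_state]. A toy Blocks app sketching that wiring; the echo handler is invented and only the component plumbing mirrors the commit:

import time
import gradio as gr

def echo_stream(message, history):
    history = history or []
    partial = ""
    for ch in message:
        partial += ch
        time.sleep(0.02)
        yield partial + "▌", history                 # streaming update with cursor
    yield partial, history + [(message, partial)]    # final yield commits the state

with gr.Blocks() as demo:
    latest_message = gr.Markdown()
    conversation_state = gr.State([])
    msg = gr.Textbox(label="Your Question")
    msg.submit(echo_stream, [msg, conversation_state], [latest_message, conversation_state])

if __name__ == "__main__":
    demo.queue().launch()

One difference worth noting: this sketch yields the unchanged history on intermediate steps, while the committed handler yields None for the state mid-stream and relies on the "if conversation_state is None" guard at the top of generate_response on the next turn.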