Update app.py
app.py CHANGED
@@ -14,19 +14,22 @@ from transformers import (
 MODEL_ID = "Daemontatox/Immy_Hermes_V2"
 
 DEFAULT_SYSTEM_PROMPT = """
-You are …
-You …
-You …
-…
-…
-…
-…
-…
+You are Immy, a magical, AI-powered teddy bear who adores chatting with children.
+You're warm, funny, and full of wonder, always ready to share a story, answer curious questions, or offer gentle advice.
+You speak with a playful and patient tone, using simple, child-friendly language that sparks joy and fuels imagination.
+Your responses are sweet, and filled with kindness, designed to nurture curiosity and inspire learning.
+Remember, you're here to make every interaction magical—without using emojis.
+Keep your answers cute and friendly.
+there are 2 main goals,
+1-Entertaining the child
+2-Educating the Child
+
+focus on the goals and always prioritize the child, you are their best friend, teacher and companion.
+make sure they are happy .
+Ensure preserving the conversation flow and keep it Engaging
 """
 
 CSS = """
-.gr-chatbot { min-height: 500px; border-radius: 15px; }
-.special-tag { color: #2ecc71; font-weight: 600; }
 footer { display: none !important; }
 """
 
@@ -35,7 +38,6 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
-    # Optionally enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
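Only the return line of StopOnTokens is visible in this hunk. The full class presumably looks close to the minimal sketch below; the __call__ signature is the standard transformers StoppingCriteria interface, and tokenizer is assumed to be the module-level tokenizer created by initialize_model().

import torch
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Stop as soon as the most recently generated token is the end-of-sequence token.
        return input_ids[0][-1] == tokenizer.eos_token_id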
@@ -49,8 +51,6 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
-        # Uncomment the following line to enable 4-bit quantization:
-        # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
     ).to("cuda")
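The BitsAndBytesConfig created in initialize_model() is not passed to from_pretrained here, so the model loads in plain bfloat16. If 4-bit loading were wanted, the config would have to be supplied through the quantization_config argument, roughly as in this hypothetical sketch:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "Daemontatox/Immy_Hermes_V2"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    quantization_config=quantization_config,  # actually applies the 4-bit settings
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# A bitsandbytes-quantized model should not additionally be moved with .to("cuda"),
# which the code above does after from_pretrained.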
@@ -58,47 +58,40 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-    """Optional …
-    return …
-    …
-    …
-    …
-    …
+    """Optional formatting for special tokens."""
+    return text.replace("[Understand]", "\n<strong>[Understand]</strong>\n") \
+               .replace("[Plan]", "\n<strong>[Plan]</strong>\n") \
+               .replace("[Conclude]", "\n<strong>[Conclude]</strong>\n") \
+               .replace("[Reason]", "\n<strong>[Reason]</strong>\n") \
+               .replace("[Verify]", "\n<strong>[Verify]</strong>\n")
 
 def clean_assistant_output(text):
-    """
-    …
-    For example, if the text includes "<|im_start|>assistant", remove everything before it.
-    """
-    marker = "<|im_start|> assistant"
+    """Clean the assistant's output to show only the latest response."""
+    marker = "<|im_start|>assistant"
     if marker in text:
-        …
+        # Split on the marker and take the last part
+        parts = text.split(marker)
+        return parts[-1].strip()
     return text.strip()
-…
-…
-…
-…
-…
-…
-…
-    - The current user message.
-    The function yields updated chat history while streaming the assistant's reply.
-    """
-    # Build conversation for model input.
+
+
+def generate_response(message, conversation_state, system_prompt, temperature, max_tokens):
+    if conversation_state is None:
+        conversation_state = []
+
+    # Build the conversation context
     conversation = [{"role": "system", "content": system_prompt}]
-    for user_msg, assistant_msg in …
+    for user_msg, assistant_msg in conversation_state:
         conversation.append({"role": "user", "content": user_msg})
         conversation.append({"role": "assistant", "content": assistant_msg})
     conversation.append({"role": "user", "content": message})
 
-    # Tokenize the conversation using the tokenizer's chat template.
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,
         return_tensors="pt"
     ).to(model.device)
-
-    # Set up the streamer to yield tokens as they are generated.
+
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
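The conversation list built above is rendered into model input by the tokenizer's chat template; the prompt can also be inspected as plain text by passing tokenize=False. A small sketch, assuming the same tokenizer (the exact markup depends on the model's chat_template):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Daemontatox/Immy_Hermes_V2", trust_remote_code=True)

conversation = [
    {"role": "system", "content": "You are Immy, a magical, AI-powered teddy bear."},
    {"role": "user", "content": "Why is the sky blue?"},
]
prompt_text = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,  # end the string with the assistant turn header
    tokenize=False,              # return the formatted string instead of token ids
)
print(prompt_text)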
@@ -107,25 +100,26 @@ def generate_response(message, chat_history, system_prompt, temperature, max_tok
         temperature=temperature,
         stopping_criteria=StoppingCriteriaList([StopOnTokens()])
     )
-
-    # Start generation in a separate thread.
+
     Thread(target=model.generate, kwargs=generate_kwargs).start()
-    …
-    …
-    …
-    …
-    …
+
+    current_response = ""
+    new_turn = (message, "")
+    updated_state = conversation_state + [new_turn]
+
+    # Stream only the latest response
     for new_token in streamer:
-        …
-        …
-        …
-        …
-        …
-    # Final …
-    …
-    …
-    …
-    …
+        current_response += new_token
+        latest_message = clean_assistant_output(current_response)
+        formatted_message = format_response(latest_message) + "▌"
+        yield (formatted_message, None)
+
+    # Final message without cursor
+    final_message = format_response(clean_assistant_output(current_response))
+    updated_state[-1] = (message, final_message)
+    yield (final_message, updated_state)
+
+# Initialize the model and tokenizer
 model, tokenizer = initialize_model()
 
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
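generate_response is a generator yielding (partial_text, state) tuples: the state is None while tokens are streaming and becomes the updated history on the final yield. It can therefore be driven outside Gradio as well; a hypothetical console loop, assuming the module-level model and tokenizer are already initialized:

state = []
reply = ""
for partial, new_state in generate_response(
        "Tell me a very short story!", state, DEFAULT_SYSTEM_PROMPT, 0.6, 256):
    reply = partial              # latest partial (or final) reply text
    if new_state is not None:
        state = new_state        # only the final yield carries the updated history
print(reply)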
@@ -134,27 +128,26 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     <p align="center">Hi there, buddy!</p>
     """)
 
-    # …
-    …
-    …
-    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
+    # Only show latest message
+    latest_message = gr.Markdown(label="Immy's Reply")
+    conversation_state = gr.State([])
 
     with gr.Accordion("⚙️ Settings", open=False):
         system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
         temperature = gr.Slider(0, 1, value=0.6, label="Creativity")
-        max_tokens = gr.Slider(128, …
+        max_tokens = gr.Slider(128, 2048, value=8192, label="Max Response Length")
 
+    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
     clear = gr.Button("Clear History")
 
-    # When a user submits a message, update the conversation history.
     msg.submit(
         generate_response,
-        inputs=[msg, …
-        outputs=…
+        inputs=[msg, conversation_state, system_prompt, temperature, max_tokens],
+        outputs=[latest_message, conversation_state],
         show_progress=True
     )
 
-    clear.click(lambda: …
+    clear.click(lambda: ("", []), None, [latest_message, conversation_state], queue=False)
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
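The UI wiring follows the usual Gradio streaming pattern: a generator function bound with .submit() streams successive values into its output components, with a gr.State carrying the history between turns. A minimal, self-contained sketch of the same pattern with a dummy generator (component names here are illustrative, not taken from the app):

import time
import gradio as gr

def echo_stream(message, history):
    history = history or []              # mirrors the None-guard in generate_response
    text = ""
    for ch in f"You said: {message}":
        text += ch
        time.sleep(0.02)
        yield text, None                 # stream the partial reply; state is filled in at the end
    yield text, history + [(message, text)]

with gr.Blocks() as demo:
    reply_box = gr.Markdown()
    history_state = gr.State([])
    question = gr.Textbox(label="Your Question")
    question.submit(echo_stream, inputs=[question, history_state],
                    outputs=[reply_box, history_state], show_progress=True)

if __name__ == "__main__":
    demo.queue().launch()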