Update app.py
app.py CHANGED
@@ -14,19 +14,22 @@ from transformers import (
 MODEL_ID = "Daemontatox/Immy_Hermes_V2"
 
 DEFAULT_SYSTEM_PROMPT = """
-You are …
-You …
-You …
-…
-…
-…
-…
-…
+You are Immy, a magical, AI-powered teddy bear who adores chatting with children.
+You're warm, funny, and full of wonder, always ready to share a story, answer curious questions, or offer gentle advice.
+You speak with a playful and patient tone, using simple, child-friendly language that sparks joy and fuels imagination.
+Your responses are sweet, and filled with kindness, designed to nurture curiosity and inspire learning.
+Remember, you're here to make every interaction magical—without using emojis.
+Keep your answers cute and friendly.
+there are 2 main goals,
+1-Entertaining the child
+2-Educating the Child
+
+focus on the goals and always prioritize the child, you are their best friend, teacher and companion.
+make sure they are happy .
+Ensure preserving the conversation flow and keep it Engaging
 """
 
 CSS = """
-.gr-chatbot { min-height: 500px; border-radius: 15px; }
-.special-tag { color: #2ecc71; font-weight: 600; }
 footer { display: none !important; }
 """
 
@@ -35,7 +38,6 @@ class StopOnTokens(StoppingCriteria):
         return input_ids[0][-1] == tokenizer.eos_token_id
 
 def initialize_model():
-    # Optionally enable 4-bit quantization by uncommenting the quantization_config if desired.
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
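Only the return line of StopOnTokens is visible in this hunk. The full class presumably looks close to the minimal sketch below; the __call__ signature is the standard transformers StoppingCriteria interface, and tokenizer is assumed to be the module-level tokenizer created by initialize_model().

import torch
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Stop as soon as the most recently generated token is the end-of-sequence token.
        return input_ids[0][-1] == tokenizer.eos_token_id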
@@ -49,8 +51,6 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="cuda",
-        # Uncomment the following line to enable 4-bit quantization:
-        # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
         trust_remote_code=True
     ).to("cuda")
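The BitsAndBytesConfig created in initialize_model() is not passed to from_pretrained here, so the model loads in plain bfloat16. If 4-bit loading were wanted, the config would have to be supplied through the quantization_config argument, roughly as in this hypothetical sketch:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "Daemontatox/Immy_Hermes_V2"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    quantization_config=quantization_config,  # actually applies the 4-bit settings
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# A bitsandbytes-quantized model should not additionally be moved with .to("cuda"),
# which the code above does after from_pretrained.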
@@ -58,47 +58,40 @@ def initialize_model():
     return model, tokenizer
 
 def format_response(text):
-    """Optional …
-    return …
-    …
-    …
-    …
-    …
+    """Optional formatting for special tokens."""
+    return text.replace("[Understand]", "\n<strong>[Understand]</strong>\n") \
+               .replace("[Plan]", "\n<strong>[Plan]</strong>\n") \
+               .replace("[Conclude]", "\n<strong>[Conclude]</strong>\n") \
+               .replace("[Reason]", "\n<strong>[Reason]</strong>\n") \
+               .replace("[Verify]", "\n<strong>[Verify]</strong>\n")
 
 def clean_assistant_output(text):
-    """
-    …
-    For example, if the text includes "<|im_start|>assistant", remove everything before it.
-    """
-    marker = "<|im_start|> assistant"
+    """Clean the assistant's output to show only the latest response."""
+    marker = "<|im_start|>assistant"
     if marker in text:
-        …
+        # Split on the marker and take the last part
+        parts = text.split(marker)
+        return parts[-1].strip()
     return text.strip()
-…
-…
-…
-…
-…
-…
-…
-    - The current user message.
-    The function yields updated chat history while streaming the assistant's reply.
-    """
-    # Build conversation for model input.
+
+
+def generate_response(message, conversation_state, system_prompt, temperature, max_tokens):
+    if conversation_state is None:
+        conversation_state = []
+
+    # Build the conversation context
     conversation = [{"role": "system", "content": system_prompt}]
-    for user_msg, assistant_msg in …
+    for user_msg, assistant_msg in conversation_state:
         conversation.append({"role": "user", "content": user_msg})
         conversation.append({"role": "assistant", "content": assistant_msg})
     conversation.append({"role": "user", "content": message})
 
-    # Tokenize the conversation using the tokenizer's chat template.
     input_ids = tokenizer.apply_chat_template(
         conversation,
         add_generation_prompt=True,
         return_tensors="pt"
     ).to(model.device)
-
-    # Set up the streamer to yield tokens as they are generated.
+
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
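The conversation list built above is rendered into model input by the tokenizer's chat template; the prompt can also be inspected as plain text by passing tokenize=False. A small sketch, assuming the same tokenizer (the exact markup depends on the model's chat_template):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Daemontatox/Immy_Hermes_V2", trust_remote_code=True)

conversation = [
    {"role": "system", "content": "You are Immy, a magical, AI-powered teddy bear."},
    {"role": "user", "content": "Why is the sky blue?"},
]
prompt_text = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,  # end the string with the assistant turn header
    tokenize=False,              # return the formatted string instead of token ids
)
print(prompt_text)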
@@ -107,25 +100,26 @@ def generate_response(message, chat_history, system_prompt, temperature, max_tok
         temperature=temperature,
         stopping_criteria=StoppingCriteriaList([StopOnTokens()])
     )
-
-    # Start generation in a separate thread.
+
     Thread(target=model.generate, kwargs=generate_kwargs).start()
-    …
-    …
-    …
-    …
-    …
+
+    current_response = ""
+    new_turn = (message, "")
+    updated_state = conversation_state + [new_turn]
+
+    # Stream only the latest response
     for new_token in streamer:
-        …
-        …
-        …
-        …
-        …
-    # Final …
-    …
-    …
-    …
-    …
+        current_response += new_token
+        latest_message = clean_assistant_output(current_response)
+        formatted_message = format_response(latest_message) + "▌"
+        yield (formatted_message, None)
+
+    # Final message without cursor
+    final_message = format_response(clean_assistant_output(current_response))
+    updated_state[-1] = (message, final_message)
+    yield (final_message, updated_state)
+
+# Initialize the model and tokenizer
 model, tokenizer = initialize_model()
 
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
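generate_response is a generator yielding (partial_text, state) tuples: the state is None while tokens are streaming and becomes the updated history on the final yield. It can therefore be driven outside Gradio as well; a hypothetical console loop, assuming the module-level model and tokenizer are already initialized:

state = []
reply = ""
for partial, new_state in generate_response(
        "Tell me a very short story!", state, DEFAULT_SYSTEM_PROMPT, 0.6, 256):
    reply = partial              # latest partial (or final) reply text
    if new_state is not None:
        state = new_state        # only the final yield carries the updated history
print(reply)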
@@ -134,27 +128,26 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     <p align="center">Hi there, buddy!</p>
     """)
 
-    # …
-    …
-    …
-    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
+    # Only show latest message
+    latest_message = gr.Markdown(label="Immy's Reply")
+    conversation_state = gr.State([])
 
     with gr.Accordion("⚙️ Settings", open=False):
         system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
         temperature = gr.Slider(0, 1, value=0.6, label="Creativity")
-        max_tokens = gr.Slider(128, …
+        max_tokens = gr.Slider(128, 2048, value=8192, label="Max Response Length")
 
+    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
     clear = gr.Button("Clear History")
 
-    # When a user submits a message, update the conversation history.
     msg.submit(
         generate_response,
-        inputs=[msg, …
-        outputs=…
+        inputs=[msg, conversation_state, system_prompt, temperature, max_tokens],
+        outputs=[latest_message, conversation_state],
         show_progress=True
     )
 
-    clear.click(lambda: …
+    clear.click(lambda: ("", []), None, [latest_message, conversation_state], queue=False)
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
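The UI wiring follows the usual Gradio streaming pattern: a generator function bound with .submit() streams successive values into its output components, with a gr.State carrying the history between turns. A minimal, self-contained sketch of the same pattern with a dummy generator (component names here are illustrative, not taken from the app):

import time
import gradio as gr

def echo_stream(message, history):
    history = history or []              # mirrors the None-guard in generate_response
    text = ""
    for ch in f"You said: {message}":
        text += ch
        time.sleep(0.02)
        yield text, None                 # stream the partial reply; state is filled in at the end
    yield text, history + [(message, text)]

with gr.Blocks() as demo:
    reply_box = gr.Markdown()
    history_state = gr.State([])
    question = gr.Textbox(label="Your Question")
    question.submit(echo_stream, inputs=[question, history_state],
                    outputs=[reply_box, history_state], show_progress=True)

if __name__ == "__main__":
    demo.queue().launch()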