Steveeeeeeen (HF Staff) committed
Commit a9a5df1 · verified · 1 Parent(s): 6daeb9a

Update app.py

Files changed (1):
  1. app.py (+85 −69)
app.py CHANGED
@@ -1,31 +1,37 @@
 import os
 import shlex
 import subprocess
-import threading
 import tempfile
 import traceback
 from pathlib import Path

+# --- Install / fetch runtime deps & assets ---
 os.system("pip install -r requirements.txt")
+
+# Download token2wav assets
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/campplus.onnx -P token2wav")
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/flow.pt -P token2wav")
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/flow.yaml -P token2wav")
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/hift.pt -P token2wav")

-# HF token passthrough
+# Hugging Face token (optional)
 hf_token = os.getenv("HF_TOKEN", None)
-os.environ["HF_TOKEN"] = hf_token
+if hf_token is not None:
+    os.environ["HF_TOKEN"] = hf_token

 import spaces
 import gradio as gr

-def save_tmp_audio(audio_bytes, cache_dir):
+def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
+    """Save raw wav bytes to a temporary file and return its path."""
     os.makedirs(cache_dir, exist_ok=True)
     with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False, suffix=".wav") as temp_audio:
         temp_audio.write(audio_bytes)
-    return temp_audio.name
+    return temp_audio.name
+

 def add_message(chatbot, history, mic, text):
+    """Append user text or audio to the chat and the model history."""
     if not mic and not text:
         return chatbot, history, "Input is empty"

@@ -36,49 +42,47 @@ def add_message(chatbot, history, mic, text):
         chatbot.append({"role": "user", "content": {"path": mic}})
         history.append({"role": "human", "content": [{"type": "audio", "audio": mic}]})

+    print(f"{history=}")
     return chatbot, history, None

-def reset_state(system_prompt):
+
+def reset_state(system_prompt: str):
+    """Reset the chat to a single system message."""
     return [], [{"role": "system", "content": system_prompt}]

-_AUDIO_MODEL = None
-_TOKEN2WAV = None
-_INIT_LOCK = threading.Lock()

-def _ensure_models(model_path: str, token2wav_dir: str):
+_MODEL = None
+_TOK2WAV = None
+
+def _get_models(model_path: str):
     """
-    Create heavy, non-picklable objects *inside* the worker process exactly once.
+    Lazily load heavy, non-picklable models INSIDE the worker process
+    and cache them in module globals for reuse.
     """
-    global _AUDIO_MODEL, _TOKEN2WAV
-    if _AUDIO_MODEL is None or _TOKEN2WAV is None:
-        with _INIT_LOCK:
-            if _AUDIO_MODEL is None or _TOKEN2WAV is None:
-                # Import here to avoid importing before process fork
-                from stepaudio2 import StepAudio2
-                from token2wav import Token2wav
-                # Create non-picklable instances
-                _AUDIO_MODEL = StepAudio2(model_path)
-                _TOKEN2WAV = Token2wav(token2wav_dir)
-
-    return _AUDIO_MODEL, _TOKEN2WAV
+    global _MODEL, _TOK2WAV
+    if _MODEL is None or _TOK2WAV is None:
+        # Import here so the objects are constructed in the worker
+        from stepaudio2 import StepAudio2
+        from token2wav import Token2wav
+        _MODEL = StepAudio2(model_path)
+        _TOK2WAV = Token2wav("token2wav")
+    return _MODEL, _TOK2WAV

 @spaces.GPU
-def predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir):
+def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mini"):
     """
-    IMPORTANT: All parameters are simple strings/lists (picklable).
-    Heavy objects are created inside via _ensure_models(...).
+    Run generation on the GPU worker. All args must be picklable (strings, lists, dicts).
+    Heavy models are created via _get_models() inside this process.
     """
     try:
-        audio_model, token2wav = _ensure_models(model_path, token2wav_dir)
+        audio_model, token2wav = _get_models(model_path)

-        # Stream start marker
         history.append({
             "role": "assistant",
             "content": [{"type": "text", "text": "<tts_start>"}],
             "eot": False
         })

-        # Your original generation call
         tokens, text, audio_tokens = audio_model(
             history,
             max_new_tokens=4096,
@@ -86,21 +90,23 @@ def predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir):
             repetition_penalty=1.05,
             do_sample=True
         )
+        print(f"predict text={text!r}")

-        # Convert tokens -> wav bytes
+        # Convert tokens -> waveform bytes using token2wav
         audio_bytes = token2wav(audio_tokens, prompt_wav)

-        # Save to temp file for gradio Chatbot
+        # Persist to a temp .wav for the UI
         audio_path = save_tmp_audio(audio_bytes, cache_dir)
-        chatbot.append({"role": "assistant", "content": {"path": audio_path}})

-        # Finish the assistant turn
+        # Append the assistant audio message
+        chatbot.append({"role": "assistant", "content": {"path": audio_path}})
         history[-1]["content"].append({"type": "token", "token": tokens})
         history[-1]["eot"] = True

     except Exception:
         print(traceback.format_exc())
-        gr.Warning("Some error happened, please try again.")
+        gr.Warning("Some error happened, please try again.")
+
     return chatbot, history

 def _launch_demo(args):
@@ -112,71 +118,85 @@ def _launch_demo(args):
             label="System Prompt",
             value=(
                 "你的名字叫做小跃,是由阶跃星辰公司训练出来的语音大模型。\n"
-                "你情感细腻,观察能力强,擅长分析用户的内容,并作出善解人意的回复,说话的过程中时刻注意用户的感受,富有同理心,提供多样的情绪价值。\n"
+                "你情感细腻,观察能力强,擅长分析用户的内容,并作出善解人意的回复,"
+                "说话的过程中时刻注意用户的感受,富有同理心,提供多样的情绪价值。\n"
                 "今天是2025年8月29日,星期五\n"
                 "请用默认女声与用户交流。"
             ),
-            lines=2
+            lines=2,
         )

-        chatbot = gr.Chatbot(elem_id="chatbot", min_height=800, type="messages")
-        # Initialize history with the *string* value of the prompt
-        history = gr.State([{"role": "system", "content": system_prompt.value}])
+        chatbot = gr.Chatbot(
+            elem_id="chatbot",
+            min_height=800,
+            type="messages",
+        )

-        # Inputs
-        mic = gr.Audio(type="filepath")
-        text = gr.Textbox(placeholder="Enter message ...")
+        # Initialize history with the current system prompt value
+        history = gr.State([{"role": "system", "content": system_prompt.value}])

-        # Serializable configuration inputs (STRINGS ONLY)
-        model_path = "Step-Audio-2-mini"
-        token2wav_dir = "token2wav"
-        prompt_wav = "assets/default_female.wav"
-        cache_dir = "/tmp/stepaudio2"
+        mic = gr.Audio(type="filepath", label="🎤 Speak (optional)")
+        text = gr.Textbox(placeholder="Enter message ...", label="💬 Text")

         with gr.Row():
             clean_btn = gr.Button("🧹 Clear History (清除历史)")
             regen_btn = gr.Button("🤔️ Regenerate (重试)")
             submit_btn = gr.Button("🚀 Submit")

-        # --- event functions (now only use serializable args) ---
-        def on_submit(chatbot, history, mic, text, prompt_wav, cache_dir, model_path, token2wav_dir):
-            chatbot, history, error = add_message(chatbot, history, mic, text)
+        def on_submit(chatbot_val, history_val, mic_val, text_val):
+            chatbot2, history2, error = add_message(chatbot_val, history_val, mic_val, text_val)
             if error:
                 gr.Warning(error)
-                return chatbot, history, None, None
-            chatbot, history = predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir)
-            return chatbot, history, None, None
+                return chatbot2, history2, None, None
+            # Run GPU inference with only picklable args
+            chatbot2, history2 = predict(
+                chatbot2, history2,
+                args.prompt_wav, args.cache_dir,
+                model_path=args.model_path
+            )
+            return chatbot2, history2, None, None

         submit_btn.click(
             fn=on_submit,
-            inputs=[chatbot, history, mic, text, prompt_wav, cache_dir, model_path, token2wav_dir],
+            inputs=[chatbot, history, mic, text],
             outputs=[chatbot, history, mic, text],
             concurrency_limit=4,
             concurrency_id="gpu_queue",
         )

+        def on_clean(system_prompt_text):
+            return reset_state(system_prompt_text)
+
         clean_btn.click(
-            fn=reset_state,
+            fn=on_clean,
             inputs=[system_prompt],
             outputs=[chatbot, history],
         )

-        def on_regen(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir):
-            # drop last assistant turn so we can re-run
-            while chatbot and chatbot[-1]["role"] == "assistant":
-                chatbot.pop()
-            while history and history[-1]["role"] == "assistant":
-                history.pop()
-            return predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir)
+        def on_regenerate(chatbot_val, history_val):
+            # Drop the last assistant turn(s) so the reply can be regenerated
+            while chatbot_val and chatbot_val[-1]["role"] == "assistant":
+                chatbot_val.pop()
+            while history_val and history_val[-1]["role"] == "assistant":
+                print(f"discard {history_val[-1]}")
+                history_val.pop()
+            return predict(
+                chatbot_val, history_val,
+                args.prompt_wav, args.cache_dir,
+                model_path=args.model_path
+            )

         regen_btn.click(
-            fn=on_regen,
-            inputs=[chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir],
+            fn=on_regenerate,
+            inputs=[chatbot, history],
             outputs=[chatbot, history],
             concurrency_id="gpu_queue",
         )

-    demo.queue().launch(server_port=args.server_port, server_name=args.server_name)
+    demo.queue().launch(
+        server_port=args.server_port,
+        server_name=args.server_name,
+    )

 if __name__ == "__main__":
     from argparse import ArgumentParser
@@ -190,8 +210,4 @@ if __name__ == "__main__":
     args = parser.parse_args()

     os.environ["GRADIO_TEMP_DIR"] = args.cache_dir
-    os.makedirs(args.cache_dir, exist_ok=True)
-
-    # NOTE: Do NOT instantiate heavy models here.
-    # They will be created lazily inside predict() via _ensure_models(...).
  _launch_demo(args)
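
The key pattern in this commit is _get_models(): the heavy, non-picklable objects (StepAudio2, Token2wav) are built lazily inside the worker process and cached in module globals, while the @spaces.GPU entry point predict() only ever receives plain strings, lists and dicts. Below is a minimal, self-contained sketch of that pattern, with a hypothetical HeavyAsset class standing in for the real models (not part of this repo):

# Sketch of the lazy per-process cache pattern used by _get_models() above.
# HeavyAsset is a hypothetical stand-in for StepAudio2 / Token2wav.
_ASSET = None

class HeavyAsset:
    def __init__(self, path: str):
        self.path = path  # imagine large weights being loaded here

    def run(self, prompt: str) -> str:
        return f"[{self.path}] {prompt}"

def _get_asset(path: str = "Step-Audio-2-mini") -> HeavyAsset:
    """Build the heavy object once per process, then reuse the cached instance."""
    global _ASSET
    if _ASSET is None:
        _ASSET = HeavyAsset(path)  # constructed where it is used, never pickled
    return _ASSET

def handler(text: str) -> str:
    # In the Space this function would carry @spaces.GPU; it accepts only
    # picklable arguments and resolves the heavy object internally.
    return _get_asset().run(text)

if __name__ == "__main__":
    print(handler("hello"))        # first call builds the asset
    print(handler("hello again"))  # later calls reuse the cached instance

The same contract explains why on_submit and on_regenerate now read args.prompt_wav, args.cache_dir and args.model_path from the enclosing closure instead of routing them through Gradio inputs=: only the chat lists and plain strings ever reach predict(). The picklability requirement stated in predict()'s docstring can also be checked directly; the illustrative snippet below shows that plain state serializes, while a non-picklable resource (an open file handle here, standing in for a loaded model kept out of the arguments) does not:

import pickle

# State of the kind passed into predict(): strings, lists, dicts -> picklable.
state = {
    "history": [{"role": "system", "content": "..."}],
    "prompt_wav": "assets/default_female.wav",
    "model_path": "Step-Audio-2-mini",
}
pickle.dumps(state)  # succeeds

# An open file handle stands in for a resource that cannot cross the
# process boundary; keeping such objects out of the event arguments is
# exactly what the lazy per-process cache above is for.
handle = open(__file__)
try:
    pickle.dumps(handle)
except TypeError as exc:
    print("not picklable:", exc)
finally:
    handle.close()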