	Update app.py
app.py CHANGED
@@ -1,5 +1,3 @@
-# app.py
-
 import os
 import shlex
 import subprocess
@@ -82,10 +80,12 @@ def _get_models(model_path: str):
 # Inference
 # -----------------------
 @spaces.GPU
-def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mini"):
+def predict(chatbot, history, prompt_wav_path, cache_dir, model_path="Step-Audio-2-mini"):
     """
     Run generation on GPU worker. All args must be picklable (strings, lists, dicts).
     Heavy models are created via _get_models() inside this process.
+
+    `prompt_wav_path` is the CURRENT reference audio to condition on (can be user upload).
     """
     try:
         audio_model, token2wav = _get_models(model_path)
@@ -101,12 +101,13 @@ def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mi
             max_new_tokens=4096,
             temperature=0.7,
             repetition_penalty=1.05,
-            do_sample=True
+            do_sample=True,
         )
         print(f"predict text={text!r}")
 
-        # Convert tokens -> waveform bytes using token2wav
-        audio_bytes = token2wav(audio_tokens, prompt_wav)
+        # Convert tokens -> waveform bytes using token2wav with the *selected* prompt
+        prompt_path = prompt_wav_path if (prompt_wav_path and Path(prompt_wav_path).exists()) else None
+        audio_bytes = token2wav(audio_tokens, prompt_path)
 
         # Persist to temp .wav for the UI
         audio_path = save_tmp_audio(audio_bytes, cache_dir)
@@ -118,7 +119,7 @@ def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mi
 
     except Exception:
         print(traceback.format_exc())
-        gr.Warning("Some error
+        gr.Warning("Some error happened, please try again.")
 
     return chatbot, history
 
@@ -152,6 +153,9 @@ def _launch_demo(args):
         # Initialize history with current system prompt value
         history = gr.State([{"role": "system", "content": system_prompt.value}])
 
+        # NEW: keep track of the *current* prompt wav path (defaults to bundled voice)
+        current_prompt_wav = gr.State(args.prompt_wav)
+
         mic = gr.Audio(type="filepath", label="🎤 Speak (optional)")
         text = gr.Textbox(placeholder="Enter message ...", label="💬 Text")
 
@@ -160,37 +164,47 @@ def _launch_demo(args):
             regen_btn = gr.Button("🌤️ Regenerate (重说)")
             submit_btn = gr.Button("🚀 Submit")
 
-        def on_submit(chatbot_val, history_val, mic_val, text_val):
+        def on_submit(chatbot_val, history_val, mic_val, text_val, current_prompt):
             chatbot2, history2, error = add_message(chatbot_val, history_val, mic_val, text_val)
             if error:
                 gr.Warning(error)
-                return chatbot2, history2, None, None
-
+                # keep state intact
+                return chatbot2, history2, None, None, current_prompt
+
+            # Choose prompt: prefer latest user mic if present, else stick to remembered prompt
+            prompt_path = mic_val if (mic_val and Path(mic_val).exists()) else current_prompt
+
             chatbot2, history2 = predict(
                 chatbot2, history2,
-                args.prompt_wav,
-                args.cache_dir
+                prompt_path,
+                args.cache_dir,
+                model_path=args.model_path,
             )
-            return chatbot2, history2, None, None
+
+            # Clear inputs; remember the prompt we actually used
+            new_prompt_state = prompt_path
+            return chatbot2, history2, None, None, new_prompt_state
 
         submit_btn.click(
             fn=on_submit,
-            inputs=[chatbot, history, mic, text],
-            outputs=[chatbot, history, mic, text],
+            inputs=[chatbot, history, mic, text, current_prompt_wav],
+            outputs=[chatbot, history, mic, text, current_prompt_wav],
             concurrency_limit=4,
             concurrency_id="gpu_queue",
         )
 
-        def on_clean(system_prompt_text):
-            return reset_state(system_prompt_text)
+        def on_clean(system_prompt_text, _default_prompt):
+            # Reset chat and also reset the remembered prompt back to default
+            new_chatbot, new_history = reset_state(system_prompt_text)
+            return new_chatbot, new_history, _default_prompt
 
         clean_btn.click(
             fn=on_clean,
-            inputs=[system_prompt],
-            outputs=[chatbot, history],
+            inputs=[system_prompt, current_prompt_wav],
+            outputs=[chatbot, history, current_prompt_wav],
        )
 
-        def on_regenerate(chatbot_val, history_val):
+        def on_regenerate(chatbot_val, history_val, current_prompt):
             # Drop last assistant turn(s) to regenerate
             while chatbot_val and chatbot_val[-1]["role"] == "assistant":
                 chatbot_val.pop()
@@ -199,13 +213,14 @@ def _launch_demo(args):
                 history_val.pop()
             return predict(
                 chatbot_val, history_val,
-                args.prompt_wav,
-                args.cache_dir
+                current_prompt,           # use the remembered prompt for regen
+                args.cache_dir,
+                model_path=args.model_path,
             )
 
         regen_btn.click(
             fn=on_regenerate,
-            inputs=[chatbot, history],
+            inputs=[chatbot, history, current_prompt_wav],
             outputs=[chatbot, history],
             concurrency_id="gpu_queue",
         )
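A note on the @spaces.GPU pattern above: on ZeroGPU Spaces the decorated predict() runs in a separate GPU worker process, which is why its arguments must stay picklable and why the heavy models are built inside the call via _get_models() rather than passed in. Below is a minimal sketch of that lazy, per-process cache; load_audio_model() and load_token2wav() are hypothetical stand-ins, since this diff does not show the real constructors.

import spaces

_MODEL_CACHE = {}

def _get_models(model_path: str):
    # Build the heavy models once per process, then reuse them; under
    # ZeroGPU each GPU worker keeps its own copy of this cache.
    if model_path not in _MODEL_CACHE:
        _MODEL_CACHE[model_path] = (
            load_audio_model(model_path),   # hypothetical constructor
            load_token2wav(model_path),     # hypothetical constructor
        )
    return _MODEL_CACHE[model_path]

@spaces.GPU  # runs the body in a short-lived GPU worker
def predict(chatbot, history, prompt_wav_path, cache_dir, model_path="Step-Audio-2-mini"):
    audio_model, token2wav = _get_models(model_path)  # created here, not pickled in
    ...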
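The current_prompt_wav wiring is standard gr.State round-tripping: the state is listed in inputs= so the handler receives its value, and the handler's corresponding return value in outputs= becomes the new state. A self-contained sketch of just that mechanic (names here are illustrative, not from the Space):

import gradio as gr

with gr.Blocks() as demo:
    # Per-session value that survives between events, like
    # current_prompt_wav in the diff.
    remembered = gr.State("default.wav")
    audio_in = gr.Audio(type="filepath", label="upload")
    status = gr.Textbox(label="status")

    def use_or_remember(new_path, current):
        chosen = new_path or current      # prefer a fresh upload
        return f"using {chosen}", chosen  # second return value updates the State

    audio_in.change(use_or_remember,
                    inputs=[audio_in, remembered],
                    outputs=[status, remembered])

demo.launch()

Note the asymmetry in regen_btn.click above: current_prompt_wav appears in inputs but not outputs, so regeneration reads the remembered prompt without changing it.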
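Both GPU-bound handlers keep concurrency_id="gpu_queue": Gradio pools events that share a concurrency_id under one worker limit, so Submit and Regenerate together stay within the concurrency_limit=4 set on the submit event. A minimal illustration of the shared queue (a toy example, not the Space's code):

import time
import gradio as gr

with gr.Blocks() as demo:
    box = gr.Textbox()
    a_btn = gr.Button("A")
    b_btn = gr.Button("B")

    def slow_echo(x):
        time.sleep(2)  # stand-in for a GPU-bound call
        return x

    # Events sharing a concurrency_id draw from one worker pool:
    # at most 4 slow_echo calls run at once across BOTH buttons.
    a_btn.click(slow_echo, box, box,
                concurrency_limit=4, concurrency_id="shared")
    b_btn.click(slow_echo, box, box, concurrency_id="shared")

demo.queue().launch()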
