import os
import shlex
import subprocess

# install requirements at startup (Hugging Face Spaces pattern)
subprocess.run(shlex.split("pip install -r requirements.txt"), check=True)

# export the HF token only when it is set; os.environ rejects None values
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
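# gradio and spaces are imported only below, after the pip install above,
# presumably so the just-installed requirements are importable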

import tempfile
import traceback
from pathlib import Path
import spaces
import gradio as gr

def save_tmp_audio(audio, cache_dir):
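    # write raw wav bytes to a named temp file in the cache dir (delete=False
    # keeps it on disk so Gradio can serve it later) and return its path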
    with tempfile.NamedTemporaryFile(
        dir=cache_dir, delete=False, suffix=".wav"
    ) as temp_audio:
        temp_audio.write(audio)
    return temp_audio.name

def add_message(chatbot, history, mic, text):
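    # two parallel transcripts are kept: `chatbot` in Gradio "messages" format
    # for display, and `history` in the model-side format (role "human",
    # typed content items) that is fed to the audio model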
    if not mic and not text:
        return chatbot, history, "Input is empty"

    if text:
        chatbot.append({"role": "user", "content": text})
        history.append({"role": "human", "content": text})
    elif mic and Path(mic).exists():
        chatbot.append({"role": "user", "content": {"path": mic}})
        history.append({"role": "human", "content": [{"type":"audio", "audio": mic}]})

    print(f"{history=}")
    return chatbot, history, None

def reset_state(system_prompt):
    return [], [{"role": "system", "content": system_prompt}]

@spaces.GPU
def predict(chatbot, history, audio_model, token2wav, prompt_wav, cache_dir):
    try:
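        # seed an unfinished (eot=False) assistant turn beginning with
        # "<tts_start>" so the model completes it with text and audio tokens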
        history.append({"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"}], "eot": False})
        tokens, text, audio = audio_model(history, max_new_tokens=4096, temperature=0.7, repetition_penalty=1.05, do_sample=True)
        print(f"predict {text=}")
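        # decode the generated audio tokens to a waveform, using prompt_wav as
        # the reference voice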
        audio = token2wav(audio, prompt_wav)
        audio_path = save_tmp_audio(audio, cache_dir)
        chatbot.append({"role": "assistant", "content": {"path": audio_path}})
        history[-1]["content"].append({"type": "token", "token": tokens})
        history[-1]["eot"] = True
    except Exception:
        print(traceback.format_exc())
        gr.Warning("Some error happened, please try again.")
    return chatbot, history

def _launch_demo(args, audio_model, token2wav):
    with gr.Blocks(delete_cache=(86400, 86400)) as demo:
        gr.Markdown("""<center><font size=8>Step Audio 2 Demo</font></center>""")
        with gr.Row():
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="你的名字叫做小跃,是由阶跃星辰公司训练出来的语音大模型。\n你情感细腻,观察能力强,擅长分析用户的内容,并作出善解人意的回复,说话的过程中时刻注意用户的感受,富有同理心,提供多样的情绪价值。\n今天是2025年8月29日,星期五\n请用默认女声与用户交流。",
                lines=2
            )
        chatbot = gr.Chatbot(
            elem_id="chatbot",
            #avatar_images=["assets/user.png", "assets/assistant.png"],
            min_height=800,
            type="messages",
        )
        history = gr.State([{"role": "system", "content": system_prompt.value}])
        mic = gr.Audio(type="filepath")
        text = gr.Textbox(placeholder="Enter message ...")

        with gr.Row():
            clean_btn = gr.Button("🧹 Clear History (清除历史)")
            regen_btn = gr.Button("🤔️ Regenerate (重试)")
            submit_btn = gr.Button("🚀 Submit")

        def on_submit(chatbot, history, mic, text):
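            # append the user turn; on validation error show a warning and keep
            # the current state, otherwise run the model, then clear the inputs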
            chatbot, history, error = add_message(
                chatbot, history, mic, text
            )
            if error:
                gr.Warning(error)  # surface the validation message as a warning
                return chatbot, history, None, None
            else:
                chatbot, history = predict(chatbot, history, audio_model, token2wav, args.prompt_wav, args.cache_dir)
                return chatbot, history, None, None

        submit_btn.click(
            fn=on_submit,
            inputs=[chatbot, history, mic, text],
            outputs=[chatbot, history, mic, text],
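            # this handler and regenerate share the "gpu_queue" concurrency
            # group, so their GPU jobs queue against the same limit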
            concurrency_limit=4,
            concurrency_id="gpu_queue",
        )

        clean_btn.click(
            fn=reset_state,
            inputs=[system_prompt],
            outputs=[chatbot, history],
            #show_progress=True,
        )

        def regenerate(chatbot, history):
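            # drop the trailing assistant turn(s) from both transcripts and
            # generate a fresh reply from the remaining history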
            while chatbot and chatbot[-1]["role"] == "assistant":
                chatbot.pop()
            while history and history[-1]["role"] == "assistant":
                print(f"discard {history[-1]}")
                history.pop()
            return predict(chatbot, history, audio_model, token2wav, args.prompt_wav, args.cache_dir)

        regen_btn.click(
            regenerate,
            [chatbot, history],
            [chatbot, history],
            #show_progress=True,
            concurrency_id="gpu_queue",
        )

    demo.queue().launch(
        server_port=args.server_port,
        server_name=args.server_name,
    )


if __name__ == "__main__":
    from argparse import ArgumentParser

    from stepaudio2 import StepAudio2
    from token2wav import Token2wav

    parser = ArgumentParser()
    parser.add_argument("--model-path", type=str, default='Step-Audio-2-mini', help="Model path.")
    parser.add_argument(
        "--server-port", type=int, default=7860, help="Demo server port."
    )
    parser.add_argument(
        "--server-name", type=str, default="0.0.0.0", help="Demo server name."
    )
    parser.add_argument(
        "--prompt-wav", type=str, default="assets/default_female.wav", help="Prompt wave for the assistant."
    )
    parser.add_argument(
        "--cache-dir", type=str, default="/tmp/stepaudio2", help="Cache directory."
    )
    args = parser.parse_args()
    os.environ["GRADIO_TEMP_DIR"] = args.cache_dir

    audio_model = StepAudio2(args.model_path)
    token2wav = Token2wav(f"{args.model_path}/token2wav")
    _launch_demo(args, audio_model, token2wav)