martin committed
Commit b007bca · 1 Parent(s): 930f36f

update app
.gitattributes CHANGED
@@ -5,3 +5,8 @@ assets/assistant.png filter=lfs diff=lfs merge=lfs -text
  speakers/闫雨婷_prompt.wav filter=lfs diff=lfs merge=lfs -text
  speakers/闫雨婷RAP_prompt.wav filter=lfs diff=lfs merge=lfs -text
  speakers/闫雨婷VOCAL_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ speakers/Tingting_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ speakers/TingtingRAP_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ speakers/TingtingVOCAL_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ assets/yuewen.jpeg filter=lfs diff=lfs merge=lfs -text
+ assets/request_rap_zh.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,61 +1,97 @@
  import gradio as gr
  import time
  from pathlib import Path
- import torchaudio
- from stepaudio import StepAudio

- from funasr import AutoModel
- from funasr.utils.postprocess_utils import rich_transcription_postprocess

  CACHE_DIR = "/tmp/gradio/"
- system_promtp = {"role": "system", "content": "适配用户的语言,用简短口语化的文字回答"}


- class CustomAsr:
-     def __init__(self, model_name="iic/SenseVoiceSmall", device="cuda"):
-         self.model = AutoModel(
-             model=model_name,
-             vad_model="fsmn-vad",
-             vad_kwargs={"max_single_segment_time": 30000},
-             device=device,
-         )

-     def run(self, audio_path):
-         res = self.model.generate(
-             input=audio_path,
-             cache={},
-             language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
-             use_itn=True,
-             batch_size_s=60,
-             merge_vad=True,  #
-             merge_length_s=15,
-         )
-         text = rich_transcription_postprocess(res[0]["text"])
-         return text


  def add_message(chatbot, history, mic, text):
      if not mic and not text:
          return chatbot, history, "Input is empty"

      if text:
          chatbot.append({"role": "user", "content": text})
-         history.append({"role": "user", "content": text})
-     elif mic and Path(mic).exists():
          chatbot.append({"role": "user", "content": {"path": mic}})
-         history.append({"role": "user", "content": {"type": "audio", "audio": mic}})

-     print(f"{history=}")
      return chatbot, history, None


- def reset_state():
-     """Reset the chat history."""
-     return [], [system_promtp]
-
-
  def save_tmp_audio(audio, sr):
      import tempfile

      with tempfile.NamedTemporaryFile(
          dir=CACHE_DIR, delete=False, suffix=".wav"
@@ -66,90 +102,176 @@ def save_tmp_audio(audio, sr):
      return temp_audio.name


- def predict(chatbot, history, audio_model, asr_model):
      """Generate a response from the model."""
      try:
-         is_input_audio = False
-         user_audio_path = None
-         # check whether the user input is audio or text
-         if isinstance(history[-1]["content"], dict):
-             is_input_audio = True
-             user_audio_path = history[-1]["content"]["audio"]
-         text, audio, sr = audio_model(history, "闫雨婷")
          print(f"predict {text=}")
          audio_path = save_tmp_audio(audio, sr)
-         # cache the ASR text of the user audio to speed up the next inference
-         if is_input_audio:
-             asr_text = asr_model.run(user_audio_path)
-             chatbot.append({"role": "user", "content": asr_text})
-             history[-1]["content"] = asr_text
-             print(f"{asr_text=}")
-         chatbot.append({"role": "assistant", "content": {"path": audio_path}})
          chatbot.append({"role": "assistant", "content": text})
          history.append({"role": "assistant", "content": text})
      except Exception as e:
-         print(e)
-         gr.Warning(f"Some error happend, retry submit")
      return chatbot, history


- def _launch_demo(args, audio_model, asr_model):
-     with gr.Blocks(delete_cache=(86400, 86400)) as demo:
          gr.Markdown("""<center><font size=8>Step Audio Chat</center>""")
          chatbot = gr.Chatbot(
              elem_id="chatbot",
              avatar_images=["assets/user.png", "assets/assistant.png"],
              min_height=800,
              type="messages",
          )
-         # keep the chat history in state so the message format does not have to be rebuilt every turn
-         history = gr.State([system_promtp])
-         mic = gr.Audio(type="filepath")
-         text = gr.Textbox(placeholder="Enter message ...")
-
          with gr.Row():
-             clean_btn = gr.Button("🧹 Clear History (清除历史)")
              regen_btn = gr.Button("🤔️ Regenerate (重试)")
-             submit_btn = gr.Button("🚀 Submit")

-         def on_submit(chatbot, history, mic, text):
-             chatbot, history, error = add_message(
-                 chatbot, history, mic, text
              )
              if error:
-                 gr.Warning(error)  # show a warning message
                  return chatbot, history, None, None
              else:
-                 chatbot, history = predict(chatbot, history, audio_model, asr_model)
                  return chatbot, history, None, None

-         submit_btn.click(
              fn=on_submit,
-             inputs=[chatbot, history, mic, text],
              outputs=[chatbot, history, mic, text],
-             concurrency_limit=4,
-             concurrency_id="gpu_queue",
          )
-         clean_btn.click(
-             reset_state,
              outputs=[chatbot, history],
              show_progress=True,
          )

-         def regenerate(chatbot, history):
              while chatbot and chatbot[-1]["role"] == "assistant":
                  chatbot.pop()
              while history and history[-1]["role"] == "assistant":
                  print(f"discard {history[-1]}")
                  history.pop()
-             return predict(chatbot, history, audio_model, asr_model)

          regen_btn.click(
              regenerate,
-             [chatbot, history],
              [chatbot, history],
              show_progress=True,
-             concurrency_id="gpu_queue",
          )

          demo.queue().launch(
@@ -164,7 +286,7 @@ if __name__ == "__main__":
      import os

      parser = ArgumentParser()
-     parser.add_argument("--model-path", type=str, required=True, help="Model path.")
      parser.add_argument(
          "--server-port", type=int, default=7860, help="Demo server port."
      )
@@ -172,11 +294,10 @@ if __name__ == "__main__":
          "--server-name", type=str, default="0.0.0.0", help="Demo server name."
      )
      args = parser.parse_args()
-
-     audio_model = StepAudio(
-         tokenizer_path=os.path.join(args.model_path, "Step-Audio-Tokenizer"),
-         tts_path=os.path.join(args.model_path, "Step-Audio-TTS-3B"),
-         llm_path=os.path.join(args.model_path, "Step-Audio-Chat"),
      )
-     asr_model = CustomAsr()
-     _launch_demo(args, audio_model, asr_model)
 
+ import base64
+ from copy import deepcopy
  import gradio as gr
  import time
  from pathlib import Path

+ from tokenizer import StepAudioTokenizer
+ from tts import StepAudioTTS
+ from yuewen_api import call_audiochat, call_asr

  CACHE_DIR = "/tmp/gradio/"
+ CACHE_CLEAN_AGE = 864000

+ CHINESE_PROMPT_CONTENT = """你是一个为对话而设计的人工智能模型,目前无法连接到互联网。
+ 当你需要唱歌或说唱时,请以(RAP)开头。当你需要快速说话时,请以(快速)开头。当你需要慢速说话时,请以(慢速)开头。
+ 现在,你需要倾听用户的语音内容,并以礼貌、简洁、口语化的文本进行回复。你需要尽量用户的语种进行回复。"""

+ ENGLISH_PROMPT_CONTENT = """You are an AI designed for conversation, currently unable to connect to the internet.
+ when you need to sing or rap, start your response with (RAP). when you need to speak fast, you start your response with (fast). when you need to speak fast, you start your response with (slow)
+ Now, you need to listen to the user's voice content and respond with politely, concise, conversational text. Respond in accordance with the user's language."""

+ PROMPT_TEMPLATE = {
+     "English Prompt": ENGLISH_PROMPT_CONTENT,
+     "Chinese Prompt": CHINESE_PROMPT_CONTENT,
+ }
+
+
+ def make_system_prompt(prompt):
+     return {"role": "system", "content": prompt}
+
+
+ BASE_DIR = Path(__file__).resolve().parent
+ CHAT_EXAMPLES = [
+     ["Input audio only", None, f"{BASE_DIR}/assets/tell_me_a_short_story_EN.wav"],
+     [
+         "Input text only",
+         "What did the speaker mean when they said, it's raining cats and dogs?",
+         None,
+     ],
+     [
+         "Text and audio mixed input",
+         "Answer the following query in English",
+         f"{BASE_DIR}/assets/request_rap_ZH.wav",
+     ],
+ ]
+
+
+ def file_to_base64(filename):
+     with open(filename, "rb") as f:
+         file_content = f.read()
+     return base64.b64encode(file_content).decode("utf-8")
+
+
+ def get_audio_format(audio_filename):
+     extension = Path(audio_filename).suffix.lower()
+     if extension == ".mp3":
+         return "mp3"
+     elif extension == ".wav":
+         return "wav"
+     else:
+         return "other"


  def add_message(chatbot, history, mic, text):
      if not mic and not text:
          return chatbot, history, "Input is empty"

+     content = []
+
      if text:
          chatbot.append({"role": "user", "content": text})
+         content.append({"type": "text", "text": text})
+         print(f"add message {text=}")
+
+     if mic and Path(mic).exists():
          chatbot.append({"role": "user", "content": {"path": mic}})
+         content.append(
+             {
+                 "type": "input_audio",
+                 "input_audio": {
+                     "data": file_to_base64(mic),
+                     "format": get_audio_format(mic),
+                 },
+             }
+         )
+         print(f"add message {mic=}")

+     history.append({"role": "user", "content": content})
      return chatbot, history, None


  def save_tmp_audio(audio, sr):
      import tempfile
+     import torchaudio

      with tempfile.NamedTemporaryFile(
          dir=CACHE_DIR, delete=False, suffix=".wav"

      return temp_audio.name


+ def predict(chatbot, history, tts_model, user_prompt, enable_asr):
      """Generate a response from the model."""
+     start_time = time.time()
      try:
+         messages = [make_system_prompt(user_prompt)] + history
+         if enable_asr:
+             asr_text = None
+             if not isinstance(chatbot[-1]["content"], str):
+                 user_audio_path = chatbot[-1]["content"]["path"]
+                 print(f"do asr {user_audio_path=}")
+                 asr_text = call_asr(user_audio_path)
+                 print(f"{asr_text=}")
+             if asr_text:
+                 last_input = deepcopy(history[-1])
+                 last_input["content"].append({"type": "text", "text": asr_text})
+                 messages = (
+                     [make_system_prompt(user_prompt)] + history[:-1] + [last_input]
+                 )
+
+         text = call_audiochat(messages)
          print(f"predict {text=}")
+         audio, sr = tts_model(text, "Tingting")
          audio_path = save_tmp_audio(audio, sr)
+         print(f"save_tmp_audio {audio_path=}")
          chatbot.append({"role": "assistant", "content": text})
+         chatbot.append({"role": "assistant", "content": {"path": audio_path}})
          history.append({"role": "assistant", "content": text})
      except Exception as e:
+         import traceback
+
+         traceback.print_exc()
+         print(f"get a excption: {e=}")
+         gr.Warning(f"get a excption: {str(e)}, try regenerate")
+     print(f"predict cost {(time.time() - start_time):.2f}s")
      return chatbot, history


+ def _launch_demo(args, tts_model):
+     with gr.Blocks(delete_cache=(86400, CACHE_CLEAN_AGE)) as demo:
+         # keep the chat history in state so the message format does not have to be rebuilt every turn
+         history = gr.State([])
          gr.Markdown("""<center><font size=8>Step Audio Chat</center>""")
+         with gr.Row():
+             with gr.Column(scale=3):
+                 gr.Markdown(
+                     """<font size=4>This preview demonstrates core functionalities. To unlock the complete real-time voice conversation system with end-to-end encryption and advanced features, download the Yuewen APP.</font>"""
+                 )
+             with gr.Column(scale=1):
+                 gr.Image(
+                     height=180,
+                     width=180,
+                     value="assets/yuewen.jpeg",
+                     interactive=False,
+                     show_label=False,
+                     show_download_button=False,
+                     show_fullscreen_button=False,
+                 )
+         with gr.Accordion(
+             label="The performance of English prompts is not as stable as that of Chinese prompts. You can click here to change sys prompt.", open=False
+         ):
+             prompt_choice = gr.Radio(
+                 choices=list(PROMPT_TEMPLATE.keys()),
+                 value=list(PROMPT_TEMPLATE.keys())[0],
+                 label="Select prompt template",
+             )
+             user_prompt = gr.Textbox(
+                 show_label=False,
+                 value=list(PROMPT_TEMPLATE.values())[0],
+                 lines=6,
+             )
+             prompt_choice.change(
+                 fn=lambda choice: PROMPT_TEMPLATE[choice],
+                 inputs=prompt_choice,
+                 outputs=user_prompt,
+             )
          chatbot = gr.Chatbot(
              elem_id="chatbot",
              avatar_images=["assets/user.png", "assets/assistant.png"],
              min_height=800,
              type="messages",
+             show_share_button=True,
+         )
+         mic = gr.Audio(
+             label="Only MP3 and WAV formats are supported for audio uploads.",
+             sources=["microphone", "upload"],
+             type="filepath",
+         )
+         text = gr.Textbox(
+             placeholder="Enter message ...",
+             label="Input text message",
+             show_label=False,
          )
          with gr.Row():
+             enable_asr = gr.Checkbox(
+                 value=True,
+                 label="Enhance understanding capability by ASR",
+             )
+         with gr.Row():
+             clear_btn = gr.Button("🧹 Clear History (清除历史)")
              regen_btn = gr.Button("🤔️ Regenerate (重试)")
+             gen_btn = gr.Button("🚀 Generate Response")

+         example_comment = gr.Textbox(label="Input data type", visible=False)
+         example_text = gr.Textbox(label="Input text message", visible=False)
+         example_audio = gr.Audio(
+             label="Input Audio",
+             type="filepath",
+             visible=False,
+         )
+
+         def update_examples(_, text, mic, user_prompt, enable_asr):
+             chatbot = []
+             history = []
+             chatbot, history, error = add_message(chatbot, history, mic, text)
+             if error:
+                 gr.Warning(error)
+                 print(f"update_examples error")
+                 return chatbot, history
+             else:
+                 chatbot, history = predict(chatbot, history, tts_model, user_prompt, enable_asr)
+                 print(f"update_examples done")
+                 return chatbot, history
+
+         with gr.Row():
+             gr.Examples(
+                 fn=update_examples,
+                 examples=CHAT_EXAMPLES,
+                 inputs=[example_comment, example_text, example_audio, user_prompt, enable_asr],
+                 outputs=[chatbot, history],
+                 run_on_click=True,
              )
+
+         def on_submit(chatbot, history, mic, text, user_prompt, enable_asr):
+             chatbot, history, error = add_message(chatbot, history, mic, text)
              if error:
+                 gr.Warning(error)
                  return chatbot, history, None, None
              else:
+                 chatbot, history = predict(chatbot, history, tts_model, user_prompt, enable_asr)
                  return chatbot, history, None, None

+         gen_btn.click(
              fn=on_submit,
+             inputs=[chatbot, history, mic, text, user_prompt, enable_asr],
              outputs=[chatbot, history, mic, text],
+             show_progress=True,
          )
+
+         def clear_chat_history():
+             return [], []
+
+         clear_btn.click(
+             clear_chat_history,
              outputs=[chatbot, history],
              show_progress=True,
          )

+         def regenerate(chatbot, history, user_prompt, enable_asr):
              while chatbot and chatbot[-1]["role"] == "assistant":
                  chatbot.pop()
              while history and history[-1]["role"] == "assistant":
                  print(f"discard {history[-1]}")
                  history.pop()
+             return predict(chatbot, history, tts_model, user_prompt, enable_asr)

          regen_btn.click(
              regenerate,
+             [chatbot, history, user_prompt, enable_asr],
              [chatbot, history],
              show_progress=True,
          )

          demo.queue().launch(

      import os

      parser = ArgumentParser()
+     parser.add_argument("--model-path", type=str, help="Tokenizer and TTS model path.")
      parser.add_argument(
          "--server-port", type=int, default=7860, help="Demo server port."
      )

          "--server-name", type=str, default="0.0.0.0", help="Demo server name."
      )
      args = parser.parse_args()
+     tokenizer = StepAudioTokenizer(
+         os.path.join(args.model_path, "Step-Audio-Tokenizer")
+     )
+     tts_model = StepAudioTTS(
+         os.path.join(args.model_path, "Step-Audio-TTS-3B"), tokenizer
      )
+ _launch_demo(args, tts_model)
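
For reference, a minimal sketch of the message structure the updated app.py now builds: add_message() collects the user turn as a list of typed parts (text plus base64-encoded input_audio), and predict() prepends make_system_prompt(user_prompt) before calling call_audiochat() and voicing the reply with tts_model(text, "Tingting"). The snippet below is illustrative only and not part of the commit; "example.wav" is a placeholder path.

# Illustrative only: shape of the user turn appended to `history` by add_message().
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Answer the following query in English"},
        {
            "type": "input_audio",
            "input_audio": {
                "data": "<base64 string from file_to_base64('example.wav')>",
                "format": "wav",  # from get_audio_format('example.wav')
            },
        },
    ],
}
# predict() then sends [make_system_prompt(user_prompt), user_turn] to call_audiochat().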
 
assets/request_rap_ZH.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f4507403248696260e6afb51459a39b6f8f413f2dd9f3f12568547928494c04
+ size 229476
assets/tell_me_a_short_story_EN.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66328ecbf0b612977e4ea7661c5b7707e7183d7500e1482fc7b6e012604903a5
+ size 80144
assets/yuewen.jpeg ADDED

Git LFS Details

  • SHA256: a10d8611679dcf7c1341a46b4bb041927ef4832c20155385d653bc644e6b2f4c
  • Pointer size: 130 Bytes
  • Size of remote file: 57.7 kB
speakers/{闫雨婷RAP_prompt.wav → TingtingRAP_prompt.wav} RENAMED
File without changes
speakers/{闫雨婷VOCAL_prompt.wav → TingtingVOCAL_prompt.wav} RENAMED
File without changes
speakers/{闫雨婷_prompt.wav → Tingting_prompt.wav} RENAMED
File without changes
speakers/speakers_info.json CHANGED
@@ -1,5 +1,5 @@
  {
-     "闫雨婷RAP": "(RAP)远远甩开的笑他是陆行龟 他曾跌倒也曾吃过灰 他说有福的人才会多吃亏 他的爸爸让他小心交友可他偏偏钻进个垃圾堆 他说他明白How to play",
-     "闫雨婷VOCAL": "(哼唱)你从一座叫 我 的小镇经过 刚好屋顶的雪化成雨飘落",
-     "闫雨婷": "那等我们到海洋馆之后,给妈妈买个礼物,好不好呀?"
+     "TingtingRAP": "(RAP)远远甩开的笑他是陆行龟 他曾跌倒也曾吃过灰 他说有福的人才会多吃亏 他的爸爸让他小心交友可他偏偏钻进个垃圾堆 他说他明白How to play",
+     "TingtingVOCAL": "(哼唱)你从一座叫 我 的小镇经过 刚好屋顶的雪化成雨飘落",
+     "Tingting": "那等我们到海洋馆之后,给妈妈买个礼物,好不好呀?"
  }
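
The rename keeps each speaker key aligned with its prompt file name (speakers/&lt;name&gt;_prompt.wav). A small illustrative loader, not part of the commit, showing how the renamed entries can be enumerated:

# Illustrative only: enumerate the renamed speaker prompts committed above.
import json
from pathlib import Path

speakers_dir = Path("speakers")
with open(speakers_dir / "speakers_info.json", encoding="utf-8") as f:
    speakers = json.load(f)  # keys: Tingting, TingtingRAP, TingtingVOCAL

for name, prompt_text in speakers.items():
    prompt_wav = speakers_dir / f"{name}_prompt.wav"  # e.g. speakers/Tingting_prompt.wav
    print(name, prompt_wav.exists(), prompt_text)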
start_app.sh CHANGED
@@ -18,7 +18,6 @@ BASE_REPO_URL="https://${HF_USER_NAME}:${HF_USER_TOKEN}@huggingface.co/stepfun-a
  REPOSITORIES=(
      "Step-Audio-Tokenizer"
      "Step-Audio-TTS-3B"
-     "Step-Audio-Chat"
  )

  # Directory where the repositories are stored locally; defaults to the current directory
@@ -55,5 +54,5 @@ for repo in "${REPOSITORIES[@]}"; do
  done

  echo "所有仓库已成功下载!"
-
+ export LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cuda_nvrtc/lib:$LD_LIBRARY_PATH
  python app.py --model $LOCAL_DIR
stepaudio.py DELETED
@@ -1,95 +0,0 @@
- import torch
- import torchaudio
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- from tokenizer import StepAudioTokenizer
- from tts import StepAudioTTS
- from utils import load_audio, speech_adjust, volumn_adjust
-
-
- class StepAudio:
-     def __init__(self, tokenizer_path: str, tts_path: str, llm_path: str):
-         self.llm_tokenizer = AutoTokenizer.from_pretrained(
-             llm_path, trust_remote_code=True
-         )
-         self.encoder = StepAudioTokenizer(tokenizer_path)
-         self.decoder = StepAudioTTS(tts_path, self.encoder)
-         self.llm = AutoModelForCausalLM.from_pretrained(
-             llm_path,
-             torch_dtype=torch.bfloat16,
-             device_map="auto",
-             trust_remote_code=True,
-         )
-
-     def __call__(
-         self,
-         messages: list,
-         speaker_id: str,
-         speed_ratio: float = 1.0,
-         volumn_ratio: float = 1.0,
-     ):
-         text_with_audio = self.apply_chat_template(messages)
-         token_ids = self.llm_tokenizer.encode(text_with_audio, return_tensors="pt")
-         outputs = self.llm.generate(
-             token_ids, max_new_tokens=2048, temperature=0.7, top_p=0.9, do_sample=True
-         )
-         output_token_ids = outputs[:, token_ids.shape[-1] : -1].tolist()[0]
-         output_text = self.llm_tokenizer.decode(output_token_ids)
-         output_audio, sr = self.decoder(output_text, speaker_id)
-         if speed_ratio != 1.0:
-             output_audio = speech_adjust(output_audio, sr, speed_ratio)
-         if volumn_ratio != 1.0:
-             output_audio = volumn_adjust(output_audio, volumn_ratio)
-         return output_text, output_audio, sr
-
-     def encode_audio(self, audio_path):
-         audio_wav, sr = load_audio(audio_path)
-         audio_tokens = self.encoder(audio_wav, sr)
-         return audio_tokens
-
-     def apply_chat_template(self, messages: list):
-         text_with_audio = ""
-         for msg in messages:
-             role = msg["role"]
-             content = msg["content"]
-             if role == "user":
-                 role = "human"
-             if isinstance(content, str):
-                 text_with_audio += f"<|BOT|>{role}\n{content}<|EOT|>"
-             elif isinstance(content, dict):
-                 if content["type"] == "text":
-                     text_with_audio += f"<|BOT|>{role}\n{content['text']}<|EOT|>"
-                 elif content["type"] == "audio":
-                     audio_tokens = self.encode_audio(content["audio"])
-                     text_with_audio += f"<|BOT|>{role}\n{audio_tokens}<|EOT|>"
-             elif content is None:
-                 text_with_audio += f"<|BOT|>{role}\n"
-             else:
-                 raise ValueError(f"Unsupported content type: {type(content)}")
-         if not text_with_audio.endswith("<|BOT|>assistant\n"):
-             text_with_audio += "<|BOT|>assistant\n"
-         return text_with_audio
-
-
- if __name__ == "__main__":
-     model = StepAudio(
-         encoder_path="/mnt/ys-shai-jfs/open-step1o-audio/step1o-audio-encoder",
-         decoder_path="/mnt/ys-shai-jfs/open-step1o-audio/step1o-audio-decoder",
-         llm_path="/mnt/ys-shai-jfs/open-step1o-audio/step1o-audio-v18",
-     )
-
-     text, audio, sr = model(
-         [{"role": "user", "content": "你好,我是你的朋友,我叫小明,你叫什么名字?"}],
-         "闫雨婷",
-     )
-     torchaudio.save("output/output_e2e_tqta.wav", audio, sr)
-     text, audio, sr = model(
-         [
-             {
-                 "role": "user",
-                 "content": {"type": "audio", "audio": "output/output_e2e_tqta.wav"},
-             }
-         ],
-         "闫雨婷",
-     )
-     torchaudio.save("output/output_e2e_aqta.wav", audio, sr)
tts.py CHANGED
@@ -37,6 +37,19 @@ class StepAudioTTS:
          model_path,
          encoder,
      ):
+         # load optimus_ths for flash attention, make sure LD_LIBRARY_PATH has `nvidia/cuda_nvrtc/lib`
+         # if not, please manually set LD_LIBRARY_PATH=xxx/python3.10/site-packages/nvidia/cuda_nvrtc/lib
+         try:
+             if torch.__version__ >= "2.5":
+                 torch.ops.load_library(os.path.join(model_path, 'lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so'))
+             elif torch.__version__ >= "2.3":
+                 torch.ops.load_library(os.path.join(model_path, 'lib/liboptimus_ths-torch2.3-cu121.cpython-310-x86_64-linux-gnu.so'))
+             elif torch.__version__ >= "2.2":
+                 torch.ops.load_library(os.path.join(model_path, 'lib/liboptimus_ths-torch2.2-cu121.cpython-310-x86_64-linux-gnu.so'))
+             print("Load optimus_ths successfully and flash attn would be enabled")
+         except Exception as err:
+             print(f"Fail to load optimus_ths and flash attn is disabled: {err}")
+
          self.llm = AutoModelForCausalLM.from_pretrained(
              model_path,
  torch_dtype=torch.bfloat16,
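
The comment above expects nvidia/cuda_nvrtc/lib on LD_LIBRARY_PATH, and start_app.sh hard-codes one dist-packages location. A hedged helper for other environments, assuming the pip wheel layout nvidia/cuda_nvrtc/lib under site-packages (illustrative, not part of the commit):

# Illustrative only: locate nvidia/cuda_nvrtc/lib in the current environment and
# print the export line to run before `python app.py`; prints a hint if absent.
import os
import sysconfig

lib_dir = os.path.join(sysconfig.get_paths()["purelib"], "nvidia", "cuda_nvrtc", "lib")
if os.path.isdir(lib_dir):
    print(f"export LD_LIBRARY_PATH={lib_dir}:$LD_LIBRARY_PATH")
else:
    print("nvidia/cuda_nvrtc/lib not found; set LD_LIBRARY_PATH manually")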
yuewen_api.py ADDED
@@ -0,0 +1,54 @@
+ from openai import OpenAI, APIStatusError
+ import os
+
+ AUDIO_CHAT_MODEL = os.getenv("STEP_AUDIO_CHAT", "step-1o-audio")
+ TTS_MODEL = os.getenv("STEP_AUDIO_TTS", "step-tts-mini")
+ ASR_MODEL = os.getenv("STEP_AUDIO_ASR", "step-asr")
+ STEP_BASE_URL = os.getenv("STEP_BASE_URL", "https://api.stepfun.com/v1")
+
+ client = OpenAI(
+     base_url=STEP_BASE_URL,
+ )
+
+
+ def call_audiochat(messages):
+     try:
+         completion = client.chat.completions.create(
+             model=AUDIO_CHAT_MODEL,
+             messages=messages,
+             presence_penalty=1,
+         )
+         return completion.choices[0].message.content
+     except APIStatusError as e:
+         print(e)
+         raise RuntimeError(e)
+     except Exception as e:
+         raise e
+
+
+ def call_tts(text, audio_path, voice="qinqienvsheng"):
+     response = client.audio.speech.create(model=TTS_MODEL, voice=voice, input=text)
+     response.stream_to_file(audio_path)
+     return True
+
+
+ def call_asr(audio_path):
+     with open(audio_path, "rb") as audio_file:
+         response = client.audio.transcriptions.create(
+             model=ASR_MODEL,
+             file=audio_file,
+             response_format="json",
+         )
+     return response.text
+
+ if __name__ == "__main__":
+     messages = [{"role": "user", "content": "介绍下你自己"}]
+     res = call_audiochat(messages)
+     print("call audiochat: ", res)
+
+     audio_path = "test.mp3"
+     text = "hello, 阶跃"
+     call_tts(text, audio_path)
+
+     res = call_asr(audio_path)
+ print("call asr:", res)
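
Note that OpenAI(base_url=STEP_BASE_URL) picks up its key from the OPENAI_API_KEY environment variable by default, so that variable must be set before launching the demo. A hedged variant that passes a StepFun key explicitly; the STEP_API_KEY name is an assumption, not part of this commit:

# Illustrative only: build the client with an explicit key instead of relying on
# the OPENAI_API_KEY environment variable; STEP_API_KEY is a hypothetical name.
import os
from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("STEP_BASE_URL", "https://api.stepfun.com/v1"),
    api_key=os.environ["STEP_API_KEY"],  # assumed to be exported before launch
)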