# Gradio app
# A chatbot that supports audio input (the user can upload or record an audio file).
import os

import gradio as gr
from transformers import AutoModel

# Load the model once at startup. gr.NO_RELOAD keeps this block from being
# re-executed when the app is launched with `gradio app.py` hot reloading.
if gr.NO_RELOAD:
    model = AutoModel.from_pretrained(
        "DeSTA-ntu/DeSTA2-8B-beta",
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN"),  # read the Hugging Face token from the environment
    )
    model.to("cuda")
    model.eval()


def reset_chat(history, chatbot):
    # Keep only the system prompt, clear the chatbot, disable "Send",
    # and re-enable "Upload".
    history = [{"role": "system", "content": "Focus on the input audio. You are a helpful voice assistant."}]
    return (history, None, gr.update(interactive=False), gr.update(interactive=True))


def upload_audio(history, speech, text_box, chatbot, chat_button, upload_button):
    # `speech` is an audio filepath; it is appended to the history as an "audio" turn,
    # e.g. {"role": "audio", "content": "assets/audios/DialogueEmotionClassification_DailyTalk_0196_7_1_d756.wav"}
    if speech is None:
        gr.Warning("⚠️ Please upload an audio file first!", duration=5)
        return (history, speech, text_box, chatbot, chat_button, upload_button)
    history.append({"role": "audio", "content": speech})
    chatbot.append([f"Speech: \n\n{speech}", None])
    return (
        history,
        gr.update(interactive=True),                                 # speech box
        gr.update(interactive=True, placeholder="Start chatting!"),  # text_box
        chatbot,
        gr.update(interactive=True),                                 # chat_button
        gr.update(interactive=False),                                # upload_button
    )


def user_send_message(history, speech, text_box, chatbot):
    # Record the user's message and show it in the chatbot with a pending reply.
    history.append({"role": "user", "content": text_box})
    chatbot.append([f"{text_box}", None])
    return (
        history,
        speech,
        gr.update(interactive=True, placeholder="Start chatting!", value=""),  # text_box (cleared)
        chatbot,
    )


def model_response(history, speech, text_box, chatbot):
    # Run the model over the full message history and fill in the pending chatbot reply.
    generated_ids = model.chat(history, max_new_tokens=128, do_sample=False, temperature=1.0, top_p=1.0)
    response = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    history.append({"role": "assistant", "content": response})
    chatbot[-1][1] = response
    return (
        history,
        speech,
        gr.update(interactive=True, placeholder="Start chatting!"),  # text_box
        chatbot,
    )
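

# Note on the data flow (inferred from the handlers above): `history` is a plain
# list of message dicts, e.g.
#   [{"role": "system", "content": "..."},
#    {"role": "audio", "content": "assets/audios/0_000307.wav"},
#    {"role": "user", "content": "Transcribe the speech accurately."},
#    {"role": "assistant", "content": "..."}]
# and is passed as-is to `model.chat(...)`. Both `chat` and `tokenizer` are assumed
# to come from the checkpoint's custom code (trust_remote_code=True).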


with gr.Blocks() as demo:
    gr.Markdown("# DeSTA2 demo page")
    message_box = gr.Markdown(value="Have fun!", label="Message")
    # Conversation state shared across the callbacks; it starts with the system prompt.
    history = gr.State([{"role": "system", "content": "Focus on the input audio. You are a helpful voice assistant."}])

    with gr.Row():
        chatbot = gr.Chatbot(label="DeSTA2", height="100%", min_height="400px")
    with gr.Row():
        with gr.Column():
            speech = gr.Audio(label="Audio", type="filepath", sources=["microphone", "upload"])
            upload_button = gr.Button("Upload")
        with gr.Column():
            text_box = gr.Textbox(label="User", interactive=False, placeholder="Upload an audio first!")
            chat_button = gr.Button("Send", interactive=False)
    with gr.Row():
        # top_p = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, label="Top P")
        # temperature = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, label="Temperature")
        gr.Button("Reset chat").click(
            reset_chat,
            inputs=[history, chatbot],
            outputs=[history, chatbot, chat_button, upload_button],
        )

    upload_button.click(
        upload_audio,
        inputs=[history, speech, text_box, chatbot, chat_button, upload_button],
        outputs=[history, speech, text_box, chatbot, chat_button, upload_button],
    )
    # Two-step chain: echo the user's message first, then have the model fill in
    # the pending reply.
    chat_button.click(
        user_send_message,
        inputs=[history, speech, text_box, chatbot],
        outputs=[history, speech, text_box, chatbot],
    ).then(
        model_response,
        inputs=[history, speech, text_box, chatbot],
        outputs=[history, speech, text_box, chatbot],
    )
    with gr.Row():
        examples_prompt = gr.Examples(
            examples=[
                "Transcribe the speech accurately.",
                "What is the primary emotion conveyed by the speaker?",
                "Describe the content and tone of the audio in detail.",
                "Provide a summary of the audio content.",
                "Identify the language spoken in the recording.",
                "What does the background noise in the audio indicate?",
                "Identify if the speaker has a specific accent and describe it.",
                "What is the gender and approximate age of the speaker?",
                "Summarize the conversation happening in this audio.",
                "Classify the type of audio: speech, music, noise, or mixed.",
                "Assess the clarity and intelligibility of the speech.",
                "What is the emotional state of the speaker, and why do you think so?",
                "Provide a timestamped breakdown of key events in the audio.",
                # Chinese-language prompts (English glosses in the trailing comments):
                "將這段語音轉成文字,請確保準確的時間點。",  # Convert this speech to text; make sure the timestamps are accurate.
                "你能辨認出這段語音的情感是什麼嗎?",  # Can you identify the emotion of this speech?
                "這段聲音中的說話者有什麼情緒?",  # What emotion does the speaker in this audio convey?
                "從這段聲音中提取關鍵詞。",  # Extract keywords from this audio.
                "請翻譯這段語音的內容。",  # Please translate the content of this speech.
                "從這段聲音中找出說話者的性別和口音。",  # Identify the speaker's gender and accent from this audio.
            ],
            inputs=[text_box],
            label="Example prompts",
        )
    with gr.Row():
        examples = gr.Examples(
            examples=[
                ["assets/audios/0_000307.wav"],
                ["assets/audios/4_0_d47.wav"],
                ["assets/audios/7_1_d7.wav"],
                ["assets/audios/AccentClassification_AccentdbExtended_0193_british_s01_176.wav"],
                ["assets/audios/DialogueEmotionClassification_DailyTalk_0196_7_1_d756.wav"],
                ["assets/audios/EmotionRecognition_MultimodalEmotionlinesDataset_0026_dia382_utt0.wav"],
                ["assets/audios/LanguageIdentification_VoxForge_0000_de143-43.flac"],
                ["assets/audios/MUL0608_120.98_148.92.wav"],
                ["assets/audios/NoiseDetection_LJSpeech_MUSAN-Music_0199_music_LJSpeech-1.1_16k_LJ050-0033.wav"],
                ["assets/audios/Ses01F_script03_1_F029.wav"],
                ["assets/audios/Ses01M_script01_1_F014.wav"],
                ["assets/audios/Ses04F_impro02_M004.wav"],
                ["assets/audios/SpeakerVerification_LibriSpeech-TestClean_0046_3575-170457-0038.flac"],
                ["assets/audios/SpeechTextMatching_LJSpeech_0001_LJ001-0107.wav"],
                ["assets/audios/common_voice_en_34980360.mp3"],
                ["assets/audios/p284_159.wav"],
                ["assets/audios/p287_162.wav"],
            ],
            inputs=[speech],
            label="Example audios",
        )

if __name__ == "__main__":
    demo.launch(share=True)
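
# Usage sketch (assumptions: this file is saved as app.py, a CUDA GPU is available,
# and HF_TOKEN grants access to the DeSTA-ntu/DeSTA2-8B-beta checkpoint):
#
#   export HF_TOKEN=hf_xxx   # Hugging Face access token
#   python app.py            # share=True also prints a temporary public Gradio link
#
# For development, `gradio app.py` enables hot reloading; the gr.NO_RELOAD guard
# above keeps the model from being reloaded on every edit.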