julin90 committed on
Commit
7682901
·
1 Parent(s): 8c2ae28

更新語音助理功能

Browse files
Files changed (3) hide show
  1. .gitignore.txt +17 -0
  2. app.py +66 -0
  3. requirements.txt +5 -0
.gitignore.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 忽略虛擬環境和 cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.tmp
5
+ *.log
6
+ *.mp3
7
+ *.wav
8
+
9
+ # 忽略 huggingface token
10
+ *.env
11
+
12
+ # VSCode 和 Jupyter
13
+ .vscode/
14
+ .ipynb_checkpoints/
15
+
16
+ # 不可上傳的大模型(如vosk-model)
17
+ vosk-model-small-cn-0.22/
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ import os
4
+ from gtts import gTTS
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM
6
+ import torch
7
+ import json
8
+ import tempfile
9
+ import base64
10
+
11
# Use a small model because of Hugging Face Space limits.
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"

# Qwen1.5 is implemented natively in transformers (>= 4.37), so nothing from the
# model repository has to be executed. trust_remote_code is therefore left at its
# default (False) to avoid running arbitrary code downloaded from the Hub.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# .eval() switches the model to inference mode (disables dropout etc.).
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval()
15
+
16
# Speech recognition - placeholder only; vosk is not run on Hugging Face
# (because of size limits).
def fake_transcribe(audio):
    """Return a fixed transcription, ignoring the supplied audio.

    Args:
        audio: The recorded audio; accepted for interface compatibility but
            never read.

    Returns:
        A constant Chinese greeting string.
    """
    canned_transcript = "你好,請問有什麼可以幫忙的?"
    return canned_transcript
19
+
20
# Answer a question
def answer_question(text):
    """Generate the chat model's reply to a single user message.

    Args:
        text: The user's message, sent as the only turn of the conversation.

    Returns:
        The decoded reply, with the prompt tokens and special tokens removed.
    """
    messages = [{"role": "user", "content": text}]
    # add_generation_prompt=True appends the assistant-turn header to the
    # prompt so the model writes a reply instead of continuing the user turn.
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=200)
    # generate() returns prompt + completion; keep only the newly generated part.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
30
+
31
# TTS: text to speech
def text_to_speech(text):
    """Synthesize ``text`` to an MP3 file and return the file's path.

    The result feeds a ``gr.Audio(type="filepath")`` output, which expects a
    path on disk, so the path of the generated file is returned rather than a
    base64 data URI.

    Args:
        text: The text to speak (synthesized with gTTS, ``lang='zh'``).

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` when ``text`` is
        empty or whitespace-only (there is nothing to synthesize).
    """
    if not text or not text.strip():
        return None
    # Reserve a unique path, then close the handle before gTTS writes to it;
    # writing to a still-open NamedTemporaryFile fails on some platforms.
    # delete=False keeps the file on disk so it can be served afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        mp3_path = fp.name
    gTTS(text, lang='zh').save(mp3_path)
    return mp3_path
39
+
40
# Combined pipeline
def chat_pipeline(audio_input=None, text_input=None):
    """Run one assistant turn: obtain the question, answer it, speak the answer.

    Audio input takes priority over text input when both are given.

    Args:
        audio_input: The recorded audio, or a falsy value when nothing was
            recorded.
        text_input: The typed question, or a falsy value when nothing was typed.

    Returns:
        A ``(response_text, speech)`` tuple. When neither input carries any
        content (including whitespace-only text), the prompt-for-input message
        and ``None`` are returned without invoking the model.
    """
    if audio_input:
        text = fake_transcribe(audio_input)
    else:
        # Treat whitespace-only text like empty input instead of sending it
        # to the model.
        text = (text_input or "").strip()
    if not text:
        return "請輸入問題或語音", None
    response = answer_question(text)
    speech_url = text_to_speech(response)
    return response, speech_url
51
+
52
# Gradio interface
# NOTE(review): the original indentation was lost in this diff rendering; the
# widget nesting below is a reconstruction - confirm against the real file.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ 語音助理(Hugging Face Space 測試版)")
    with gr.Row():
        # Gradio 4.x takes a list through `sources`; the 3.x `source` keyword
        # is no longer accepted.
        mic = gr.Audio(sources=["microphone"], type="filepath", label="輸入語音")
        text_input = gr.Textbox(label="或輸入文字")
    with gr.Row():
        submit = gr.Button("送出")
    output_text = gr.Textbox(label="回答")
    output_audio = gr.Audio(label="語音回答", type="filepath")

    # Event listeners must be registered inside the Blocks context.
    submit.click(
        fn=chat_pipeline,
        inputs=[mic, text_input],
        outputs=[output_text, output_audio],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==4.28.3
2
+ gtts==2.5.1
3
+ transformers==4.41.1
4
+ torch==2.3.0
5
+ requests==2.31.0