# app.py
import tempfile

import gradio as gr
import torch
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForCausalLM

print("Gradio version:", gr.__version__)  # handy when debugging on Spaces

# Use a small model because of Hugging Face Space resource limits
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True).eval()
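
# A GPU variant (an assumption about the deployment, not part of the original
# Space, which presumably runs on CPU): half-precision weights halve memory use.
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME, trust_remote_code=True, torch_dtype=torch.float16
# ).to("cuda").eval()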

# Speech recognition: placeholder only. Vosk is not run on this Hugging Face
# Space because of its storage limits, so return a canned transcript
# (a commented-out Vosk sketch follows below).
def fake_transcribe(audio):
    return "你好,請問有什麼可以幫忙的?"

# Generate an answer with the chat model
def answer_question(text):
    messages = [
        {"role": "user", "content": text}
    ]
    # Build the prompt with the model's chat template and append the assistant
    # prefix so generation starts a fresh reply
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=200)
    # Decode only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response

# TTS: synthesize speech with gTTS and return the path of a temporary MP3
# file, which gr.Audio(type="filepath") can play directly
def text_to_speech(text):
    tts = gTTS(text, lang='zh')
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
    return fp.name

# End-to-end pipeline: take speech or text, answer, then speak the answer
def chat_pipeline(audio_input=None, text_input=None):
    if audio_input:
        text = fake_transcribe(audio_input)
    elif text_input:
        text = text_input
    else:
        return "請輸入問題或語音", None
    response = answer_question(text)
    audio_path = text_to_speech(response)
    return response, audio_path
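
# Usage note (assumed invocation): chat_pipeline(text_input="你好") returns a
# (reply_text, mp3_path) pair, matching the two Gradio outputs wired up below.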

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ 語音助理(Hugging Face Space 測試版)")
    with gr.Row():
        mic = gr.Audio(type="filepath", label="輸入語音")
        text_input = gr.Textbox(label="或輸入文字")
    with gr.Row():
        submit = gr.Button("送出")
    output_text = gr.Textbox(label="回答")
    output_audio = gr.Audio(label="語音回答", type="filepath")

    submit.click(fn=chat_pipeline, inputs=[mic, text_input], outputs=[output_text, output_audio])
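
# For deployment on a Space, a plausible requirements.txt (an assumption; the
# original file is not shown here) would list: gradio, transformers, torch, gtts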

if __name__ == "__main__":
    demo.launch()