# app.py
import gradio as gr
import tempfile

import torch
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForCausalLM

print("Gradio version:", gr.__version__)

# Use a small model because of Hugging Face Space resource limits.
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True).eval()
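# A hedged alternative, assuming a GPU Space: loading the weights in half
# precision roughly halves memory use. `torch_dtype` is a standard
# `from_pretrained` argument; on the free CPU tier the float32 default
# used above is usually the safer choice, so this stays commented out.
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True
# ).eval()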
# Speech recognition placeholder: vosk is not run on Hugging Face because of
# the Space size limit, so this stub returns a fixed Chinese greeting
# ("Hello, how can I help you?").
def fake_transcribe(audio):
    return "你好,請問有什麼可以幫忙的?"
# Answer a question with the chat model.
def answer_question(text):
    messages = [
        {"role": "user", "content": text}
    ]
    # add_generation_prompt=True appends the assistant turn marker, which
    # chat models such as Qwen expect before generating a reply.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=200)
    # Decode only the newly generated tokens, skipping the prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response
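# A hedged variant, assuming more varied answers are wanted: generate()
# accepts standard sampling arguments instead of the greedy default above.
# outputs = model.generate(input_ids, max_new_tokens=200, do_sample=True,
#                          temperature=0.7, top_p=0.9)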
# TTS: convert text to speech and return the mp3 file path, matching the
# gr.Audio(type="filepath") output component below, which expects a path
# rather than a base64 data URL.
def text_to_speech(text):
    tts = gTTS(text, lang='zh')
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
    return fp.name
# End-to-end pipeline: audio or text in, answer text and spoken reply out.
def chat_pipeline(audio_input=None, text_input=None):
    if audio_input:
        text = fake_transcribe(audio_input)
    elif text_input:
        text = text_input
    else:
        return "請輸入問題或語音", None  # "Please enter a question or record audio"
    response = answer_question(text)
    speech_path = text_to_speech(response)
    return response, speech_path
# Gradio UI (widget labels stay in Chinese to match the assistant's language).
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ 語音助理(Hugging Face Space 測試版)")  # "Voice assistant (HF Space test build)"
    with gr.Row():
        mic = gr.Audio(type="filepath", label="輸入語音")  # speech input
        text_input = gr.Textbox(label="或輸入文字")  # or type text
    with gr.Row():
        submit = gr.Button("送出")  # submit
    output_text = gr.Textbox(label="回答")  # answer
    output_audio = gr.Audio(label="語音回答", type="filepath")  # spoken answer
    submit.click(fn=chat_pipeline, inputs=[mic, text_input], outputs=[output_text, output_audio])
if __name__ == "__main__":
    demo.launch()
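# A hedged run note: `python app.py` serves the UI locally (Gradio defaults to
# http://127.0.0.1:7860); demo.launch(share=True) would also create a temporary
# public link. On a Hugging Face Space, launch() alone is sufficient.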