# app.py
import tempfile

import gradio as gr
import torch
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForCausalLM

print("Gradio version:", gr.__version__)
# Use a small model because of Hugging Face Space resource limits
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True).eval()
# Speech recognition - placeholder only; vosk is not run on Hugging Face
# because of the Space size limit
def fake_transcribe(audio):
    return "你好,請問有什麼可以幫忙的?"
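# Hedged sketch (not part of the original app): if a compact ASR model fit
# within the Space limits, a small Whisper checkpoint via the transformers
# pipeline could replace the placeholder above. "openai/whisper-tiny" is an
# assumption; decoding the audio file also requires ffmpeg on the host.
# The function is defined here but never wired into the pipeline.
def whisper_transcribe(audio_path):
    from transformers import pipeline
    asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    return asr(audio_path)["text"]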
# Answer the question with the chat model
def answer_question(text):
    messages = [
        {"role": "user", "content": text}
    ]
    # add_generation_prompt=True appends the assistant turn marker so the
    # model produces a reply instead of continuing the user message
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    )
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=200)
    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response
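# Optional tweak (assumption, not in the original): enable sampling for less
# deterministic replies; the generate() call above is greedy by default.
# outputs = model.generate(input_ids, max_new_tokens=200,
#                          do_sample=True, temperature=0.7, top_p=0.9)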
# TTS: convert text to speech
def text_to_speech(text):
    tts = gTTS(text, lang='zh')
    # Save to a temp mp3 and return its path: the output component is
    # gr.Audio(type="filepath"), which expects a file path rather than a
    # base64 data URI
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
    return fp.name
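# Note: gTTS synthesizes by calling Google's TTS endpoint, so the Space needs
# outbound network access; with delete=False the mp3 files stay in the temp
# directory until the container is recycled.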
# End-to-end pipeline: transcribe (or take text), answer, then synthesize
def chat_pipeline(audio_input=None, text_input=None):
    if audio_input:
        text = fake_transcribe(audio_input)
    elif text_input:
        text = text_input
    else:
        return "請輸入問題或語音", None
    response = answer_question(text)
    audio_path = text_to_speech(response)
    return response, audio_path
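# Minimal smoke test (assumption: run locally, outside the Gradio UI):
# print(chat_pipeline(text_input="你好"))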
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ 語音助理(Hugging Face Space 測試版)")
    with gr.Row():
        mic = gr.Audio(type="filepath", label="輸入語音")
        text_input = gr.Textbox(label="或輸入文字")
    with gr.Row():
        submit = gr.Button("送出")
    output_text = gr.Textbox(label="回答")
    output_audio = gr.Audio(label="語音回答", type="filepath")
    submit.click(fn=chat_pipeline, inputs=[mic, text_input], outputs=[output_text, output_audio])
if __name__ == "__main__":
demo.launch()