CHUYEN_MP3 / app.py
mrsu0994
upload f5-tts source
ddfa4ca
import os
import subprocess
import sys
import tempfile
import time

import gradio as gr
from huggingface_hub import hf_hub_download
from pydub import AudioSegment
# Add the bundled src directory to sys.path so packages under it can be imported
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
def _wav_mtimes(directory):
    """Map each ``.wav`` file name in *directory* to its mtime in nanoseconds."""
    if not os.path.isdir(directory):
        return {}
    mtimes = {}
    for name in os.listdir(directory):
        if not name.endswith('.wav'):
            continue
        try:
            mtimes[name] = os.stat(os.path.join(directory, name)).st_mtime_ns
        except OSError:
            # The file disappeared between listdir() and stat(); ignore it.
            continue
    return mtimes


def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
    """Run the bundled ``infer_cli.py`` in a subprocess and convert its output to MP3.

    The vocabulary and checkpoint are fetched from the ``nguyensu27/TTS``
    Hugging Face repository, then ``src/f5_tts/infer/infer_cli.py`` is executed
    with the current interpreter. After it exits, the most recently modified
    ``.wav`` file that this run created or rewrote in the ``tests`` directory
    next to this file is exported as ``tests/output.mp3``.

    Args:
        ref_audio_path: Path of the reference audio, passed as ``--ref_audio``.
        ref_text: Transcript of the reference audio, passed as ``--ref_text``.
        gen_text: Text to synthesize, passed as ``--gen_text``.
        model: Value for ``--model``.
        speed: Value for ``--speed`` (stringified before being passed).
        vocoder_name: Value for ``--vocoder_name``.

    Returns:
        A ``(mp3_path, message)`` tuple on success, or ``(None, error_message)``
        on any failure. This function does not raise for the failures it
        anticipates; every error is reported through the returned message.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    src_dir = os.path.join(current_dir, "src")
    infer_cli_path = os.path.join(src_dir, "f5_tts", "infer", "infer_cli.py")
    tests_dir = os.path.join(current_dir, "tests")

    if not os.path.exists(infer_cli_path):
        return None, "File infer_cli.py không tồn tại!"

    try:
        vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
        ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
    except Exception as e:
        return None, f"Lỗi khi tải model/vocab: {str(e)}"

    # Configure only the child's environment; the parent process environment
    # is left untouched.
    env = os.environ.copy()
    env['PYTHONIOENCODING'] = 'utf-8'
    env['PYTHONPATH'] = os.path.abspath(src_dir)

    command = [
        sys.executable,
        infer_cli_path,
        "--model", model,
        "--ref_audio", ref_audio_path,
        "--ref_text", ref_text,
        "--gen_text", gen_text,
        "--speed", str(speed),
        "--vocoder_name", vocoder_name,
        "--vocab_file", vocab_file,
        "--ckpt_file", ckpt_file,
    ]

    try:
        # Snapshot existing outputs so a stale file from an earlier run is
        # never mistaken for the result of this one.
        before = _wav_mtimes(tests_dir)

        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            # The child is forced to emit UTF-8 (PYTHONIOENCODING), so decode
            # it as UTF-8 regardless of the parent's locale; never fail on a
            # stray undecodable byte.
            encoding='utf-8',
            errors='replace',
            env=env,
        )

        after = _wav_mtimes(tests_dir)
        fresh = [name for name, mtime in after.items() if before.get(name) != mtime]
        if not fresh:
            return None, "❌ Không tìm thấy file âm thanh trong thư mục tests"

        latest_wav = max(fresh, key=lambda name: after[name])
        output_wav = os.path.join(tests_dir, latest_wav)
        audio = AudioSegment.from_wav(output_wav)
        # NOTE(review): the MP3 path is fixed, so concurrent requests would
        # overwrite each other's result — confirm requests are serialized.
        output_mp3 = os.path.join(tests_dir, "output.mp3")
        audio.export(output_mp3, format="mp3")
        return output_mp3, "✅ Suy luận thành công!"
    except subprocess.CalledProcessError as e:
        return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
    except Exception as e:
        return None, str(e)
def generate_speech(ref_audio, ref_text, gen_text, speed, model):
    """Gradio callback: synthesize speech from a reference clip and target text.

    The uploaded reference audio is converted to a mono WAV in a uniquely
    named temporary file, handed to ``run_f5_tts``, and removed afterwards
    whether or not synthesis succeeded.

    Args:
        ref_audio: File path of the uploaded reference audio, or ``None`` when
            nothing was uploaded.
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesize.
        speed: Speed value from the UI; converted to ``float``.
        model: Model name forwarded to ``run_f5_tts``.

    Returns:
        Path of the generated MP3, or ``None`` when no reference audio was given.

    Raises:
        gr.Error: If ``run_f5_tts`` reports a failure. ``gr.Error`` subclasses
            ``Exception`` and carries the failure message.
    """
    if ref_audio is None:
        return None

    # mkstemp guarantees a unique path, so two requests arriving within the
    # same second can no longer clobber each other's reference file.
    fd, ref_audio_path = tempfile.mkstemp(prefix="temp_ref_", suffix=".wav")
    os.close(fd)
    try:
        audio_segment = AudioSegment.from_file(ref_audio)
        audio_segment = audio_segment.set_channels(1)  # downmix to mono
        audio_segment.export(ref_audio_path, format="wav")
        output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
    finally:
        # Always clean up, including when loading or exporting the audio fails.
        if os.path.exists(ref_audio_path):
            os.remove(ref_audio_path)

    if output_mp3 is None:
        raise gr.Error(message)
    return output_mp3
# ====================== Gradio UI ======================
# Two columns: the inputs and the trigger button on the left, the generated
# audio player on the right.
with gr.Blocks() as interface:
    gr.Markdown("## 🎙️ F5-TTS Suy luận")
    gr.Markdown("Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS.")

    with gr.Row():
        with gr.Column():
            ref_audio = gr.Audio(
                type="filepath",
                label="📂 Tải lên file audio tham chiếu (.wav hoặc .mp3)",
            )
            ref_text = gr.Textbox(label="📝 Text tham chiếu")
            gen_text = gr.Textbox(label="📝 Text cần sinh")
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                label="⚡ Tốc độ",
            )
            model = gr.Dropdown(
                choices=["F5TTS_Base"],
                value="F5TTS_Base",
                label="🤖 Mô hình",
            )
            btn = gr.Button("🚀 Sinh giọng nói")

        with gr.Column():
            output_audio = gr.Audio(
                type="filepath",
                label="🔊 Kết quả audio (.mp3)",
            )

    # Wire the button to the synthesis callback; the input order matches the
    # positional parameters of generate_speech.
    btn.click(
        fn=generate_speech,
        inputs=[ref_audio, ref_text, gen_text, speed, model],
        outputs=output_audio,
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    interface.launch(server_name="0.0.0.0", server_port=7860)
# import os
# import sys
# import subprocess
# from huggingface_hub import hf_hub_download
# from pydub import AudioSegment
# import gradio as gr
# import time
# # Thêm thư mục src vào sys.path
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
# current_dir = os.path.dirname(os.path.abspath(__file__))
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
# tests_dir = os.path.join(current_dir, "tests")
# print(f"Infer CLI path: {infer_cli_path}")
# print(f"Does infer_cli.py exist? {os.path.exists(infer_cli_path)}")
# if not os.path.exists(infer_cli_path):
# return None, "File infer_cli.py không tồn tại!"
# try:
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
# except Exception as e:
# return None, f"Lỗi khi tải model/vocab: {str(e)}"
# os.environ['PYTHONIOENCODING'] = 'utf-8'
# env = os.environ.copy()
# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))
# command = [
# sys.executable,
# infer_cli_path,
# "--model", model,
# "--ref_audio", ref_audio_path,
# "--ref_text", ref_text,
# "--gen_text", gen_text,
# "--speed", str(speed),
# "--vocoder_name", vocoder_name,
# "--vocab_file", vocab_file,
# "--ckpt_file", ckpt_file
# ]
# print(f"Running command: {' '.join(command)}")
# try:
# result = subprocess.run(
# command,
# check=True,
# capture_output=True,
# text=True,
# env=env
# )
# print("Subprocess stdout:", result.stdout)
# if os.path.exists(tests_dir):
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
# if wav_files:
# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)))
# output_wav = os.path.join(tests_dir, latest_wav)
# audio = AudioSegment.from_wav(output_wav)
# output_mp3 = os.path.join(tests_dir, "output.mp3")
# audio.export(output_mp3, format="mp3")
# return output_mp3, "Suy luận thành công!"
# return None, "Không tìm thấy file âm thanh trong thư mục tests"
# except subprocess.CalledProcessError as e:
# return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
# except Exception as e:
# return None, str(e)
# def generate_speech(ref_audio, ref_text, gen_text, speed, model):
# if ref_audio is None:
# return None, "Vui lòng tải lên file audio tham chiếu!"
# # ref_audio là đường dẫn file, tải bằng AudioSegment
# audio_segment = AudioSegment.from_file(ref_audio)
# audio_segment = audio_segment.set_channels(1) # Chuyển sang mono
# ref_audio_path = f"temp_ref_{int(time.time())}.wav"
# audio_segment.export(ref_audio_path, format="wav")
# output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
# os.remove(ref_audio_path)
# if output_mp3 and os.path.exists(output_mp3):
# return output_mp3, message
# return None, message
# interface = gr.Interface(
# fn=generate_speech,
# inputs=[
# gr.Audio(type="filepath", label="Tải lên file audio tham chiếu (.wav hoặc .mp3)"),
# gr.Textbox(label="Text tham chiếu", placeholder="Nhập text của audio tham chiếu"),
# gr.Textbox(label="Text cần sinh", placeholder="Nhập text bạn muốn sinh"),
# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Tốc độ"),
# gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="Mô hình")
# ],
# outputs=[
# gr.Audio(type="filepath", label="Kết quả audio (.mp3)"),
# gr.Textbox(label="Trạng thái")
# ],
# title="F5-TTS Suy luận",
# description="Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS."
# )
# if __name__ == "__main__":
# interface.launch(server_name="0.0.0.0", server_port=7860)
# import os
# import sys
# import subprocess
# from huggingface_hub import hf_hub_download
# from pydub import AudioSegment
# import gradio as gr
# import time
# # Thêm thư mục src vào sys.path
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
# current_dir = os.path.dirname(os.path.abspath(__file__))
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
# tests_dir = os.path.join(current_dir, "tests")
# # Debug: In đường dẫn để kiểm tra
# print(f"Infer CLI path: {infer_cli_path}")
# print(f"Tests dir: {tests_dir}")
# # Tải file từ Hugging Face Hub
# try:
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
# except Exception as e:
# return None, f"Lỗi khi tải model/vocab từ Hugging Face: {str(e)}"
# os.environ['PYTHONIOENCODING'] = 'utf-8'
# env = os.environ.copy()
# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))
# command = [
# sys.executable,
# infer_cli_path,
# "--model", model,
# "--ref_audio", ref_audio_path,
# "--ref_text", ref_text,
# "--gen_text", gen_text,
# "--speed", str(speed),
# "--vocoder_name", vocoder_name,
# "--vocab_file", vocab_file,
# "--ckpt_file", ckpt_file
# ]
# print(f"Running command: {' '.join(command)}")
# try:
# result = subprocess.run(
# command,
# check=True,
# capture_output=True,
# text=True,
# env=env
# )
# print("Subprocess stdout:", result.stdout)
# if os.path.exists(tests_dir):
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
# if wav_files:
# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)))
# output_wav = os.path.join(tests_dir, latest_wav)
# audio = AudioSegment.from_wav(output_wav)
# output_mp3 = os.path.join(tests_dir, "output.mp3")
# audio.export(output_mp3, format="mp3")
# return output_mp3, "Suy luận thành công!"
# return None, "Không tìm thấy file âm thanh trong thư mục tests"
# except subprocess.CalledProcessError as e:
# return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
# except Exception as e:
# return None, str(e)
# def generate_speech(ref_audio, ref_text, gen_text, speed, model):
# if ref_audio is None:
# return None, "Vui lòng tải lên file audio tham chiếu!"
# ref_audio_path = f"temp_ref_{int(time.time())}.wav"
# ref_audio.convert_audio_channels(1) # Chuyển sang mono
# ref_audio.export(ref_audio_path, format="wav")
# output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
# os.remove(ref_audio_path)
# if output_mp3 and os.path.exists(output_mp3):
# return output_mp3, message
# return None, message
# interface = gr.Interface(
# fn=generate_speech,
# inputs=[
# gr.Audio(type="filepath", label="Tải lên file audio tham chiếu (.wav hoặc .mp3)"),
# gr.Textbox(label="Text tham chiếu", placeholder="Nhập text của audio tham chiếu"),
# gr.Textbox(label="Text cần sinh", placeholder="Nhập text bạn muốn sinh"),
# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Tốc độ"),
# gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="Mô hình")
# ],
# outputs=[
# gr.Audio(type="filepath", label="Kết quả audio (.mp3)"),
# gr.Textbox(label="Trạng thái")
# ],
# title="F5-TTS Suy luận",
# description="Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS."
# )
# if __name__ == "__main__":
# interface.launch(server_name="0.0.0.0", server_port=7860)
# from flask import Flask, request, send_file
# import subprocess
# import os
# import sys
# from huggingface_hub import hf_hub_download
# from pydub import AudioSegment
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
# app = Flask(__name__)
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
# current_dir = os.path.dirname(os.path.abspath(__file__))
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
# tests_dir = os.path.join(current_dir, "tests")
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
# os.environ['PYTHONIOENCODING'] = 'utf-8'
# env = os.environ.copy()
# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))
# command = [
# sys.executable,
# infer_cli_path,
# "--model", model,
# "--ref_audio", ref_audio_path,
# "--ref_text", ref_text,
# "--gen_text", gen_text,
# "--speed", str(speed),
# "--vocoder_name", vocoder_name,
# "--vocab_file", vocab_file,
# "--ckpt_file", ckpt_file
# ]
# try:
# result = subprocess.run(
# command,
# check=True,
# capture_output=True,
# text=True,
# encoding='utf-8',
# env=env
# )
# if os.path.exists(tests_dir):
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
# if wav_files:
# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)))
# output_wav = os.path.join(tests_dir, latest_wav)
# audio = AudioSegment.from_wav(output_wav)
# output_mp3 = os.path.join(tests_dir, "output.mp3")
# audio.export(output_mp3, format="mp3")
# return True, output_mp3
# return False, "Không tìm thấy file âm thanh trong thư mục tests"
# except subprocess.CalledProcessError as e:
# return False, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
# except Exception as e:
# return False, str(e)
# @app.route('/')
# def home():
# return "F5-TTS API is running. Use POST /api/generate to generate audio."
# @app.route('/api/generate', methods=['POST'])
# def generate_speech():
# if 'ref_audio' not in request.files:
# return {"error": "Missing ref_audio"}, 400
# ref_audio = request.files['ref_audio']
# ref_text = request.form.get('ref_text', '')
# gen_text = request.form.get('gen_text', '')
# model = request.form.get('model', 'F5TTS_Base')
# speed = float(request.form.get('speed', 1.2))
# import time
# ref_audio_path = f"temp_ref_{int(time.time())}.wav"
# ref_audio.save(ref_audio_path)
# success, result = run_f5_tts(ref_audio_path, ref_text, gen_text, model, speed)
# os.remove(ref_audio_path)
# if success:
# return send_file(result, mimetype='audio/mpeg')
# else:
# return {"error": result}, 500
# if __name__ == "__main__":
# port = int(os.environ.get("PORT", 7860))
# app.run(host="0.0.0.0", port=port, debug=False)
# from flask import Flask, request, send_file
# import subprocess
# import os
# import sys
# from huggingface_hub import hf_hub_download
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
# app = Flask(__name__)
# # =========================
# # Hàm chạy F5-TTS
# # =========================
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
# current_dir = os.path.dirname(os.path.abspath(__file__))
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
# tests_dir = os.path.join(current_dir, "tests")
# # Dùng huggingface_hub để tải file model và vocab từ repo 'nguyensu27/TTS'
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
# os.environ['PYTHONIOENCODING'] = 'utf-8'
# command = [
# sys.executable,
# infer_cli_path,
# "--model", model,
# "--ref_audio", ref_audio_path,
# "--ref_text", ref_text,
# "--gen_text", gen_text,
# "--speed", str(speed),
# "--vocoder_name", vocoder_name,
# "--vocab_file", vocab_file,
# "--ckpt_file", ckpt_file
# ]
# try:
# result = subprocess.run(
# command,
# check=True,
# capture_output=True,
# text=True,
# encoding='utf-8'
# )
# if os.path.exists(tests_dir):
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
# if wav_files:
# latest_wav = max(
# wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x))
# )
# output_file = os.path.join(tests_dir, latest_wav)
# return True, output_file
# return False, "Không tìm thấy file âm thanh trong thư mục tests"
# except subprocess.CalledProcessError as e:
# return False, e.stderr
# except Exception as e:
# return False, str(e)
# # =========================
# # Routes
# # =========================
# @app.route('/')
# def home():
# return "F5-TTS API is running. Use POST /api/generate to generate audio."
# @app.route('/api/generate', methods=['POST'])
# def generate_speech():
# if 'ref_audio' not in request.files:
# return {"error": "Missing ref_audio"}, 400
# ref_audio = request.files['ref_audio']
# ref_text = request.form.get('ref_text', '')
# gen_text = request.form.get('gen_text', '')
# model = request.form.get('model', 'F5TTS_Base')
# speed = float(request.form.get('speed', 1.2))
# ref_audio_path = 'temp_ref.wav'
# ref_audio.save(ref_audio_path)
# success, result = run_f5_tts(ref_audio_path, ref_text, gen_text, model, speed)
# os.remove(ref_audio_path)
# if success:
# return send_file(result, mimetype='audio/wav')
# else:
# return {"error": result}, 500
# # =========================
# # Main
# # =========================
# if __name__ == "__main__":
# port = int(os.environ.get("PORT", 7860))
# app.run(host="0.0.0.0", port=port, debug=False)