# NOTE: page-scrape residue (Hugging Face Spaces status banner): "Spaces: Sleeping Sleeping"
import os
import subprocess
import sys
import tempfile
import time

import gradio as gr
from huggingface_hub import hf_hub_download
from pydub import AudioSegment
# Add the "src" directory that sits next to this file to sys.path
# (the inference CLI used below lives under src/f5_tts).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
def _wav_mtimes(directory):
    """Map each ``.wav`` file name in *directory* to its modification time.

    Returns an empty dict when *directory* does not exist.
    """
    if not os.path.isdir(directory):
        return {}
    return {
        name: os.path.getmtime(os.path.join(directory, name))
        for name in os.listdir(directory)
        if name.endswith('.wav')
    }


def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
    """Run ``src/f5_tts/infer/infer_cli.py`` in a subprocess and convert its output to MP3.

    The vocabulary and checkpoint are fetched with ``hf_hub_download`` from the
    ``nguyensu27/TTS`` repository. After the CLI finishes, the newest ``.wav``
    in the ``tests`` directory next to this file that was created or modified
    by this run is converted to ``tests/output.mp3``.
    NOTE(review): this relies on the CLI writing its result into ``tests/`` -
    confirm against infer_cli.py's output settings.

    Args:
        ref_audio_path: Path of the reference audio file passed as ``--ref_audio``.
        ref_text: Transcript of the reference audio (``--ref_text``).
        gen_text: Text to synthesize (``--gen_text``).
        model: Model name passed as ``--model``.
        speed: Speed value passed as ``--speed`` (stringified).
        vocoder_name: Vocoder passed as ``--vocoder_name``.

    Returns:
        ``(mp3_path, message)`` on success, otherwise ``(None, error_message)``.
        Errors are reported through the message rather than raised.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    src_dir = os.path.join(current_dir, "src")
    infer_cli_path = os.path.join(src_dir, "f5_tts", "infer", "infer_cli.py")
    tests_dir = os.path.join(current_dir, "tests")

    cli_exists = os.path.exists(infer_cli_path)
    print(f"Infer CLI path: {infer_cli_path}")
    print(f"Does infer_cli.py exist? {cli_exists}")
    if not cli_exists:
        return None, "File infer_cli.py không tồn tại!"

    try:
        vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
        ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
    except Exception as e:
        return None, f"Lỗi khi tải model/vocab: {str(e)}"

    # Configure only the child's environment; the parent's os.environ is left alone.
    env = os.environ.copy()
    env['PYTHONIOENCODING'] = 'utf-8'
    env['PYTHONPATH'] = os.path.abspath(src_dir)

    command = [
        sys.executable,
        infer_cli_path,
        "--model", model,
        "--ref_audio", ref_audio_path,
        "--ref_text", ref_text,
        "--gen_text", gen_text,
        "--speed", str(speed),
        "--vocoder_name", vocoder_name,
        "--vocab_file", vocab_file,
        "--ckpt_file", ckpt_file,
    ]
    print(f"Running command: {' '.join(command)}")

    try:
        # Snapshot existing outputs so a leftover .wav from an earlier run is
        # never mistaken for the result of this one.
        before = _wav_mtimes(tests_dir)

        result = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            # The child is told to emit UTF-8 (PYTHONIOENCODING), so decode it as
            # UTF-8 instead of the locale default; never fail on a stray byte.
            encoding='utf-8',
            errors='replace',
            env=env,
        )
        print("Subprocess stdout:", result.stdout)

        after = _wav_mtimes(tests_dir)
        fresh = [name for name, mtime in after.items() if before.get(name) != mtime]
        if fresh:
            latest_wav = max(fresh, key=after.__getitem__)
            output_wav = os.path.join(tests_dir, latest_wav)
            audio = AudioSegment.from_wav(output_wav)
            output_mp3 = os.path.join(tests_dir, "output.mp3")
            audio.export(output_mp3, format="mp3")
            return output_mp3, "Suy luận thành công!"
        return None, "Không tìm thấy file âm thanh trong thư mục tests"
    except subprocess.CalledProcessError as e:
        return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
    except Exception as e:
        return None, str(e)
def generate_speech(ref_audio, ref_text, gen_text, speed, model):
    """Gradio callback: synthesize ``gen_text`` using the uploaded reference audio.

    The reference audio is loaded with pydub, downmixed to mono and written to
    a temporary WAV file that is handed to ``run_f5_tts``. The temporary file
    is removed afterwards, whether or not inference succeeded.

    Args:
        ref_audio: File path of the uploaded reference audio, or ``None`` when
            nothing was uploaded.
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesize.
        speed: Speed value; converted with ``float()`` before use.
        model: Model name forwarded to ``run_f5_tts``.

    Returns:
        ``(mp3_path, message)`` when an MP3 was produced and exists on disk,
        otherwise ``(None, message)``.
    """
    if ref_audio is None:
        return None, "Vui lòng tải lên file audio tham chiếu!"

    # mkstemp yields a unique path, so requests arriving within the same second
    # cannot overwrite (or delete) each other's reference file.
    fd, ref_audio_path = tempfile.mkstemp(prefix="temp_ref_", suffix=".wav")
    os.close(fd)  # pydub reopens the path itself when exporting
    try:
        # ref_audio is a file path; load it and convert to mono.
        audio_segment = AudioSegment.from_file(ref_audio).set_channels(1)
        audio_segment.export(ref_audio_path, format="wav")
        output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
    finally:
        # Always clean up, even when conversion or inference raises.
        if os.path.exists(ref_audio_path):
            os.remove(ref_audio_path)

    if output_mp3 and os.path.exists(output_mp3):
        return output_mp3, message
    return None, message
# Input widgets, listed in the positional order of generate_speech's
# parameters: ref_audio, ref_text, gen_text, speed, model.
_input_components = [
    gr.Audio(label="Tải lên file audio tham chiếu (.wav hoặc .mp3)", type="filepath"),
    gr.Textbox(placeholder="Nhập text của audio tham chiếu", label="Text tham chiếu"),
    gr.Textbox(placeholder="Nhập text bạn muốn sinh", label="Text cần sinh"),
    gr.Slider(label="Tốc độ", minimum=0.5, maximum=2.0, value=1.0),
    gr.Dropdown(label="Mô hình", choices=["F5TTS_Base"], value="F5TTS_Base"),
]

# Output widgets, matching generate_speech's (audio_path, status_message) return.
_output_components = [
    gr.Audio(label="Kết quả audio (.mp3)", type="filepath"),
    gr.Textbox(label="Trạng thái"),
]

# Gradio UI wiring the widgets above to generate_speech.
interface = gr.Interface(
    title="F5-TTS Suy luận",
    description="Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS.",
    fn=generate_speech,
    inputs=_input_components,
    outputs=_output_components,
)

if __name__ == "__main__":
    # Serve on all network interfaces, port 7860.
    interface.launch(server_port=7860, server_name="0.0.0.0")
# import os | |
# import sys | |
# import subprocess | |
# from huggingface_hub import hf_hub_download | |
# from pydub import AudioSegment | |
# import gradio as gr | |
# import time | |
# # Thêm thư mục src vào sys.path | |
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src'))) | |
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"): | |
# current_dir = os.path.dirname(os.path.abspath(__file__)) | |
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py") | |
# tests_dir = os.path.join(current_dir, "tests") | |
# # Debug: In đường dẫn để kiểm tra | |
# print(f"Infer CLI path: {infer_cli_path}") | |
# print(f"Tests dir: {tests_dir}") | |
# # Tải file từ Hugging Face Hub | |
# try: | |
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt") | |
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt") | |
# except Exception as e: | |
# return None, f"Lỗi khi tải model/vocab từ Hugging Face: {str(e)}" | |
# os.environ['PYTHONIOENCODING'] = 'utf-8' | |
# env = os.environ.copy() | |
# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src')) | |
# command = [ | |
# sys.executable, | |
# infer_cli_path, | |
# "--model", model, | |
# "--ref_audio", ref_audio_path, | |
# "--ref_text", ref_text, | |
# "--gen_text", gen_text, | |
# "--speed", str(speed), | |
# "--vocoder_name", vocoder_name, | |
# "--vocab_file", vocab_file, | |
# "--ckpt_file", ckpt_file | |
# ] | |
# print(f"Running command: {' '.join(command)}") | |
# try: | |
# result = subprocess.run( | |
# command, | |
# check=True, | |
# capture_output=True, | |
# text=True, | |
# env=env | |
# ) | |
# print("Subprocess stdout:", result.stdout) | |
# if os.path.exists(tests_dir): | |
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')] | |
# if wav_files: | |
# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x))) | |
# output_wav = os.path.join(tests_dir, latest_wav) | |
# audio = AudioSegment.from_wav(output_wav) | |
# output_mp3 = os.path.join(tests_dir, "output.mp3") | |
# audio.export(output_mp3, format="mp3") | |
# return output_mp3, "Suy luận thành công!" | |
# return None, "Không tìm thấy file âm thanh trong thư mục tests" | |
# except subprocess.CalledProcessError as e: | |
# return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}" | |
# except Exception as e: | |
# return None, str(e) | |
# def generate_speech(ref_audio, ref_text, gen_text, speed, model): | |
# if ref_audio is None: | |
# return None, "Vui lòng tải lên file audio tham chiếu!" | |
# ref_audio_path = f"temp_ref_{int(time.time())}.wav" | |
# ref_audio.convert_audio_channels(1) # Chuyển sang mono | |
# ref_audio.export(ref_audio_path, format="wav") | |
# output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed)) | |
# os.remove(ref_audio_path) | |
# if output_mp3 and os.path.exists(output_mp3): | |
# return output_mp3, message | |
# return None, message | |
# interface = gr.Interface( | |
# fn=generate_speech, | |
# inputs=[ | |
# gr.Audio(type="filepath", label="Tải lên file audio tham chiếu (.wav hoặc .mp3)"), | |
# gr.Textbox(label="Text tham chiếu", placeholder="Nhập text của audio tham chiếu"), | |
# gr.Textbox(label="Text cần sinh", placeholder="Nhập text bạn muốn sinh"), | |
# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Tốc độ"), | |
# gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="Mô hình") | |
# ], | |
# outputs=[ | |
# gr.Audio(type="filepath", label="Kết quả audio (.mp3)"), | |
# gr.Textbox(label="Trạng thái") | |
# ], | |
# title="F5-TTS Suy luận", | |
# description="Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS." | |
# ) | |
# if __name__ == "__main__": | |
# interface.launch(server_name="0.0.0.0", server_port=7860) | |
# from flask import Flask, request, send_file | |
# import subprocess | |
# import os | |
# import sys | |
# from huggingface_hub import hf_hub_download | |
# from pydub import AudioSegment | |
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src'))) | |
# app = Flask(__name__) | |
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"): | |
# current_dir = os.path.dirname(os.path.abspath(__file__)) | |
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py") | |
# tests_dir = os.path.join(current_dir, "tests") | |
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt") | |
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt") | |
# os.environ['PYTHONIOENCODING'] = 'utf-8' | |
# env = os.environ.copy() | |
# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src')) | |
# command = [ | |
# sys.executable, | |
# infer_cli_path, | |
# "--model", model, | |
# "--ref_audio", ref_audio_path, | |
# "--ref_text", ref_text, | |
# "--gen_text", gen_text, | |
# "--speed", str(speed), | |
# "--vocoder_name", vocoder_name, | |
# "--vocab_file", vocab_file, | |
# "--ckpt_file", ckpt_file | |
# ] | |
# try: | |
# result = subprocess.run( | |
# command, | |
# check=True, | |
# capture_output=True, | |
# text=True, | |
# encoding='utf-8', | |
# env=env | |
# ) | |
# if os.path.exists(tests_dir): | |
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')] | |
# if wav_files: | |
# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x))) | |
# output_wav = os.path.join(tests_dir, latest_wav) | |
# audio = AudioSegment.from_wav(output_wav) | |
# output_mp3 = os.path.join(tests_dir, "output.mp3") | |
# audio.export(output_mp3, format="mp3") | |
# return True, output_mp3 | |
# return False, "Không tìm thấy file âm thanh trong thư mục tests" | |
# except subprocess.CalledProcessError as e: | |
# return False, f"Lỗi khi chạy infer_cli.py: {e.stderr}" | |
# except Exception as e: | |
# return False, str(e) | |
# @app.route('/') | |
# def home(): | |
# return "F5-TTS API is running. Use POST /api/generate to generate audio." | |
# @app.route('/api/generate', methods=['POST']) | |
# def generate_speech(): | |
# if 'ref_audio' not in request.files: | |
# return {"error": "Missing ref_audio"}, 400 | |
# ref_audio = request.files['ref_audio'] | |
# ref_text = request.form.get('ref_text', '') | |
# gen_text = request.form.get('gen_text', '') | |
# model = request.form.get('model', 'F5TTS_Base') | |
# speed = float(request.form.get('speed', 1.2)) | |
# import time | |
# ref_audio_path = f"temp_ref_{int(time.time())}.wav" | |
# ref_audio.save(ref_audio_path) | |
# success, result = run_f5_tts(ref_audio_path, ref_text, gen_text, model, speed) | |
# os.remove(ref_audio_path) | |
# if success: | |
# return send_file(result, mimetype='audio/mpeg') | |
# else: | |
# return {"error": result}, 500 | |
# if __name__ == "__main__": | |
# port = int(os.environ.get("PORT", 7860)) | |
# app.run(host="0.0.0.0", port=port, debug=False) | |
# from flask import Flask, request, send_file | |
# import subprocess | |
# import os | |
# import sys | |
# from huggingface_hub import hf_hub_download | |
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src'))) | |
# app = Flask(__name__) | |
# # ========================= | |
# # Hàm chạy F5-TTS | |
# # ========================= | |
# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"): | |
# current_dir = os.path.dirname(os.path.abspath(__file__)) | |
# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py") | |
# tests_dir = os.path.join(current_dir, "tests") | |
# # Dùng huggingface_hub để tải file model và vocab từ repo 'nguyensu27/TTS' | |
# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt") | |
# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt") | |
# os.environ['PYTHONIOENCODING'] = 'utf-8' | |
# command = [ | |
# sys.executable, | |
# infer_cli_path, | |
# "--model", model, | |
# "--ref_audio", ref_audio_path, | |
# "--ref_text", ref_text, | |
# "--gen_text", gen_text, | |
# "--speed", str(speed), | |
# "--vocoder_name", vocoder_name, | |
# "--vocab_file", vocab_file, | |
# "--ckpt_file", ckpt_file | |
# ] | |
# try: | |
# result = subprocess.run( | |
# command, | |
# check=True, | |
# capture_output=True, | |
# text=True, | |
# encoding='utf-8' | |
# ) | |
# if os.path.exists(tests_dir): | |
# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')] | |
# if wav_files: | |
# latest_wav = max( | |
# wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)) | |
# ) | |
# output_file = os.path.join(tests_dir, latest_wav) | |
# return True, output_file | |
# return False, "Không tìm thấy file âm thanh trong thư mục tests" | |
# except subprocess.CalledProcessError as e: | |
# return False, e.stderr | |
# except Exception as e: | |
# return False, str(e) | |
# # ========================= | |
# # Routes | |
# # ========================= | |
# @app.route('/') | |
# def home(): | |
# return "F5-TTS API is running. Use POST /api/generate to generate audio." | |
# @app.route('/api/generate', methods=['POST']) | |
# def generate_speech(): | |
# if 'ref_audio' not in request.files: | |
# return {"error": "Missing ref_audio"}, 400 | |
# ref_audio = request.files['ref_audio'] | |
# ref_text = request.form.get('ref_text', '') | |
# gen_text = request.form.get('gen_text', '') | |
# model = request.form.get('model', 'F5TTS_Base') | |
# speed = float(request.form.get('speed', 1.2)) | |
# ref_audio_path = 'temp_ref.wav' | |
# ref_audio.save(ref_audio_path) | |
# success, result = run_f5_tts(ref_audio_path, ref_text, gen_text, model, speed) | |
# os.remove(ref_audio_path) | |
# if success: | |
# return send_file(result, mimetype='audio/wav') | |
# else: | |
# return {"error": result}, 500 | |
# # ========================= | |
# # Main | |
# # ========================= | |
# if __name__ == "__main__": | |
# port = int(os.environ.get("PORT", 7860)) | |
# app.run(host="0.0.0.0", port=port, debug=False) | |