"""Gradio front end for F5-TTS inference.

The UI collects a reference audio clip, its transcript, and the text to
synthesise, then runs the bundled ``infer_cli.py`` script in a subprocess and
returns the generated speech as an MP3 file.
"""

import os
import subprocess
import sys
import tempfile
import time

import gradio as gr
from huggingface_hub import hf_hub_download
from pydub import AudioSegment

# Add the bundled ``src`` directory to sys.path.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

# Slack (in seconds) applied when comparing file modification times against the
# moment inference started; some filesystems store mtimes with coarse
# granularity, so a file written right after the start can appear slightly older.
_MTIME_TOLERANCE_SECONDS = 2.0


def _newest_wav_since(directory, not_before):
    """Return the most recently modified ``.wav`` in *directory*, or ``None``.

    Only files whose modification time is at or after *not_before* (a
    ``time.time()`` style timestamp) are considered, so output left behind by
    an earlier run is never mistaken for the result of the current one.
    """
    if not os.path.isdir(directory):
        return None

    candidates = []
    for name in os.listdir(directory):
        if not name.endswith('.wav'):
            continue
        path = os.path.join(directory, name)
        modified = os.path.getmtime(path)
        if modified >= not_before:
            candidates.append((modified, path))

    if not candidates:
        return None
    return max(candidates, key=lambda item: item[0])[1]


def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
    """Run ``infer_cli.py`` in a subprocess and convert its output to MP3.

    Args:
        ref_audio_path: Path of the reference audio file passed as ``--ref_audio``.
        ref_text: Transcript of the reference audio (``--ref_text``).
        gen_text: Text to synthesise (``--gen_text``).
        model: Model name passed as ``--model``.
        speed: Speed value passed as ``--speed``.
        vocoder_name: Vocoder name passed as ``--vocoder_name``.

    Returns:
        A ``(output_mp3, message)`` tuple. On success ``output_mp3`` is the
        path of ``tests/output.mp3`` next to this file; on any failure it is
        ``None`` and ``message`` describes the problem.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
    tests_dir = os.path.join(current_dir, "tests")

    if not os.path.exists(infer_cli_path):
        return None, "File infer_cli.py không tồn tại!"

    try:
        vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
        ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
    except Exception as e:
        return None, f"Lỗi khi tải model/vocab: {str(e)}"

    # Configure the child's environment on a copy so this process's own
    # environment is left untouched.
    env = os.environ.copy()
    env['PYTHONIOENCODING'] = 'utf-8'
    env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))

    command = [
        sys.executable,
        infer_cli_path,
        "--model", model,
        "--ref_audio", ref_audio_path,
        "--ref_text", ref_text,
        "--gen_text", gen_text,
        "--speed", str(speed),
        "--vocoder_name", vocoder_name,
        "--vocab_file", vocab_file,
        "--ckpt_file", ckpt_file,
    ]

    # Remember when inference started so that only audio written by this run
    # is accepted as its result.
    started_at = time.time() - _MTIME_TOLERANCE_SECONDS

    try:
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            # The child is told to emit UTF-8 (PYTHONIOENCODING), so decode its
            # output as UTF-8 instead of relying on the locale encoding.
            encoding="utf-8",
            errors="replace",
            env=env,
        )

        # Look for a wav file produced in the tests directory.
        # NOTE(review): assumes infer_cli.py writes its output there — confirm.
        output_wav = _newest_wav_since(tests_dir, started_at)
        if output_wav is not None:
            audio = AudioSegment.from_wav(output_wav)
            output_mp3 = os.path.join(tests_dir, "output.mp3")
            audio.export(output_mp3, format="mp3")
            return output_mp3, "✅ Suy luận thành công!"

        return None, "❌ Không tìm thấy file âm thanh trong thư mục tests"

    except subprocess.CalledProcessError as e:
        return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
    except Exception as e:
        return None, str(e)


def generate_speech(ref_audio, ref_text, gen_text, speed, model):
    """Gradio callback: synthesise *gen_text* using the uploaded reference audio.

    Args:
        ref_audio: File path of the uploaded reference audio, or ``None`` when
            nothing was uploaded.
        ref_text: Transcript of the reference audio.
        gen_text: Text to synthesise.
        speed: Speed value from the slider; converted to ``float``.
        model: Model name selected in the dropdown.

    Returns:
        Path of the generated MP3 file, or ``None`` when no reference audio
        was provided.

    Raises:
        gr.Error: If inference fails; carries the message from ``run_f5_tts``.
    """
    if ref_audio is None:
        return None

    # A uniquely named temporary file avoids collisions between requests that
    # arrive within the same second.
    fd, ref_audio_path = tempfile.mkstemp(prefix="temp_ref_", suffix=".wav")
    os.close(fd)
    try:
        audio_segment = AudioSegment.from_file(ref_audio)
        audio_segment = audio_segment.set_channels(1)  # Convert to mono
        audio_segment.export(ref_audio_path, format="wav")

        output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
    finally:
        # Remove the temporary reference file even if conversion or inference raised.
        if os.path.exists(ref_audio_path):
            os.remove(ref_audio_path)

    if output_mp3 is None:
        # gr.Error is an Exception subclass that Gradio displays to the user.
        raise gr.Error(message)

    return output_mp3


# ====================== Gradio UI ======================
with gr.Blocks() as interface:
    gr.Markdown("## 🎙️ F5-TTS Suy luận")
    gr.Markdown("Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS.")

    with gr.Row():
        with gr.Column():
            ref_audio = gr.Audio(type="filepath", label="📂 Tải lên file audio tham chiếu (.wav hoặc .mp3)")
            ref_text = gr.Textbox(label="📝 Text tham chiếu")
            gen_text = gr.Textbox(label="📝 Text cần sinh")
            speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="⚡ Tốc độ")
            model = gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="🤖 Mô hình")
            btn = gr.Button("🚀 Sinh giọng nói")

        with gr.Column():
            output_audio = gr.Audio(type="filepath", label="🔊 Kết quả audio (.mp3)")

    btn.click(generate_speech, [ref_audio, ref_text, gen_text, speed, model], output_audio)


if __name__ == "__main__":
    interface.launch(server_name="0.0.0.0", server_port=7860)