Spaces:

nguyensu27
/

CHUYEN_MP3

Sleeping

CHUYEN_MP3 / app.py

mrsu0994

upload f5-tts source

ddfa4ca 6 days ago

19.5 kB

	import os
	import subprocess
	import sys
	import tempfile
	import time

	import gradio as gr
	from huggingface_hub import hf_hub_download
	from pydub import AudioSegment

	# Add the src directory (next to this file) to sys.path
	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

	def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
	    """Run the F5-TTS inference CLI in a subprocess and convert its output to MP3.

	    The function locates ``src/f5_tts/infer/infer_cli.py`` next to this file,
	    downloads ``vocab.txt`` and ``model_last.pt`` from the ``nguyensu27/TTS``
	    hub repository, runs the CLI with the given arguments, then looks in the
	    ``tests`` directory next to this file for a ``.wav`` file that was created
	    or modified by this run and exports it as ``tests/output.mp3``.

	    Args:
	        ref_audio_path: Path passed to the CLI as ``--ref_audio``.
	        ref_text: Text passed to the CLI as ``--ref_text``.
	        gen_text: Text passed to the CLI as ``--gen_text``.
	        model: Value passed to the CLI as ``--model``.
	        speed: Value passed (stringified) to the CLI as ``--speed``.
	        vocoder_name: Value passed to the CLI as ``--vocoder_name``.

	    Returns:
	        A ``(output_mp3, message)`` tuple. ``output_mp3`` is the path of the
	        exported MP3 on success and ``None`` on any failure; ``message`` is a
	        status or error description. No exception escapes this function
	        from the download or inference steps.
	    """
	    current_dir = os.path.dirname(os.path.abspath(__file__))
	    src_dir = os.path.join(current_dir, "src")
	    infer_cli_path = os.path.join(src_dir, "f5_tts", "infer", "infer_cli.py")
	    tests_dir = os.path.join(current_dir, "tests")

	    if not os.path.exists(infer_cli_path):
	        return None, "File infer_cli.py không tồn tại!"

	    try:
	        vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
	        ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
	    except Exception as e:
	        return None, f"Lỗi khi tải model/vocab: {str(e)}"

	    # Configure only the child's environment; the parent process environment
	    # is left untouched.
	    env = os.environ.copy()
	    env['PYTHONIOENCODING'] = 'utf-8'
	    env['PYTHONPATH'] = os.path.abspath(src_dir)

	    command = [
	        sys.executable,
	        infer_cli_path,
	        "--model", model,
	        "--ref_audio", ref_audio_path,
	        "--ref_text", ref_text,
	        "--gen_text", gen_text,
	        "--speed", str(speed),
	        "--vocoder_name", vocoder_name,
	        "--vocab_file", vocab_file,
	        "--ckpt_file", ckpt_file,
	    ]

	    # Snapshot the existing .wav files so that a leftover file from an
	    # earlier run is never mistaken for the output of this one.
	    wavs_before = _wav_mtimes(tests_dir)

	    try:
	        subprocess.run(
	            command,
	            check=True,
	            capture_output=True,
	            text=True,
	            # The child is forced to emit UTF-8 (PYTHONIOENCODING), so decode
	            # its output as UTF-8 instead of relying on the locale default.
	            encoding='utf-8',
	            errors='replace',
	            env=env,
	        )
	        # Keep only the .wav files that are new or were rewritten by this run.
	        wavs_after = _wav_mtimes(tests_dir)
	        fresh_wavs = [
	            name for name, mtime in wavs_after.items()
	            if wavs_before.get(name) != mtime
	        ]
	        if fresh_wavs:
	            latest_wav = max(fresh_wavs, key=wavs_after.__getitem__)
	            output_wav = os.path.join(tests_dir, latest_wav)
	            audio = AudioSegment.from_wav(output_wav)
	            output_mp3 = os.path.join(tests_dir, "output.mp3")
	            audio.export(output_mp3, format="mp3")
	            return output_mp3, "✅ Suy luận thành công!"
	        return None, "❌ Không tìm thấy file âm thanh trong thư mục tests"
	    except subprocess.CalledProcessError as e:
	        return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
	    except Exception as e:
	        return None, str(e)


	def _wav_mtimes(directory):
	    """Map each ``.wav`` file name in *directory* to its mtime in nanoseconds.

	    Returns an empty dict when *directory* does not exist or is not a directory.
	    """
	    if not os.path.isdir(directory):
	        return {}
	    mtimes = {}
	    for name in os.listdir(directory):
	        if not name.endswith('.wav'):
	            continue
	        try:
	            mtimes[name] = os.stat(os.path.join(directory, name)).st_mtime_ns
	        except OSError:
	            # The file vanished between listdir() and stat(); ignore it.
	            continue
	    return mtimes

	def generate_speech(ref_audio, ref_text, gen_text, speed, model):
	    """Gradio callback: synthesize speech from a reference audio file and text.

	    The reference audio is loaded, converted to mono and written to a
	    temporary WAV file, which is then handed to ``run_f5_tts``. The temporary
	    file is removed afterwards, whether or not inference succeeded.

	    Args:
	        ref_audio: Path of the uploaded reference audio, or ``None`` when
	            nothing was uploaded.
	        ref_text: Text forwarded to ``run_f5_tts`` as the reference text.
	        gen_text: Text forwarded to ``run_f5_tts`` as the text to generate.
	        speed: Speed value; converted with ``float()`` before forwarding.
	        model: Model name forwarded to ``run_f5_tts``.

	    Returns:
	        The path of the generated MP3, or ``None`` when ``ref_audio`` is
	        ``None``.

	    Raises:
	        gr.Error: When ``run_f5_tts`` reports a failure; carries its message.
	    """
	    if ref_audio is None:
	        return None

	    # mkstemp gives a unique name, so requests arriving within the same second
	    # cannot overwrite each other's reference file.
	    fd, ref_audio_path = tempfile.mkstemp(prefix="temp_ref_", suffix=".wav")
	    os.close(fd)  # pydub reopens the path itself when exporting
	    try:
	        audio_segment = AudioSegment.from_file(ref_audio)
	        audio_segment = audio_segment.set_channels(1)  # convert to mono
	        audio_segment.export(ref_audio_path, format="wav")

	        output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
	    finally:
	        # Always clean up, even if conversion or inference raised.
	        if os.path.exists(ref_audio_path):
	            os.remove(ref_audio_path)

	    if output_mp3 is None:
	        # gr.Error is an Exception subclass that Gradio shows to the user.
	        raise gr.Error(message)

	    return output_mp3

	# ====================== Gradio UI ======================
	# NOTE(review): the nesting of the layout blocks below was reconstructed from
	# a flattened source (statement order and blank-line grouping) — confirm that
	# both columns are meant to sit side by side in a single row.
	with gr.Blocks() as interface:
	    gr.Markdown("## 🎙️ F5-TTS Suy luận")
	    gr.Markdown("Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS.")

	    with gr.Row():
	        # Input column: reference audio, texts, speed, model and the trigger button.
	        with gr.Column():
	            ref_audio = gr.Audio(type="filepath", label="📂 Tải lên file audio tham chiếu (.wav hoặc .mp3)")
	            ref_text = gr.Textbox(label="📝 Text tham chiếu")
	            gen_text = gr.Textbox(label="📝 Text cần sinh")
	            speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="⚡ Tốc độ")
	            model = gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="🤖 Mô hình")
	            btn = gr.Button("🚀 Sinh giọng nói")

	        # Output column: the generated audio.
	        with gr.Column():
	            output_audio = gr.Audio(type="filepath", label="🔊 Kết quả audio (.mp3)")

	    # Wire the button to the callback; inputs are passed positionally in this order.
	    btn.click(generate_speech, [ref_audio, ref_text, gen_text, speed, model], output_audio)

	if __name__ == "__main__":
	    interface.launch(server_name="0.0.0.0", server_port=7860)






	# import os
	# import sys
	# import subprocess
	# from huggingface_hub import hf_hub_download
	# from pydub import AudioSegment
	# import gradio as gr
	# import time

	# # Thêm thư mục src vào sys.path
	# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

	# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
	# current_dir = os.path.dirname(os.path.abspath(__file__))
	# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
	# tests_dir = os.path.join(current_dir, "tests")

	# print(f"Infer CLI path: {infer_cli_path}")
	# print(f"Does infer_cli.py exist? {os.path.exists(infer_cli_path)}")
	# if not os.path.exists(infer_cli_path):
	# return None, "File infer_cli.py không tồn tại!"

	# try:
	# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
	# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
	# except Exception as e:
	# return None, f"Lỗi khi tải model/vocab: {str(e)}"

	# os.environ['PYTHONIOENCODING'] = 'utf-8'
	# env = os.environ.copy()
	# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))

	# command = [
	# sys.executable,
	# infer_cli_path,
	# "--model", model,
	# "--ref_audio", ref_audio_path,
	# "--ref_text", ref_text,
	# "--gen_text", gen_text,
	# "--speed", str(speed),
	# "--vocoder_name", vocoder_name,
	# "--vocab_file", vocab_file,
	# "--ckpt_file", ckpt_file
	# ]

	# print(f"Running command: {' '.join(command)}")
	# try:
	# result = subprocess.run(
	# command,
	# check=True,
	# capture_output=True,
	# text=True,
	# env=env
	# )
	# print("Subprocess stdout:", result.stdout)
	# if os.path.exists(tests_dir):
	# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
	# if wav_files:
	# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)))
	# output_wav = os.path.join(tests_dir, latest_wav)
	# audio = AudioSegment.from_wav(output_wav)
	# output_mp3 = os.path.join(tests_dir, "output.mp3")
	# audio.export(output_mp3, format="mp3")
	# return output_mp3, "Suy luận thành công!"
	# return None, "Không tìm thấy file âm thanh trong thư mục tests"
	# except subprocess.CalledProcessError as e:
	# return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
	# except Exception as e:
	# return None, str(e)

	# def generate_speech(ref_audio, ref_text, gen_text, speed, model):
	# if ref_audio is None:
	# return None, "Vui lòng tải lên file audio tham chiếu!"
	# # ref_audio là đường dẫn file, tải bằng AudioSegment
	# audio_segment = AudioSegment.from_file(ref_audio)
	# audio_segment = audio_segment.set_channels(1) # Chuyển sang mono
	# ref_audio_path = f"temp_ref_{int(time.time())}.wav"
	# audio_segment.export(ref_audio_path, format="wav")

	# output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
	# os.remove(ref_audio_path)

	# if output_mp3 and os.path.exists(output_mp3):
	# return output_mp3, message
	# return None, message

	# interface = gr.Interface(
	# fn=generate_speech,
	# inputs=[
	# gr.Audio(type="filepath", label="Tải lên file audio tham chiếu (.wav hoặc .mp3)"),
	# gr.Textbox(label="Text tham chiếu", placeholder="Nhập text của audio tham chiếu"),
	# gr.Textbox(label="Text cần sinh", placeholder="Nhập text bạn muốn sinh"),
	# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Tốc độ"),
	# gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="Mô hình")
	# ],
	# outputs=[
	# gr.Audio(type="filepath", label="Kết quả audio (.mp3)"),
	# gr.Textbox(label="Trạng thái")
	# ],
	# title="F5-TTS Suy luận",
	# description="Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS."
	# )

	# if __name__ == "__main__":
	# interface.launch(server_name="0.0.0.0", server_port=7860)



	# import os
	# import sys
	# import subprocess
	# from huggingface_hub import hf_hub_download
	# from pydub import AudioSegment
	# import gradio as gr
	# import time

	# # Thêm thư mục src vào sys.path
	# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

	# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
	# current_dir = os.path.dirname(os.path.abspath(__file__))
	# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
	# tests_dir = os.path.join(current_dir, "tests")

	# # Debug: In đường dẫn để kiểm tra
	# print(f"Infer CLI path: {infer_cli_path}")
	# print(f"Tests dir: {tests_dir}")

	# # Tải file từ Hugging Face Hub
	# try:
	# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
	# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")
	# except Exception as e:
	# return None, f"Lỗi khi tải model/vocab từ Hugging Face: {str(e)}"

	# os.environ['PYTHONIOENCODING'] = 'utf-8'
	# env = os.environ.copy()
	# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))

	# command = [
	# sys.executable,
	# infer_cli_path,
	# "--model", model,
	# "--ref_audio", ref_audio_path,
	# "--ref_text", ref_text,
	# "--gen_text", gen_text,
	# "--speed", str(speed),
	# "--vocoder_name", vocoder_name,
	# "--vocab_file", vocab_file,
	# "--ckpt_file", ckpt_file
	# ]

	# print(f"Running command: {' '.join(command)}")
	# try:
	# result = subprocess.run(
	# command,
	# check=True,
	# capture_output=True,
	# text=True,
	# env=env
	# )
	# print("Subprocess stdout:", result.stdout)
	# if os.path.exists(tests_dir):
	# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
	# if wav_files:
	# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)))
	# output_wav = os.path.join(tests_dir, latest_wav)
	# audio = AudioSegment.from_wav(output_wav)
	# output_mp3 = os.path.join(tests_dir, "output.mp3")
	# audio.export(output_mp3, format="mp3")
	# return output_mp3, "Suy luận thành công!"

	# return None, "Không tìm thấy file âm thanh trong thư mục tests"
	# except subprocess.CalledProcessError as e:
	# return None, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
	# except Exception as e:
	# return None, str(e)

	# def generate_speech(ref_audio, ref_text, gen_text, speed, model):
	# if ref_audio is None:
	# return None, "Vui lòng tải lên file audio tham chiếu!"
	# ref_audio_path = f"temp_ref_{int(time.time())}.wav"
	# ref_audio.convert_audio_channels(1) # Chuyển sang mono
	# ref_audio.export(ref_audio_path, format="wav")

	# output_mp3, message = run_f5_tts(ref_audio_path, ref_text, gen_text, model, float(speed))
	# os.remove(ref_audio_path)

	# if output_mp3 and os.path.exists(output_mp3):
	# return output_mp3, message
	# return None, message

	# interface = gr.Interface(
	# fn=generate_speech,
	# inputs=[
	# gr.Audio(type="filepath", label="Tải lên file audio tham chiếu (.wav hoặc .mp3)"),
	# gr.Textbox(label="Text tham chiếu", placeholder="Nhập text của audio tham chiếu"),
	# gr.Textbox(label="Text cần sinh", placeholder="Nhập text bạn muốn sinh"),
	# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Tốc độ"),
	# gr.Dropdown(choices=["F5TTS_Base"], value="F5TTS_Base", label="Mô hình")
	# ],
	# outputs=[
	# gr.Audio(type="filepath", label="Kết quả audio (.mp3)"),
	# gr.Textbox(label="Trạng thái")
	# ],
	# title="F5-TTS Suy luận",
	# description="Tải lên audio tham chiếu, nhập text, và sinh audio mới với F5-TTS."
	# )

	# if __name__ == "__main__":
	# interface.launch(server_name="0.0.0.0", server_port=7860)




	# from flask import Flask, request, send_file
	# import subprocess
	# import os
	# import sys
	# from huggingface_hub import hf_hub_download
	# from pydub import AudioSegment

	# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

	# app = Flask(__name__)

	# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
	# current_dir = os.path.dirname(os.path.abspath(__file__))
	# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
	# tests_dir = os.path.join(current_dir, "tests")

	# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
	# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")

	# os.environ['PYTHONIOENCODING'] = 'utf-8'
	# env = os.environ.copy()
	# env['PYTHONPATH'] = os.path.abspath(os.path.join(current_dir, 'src'))

	# command = [
	# sys.executable,
	# infer_cli_path,
	# "--model", model,
	# "--ref_audio", ref_audio_path,
	# "--ref_text", ref_text,
	# "--gen_text", gen_text,
	# "--speed", str(speed),
	# "--vocoder_name", vocoder_name,
	# "--vocab_file", vocab_file,
	# "--ckpt_file", ckpt_file
	# ]

	# try:
	# result = subprocess.run(
	# command,
	# check=True,
	# capture_output=True,
	# text=True,
	# encoding='utf-8',
	# env=env
	# )

	# if os.path.exists(tests_dir):
	# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
	# if wav_files:
	# latest_wav = max(wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x)))
	# output_wav = os.path.join(tests_dir, latest_wav)
	# audio = AudioSegment.from_wav(output_wav)
	# output_mp3 = os.path.join(tests_dir, "output.mp3")
	# audio.export(output_mp3, format="mp3")
	# return True, output_mp3

	# return False, "Không tìm thấy file âm thanh trong thư mục tests"
	# except subprocess.CalledProcessError as e:
	# return False, f"Lỗi khi chạy infer_cli.py: {e.stderr}"
	# except Exception as e:
	# return False, str(e)

	# @app.route('/')
	# def home():
	# return "F5-TTS API is running. Use POST /api/generate to generate audio."

	# @app.route('/api/generate', methods=['POST'])
	# def generate_speech():
	# if 'ref_audio' not in request.files:
	# return {"error": "Missing ref_audio"}, 400
	# ref_audio = request.files['ref_audio']
	# ref_text = request.form.get('ref_text', '')
	# gen_text = request.form.get('gen_text', '')
	# model = request.form.get('model', 'F5TTS_Base')
	# speed = float(request.form.get('speed', 1.2))

	# import time
	# ref_audio_path = f"temp_ref_{int(time.time())}.wav"
	# ref_audio.save(ref_audio_path)

	# success, result = run_f5_tts(ref_audio_path, ref_text, gen_text, model, speed)
	# os.remove(ref_audio_path)

	# if success:
	# return send_file(result, mimetype='audio/mpeg')
	# else:
	# return {"error": result}, 500

	# if __name__ == "__main__":
	# port = int(os.environ.get("PORT", 7860))
	# app.run(host="0.0.0.0", port=port, debug=False)



	# from flask import Flask, request, send_file
	# import subprocess
	# import os
	# import sys
	# from huggingface_hub import hf_hub_download
	# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

	# app = Flask(__name__)

	# # =========================
	# # Hàm chạy F5-TTS
	# # =========================
	# def run_f5_tts(ref_audio_path, ref_text, gen_text, model="F5TTS_Base", speed=1.2, vocoder_name="vocos"):
	# current_dir = os.path.dirname(os.path.abspath(__file__))
	# infer_cli_path = os.path.join(current_dir, "src", "f5_tts", "infer", "infer_cli.py")
	# tests_dir = os.path.join(current_dir, "tests")

	# # Dùng huggingface_hub để tải file model và vocab từ repo 'nguyensu27/TTS'
	# vocab_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="vocab.txt")
	# ckpt_file = hf_hub_download(repo_id="nguyensu27/TTS", filename="model_last.pt")

	# os.environ['PYTHONIOENCODING'] = 'utf-8'

	# command = [
	# sys.executable,
	# infer_cli_path,
	# "--model", model,
	# "--ref_audio", ref_audio_path,
	# "--ref_text", ref_text,
	# "--gen_text", gen_text,
	# "--speed", str(speed),
	# "--vocoder_name", vocoder_name,
	# "--vocab_file", vocab_file,
	# "--ckpt_file", ckpt_file
	# ]

	# try:
	# result = subprocess.run(
	# command,
	# check=True,
	# capture_output=True,
	# text=True,
	# encoding='utf-8'
	# )

	# if os.path.exists(tests_dir):
	# wav_files = [f for f in os.listdir(tests_dir) if f.endswith('.wav')]
	# if wav_files:
	# latest_wav = max(
	# wav_files, key=lambda x: os.path.getmtime(os.path.join(tests_dir, x))
	# )
	# output_file = os.path.join(tests_dir, latest_wav)
	# return True, output_file

	# return False, "Không tìm thấy file âm thanh trong thư mục tests"
	# except subprocess.CalledProcessError as e:
	# return False, e.stderr
	# except Exception as e:
	# return False, str(e)


	# # =========================
	# # Routes
	# # =========================
	# @app.route('/')
	# def home():
	# return "F5-TTS API is running. Use POST /api/generate to generate audio."


	# @app.route('/api/generate', methods=['POST'])
	# def generate_speech():
	# if 'ref_audio' not in request.files:
	# return {"error": "Missing ref_audio"}, 400
	# ref_audio = request.files['ref_audio']
	# ref_text = request.form.get('ref_text', '')
	# gen_text = request.form.get('gen_text', '')
	# model = request.form.get('model', 'F5TTS_Base')
	# speed = float(request.form.get('speed', 1.2))

	# ref_audio_path = 'temp_ref.wav'
	# ref_audio.save(ref_audio_path)

	# success, result = run_f5_tts(ref_audio_path, ref_text, gen_text, model, speed)
	# os.remove(ref_audio_path)

	# if success:
	# return send_file(result, mimetype='audio/wav')
	# else:
	# return {"error": result}, 500


	# # =========================
	# # Main
	# # =========================
	# if __name__ == "__main__":
	# port = int(os.environ.get("PORT", 7860))
	# app.run(host="0.0.0.0", port=port, debug=False)