|
from TTS.api import TTS
|
|
import numpy as np
|
|
import torch
|
|
import os
|
|
import gradio as gr
|
|
from scipy.io.wavfile import write as write_wav
|
|
|
|
|
|
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
# Build the XTTS v2 model once at import time and move it to the selected
# device. The request handler below reuses this single module-level instance.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
).to(device)
|
|
|
|
|
|
def list_wav_files():
    """Return the names of the ``.wav`` files found in the ``clone`` folder.

    The names are sorted so that a file's position is stable between calls;
    callers in this file turn positions into ``"<index>: <name>"`` labels and
    later map those labels back to files.

    Returns:
        list[str]: Bare file names (no directory part) whose extension is
        ``.wav`` in any letter case. An empty list is returned, after
        printing a diagnostic, when the folder is missing or contains no
        matching file.
    """
    clone_folder = "clone"

    # isdir (not exists): a stray regular file called "clone" must be
    # reported as a missing folder instead of making os.listdir raise.
    if not os.path.isdir(clone_folder):
        print(f"Error: Folder '{clone_folder}' not found.")
        return []

    # os.listdir returns entries in arbitrary order, so sort explicitly.
    wav_files = sorted(
        name
        for name in os.listdir(clone_folder)
        if name.lower().endswith(".wav")
    )

    if not wav_files:
        print(f"No .wav files found in '{clone_folder}'.")

    return wav_files
|
|
|
|
|
|
def _select_clone_reference(wav_file_choice, uploaded_file, recorded_audio):
    """Pick the reference recording used for voice cloning.

    Priority is: microphone recording, then uploaded file, then a file from
    the ``clone`` folder selected through a ``"<index>: <name>"`` label.

    Returns:
        tuple: ``(path, None)`` on success, or ``(None, error_message)``.
    """
    if recorded_audio:
        return recorded_audio, None
    if uploaded_file:
        return uploaded_file, None
    if not wav_file_choice:
        return None, "Error: No reference audio provided for voice cloning."

    wav_files = list_wav_files()
    if not wav_files:
        return None, "Error: No .wav files found for voice cloning."

    try:
        index_part, _, name_part = wav_file_choice.partition(":")
        chosen_name = name_part.strip()
        # The label was built from an earlier directory listing. If the
        # folder changed since then the index may point at another file, so
        # trust the file name embedded in the label whenever it still exists.
        if chosen_name in wav_files:
            return os.path.join("clone", chosen_name), None
        wav_file_index = int(index_part.strip())
    except (ValueError, AttributeError):
        return None, "Error: Invalid .wav file choice."

    if wav_file_index < 0 or wav_file_index >= len(wav_files):
        return None, "Error: Invalid .wav file index."
    return os.path.join("clone", wav_files[wav_file_index]), None


def generate_tts_audio(text, voice_choice, speaker_name=None, wav_file_choice=None,
                       uploaded_file=None, recorded_audio=None, language="en"):
    """Synthesize speech for *text* with the module-level ``tts`` model.

    Args:
        text: Text to speak. Must contain at least one non-whitespace
            character.
        voice_choice: ``"existing_speaker"`` to use a named speaker, or
            ``"voice_cloning"`` to clone the voice of a reference recording.
        speaker_name: Speaker name; required for ``"existing_speaker"``.
            Surrounding whitespace is ignored.
        wav_file_choice: ``"<index>: <name>"`` label selecting a file from
            the ``clone`` folder (lowest-priority cloning source).
        uploaded_file: Path of an uploaded reference recording.
        recorded_audio: Path of a microphone recording (highest-priority
            cloning source).
        language: Language code forwarded to ``tts.tts``. Defaults to
            ``"en"``.

    Returns:
        tuple: ``(status_message, audio)``. On success ``audio`` is
        ``(sample_rate, float32 numpy array)`` and the same samples are also
        written to ``output.wav`` in the current working directory. On a
        validation failure ``audio`` is ``None`` and the message starts with
        ``"Error:"``.
    """
    if voice_choice == "existing_speaker":
        if isinstance(speaker_name, str):
            speaker_name = speaker_name.strip()
        if not speaker_name:
            return "Error: Speaker name is required for existing speaker.", None
        reference_audio = None
    elif voice_choice == "voice_cloning":
        reference_audio, error = _select_clone_reference(
            wav_file_choice, uploaded_file, recorded_audio
        )
        if error:
            return error, None
    else:
        return "Error: Invalid voice choice.", None

    # Checked after the voice validation so that every pre-existing error
    # message is still reported for the same inputs as before.
    if not text or not text.strip():
        return "Error: Text is required to generate speech.", None

    if reference_audio:
        # Voice cloning: condition the model on the reference recording.
        audio = tts.tts(text=text, speaker_wav=reference_audio, language=language)
    else:
        # Named speaker known to the model.
        audio = tts.tts(text=text, speaker=speaker_name, language=language)

    audio_np = np.array(audio, dtype=np.float32)
    sample_rate = tts.synthesizer.output_sample_rate

    # Keep a copy of the latest result on disk.
    output_file = "output.wav"
    write_wav(output_file, sample_rate, audio_np)

    return "Audio generated successfully!", (sample_rate, audio_np)
|
|
|
|
|
|
def create_gradio_interface():
    """Assemble the Gradio Blocks UI for speech generation and return it.

    The layout is a text box, a voice-type radio, four initially hidden
    voice inputs (speaker name, ``clone``-folder dropdown, file upload,
    microphone), a submit button and the status/audio outputs. Changing the
    radio toggles which voice inputs are visible; clicking the button calls
    ``generate_tts_audio`` with all six inputs.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    # Dropdown labels have the form "<index>: <file name>".
    wav_file_choices = [
        f"{position}: {filename}"
        for position, filename in enumerate(list_wav_files())
    ]

    def update_components(choice):
        # One gr.update per voice input, in the order of the outputs list
        # wired below: speaker name, dropdown, upload, microphone.
        show_speaker = choice == "existing_speaker"
        show_cloning = choice == "voice_cloning"
        return (
            gr.update(visible=show_speaker),
            gr.update(visible=show_cloning),
            gr.update(visible=show_cloning),
            gr.update(visible=show_cloning),
        )

    with gr.Blocks() as demo:
        gr.Markdown("# TTS Streaming System")

        with gr.Row():
            text_input = gr.Textbox(label="Enter text to generate speech", lines=3)

        with gr.Row():
            voice_choice = gr.Radio(
                choices=["existing_speaker", "voice_cloning"],
                label="Select voice type",
            )

        # All voice inputs start hidden until a voice type is picked.
        with gr.Row():
            speaker_name = gr.Textbox(
                label="Enter the speaker name (e.g., 'Ana Florence')",
                visible=False,
            )
            wav_file_choice = gr.Dropdown(
                choices=wav_file_choices,
                label="Select a .wav file for cloning",
                visible=False,
            )
            uploaded_file = gr.Audio(
                label="Upload your own .wav file for cloning",
                type="filepath",
                visible=False,
            )
            recorded_audio = gr.Microphone(
                label="Record your voice for cloning",
                type="filepath",
                visible=False,
            )

        with gr.Row():
            submit_button = gr.Button("Generate Speech")

        with gr.Row():
            output_text = gr.Textbox(label="Output", interactive=False)
            output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)

        voice_inputs = [speaker_name, wav_file_choice, uploaded_file, recorded_audio]

        voice_choice.change(update_components, inputs=voice_choice, outputs=voice_inputs)

        submit_button.click(
            generate_tts_audio,
            inputs=[text_input, voice_choice, *voice_inputs],
            outputs=[output_text, output_audio],
        )

    return demo
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    # share=True is passed through to launch(); per Gradio's documentation
    # this requests a public share link in addition to the local server.
    demo = create_gradio_interface()
    demo.launch(share=True)