|
from TTS.api import TTS |
|
import numpy as np |
|
import torch |
|
import os |
|
import gradio as gr |
|
from scipy.io.wavfile import write as write_wav |
|
|
|
|
|
# Run on the GPU when torch reports CUDA support, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Module-level cache written by initialize_or_update_tts(): the loaded TTS
# object and the model name it was loaded for. Both start out unset.
global_tts = None
current_model_name = None
|
|
|
|
|
def list_available_models():
    """Return the model list reported by the TTS library's model manager.

    A bare ``TTS`` object is created, its ``list_models()`` call yields the
    model manager, and that manager's own ``list_models()`` result is
    returned unchanged.
    """
    return TTS().list_models().list_models()
|
|
|
|
|
def is_multilingual(model_name):
    """Report whether *model_name* names a multilingual model.

    The decision is a case-insensitive substring test: the name must
    contain ``"multilingual"`` or ``"xtts"``.
    """
    lowered = model_name.lower()
    return any(marker in lowered for marker in ("multilingual", "xtts"))
|
|
|
|
|
def get_available_speakers(tts):
    """Return the speaker names exposed by *tts*, or ``None`` if unavailable.

    Looks for a truthy ``speaker_manager`` on ``tts.synthesizer`` and returns
    its ``speaker_names``. When there is no such manager a warning is printed
    and ``None`` is returned. Any exception raised along the way is printed
    and also results in ``None``.
    """
    try:
        manager = getattr(tts.synthesizer, 'speaker_manager', None)
        if not manager:
            print("Warning: No speaker manager found in the model. Using voice cloning only.")
            return None
        return manager.speaker_names
    except Exception as e:
        print(f"Error fetching speakers: {e}")
        return None
|
|
|
|
|
def list_wav_files():
    """Return the sorted names of the ``.wav`` files inside the ``clone`` folder.

    Returns:
        A list of file names (not paths) ending in ``.wav``, in sorted order.
        An empty list is returned, after printing a message, when ``clone``
        is missing, is not a directory, or contains no ``.wav`` files.
    """
    clone_folder = "clone"
    # isdir (rather than exists) so that a plain file named "clone" is
    # reported as missing instead of making os.listdir() raise.
    if not os.path.isdir(clone_folder):
        print(f"Error: Folder '{clone_folder}' not found.")
        return []

    # os.listdir() gives entries in arbitrary order. Callers build
    # "<index>: <name>" labels from this list and index into a later call,
    # so the order must be stable from one call to the next.
    wav_files = sorted(f for f in os.listdir(clone_folder) if f.endswith(".wav"))
    if not wav_files:
        print(f"No .wav files found in '{clone_folder}'.")

    return wav_files
|
|
|
|
|
def initialize_or_update_tts(model_name):
    """Return the cached TTS object, loading it when *model_name* changes.

    The loaded model and its name are kept in the module globals
    ``global_tts`` and ``current_model_name``. A new model is constructed
    only when nothing is cached yet or a different name is requested.

    Args:
        model_name: Name of the model to pass to ``TTS(model_name=...)``.

    Returns:
        The TTS object for *model_name*, or ``None`` when no name was given
        or loading failed (the error is printed). On failure the previously
        cached model, if any, is left untouched.
    """
    global global_tts, current_model_name

    if not model_name:
        print("Error loading model: no model name provided.")
        return None

    if global_tts is not None and model_name == current_model_name:
        return global_tts

    print(f"Loading model: {model_name}")
    try:
        # Build into a local first so a failure part-way through cannot
        # leave the cache holding a model under the wrong name.
        tts = TTS(model_name=model_name, progress_bar=True)

        if hasattr(tts.synthesizer, 'phonemizer'):
            tts.synthesizer.phonemizer = "gruut"
            print("Using gruut phonemizer.")

        tts.to(device)
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    # Commit both pieces of cache state together.
    global_tts = tts
    current_model_name = model_name
    return global_tts
|
|
|
|
|
def _resolve_clone_wav(wav_file_choice):
    """Map a ``"<index>: <filename>"`` dropdown entry to a path under ``clone``.

    Returns ``(path, None)`` on success or ``(None, error_message)`` on failure.
    """
    wav_files = list_wav_files()
    if not wav_files:
        return None, "Error: No .wav files found for voice cloning."

    try:
        index_text, _, file_name = wav_file_choice.partition(":")
        file_name = file_name.strip()
        if file_name:
            # Trust the file name the user actually saw in the label: the
            # folder may have changed since the labels were numbered, so the
            # index alone could point at a different file.
            if file_name not in wav_files:
                return None, "Error: Invalid .wav file choice."
            return os.path.join("clone", file_name), None
        wav_file_index = int(index_text.strip())
    except (ValueError, AttributeError):
        return None, "Error: Invalid .wav file choice."

    if wav_file_index < 0 or wav_file_index >= len(wav_files):
        return None, "Error: Invalid .wav file index."
    return os.path.join("clone", wav_files[wav_file_index]), None


def generate_tts_audio(text, model_name, voice_choice, speaker_name=None, wav_file_choice=None, uploaded_file=None, recorded_audio=None):
    """Synthesize *text* with the selected model and voice.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        model_name: Model name handed to ``initialize_or_update_tts``.
        voice_choice: ``"existing_speaker"`` or ``"voice_cloning"``.
        speaker_name: Speaker to use when ``voice_choice`` is
            ``"existing_speaker"`` (required in that mode).
        wav_file_choice: ``"<index>: <filename>"`` entry naming a file in the
            ``clone`` folder; used for cloning when nothing was recorded or
            uploaded.
        uploaded_file: Reference audio for cloning; takes precedence over
            ``wav_file_choice``.
        recorded_audio: Reference audio for cloning; takes precedence over
            ``uploaded_file``.

    Returns:
        ``(message, audio)``. On success ``audio`` is
        ``(sample_rate, float32 numpy array)`` and the same samples are also
        written to ``output.wav``. On any failure ``audio`` is ``None`` and
        ``message`` describes the problem.
    """
    try:
        # Validate the cheap inputs first so an unusable request never
        # triggers a model load.
        if not text or not text.strip():
            return "Error: Text is required.", None

        reference_audio = None
        if voice_choice == "existing_speaker":
            if not speaker_name:
                return "Error: Speaker name is required for existing speaker.", None
        elif voice_choice == "voice_cloning":
            if recorded_audio:
                reference_audio = recorded_audio
            elif uploaded_file:
                reference_audio = uploaded_file
            elif wav_file_choice:
                reference_audio, error = _resolve_clone_wav(wav_file_choice)
                if error:
                    return error, None
            else:
                return "Error: No reference audio provided for voice cloning.", None
        else:
            return "Error: Invalid voice choice.", None

        tts = initialize_or_update_tts(model_name)
        if tts is None:
            return "Error: Failed to load the TTS model.", None

        # One synthesis call; only the keyword arguments differ by mode.
        synthesis_kwargs = {"text": text}
        if reference_audio:
            synthesis_kwargs["speaker_wav"] = reference_audio
        else:
            synthesis_kwargs["speaker"] = speaker_name
        if is_multilingual(model_name):
            synthesis_kwargs["language"] = "en"
        audio = tts.tts(**synthesis_kwargs)

        audio_np = np.array(audio, dtype=np.float32)
        sample_rate = tts.synthesizer.output_sample_rate

        # NOTE: the output file name is fixed, so overlapping requests
        # overwrite each other's file; the returned array is unaffected.
        output_file = "output.wav"
        write_wav(output_file, sample_rate, audio_np)

        return "Audio generated successfully!", (sample_rate, audio_np)
    except Exception as e:
        return f"Error generating audio: {e}", None
|
|
|
|
|
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the TTS app.

    The model list and the ``clone`` folder listing are read once, when the
    interface is built. The layout holds a text box, a model dropdown, a
    voice-type radio, the four voice-source widgets (speaker dropdown,
    clone-file dropdown, upload, microphone), a submit button, and the
    status/audio outputs.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    available_models = list_available_models()
    wav_files = list_wav_files()
    wav_file_choices = [f"{i}: {file}" for i, file in enumerate(wav_files)]

    with gr.Blocks() as demo:
        gr.Markdown("# TTS Streaming System")
        with gr.Row():
            text_input = gr.Textbox(label="Enter text to generate speech", lines=3)
        with gr.Row():
            model_name = gr.Dropdown(
                choices=available_models,
                label="Select TTS Model",
                value=available_models[0] if available_models else None,
            )
        with gr.Row():
            voice_choice = gr.Radio(
                choices=["existing_speaker", "voice_cloning"],
                label="Select voice type",
                value="existing_speaker",
            )
        with gr.Row():
            speaker_name = gr.Dropdown(
                label="Select a speaker",
                visible=True,
            )
            wav_file_choice = gr.Dropdown(
                choices=wav_file_choices,
                label="Select a .wav file for cloning",
                visible=False,
            )
            uploaded_file = gr.Audio(
                label="Upload your own .wav file for cloning",
                type="filepath",
                visible=False,
            )
            recorded_audio = gr.Microphone(
                label="Record your voice for cloning",
                type="filepath",
                visible=False,
            )
        with gr.Row():
            submit_button = gr.Button("Generate Speech")
        with gr.Row():
            output_text = gr.Textbox(label="Output", interactive=False)
            output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)

        def update_components(choice, model_name):
            """Return updates for (speaker, clone file, upload, microphone)."""
            tts = initialize_or_update_tts(model_name)
            available_speakers = get_available_speakers(tts)

            if choice == "existing_speaker":
                return (
                    gr.update(visible=True, choices=available_speakers if available_speakers else []),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                )
            elif choice == "voice_cloning":
                return (
                    gr.update(visible=False),
                    # Only offer the clone-file dropdown when there is
                    # something to pick from.
                    gr.update(visible=bool(wav_files)),
                    gr.update(visible=True),
                    gr.update(visible=True),
                )
            else:
                return (
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                )

        selector_inputs = [voice_choice, model_name]
        voice_widgets = [speaker_name, wav_file_choice, uploaded_file, recorded_audio]

        voice_choice.change(update_components, inputs=selector_inputs, outputs=voice_widgets)
        model_name.change(update_components, inputs=selector_inputs, outputs=voice_widgets)
        # The change events above only fire after the user touches a
        # selector, so without this the speaker dropdown would start out
        # visible but empty for the default model and voice type.
        demo.load(update_components, inputs=selector_inputs, outputs=voice_widgets)

        submit_button.click(
            generate_tts_audio,
            inputs=[text_input, model_name, voice_choice, speaker_name, wav_file_choice, uploaded_file, recorded_audio],
            outputs=[output_text, output_audio],
            concurrency_limit=10,
        )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the interface, then launch it with
    # sharing enabled.
    demo = create_gradio_interface()
    demo.launch(share=True)