coqui2 / app.py
Adoetz's picture
Rename 2. gradio.py to app.py
9d5c5f9 verified
raw
history blame
5.47 kB
from TTS.api import TTS
import numpy as np
import torch
import os
import gradio as gr
from scipy.io.wavfile import write as write_wav
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the XTTS v2 model once at import time and move it to the chosen device.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
)
tts.to(device)
def list_wav_files():
    """Return the names of the ``.wav`` files in the ``clone`` folder.

    The names are sorted so a file keeps the same position on every call:
    callers in this file address entries by list index, and ``os.listdir``
    makes no promise about ordering.

    Returns:
        A sorted list of file names (not paths). The list is empty when the
        folder is missing, is not a directory, or holds no ``.wav`` files;
        a message is printed in each of those cases.
    """
    clone_folder = "clone"
    # isdir (rather than exists) also rejects a plain file named "clone",
    # which os.listdir would otherwise raise on.
    if not os.path.isdir(clone_folder):
        print(f"Error: Folder '{clone_folder}' not found.")
        return []

    wav_files = sorted(
        name for name in os.listdir(clone_folder) if name.endswith(".wav")
    )
    if not wav_files:
        print(f"No .wav files found in '{clone_folder}'.")
    return wav_files
def _pick_clone_file(wav_file_choice, wav_files):
    """Resolve a ``"<index>: <file name>"`` choice to ``(file_name, error)``.

    Exactly one element of the returned pair is ``None``.
    """
    try:
        index_text, _, name_text = wav_file_choice.partition(":")
        chosen_name = name_text.strip()
        # Prefer the file name carried in the choice: the choice labels are
        # built once when the UI is created, so the leading index can be
        # stale if files were added to the folder afterwards.
        if chosen_name in wav_files:
            return chosen_name, None
        # No usable name (or the file is gone): fall back to the index.
        wav_file_index = int(index_text.strip())
    except (ValueError, AttributeError):
        return None, "Error: Invalid .wav file choice."
    if not 0 <= wav_file_index < len(wav_files):
        return None, "Error: Invalid .wav file index."
    return wav_files[wav_file_index], None


def generate_tts_audio(
    text,
    voice_choice,
    speaker_name=None,
    wav_file_choice=None,
    uploaded_file=None,
    recorded_audio=None,
    language="en",
):
    """Synthesize ``text`` to speech and save the result to ``output.wav``.

    Args:
        text: The text to speak. Must contain non-whitespace characters.
        voice_choice: ``"existing_speaker"`` to use a named speaker, or
            ``"voice_cloning"`` to imitate a reference recording.
        speaker_name: Speaker name; required for ``"existing_speaker"``.
        wav_file_choice: A ``"<index>: <file name>"`` choice referring to a
            file in the ``clone`` folder.
        uploaded_file: Path of an uploaded reference recording.
        recorded_audio: Path of a microphone recording.
        language: Language code passed to the model. Defaults to ``"en"``.

    For voice cloning the reference is picked in this order: the recording,
    then the upload, then the ``clone`` folder choice.

    Returns:
        A ``(status_message, audio)`` pair. On success ``audio`` is a
        ``(sample_rate, float32 array)`` tuple; on any validation error it is
        ``None`` and the message starts with ``"Error:"``.

    Side effects:
        On success, writes ``output.wav`` to the current directory,
        overwriting any previous output.
    """
    # Reject empty input up front instead of handing it to the model.
    if not text or not text.strip():
        return "Error: Text is required to generate speech.", None

    # Determine the reference audio file.
    if voice_choice == "existing_speaker":
        if not speaker_name:
            return "Error: Speaker name is required for existing speaker.", None
        reference_audio = None
    elif voice_choice == "voice_cloning":
        if recorded_audio:
            reference_audio = recorded_audio
        elif uploaded_file:
            reference_audio = uploaded_file
        elif wav_file_choice:
            wav_files = list_wav_files()
            if not wav_files:
                return "Error: No .wav files found for voice cloning.", None
            file_name, error = _pick_clone_file(wav_file_choice, wav_files)
            if error:
                return error, None
            reference_audio = os.path.join("clone", file_name)
        else:
            return "Error: No reference audio provided for voice cloning.", None
    else:
        return "Error: Invalid voice choice.", None

    # Generate the audio: a reference recording means voice cloning,
    # otherwise the named speaker is used.
    synthesis_kwargs = {"text": text, "language": language}
    if reference_audio:
        synthesis_kwargs["speaker_wav"] = reference_audio
    else:
        synthesis_kwargs["speaker"] = speaker_name
    audio = tts.tts(**synthesis_kwargs)

    # Convert to a float32 NumPy array for the WAV writer and for the caller.
    audio_np = np.array(audio, dtype=np.float32)
    sample_rate = tts.synthesizer.output_sample_rate

    output_file = "output.wav"
    write_wav(output_file, sample_rate, audio_np)
    return "Audio generated successfully!", (sample_rate, audio_np)
def create_gradio_interface():
    """Build the Gradio Blocks UI for speech generation and return it.

    The UI is only constructed here; launching it is left to the caller.
    The clone-folder dropdown is filled from ``list_wav_files()`` at the
    moment this function runs.
    """
    clone_choices = [
        f"{index}: {name}" for index, name in enumerate(list_wav_files())
    ]

    with gr.Blocks() as demo:
        gr.Markdown("# TTS Streaming System")

        with gr.Row():
            text_input = gr.Textbox(label="Enter text to generate speech", lines=3)

        with gr.Row():
            voice_choice = gr.Radio(
                choices=["existing_speaker", "voice_cloning"],
                label="Select voice type",
            )

        # Voice-specific inputs start hidden; the radio selection reveals them.
        with gr.Row():
            speaker_name = gr.Textbox(
                label="Enter the speaker name (e.g., 'Ana Florence')",
                visible=False,
            )
            wav_file_choice = gr.Dropdown(
                choices=clone_choices,
                label="Select a .wav file for cloning",
                visible=False,
            )
            uploaded_file = gr.Audio(
                label="Upload your own .wav file for cloning",
                type="filepath",
                visible=False,
            )
            recorded_audio = gr.Microphone(
                label="Record your voice for cloning",
                type="filepath",
                visible=False,
            )

        with gr.Row():
            submit_button = gr.Button("Generate Speech")

        with gr.Row():
            output_text = gr.Textbox(label="Output", interactive=False)
            output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)

        def update_components(choice):
            """Show only the inputs that belong to the selected voice type."""
            show_speaker = choice == "existing_speaker"
            show_cloning = choice == "voice_cloning"
            # Order matches the `outputs` list wired up below.
            return (
                gr.update(visible=show_speaker),
                gr.update(visible=show_cloning),
                gr.update(visible=show_cloning),
                gr.update(visible=show_cloning),
            )

        voice_choice.change(
            update_components,
            inputs=voice_choice,
            outputs=[speaker_name, wav_file_choice, uploaded_file, recorded_audio],
        )

        submit_button.click(
            generate_tts_audio,
            inputs=[
                text_input,
                voice_choice,
                speaker_name,
                wav_file_choice,
                uploaded_file,
                recorded_audio,
            ],
            outputs=[output_text, output_audio],
        )

    return demo
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    demo = create_gradio_interface()
    # share=True asks Gradio to create a public link in addition to the local one.
    demo.launch(share=True)