import os

import gradio as gr
import numpy as np
import torch
from scipy.io.wavfile import write as write_wav
from TTS.api import TTS

# Folder scanned for reference recordings that can be used for voice cloning.
CLONE_FOLDER = "clone"
# File the most recently generated audio is written to.
OUTPUT_FILE = "output.wav"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the TTS object
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True)
tts.to(device)  # Use GPU if available


def list_wav_files():
    """Return the names of the .wav files in the clone folder, sorted.

    The listing is sorted because ``os.listdir`` returns entries in arbitrary
    order; a stable order keeps the numbered dropdown entries meaningful
    between calls. The extension check is case-insensitive.

    Returns:
        A list of file names (not paths). The list is empty, and a message is
        printed, when the folder is missing or holds no .wav files.
    """
    # isdir (rather than exists) so a regular file named "clone" is reported
    # as a missing folder instead of making os.listdir raise.
    if not os.path.isdir(CLONE_FOLDER):
        print(f"Error: Folder '{CLONE_FOLDER}' not found.")
        return []

    wav_files = sorted(
        name for name in os.listdir(CLONE_FOLDER) if name.lower().endswith(".wav")
    )
    if not wav_files:
        print(f"No .wav files found in '{CLONE_FOLDER}'.")
    return wav_files


def _resolve_clone_file(wav_file_choice):
    """Translate a dropdown entry into a path inside the clone folder.

    Args:
        wav_file_choice: Either ``"<index>: <file name>"`` (the format built
            by ``create_gradio_interface``) or a bare ``"<index>"``.

    Returns:
        A ``(path, error)`` pair; exactly one of the two is ``None``.
    """
    wav_files = list_wav_files()
    if not wav_files:
        return None, "Error: No .wav files found for voice cloning."

    try:
        index_text, _, label = wav_file_choice.partition(":")
        # Entries are rendered as "<index>: <name>", so drop the single
        # separator space without touching the name itself.
        file_name = label[1:] if label.startswith(" ") else label
        if file_name:
            # Resolve by name: the folder may have changed since the dropdown
            # was built, in which case the index could point at another file.
            if file_name not in wav_files:
                return None, "Error: Selected .wav file is no longer available."
            return os.path.join(CLONE_FOLDER, file_name), None
        index = int(index_text.strip())
    except (ValueError, AttributeError):
        return None, "Error: Invalid .wav file choice."

    if index < 0 or index >= len(wav_files):
        return None, "Error: Invalid .wav file index."
    return os.path.join(CLONE_FOLDER, wav_files[index]), None


def generate_tts_audio(text, voice_choice, speaker_name=None, wav_file_choice=None, uploaded_file=None, recorded_audio=None):
    """Synthesize ``text`` and save the result to ``output.wav``.

    Args:
        text: Text to speak.
        voice_choice: ``"existing_speaker"`` or ``"voice_cloning"``.
        speaker_name: Speaker passed to the model for ``"existing_speaker"``.
        wav_file_choice: Dropdown entry naming a file in the clone folder.
        uploaded_file: Path to an uploaded reference recording.
        recorded_audio: Path to a microphone recording.

    For voice cloning the reference is chosen in this order: recorded audio,
    uploaded file, clone-folder file.

    Returns:
        ``(message, audio)``. On success ``audio`` is
        ``(sample_rate, float32 numpy array)``; on any failure it is ``None``
        and ``message`` starts with ``"Error:"``.
    """
    # Determine the reference audio file
    if voice_choice == "existing_speaker":
        speaker_name = speaker_name.strip() if isinstance(speaker_name, str) else speaker_name
        if not speaker_name:
            return "Error: Speaker name is required for existing speaker.", None
        reference_audio = None
    elif voice_choice == "voice_cloning":
        if recorded_audio:
            reference_audio = recorded_audio
        elif uploaded_file:
            reference_audio = uploaded_file
        elif wav_file_choice:
            reference_audio, error = _resolve_clone_file(wav_file_choice)
            if error:
                return error, None
        else:
            return "Error: No reference audio provided for voice cloning.", None
    else:
        return "Error: Invalid voice choice.", None

    # Reject empty input here instead of letting the model fail on it.
    if not text or not text.strip():
        return "Error: Text is required.", None

    # Generate TTS audio. This is the UI boundary, so model failures (unknown
    # speaker, unreadable reference file, ...) are reported through the same
    # (message, None) convention as the validation errors above.
    try:
        if reference_audio:
            # Use reference voice (voice cloning)
            audio = tts.tts(text=text, speaker_wav=reference_audio, language="en")
        else:
            # Use existing speaker
            audio = tts.tts(text=text, speaker=speaker_name, language="en")
    except Exception as exc:
        print(f"Error: Speech generation failed: {exc}")
        return f"Error: Speech generation failed: {exc}", None

    # Convert audio to a NumPy array
    audio_np = np.array(audio, dtype=np.float32)
    sample_rate = tts.synthesizer.output_sample_rate

    # Save the audio as a .wav file
    write_wav(OUTPUT_FILE, sample_rate, audio_np)

    return "Audio generated successfully!", (sample_rate, audio_np)


def create_gradio_interface():
    """Build and return the Gradio Blocks UI for speech generation.

    The clone-folder dropdown is populated once, when this function runs.
    """
    wav_file_choices = [f"{i}: {file}" for i, file in enumerate(list_wav_files())]

    with gr.Blocks() as demo:
        gr.Markdown("# TTS Streaming System")

        with gr.Row():
            text_input = gr.Textbox(label="Enter text to generate speech", lines=3)

        with gr.Row():
            voice_choice = gr.Radio(choices=["existing_speaker", "voice_cloning"], label="Select voice type")

        with gr.Row():
            speaker_name = gr.Textbox(label="Enter the speaker name (e.g., 'Ana Florence')", visible=False)
            wav_file_choice = gr.Dropdown(choices=wav_file_choices, label="Select a .wav file for cloning", visible=False)
            uploaded_file = gr.Audio(label="Upload your own .wav file for cloning", type="filepath", visible=False)
            recorded_audio = gr.Microphone(label="Record your voice for cloning", type="filepath", visible=False)

        with gr.Row():
            submit_button = gr.Button("Generate Speech")

        with gr.Row():
            output_text = gr.Textbox(label="Output", interactive=False)
            output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)

        def update_components(choice):
            """Show only the inputs that belong to the selected voice type."""
            show_speaker = choice == "existing_speaker"
            show_cloning = choice == "voice_cloning"
            # Order matches the `outputs` list wired up below.
            return (
                gr.update(visible=show_speaker),
                gr.update(visible=show_cloning),
                gr.update(visible=show_cloning),
                gr.update(visible=show_cloning),
            )

        voice_choice.change(
            update_components,
            inputs=voice_choice,
            outputs=[speaker_name, wav_file_choice, uploaded_file, recorded_audio],
        )
        submit_button.click(
            generate_tts_audio,
            inputs=[text_input, voice_choice, speaker_name, wav_file_choice, uploaded_file, recorded_audio],
            outputs=[output_text, output_audio],
        )

    return demo


# Launch Gradio interface
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(share=True)  # Set share=True to create a public link