File size: 5,471 Bytes
17ed7d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from TTS.api import TTS
import numpy as np
import torch
import os
import gradio as gr
from scipy.io.wavfile import write as write_wav

# Pick the compute device: CUDA when torch reports it as available, else CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the TTS model once at import time and move it to the chosen device
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
)
tts.to(device)

# Function to list .wav files in the /clone/ folder
def list_wav_files():
    """Return the sorted names of the ``.wav`` files in the ``clone`` folder.

    The folder is resolved relative to the current working directory and the
    returned names do not include the folder prefix.

    The result is sorted because ``os.listdir`` gives no ordering guarantee,
    while callers refer to entries by their position in this list; a stable
    order keeps a position meaning the same file from one call to the next.

    Returns:
        A list of file names ending in ``.wav``. The list is empty when the
        folder is missing, is not a directory, or contains no such files; a
        diagnostic message is printed in each of those cases.
    """
    clone_folder = "clone"
    # isdir (rather than exists) so a regular file named "clone" is reported
    # as a missing folder instead of making os.listdir raise.
    if not os.path.isdir(clone_folder):
        print(f"Error: Folder '{clone_folder}' not found.")
        return []

    wav_files = sorted(f for f in os.listdir(clone_folder) if f.endswith(".wav"))
    if not wav_files:
        print(f"No .wav files found in '{clone_folder}'.")

    return wav_files

# Function to generate TTS audio and save it as a .wav file
def generate_tts_audio(text, voice_choice, speaker_name=None, wav_file_choice=None, uploaded_file=None, recorded_audio=None, language="en"):
    """Synthesize speech for ``text`` and return a status message plus audio.

    Args:
        text: The text to speak. Must contain non-whitespace characters.
        voice_choice: ``"existing_speaker"`` to use a named speaker, or
            ``"voice_cloning"`` to imitate a reference recording.
        speaker_name: Speaker to use when ``voice_choice`` is
            ``"existing_speaker"``; required in that mode.
        wav_file_choice: Clone-folder selection in the form
            ``"<index>: <file name>"`` (a bare ``"<index>"`` is also accepted).
        uploaded_file: Path to an uploaded reference recording.
        recorded_audio: Path to a microphone recording used as reference.
        language: Language code passed to the model; defaults to ``"en"``.

    In cloning mode the reference is chosen in this order: the recording,
    then the uploaded file, then the clone-folder selection.

    Returns:
        ``(message, (sample_rate, samples))`` on success, where ``samples`` is
        a float32 NumPy array, or ``(error_message, None)`` on failure.

    Side effects:
        On success the audio is also written to ``output.wav`` in the current
        working directory.
    """
    # Determine the reference audio file
    if voice_choice == "existing_speaker":
        if not speaker_name:
            return "Error: Speaker name is required for existing speaker.", None
        reference_audio = None
    elif voice_choice == "voice_cloning":
        if recorded_audio:
            # Use the recorded audio for voice cloning
            reference_audio = recorded_audio
        elif uploaded_file:
            # Use the uploaded file for voice cloning
            reference_audio = uploaded_file
        elif wav_file_choice:
            # Use a file from the clone folder
            wav_files = list_wav_files()
            if not wav_files:
                return "Error: No .wav files found for voice cloning.", None

            try:
                index_text, _, chosen_name = wav_file_choice.partition(":")
                chosen_name = chosen_name.strip()
                if chosen_name:
                    # Resolve by name: the folder is listed again here, so the
                    # index shown to the user may no longer point at the same
                    # file. Refuse rather than silently pick a different one.
                    if chosen_name not in wav_files:
                        return "Error: Invalid .wav file choice.", None
                else:
                    # Bare index with no file name: fall back to the position.
                    wav_file_index = int(index_text.strip())
                    if wav_file_index < 0 or wav_file_index >= len(wav_files):
                        return "Error: Invalid .wav file index.", None
                    chosen_name = wav_files[wav_file_index]
            except (ValueError, AttributeError):
                return "Error: Invalid .wav file choice.", None
            reference_audio = os.path.join("clone", chosen_name)
        else:
            return "Error: No reference audio provided for voice cloning.", None
    else:
        return "Error: Invalid voice choice.", None

    # Checked after the voice validation so the existing error messages keep
    # their precedence; empty text would otherwise be handed to the model.
    if not text or not text.strip():
        return "Error: Text is required to generate speech.", None

    # Generate TTS audio
    try:
        if reference_audio:
            # Use reference voice (voice cloning)
            audio = tts.tts(
                text=text,
                speaker_wav=reference_audio,
                language=language,
            )
        else:
            # Use existing speaker
            audio = tts.tts(
                text=text,
                speaker=speaker_name,
                language=language,
            )
    except Exception as exc:
        # This function is the UI callback boundary: report the failure in
        # the same (message, None) form as the other errors instead of
        # letting a raw traceback escape.
        print(f"Error: Speech generation failed: {exc}")
        return f"Error: Speech generation failed: {exc}", None

    # Convert audio to a NumPy array
    audio_np = np.array(audio, dtype=np.float32)
    sample_rate = tts.synthesizer.output_sample_rate

    # Save the audio as a .wav file
    output_file = "output.wav"
    write_wav(output_file, sample_rate, audio_np)

    return "Audio generated successfully!", (sample_rate, audio_np)

# Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI for speech generation and return it.

    The interface is only constructed here; launching it is left to the
    caller. The clone-folder dropdown is filled once, at construction time,
    with entries labelled ``"<index>: <file name>"``.
    """
    wav_file_choices = []
    for position, file_name in enumerate(list_wav_files()):
        wav_file_choices.append(f"{position}: {file_name}")

    with gr.Blocks() as demo:
        gr.Markdown("# TTS Streaming System")

        with gr.Row():
            text_input = gr.Textbox(label="Enter text to generate speech", lines=3)

        with gr.Row():
            voice_choice = gr.Radio(
                choices=["existing_speaker", "voice_cloning"],
                label="Select voice type",
            )

        # Voice-specific inputs start hidden; the radio button reveals them.
        with gr.Row():
            speaker_name = gr.Textbox(
                label="Enter the speaker name (e.g., 'Ana Florence')",
                visible=False,
            )
            wav_file_choice = gr.Dropdown(
                choices=wav_file_choices,
                label="Select a .wav file for cloning",
                visible=False,
            )
            uploaded_file = gr.Audio(
                label="Upload your own .wav file for cloning",
                type="filepath",
                visible=False,
            )
            recorded_audio = gr.Microphone(
                label="Record your voice for cloning",
                type="filepath",
                visible=False,
            )

        with gr.Row():
            submit_button = gr.Button("Generate Speech")

        with gr.Row():
            output_text = gr.Textbox(label="Output", interactive=False)
            output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)

        def update_components(choice):
            # One visibility update per component, in the order: speaker name,
            # clone-folder dropdown, upload widget, microphone widget.
            show_speaker = choice == "existing_speaker"
            show_cloning = choice == "voice_cloning"
            return (
                gr.update(visible=show_speaker),
                gr.update(visible=show_cloning),
                gr.update(visible=show_cloning),
                gr.update(visible=show_cloning),
            )

        voice_inputs = [speaker_name, wav_file_choice, uploaded_file, recorded_audio]
        voice_choice.change(update_components, inputs=voice_choice, outputs=voice_inputs)

        submit_button.click(
            generate_tts_audio,
            inputs=[text_input, voice_choice] + voice_inputs,
            outputs=[output_text, output_audio],
        )

    return demo

# Script entry point: build the interface and serve it
if __name__ == "__main__":
    demo = create_gradio_interface()
    # share=True asks Gradio to create a public link
    launch_options = {"share": True}
    demo.launch(**launch_options)