Adoetz committed on
Commit
03b0bdb
·
verified ·
1 Parent(s): e66369b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -131
app.py CHANGED
@@ -1,132 +1,227 @@
1
- from TTS.api import TTS
2
- import numpy as np
3
- import torch
4
- import os
5
- import gradio as gr
6
- from scipy.io.wavfile import write as write_wav
7
-
8
- # Check if GPU is available
9
- if torch.cuda.is_available():
10
- device = "cuda"
11
- else:
12
- device = "cpu"
13
-
14
- # Initialize the TTS object
15
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True)
16
- tts.to(device) # Use GPU if available
17
-
18
- # Function to list .wav files in the /clone/ folder
19
- def list_wav_files():
20
- clone_folder = "clone"
21
- if not os.path.exists(clone_folder):
22
- print(f"Error: Folder '{clone_folder}' not found.")
23
- return []
24
-
25
- wav_files = [f for f in os.listdir(clone_folder) if f.endswith(".wav")]
26
- if not wav_files:
27
- print(f"No .wav files found in '{clone_folder}'.")
28
- return []
29
-
30
- return wav_files
31
-
32
- # Function to generate TTS audio and save it as a .wav file
33
- def generate_tts_audio(text, voice_choice, speaker_name=None, wav_file_choice=None, uploaded_file=None, recorded_audio=None):
34
- # Determine the reference audio file
35
- if voice_choice == "existing_speaker":
36
- if not speaker_name:
37
- return "Error: Speaker name is required for existing speaker.", None
38
- reference_audio = None
39
- elif voice_choice == "voice_cloning":
40
- if recorded_audio:
41
- # Use the recorded audio for voice cloning
42
- reference_audio = recorded_audio
43
- elif uploaded_file:
44
- # Use the uploaded file for voice cloning
45
- reference_audio = uploaded_file
46
- elif wav_file_choice:
47
- # Use a file from the clone folder
48
- wav_files = list_wav_files()
49
- if not wav_files:
50
- return "Error: No .wav files found for voice cloning.", None
51
-
52
- try:
53
- wav_file_index = int(wav_file_choice.split(":")[0].strip())
54
- if wav_file_index < 0 or wav_file_index >= len(wav_files):
55
- return "Error: Invalid .wav file index.", None
56
- reference_audio = os.path.join("clone", wav_files[wav_file_index])
57
- except (ValueError, IndexError, AttributeError):
58
- return "Error: Invalid .wav file choice.", None
59
- else:
60
- return "Error: No reference audio provided for voice cloning.", None
61
- else:
62
- return "Error: Invalid voice choice.", None
63
-
64
- # Generate TTS audio
65
- if reference_audio:
66
- # Use reference voice (voice cloning)
67
- audio = tts.tts(
68
- text=text,
69
- speaker_wav=reference_audio,
70
- language="en"
71
- )
72
- else:
73
- # Use existing speaker
74
- audio = tts.tts(
75
- text=text,
76
- speaker=speaker_name,
77
- language="en"
78
- )
79
-
80
- # Convert audio to a NumPy array
81
- audio_np = np.array(audio, dtype=np.float32)
82
-
83
- # Save the audio as a .wav file
84
- output_file = "output.wav"
85
- write_wav(output_file, tts.synthesizer.output_sample_rate, audio_np)
86
-
87
- return "Audio generated successfully!", (tts.synthesizer.output_sample_rate, audio_np)
88
-
89
- # Gradio interface
90
- def create_gradio_interface():
91
- wav_files = list_wav_files()
92
- wav_file_choices = [f"{i}: {file}" for i, file in enumerate(wav_files)]
93
-
94
- with gr.Blocks() as demo:
95
- gr.Markdown("# TTS Streaming System")
96
- with gr.Row():
97
- text_input = gr.Textbox(label="Enter text to generate speech", lines=3)
98
- with gr.Row():
99
- voice_choice = gr.Radio(choices=["existing_speaker", "voice_cloning"], label="Select voice type")
100
- with gr.Row():
101
- speaker_name = gr.Textbox(label="Enter the speaker name (e.g., 'Ana Florence')", visible=False)
102
- wav_file_choice = gr.Dropdown(choices=wav_file_choices, label="Select a .wav file for cloning", visible=False)
103
- uploaded_file = gr.Audio(label="Upload your own .wav file for cloning", type="filepath", visible=False)
104
- recorded_audio = gr.Microphone(label="Record your voice for cloning", type="filepath", visible=False)
105
- with gr.Row():
106
- submit_button = gr.Button("Generate Speech")
107
- with gr.Row():
108
- output_text = gr.Textbox(label="Output", interactive=False)
109
- output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)
110
-
111
- def update_components(choice):
112
- if choice == "existing_speaker":
113
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
114
- elif choice == "voice_cloning":
115
- return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
116
- else:
117
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
118
-
119
- voice_choice.change(update_components, inputs=voice_choice, outputs=[speaker_name, wav_file_choice, uploaded_file, recorded_audio])
120
-
121
- submit_button.click(
122
- generate_tts_audio,
123
- inputs=[text_input, voice_choice, speaker_name, wav_file_choice, uploaded_file, recorded_audio],
124
- outputs=[output_text, output_audio]
125
- )
126
-
127
- return demo
128
-
129
- # Launch Gradio interface
130
- if __name__ == "__main__":
131
- demo = create_gradio_interface()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  demo.launch(share=True) # Set share=True to create a public link
 
1
+ from TTS.api import TTS
2
+ import numpy as np
3
+ import torch
4
+ import os
5
+ import gradio as gr
6
+ from scipy.io.wavfile import write as write_wav
7
+
8
+ # Check if GPU is available
9
# Pick the compute device once at import time: CUDA when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Process-wide cache of the currently loaded TTS model and the model name it
# was loaded from. Both are None until a model has been loaded.
global_tts = None
current_model_name = None
17
+
18
+ # Function to list available TTS models
19
def list_available_models():
    """Return the model names offered by the installed TTS package.

    Returns:
        A list of model name strings suitable for ``TTS(model_name=...)``.
    """
    listing = TTS().list_models()
    # Depending on the TTS release, list_models() hands back either a manager
    # object exposing its own list_models(), or the list of names directly.
    # NOTE(review): confirm against the pinned TTS version.
    if hasattr(listing, "list_models"):
        listing = listing.list_models()
    return list(listing)
23
+
24
+ # Function to check if a model is multilingual
25
def is_multilingual(model_name):
    """Guess from its name whether a model needs a ``language`` argument.

    This is a naming heuristic only: a model counts as multilingual when its
    name contains "multilingual" or "xtts" (case-insensitive).

    Args:
        model_name: Model identifier string; may be None or empty when no
            model is selected.

    Returns:
        True if the name matches the heuristic, otherwise False.
    """
    if not model_name:
        # Nothing selected: avoid AttributeError on None.lower().
        return False
    lowered = model_name.lower()
    return "multilingual" in lowered or "xtts" in lowered
27
+
28
+ # Function to fetch available speakers from the model
29
def get_available_speakers(tts):
    """Return the predefined speaker names of a loaded model, if it has any.

    Args:
        tts: A loaded TTS object exposing ``synthesizer.speaker_manager``.

    Returns:
        Whatever ``speaker_manager.speaker_names`` holds when the synthesizer
        has a usable speaker manager; otherwise None, which callers treat as
        "voice cloning only". Any error while probing the model is printed
        and also reported as None.
    """
    try:
        manager = getattr(tts.synthesizer, "speaker_manager", None)
        if manager:
            return manager.speaker_names
        print("Warning: No speaker manager found in the model. Using voice cloning only.")
        return None  # No pre-defined speakers
    except Exception as e:
        # Best-effort probe: a broken model object must not take the UI down.
        print(f"Error fetching speakers: {e}")
        return None  # Fallback to voice cloning
40
+
41
+ # Function to list .wav files in the /clone/ folder
42
def list_wav_files():
    """Return the names of the ``.wav`` files in the local ``clone`` folder.

    The names are sorted so that positions in the returned list are stable
    between calls; ``os.listdir`` alone gives no ordering guarantee, and
    callers in this file refer to entries by index.

    Returns:
        A sorted list of file names (not paths). Empty when the folder is
        missing or contains no ``.wav`` files; a message is printed in both
        cases.
    """
    clone_folder = "clone"
    if not os.path.isdir(clone_folder):
        print(f"Error: Folder '{clone_folder}' not found.")
        return []

    wav_files = sorted(f for f in os.listdir(clone_folder) if f.endswith(".wav"))
    if not wav_files:
        print(f"No .wav files found in '{clone_folder}'.")
    return wav_files
54
+
55
+ # Function to initialize or update the TTS model
56
import threading

# Serialises model (re)loading. The Gradio click handler in this file is
# registered with concurrency_limit=10, so several requests can reach the
# loader at the same time.
_TTS_LOAD_LOCK = threading.Lock()


def initialize_or_update_tts(model_name):
    """Return a TTS instance for ``model_name``, loading it only when needed.

    The loaded model is cached in the module globals ``global_tts`` and
    ``current_model_name``; a new model is loaded when nothing is cached yet
    or when a different name is requested.

    Args:
        model_name: Model identifier passed to ``TTS(model_name=...)``.

    Returns:
        The TTS object for ``model_name``, already moved to ``device``.

    Raises:
        Whatever ``TTS(...)`` or ``.to(device)`` raise. In that case the
        previously cached model and name are left untouched.
    """
    global global_tts, current_model_name
    with _TTS_LOAD_LOCK:
        if global_tts is None or model_name != current_model_name:
            print(f"Loading model: {model_name}")
            loaded = TTS(model_name=model_name, progress_bar=True)
            loaded.to(device)
            # Publish model and name together, and only after the load fully
            # succeeded, so the cache can never pair one model with another
            # model's name.
            global_tts = loaded
            current_model_name = model_name
        return global_tts
64
+
65
+ # Function to generate TTS audio
66
def _resolve_reference_audio(voice_choice, speaker_name, wav_file_choice, uploaded_file, recorded_audio):
    """Decide which reference clip, if any, a request should clone.

    Returns:
        A ``(reference_audio, error)`` pair. ``error`` is a user-facing
        message when the inputs are invalid, otherwise None.
        ``reference_audio`` is a file path for voice cloning and None when a
        predefined speaker is used.
    """
    if voice_choice == "existing_speaker":
        if not speaker_name:
            return None, "Error: Speaker name is required for existing speaker."
        return None, None
    if voice_choice != "voice_cloning":
        return None, "Error: Invalid voice choice."

    # Priority for cloning: fresh recording, then upload, then clone/ folder.
    if recorded_audio:
        return recorded_audio, None
    if uploaded_file:
        return uploaded_file, None
    if not wav_file_choice:
        return None, "Error: No reference audio provided for voice cloning."

    wav_files = list_wav_files()
    if not wav_files:
        return None, "Error: No .wav files found for voice cloning."
    try:
        # Dropdown entries look like "<index>: <file name>".
        index = int(wav_file_choice.split(":")[0].strip())
    except (ValueError, AttributeError):
        return None, "Error: Invalid .wav file choice."
    if not 0 <= index < len(wav_files):
        return None, "Error: Invalid .wav file index."
    return os.path.join("clone", wav_files[index]), None


def generate_tts_audio(text, model_name, voice_choice, speaker_name=None, wav_file_choice=None, uploaded_file=None, recorded_audio=None):
    """Synthesise ``text`` with the chosen model and voice.

    Inputs are validated before the model is loaded, so a request that is
    going to be rejected never pays for a model load.

    Args:
        text: Text to speak.
        model_name: TTS model identifier.
        voice_choice: "existing_speaker" or "voice_cloning".
        speaker_name: Predefined speaker, required for "existing_speaker".
        wav_file_choice: Dropdown entry "<index>: <file>" from the clone folder.
        uploaded_file: Path of an uploaded reference clip.
        recorded_audio: Path of a microphone recording.

    Returns:
        ``(message, audio)`` where ``audio`` is ``(sample_rate, float32
        ndarray)`` on success and None on any error. Errors are reported
        through ``message`` rather than raised.
    """
    try:
        if not text or not text.strip():
            return "Error: Text is required.", None

        reference_audio, error = _resolve_reference_audio(
            voice_choice, speaker_name, wav_file_choice, uploaded_file, recorded_audio
        )
        if error:
            return error, None

        tts = initialize_or_update_tts(model_name)

        # Build the call once instead of four near-identical branches.
        synth_kwargs = {"text": text}
        if reference_audio:
            synth_kwargs["speaker_wav"] = reference_audio
        else:
            synth_kwargs["speaker"] = speaker_name
        if is_multilingual(model_name):
            synth_kwargs["language"] = "en"
        audio = tts.tts(**synth_kwargs)

        audio_np = np.array(audio, dtype=np.float32)
        sample_rate = tts.synthesizer.output_sample_rate

        # NOTE: fixed path, so concurrent requests overwrite each other's
        # file. The array returned to the caller is not affected.
        output_file = "output.wav"
        write_wav(output_file, sample_rate, audio_np)

        return "Audio generated successfully!", (sample_rate, audio_np)
    except Exception as e:
        # UI boundary: surface the failure as a message instead of raising.
        return f"Error generating audio: {e}", None
140
+
141
+ # Gradio interface
142
def create_gradio_interface():
    """Build the Gradio Blocks UI for model selection and speech generation.

    The layout has a text box, a model dropdown, a voice-type radio, the
    voice inputs (speaker dropdown, clone-folder dropdown, upload,
    microphone), a submit button and the outputs. The voice inputs are
    refreshed whenever the voice type or the model changes, and once when
    the page loads so the initially visible speaker dropdown is populated.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    available_models = list_available_models()
    wav_files = list_wav_files()
    wav_file_choices = [f"{i}: {file}" for i, file in enumerate(wav_files)]

    with gr.Blocks() as demo:
        gr.Markdown("# TTS Streaming System")
        with gr.Row():
            text_input = gr.Textbox(label="Enter text to generate speech", lines=3)
        with gr.Row():
            model_name = gr.Dropdown(
                choices=available_models,
                label="Select TTS Model",
                value=available_models[0] if available_models else None,
            )
        with gr.Row():
            voice_choice = gr.Radio(
                choices=["existing_speaker", "voice_cloning"],
                label="Select voice type",
                value="existing_speaker",
            )
        with gr.Row():
            speaker_name = gr.Dropdown(
                label="Select a speaker",
                visible=True,
            )
            wav_file_choice = gr.Dropdown(
                choices=wav_file_choices,
                label="Select a .wav file for cloning",
                visible=False,
            )
            uploaded_file = gr.Audio(
                label="Upload your own .wav file for cloning",
                type="filepath",
                visible=False,
            )
            recorded_audio = gr.Microphone(
                label="Record your voice for cloning",
                type="filepath",
                visible=False,
            )
        with gr.Row():
            submit_button = gr.Button("Generate Speech")
        with gr.Row():
            output_text = gr.Textbox(label="Output", interactive=False)
            output_audio = gr.Audio(label="Generated Audio", type="numpy", visible=True)

        def update_components(choice, selected_model):
            """Return visibility/choice updates for the four voice inputs."""
            if choice == "existing_speaker":
                # Only this branch needs the model: its speaker list fills
                # the dropdown. Other branches skip the (slow) model load.
                tts = initialize_or_update_tts(selected_model)
                speakers = get_available_speakers(tts) or []
                return (
                    gr.update(visible=True, choices=speakers),  # speaker_name
                    gr.update(visible=False),  # wav_file_choice
                    gr.update(visible=False),  # uploaded_file
                    gr.update(visible=False),  # recorded_audio
                )
            if choice == "voice_cloning":
                return (
                    gr.update(visible=False),  # speaker_name
                    gr.update(visible=bool(wav_files)),  # wav_file_choice
                    gr.update(visible=True),  # uploaded_file
                    gr.update(visible=True),  # recorded_audio
                )
            return (
                gr.update(visible=False),  # speaker_name
                gr.update(visible=False),  # wav_file_choice
                gr.update(visible=False),  # uploaded_file
                gr.update(visible=False),  # recorded_audio
            )

        refresh_inputs = [voice_choice, model_name]
        refresh_outputs = [speaker_name, wav_file_choice, uploaded_file, recorded_audio]

        voice_choice.change(update_components, inputs=refresh_inputs, outputs=refresh_outputs)
        model_name.change(update_components, inputs=refresh_inputs, outputs=refresh_outputs)
        # Populate the speaker dropdown on first page load; without this it
        # is visible but empty until the user changes one of the controls.
        demo.load(update_components, inputs=refresh_inputs, outputs=refresh_outputs)

        # Enable concurrency for the submit button.
        # NOTE(review): all requests share one cached model; confirm the
        # model tolerates concurrent inference before raising this limit.
        submit_button.click(
            generate_tts_audio,
            inputs=[text_input, model_name, voice_choice, speaker_name, wav_file_choice, uploaded_file, recorded_audio],
            outputs=[output_text, output_audio],
            concurrency_limit=10,  # Adjust this value based on your system's capabilities
        )

    return demo
223
+
224
+ # Launch Gradio interface
225
# Script entry point: build the interface and serve it.
if __name__ == "__main__":
    demo = create_gradio_interface()
    # share=True asks Gradio to create a public link for the app.
    demo.launch(share=True)