Update app.py
app.py
CHANGED
@@ -45,108 +45,83 @@ class VoiceSynthesizer:
         except Exception as e:
             print(f"Bark model loading error: {e}")
 
-
-
-
-
-
-
-
-            # Convert to numpy array if needed
-            if isinstance(reference_audio, tuple):
-                reference_audio = reference_audio[0]
-
-            # Ensure the audio is mono and normalized
-            if reference_audio.ndim > 1:
-                reference_audio = reference_audio.mean(axis=1)
-
-            # Resample or trim if necessary
-            if len(reference_audio) > SAMPLE_RATE * 10:  # Limit to 10 seconds
-                reference_audio = reference_audio[:SAMPLE_RATE * 10]
-
-            # Save reference audio
-            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
-            sf.write(ref_filename, reference_audio, SAMPLE_RATE)
-
-            # Store reference voice
-            self.reference_voice = reference_audio
-
-            return "Reference voice processed successfully"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-            # Load speaker embeddings
-            embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-            speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
-
-            return {
-                "model": model,
-                "processor": processor,
-                "vocoder": vocoder,
-                "speaker_embeddings": speaker_embeddings
-            }
-        except Exception as e:
-            print(f"SpeechT5 model loading error: {e}")
-            return None
-
-    def generate_speech(self, text, model_name=None, voice_preset=None):
-        """Generate speech using selected model"""
-        if not text or not text.strip():
-            return None, "Please enter some text to speak"
 
-        #
-
 
-
-
-
-        elif current_model == "speecht5":
-            return self._generate_speecht5_speech(text, voice_preset)
-        else:
-            raise ValueError(f"Unsupported model: {current_model}")
 
-
-
-
-
-
-
-    def _generate_bark_speech(self, text, voice_preset=None):
-        """Generate speech using Bark"""
-        # Default Bark voice presets
-        voice_presets = [
-            "v2/en_speaker_6",  # Female
-            "v2/en_speaker_3",  # Male
-            "v2/en_speaker_9",  # Neutral
-        ]
 
-        #
-
 
-        #
-
-
-            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
-            history_prompt = ref_filename
-        elif voice_preset:
-            # Use predefined voice preset
-            history_prompt = voice_presets[0] if "v2/en_speaker" not in voice_preset else voice_preset
 
-        #
             if history_prompt:
                 audio_array = generate_audio(
                     text,
@@ -163,37 +138,11 @@ class VoiceSynthesizer:
 
             return filepath, None
 
-
-        "
-
-
-
-            return None, "SpeechT5 model not loaded"
-
-        model = speecht5_models["model"]
-        processor = speecht5_models["processor"]
-        vocoder = speecht5_models["vocoder"]
-        speaker_embeddings = speecht5_models["speaker_embeddings"]
-
-        # Prepare inputs
-        inputs = processor(text=text, return_tensors="pt")
-
-        # Generate speech
-        speech = model.generate_speech(
-            inputs["input_ids"],
-            speaker_embeddings
-        )
-
-        # Convert to numpy array
-        audio_array = speech.numpy()
-
-        # Save generated audio
-        filename = f"speecht5_speech_{int(time.time())}.wav"
-        filepath = os.path.join(self.working_dir, filename)
-        wavfile.write(filepath, 16000, audio_array)
-
-        return filepath, None
-
 def create_interface():
     synthesizer = VoiceSynthesizer()
 
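The removed SpeechT5 path above loads the processor, the HiFi-GAN vocoder, and the CMU Arctic x-vector speaker embeddings, but the model load and parts of the generation call are not visible in this view. For reference, a minimal end-to-end SpeechT5 synthesis with the transformers API looks roughly like the sketch below; it is an illustration, not necessarily identical to what app.py did (in particular, passing vocoder= to generate_speech returns a 16 kHz waveform directly instead of a spectrogram).

import torch
import datasets
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# The x-vector speaker embedding selects the voice
embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)

inputs = processor(text="Hello from SpeechT5.", return_tensors="pt")

# With vocoder= the model returns a waveform tensor at 16 kHz
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speecht5_speech.wav", speech.numpy(), 16000)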
         except Exception as e:
             print(f"Bark model loading error: {e}")
 
+    def process_reference_audio(self, reference_audio):
+        """Process and store reference audio for voice cloning"""
+        try:
+            # Gradio can pass audio in different formats
+            if reference_audio is None:
+                return "No audio provided"
+
+            # Handle different input types
+            if isinstance(reference_audio, tuple):
+                # Gradio typically returns (sample_rate, audio_array)
+                if len(reference_audio) == 2:
+                    sample_rate, audio_data = reference_audio
+                else:
+                    audio_data = reference_audio[0]
+                    sample_rate = SAMPLE_RATE  # Default to Bark sample rate
+            elif isinstance(reference_audio, np.ndarray):
+                audio_data = reference_audio
+                sample_rate = SAMPLE_RATE
+            else:
+                return "Invalid audio format"
+
+            # Ensure audio is numpy array
+            audio_data = np.asarray(audio_data)
+
+            # Handle multi-channel audio
+            if audio_data.ndim > 1:
+                audio_data = audio_data.mean(axis=1)
+
+            # Trim or pad to standard length
+            max_duration = 10  # 10 seconds
+            max_samples = max_duration * sample_rate
+
+            if len(audio_data) > max_samples:
+                audio_data = audio_data[:max_samples]
+
+            # Resample if necessary
+            if sample_rate != SAMPLE_RATE:
+                from scipy.signal import resample
+                audio_data = resample(audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate))
+
+            # Save reference audio
+            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
+            sf.write(ref_filename, audio_data, SAMPLE_RATE)
+
+            # Store reference voice
+            self.reference_voice = ref_filename
+
+            return "Reference voice processed successfully"
+
+        except Exception as e:
+            print(f"Reference audio processing error: {e}")
+            import traceback
+            traceback.print_exc()
+            return f"Error processing reference audio: {str(e)}"
+
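The branching at the top of process_reference_audio mirrors the formats a Gradio audio component can deliver. A minimal sketch of how the method might be wired into the UI (the component and variable names here are assumptions for illustration, not taken from this Space's create_interface):

import gradio as gr

synthesizer = VoiceSynthesizer()

with gr.Blocks() as demo:
    # type="numpy" makes Gradio deliver (sample_rate, np.ndarray) tuples,
    # i.e. the first branch handled in process_reference_audio
    reference_audio = gr.Audio(type="numpy", label="Reference voice (up to ~10 s)")
    status = gr.Textbox(label="Status")
    reference_audio.change(synthesizer.process_reference_audio,
                           inputs=reference_audio, outputs=status)

One note on the resampling step: scipy.signal.resample is FFT-based; scipy.signal.resample_poly (or a dedicated audio resampler) is often preferred for speech, although for a clip capped at ten seconds either choice is workable.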
+    def _generate_bark_speech(self, text, voice_preset=None):
+        """Generate speech using Bark"""
+        # Default Bark voice presets
+        voice_presets = [
+            "v2/en_speaker_6",  # Female
+            "v2/en_speaker_3",  # Male
+            "v2/en_speaker_9",  # Neutral
+        ]
+
+        # Prepare history prompt
+        history_prompt = None
+
+        # Check if a reference voice is available
+        if self.reference_voice is not None:
+            # Use saved reference voice file
+            history_prompt = self.reference_voice
+        elif voice_preset:
+            # Use predefined voice preset
+            history_prompt = voice_presets[0] if "v2/en_speaker" not in voice_preset else voice_preset
+
+        # Generate audio with or without history prompt
+        try:
             if history_prompt:
                 audio_array = generate_audio(
                     text,

 
             return filepath, None
 
+        except Exception as e:
+            print(f"Bark speech generation error: {e}")
+            import traceback
+            traceback.print_exc()
+            return None, f"Error in Bark speech generation: {str(e)}"
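For context, the Bark call that the added code feeds into typically looks like the sketch below, using the generate_audio and SAMPLE_RATE exports of the bark package. Note that history_prompt is documented to accept a built-in preset name (or an .npz voice prompt); whether a plain .wav path, as stored by process_reference_audio, is accepted is not guaranteed, so this sketch only illustrates the preset path.

from bark import generate_audio, SAMPLE_RATE, preload_models
from scipy.io import wavfile

preload_models()  # download/cache the Bark checkpoints

# history_prompt selects the voice, e.g. one of the presets listed above
audio_array = generate_audio("Hello, this is a Bark test.",
                             history_prompt="v2/en_speaker_6")

wavfile.write("bark_speech.wav", SAMPLE_RATE, audio_array)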
 def create_interface():
     synthesizer = VoiceSynthesizer()
 
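Taken together, the added methods give a clone-then-speak flow: store a reference clip, then synthesize with it as the Bark history prompt. A hypothetical usage sketch (the synthetic clip and the direct call to the private helper are for illustration only; the Space itself drives this through the Gradio interface):

import numpy as np

synth = VoiceSynthesizer()

# Register a short mono reference clip; process_reference_audio resamples it
# to Bark's SAMPLE_RATE, trims it to 10 s, and saves reference_voice.wav
sr = 22050
clip = np.zeros(sr * 5, dtype=np.float32)  # stand-in for real recorded audio
print(synth.process_reference_audio((sr, clip)))

# With a stored reference voice the saved file is used as the history prompt;
# otherwise the given preset (e.g. "v2/en_speaker_6") is used
filepath, error = synth._generate_bark_speech("Hello there!", voice_preset="v2/en_speaker_6")
print(filepath or error)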