Rogerjs committed on
Commit
909dbdf
·
verified ·
1 Parent(s): d864fc1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -126
app.py CHANGED
@@ -45,108 +45,83 @@ class VoiceSynthesizer:
45
  except Exception as e:
46
  print(f"Bark model loading error: {e}")
47
 
48
- def process_reference_audio(self, reference_audio):
49
- """Process and store reference audio for voice cloning"""
50
- try:
51
- # Ensure audio is in the right format
52
- if reference_audio is None:
53
- return "No audio provided"
54
-
55
- # Convert to numpy array if needed
56
- if isinstance(reference_audio, tuple):
57
- reference_audio = reference_audio[0]
58
-
59
- # Ensure the audio is mono and normalized
60
- if reference_audio.ndim > 1:
61
- reference_audio = reference_audio.mean(axis=1)
62
-
63
- # Resample or trim if necessary
64
- if len(reference_audio) > SAMPLE_RATE * 10: # Limit to 10 seconds
65
- reference_audio = reference_audio[:SAMPLE_RATE * 10]
66
-
67
- # Save reference audio
68
- ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
69
- sf.write(ref_filename, reference_audio, SAMPLE_RATE)
70
-
71
- # Store reference voice
72
- self.reference_voice = reference_audio
73
-
74
- return "Reference voice processed successfully"
75
 
76
- except Exception as e:
77
- print(f"Reference audio processing error: {e}")
78
- return f"Error processing reference audio: {str(e)}"
79
-
80
- def _initialize_bark(self):
81
- """Bark model initialization (already done in __init__)"""
82
- return None
83
-
84
- def _initialize_speecht5(self):
85
- """Initialize SpeechT5 model from Hugging Face"""
86
- try:
87
- # Load SpeechT5 model and processor
88
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
89
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
90
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
91
-
92
- # Load speaker embeddings
93
- embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
94
- speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
95
-
96
- return {
97
- "model": model,
98
- "processor": processor,
99
- "vocoder": vocoder,
100
- "speaker_embeddings": speaker_embeddings
101
- }
102
- except Exception as e:
103
- print(f"SpeechT5 model loading error: {e}")
104
- return None
105
-
106
- def generate_speech(self, text, model_name=None, voice_preset=None):
107
- """Generate speech using selected model"""
108
- if not text or not text.strip():
109
- return None, "Please enter some text to speak"
110
 
111
- # Use specified model or current model
112
- current_model = model_name or self.current_model
113
 
114
- try:
115
- if current_model == "bark":
116
- return self._generate_bark_speech(text, voice_preset)
117
- elif current_model == "speecht5":
118
- return self._generate_speecht5_speech(text, voice_preset)
119
- else:
120
- raise ValueError(f"Unsupported model: {current_model}")
121
 
122
- except Exception as e:
123
- print(f"Speech generation error: {e}")
124
- import traceback
125
- traceback.print_exc()
126
- return None, f"Error generating speech: {str(e)}"
127
-
128
- def _generate_bark_speech(self, text, voice_preset=None):
129
- """Generate speech using Bark"""
130
- # Default Bark voice presets
131
- voice_presets = [
132
- "v2/en_speaker_6", # Female
133
- "v2/en_speaker_3", # Male
134
- "v2/en_speaker_9", # Neutral
135
- ]
136
 
137
- # Prepare history prompt
138
- history_prompt = None
 
 
139
 
140
- # Check if a reference voice is available
141
- if self.reference_voice is not None:
142
- # Save reference voice for Bark
143
- ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
144
- history_prompt = ref_filename
145
- elif voice_preset:
146
- # Use predefined voice preset
147
- history_prompt = voice_presets[0] if "v2/en_speaker" not in voice_preset else voice_preset
148
 
149
- # Generate audio with or without history prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  if history_prompt:
151
  audio_array = generate_audio(
152
  text,
@@ -163,37 +138,11 @@ class VoiceSynthesizer:
163
 
164
  return filepath, None
165
 
166
- def _generate_speecht5_speech(self, text, speaker_id=None):
167
- """Generate speech using SpeechT5"""
168
- # Ensure model is initialized
169
- speecht5_models = self.models["speecht5"]()
170
- if not speecht5_models:
171
- return None, "SpeechT5 model not loaded"
172
-
173
- model = speecht5_models["model"]
174
- processor = speecht5_models["processor"]
175
- vocoder = speecht5_models["vocoder"]
176
- speaker_embeddings = speecht5_models["speaker_embeddings"]
177
-
178
- # Prepare inputs
179
- inputs = processor(text=text, return_tensors="pt")
180
-
181
- # Generate speech
182
- speech = model.generate_speech(
183
- inputs["input_ids"],
184
- speaker_embeddings
185
- )
186
-
187
- # Convert to numpy array
188
- audio_array = speech.numpy()
189
-
190
- # Save generated audio
191
- filename = f"speecht5_speech_{int(time.time())}.wav"
192
- filepath = os.path.join(self.working_dir, filename)
193
- wavfile.write(filepath, 16000, audio_array)
194
-
195
- return filepath, None
196
-
197
  def create_interface():
198
  synthesizer = VoiceSynthesizer()
199
 
 
45
  except Exception as e:
46
  print(f"Bark model loading error: {e}")
47
 
48
+ def process_reference_audio(self, reference_audio):
49
+ """Process and store reference audio for voice cloning"""
50
+ try:
51
+ # Gradio can pass audio in different formats
52
+ if reference_audio is None:
53
+ return "No audio provided"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ # Handle different input types
56
+ if isinstance(reference_audio, tuple):
57
+ # Gradio typically returns (sample_rate, audio_array)
58
+ if len(reference_audio) == 2:
59
+ sample_rate, audio_data = reference_audio
60
+ else:
61
+ audio_data = reference_audio[0]
62
+ sample_rate = SAMPLE_RATE # Default to Bark sample rate
63
+ elif isinstance(reference_audio, np.ndarray):
64
+ audio_data = reference_audio
65
+ sample_rate = SAMPLE_RATE
66
+ else:
67
+ return "Invalid audio format"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ # Ensure audio is numpy array
70
+ audio_data = np.asarray(audio_data)
71
 
72
+ # Handle multi-channel audio
73
+ if audio_data.ndim > 1:
74
+ audio_data = audio_data.mean(axis=1)
 
 
 
 
75
 
76
+ # Trim or pad to standard length
77
+ max_duration = 10 # 10 seconds
78
+ max_samples = max_duration * sample_rate
79
+
80
+ if len(audio_data) > max_samples:
81
+ audio_data = audio_data[:max_samples]
 
 
 
 
 
 
 
 
82
 
83
+ # Resample if necessary
84
+ if sample_rate != SAMPLE_RATE:
85
+ from scipy.signal import resample
86
+ audio_data = resample(audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate))
87
 
88
+ # Save reference audio
89
+ ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
90
+ sf.write(ref_filename, audio_data, SAMPLE_RATE)
 
 
 
 
 
91
 
92
+ # Store reference voice
93
+ self.reference_voice = ref_filename
94
+
95
+ return "Reference voice processed successfully"
96
+
97
+ except Exception as e:
98
+ print(f"Reference audio processing error: {e}")
99
+ import traceback
100
+ traceback.print_exc()
101
+ return f"Error processing reference audio: {str(e)}"
102
+
103
+ def _generate_bark_speech(self, text, voice_preset=None):
104
+ """Generate speech using Bark"""
105
+ # Default Bark voice presets
106
+ voice_presets = [
107
+ "v2/en_speaker_6", # Female
108
+ "v2/en_speaker_3", # Male
109
+ "v2/en_speaker_9", # Neutral
110
+ ]
111
+
112
+ # Prepare history prompt
113
+ history_prompt = None
114
+
115
+ # Check if a reference voice is available
116
+ if self.reference_voice is not None:
117
+ # Use saved reference voice file
118
+ history_prompt = self.reference_voice
119
+ elif voice_preset:
120
+ # Use predefined voice preset
121
+ history_prompt = voice_presets[0] if "v2/en_speaker" not in voice_preset else voice_preset
122
+
123
+ # Generate audio with or without history prompt
124
+ try:
125
  if history_prompt:
126
  audio_array = generate_audio(
127
  text,
 
138
 
139
  return filepath, None
140
 
141
+ except Exception as e:
142
+ print(f"Bark speech generation error: {e}")
143
+ import traceback
144
+ traceback.print_exc()
145
+ return None, f"Error in Bark speech generation: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  def create_interface():
147
  synthesizer = VoiceSynthesizer()
148