import gradio as gr
import numpy as np
import os
import time
import torch
from scipy.io import wavfile
import soundfile as sf
import datasets

# Bark imports
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models, generate_text_semantic

# Hugging Face Transformers
from transformers import (
    SpeechT5HifiGan,
    SpeechT5ForTextToSpeech,
    SpeechT5Processor
)

class VoiceSynthesizer:
    def __init__(self):
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Store reference voice
        self.reference_voice = None

        # Initialize models dictionary
        self.models = {
            "bark": self._initialize_bark,
            "speecht5": self._initialize_speecht5
        }

        # Default model
        self.current_model = "bark"

        # Initialize Bark models
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Bark model loading error: {e}")

    def _initialize_bark(self):
        """Bark model initialization (already done in __init__)"""
        return None
    def _initialize_speecht5(self):
        """Initialize SpeechT5 model from Hugging Face"""
        try:
            # Load SpeechT5 model, processor, and vocoder
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

            # Load speaker embeddings (x-vectors) for a default voice
            embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)

            return {
                "model": model,
                "processor": processor,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings
            }
        except Exception as e:
            print(f"SpeechT5 model loading error: {e}")
            return None
    def process_reference_audio(self, reference_audio):
        """Process and store reference audio for voice cloning"""
        try:
            # Gradio can pass audio in different formats
            if reference_audio is None:
                return "No audio provided"

            # Handle different input types
            if isinstance(reference_audio, tuple):
                # Gradio typically returns (sample_rate, audio_array)
                if len(reference_audio) == 2:
                    sample_rate, audio_data = reference_audio
                else:
                    audio_data = reference_audio[0]
                    sample_rate = SAMPLE_RATE  # Default to Bark sample rate
            elif isinstance(reference_audio, np.ndarray):
                audio_data = reference_audio
                sample_rate = SAMPLE_RATE
            else:
                return "Invalid audio format"

            # Ensure audio is a numpy array
            audio_data = np.asarray(audio_data)

            # Downmix multi-channel audio to mono
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Trim to a maximum length
            max_duration = 10  # seconds
            max_samples = max_duration * sample_rate
            if len(audio_data) > max_samples:
                audio_data = audio_data[:max_samples]

            # Resample to Bark's sample rate if necessary
            if sample_rate != SAMPLE_RATE:
                from scipy.signal import resample
                audio_data = resample(audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate))

            # Save reference audio
            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
            sf.write(ref_filename, audio_data, SAMPLE_RATE)

            # Store path to the reference voice
            self.reference_voice = ref_filename
            return "Reference voice processed successfully"
        except Exception as e:
            print(f"Reference audio processing error: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing reference audio: {str(e)}"
    def _generate_bark_speech(self, text, voice_preset=None):
        """Generate speech using Bark"""
        # Default Bark voice presets
        voice_presets = [
            "v2/en_speaker_6",  # Female
            "v2/en_speaker_3",  # Male
            "v2/en_speaker_9",  # Neutral
        ]

        # Prepare history prompt
        history_prompt = None

        # Check if a reference voice is available
        if self.reference_voice is not None:
            # Use the saved reference voice file.
            # NOTE: Bark's history_prompt normally expects a built-in preset name or an
            # .npz prompt file, so a raw .wav reference may not be accepted by generate_audio.
            history_prompt = self.reference_voice
        elif voice_preset:
            # Use a predefined voice preset
            history_prompt = voice_presets[0] if "v2/en_speaker" not in voice_preset else voice_preset

        # Generate audio with or without a history prompt
        try:
            if history_prompt:
                audio_array = generate_audio(
                    text,
                    history_prompt=history_prompt
                )
            else:
                # Fall back to default generation
                audio_array = generate_audio(text)

            # Save generated audio
            filename = f"bark_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)
            return filepath, None
        except Exception as e:
            print(f"Bark speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error in Bark speech generation: {str(e)}"
    def generate_speech(self, text, model_name=None, voice_preset=None):
        """Generate speech using selected model"""
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        # Use specified model or current model
        current_model = model_name or self.current_model

        try:
            if current_model == "bark":
                return self._generate_bark_speech(text, voice_preset)
            elif current_model == "speecht5":
                return self._generate_speecht5_speech(text, voice_preset)
            else:
                raise ValueError(f"Unsupported model: {current_model}")
        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"
    def _generate_speecht5_speech(self, text, speaker_id=None):
        """Generate speech using SpeechT5"""
        # Ensure the model is initialized, caching it so it is not reloaded on every call
        if getattr(self, "_speecht5_cache", None) is None:
            self._speecht5_cache = self.models["speecht5"]()
        speecht5_models = self._speecht5_cache
        if not speecht5_models:
            return None, "SpeechT5 model not loaded"

        model = speecht5_models["model"]
        processor = speecht5_models["processor"]
        vocoder = speecht5_models["vocoder"]
        speaker_embeddings = speecht5_models["speaker_embeddings"]

        # Prepare inputs
        inputs = processor(text=text, return_tensors="pt")

        # Generate speech; pass the vocoder so the output is a waveform, not a mel spectrogram
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder
        )

        # Convert to numpy array
        audio_array = speech.numpy()

        # Save generated audio (SpeechT5 outputs 16 kHz audio)
        filename = f"speecht5_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, 16000, audio_array)
        return filepath, None
# Rest of the code remains the same...
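
# --- Illustrative sketch (not part of the original file) ---
# The elided remainder above presumably builds the Gradio UI. The commented-out sketch
# below is a minimal, hypothetical example of how VoiceSynthesizer's public methods
# (process_reference_audio and generate_speech) could be wired into gr.Blocks; the
# component names, layout, and helper functions here are assumptions, not the Space's
# actual code.
#
# synthesizer = VoiceSynthesizer()
#
# def _clone_voice(reference_audio):
#     # Forward the uploaded/recorded audio to the synthesizer for reference processing
#     return synthesizer.process_reference_audio(reference_audio)
#
# def _speak(text, model_name, voice_preset):
#     # generate_speech returns (filepath, error); map that onto the two output components
#     filepath, error = synthesizer.generate_speech(text, model_name, voice_preset)
#     return filepath, (error or "Speech generated successfully")
#
# with gr.Blocks() as demo:
#     with gr.Row():
#         reference_audio = gr.Audio(label="Reference voice (optional)", type="numpy")
#         clone_status = gr.Textbox(label="Cloning status")
#     reference_audio.change(_clone_voice, inputs=reference_audio, outputs=clone_status)
#
#     text_input = gr.Textbox(label="Text to speak")
#     model_choice = gr.Radio(["bark", "speecht5"], value="bark", label="Model")
#     preset_choice = gr.Textbox(label="Voice preset (e.g. v2/en_speaker_6)")
#     generate_btn = gr.Button("Generate")
#     audio_output = gr.Audio(label="Generated speech")
#     status_output = gr.Textbox(label="Status")
#     generate_btn.click(
#         _speak,
#         inputs=[text_input, model_choice, preset_choice],
#         outputs=[audio_output, status_output],
#     )
#
# if __name__ == "__main__":
#     demo.launch()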