# Voicecloner / app.py
# Rogerjs's picture
# Update app.py
# ee87e83 verified
import gradio as gr
import numpy as np
import os
import time
import torch
from scipy.io import wavfile
import soundfile as sf
import datasets
# Bark imports
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models, generate_text_semantic
# Hugging Face Transformers
from transformers import (
SpeechT5HifiGan,
SpeechT5ForTextToSpeech,
SpeechT5Processor
)
class VoiceSynthesizer:
    """Text-to-speech front end that dispatches to Bark or SpeechT5.

    Generated audio and the processed reference clip are written as WAV
    files into a ``working_files`` directory created next to this module.
    Public methods report failures through their return values (a status
    string, or a ``(filepath, error_message)`` pair) instead of raising, so
    they can be wired directly to UI callbacks.
    """

    # Bark presets this class accepts; the first entry is the fallback used
    # when an unknown preset is requested.
    BARK_VOICE_PRESETS = (
        "v2/en_speaker_6",  # Female
        "v2/en_speaker_3",  # Male
        "v2/en_speaker_9",  # Neutral
    )
    # Longest reference clip that is kept, in seconds.
    MAX_REFERENCE_SECONDS = 10
    # Sample rate written into the WAV header of SpeechT5 output.
    SPEECHT5_SAMPLE_RATE = 16000
    # Loaded SpeechT5 components; populated lazily on first SpeechT5 request.
    _speecht5_cache = None

    def __init__(self):
        """Create the working directory and try to preload the Bark models."""
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)
        # Path of the processed reference clip, or None until one is stored
        self.reference_voice = None
        # Model name -> loader callable
        self.models = {
            "bark": self._initialize_bark,
            "speecht5": self._initialize_speecht5
        }
        # Default model
        self.current_model = "bark"
        # Best effort: a failed preload is only logged so the object (and the
        # SpeechT5 path) stays usable.
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Bark model loading error: {e}")

    def _initialize_bark(self):
        """Bark model initialization (already done in __init__)"""
        return None

    def _initialize_speecht5(self):
        """Load the SpeechT5 model, processor, vocoder and a speaker embedding.

        Returns:
            A dict with keys ``model``, ``processor``, ``vocoder`` and
            ``speaker_embeddings``, or None if anything failed to load.
        """
        try:
            # Load SpeechT5 model and processor
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            # Load speaker embeddings (first entry of the validation split)
            embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
            return {
                "model": model,
                "processor": processor,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings
            }
        except Exception as e:
            print(f"SpeechT5 model loading error: {e}")
            return None

    def _get_speecht5_models(self):
        """Return the SpeechT5 components, loading them at most once.

        A failed load (falsy loader result) is not cached, so the next
        request retries instead of being stuck with the failure.
        """
        if self._speecht5_cache is None:
            self._speecht5_cache = self.models["speecht5"]() or None
        return self._speecht5_cache

    @staticmethod
    def _to_mono_float(audio_data):
        """Convert raw samples to a float32 array, averaging over axis 1.

        Integer PCM is rescaled into [-1.0, 1.0]; floating-point input is
        only cast. The scaling matters because the result is later written
        with ``sf.write`` as floating-point data.
        """
        samples = np.asarray(audio_data)
        if np.issubdtype(samples.dtype, np.signedinteger):
            # e.g. int16 -> divide by 32768 so the most negative value is -1.0
            scale = -float(np.iinfo(samples.dtype).min)
            samples = samples.astype(np.float32) / scale
        elif np.issubdtype(samples.dtype, np.unsignedinteger):
            # Unsigned PCM is centred on the midpoint of its range
            midpoint = (float(np.iinfo(samples.dtype).max) + 1.0) / 2.0
            samples = (samples.astype(np.float32) - midpoint) / midpoint
        else:
            samples = samples.astype(np.float32)
        # Handle multi-channel audio
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        return samples

    @classmethod
    def _resolve_bark_preset(cls, voice_preset):
        """Map a UI preset label to a Bark preset name.

        Returns None for a falsy preset. For strings, the text before the
        first space is used if it is a known preset; anything else (unknown
        names, non-string values) resolves to the first known preset.
        """
        if not voice_preset:
            return None
        if isinstance(voice_preset, str):
            # Remove any additional text in parentheses
            preset_value = voice_preset.split(' ')[0]
            if preset_value in cls.BARK_VOICE_PRESETS:
                return preset_value
        return cls.BARK_VOICE_PRESETS[0]

    def process_reference_audio(self, reference_audio):
        """Process and store reference audio for voice cloning.

        Args:
            reference_audio: None, a ``(sample_rate, samples)`` tuple, a
                tuple of another length whose first item is the samples, or
                a bare numpy array. Inputs without an explicit rate are
                taken to be at Bark's ``SAMPLE_RATE``.

        Returns:
            A human-readable status string. On success the clip has been
            written to ``reference_voice.wav`` in the working directory and
            its path stored in ``self.reference_voice``.
        """
        try:
            # Gradio can pass audio in different formats
            if reference_audio is None:
                return "No audio provided"
            # Handle different input types
            if isinstance(reference_audio, tuple):
                # Gradio typically returns (sample_rate, audio_array)
                if len(reference_audio) == 2:
                    sample_rate, audio_data = reference_audio
                else:
                    audio_data = reference_audio[0]
                    sample_rate = SAMPLE_RATE  # Default to Bark sample rate
            elif isinstance(reference_audio, np.ndarray):
                audio_data = reference_audio
                sample_rate = SAMPLE_RATE
            else:
                return "Invalid audio format"
            # Normalise to mono float samples in [-1, 1] before writing
            audio_data = self._to_mono_float(audio_data)
            if audio_data.size == 0:
                return "No audio provided"
            # Trim to the maximum reference length
            max_samples = int(self.MAX_REFERENCE_SECONDS * sample_rate)
            audio_data = audio_data[:max_samples]
            # Resample if necessary
            if sample_rate != SAMPLE_RATE:
                from scipy.signal import resample
                audio_data = resample(audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate))
            # Save reference audio
            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
            sf.write(ref_filename, audio_data, SAMPLE_RATE)
            # Store reference voice
            self.reference_voice = ref_filename
            return "Reference voice processed successfully"
        except Exception as e:
            print(f"Reference audio processing error: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing reference audio: {str(e)}"

    def _generate_bark_speech(self, text, voice_preset=None):
        """Generate speech using Bark.

        History prompts are tried in order: the stored reference voice (if
        any), then the requested preset, then no prompt at all.

        Returns:
            ``(filepath, None)`` on success or ``(None, error_message)``.
        """
        candidate_prompts = []
        if self.reference_voice is not None:
            # NOTE(review): this is a .wav path; Bark's history_prompt is
            # documented for preset names / .npz prompts, so this attempt is
            # expected to fail and fall through - confirm against Bark docs.
            candidate_prompts.append(self.reference_voice)
        preset = self._resolve_bark_preset(voice_preset)
        if preset is not None:
            # Tried after the reference voice so the user's preset choice is
            # still honoured when the reference prompt is rejected.
            candidate_prompts.append(preset)
        try:
            audio_array = None
            for history_prompt in candidate_prompts:
                try:
                    audio_array = generate_audio(
                        text,
                        history_prompt=history_prompt
                    )
                    break
                except Exception as preset_error:
                    print(f"Error with specific history prompt: {preset_error}")
            if audio_array is None:
                # Fallback to default generation
                audio_array = generate_audio(text)
            # Save generated audio
            filename = f"bark_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)
            return filepath, None
        except Exception as e:
            print(f"Bark speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error in Bark speech generation: {str(e)}"

    def generate_speech(self, text, model_name=None, voice_preset=None):
        """Generate speech using the selected model.

        Args:
            text: Text to speak; empty or whitespace-only text is rejected.
            model_name: ``"bark"`` or ``"speecht5"``; falls back to
                ``self.current_model`` when falsy.
            voice_preset: Preset forwarded to the model-specific generator.

        Returns:
            ``(filepath, None)`` on success or ``(None, error_message)``.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"
        # Use specified model or current model
        current_model = model_name or self.current_model
        try:
            if current_model == "bark":
                return self._generate_bark_speech(text, voice_preset)
            elif current_model == "speecht5":
                return self._generate_speecht5_speech(text, voice_preset)
            else:
                raise ValueError(f"Unsupported model: {current_model}")
        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

    def _generate_speecht5_speech(self, text, speaker_id=None):
        """Generate speech using SpeechT5.

        ``speaker_id`` is accepted for signature parity with the Bark
        generator but is not used; the single loaded speaker embedding is
        always applied. Exceptions propagate to the caller.

        Returns:
            ``(filepath, None)`` on success, or
            ``(None, "SpeechT5 model not loaded")`` if loading failed.
        """
        # Ensure model is initialized (loaded once, then reused)
        speecht5_models = self._get_speecht5_models()
        if not speecht5_models:
            return None, "SpeechT5 model not loaded"
        model = speecht5_models["model"]
        processor = speecht5_models["processor"]
        vocoder = speecht5_models["vocoder"]
        speaker_embeddings = speecht5_models["speaker_embeddings"]
        # Prepare inputs
        inputs = processor(text=text, return_tensors="pt")
        # Pass the vocoder so the model output is converted to a waveform;
        # per the Transformers SpeechT5 docs, generate_speech returns a
        # spectrogram when no vocoder is supplied.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder
        )
        # Convert to numpy array
        audio_array = speech.detach().cpu().numpy()
        # Save generated audio
        filename = f"speecht5_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, self.SPEECHT5_SAMPLE_RATE, audio_array)
        return filepath, None
def create_interface():
    """Build the Gradio Blocks UI around a fresh ``VoiceSynthesizer``.

    The layout has two columns: one to record or upload a reference voice
    and process it, and one to enter text, choose a TTS model and preset,
    and generate speech. The returned Blocks object is not launched here.

    Returns:
        The configured ``gr.Blocks`` interface.
    """
    synthesizer = VoiceSynthesizer()
    # Dropdown label -> model name understood by VoiceSynthesizer
    model_map = {
        "bark (Suno AI)": "bark",
        "speecht5 (Microsoft)": "speecht5"
    }
    with gr.Blocks() as interface:
        gr.Markdown("# 🎙️ Advanced Voice Synthesis")
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 1. Capture Reference Voice")
                reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
                process_ref_btn = gr.Button("Process Reference Voice")
                process_ref_output = gr.Textbox(label="Reference Voice Processing")
            with gr.Column():
                gr.Markdown("## 2. Generate Speech")
                text_input = gr.Textbox(label="Enter Text to Speak")
                # Model Selection
                model_dropdown = gr.Dropdown(
                    choices=[
                        "bark (Suno AI)",
                        "speecht5 (Microsoft)"
                    ],
                    label="Select TTS Model",
                    value="bark (Suno AI)"
                )
                # Voice Preset Dropdowns (only the one matching the selected
                # model is visible at a time)
                with gr.Row():
                    bark_preset = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_6 (Female Voice)",
                            "v2/en_speaker_3 (Male Voice)",
                            "v2/en_speaker_9 (Neutral Voice)"
                        ],
                        label="Bark Voice Preset",
                        value="v2/en_speaker_6 (Female Voice)",
                        visible=True
                    )
                    speecht5_preset = gr.Dropdown(
                        choices=[
                            "Default Speaker"
                        ],
                        label="SpeechT5 Speaker",
                        visible=False
                    )
                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        # Process reference audio
        process_ref_btn.click(
            fn=synthesizer.process_reference_audio,
            inputs=reference_audio,
            outputs=process_ref_output
        )

        # Dynamic model and preset visibility
        def update_model_visibility(model):
            """Show the preset dropdown that belongs to the chosen model."""
            # `model or ""` keeps a cleared dropdown from raising on .lower()
            is_bark = "bark" in (model or "").lower()
            return {
                bark_preset: gr.update(visible=is_bark),
                speecht5_preset: gr.update(visible=not is_bark)
            }

        model_dropdown.change(
            fn=update_model_visibility,
            inputs=model_dropdown,
            outputs=[bark_preset, speecht5_preset]
        )

        # Speech generation logic
        def generate_speech_wrapper(text, model, bark_preset, speecht5_preset):
            """Translate UI values into a ``generate_speech`` call.

            Returns the ``(filepath, error_message)`` pair that feeds the
            audio and error outputs.
            """
            model_name = model_map.get(model)
            if model_name is None:
                # Report through the error box instead of raising KeyError
                return None, f"Unsupported model: {model}"
            # Select appropriate preset
            preset = bark_preset if model_name == "bark" else speecht5_preset
            # Extract preset value if it's a string with additional info
            if isinstance(preset, str):
                preset = preset.split(' ')[0]
            return synthesizer.generate_speech(
                text,
                model_name=model_name,
                voice_preset=preset
            )

        generate_btn.click(
            fn=generate_speech_wrapper,
            inputs=[text_input, model_dropdown, bark_preset, speecht5_preset],
            outputs=[audio_output, error_output]
        )
    return interface
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it with the options below
    # (all interfaces, port 7860, no public share link, debug mode and error
    # display enabled).
    launch_options = {
        "share": False,
        "debug": True,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    interface = create_interface()
    interface.launch(**launch_options)