In [1]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, XttsAudioConfig

# Download locations of the auxiliary XTTS-v2 files (tokenizer vocabulary,
# mel-spectrogram normalisation statistics and DVAE checkpoint).
# NOTE(review): nothing in the visible code references these constants or
# downloads anything; the script expects local copies of the three files
# under "model_files/". Kept for reference on where to fetch them.
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
MEL_NORM_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"

def create_model_config():
    """Build the ``XttsConfig`` used for inference.

    The audio and GPT settings are hard-coded here and are meant to mirror
    the values used when the model was trained. The three auxiliary files
    (mel statistics, DVAE checkpoint, tokenizer vocabulary) are looked up
    under ``model_files/``; edit those paths if your files live elsewhere.

    Returns:
        An ``XttsConfig`` wrapping the ``GPTArgs`` and ``XttsAudioConfig``
        built from the settings below.
    """
    audio_settings = {
        "sample_rate": 22050,
        "dvae_sample_rate": 22050,
        "output_sample_rate": 24000,
    }
    audio_cfg = XttsAudioConfig(**audio_settings)

    gpt_settings = {
        # Conditioning / input length limits, expressed in samples.
        "max_conditioning_length": 132300,  # 6 s at 22050 Hz
        "min_conditioning_length": 66150,  # 3 s at 22050 Hz
        "debug_loading_failures": False,
        "max_wav_length": 255995,  # roughly 11.6 s at 22050 Hz
        "max_text_length": 200,
        # Local copies of the auxiliary model files -- update these paths.
        "mel_norm_file": "model_files/mel_stats.pth",
        "dvae_checkpoint": "model_files/dvae.pth",
        "tokenizer_file": "model_files/vocab.json",
        # Audio-token vocabulary layout of the GPT component.
        "gpt_num_audio_tokens": 1026,
        "gpt_start_audio_token": 1024,
        "gpt_stop_audio_token": 1025,
        "gpt_use_masking_gt_prompt_approach": True,
        "gpt_use_perceiver_resampler": True,
    }
    gpt_args = GPTArgs(**gpt_settings)

    # Extend with further XttsConfig fields here if your setup needs them.
    return XttsConfig(model_args=gpt_args, audio=audio_cfg)

def load_model(checkpoint_path):
    """Instantiate the XTTS model and load trained weights into it.

    Args:
        checkpoint_path: Location of the trained checkpoint. It is forwarded
            unchanged as the second positional argument of
            ``Xtts.load_checkpoint``.
            NOTE(review): ``main`` passes a directory ("./") here -- confirm
            whether the library expects a directory or a file in this slot.

    Returns:
        A ``(model, config)`` tuple: the model in eval mode (moved to the
        GPU when CUDA is available) and the config it was built from.
    """
    cfg = create_model_config()

    xtts = Xtts.init_from_config(cfg)
    xtts.load_checkpoint(cfg, checkpoint_path)
    xtts.eval()

    # Inference runs on the GPU whenever one is usable, otherwise on CPU.
    gpu_usable = torch.cuda.is_available()
    if gpu_usable:
        xtts.cuda()

    return xtts, cfg

def generate_speech(model, text, language, speaker_wav, config, output_path, n):
    """Synthesize ``text`` with ``model`` and write the audio to ``output_path``.

    Args:
        model: Object exposing ``synthesize(...)`` that returns a mapping
            with a ``"wav"`` entry holding the generated samples.
        text: Text to be spoken.
        language: Language code forwarded to ``model.synthesize``.
        speaker_wav: Reference audio forwarded to ``model.synthesize``.
        config: Model configuration forwarded to ``model.synthesize``. When
            it carries ``audio.output_sample_rate`` that value is used as
            the sample rate of the written file; otherwise 24000 is used.
        output_path: Destination of the generated audio file.
        n: Unused. Kept so that existing callers passing it keep working.
    """
    outputs = model.synthesize(
        text=text,
        language=language,
        speaker_wav=speaker_wav,
        config=config,
        temperature=1.0,
        length_penalty=0.8,
        repetition_penalty=2.0,
    )

    # ``as_tensor`` accepts a list, a NumPy array or a tensor without the
    # extra copy (and the warning on tensor input) that ``torch.tensor``
    # causes; the file is written from a detached CPU tensor.
    audio_tensor = torch.as_tensor(outputs["wav"]).detach().cpu()

    # The saver expects (channels, samples); promote mono 1-D audio to (1, N)
    # and leave audio that already has a channel dimension untouched.
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    # Take the rate from the config so the file header always matches the
    # model's output rate; 24000 is the fallback the script used before.
    audio_settings = getattr(config, "audio", None)
    sample_rate = getattr(audio_settings, "output_sample_rate", 24000)

    torchaudio.save(output_path, audio_tensor, sample_rate)
    print(f"Generated audio saved to: {output_path}")

def main():
    """Load the model and synthesize a few example sentences.

    Each generated clip is written to ``generated_audio/`` as
    ``generated_speech_<index>.wav``.

    Raises:
        FileNotFoundError: If one of the auxiliary model files or the
            speaker reference audio is missing.
    """
    # Set your paths
    checkpoint_path = "./"  # Your trained model checkpoint
    speaker_wav = "./speaker.wav"  # Reference audio for speaker characteristics
    output_dir = "generated_audio"

    # Validate inputs with real exceptions: ``assert`` statements are removed
    # when Python runs with -O, which would silently skip these checks.
    required_files = {
        "model_files/mel_stats.pth": "mel_stats.pth not found",
        "model_files/dvae.pth": "dvae.pth not found",
        "model_files/vocab.json": "vocab.json not found",
        speaker_wav: f"speaker reference audio not found: {speaker_wav}",
    }
    for path, message in required_files.items():
        if not os.path.exists(path):
            raise FileNotFoundError(message)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load the model
    print("Loading model...")
    model, config = load_model(checkpoint_path)

    # Example texts to generate
    texts = [
        "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        "Somebody once told me the world is gonna roll me, I ain't the sharpest tool in the shed. She was looking kind of dumb with her finger and her thumb, In the shape of an L on her forehead",
        # The apostrophe was previously stored as mis-decoded bytes, which
        # would have been passed verbatim to the synthesizer.
        "They’re taking the hobbits to Isengard!",
    ]

    # Generate speech for each text
    for i, text in enumerate(texts):
        output_path = os.path.join(output_dir, f"generated_speech_{i}.wav")
        print(f"\nGenerating speech for text {i+1}...")
        print(f"Text: {text}")

        generate_speech(
            model=model,
            text=text,
            language="en",  # Change according to your needs
            speaker_wav=speaker_wav,
            config=config,
            output_path=output_path,
            n=i,
        )

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-01-19 20:59:19.199396: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-19 20:59:19.216174: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-19 20:59:19.229745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737316759.245180   14721 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737316759.249655   14721 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-19 20:59:19.267759: I tensorflow/core/platform/cpu_feature_guard.cc:210] This Tenso

Loading model...


  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.



Generating speech for text 1...
Text: It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated audio saved to: generated_audio/generated_speech_0.wav

Generating speech for text 2...
Text: Somebody once told me the world is gonna roll me, I ain't the sharpest tool in the shed. She was looking kind of dumb with her finger and her thumb, In the shape of an L on her forehead
Generated audio saved to: generated_audio/generated_speech_1.wav

Generating speech for text 3...
Text: They’re taking the hobbits to Isengard!
Generated audio saved to: generated_audio/generated_speech_2.wav
