"""Text-to-speech helper built on the Microsoft SpeechT5 checkpoints."""

from typing import Optional

import soundfile as sf
import torch
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Sample rate (Hz) passed to soundfile when writing the generated audio.
SAMPLE_RATE = 16000

# Initialize the model and processor from Hugging Face
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
# generate_speech() returns a mel spectrogram unless it is given a vocoder,
# so the HiFi-GAN vocoder is needed to obtain an actual waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def text_to_speech(
    text: str,
    speaker_embeddings: Optional[torch.Tensor] = None,
    output_path: str = "output.wav",
) -> str:
    """Synthesize ``text`` into speech and save it as a WAV file.

    Args:
        text: The text to speak.
        speaker_embeddings: Optional speaker x-vector that selects the voice.
            Anything ``torch.as_tensor`` accepts is allowed; a 1-D vector is
            promoted to a batch of one. When omitted, an all-zero embedding
            sized from ``model.config.speaker_embedding_dim`` is used.
            NOTE(review): the zero vector mirrors the Transformers docs
            example and is only a placeholder voice -- pass a real x-vector
            for production-quality output.
        output_path: Destination of the WAV file. Defaults to
            ``"output.wav"``, the path the function has always written to.

    Returns:
        The path of the written WAV file (``output_path``).
    """
    # Process the input text into tokens
    inputs = processor(text=text, return_tensors="pt")

    if speaker_embeddings is None:
        speaker_embeddings = torch.zeros(
            (1, model.config.speaker_embedding_dim)
        )
    else:
        speaker_embeddings = torch.as_tensor(
            speaker_embeddings, dtype=torch.float32
        )
        if speaker_embeddings.dim() == 1:
            # The model expects a leading batch dimension.
            speaker_embeddings = speaker_embeddings.unsqueeze(0)

    # Generate speech; no gradients are needed for inference.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder,
        )

    # Save the generated speech as a WAV file
    sf.write(output_path, speech.squeeze().cpu().numpy(), SAMPLE_RATE)
    return output_path