Spaces:
Sleeping
Sleeping
| from transformers import VitsModel, AutoTokenizer | |
| import torch | |
| import scipy.io.wavfile | |
| import gradio as gr | |
| import numpy as np | |
# Hugging Face Hub repository that provides both the fine-tuned VITS
# checkpoint and its matching tokenizer.
MODEL_ID = "Toadoum/swahili-mms-tts-finetuned"

# Both objects are created once at import time so every request reuses them.
# device_map="auto" delegates device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = VitsModel.from_pretrained(MODEL_ID, device_map="auto")
def text_to_speech(text):
    """Synthesize speech for *text* with the module-level VITS model.

    Args:
        text: The text to speak, as entered in the Gradio textbox.

    Returns:
        A ``(sampling_rate, waveform)`` tuple suitable for a Gradio audio
        component, where ``waveform`` is the squeezed model output as a
        NumPy array, or ``None`` when *text* is empty or only whitespace.
    """
    # Nothing to synthesize: skip the tokenizer and model entirely.
    if not text or not text.strip():
        return None

    # The model is loaded with device_map="auto" and may therefore live on an
    # accelerator, while the tokenizer returns CPU tensors. Move the inputs
    # to the model's device to avoid a device-mismatch error.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Drop singleton dimensions and bring the samples back to the CPU.
    samples = waveform.squeeze().cpu().numpy()

    # The sampling rate comes from the model config.
    return (model.config.sampling_rate, samples)
# Sample text pre-filled in the input box so the demo can be tried right away.
DEFAULT_TEXT = """Neurotech Africa ni kampuni kutoka Tanzania inaongoza mapinduzi ya kidigitali nchini na barani Afrika kwa suluhisho za Akili bandia (AI).
Tunajenga AI ambayo inasaidia biashara kuboresha uzoefu wa wateja kupitia teknolojia za kisasa za mazungumzo."""

# Input and output widgets of the interface.
text_input = gr.Textbox(label="Enter Swahili Text", value=DEFAULT_TEXT)
audio_output = gr.Audio(label="Generated Speech")

# Single-function app: textbox in, synthesized audio out, flagging disabled.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=text_input,
    outputs=audio_output,
    title="Swahili Text-to-Speech",
    description="Convert Swahili text to speech using a fine-tuned MMS-TTS model",
    allow_flagging="never",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()