import gradio as gr
import torch
from transformers import VitsModel, AutoTokenizer
import soundfile as sf
import tempfile

LANG_MODEL_MAP = {
    "English": "facebook/mms-tts-eng",
    "Hindi": "facebook/mms-tts-hin",
    "Tamil": "facebook/mms-tts-tam",
    "Malayalam": "facebook/mms-tts-mal",
    "Kannada": "facebook/mms-tts-kan",
}
device = "cuda" if torch.cuda.is_available() else "cpu"

# Cache (tokenizer, model) pairs so each checkpoint is downloaded and loaded only once
cache = {}


def load_model_and_tokenizer(language):
    model_name = LANG_MODEL_MAP[language]
    if model_name not in cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name).to(device)
        cache[model_name] = (tokenizer, model)
    return cache[model_name]
def tts(language, text):
    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs)
    # output.waveform has shape (batch, samples); take the first (and only) item
    waveform = output.waveform[0].cpu().numpy()
    # Save the waveform to a temp file at the model's native sampling rate (16 kHz for MMS)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        sf.write(f.name, waveform, samplerate=model.config.sampling_rate)
    return f.name
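
# Quick sanity check outside the Gradio UI (illustrative only; assumes the MMS
# checkpoints above can be downloaded in the current environment):
#
#     wav_path = tts("English", "Hello from the MMS demo.")
#     print(wav_path)  # path to a playable WAV file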

iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
        gr.Textbox(label="Enter Text", placeholder="Type something..."),
    ],
    outputs=gr.Audio(type="filepath", label="Synthesized Audio"),
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech in English, Hindi, Tamil, Malayalam, or Kannada using Meta's MMS TTS models.",
)

if __name__ == "__main__":
    iface.launch()