# Hugging Face Space (status: Running on Zero, i.e. ZeroGPU hardware)
import spaces | |
import torch | |
import gradio as gr | |
import numpy as np | |
from transformers import AutoProcessor, CsmForConditionalGeneration | |
# Hub checkpoint that both the processor and the model are loaded from.
model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Loaded once at import time and shared by every request.
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(
    model_id,
    device_map=device,
)
@spaces.GPU  # ZeroGPU only attaches a GPU while a decorated function is running
def tts(text: str):
    """Synthesize speech for ``text`` and return it in Gradio's audio format.

    Args:
        text: The text to speak. The demo UI asks users to prefix it with a
            speaker tag such as ``[0]`` or ``[1]``.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is ``24_000``
        and ``samples`` is a float32 NumPy array holding the first generated
        audio item.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front so the user gets a readable message instead
    # of whatever the processor/model would do with an empty prompt.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = processor(
        text,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)

    # Drop token_type_ids when the processor emits it.
    # NOTE(review): presumably generate() does not accept this key — confirm.
    inputs.pop("token_type_ids", None)

    # generate audio
    audio = model.generate(**inputs, output_audio=True)

    # Cast to float32 before leaving torch: NumPy cannot represent bfloat16,
    # so .numpy() would raise if the model runs in reduced precision.
    audio_np = audio[0].float().cpu().numpy()

    # NOTE(review): 24 kHz is hard-coded; presumably the model's output rate —
    # confirm against the model card.
    return (24_000, audio_np)
# Static copy shown on the demo page.
_INTRO_MARKDOWN = "## 🎙️ Marvis TTS Demo\nTry out Marvis TTS with different speakers using `[0]`, `[1]`, etc. before your text!"
_DEFAULT_TEXT = "[0] Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices."
_PLACEHOLDER_TEXT = "Enter text here... (prefix with [0], [1], etc. to choose speaker)"

with gr.Blocks(title="Marvis TTS Demo") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: prompt entry and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                value=_DEFAULT_TEXT,
                placeholder=_PLACEHOLDER_TEXT,
                label="Text Input",
                lines=3,
            )
            generate_btn = gr.Button("Generate Speech")

        # Right column: player for the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    # Clicking the button feeds the textbox content to tts() and routes the
    # returned audio into the player.
    generate_btn.click(tts, text_input, audio_output)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()