"""Gradio demo app for the Marvis TTS text-to-speech model.

Loads the processor and model once at import time, exposes a single
``tts`` function, and wires it to a small Gradio Blocks UI.
"""

# NOTE(review): `spaces` is kept ahead of `torch`, matching the original
# import order — presumably required by the Spaces GPU runtime; confirm
# before letting an import sorter reorder these.
import spaces
import torch
import gradio as gr
import numpy as np
from transformers import AutoProcessor, CsmForConditionalGeneration

model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sample rate (Hz) reported to the gr.Audio component alongside the waveform.
# NOTE(review): assumed to match the model's audio output rate — confirm.
SAMPLE_RATE = 24_000

processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)


@spaces.GPU
def tts(text: str) -> tuple[int, np.ndarray]:
    """Synthesize speech for *text* and return it for a ``gr.Audio`` output.

    Args:
        text: Text to speak. The UI instructs users to prefix it with a
            speaker tag such as ``[0]`` or ``[1]``.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the first
        generated audio item converted to a NumPy array.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Fail early with a user-visible message instead of sending an empty
    # prompt through the processor and model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = processor(
        text,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)

    # Drop token_type_ids when the processor emits them; presumably
    # generate() does not accept that key — verify against the model docs.
    inputs.pop("token_type_ids", None)

    # generate audio
    audio = model.generate(**inputs, output_audio=True)

    # Cast to float32 before leaving torch: NumPy has no bfloat16 dtype, so
    # .numpy() would raise if the model runs in reduced precision. This is a
    # no-op when the tensor is already float32.
    audio_np = audio[0].cpu().float().numpy()

    return (SAMPLE_RATE, audio_np)


with gr.Blocks(title="Marvis TTS Demo") as demo:
    gr.Markdown(
        "## 🎙️ Marvis TTS Demo\n"
        "Try out Marvis TTS with different speakers using `[0]`, `[1]`, etc. before your text!"
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text Input",
                value="[0] Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices.",
                lines=3,
                placeholder="Enter text here... (prefix with [0], [1], etc. to choose speaker)",
            )
            generate_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    # Event listeners must be registered inside the Blocks context.
    generate_btn.click(
        fn=tts,
        inputs=text_input,
        outputs=audio_output,
    )


if __name__ == "__main__":
    demo.launch()