# Marvis-TTS-250m / app.py
# Page header captured with the file (uploader avatar: prince-canuma)
# Last commit: "Rename model" (a10db13, verified)
import spaces
import torch
import gradio as gr
import numpy as np
from transformers import AutoProcessor, CsmForConditionalGeneration
# Checkpoint identifier shared by the processor and the model loaders below.
model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"

# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Both objects are created once at import time and reused by `tts`.
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(
    model_id,
    device_map=device,
)
@spaces.GPU
def tts(text: str):
    """Synthesize speech for ``text`` with the module-level processor and model.

    Args:
        text: Prompt to speak. The demo UI suggests prefixing it with a
            speaker tag such as ``[0]`` or ``[1]``.

    Returns:
        A ``(sample_rate, waveform)`` tuple. The sample rate is the fixed
        value 24 000 and the waveform is the first item returned by
        ``model.generate`` converted to a NumPy array.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Fail fast with a message the UI can display instead of pushing an
    # empty prompt through tokenization and generation.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = processor(
        text,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)

    # NOTE(review): token_type_ids is dropped before generation, presumably
    # because generate() does not accept it — confirm against the model API.
    inputs.pop("token_type_ids", None)

    # generate audio
    audio = model.generate(**inputs, output_audio=True)
    audio_np = audio[0].cpu().numpy()
    return (24_000, audio_np)
# Page layout: prompt box and trigger button in the first column, the audio
# player in the second, with the button wired to `tts`.
with gr.Blocks(title="Marvis TTS Demo") as demo:
    gr.Markdown("## 🎙️ Marvis TTS Demo\nTry out Marvis TTS with different speakers using `[0]`, `[1]`, etc. before your text!")

    with gr.Row():
        # Input side: editable prompt pre-filled with an example sentence.
        with gr.Column():
            text_input = gr.Textbox(
                value="[0] Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices.",
                label="Text Input",
                placeholder="Enter text here... (prefix with [0], [1], etc. to choose speaker)",
                lines=3,
            )
            generate_btn = gr.Button(value="Generate Speech")

        # Output side: player for the synthesized clip.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    # Clicking the button sends the textbox content to `tts` and shows the
    # returned audio in the player.
    generate_btn.click(tts, text_input, audio_output)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()