# Hugging Face Space status at capture time: running on Zero.
from spaces import GPU | |
import torch | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
from peft import PeftModel | |
from snac import SNAC | |
import gradio as gr | |
# Config | |
# Runtime device and the Hugging Face Hub ids of the three checkpoints.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model_id = "canopylabs/3b-es_it-pretrain-research_release"
lora_model_id = "sirekist98/spanish_tts_emotions"
snac_model_id = "hubertsiuzdak/snac_24khz"

# Load models
# Half precision is requested only when running on CUDA.
if device.type == "cuda":
    _base_dtype = torch.float16
else:
    _base_dtype = torch.float32

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=_base_dtype,
)
# Attach the LoRA adapter to the base model, then move the result to the device.
model = PeftModel.from_pretrained(base_model, lora_model_id).to(device)
model.eval()
snac_model = SNAC.from_pretrained(snac_model_id).to(device)
# Emotions and voices | |
# Label offered in the emotion dropdown -> tag written into the model prompt.
emotions = {
    "Ecstasy / Pleasure / Joy / Bliss / Rapture / Beatitude":
        "intense_ecstasy_pleasure_bliss_rapture_beatitude",
    "Fear / Terror / Panic / Dread / Apprehension / Horror":
        "intense_fear_dread_apprehension_horror_terror_panic",
    "Numbness / Isolation / Apathy / Detachment / Insensitivity":
        "intense_numbness_detachment_insensitivity_apathy",
    "Interest / Fascination / Curiosity / Intrigue / Attention":
        "intense_interest_fascination_curiosity_intrigue_attention",
    "Contempt / Disdain / Repulsion / Loathing / Detestation / Rejection":
        "intense_contempt_disdain_loathing_detestation_rejection",
    "Despair / Surrender / Defeat / Helplessness / Powerlessness / Submission":
        "intense_helplessness_powerlessness_desperation_submission_defeat",
    "Surprise / Amazement / Shock / Astonishment / Disbelief":
        "intense_astonishment_surprise_amazement_shock_disbelief",
    "Confusion / Disorientation / Perplexity / Bewilderment / Uncertainty":
        "intense_confusion_bewilderment_disorientation_perplexity_uncertainty",
    "Tenderness / Warmth / Affection / Sympathy / Compassion / Trust":
        "intense_sympathy_compassion_warmth_trust_tenderness_affection",
    "Pride / Dignity / Honor / Self-confidence / Respect":
        "intense_pride_dignity_self_confidence_honor_respect",
    "Sourness / Sharpness / Bitterness / Tartness / Acidity":
        "intense_sourness_tartness_acidity_sharpness_bitterness",
}
# Emotion labels, spelled out once so the per-voice lists below stay readable.
_ECSTASY = "Ecstasy / Pleasure / Joy / Bliss / Rapture / Beatitude"
_FEAR = "Fear / Terror / Panic / Dread / Apprehension / Horror"
_NUMBNESS = "Numbness / Isolation / Apathy / Detachment / Insensitivity"
_INTEREST = "Interest / Fascination / Curiosity / Intrigue / Attention"
_CONTEMPT = "Contempt / Disdain / Repulsion / Loathing / Detestation / Rejection"
_DESPAIR = "Despair / Surrender / Defeat / Helplessness / Powerlessness / Submission"
_SURPRISE = "Surprise / Amazement / Shock / Astonishment / Disbelief"
_CONFUSION = "Confusion / Disorientation / Perplexity / Bewilderment / Uncertainty"
_TENDERNESS = "Tenderness / Warmth / Affection / Sympathy / Compassion / Trust"
_PRIDE = "Pride / Dignity / Honor / Self-confidence / Respect"
_SOURNESS = "Sourness / Sharpness / Bitterness / Tartness / Acidity"

# Voice name -> emotion labels offered for that voice, in display order.
voice_emotions = {
    "alloy": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _CONFUSION, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "ash": [
        _FEAR, _NUMBNESS, _INTEREST, _SURPRISE, _TENDERNESS,
    ],
    "ballad": [
        _FEAR, _INTEREST, _CONTEMPT, _SURPRISE,
        _CONFUSION, _DESPAIR, _PRIDE, _SOURNESS,
    ],
    "coral": [
        _ECSTASY, _FEAR, _NUMBNESS, _CONTEMPT, _CONFUSION,
        _DESPAIR, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "echo": [
        _ECSTASY, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _DESPAIR, _PRIDE, _TENDERNESS,
    ],
    "fable": [
        _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT, _DESPAIR, _SOURNESS,
    ],
    "nova": [
        _ECSTASY, _CONTEMPT, _SURPRISE, _CONFUSION,
        _DESPAIR, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "onyx": [
        _FEAR, _NUMBNESS, _INTEREST, _CONFUSION,
        _DESPAIR, _PRIDE, _TENDERNESS,
    ],
    "sage": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _SURPRISE,
        _CONFUSION, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "shimmer": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _CONFUSION, _DESPAIR, _SOURNESS,
    ],
    "verse": [
        _ECSTASY, _FEAR, _INTEREST, _CONTEMPT, _SOURNESS, _TENDERNESS,
    ],
}

# Voice names in the order they are declared above.
all_voices = list(voice_emotions)
# Helper to decode tokens to audio | |
def decode_snac(code_list):
    """Decode a flat stream of SNAC codes into an audio waveform.

    The stream is read in frames of seven codes. Position ``k`` inside a
    frame carries an offset of ``k * 4096``, which is removed here. The
    de-offset codes are routed to three layers: position 0 goes to layer 1,
    positions 1 and 4 to layer 2, and positions 2, 3, 5 and 6 to layer 3.

    Args:
        code_list: Sequence of integer codes. Trailing codes that do not
            fill a whole frame are ignored.

    Returns:
        The output of ``snac_model.decode`` after ``squeeze()``, moved to
        the CPU and converted to a NumPy array.

    Raises:
        ValueError: If ``code_list`` holds fewer than seven codes, i.e. no
            complete frame.
    """
    codes_per_frame = 7
    codebook_size = 4096

    # Only whole frames can be decoded. The previous bound,
    # (len + 1) // 7, counted a frame that was one code short and then
    # indexed past the end of the list.
    frame_count = len(code_list) // codes_per_frame
    if frame_count == 0:
        raise ValueError(
            "decode_snac needs at least one complete frame of 7 codes, "
            f"got {len(code_list)} code(s)"
        )

    layer_1, layer_2, layer_3 = [], [], []
    for start in range(0, frame_count * codes_per_frame, codes_per_frame):
        c0, c1, c2, c3, c4, c5, c6 = code_list[start:start + codes_per_frame]
        layer_1.append(c0)
        layer_2.append(c1 - codebook_size)
        layer_3.append(c2 - 2 * codebook_size)
        layer_3.append(c3 - 3 * codebook_size)
        layer_2.append(c4 - 4 * codebook_size)
        layer_3.append(c5 - 5 * codebook_size)
        layer_3.append(c6 - 6 * codebook_size)

    # Take the device from the first codebook so the code tensors are
    # created wherever the decoder weights currently live.
    target_device = snac_model.quantizer.quantizers[0].codebook.weight.device
    layers = [
        torch.tensor(layer, dtype=torch.long, device=target_device).unsqueeze(0)
        for layer in (layer_1, layer_2, layer_3)
    ]

    with torch.no_grad():
        audio = snac_model.decode(layers).squeeze().cpu().numpy()
    return audio
# Function to update emotions based on selected voice | |
def update_emotions(voice):
    """Build the emotion dropdown that matches the selected voice.

    Args:
        voice: Voice name coming from the voice dropdown.

    Returns:
        A ``gr.Dropdown`` whose choices are the emotions listed for
        ``voice`` in ``voice_emotions`` (first entry preselected, or no
        selection if the list is empty). For a voice that is not listed,
        every key of ``emotions`` is offered and the first one is selected.
    """
    try:
        options = voice_emotions[voice]
    except KeyError:
        # Unknown voice: offer the complete emotion catalogue.
        every_emotion = list(emotions)
        return gr.Dropdown(choices=every_emotion, value=every_emotion[0])

    preselected = options[0] if options else None
    return gr.Dropdown(choices=options, value=preselected)
# Inference | |
# Token ids used to frame the prompt and to parse the generated sequence.
_PROMPT_PREFIX_ID = 128259          # prepended to the tokenized prompt
_PROMPT_SUFFIX_IDS = (128009, 128260)  # appended to the tokenized prompt
_PAD_ID = 128263                    # left-padding filler, masked out
_AUDIO_START_ID = 128257            # codes after its last occurrence are decoded
_EOS_ID = 128258                    # passed to generate() as eos_token_id
_AUDIO_CODE_OFFSET = 128266         # subtracted from token ids to get SNAC codes
_PADDED_PROMPT_LEN = 4260           # fixed length the prompt is left-padded to
_CODES_PER_FRAME = 7                # decode_snac consumes codes in groups of 7


def tts(prompt, user_selected_emotion, voice):
    """Generate speech for ``prompt`` with the given voice and emotion.

    The text is wrapped as ``"{voice} ({emotion_tag}): {prompt}"``, framed
    with the prefix/suffix token ids, left-padded, and passed to
    ``model.generate``. The generated ids after the last audio-start marker
    are converted to SNAC codes and decoded with ``decode_snac``.

    Args:
        prompt: Text to speak.
        user_selected_emotion: A key of ``emotions``.
        voice: Voice name placed at the start of the model prompt.

    Returns:
        A ``(24000, audio)`` tuple for the ``gr.Audio`` output, where
        ``audio`` is whatever ``decode_snac`` returns.

    Raises:
        gr.Error: If the text is blank, the emotion is not a key of
            ``emotions``, or the model produced fewer than seven audio
            codes.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Escribe un texto para generar el audio.")
    if user_selected_emotion not in emotions:
        raise gr.Error("Selecciona una emoción válida para la voz elegida.")

    chosen_emotion = emotions[user_selected_emotion]
    full_prompt = f"{voice} ({chosen_emotion}): {prompt}"

    text_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
    prefix = torch.tensor([[_PROMPT_PREFIX_ID]], dtype=torch.long, device=device)
    suffix = torch.tensor([list(_PROMPT_SUFFIX_IDS)], dtype=torch.long, device=device)
    framed_ids = torch.cat([prefix, text_ids, suffix], dim=1)
    framed_len = framed_ids.shape[1]

    # Left-pad to the fixed length and mask the padding out. A prompt that
    # is already longer is sent unpadded: a negative pad length would make
    # torch.full raise.
    padding_len = max(0, _PADDED_PROMPT_LEN - framed_len)
    pad = torch.full((1, padding_len), _PAD_ID, dtype=torch.long, device=device)
    input_ids = torch.cat([pad, framed_ids], dim=1)
    attention_mask = torch.cat(
        [
            torch.zeros((1, padding_len), dtype=torch.long, device=device),
            torch.ones((1, framed_len), dtype=torch.long, device=device),
        ],
        dim=1,
    )

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1200,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=_EOS_ID,
            use_cache=True,
        )

    # One sequence is requested, so work on the single row directly.
    sequence = generated_ids[0]
    marker_positions = (sequence == _AUDIO_START_ID).nonzero(as_tuple=True)[0]
    if marker_positions.numel() > 0:
        audio_tokens = sequence[marker_positions[-1].item() + 1:]
    else:
        # No marker: keep only the newly generated ids. Falling back to the
        # whole sequence would decode pad and prompt ids as audio codes.
        audio_tokens = sequence[input_ids.shape[1]:]

    audio_tokens = audio_tokens[audio_tokens != _EOS_ID]
    usable = (audio_tokens.numel() // _CODES_PER_FRAME) * _CODES_PER_FRAME
    if usable == 0:
        raise gr.Error("El modelo no generó audio. Inténtalo de nuevo.")

    codes = (audio_tokens[:usable] - _AUDIO_CODE_OFFSET).tolist()
    audio = decode_snac(codes)
    return (24000, audio)
# Gradio UI | |
# Create Gradio interface with dynamic emotion updating | |
# Build the page: text, voice and emotion inputs on the left, audio on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Orpheus Spanish TTS Finetuned with multiple emotions and voices")
    gr.Markdown("Fine tuned model and SNAC decoding. The available emotions change according to the selected voice.")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Texto")
            voice_dropdown = gr.Dropdown(choices=all_voices, value=all_voices[0], label="Voz")
            # Start with the emotions of the voice that is selected by default.
            emotion_dropdown = gr.Dropdown(
                choices=voice_emotions[all_voices[0]],
                value=voice_emotions[all_voices[0]][0],
                label="Emoción",
            )
            submit_btn = gr.Button("Generar Audio")
        with gr.Column():
            audio_output = gr.Audio(label="Audio generado")

    # Update emotions when voice changes
    voice_dropdown.change(
        fn=update_emotions,
        inputs=voice_dropdown,
        outputs=emotion_dropdown,
    )

    # Generate audio
    # `GPU` was imported from `spaces` but never applied. On a ZeroGPU Space
    # a GPU is requested only while a function wrapped by it is running, so
    # the inference handler is wrapped here. Per the ZeroGPU documentation
    # the wrapper has no effect in other environments.
    submit_btn.click(
        fn=GPU(tts),
        inputs=[text_input, emotion_dropdown, voice_dropdown],
        outputs=audio_output,
    )

demo.launch()