# Author: sirekist98
# Commit message: Update app.py
# Commit: c0107c0 (verified)
from spaces import GPU
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from snac import SNAC
import gradio as gr
# Config
# Pick the GPU when torch can see one; everything below follows this choice.
_use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _use_cuda else "cpu")
base_model_id = "canopylabs/3b-es_it-pretrain-research_release"
lora_model_id = "sirekist98/spanish_tts_emotions"
snac_model_id = "hubertsiuzdak/snac_24khz"

# Load models
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Half precision on GPU, full precision on CPU.
_model_dtype = torch.float16 if _use_cuda else torch.float32
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=_model_dtype)
# Wrap the base model with the PEFT adapter, then move it to the target
# device and switch it to evaluation mode.
model = PeftModel.from_pretrained(base_model, lora_model_id).to(device)
model.eval()
snac_model = SNAC.from_pretrained(snac_model_id)
snac_model = snac_model.to(device)
# Emotions and voices
# Each display label is spelled exactly once here and reused by both tables
# below, so the per-voice lists cannot drift away from the keys of `emotions`.
_ECSTASY = "Ecstasy / Pleasure / Joy / Bliss / Rapture / Beatitude"
_FEAR = "Fear / Terror / Panic / Dread / Apprehension / Horror"
_NUMBNESS = "Numbness / Isolation / Apathy / Detachment / Insensitivity"
_INTEREST = "Interest / Fascination / Curiosity / Intrigue / Attention"
_CONTEMPT = "Contempt / Disdain / Repulsion / Loathing / Detestation / Rejection"
_DESPAIR = "Despair / Surrender / Defeat / Helplessness / Powerlessness / Submission"
_SURPRISE = "Surprise / Amazement / Shock / Astonishment / Disbelief"
_CONFUSION = "Confusion / Disorientation / Perplexity / Bewilderment / Uncertainty"
_TENDERNESS = "Tenderness / Warmth / Affection / Sympathy / Compassion / Trust"
_PRIDE = "Pride / Dignity / Honor / Self-confidence / Respect"
_SOURNESS = "Sourness / Sharpness / Bitterness / Tartness / Acidity"

# Display label -> emotion tag inserted into the model prompt.
emotions = {
    _ECSTASY: "intense_ecstasy_pleasure_bliss_rapture_beatitude",
    _FEAR: "intense_fear_dread_apprehension_horror_terror_panic",
    _NUMBNESS: "intense_numbness_detachment_insensitivity_apathy",
    _INTEREST: "intense_interest_fascination_curiosity_intrigue_attention",
    _CONTEMPT: "intense_contempt_disdain_loathing_detestation_rejection",
    _DESPAIR: "intense_helplessness_powerlessness_desperation_submission_defeat",
    _SURPRISE: "intense_astonishment_surprise_amazement_shock_disbelief",
    _CONFUSION: "intense_confusion_bewilderment_disorientation_perplexity_uncertainty",
    _TENDERNESS: "intense_sympathy_compassion_warmth_trust_tenderness_affection",
    _PRIDE: "intense_pride_dignity_self_confidence_honor_respect",
    _SOURNESS: "intense_sourness_tartness_acidity_sharpness_bitterness",
}

# Voice name -> display labels offered for that voice, in dropdown order.
voice_emotions = {
    "alloy": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _CONFUSION, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "ash": [
        _FEAR, _NUMBNESS, _INTEREST, _SURPRISE, _TENDERNESS,
    ],
    "ballad": [
        _FEAR, _INTEREST, _CONTEMPT, _SURPRISE,
        _CONFUSION, _DESPAIR, _PRIDE, _SOURNESS,
    ],
    "coral": [
        _ECSTASY, _FEAR, _NUMBNESS, _CONTEMPT, _CONFUSION,
        _DESPAIR, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "echo": [
        _ECSTASY, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _DESPAIR, _PRIDE, _TENDERNESS,
    ],
    "fable": [
        _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT, _DESPAIR, _SOURNESS,
    ],
    "nova": [
        _ECSTASY, _CONTEMPT, _SURPRISE, _CONFUSION,
        _DESPAIR, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "onyx": [
        _FEAR, _NUMBNESS, _INTEREST, _CONFUSION,
        _DESPAIR, _PRIDE, _TENDERNESS,
    ],
    "sage": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _SURPRISE,
        _CONFUSION, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "shimmer": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _CONFUSION, _DESPAIR, _SOURNESS,
    ],
    "verse": [
        _ECSTASY, _FEAR, _INTEREST, _CONTEMPT, _SOURNESS, _TENDERNESS,
    ],
}

# Voice names in the order they are declared above.
all_voices = list(voice_emotions)
# Helper to decode tokens to audio
def decode_snac(code_list):
    """Turn a flat sequence of audio codes into a waveform via ``snac_model``.

    The codes are read in frames of seven. Position ``k`` of a frame has
    ``k * 4096`` subtracted and is routed to one of three layers:
    position 0 to layer 1, positions 1 and 4 to layer 2, and positions
    2, 3, 5 and 6 to layer 3.

    Args:
        code_list: Indexable sequence of integer codes. Values after the last
            complete frame of seven are ignored.

    Returns:
        The squeezed output of ``snac_model.decode`` as a NumPy array on the
        CPU.

    Raises:
        ValueError: If ``code_list`` does not contain one complete frame.
    """
    # Count whole frames only: rounding up (e.g. ``(len + 1) // 7``) would
    # index past the end of the sequence for lengths of 7k + 6.
    num_frames = len(code_list) // 7
    if num_frames == 0:
        raise ValueError(
            f"decode_snac needs at least 7 codes, got {len(code_list)}"
        )

    layer_1, layer_2, layer_3 = [], [], []
    for i in range(num_frames):
        base = 7 * i
        layer_1.append(code_list[base])
        layer_2.append(code_list[base + 1] - 4096)
        layer_3.append(code_list[base + 2] - (2 * 4096))
        layer_3.append(code_list[base + 3] - (3 * 4096))
        layer_2.append(code_list[base + 4] - (4 * 4096))
        layer_3.append(code_list[base + 5] - (5 * 4096))
        layer_3.append(code_list[base + 6] - (6 * 4096))

    # Use the device of the first codebook so the code tensors live where the
    # decoder weights are. Named distinctly to avoid shadowing the module-level
    # ``device``.
    codebook_device = snac_model.quantizer.quantizers[0].codebook.weight.device
    layers = [
        torch.tensor(codes, dtype=torch.long, device=codebook_device).unsqueeze(0)
        for codes in (layer_1, layer_2, layer_3)
    ]
    with torch.no_grad():
        audio = snac_model.decode(layers).squeeze().cpu().numpy()
    return audio
# Function to update emotions based on selected voice
def update_emotions(voice):
    """Build the emotion dropdown that matches ``voice``.

    A voice listed in ``voice_emotions`` gets its own emotion list, with the
    first entry preselected (or nothing when that list is empty). Any other
    voice gets every key of ``emotions``, with the first key preselected.
    """
    if voice not in voice_emotions:
        every_emotion = list(emotions.keys())
        return gr.Dropdown(choices=every_emotion, value=every_emotion[0])

    options = voice_emotions[voice]
    if options:
        preselected = options[0]
    else:
        preselected = None
    return gr.Dropdown(choices=options, value=preselected)
# Inference
# Token ids and sizes used to frame the prompt and to parse the generation.
_PROMPT_PREFIX_IDS = [128259]        # prepended to the tokenized prompt
_PROMPT_SUFFIX_IDS = [128009, 128260]  # appended to the tokenized prompt
_PAD_TOKEN_ID = 128263               # left-padding filler, masked out
_PADDED_PROMPT_LEN = 4260            # prompts shorter than this are left-padded
_CROP_AFTER_TOKEN_ID = 128257        # audio codes follow its last occurrence
_EOS_TOKEN_ID = 128258               # stops generation; stripped from the output
_CODE_TOKEN_OFFSET = 128266          # subtracted from token ids to get codes
_OUTPUT_SAMPLE_RATE = 24000          # sample rate reported to the audio widget


@GPU
def tts(prompt, user_selected_emotion, voice):
    """Synthesize speech for ``prompt`` in the given voice and emotion.

    Args:
        prompt: Text to speak.
        user_selected_emotion: Display label; must be a key of ``emotions``.
        voice: Voice name placed at the start of the model prompt.

    Returns:
        A ``(sample_rate, audio)`` tuple where ``sample_rate`` is 24000 and
        ``audio`` is the array returned by ``decode_snac``.

    Raises:
        gr.Error: If the prompt is blank, the emotion is unknown, or the
            model produced no usable audio codes.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Escribe un texto para generar el audio.")
    chosen_emotion = emotions.get(user_selected_emotion)
    if chosen_emotion is None:
        raise gr.Error(f"Emoción no reconocida: {user_selected_emotion}")

    full_prompt = f"{voice} ({chosen_emotion}): {prompt}"
    text_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
    prefix = torch.tensor([_PROMPT_PREFIX_IDS], dtype=torch.long, device=device)
    suffix = torch.tensor([_PROMPT_SUFFIX_IDS], dtype=torch.long, device=device)
    framed = torch.cat([prefix, text_ids, suffix], dim=1)

    # Left-pad up to the fixed length. Clamp at zero so a prompt that is
    # already longer than the target does not request a negative-size tensor.
    padding_len = max(0, _PADDED_PROMPT_LEN - framed.shape[1])
    pad = torch.full((1, padding_len), _PAD_TOKEN_ID, dtype=torch.long, device=device)
    input_ids = torch.cat([pad, framed], dim=1)
    # Zeros over the padding, ones over the real prompt tokens.
    attention_mask = torch.cat([torch.zeros_like(pad), torch.ones_like(framed)], dim=1)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1200,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=_EOS_TOKEN_ID,
            use_cache=True,
        )

    # Keep only what follows the last marker token.
    marker_cols = (generated_ids == _CROP_AFTER_TOKEN_ID).nonzero(as_tuple=True)[1]
    if marker_cols.numel() > 0:
        cropped = generated_ids[:, marker_cols[-1].item() + 1:]
    else:
        # The returned sequence starts with the padded prompt, whose ids are
        # not audio codes, so fall back to the newly generated tokens only.
        cropped = generated_ids[:, input_ids.shape[1]:]

    cleaned = cropped[cropped != _EOS_TOKEN_ID]
    # decode_snac consumes codes in groups of seven; drop any partial group.
    usable = (cleaned.numel() // 7) * 7
    codes = (cleaned[:usable] - _CODE_TOKEN_OFFSET).tolist()
    # A negative code means a non-audio token slipped into the stream; it
    # would otherwise fail inside the decoder with an opaque indexing error.
    if not codes or min(codes) < 0:
        raise gr.Error("El modelo no generó audio válido; inténtalo de nuevo.")

    audio = decode_snac(codes)
    return (_OUTPUT_SAMPLE_RATE, audio)
# Gradio UI
# The page starts on the first voice, with that voice's emotions preloaded;
# the emotion list is then refreshed every time the voice selection changes.
_initial_voice = all_voices[0]
_initial_emotions = voice_emotions[_initial_voice]

with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Orpheus Spanish TTS Finetuned with multiple emotions and voices")
    gr.Markdown("Fine tuned model and SNAC decoding. The available emotions change according to the selected voice.")

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(label="Texto")
            voice_dropdown = gr.Dropdown(label="Voz", choices=all_voices, value=_initial_voice)
            emotion_dropdown = gr.Dropdown(
                label="Emoción",
                choices=_initial_emotions,
                value=_initial_emotions[0],
            )
            submit_btn = gr.Button("Generar Audio")
        # Right column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(label="Audio generado")

    # Swap the emotion choices whenever a different voice is picked.
    voice_dropdown.change(update_emotions, inputs=voice_dropdown, outputs=emotion_dropdown)

    # Run synthesis when the button is pressed.
    submit_btn.click(
        tts,
        inputs=[text_input, emotion_dropdown, voice_dropdown],
        outputs=audio_output,
    )

demo.launch()