"""Gradio demo for an Orpheus-style Spanish TTS model with per-voice emotions.

The script loads a causal language model plus a LoRA adapter and a SNAC audio
codec at import time, builds a prompt of the form ``"<voice> (<emotion>): <text>"``,
samples audio-code tokens from the language model, and decodes them to a
waveform with SNAC.
"""

# NOTE(review): `spaces` is kept as the very first import, ahead of `torch`,
# exactly as in the original import order — presumably required by the
# Hugging Face ZeroGPU runtime; confirm before reordering.
from spaces import GPU
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from snac import SNAC
import gradio as gr

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model_id = "canopylabs/3b-es_it-pretrain-research_release"
lora_model_id = "sirekist98/orpheustts_spanish_finetuned"
snac_model_id = "hubertsiuzdak/snac_24khz"

# Special token ids used to frame the prompt and to post-process the output.
# NOTE(review): the role names below are inferred from how each id is used in
# this file — confirm against the model card / tokenizer config.
_PROMPT_START_TOKEN_ID = 128259      # prepended to the tokenized prompt
_PROMPT_END_TOKEN_IDS = (128009, 128260)  # appended to the tokenized prompt
_PAD_TOKEN_ID = 128263               # left-padding filler
_AUDIO_START_TOKEN_ID = 128257       # output is cropped after its last occurrence
_EOS_TOKEN_ID = 128258               # stops generation; stripped from the output
_AUDIO_TOKEN_OFFSET = 128266         # subtracted from token ids to get SNAC codes

_PADDED_PROMPT_LENGTH = 4260         # prompts are left-padded up to this length
_SAMPLE_RATE = 24000                 # sample rate reported to the Gradio audio widget

# Layout of one 7-code audio frame: entry k is the index of the SNAC layer
# that receives the code at position k of the frame.
_FRAME_LAYER_INDEX = (0, 1, 2, 2, 1, 2, 2)
_CODES_PER_FRAME = len(_FRAME_LAYER_INDEX)
# Position k of a frame is stored with an offset of k * 4096.
# NOTE(review): 4096 is presumably the SNAC codebook size — confirm.
_CODEBOOK_SIZE = 4096

# ---------------------------------------------------------------------------
# Load models
# ---------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model = PeftModel.from_pretrained(base_model, lora_model_id)
model = model.to(device)
model.eval()
snac_model = SNAC.from_pretrained(snac_model_id).to(device)

# ---------------------------------------------------------------------------
# Emotions and voices
# ---------------------------------------------------------------------------
# Each UI label is defined exactly once so that `emotions` and
# `voice_emotions` can never disagree on spelling.
_ECSTASY = "Ecstasy / Pleasure / Joy / Bliss / Rapture / Beatitude"
_FEAR = "Fear / Terror / Panic / Dread / Apprehension / Horror"
_NUMBNESS = "Numbness / Isolation / Apathy / Detachment / Insensitivity"
_INTEREST = "Interest / Fascination / Curiosity / Intrigue / Attention"
_CONTEMPT = "Contempt / Disdain / Repulsion / Loathing / Detestation / Rejection"
_DESPAIR = "Despair / Surrender / Defeat / Helplessness / Powerlessness / Submission"
_SURPRISE = "Surprise / Amazement / Shock / Astonishment / Disbelief"
_CONFUSION = "Confusion / Disorientation / Perplexity / Bewilderment / Uncertainty"
_TENDERNESS = "Tenderness / Warmth / Affection / Sympathy / Compassion / Trust"
_PRIDE = "Pride / Dignity / Honor / Self-confidence / Respect"
_SOURNESS = "Sourness / Sharpness / Bitterness / Tartness / Acidity"

# UI label -> emotion tag inserted into the prompt.
emotions = {
    _ECSTASY: "intense_ecstasy_pleasure_bliss_rapture_beatitude",
    _FEAR: "intense_fear_dread_apprehension_horror_terror_panic",
    _NUMBNESS: "intense_numbness_detachment_insensitivity_apathy",
    _INTEREST: "intense_interest_fascination_curiosity_intrigue_attention",
    _CONTEMPT: "intense_contempt_disdain_loathing_detestation_rejection",
    _DESPAIR: "intense_helplessness_powerlessness_desperation_submission_defeat",
    _SURPRISE: "intense_astonishment_surprise_amazement_shock_disbelief",
    _CONFUSION: "intense_confusion_bewilderment_disorientation_perplexity_uncertainty",
    _TENDERNESS: "intense_sympathy_compassion_warmth_trust_tenderness_affection",
    _PRIDE: "intense_pride_dignity_self_confidence_honor_respect",
    _SOURNESS: "intense_sourness_tartness_acidity_sharpness_bitterness",
}

# Voice name -> UI labels of the emotions offered for that voice.
voice_emotions = {
    "alloy": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _CONFUSION, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "ash": [
        _FEAR, _NUMBNESS, _INTEREST, _SURPRISE, _TENDERNESS,
    ],
    "ballad": [
        _FEAR, _INTEREST, _CONTEMPT, _SURPRISE,
        _CONFUSION, _DESPAIR, _PRIDE, _SOURNESS,
    ],
    "coral": [
        _ECSTASY, _FEAR, _NUMBNESS, _CONTEMPT, _CONFUSION,
        _DESPAIR, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "echo": [
        _ECSTASY, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _DESPAIR, _PRIDE, _TENDERNESS,
    ],
    "fable": [
        _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT, _DESPAIR, _SOURNESS,
    ],
    "nova": [
        _ECSTASY, _CONTEMPT, _SURPRISE, _CONFUSION,
        _DESPAIR, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "onyx": [
        _FEAR, _NUMBNESS, _INTEREST, _CONFUSION,
        _DESPAIR, _PRIDE, _TENDERNESS,
    ],
    "sage": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _SURPRISE,
        _CONFUSION, _PRIDE, _SOURNESS, _TENDERNESS,
    ],
    "shimmer": [
        _ECSTASY, _FEAR, _NUMBNESS, _INTEREST, _CONTEMPT,
        _SURPRISE, _CONFUSION, _DESPAIR, _SOURNESS,
    ],
    "verse": [
        _ECSTASY, _FEAR, _INTEREST, _CONTEMPT, _SOURNESS, _TENDERNESS,
    ],
}
all_voices = list(voice_emotions.keys())


# Helper to decode tokens to audio
def decode_snac(code_list):
    """Decode a flat sequence of audio codes into a waveform with SNAC.

    The sequence is read in frames of 7 codes. Within a frame, the code at
    position ``k`` carries an offset of ``k * 4096`` and is routed to one of
    three SNAC layers (positions 0 | 1, 4 | 2, 3, 5, 6).

    Args:
        code_list: Sequence of integer codes (already shifted so the first
            position of a frame starts at 0). A trailing partial frame is
            ignored.

    Returns:
        The decoded audio as a NumPy array (output of ``snac_model.decode``,
        squeezed and moved to the CPU).

    Raises:
        ValueError: If ``code_list`` holds no complete frame, or if any code
            falls outside ``[0, 4096)`` once its position offset is removed.
    """
    # Only complete frames can be decoded. The previous `(len + 1) // 7`
    # over-counted by one frame whenever `len % 7 == 6` and indexed past the
    # end of the list.
    frame_count = len(code_list) // _CODES_PER_FRAME
    if frame_count == 0:
        raise ValueError("decode_snac needs at least one complete 7-code frame")

    layer_codes = ([], [], [])
    for frame_start in range(0, frame_count * _CODES_PER_FRAME, _CODES_PER_FRAME):
        for position, layer_index in enumerate(_FRAME_LAYER_INDEX):
            code = int(code_list[frame_start + position]) - position * _CODEBOOK_SIZE
            # Fail loudly here rather than letting an out-of-range index reach
            # the codec's codebook lookup, where the failure is much harder
            # to diagnose.
            if not 0 <= code < _CODEBOOK_SIZE:
                raise ValueError(
                    f"audio code out of range at index {frame_start + position}: {code}"
                )
            layer_codes[layer_index].append(code)

    # Use the device of the first codebook so the code tensors live where the
    # codec's weights live (named distinctly to avoid shadowing the global
    # `device`).
    snac_device = snac_model.quantizer.quantizers[0].codebook.weight.device
    layers = [
        torch.tensor(codes, dtype=torch.long, device=snac_device).unsqueeze(0)
        for codes in layer_codes
    ]
    with torch.no_grad():
        audio = snac_model.decode(layers).squeeze().cpu().numpy()
    return audio


# Function to update emotions based on selected voice
def update_emotions(voice):
    """Return a ``gr.Dropdown`` update listing the emotions for ``voice``.

    Args:
        voice: Voice name selected in the UI.

    Returns:
        A ``gr.Dropdown`` whose choices are the emotions configured for the
        voice (or every known emotion when the voice is unknown) and whose
        value is the first choice, or ``None`` when there are no choices.
    """
    available_emotions = voice_emotions.get(voice)
    if available_emotions is None:
        # Unknown voice: fall back to the full emotion catalogue.
        available_emotions = list(emotions)
    default_emotion = available_emotions[0] if available_emotions else None
    return gr.Dropdown(choices=available_emotions, value=default_emotion)


# Inference
@GPU
def tts(prompt, user_selected_emotion, voice):
    """Synthesize speech for ``prompt`` with the given voice and emotion.

    Args:
        prompt: Text to speak.
        user_selected_emotion: UI label of the emotion; must be a key of
            ``emotions``.
        voice: Voice name placed at the start of the model prompt.

    Returns:
        A ``(sample_rate, audio)`` tuple suitable for ``gr.Audio``, with
        ``sample_rate`` equal to 24000.

    Raises:
        gr.Error: If the text is empty, the emotion is unknown, or the model
            output cannot be decoded into audio.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Escribe un texto para generar el audio.")
    try:
        chosen_emotion = emotions[user_selected_emotion]
    except KeyError:
        raise gr.Error(f"Emoción no reconocida: {user_selected_emotion!r}") from None

    full_prompt = f"{voice} ({chosen_emotion}): {prompt}"
    text_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
    start_token = torch.tensor(
        [[_PROMPT_START_TOKEN_ID]], dtype=torch.long, device=device
    )
    end_tokens = torch.tensor(
        [list(_PROMPT_END_TOKEN_IDS)], dtype=torch.long, device=device
    )
    prompt_ids = torch.cat([start_token, text_ids, end_tokens], dim=1)
    prompt_len = prompt_ids.shape[1]

    # Left-pad to the fixed prompt length. Clamp at zero so an over-long
    # prompt is passed through unpadded instead of crashing on a negative
    # tensor dimension.
    padding_len = max(0, _PADDED_PROMPT_LENGTH - prompt_len)
    pad = torch.full(
        (1, padding_len), _PAD_TOKEN_ID, dtype=torch.long, device=device
    )
    input_ids = torch.cat([pad, prompt_ids], dim=1)
    # Mask out the padding, attend to every real prompt token.
    attention_mask = torch.cat(
        [
            torch.zeros((1, padding_len), dtype=torch.long, device=device),
            torch.ones((1, prompt_len), dtype=torch.long, device=device),
        ],
        dim=1,
    )

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1200,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=_EOS_TOKEN_ID,
            use_cache=True,
        )

    # Keep only what follows the last audio-start marker, when one exists.
    marker_columns = (generated_ids == _AUDIO_START_TOKEN_ID).nonzero(as_tuple=True)[1]
    if len(marker_columns) > 0:
        last_occurrence_idx = marker_columns[-1].item()
        cropped = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped = generated_ids

    # Drop end-of-sequence tokens, cut to whole frames, shift ids to codes.
    cleaned = cropped[cropped != _EOS_TOKEN_ID]
    usable = (len(cleaned) // _CODES_PER_FRAME) * _CODES_PER_FRAME
    codes = [int(token) - _AUDIO_TOKEN_OFFSET for token in cleaned[:usable]]
    if not codes:
        raise gr.Error("El modelo no generó audio. Inténtalo de nuevo.")

    try:
        audio = decode_snac(codes)
    except ValueError as err:
        raise gr.Error(
            "El modelo generó códigos de audio no válidos. Inténtalo de nuevo."
        ) from err
    return (_SAMPLE_RATE, audio)


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Create Gradio interface with dynamic emotion updating
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Orpheus Spanish TTS Finetuned with multiple emotions and voices")
    gr.Markdown("Fine tuned model and SNAC decoding. The available emotions change according to the selected voice.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Texto")
            voice_dropdown = gr.Dropdown(
                choices=all_voices,
                value=all_voices[0],
                label="Voz",
            )
            emotion_dropdown = gr.Dropdown(
                choices=voice_emotions[all_voices[0]],
                value=voice_emotions[all_voices[0]][0],
                label="Emoción",
            )
            submit_btn = gr.Button("Generar Audio")
        with gr.Column():
            audio_output = gr.Audio(label="Audio generado")

    # Update emotions when voice changes
    voice_dropdown.change(
        fn=update_emotions,
        inputs=voice_dropdown,
        outputs=emotion_dropdown,
    )

    # Generate audio
    submit_btn.click(
        fn=tts,
        inputs=[text_input, emotion_dropdown, voice_dropdown],
        outputs=audio_output,
    )

# Launched at import time, as in the original script.
demo.launch()