Update app.py
app.py CHANGED
@@ -4,6 +4,13 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 from snac import SNAC
 import gradio as gr
+import os
+
+# Hugging Face authentication for the private model
+from huggingface_hub import login
+hf_token = os.environ.get("HF_TOKEN")
+if hf_token:
+    login(token=hf_token)
 
 # Config
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -12,24 +19,24 @@ lora_model_id = "sirekist98/spanish_conversational_tts"
 snac_model_id = "hubertsiuzdak/snac_24khz"
 
 # Load models
-tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_auth_token=True)
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
+    use_auth_token=True
 )
-model = PeftModel.from_pretrained(base_model, lora_model_id)
+model = PeftModel.from_pretrained(base_model, lora_model_id, use_auth_token=True)
 model = model.to(device)
 model.eval()
 
 snac_model = SNAC.from_pretrained(snac_model_id).to(device)
 
-# Speakers
+# Speakers
 speakers = [
     "Alex", "Carmen", "Daniel", "Diego", "Hugo", "Lucía", "María", "Pablo", "Sofía"
 ]
 
 # Helper to decode tokens to audio
-
 def decode_snac(code_list):
     layer_1, layer_2, layer_3 = [], [], []
     for i in range((len(code_list)+1)//7):
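The two hunks above wire up authentication for the private model via the `HF_TOKEN` secret. Two notes: once `login()` has run, `from_pretrained` resolves the token automatically, and `use_auth_token` is deprecated in recent transformers releases in favor of `token`. A minimal sketch of the newer form; the base model id does not appear in this diff, so the repo name below is a placeholder:

import os
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = os.environ.get("HF_TOKEN")  # set as a secret in the Space settings
if hf_token:
    login(token=hf_token)  # after this, from_pretrained picks the token up implicitly

base_model_id = "your-org/your-base-model"  # placeholder; the real id is outside this diff
tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=hf_token)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, token=hf_token)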
@@ -41,9 +48,7 @@ def decode_snac(code_list):
         layer_3.append(code_list[7*i+5]-(5*4096))
         layer_3.append(code_list[7*i+6]-(6*4096))
 
-    # Get the device of the first codebook
     device_snac = snac_model.quantizer.quantizers[0].codebook.weight.device
-
     layers = [
         torch.tensor(layer_1).unsqueeze(0).to(device_snac),
         torch.tensor(layer_2).unsqueeze(0).to(device_snac),
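decode_snac unpacks flat 7-token frames into SNAC's three codebook layers, subtracting an offset of position*4096 from each of the seven positions. Only positions 5 and 6 are visible in the hunk above, so the full interleaving in this sketch is an assumption based on the usual Orpheus/SNAC layout (1 coarse, 2 medium, 4 fine codes per frame):

# Assumed full per-frame unpacking; positions 5 and 6 match the hunk,
# the rest is inferred from the standard Orpheus/SNAC interleaving.
def unpack_frame(code_list, i, layer_1, layer_2, layer_3):
    frame = code_list[7*i : 7*i + 7]
    layer_1.append(frame[0])           # coarse codebook, no offset
    layer_2.append(frame[1] - 1*4096)  # medium
    layer_3.append(frame[2] - 2*4096)  # fine
    layer_3.append(frame[3] - 3*4096)  # fine
    layer_2.append(frame[4] - 4*4096)  # medium
    layer_3.append(frame[5] - 5*4096)  # fine (matches the hunk)
    layer_3.append(frame[6] - 6*4096)  # fine (matches the hunk)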
@@ -54,22 +59,17 @@ def decode_snac(code_list):
     audio = snac_model.decode(layers).squeeze().cpu().numpy()
     return audio
 
-
-# Inference (no emotions)
+# Inference
 @GPU
 def tts(prompt, speaker):
-    # Prompt structure: "<SPEAKER>: <text>"
     full_prompt = f"{speaker}: {prompt}"
-
     input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
 
-    # Special tokens (same as your previous version)
     start_token = torch.tensor([[128259]], dtype=torch.long).to(device)
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.long).to(device)
 
     input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
 
-    # Fixed padding to 4260 so it matches training
     padding_len = max(0, 4260 - input_ids.shape[1])
     if padding_len > 0:
         pad = torch.full((1, padding_len), 128263, dtype=torch.long).to(device)
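The bare `@GPU` decorator presumably comes from an import (e.g. `from spaces import GPU`) above this hunk. The hard-coded ids in `tts` live in the model's extended Llama-3 vocabulary; the names below are illustrative labels for this diff, not identifiers from the source:

# Illustrative names for the special token ids used in tts():
START_OF_HUMAN   = 128259  # prepended before the "<speaker>: <text>" prompt
END_OF_TEXT      = 128009  # Llama-3 end-of-turn token
END_OF_HUMAN     = 128260  # closes the text segment
PAD_TOKEN        = 128263  # pads the input up to the fixed length 4260
START_OF_SPEECH  = 128257  # generation is cropped after its last occurrence
END_OF_SPEECH    = 128258  # stripped from the output before decoding
AUDIO_TOKEN_BASE = 128266  # subtracted to map tokens back to SNAC codes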
@@ -95,7 +95,6 @@ def tts(prompt, speaker):
         use_cache=True,
     )
 
-    # Post-processing: crop from the last 128257 token and strip out 128258
     token_to_find = 128257
     token_to_remove = 128258
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
@@ -106,26 +105,22 @@ def tts(prompt, speaker):
         cropped = generated_ids
 
     cleaned = cropped[cropped != token_to_remove]
-
-    # Ensure multiples of 7 and adjust the SNAC offset
     trimmed = cleaned[: (len(cleaned) // 7) * 7]
     trimmed = [int(t) - 128266 for t in trimmed]
 
     audio = decode_snac(trimmed)
     return (24000, audio)
 
-
-# Gradio UI (simple: text + speaker)
+# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# 🗣️ Orpheus Spanish TTS — sin emociones\nSelecciona un *speaker* y escribe el texto.")
-
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(label="Texto", placeholder="Escribe aquí el texto a locutar")
             speaker_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label="Speaker")
             submit_btn = gr.Button("Generar audio")
         with gr.Column():
-            audio_output = gr.Audio(label="Audio generado")
+            audio_output = gr.Audio(label="Audio generado", type="numpy")
 
     submit_btn.click(
         fn=tts,
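The lines between the two post-processing hunks above, where `cropped` gets its first assignment, are not part of this diff. A typical implementation of the crop the removed comment describes, offered here only as an assumption, would be:

# Assumed missing lines (not shown in the diff): keep everything after the
# last START_OF_SPEECH marker (128257), otherwise keep the full sequence.
if len(token_indices[1]) > 0:
    last_idx = token_indices[1][-1].item()
    cropped = generated_ids[0, last_idx + 1:]
else:
    cropped = generated_ids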
@@ -133,5 +128,4 @@ with gr.Blocks() as demo:
         outputs=audio_output,
     )
 
-
-demo.launch()
+demo.queue().launch(show_error=True)
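`demo.queue()` matters on Spaces because long GPU generations can outlive a plain HTTP request, and the queue keeps them alive; `show_error=True` surfaces Python tracebacks in the browser. A hedged usage sketch for calling the running app with gradio_client; both the Space id and the endpoint name are assumptions, since neither appears in this diff:

from gradio_client import Client

# Space id assumed to mirror the LoRA repo id; adjust to the real Space.
client = Client("sirekist98/spanish_conversational_tts")
# Endpoint name guessed from the function name; confirm with client.view_api().
result = client.predict(
    "Hola, ¿qué tal?",  # prompt text
    "Alex",             # speaker
    api_name="/tts",
)
print(result)  # gradio_client saves the audio output and returns its path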