# Gradio demo for VyvoTTS-LFM2 multi-speaker text-to-speech (Hugging Face Spaces app)
import spaces
import torch
import gradio as gr
from snac import SNAC
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading SNAC model...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.to(device)

# Single model trained on all voices
MODEL_NAME = "Vyvo/VyvoTTS-LFM2-Multi-Speaker"
AVAILABLE_VOICES = [
    "Stephen_Fry", "Tighnari", "Thoma", "Shikanoin_Heizou", "Noelle",
    "Ningguang", "Nilou", "Neuvillette", "Navia", "Nahida", "Mualani",
    "Lyney", "Lynette", "Layla", "Kaveh", "Kaeya", "Furina", "Dehya",
    "Cyno", "Collei", "Beidou", "Alhaitham", "Arataki_Itto",
]

# Load the single multi-speaker model
print(f"Loading model: {MODEL_NAME}")
try:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Failed to load model: {e}")
    model = None
    tokenizer = None

# LFM2 special-token configuration (control tokens appended after the text vocabulary)
TOKENIZER_LENGTH = 64400
START_OF_TEXT = 1
END_OF_TEXT = 7
START_OF_SPEECH = TOKENIZER_LENGTH + 1
END_OF_SPEECH = TOKENIZER_LENGTH + 2
START_OF_HUMAN = TOKENIZER_LENGTH + 3
END_OF_HUMAN = TOKENIZER_LENGTH + 4
START_OF_AI = TOKENIZER_LENGTH + 5
END_OF_AI = TOKENIZER_LENGTH + 6
PAD_TOKEN = TOKENIZER_LENGTH + 7
AUDIO_TOKENS_START = TOKENIZER_LENGTH + 10


# Process a text prompt for LFM2 with the chosen voice
def process_prompt(prompt, voice, tokenizer, device):
    # Voice-aware prompt formatting
    if voice:
        formatted_prompt = f"[VOICE:{voice}] {prompt}"
    else:
        formatted_prompt = prompt

    input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids
    start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
    end_tokens = torch.tensor([[END_OF_TEXT, END_OF_HUMAN]], dtype=torch.int64)
    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)

    # No padding needed for a single input
    attention_mask = torch.ones_like(modified_input_ids)
    return modified_input_ids.to(device), attention_mask.to(device)


# Parse generated tokens into SNAC audio codes
def parse_output(generated_ids):
    token_to_find = START_OF_SPEECH
    token_to_remove = END_OF_SPEECH

    # Keep only the tokens after the last START_OF_SPEECH marker
    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_occurrence_idx = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped_tensor = generated_ids

    processed_rows = []
    for row in cropped_tensor:
        masked_row = row[row != token_to_remove]
        processed_rows.append(masked_row)

    code_lists = []
    for row in processed_rows:
        # SNAC codes come in frames of 7 tokens; trim to a whole number of frames
        row_length = row.size(0)
        new_length = (row_length // 7) * 7
        trimmed_row = row[:new_length]
        trimmed_row = [t - AUDIO_TOKENS_START for t in trimmed_row]
        code_lists.append(trimmed_row)

    return code_lists[0]  # Return just the first one for a single sample


# Redistribute the flat code list across the three SNAC codebook layers
def redistribute_codes(code_list, snac_model):
    device = next(snac_model.parameters()).device  # Get the device of the SNAC model

    # Each 7-token frame carries 1 coarse, 2 medium, and 4 fine codes, and each
    # slot is offset by a multiple of the 4096-entry codebook size. parse_output
    # already trims to a multiple of 7, so the +1 below does not change the count.
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range((len(code_list) + 1) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    # Move the tensors to the same device as the SNAC model
    codes = [
        torch.tensor(layer_1, device=device).unsqueeze(0),
        torch.tensor(layer_2, device=device).unsqueeze(0),
        torch.tensor(layer_3, device=device).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat.detach().squeeze().cpu().numpy()  # Always return a CPU numpy array


# Main generation function
@spaces.GPU()
def generate_speech(text, voice_choice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    if not text.strip():
        return None
    if model is None or tokenizer is None:
        return None

    try:
        progress(0.1, f"🔄 Processing text with {voice_choice} voice...")
        # Build the prompt with the selected voice
        input_ids, attention_mask = process_prompt(text, voice_choice, tokenizer, device)

        progress(0.3, "🎵 Generating speech tokens...")
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                num_return_sequences=1,
                eos_token_id=END_OF_SPEECH,
            )

        progress(0.6, "🔧 Processing speech tokens...")
        code_list = parse_output(generated_ids)

        progress(0.8, "🎧 Converting to audio...")
        audio_samples = redistribute_codes(code_list, snac_model)

        progress(1.0, f"✅ Completed with {voice_choice}!")
        return (24000, audio_samples)  # SNAC operates at 24 kHz
    except Exception as e:
        print(f"Error generating speech with {voice_choice}: {e}")
        return None


# Example texts for the Genshin characters
EXAMPLE_TEXTS = [
    "Hello! I am ready to help you on your adventure in Teyvat.",
    "The wind brings new adventures and ancient secrets.",
    "Let me share the wisdom of the elements with you.",
    "Together we can explore the mysteries of this world.",
    "Every journey begins with a single step forward.",
    "The stars above guide us through the darkest nights.",
]

# Create the Gradio interface
with gr.Blocks(
    title="🎮 Genshin Voice TTS",
    theme=gr.themes.Soft(),
    css="""
    .gradio-textbox textarea {
        background-color: #6b7280 !important;
        color: white !important;
    }
    .gradio-audio {
        background-color: #6b7280 !important;
    }
    .character-category {
        margin: 10px 0;
        padding: 10px;
        border-radius: 8px;
        background-color: #f0f0f0;
    }
    """,
) as demo:
    # Header section
    gr.Markdown("""
    # 🎮 VyvoTTS-LFM2
    ### 🎭 Character Voices | 🔗 [Github](https://github.com/Vyvo-Labs/VyvoTTS) | 🤗 [HF Model](https://huggingface.co/collections/Vyvo/lfm2-tts-689eedae5353ff5b048efd55)
    """)

    gr.Markdown("""
    VyvoTTS is a text-to-speech model by the Vyvo team built on the LFM2 architecture and trained on multiple diverse open-source datasets.
    Since some datasets may contain transcription errors or quality issues, output quality can vary; higher-quality datasets typically produce better speech synthesis results.

    **Roadmap:**
    - [ ] Transformers.js support
    - [ ] Pretrained model release
    - [ ] vLLM support
    - [x] Training and inference code release
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Text input section
            text_input = gr.Textbox(
                label="📝 Text Input",
                placeholder="Enter the text you want your chosen character to say...",
                lines=6,
                max_lines=10,
            )

            # Voice selection
            voice_choice = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="Tighnari",
                label="🎤 Character Voice",
                info=f"Choose from {len(AVAILABLE_VOICES)} available character voices",
            )

            # Advanced settings
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="🌡️ Temperature",
                    info="Higher values create more expressive but less stable speech",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="🎯 Top P",
                    info="Nucleus sampling threshold value",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="🔄 Repetition Penalty",
                    info="Higher values discourage repetitive patterns",
                )
                max_new_tokens = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="📏 Maximum Length",
                    info="Maximum length of the generated audio (in tokens)",
                )

            # Action buttons
            with gr.Row():
                submit_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", size="lg")

        with gr.Column(scale=1):
            # Output section
            audio_output = gr.Audio(
                label="🎧 Generated Audio",
                type="numpy",
                interactive=False,
            )

    # Example texts, two per column
    gr.Markdown("### 📚 Example Texts")
    with gr.Row():
        for i in range(0, len(EXAMPLE_TEXTS), 2):
            with gr.Column():
                if i < len(EXAMPLE_TEXTS):
                    example_btn = gr.Button(
                        EXAMPLE_TEXTS[i][:50] + "..." if len(EXAMPLE_TEXTS[i]) > 50 else EXAMPLE_TEXTS[i],
                        size="sm",
                    )
                    # Bind the text as a default argument so each button keeps its own value
                    example_btn.click(fn=lambda text=EXAMPLE_TEXTS[i]: text, outputs=text_input)
                if i + 1 < len(EXAMPLE_TEXTS):
                    example_btn2 = gr.Button(
                        EXAMPLE_TEXTS[i + 1][:50] + "..." if len(EXAMPLE_TEXTS[i + 1]) > 50 else EXAMPLE_TEXTS[i + 1],
                        size="sm",
                    )
                    example_btn2.click(fn=lambda text=EXAMPLE_TEXTS[i + 1]: text, outputs=text_input)

    # Set up event handlers
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_choice, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output,
        show_progress=True,
    )

    def clear_interface():
        return "", None

    clear_btn.click(
        fn=clear_interface,
        inputs=[],
        outputs=[text_input, audio_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)