# Module-level setup for the voice-cloning / denoising Gradio app.
# Statement order matters here: the pathlib aliasing and the environment
# variable are applied before the third-party imports that follow them.
import argparse
import logging
import os
import pathlib
import time
import tempfile
import platform

# Alias the non-native pathlib flavour to the native one for this platform.
# NOTE(review): presumably so that objects pickled on the other OS (e.g. a
# checkpoint containing Path instances) can be loaded here -- confirm.
# `temp` keeps a reference to the replaced class; it is not used again in this
# part of the file.
if platform.system().lower() == 'windows':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
elif platform.system().lower() == 'linux':
    temp = pathlib.WindowsPath
    pathlib.WindowsPath = pathlib.PosixPath
# Select the pure-Python protobuf implementation; set before the imports below.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import langid
# Restrict language identification to English, Chinese and Japanese.
langid.set_languages(['en', 'zh', 'ja'])

import torch
import torchaudio
import random

import numpy as np

from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)
from data.collation import get_text_token_collater
from models.vallex import VALLE
from utils.g2p import PhonemeBpeTokenizer
from descriptions import *
from macros import *

import gradio as gr
import whisper
import multiprocessing

import math
import tempfile
from typing import Optional, Tuple, Union

import matplotlib.pyplot as plt
from loguru import logger
from PIL import Image
from torch import Tensor
from torchaudio.backend.common import AudioMetaData

from df import config
from df.enhance import enhance, init_df, load_audio, save_audio
from df.io import resample

# Use every CPU core reported by the OS for both intra-op and inter-op
# parallelism in torch.
thread_count = multiprocessing.cpu_count()
print("Use",thread_count,"cpu cores for computing")

torch.set_num_threads(thread_count)
torch.set_num_interop_threads(thread_count)
# Switch off the TorchScript profiling executor / profiling mode and the graph
# executor optimisation (private torch._C switches).
torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_profiling_mode(False)
torch._C._set_graph_executor_optimize(False)

# Text front-end: phoneme BPE tokenizer and the collater that batches its output.
text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
text_collater = get_text_token_collater()

# Prefer the first CUDA device when available, otherwise run on CPU.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda", 0)

# Denoise
# init_df returns three values; the first two (model and `df`) are kept and
# later passed to `enhance`, the third is discarded.
model1, df, _ = init_df("./DeepFilterNet2", config_allow_defaults=True)
model1 = model1.to(device=device).eval()

# Two module-level figures (noisy / enhanced spectrogram) created once here.
fig_noisy: plt.Figure
fig_enh: plt.Figure
ax_noisy: plt.Axes
ax_enh: plt.Axes
fig_noisy, ax_noisy = plt.subplots(figsize=(15.2, 4))
fig_noisy.set_tight_layout(True)
fig_enh, ax_enh = plt.subplots(figsize=(15.2, 4))
fig_enh.set_tight_layout(True)

# Selectable background noises: display name -> noise file path. A value of
# None means no noise is mixed in (see demo_fn).
NOISES = {
    "None": None,
}


def mix_at_snr(clean, noise, snr, eps=1e-10):
    """Mix clean and noise signal at a given SNR.

    Both inputs are averaged over dim 0 (keeping the dim) and then indexed
    along dim 1, so they are expected to be 2D. Noise that is shorter than
    the clean signal is repeated; a random segment of matching length is
    then cut out of it.

    Args:
        clean: Tensor (or array-like) with the clean signal to mix.
        noise: Tensor (or array-like) with the noise signal.
        snr: Signal to noise ratio in dB.
        eps: Small constant guarding against division by zero.

    Returns:
        clean: Tensor with gain changed according to the snr.
        noise: Tensor with the combined noise channels.
        mix: Tensor with added clean and noise signals.

    If the mixture exceeds an absolute value of 1, all three tensors are
    scaled down by the same factor.
    """
    clean = torch.as_tensor(clean).mean(0, keepdim=True)
    noise = torch.as_tensor(noise).mean(0, keepdim=True)
    if noise.shape[1] < clean.shape[1]:
        noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
    max_start = int(noise.shape[1] - clean.shape[1])
    start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
    logger.debug(f"start: {start}, {clean.shape}")
    noise = noise[:, start : start + clean.shape[1]]
    E_speech = torch.mean(clean.pow(2)) + eps
    E_noise = torch.mean(noise.pow(2))
    K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
    noise = noise / K
    mixture = clean + noise
    # The f-prefix was missing, so the placeholder was logged literally.
    logger.debug(f"mixture: {mixture.shape}")
    assert torch.isfinite(mixture).all()
    max_m = mixture.abs().max()
    if max_m > 1:
        logger.warning(f"Clipping detected during mixing. Reducing gain by {1/max_m}")
        clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
    return clean, noise, mixture


def load_audio_gradio(
    audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
) -> Optional[Tuple[Tensor, AudioMetaData]]:
    """Load audio coming from a Gradio component and resample it to ``sr``.

    Args:
        audio_or_file: ``None``, a file path (the string "none" in any case
            is treated like ``None``), or a ``(sample_rate, samples)`` pair.
        sr: Target sampling rate.

    Returns:
        ``None`` when no audio was given, otherwise ``(audio, meta)``. For
        the tuple input only ``meta.sample_rate`` is filled in; the other
        metadata fields are placeholders.
    """
    if audio_or_file is None:
        return None
    if isinstance(audio_or_file, str):
        if audio_or_file.lower() == "none":
            return None
        # First try default format
        audio, meta = load_audio(audio_or_file, sr)
    else:
        meta = AudioMetaData(-1, -1, -1, -1, "")
        assert isinstance(audio_or_file, (tuple, list))
        meta.sample_rate, audio_np = audio_or_file
        # Gradio documentation says, the shape is [samples, 2], but apparently sometimes its not.
        audio_np = audio_np.reshape(audio_np.shape[0], -1).T
        # Integer PCM is scaled by its full-scale value before resampling.
        if audio_np.dtype == np.int16:
            audio_np = (audio_np / (1 << 15)).astype(np.float32)
        elif audio_np.dtype == np.int32:
            audio_np = (audio_np / (1 << 31)).astype(np.float32)
        audio = resample(torch.from_numpy(audio_np), meta.sample_rate, sr)
    return audio, meta


def _temp_wav_path(suffix):
    """Create an empty, persistent temporary file and return its path.

    The handle is closed before returning so that the file can be reopened
    for writing and no descriptor is leaked.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: str):
    """Denoise an audio file and render spectrograms of input and output.

    Args:
        speech_upl: Path of the audio file to enhance; when ``None`` the
            bundled sample "samples/p232_013_clean.wav" is used.
        noise_type: Key into ``NOISES`` selecting noise to mix in first.
        snr: Signal to noise ratio for the optional noise mixing (converted
            with ``int``).
        mic_input: Path of a microphone recording; takes precedence over
            ``speech_upl`` when truthy.

    Returns:
        ``(noisy_wav, noisy_im, enhanced_wav, enh_im)``: paths of the saved
        noisy and enhanced wav files and their spectrogram images.
    """
    if mic_input:
        speech_upl = mic_input
    sr = config("sr", 48000, int, section="df")
    logger.info(f"Got parameters speech_upl: {speech_upl}, noise: {noise_type}, snr: {snr}")
    snr = int(snr)
    noise_fn = NOISES[noise_type]
    max_s = 10  # limit to 10 seconds
    if speech_upl is not None:
        sample, meta = load_audio(speech_upl, sr)
        max_len = max_s * sr
        if sample.shape[-1] > max_len:
            # Keep a random window of max_s seconds.
            start = torch.randint(0, sample.shape[-1] - max_len, ()).item()
            sample = sample[..., start : start + max_len]
    else:
        sample, meta = load_audio("samples/p232_013_clean.wav", sr)
        sample = sample[..., : max_s * sr]
    if sample.dim() > 1 and sample.shape[0] > 1:
        assert (
            sample.shape[1] > sample.shape[0]
        ), f"Expecting channels first, but got {sample.shape}"
        sample = sample.mean(dim=0, keepdim=True)
    logger.info(f"Loaded sample with shape {sample.shape}")
    if noise_fn is not None:
        noise, _ = load_audio(noise_fn, sr)  # type: ignore
        logger.info(f"Loaded noise with shape {noise.shape}")
        _, _, sample = mix_at_snr(sample, noise, snr)
    logger.info("Start denoising audio")
    enhanced = enhance(model1, df, sample)
    logger.info("Denoising finished")
    # Linear fade-in over the first 0.15 s. The ramp is capped at the clip
    # length so very short clips no longer request a negative-size tensor.
    fade_len = min(int(sr * 0.15), enhanced.shape[1])
    lim = torch.linspace(0.0, 1.0, fade_len).unsqueeze(0)
    lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - fade_len)), dim=1)
    enhanced = enhanced * lim
    if meta.sample_rate != sr:
        # Go back to the sampling rate of the loaded file.
        enhanced = resample(enhanced, sr, meta.sample_rate)
        sample = resample(sample, sr, meta.sample_rate)
        sr = meta.sample_rate
    noisy_wav = _temp_wav_path("noisy.wav")
    save_audio(noisy_wav, sample, sr)
    enhanced_wav = _temp_wav_path("enhanced.wav")
    save_audio(enhanced_wav, enhanced, sr)
    logger.info(f"saved audios: {noisy_wav}, {enhanced_wav}")
    # The module-level axes are reused between calls, so clear them first.
    ax_noisy.clear()
    ax_enh.clear()
    noisy_im = spec_im(sample, sr=sr, figure=fig_noisy, ax=ax_noisy)
    enh_im = spec_im(enhanced, sr=sr, figure=fig_enh, ax=ax_enh)
    return noisy_wav, noisy_im, enhanced_wav, enh_im


def specshow(
    spec,
    ax=None,
    title=None,
    xlabel=None,
    ylabel=None,
    sr=48000,
    n_fft=None,
    hop=None,
    t=None,
    f=None,
    vmin=-100,
    vmax=0,
    xlim=None,
    ylim=None,
    cmap="inferno",
):
    """Plots a spectrogram of shape [F, T]

    Args:
        spec: Tensor or array with the values to plot.
        ax: Axes to draw on; the ``pyplot`` module is used when ``None``.
        title, xlabel, ylabel, xlim, ylim: Applied only when not ``None``.
        sr: Sampling rate used to derive the time and frequency axes.
        n_fft: FFT size; derived from the number of rows when ``None``.
        hop: Hop size; defaults to ``n_fft // 4``.
        t, f: Explicit time / frequency axes; computed when ``None``.
        vmin, vmax, cmap: Passed to ``pcolormesh``.

    Returns:
        The object returned by ``pcolormesh``.
    """
    spec_np = spec.cpu().numpy() if isinstance(spec, torch.Tensor) else spec
    if ax is not None:
        set_title = ax.set_title
        set_xlabel = ax.set_xlabel
        set_ylabel = ax.set_ylabel
        set_xlim = ax.set_xlim
        set_ylim = ax.set_ylim
    else:
        ax = plt
        set_title = plt.title
        set_xlabel = plt.xlabel
        set_ylabel = plt.ylabel
        set_xlim = plt.xlim
        set_ylim = plt.ylim
    if n_fft is None:
        if spec_np.shape[0] % 2 == 0:
            n_fft = spec_np.shape[0] * 2
        else:
            n_fft = (spec_np.shape[0] - 1) * 2
    hop = hop or n_fft // 4
    if t is None:
        t = np.arange(0, spec_np.shape[-1]) * hop / sr
    if f is None:
        # The trailing division by 1000 scales the axis values down by 1e3.
        f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000
    im = ax.pcolormesh(
        t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap
    )
    if title is not None:
        set_title(title)
    if xlabel is not None:
        set_xlabel(xlabel)
    if ylabel is not None:
        set_ylabel(ylabel)
    if xlim is not None:
        set_xlim(xlim)
    if ylim is not None:
        set_ylim(ylim)
    return im


def spec_im(
    audio: torch.Tensor,
    figsize=(15, 5),
    colorbar=False,
    colorbar_format=None,
    figure=None,
    labels=True,
    **kwargs,
) -> Image.Image:
    """Render the log-magnitude spectrogram of ``audio`` into a PIL image.

    Args:
        audio: Audio samples (converted with ``torch.as_tensor``).
        figsize: Size of the figure created when ``figure`` is ``None``.
        colorbar: Whether to add a colorbar.
        colorbar_format: Colorbar tick format; "%+2.0f dB" is used when an
            ``ax`` is given and ``vmin`` or ``vmax`` is set.
        figure: Existing figure whose canvas is drawn and exported.
        labels: Whether to add default axis labels.
        **kwargs: Forwarded to ``specshow``; ``n_fft`` defaults to 1024 and
            ``hop`` to 512.

    Returns:
        RGB image of the drawn figure canvas.
    """
    audio = torch.as_tensor(audio)
    if labels:
        kwargs.setdefault("xlabel", "Time [s]")
        kwargs.setdefault("ylabel", "Frequency [Hz]")
    n_fft = kwargs.setdefault("n_fft", 1024)
    hop = kwargs.setdefault("hop", 512)
    w = torch.hann_window(n_fft, device=audio.device)
    # Request the complex result directly: return_complex=False is deprecated
    # in torch.stft. Scaling the complex tensor equals scaling its real view.
    spec = torch.stft(audio, n_fft, hop, window=w, return_complex=True)
    spec = spec / w.pow(2).sum()
    spec = spec.abs().clamp_min(1e-12).log10().mul(10)
    kwargs.setdefault("vmax", max(0.0, spec.max().item()))
    if figure is None:
        figure = plt.figure(figsize=figsize)
        figure.set_tight_layout(True)
    if spec.dim() > 2:
        spec = spec.squeeze(0)
    im = specshow(spec, **kwargs)
    if colorbar:
        ckwargs = {}
        if "ax" in kwargs:
            if colorbar_format is None:
                if kwargs.get("vmin", None) is not None or kwargs.get("vmax", None) is not None:
                    colorbar_format = "%+2.0f dB"
            ckwargs = {"ax": kwargs["ax"]}
        plt.colorbar(im, format=colorbar_format, **ckwargs)
    figure.canvas.draw()
    return Image.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())


def toggle(choice):
    """Return a pair of Gradio updates showing one of two inputs.

    For ``choice == "mic"`` the first component becomes visible and the
    second hidden; otherwise the other way round. Both values are reset.
    """
    if choice == "mic":
        return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
    else:
        return gr.update(visible=False, value=None), gr.update(visible=True, value=None)


# VALL-E-X model
model = VALLE(
    N_DIM,
    NUM_HEAD,
    NUM_LAYERS,
    norm_first=True,
    add_prenet=False,
    prefix_mode=PREFIX_MODE,
    share_embedding=True,
    nar_scale_factor=1.0,
    prepend_bos=True,
    num_quantizers=NUM_QUANTIZERS,
)
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
missing_keys, unexpected_keys = model.load_state_dict(
    checkpoint["model"], strict=True
)
assert not missing_keys
model.eval()

# Encodec model
audio_tokenizer = AudioTokenizer(device)

# ASR
whisper_model = whisper.load_model("medium").cpu()

# Voice Presets
# next() with a default keeps start-up from failing with StopIteration when
# the presets directory does not exist.
preset_list = next(os.walk("./presets/"), (None, None, []))[2]
preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]


def clear_prompts():
    """Best-effort removal of stale ``.npz`` files from the temp directory.

    Every regular file ending in ".npz" directly inside
    ``tempfile.gettempdir()`` whose modification time is more than 60
    seconds old is deleted. Note that this matches any such file there, not
    only the ones written by this application. Filesystem errors are logged
    and skipped so that one failing file does not stop the cleanup.
    """
    tmp_dir = tempfile.gettempdir()
    cutoff = time.time() - 60
    try:
        entries = os.listdir(tmp_dir)
    except OSError as err:
        logging.warning("Could not list %s: %s", tmp_dir, err)
        return
    for entry in entries:
        filename = os.path.join(tmp_dir, entry)
        if not (filename.endswith(".npz") and os.path.isfile(filename)):
            continue
        try:
            if os.stat(filename).st_mtime < cutoff:
                os.remove(filename)
        except OSError as err:
            # The file may have been removed concurrently or be unreadable.
            logging.warning("Could not remove %s: %s", filename, err)


def transcribe_one(model, audio_path):
    """Detect the language of an audio file and transcribe it with whisper.

    Args:
        model: Whisper model used for language detection and decoding.
        audio_path: Path of the audio file.

    Returns:
        ``(lang, text)``: the most probable language key and the recognized
        text. A "." is appended when the text is empty or does not already
        end in one of the accepted punctuation marks.
    """
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # decode the audio
    options = whisper.DecodingOptions(
        temperature=1.0,
        best_of=5,
        fp16=device != torch.device("cpu"),
        sample_len=150,
    )
    result = whisper.decode(model, mel, options)

    # print the recognized text
    print(result.text)

    text_pr = result.text
    # An empty transcription used to raise IndexError on the [-1] lookup.
    stripped = text_pr.strip(" ")
    if not stripped or stripped[-1] not in "?!.,。,?!。、":
        text_pr += "."
    return lang, text_pr


def _prepare_prompt_wav(wav_pr):
    """Turn raw prompt samples into a float tensor with a leading dim of 1.

    The samples are converted to ``torch.FloatTensor``, divided by their peak
    when it exceeds 1, reduced to the first column when the last dimension
    has size 2, and given a leading dimension when 1D.
    """
    if not isinstance(wav_pr, torch.FloatTensor):
        wav_pr = torch.FloatTensor(wav_pr)
    if wav_pr.abs().max() > 1:
        wav_pr /= wav_pr.abs().max()
    if wav_pr.size(-1) == 2:
        wav_pr = wav_pr[:, 0]
    if wav_pr.ndim == 1:
        wav_pr = wav_pr.unsqueeze(0)
    assert wav_pr.ndim and wav_pr.size(0) == 1
    return wav_pr


def _prompt_text_and_lang(name, wav_pr, sr, transcript_content):
    """Return ``(text, lang)`` for a prompt, transcribing it when needed.

    An empty transcript triggers whisper transcription through
    ``make_prompt``; otherwise the language is classified from the given
    transcript, which is then wrapped in the matching language tokens.
    """
    if transcript_content == "":
        return make_prompt(name, wav_pr, sr, save=False)
    lang_pr = langid.classify(str(transcript_content))[0]
    lang_token = lang2token[lang_pr]
    return f"{lang_token}{str(transcript_content)}{lang_token}", lang_pr


def _load_npz_prompt(path):
    """Load a saved prompt archive.

    Returns:
        ``(audio_prompts, text_prompts, lang_pr)``: int32 tensors built from
        the "audio_tokens" (moved to ``device``) and "text_tokens" entries,
        and the language looked up from "lang_code". The archive is closed
        after reading.
    """
    with np.load(path) as prompt_data:
        audio_tokens = prompt_data['audio_tokens']
        text_tokens = prompt_data['text_tokens']
        lang_pr = code2lang[int(prompt_data['lang_code'])]
    # numpy to tensor
    audio_prompts = torch.tensor(audio_tokens).type(torch.int32).to(device)
    text_prompts = torch.tensor(text_tokens).type(torch.int32)
    return audio_prompts, text_prompts, lang_pr


def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
    """Extract a voice prompt from audio and save it as an ``.npz`` file.

    Args:
        name: Base name of the file written to the temp directory.
        uploaded_audio: ``(sample_rate, samples)`` of an uploaded file, or
            ``None``.
        recorded_audio: Same for a recording; used when no upload is given.
        transcript_content: Transcript of the audio; an empty string means
            the audio is transcribed with whisper.

    Returns:
        ``(message, path)``; ``path`` is ``None`` when the request was
        rejected (no audio, or audio longer than 15 seconds).
    """
    clear_prompts()
    audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
    if audio_prompt is None:
        # Previously this failed with a TypeError on unpacking.
        return "Rejected, no audio prompt provided", None
    sr, wav_pr = audio_prompt
    if len(wav_pr) / sr > 15:
        return "Rejected, Audio too long (should be less than 15 seconds)", None
    wav_pr = _prepare_prompt_wav(wav_pr)
    text_pr, lang_pr = _prompt_text_and_lang(name, wav_pr, sr, transcript_content)

    # tokenize audio
    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # tokenize text
    phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
    text_tokens, _ = text_collater(
        [
            phonemes
        ]
    )

    # save as npz file
    npz_path = os.path.join(tempfile.gettempdir(), f"{name}.npz")
    np.savez(npz_path,
             audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])
    return "提取音色成功!", npz_path


def make_prompt(name, wav, sr, save=True):
    """Transcribe prompt audio with whisper.

    The audio is written to "./prompts/{name}.wav" for transcription. With
    ``save=True`` the wav is kept and the transcript is written next to it
    as "./prompts/{name}.txt"; with ``save=False`` the wav is removed again
    and no transcript file is left behind.

    Args:
        name: Base name of the files under "./prompts".
        wav: Audio samples.
        sr: Sampling rate of ``wav``.
        save: Whether to keep the wav and transcript files.

    Returns:
        ``(text, lang)``: the transcript wrapped in language tokens and the
        detected language.
    """
    if not isinstance(wav, torch.FloatTensor):
        wav = torch.tensor(wav)
    if wav.abs().max() > 1:
        wav /= wav.abs().max()
    if wav.size(-1) == 2:
        wav = wav.mean(-1, keepdim=False)
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    assert wav.ndim and wav.size(0) == 1

    # The directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("./prompts", exist_ok=True)
    wav_path = f"./prompts/{name}.wav"
    txt_path = f"./prompts/{name}.txt"
    torchaudio.save(wav_path, wav, sr)
    whisper_model.to(device)
    try:
        lang, text = transcribe_one(whisper_model, wav_path)
    finally:
        # Always release the device and clean up, even if transcription fails.
        whisper_model.cpu()
        torch.cuda.empty_cache()
        if not save and os.path.exists(wav_path):
            os.remove(wav_path)
    lang_token = lang2token[lang]
    text = lang_token + text + lang_token
    if save:
        # Explicit encoding: the transcript may contain Chinese or Japanese.
        with open(txt_path, 'w', encoding="utf-8") as f:
            f.write(text)
    return text, lang


@torch.no_grad()
def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt, transcript_content):
    """Synthesize ``text`` in the voice of a given audio prompt.

    Args:
        text: Text to synthesize (at most 150 characters).
        language: 'auto-detect' or a key of ``langdropdown2token``.
        accent: "no-accent" or a key of ``langdropdown2token``.
        audio_prompt: ``(sample_rate, samples)`` of an uploaded file, or
            ``None``.
        record_audio_prompt: Same for a recording; used when no upload is
            given.
        transcript_content: Transcript of the prompt; an empty string means
            the prompt is transcribed with whisper.

    Returns:
        ``(message, (24000, samples))``, or ``(message, None)`` when the
        request is rejected.
    """
    if len(text) > 150:
        return "Rejected, Text too long (should be less than 150 characters)", None
    audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
    if audio_prompt is None:
        # Previously this failed with a TypeError on unpacking.
        return "Rejected, no audio prompt provided", None
    sr, wav_pr = audio_prompt
    if len(wav_pr) / sr > 15:
        return "Rejected, Audio too long (should be less than 15 seconds)", None
    wav_pr = _prepare_prompt_wav(wav_pr)
    text_pr, lang_pr = _prompt_text_and_lang('dummy', wav_pr, sr, transcript_content)

    if language == 'auto-detect':
        lang_token = lang2token[langid.classify(text)[0]]
    else:
        lang_token = langdropdown2token[language]
    lang = token2lang[lang_token]
    text = lang_token + text + lang_token

    # onload model (only after validation, so a rejected request does not
    # leave the model on the device)
    model.to(device)

    # tokenize audio
    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
    audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)

    # tokenize text
    logging.info(f"synthesize text: {text}")
    phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
    text_tokens, text_tokens_lens = text_collater(
        [
            phone_tokens
        ]
    )

    enroll_x_lens = None
    if text_pr:
        text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
        text_prompts, enroll_x_lens = text_collater(
            [
                text_prompts
            ]
        )
        # Prepend the prompt text; only possible when a prompt text exists.
        text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
        text_tokens_lens += enroll_x_lens
    lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
    encoded_frames = model.inference(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        audio_prompts,
        enroll_x_lens=enroll_x_lens,
        top_k=-100,
        temperature=1,
        prompt_language=lang_pr,
        text_language=langs if accent == "no-accent" else lang,
    )
    samples = audio_tokenizer.decode(
        [(encoded_frames.transpose(2, 1), None)]
    )

    # offload model
    model.to('cpu')
    torch.cuda.empty_cache()

    message = f"text prompt: {text_pr}\nsythesized text: {text}"
    return message, (24000, samples[0][0].cpu().numpy())


@torch.no_grad()
def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
    """Synthesize ``text`` using a saved ``.npz`` prompt or a preset.

    Args:
        text: Text to synthesize (at most 150 characters).
        language: 'auto-detect' or a key of ``langdropdown2token``.
        accent: "no-accent" or a key of ``langdropdown2token``.
        preset_prompt: Name of a preset under "./presets/" (without the
            ".npz" extension); used when ``prompt_file`` is ``None``.
        prompt_file: Object whose ``name`` attribute is the path of a prompt
            archive, or ``None``.

    Returns:
        ``(message, (24000, samples))``, or ``(message, None)`` when the
        text is too long.
    """
    if len(text) > 150:
        return "Rejected, Text too long (should be less than 150 characters)", None
    clear_prompts()
    model.to(device)

    # text to synthesize
    if language == 'auto-detect':
        lang_token = lang2token[langid.classify(text)[0]]
    else:
        lang_token = langdropdown2token[language]
    lang = token2lang[lang_token]
    text = lang_token + text + lang_token

    # load prompt
    if prompt_file is not None:
        prompt_path = prompt_file.name
    else:
        prompt_path = os.path.join("./presets/", f"{preset_prompt}.npz")
    audio_prompts, text_prompts, lang_pr = _load_npz_prompt(prompt_path)

    enroll_x_lens = text_prompts.shape[-1]
    logging.info(f"synthesize text: {text}")
    phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
    text_tokens, text_tokens_lens = text_collater(
        [
            phone_tokens
        ]
    )
    text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
    text_tokens_lens += enroll_x_lens

    # accent control
    lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
    encoded_frames = model.inference(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        audio_prompts,
        enroll_x_lens=enroll_x_lens,
        top_k=-100,
        temperature=1,
        prompt_language=lang_pr,
        text_language=langs if accent == "no-accent" else lang,
    )
    samples = audio_tokenizer.decode(
        [(encoded_frames.transpose(2, 1), None)]
    )
    model.to('cpu')
    torch.cuda.empty_cache()

    message = f"sythesized text: {text}"
    return message, (24000, samples[0][0].cpu().numpy())


from utils.sentence_cutter import split_text_into_sentences


@torch.no_grad()
def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
    """
    For long audio generation, two modes are available.
    fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
    sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance.

    The mode is chosen automatically: fixed-prompt when a prompt file or a
    preset is given, sliding-window otherwise.

    Args:
        text: Text to synthesize (at most 1000 characters).
        preset_prompt: Name of a preset under "./presets/"; ``None`` and ""
            both mean "no preset".
        prompt: Object whose ``name`` attribute is the path of a prompt
            archive; ``None`` and "" both mean "no prompt file".
        language: 'auto' or 'auto-detect' to classify the text, otherwise a
            key of ``langdropdown2token``.
        accent: "no-accent" or a key of ``langdropdown2token``.

    Returns:
        ``(message, (24000, samples))``, or ``(message, None)`` when the
        text is too long.
    """
    if len(text) > 1000:
        return "Rejected, Text too long (should be less than 1000 characters)", None
    has_prompt_file = prompt is not None and prompt != ""
    has_preset = preset_prompt is not None and preset_prompt != ""
    # If no prompt is given, use sliding-window mode. A preset of None is
    # treated like "", matching the prompt loading below; it used to select
    # fixed-prompt mode with an empty prompt.
    mode = 'fixed-prompt' if (has_prompt_file or has_preset) else 'sliding-window'
    model.to(device)
    sentences = split_text_into_sentences(text)

    # detect language ('auto' is the signature default and used to be
    # rejected by the dropdown lookup)
    if language in ("auto", "auto-detect"):
        language = langid.classify(text)[0]
    else:
        language = token2lang[langdropdown2token[language]]

    # if initial prompt is given, encode it
    if has_prompt_file:
        audio_prompts, text_prompts, lang_pr = _load_npz_prompt(prompt.name)
    elif has_preset:
        audio_prompts, text_prompts, lang_pr = _load_npz_prompt(
            os.path.join("./presets/", f"{preset_prompt}.npz")
        )
    else:
        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
        text_prompts = torch.zeros([1, 0]).type(torch.int32)
        lang_pr = language if language != 'mix' else 'en'

    complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
    original_audio_prompts = audio_prompts
    original_text_prompts = text_prompts
    for sentence in sentences:
        sentence = sentence.replace("\n", "").strip(" ")
        if sentence == "":
            continue
        lang_token = lang2token[language]
        lang = token2lang[lang_token]
        sentence = lang_token + sentence + lang_token

        enroll_x_lens = text_prompts.shape[-1]
        logging.info(f"synthesize text: {sentence}")
        phone_tokens, langs = text_tokenizer.tokenize(text=f"_{sentence}".strip())
        text_tokens, text_tokens_lens = text_collater(
            [
                phone_tokens
            ]
        )
        text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
        text_tokens_lens += enroll_x_lens

        # accent control
        lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
        encoded_frames = model.inference(
            text_tokens.to(device),
            text_tokens_lens.to(device),
            audio_prompts,
            enroll_x_lens=enroll_x_lens,
            top_k=-100,
            temperature=1,
            prompt_language=lang_pr,
            text_language=langs if accent == "no-accent" else lang,
        )
        complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
        if mode == 'sliding-window':
            # torch.rand draws from [0, 1), so with a threshold of 1.0 the
            # first branch is always taken; the draw itself is kept so the
            # random state advances exactly as before.
            if torch.rand(1) < 1.0:
                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
                text_prompts = text_tokens[:, enroll_x_lens:]
            else:
                audio_prompts = original_audio_prompts
                text_prompts = original_text_prompts

    samples = audio_tokenizer.decode(
        [(complete_tokens, None)]
    )
    model.to('cpu')
    message = f"Cut into {len(sentences)} sentences"
    return message, (24000, samples[0][0].cpu().numpy())


def main():
    """Build the Gradio interface and launch it.

    The first tab extracts a voice prompt (``make_npz_prompt``); the second
    tab synthesizes text with that prompt (``infer_long_text``) and can
    denoise the result (``demo_fn``).

    NOTE(review): this file had lost its line structure, so the nesting of
    the layout blocks below is a reconstruction -- confirm against the
    running UI. The HTML/Markdown strings contain bare newlines where wrapper
    markup may have been lost; their text is reproduced as found.
    """
    app = gr.Blocks()
    with app:
        gr.HTML("\n"
                "\n\n🌊💕🎶 VALL-E X 3秒声音克隆,支持中日英三语\n\n"
                "\n")
        gr.Markdown("##\n⚡ 只需3秒语音,快速复刻您喜欢的声音;Powered by [VALL-E-X](https://github.com/Plachtaa/VALL-E-X)\n")
        gr.Markdown("###\n更多精彩应用,尽在[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕\n")

        with gr.Tab("🎶 - 提取音色"):
            gr.Markdown("请上传一段3~10秒的语音,并点击”提取音色“")
            with gr.Row():
                with gr.Column():
                    textbox2 = gr.TextArea(label="Prompt name",
                                           placeholder="Name your prompt here",
                                           value="prompt_1", elem_id="prompt-name", visible=False)
                    # Place to select the language and enter the transcript
                    textbox_transcript2 = gr.TextArea(label="Transcript",
                                                      placeholder="Write transcript here. (leave empty to use whisper)",
                                                      value="", elem_id="prompt-name", visible=False)
                    upload_audio_prompt_2 = gr.Audio(label='请在此上传您的语音文件', source='upload', interactive=True)
                    record_audio_prompt_2 = gr.Audio(label='或者用麦克风上传您喜欢的声音', source='microphone', interactive=True)
                with gr.Column():
                    text_output_2 = gr.Textbox(label="音色提取进度")
                    prompt_output_2 = gr.File(interactive=False, visible=False)
                    btn_2 = gr.Button("提取音色", variant="primary")
                    btn_2.click(make_npz_prompt,
                                inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
                                outputs=[text_output_2, prompt_output_2])

        with gr.Tab("💕 - 声音克隆"):
            gr.Markdown("现在开始奇妙的声音克隆之旅吧!输入您想合成的文本后,点击”声音克隆“即可快速复刻喜欢的声音!")
            with gr.Row():
                with gr.Column():
                    textbox_4 = gr.TextArea(label="请输入您想合成的文本",
                                            placeholder="说点什么吧(中英皆可)...",
                                            elem_id="tts-input")
                    btn_4 = gr.Button("声音克隆", variant="primary")
                    btn_5 = gr.Button("去除噪音", variant="primary")

                    language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
                                                      label='language', visible=False)
                    accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
                                                    label='accent', visible=False)
                    preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='更多语音包', visible=False)
                    # The prompt extracted in the first tab feeds the synthesis.
                    prompt_file_4 = prompt_output_2
                with gr.Column():
                    text_output_4 = gr.TextArea(label="Message", visible=False)
                    audio_output_4 = gr.Audio(label="为您合成的专属语音", elem_id="tts-audio", type="filepath", interactive=False)

                    radio = gr.Radio(
                        ["mic", "file"], value="file", label="How would you like to upload your audio?", visible=False
                    )
                    mic_input = gr.Mic(label="Input", type="filepath", visible=False)
                    # The synthesized audio is what the denoiser works on.
                    audio_file = audio_output_4
                    inputs1 = [
                        audio_file,
                        gr.Dropdown(
                            label="Add background noise",
                            choices=list(NOISES.keys()),
                            value="None",
                            visible=False,
                        ),
                        gr.Dropdown(
                            label="Noise Level (SNR)",
                            choices=["-5", "0", "10", "20"],
                            value="0",
                            visible=False,
                        ),
                        mic_input,
                    ]
                    outputs1 = [
                        gr.Audio(type="filepath", label="Noisy audio", visible=False),
                        gr.Image(label="Noisy spectrogram", visible=False),
                        gr.Audio(type="filepath", label="降噪后的专属语音"),
                        gr.Image(label="Enhanced spectrogram", visible=False),
                    ]
                    btn_4.click(infer_long_text,
                                inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
                                outputs=[text_output_4, audio_output_4])
                    btn_5.click(fn=demo_fn, inputs=inputs1, outputs=outputs1)

        gr.Markdown("###\n注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。\n")
        gr.Markdown("\n🧸 - 如何使用此程序:在“提取音色”模块上传一段语音并提取音色之后,就可以在“声音克隆”模块一键克隆您喜欢的声音啦!\n")
        gr.HTML(''' ''')

    app.launch(show_error=True)


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()