VALLE

Runtime error

App Files Files Community

VALLE / app.py

kevinwang676

Update app.py

568023a over 1 year ago

raw

history blame contribute delete

31.9 kB

	import argparse
	import logging
	import os
	import pathlib
	import time
	import tempfile
	import platform
	# Alias the pathlib class of the *other* OS family to the native one, and
	# keep the replaced class reachable through ``temp``.
	if platform.system().lower() == 'windows':
	    temp, pathlib.PosixPath = pathlib.PosixPath, pathlib.WindowsPath
	elif platform.system().lower() == 'linux':
	    temp, pathlib.WindowsPath = pathlib.WindowsPath, pathlib.PosixPath

	# Select the pure-Python protobuf implementation.
	os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

	import langid
	langid.set_languages(['en', 'zh', 'ja'])

	import torch
	import torchaudio
	import random

	import numpy as np

	from data.tokenizer import (
	AudioTokenizer,
	tokenize_audio,
	)
	from data.collation import get_text_token_collater
	from models.vallex import VALLE
	from utils.g2p import PhonemeBpeTokenizer
	from descriptions import *
	from macros import *

	import gradio as gr
	import whisper
	import multiprocessing

	import math
	import tempfile
	from typing import Optional, Tuple, Union

	import matplotlib.pyplot as plt
	from loguru import logger
	from PIL import Image
	from torch import Tensor
	from torchaudio.backend.common import AudioMetaData

	from df import config
	from df.enhance import enhance, init_df, load_audio, save_audio
	from df.io import resample


	# Size both torch thread pools to the CPU count reported by the OS.
	thread_count = multiprocessing.cpu_count()
	print("Use", thread_count, "cpu cores for computing")
	torch.set_num_threads(thread_count)
	torch.set_num_interop_threads(thread_count)

	# Turn off the TorchScript profiling executor/mode and graph-executor optimisation.
	torch._C._jit_set_profiling_executor(False)
	torch._C._jit_set_profiling_mode(False)
	torch._C._set_graph_executor_optimize(False)

	# Text front end: phoneme/BPE tokenizer plus the collater that batches its output.
	text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
	text_collater = get_text_token_collater()

	# Use the first CUDA device when one is available, otherwise the CPU.
	device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")

	# Denoise
	# Load the DeepFilterNet2 model from ./DeepFilterNet2. ``init_df`` returns three
	# values; the third is discarded. ``model1`` and ``df`` are later passed to
	# ``enhance`` in ``demo_fn``.

	model1, df, _ = init_df("./DeepFilterNet2", config_allow_defaults=True)
	model1 = model1.to(device=device).eval()

	# Two module-level matplotlib figures (noisy / enhanced spectrogram). They are
	# created once here and redrawn in place by ``demo_fn`` on every request.
	fig_noisy: plt.Figure
	fig_enh: plt.Figure
	ax_noisy: plt.Axes
	ax_enh: plt.Axes
	fig_noisy, ax_noisy = plt.subplots(figsize=(15.2, 4))
	fig_noisy.set_tight_layout(True)
	fig_enh, ax_enh = plt.subplots(figsize=(15.2, 4))
	fig_enh.set_tight_layout(True)

	# Noise-type label -> noise audio file (``None`` means "add no noise").
	# Only the "None" entry is defined, so ``demo_fn`` never mixes noise in.
	NOISES = {
	"None": None,
	}

	def mix_at_snr(clean, noise, snr, eps=1e-10):
	    """Mix a clean signal with noise at the requested signal-to-noise ratio.

	    Both inputs are averaged over dim 0 (keeping that dim) and then indexed
	    along dim 1, so they must be 2D, e.g. ``[channels, samples]``.

	    Args:
	        clean: 2D tensor (or array-like) holding the clean signal.
	        noise: 2D tensor (or array-like) holding the noise. It is tiled when
	            shorter than ``clean`` and randomly cropped when longer.
	        snr: Signal-to-noise ratio in dB.
	        eps: Small constant that keeps the gain computation finite.

	    Returns:
	        Tuple ``(clean, noise, mixture)`` of ``[1, samples]`` tensors. All three
	        are scaled down by the same factor when the mixture would exceed 1.

	    Raises:
	        AssertionError: If the mixture contains non-finite values.
	    """
	    clean = torch.as_tensor(clean).mean(0, keepdim=True)
	    noise = torch.as_tensor(noise).mean(0, keepdim=True)
	    if noise.shape[1] < clean.shape[1]:
	        # Tile the noise until it is at least as long as the clean signal.
	        noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
	    max_start = int(noise.shape[1] - clean.shape[1])
	    start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
	    logger.debug(f"start: {start}, {clean.shape}")
	    noise = noise[:, start : start + clean.shape[1]]
	    E_speech = torch.mean(clean.pow(2)) + eps
	    E_noise = torch.mean(noise.pow(2))
	    # Gain that brings the noise energy to the requested SNR relative to speech.
	    K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
	    noise = noise / K
	    mixture = clean + noise
	    # Fixed: this message lacked the f-prefix and logged the literal placeholder.
	    logger.debug(f"mixture: {mixture.shape}")
	    assert torch.isfinite(mixture).all()
	    max_m = mixture.abs().max()
	    if max_m > 1:
	        logger.warning(f"Clipping detected during mixing. Reducing gain by {1/max_m}")
	        clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
	    return clean, noise, mixture


	def load_audio_gradio(
	    audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
	) -> Optional[Tuple[Tensor, AudioMetaData]]:
	    """Load audio handed over by Gradio and resample it to ``sr``.

	    Args:
	        audio_or_file: ``None``, a file path (the string "none" in any case
	            counts as no input), or a ``(sample_rate, samples)`` pair.
	        sr: Target sampling rate.

	    Returns:
	        ``(audio, meta)``, or ``None`` when there is no input.
	    """
	    if audio_or_file is None:
	        return None

	    if isinstance(audio_or_file, str):
	        if audio_or_file.lower() == "none":
	            return None
	        # Path input: let the DeepFilterNet loader read it at the target rate.
	        audio, meta = load_audio(audio_or_file, sr)
	        return audio, meta

	    # In-memory input: a (sample_rate, ndarray) pair.
	    meta = AudioMetaData(-1, -1, -1, -1, "")
	    assert isinstance(audio_or_file, (tuple, list))
	    meta.sample_rate, audio_np = audio_or_file
	    # Gradio documents [samples, 2], but that is not always what arrives;
	    # flatten trailing dims and transpose so samples are on the last axis.
	    audio_np = audio_np.reshape(audio_np.shape[0], -1).T
	    # Scale integer PCM by its full-scale value into float32.
	    for int_dtype, full_scale in ((np.int16, 1 << 15), (np.int32, 1 << 31)):
	        if audio_np.dtype == int_dtype:
	            audio_np = (audio_np / full_scale).astype(np.float32)
	            break
	    audio = resample(torch.from_numpy(audio_np), meta.sample_rate, sr)
	    return audio, meta


	def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: str):
	    """Denoise an audio clip with DeepFilterNet and render before/after spectrograms.

	    Args:
	        speech_upl: Path of the audio file to enhance, or ``None`` to fall back
	            to the bundled sample ``samples/p232_013_clean.wav``.
	        noise_type: Key into ``NOISES`` selecting the noise file to mix in
	            (a ``None`` value means no noise is added).
	        snr: Signal-to-noise ratio for the optional noise mix; converted with ``int``.
	        mic_input: Microphone recording path; when truthy it replaces ``speech_upl``.

	    Returns:
	        ``(noisy_wav, noisy_im, enhanced_wav, enh_im)``: paths of the saved
	        noisy and enhanced wav files and their spectrogram images.
	    """
	    if mic_input:
	        speech_upl = mic_input
	    sr = config("sr", 48000, int, section="df")
	    logger.info(f"Got parameters speech_upl: {speech_upl}, noise: {noise_type}, snr: {snr}")
	    snr = int(snr)
	    noise_fn = NOISES[noise_type]
	    max_s = 10  # limit to 10 seconds
	    if speech_upl is not None:
	        sample, meta = load_audio(speech_upl, sr)
	        max_len = max_s * sr
	        if sample.shape[-1] > max_len:
	            # Keep a random 10 second window of longer recordings.
	            start = torch.randint(0, sample.shape[-1] - max_len, ()).item()
	            sample = sample[..., start : start + max_len]
	    else:
	        sample, meta = load_audio("samples/p232_013_clean.wav", sr)
	        sample = sample[..., : max_s * sr]
	    if sample.dim() > 1 and sample.shape[0] > 1:
	        assert (
	            sample.shape[1] > sample.shape[0]
	        ), f"Expecting channels first, but got {sample.shape}"
	        sample = sample.mean(dim=0, keepdim=True)
	    logger.info(f"Loaded sample with shape {sample.shape}")
	    if noise_fn is not None:
	        noise, _ = load_audio(noise_fn, sr)  # type: ignore
	        logger.info(f"Loaded noise with shape {noise.shape}")
	        _, _, sample = mix_at_snr(sample, noise, snr)
	    logger.info("Start denoising audio")
	    enhanced = enhance(model1, df, sample)
	    logger.info("Denoising finished")
	    # Linear fade-in over the first 0.15 s of the enhanced signal. The ramp is
	    # capped at the clip length: previously a clip shorter than 0.15 s asked
	    # ``torch.ones`` for a negative size and crashed.
	    fade_len = min(int(sr * 0.15), enhanced.shape[1])
	    lim = torch.linspace(0.0, 1.0, fade_len).unsqueeze(0)
	    lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - fade_len)), dim=1)
	    enhanced = enhanced * lim
	    if meta.sample_rate != sr:
	        # Return audio at the sampling rate of the input file.
	        enhanced = resample(enhanced, sr, meta.sample_rate)
	        sample = resample(sample, sr, meta.sample_rate)
	        sr = meta.sample_rate
	    # Reserve the output paths and close the handles right away; the files are
	    # kept (delete=False) and written by ``save_audio`` afterwards.
	    with tempfile.NamedTemporaryFile(suffix="noisy.wav", delete=False) as noisy_file:
	        noisy_wav = noisy_file.name
	    save_audio(noisy_wav, sample, sr)
	    with tempfile.NamedTemporaryFile(suffix="enhanced.wav", delete=False) as enhanced_file:
	        enhanced_wav = enhanced_file.name
	    save_audio(enhanced_wav, enhanced, sr)
	    logger.info(f"saved audios: {noisy_wav}, {enhanced_wav}")
	    # The two figures are module-level and shared, so wipe them before redrawing.
	    ax_noisy.clear()
	    ax_enh.clear()
	    noisy_im = spec_im(sample, sr=sr, figure=fig_noisy, ax=ax_noisy)
	    enh_im = spec_im(enhanced, sr=sr, figure=fig_enh, ax=ax_enh)
	    return noisy_wav, noisy_im, enhanced_wav, enh_im


	def specshow(
	    spec,
	    ax=None,
	    title=None,
	    xlabel=None,
	    ylabel=None,
	    sr=48000,
	    n_fft=None,
	    hop=None,
	    t=None,
	    f=None,
	    vmin=-100,
	    vmax=0,
	    xlim=None,
	    ylim=None,
	    cmap="inferno",
	):
	    """Draw a spectrogram of shape [F, T] with ``pcolormesh`` and return the mesh.

	    Draws on ``ax`` when given, otherwise through the ``pyplot`` state machine.
	    ``n_fft`` is derived from the number of rows of ``spec`` when omitted,
	    ``hop`` falls back to ``n_fft // 4``, and the time/frequency axes are
	    computed from ``sr`` unless ``t``/``f`` are supplied.
	    """
	    spec_np = spec.cpu().numpy() if isinstance(spec, torch.Tensor) else spec

	    # Resolve the drawing target and the matching label/limit setters.
	    if ax is None:
	        ax = plt
	        setters = {
	            "title": plt.title,
	            "xlabel": plt.xlabel,
	            "ylabel": plt.ylabel,
	            "xlim": plt.xlim,
	            "ylim": plt.ylim,
	        }
	    else:
	        setters = {
	            "title": ax.set_title,
	            "xlabel": ax.set_xlabel,
	            "ylabel": ax.set_ylabel,
	            "xlim": ax.set_xlim,
	            "ylim": ax.set_ylim,
	        }

	    if n_fft is None:
	        n_rows = spec.shape[0]
	        # An even row count is doubled as is; an odd one drops one row first.
	        n_fft = 2 * (n_rows if n_rows % 2 == 0 else n_rows - 1)
	    if not hop:
	        hop = n_fft // 4
	    if t is None:
	        t = np.arange(0, spec_np.shape[-1]) * hop / sr
	    if f is None:
	        f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000

	    im = ax.pcolormesh(
	        t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap
	    )

	    # Apply only the decorations that were requested, in a fixed order.
	    requested = (
	        ("title", title),
	        ("xlabel", xlabel),
	        ("ylabel", ylabel),
	        ("xlim", xlim),
	        ("ylim", ylim),
	    )
	    for key, value in requested:
	        if value is not None:
	            setters[key](value)
	    return im


	def spec_im(
	    audio: torch.Tensor,
	    figsize=(15, 5),
	    colorbar=False,
	    colorbar_format=None,
	    figure=None,
	    labels=True,
	    **kwargs,
	) -> Image:
	    """Render the log-magnitude spectrogram of ``audio`` to an RGB PIL image.

	    Args:
	        audio: Waveform tensor (or array-like) passed to ``torch.stft``.
	        figsize: Size of the figure created when ``figure`` is ``None``.
	        colorbar: Whether to add a colorbar.
	        colorbar_format: Colorbar tick format. When an ``ax`` keyword is given
	            and this is ``None``, it defaults to ``"%+2.0f dB"`` if ``vmin`` or
	            ``vmax`` is set.
	        figure: Existing figure whose canvas is rendered; a new one is created
	            when omitted.
	        labels: Add default axis labels unless the caller supplied them.
	        **kwargs: Forwarded to ``specshow``. ``n_fft`` (default 1024) and
	            ``hop`` (default 512) also configure the STFT.

	    Returns:
	        The rendered figure canvas as an RGB image.
	    """
	    audio = torch.as_tensor(audio)
	    if labels:
	        kwargs.setdefault("xlabel", "Time [s]")
	        # specshow divides its default frequency axis by 1000, so the unit is
	        # kHz (the label previously said Hz).
	        kwargs.setdefault("ylabel", "Frequency [kHz]")
	    n_fft = kwargs.setdefault("n_fft", 1024)
	    hop = kwargs.setdefault("hop", 512)
	    w = torch.hann_window(n_fft, device=audio.device)
	    # Request a complex STFT directly: ``return_complex=False`` is deprecated,
	    # and the old code converted back with ``view_as_complex`` anyway.
	    spec = torch.stft(audio, n_fft, hop, window=w, return_complex=True)
	    spec = spec / w.pow(2).sum()
	    # Magnitude -> 10*log10, floored at 1e-12 to avoid log(0).
	    spec = spec.abs().clamp_min(1e-12).log10().mul(10)
	    kwargs.setdefault("vmax", max(0.0, spec.max().item()))

	    if figure is None:
	        figure = plt.figure(figsize=figsize)
	        figure.set_tight_layout(True)
	    if spec.dim() > 2:
	        spec = spec.squeeze(0)
	    im = specshow(spec, **kwargs)
	    if colorbar:
	        ckwargs = {}
	        if "ax" in kwargs:
	            if colorbar_format is None:
	                if kwargs.get("vmin", None) is not None or kwargs.get("vmax", None) is not None:
	                    colorbar_format = "%+2.0f dB"
	            ckwargs = {"ax": kwargs["ax"]}
	        plt.colorbar(im, format=colorbar_format, **ckwargs)
	    figure.canvas.draw()
	    return Image.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())


	def toggle(choice):
	    """Return two Gradio updates that show exactly one of two inputs.

	    The first update is visible only when ``choice`` is "mic", the second
	    only otherwise; both reset the component value to ``None``.
	    """
	    use_mic = choice == "mic"
	    first_update = gr.update(visible=use_mic, value=None)
	    second_update = gr.update(visible=not use_mic, value=None)
	    return first_update, second_update


	# VALL-E-X model
	# Build the network from the upper-case hyper-parameters (N_DIM, NUM_HEAD, ...),
	# which are not defined in this file -- presumably provided by the star imports
	# above; verify against ``macros``.
	model = VALLE(
	N_DIM,
	NUM_HEAD,
	NUM_LAYERS,
	norm_first=True,
	add_prenet=False,
	prefix_mode=PREFIX_MODE,
	share_embedding=True,
	nar_scale_factor=1.0,
	prepend_bos=True,
	num_quantizers=NUM_QUANTIZERS,
	)
	# Load the weights onto the CPU; the inference functions move the model to
	# ``device`` for each request and back to the CPU afterwards.
	checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
	missing_keys, unexpected_keys = model.load_state_dict(
	checkpoint["model"], strict=True
	)
	assert not missing_keys
	model.eval()

	# Encodec model
	# Audio tokenizer, constructed with the compute device.
	audio_tokenizer = AudioTokenizer(device)

	# ASR
	# Whisper "medium" model, parked on the CPU; ``make_prompt`` moves it to
	# ``device`` for transcription and back again.
	whisper_model = whisper.load_model("medium").cpu()

	# Voice Presets
	# File names directly inside ./presets/ (first os.walk entry), reduced to the
	# ``.npz`` ones with the extension stripped.
	preset_list = os.walk("./presets/").__next__()[2]
	preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]

	def clear_prompts():
	    """Delete ``.npz`` files in the system temp directory older than 60 seconds.

	    Cleanup is best effort: filesystem errors are logged and skipped so one
	    bad entry does not stop the sweep or the calling request. Only ``OSError``
	    is handled; the previous bare ``except`` swallowed every exception.
	    """
	    temp_dir = tempfile.gettempdir()
	    try:
	        entries = os.listdir(temp_dir)
	    except OSError as err:
	        logging.warning("Could not list temp directory %s: %s", temp_dir, err)
	        return
	    cutoff = time.time() - 60
	    for entry in entries:
	        filename = os.path.join(temp_dir, entry)
	        if not filename.endswith(".npz"):
	            continue
	        try:
	            if os.path.isfile(filename) and os.stat(filename).st_mtime < cutoff:
	                os.remove(filename)
	        except OSError as err:
	            # E.g. the file vanished in the meantime or is not removable.
	            logging.warning("Could not remove stale prompt %s: %s", filename, err)

	def transcribe_one(model, audio_path):
	    """Detect the language of an audio file and transcribe it with Whisper.

	    Args:
	        model: Loaded Whisper model; the spectrogram is moved to ``model.device``.
	        audio_path: Path of the audio file to transcribe.

	    Returns:
	        ``(lang, text)``: the most probable language code and the decoded text.
	        A "." is appended when non-empty text does not already end in
	        punctuation.
	    """
	    # load audio and pad/trim it to fit 30 seconds
	    audio = whisper.load_audio(audio_path)
	    audio = whisper.pad_or_trim(audio)

	    # make log-Mel spectrogram and move to the same device as the model
	    mel = whisper.log_mel_spectrogram(audio).to(model.device)

	    # detect the spoken language
	    _, probs = model.detect_language(mel)
	    lang = max(probs, key=probs.get)
	    print(f"Detected language: {lang}")

	    # decode the audio (fp16 unless the global compute device is the CPU)
	    options = whisper.DecodingOptions(
	        temperature=1.0,
	        best_of=5,
	        fp16=device != torch.device("cpu"),
	        sample_len=150,
	    )
	    result = whisper.decode(model, mel, options)

	    # print the recognized text
	    print(result.text)

	    text_pr = result.text
	    # Guard the last-character lookup: an empty transcription used to raise
	    # IndexError here. Empty text is returned unchanged.
	    stripped = text_pr.strip(" ")
	    if stripped and stripped[-1] not in "?!.,。，？！。、":
	        text_pr += "."
	    return lang, text_pr

	def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
	    """Encode a voice sample into a reusable ``.npz`` prompt file.

	    Args:
	        name: Base name of the prompt file written to the temp directory.
	        uploaded_audio: ``(sample_rate, samples)`` pair from the upload widget, or ``None``.
	        recorded_audio: Same, from the microphone widget; used when there is no upload.
	        transcript_content: Transcript of the sample; an empty string means
	            "transcribe with Whisper".

	    Returns:
	        ``(message, path)``: a status message and the path of the saved prompt,
	        or a rejection message and ``None``.
	    """
	    clear_prompts()
	    audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
	    if audio_prompt is None:
	        # Neither widget delivered audio; unpacking None below would raise.
	        return "Rejected, no audio provided (please upload or record a clip)", None
	    sr, wav_pr = audio_prompt
	    if len(wav_pr) / sr > 15:
	        return "Rejected, Audio too long (should be less than 15 seconds)", None
	    if not isinstance(wav_pr, torch.FloatTensor):
	        wav_pr = torch.FloatTensor(wav_pr)
	    if wav_pr.abs().max() > 1:
	        # Peak-normalise samples that are not already within [-1, 1].
	        wav_pr /= wav_pr.abs().max()
	    if wav_pr.size(-1) == 2:
	        # Two columns: keep only the first one.
	        wav_pr = wav_pr[:, 0]
	    if wav_pr.ndim == 1:
	        wav_pr = wav_pr.unsqueeze(0)
	    assert wav_pr.ndim and wav_pr.size(0) == 1

	    if transcript_content == "":
	        text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
	    else:
	        lang_pr = langid.classify(str(transcript_content))[0]
	        lang_token = lang2token[lang_pr]
	        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"

	    # tokenize audio
	    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
	    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

	    # tokenize text
	    phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
	    text_tokens, _ = text_collater([phonemes])

	    # save as npz file
	    npz_path = os.path.join(tempfile.gettempdir(), f"{name}.npz")
	    np.savez(
	        npz_path,
	        audio_tokens=audio_tokens,
	        text_tokens=text_tokens,
	        lang_code=lang2code[lang_pr],
	    )
	    return "提取音色成功！", npz_path


	def make_prompt(name, wav, sr, save=True):
	    """Transcribe a voice sample with Whisper and optionally keep it as a prompt.

	    The waveform is written to ``./prompts/{name}.wav`` for Whisper to read.
	    With ``save=True`` the wav and a ``./prompts/{name}.txt`` transcript are
	    kept; with ``save=False`` the wav is removed again and no transcript file
	    is written.

	    Args:
	        name: Base file name inside ``./prompts``.
	        wav: Waveform (tensor or array-like), 1D or with two columns.
	        sr: Sampling rate of ``wav``.
	        save: Whether to keep the wav/txt files on disk.

	    Returns:
	        ``(text, lang)``: the transcript wrapped in its language token, and
	        the detected language code.
	    """
	    if not isinstance(wav, torch.FloatTensor):
	        # Cast to float so the in-place normalisation below also works for
	        # integer sample data.
	        wav = torch.as_tensor(wav).float()
	    if wav.abs().max() > 1:
	        wav /= wav.abs().max()
	    if wav.size(-1) == 2:
	        wav = wav.mean(-1, keepdim=False)
	    if wav.ndim == 1:
	        wav = wav.unsqueeze(0)
	    assert wav.ndim and wav.size(0) == 1

	    os.makedirs("./prompts", exist_ok=True)
	    wav_path = f"./prompts/{name}.wav"
	    txt_path = f"./prompts/{name}.txt"

	    whisper_model.to(device)
	    try:
	        torchaudio.save(wav_path, wav, sr)
	        lang, text = transcribe_one(whisper_model, wav_path)
	    finally:
	        # Always hand the device memory back and drop the scratch wav, even
	        # when saving or transcription fails.
	        whisper_model.cpu()
	        torch.cuda.empty_cache()
	        if not save and os.path.exists(wav_path):
	            os.remove(wav_path)

	    lang_token = lang2token[lang]
	    text = lang_token + text + lang_token
	    if save:
	        # Explicit UTF-8: transcripts may be Chinese or Japanese.
	        with open(txt_path, 'w', encoding='utf-8') as f:
	            f.write(text)
	    return text, lang

	@torch.no_grad()
	def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt, transcript_content):
	    """Synthesize ``text`` in the voice of an uploaded or recorded audio prompt.

	    Args:
	        text: Text to synthesize (at most 150 characters).
	        language: 'auto-detect' or a key of ``langdropdown2token``.
	        accent: 'no-accent' or a key of ``langdropdown2token``.
	        audio_prompt: ``(sample_rate, samples)`` pair from the upload widget, or ``None``.
	        record_audio_prompt: Same, from the microphone; used when there is no upload.
	        transcript_content: Transcript of the prompt; empty means "use Whisper".

	    Returns:
	        ``(message, (24000, samples))`` on success, or ``(rejection message, None)``.
	    """
	    if len(text) > 150:
	        return "Rejected, Text too long (should be less than 150 characters)", None
	    audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
	    if audio_prompt is None:
	        # Neither widget delivered audio; unpacking None below would raise.
	        return "Rejected, no audio prompt provided", None
	    sr, wav_pr = audio_prompt
	    if len(wav_pr) / sr > 15:
	        return "Rejected, Audio too long (should be less than 15 seconds)", None
	    if not isinstance(wav_pr, torch.FloatTensor):
	        wav_pr = torch.FloatTensor(wav_pr)
	    if wav_pr.abs().max() > 1:
	        wav_pr /= wav_pr.abs().max()
	    if wav_pr.size(-1) == 2:
	        wav_pr = wav_pr[:, 0]
	    if wav_pr.ndim == 1:
	        wav_pr = wav_pr.unsqueeze(0)
	    assert wav_pr.ndim and wav_pr.size(0) == 1

	    if transcript_content == "":
	        text_pr, lang_pr = make_prompt('dummy', wav_pr, sr, save=False)
	    else:
	        lang_pr = langid.classify(str(transcript_content))[0]
	        lang_token = lang2token[lang_pr]
	        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"

	    if language == 'auto-detect':
	        lang_token = lang2token[langid.classify(text)[0]]
	    else:
	        lang_token = langdropdown2token[language]
	    lang = token2lang[lang_token]
	    text = lang_token + text + lang_token

	    # onload model -- only after validation, so a rejected request no longer
	    # leaves the model on the compute device
	    model.to(device)
	    try:
	        # tokenize audio
	        encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
	        audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)

	        # tokenize text
	        logging.info(f"synthesize text: {text}")
	        phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
	        text_tokens, text_tokens_lens = text_collater([phone_tokens])

	        enroll_x_lens = None
	        if text_pr:
	            # Prepend the prompt transcript tokens to the text to synthesize.
	            text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
	            text_prompts, enroll_x_lens = text_collater([text_prompts])
	            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
	            text_tokens_lens += enroll_x_lens
	        # accent control
	        lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
	        encoded_frames = model.inference(
	            text_tokens.to(device),
	            text_tokens_lens.to(device),
	            audio_prompts,
	            enroll_x_lens=enroll_x_lens,
	            top_k=-100,
	            temperature=1,
	            prompt_language=lang_pr,
	            text_language=langs if accent == "no-accent" else lang,
	        )
	        samples = audio_tokenizer.decode(
	            [(encoded_frames.transpose(2, 1), None)]
	        )
	    finally:
	        # offload model, also when inference fails
	        model.to('cpu')
	        torch.cuda.empty_cache()

	    message = f"text prompt: {text_pr}\nsythesized text: {text}"
	    return message, (24000, samples[0][0].cpu().numpy())

	@torch.no_grad()
	def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
	    """Synthesize ``text`` with a stored ``.npz`` voice prompt.

	    Args:
	        text: Text to synthesize (at most 150 characters).
	        language: 'auto-detect' or a key of ``langdropdown2token``.
	        accent: 'no-accent' or a key of ``langdropdown2token``.
	        preset_prompt: Name of a preset in ``./presets/`` (without ``.npz``);
	            used when ``prompt_file`` is ``None``.
	        prompt_file: Uploaded prompt file object (its ``.name`` is the path), or ``None``.

	    Returns:
	        ``(message, (24000, samples))`` on success, or ``(rejection message, None)``.
	    """
	    if len(text) > 150:
	        return "Rejected, Text too long (should be less than 150 characters)", None
	    clear_prompts()
	    # text to synthesize
	    if language == 'auto-detect':
	        lang_token = lang2token[langid.classify(text)[0]]
	    else:
	        lang_token = langdropdown2token[language]
	    lang = token2lang[lang_token]
	    text = lang_token + text + lang_token

	    # load prompt; the context manager closes the archive, which the plain
	    # ``np.load`` call used to leave open
	    if prompt_file is not None:
	        prompt_path = prompt_file.name
	    else:
	        prompt_path = os.path.join("./presets/", f"{preset_prompt}.npz")
	    with np.load(prompt_path) as prompt_data:
	        audio_prompts = prompt_data['audio_tokens']
	        text_prompts = prompt_data['text_tokens']
	        lang_pr = code2lang[int(prompt_data['lang_code'])]

	    model.to(device)
	    try:
	        # numpy to tensor
	        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
	        text_prompts = torch.tensor(text_prompts).type(torch.int32)

	        enroll_x_lens = text_prompts.shape[-1]
	        logging.info(f"synthesize text: {text}")
	        phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
	        text_tokens, text_tokens_lens = text_collater([phone_tokens])
	        text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
	        text_tokens_lens += enroll_x_lens
	        # accent control
	        lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
	        encoded_frames = model.inference(
	            text_tokens.to(device),
	            text_tokens_lens.to(device),
	            audio_prompts,
	            enroll_x_lens=enroll_x_lens,
	            top_k=-100,
	            temperature=1,
	            prompt_language=lang_pr,
	            text_language=langs if accent == "no-accent" else lang,
	        )
	        samples = audio_tokenizer.decode(
	            [(encoded_frames.transpose(2, 1), None)]
	        )
	    finally:
	        # offload model, also when inference fails
	        model.to('cpu')
	        torch.cuda.empty_cache()

	    message = f"sythesized text: {text}"
	    return message, (24000, samples[0][0].cpu().numpy())


	from utils.sentence_cutter import split_text_into_sentences
	def _load_prompt_tokens(npz_path):
	    """Read audio tokens, text tokens and prompt language from an ``.npz`` prompt."""
	    # The context manager closes the archive once the arrays are extracted.
	    with np.load(npz_path) as prompt_data:
	        audio_tokens = prompt_data['audio_tokens']
	        text_tokens = prompt_data['text_tokens']
	        lang_pr = code2lang[int(prompt_data['lang_code'])]
	    # numpy to tensor
	    audio_prompts = torch.tensor(audio_tokens).type(torch.int32).to(device)
	    text_prompts = torch.tensor(text_tokens).type(torch.int32)
	    return audio_prompts, text_prompts, lang_pr


	@torch.no_grad()
	def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
	    """Synthesize long text sentence by sentence and concatenate the result.

	    Two modes are available:
	    fixed-prompt: keeps using the prompt the user provided for every sentence.
	    sliding-window: uses the previous sentence as the prompt for the next one;
	    selected when neither a prompt file nor a preset is given. Speaker
	    consistency is less certain in this mode.

	    Args:
	        text: Text to synthesize (at most 1000 characters).
	        preset_prompt: Name of a preset in ``./presets/`` (``None``/"" for none).
	        prompt: Uploaded prompt file object (its ``.name`` is the path), or
	            ``None``/"" for none. Takes precedence over ``preset_prompt``.
	        language: 'auto' or 'auto-detect' to detect the language with langid,
	            otherwise a key of ``langdropdown2token``.
	        accent: 'no-accent' or a key of ``langdropdown2token``.

	    Returns:
	        ``(message, (24000, samples))`` on success, or ``(rejection message, None)``.
	    """
	    if len(text) > 1000:
	        return "Rejected, Text too long (should be less than 1000 characters)", None

	    has_prompt_file = prompt is not None and prompt != ""
	    has_preset = preset_prompt is not None and preset_prompt != ""
	    # If no prompt is given, use sliding-window mode. ``None`` and "" both count
	    # as "no preset", matching the loading logic below.
	    mode = 'fixed-prompt' if (has_prompt_file or has_preset) else 'sliding-window'

	    sentences = split_text_into_sentences(text)
	    # detect language ('auto' is this function's own default, so honour it too)
	    if language in ("auto", "auto-detect"):
	        language = langid.classify(text)[0]
	    else:
	        language = token2lang[langdropdown2token[language]]

	    # if an initial prompt is given, load it; otherwise start from empty prompts
	    if has_prompt_file:
	        audio_prompts, text_prompts, lang_pr = _load_prompt_tokens(prompt.name)
	    elif has_preset:
	        audio_prompts, text_prompts, lang_pr = _load_prompt_tokens(
	            os.path.join("./presets/", f"{preset_prompt}.npz")
	        )
	    else:
	        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
	        text_prompts = torch.zeros([1, 0]).type(torch.int32)
	        lang_pr = language if language != 'mix' else 'en'

	    lang_token = lang2token[language]
	    # accent control: an explicit accent overrides the per-text language info
	    accent_lang = None if accent == "no-accent" else token2lang[langdropdown2token[accent]]

	    model.to(device)
	    try:
	        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
	        for sentence in sentences:
	            sentence = sentence.replace("\n", "").strip(" ")
	            if sentence == "":
	                continue
	            sentence = lang_token + sentence + lang_token

	            enroll_x_lens = text_prompts.shape[-1]
	            logging.info(f"synthesize text: {sentence}")
	            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{sentence}".strip())
	            text_tokens, text_tokens_lens = text_collater([phone_tokens])
	            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
	            text_tokens_lens += enroll_x_lens
	            encoded_frames = model.inference(
	                text_tokens.to(device),
	                text_tokens_lens.to(device),
	                audio_prompts,
	                enroll_x_lens=enroll_x_lens,
	                top_k=-100,
	                temperature=1,
	                prompt_language=lang_pr,
	                text_language=langs if accent == "no-accent" else accent_lang,
	            )
	            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
	            if mode == 'sliding-window':
	                # The sentence just generated becomes the prompt for the next one.
	                # (The original guarded this with ``torch.rand(1) < 1.0``, which
	                # is always true, so its fallback branch never ran.)
	                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
	                text_prompts = text_tokens[:, enroll_x_lens:]
	        samples = audio_tokenizer.decode(
	            [(complete_tokens, None)]
	        )
	    finally:
	        # offload model, also when inference fails (same as the other infer_* functions)
	        model.to('cpu')
	        torch.cuda.empty_cache()

	    message = f"Cut into {len(sentences)} sentences"
	    return message, (24000, samples[0][0].cpu().numpy())


	# Build the Gradio Blocks UI and launch it.
	# NOTE(review): this view of the file has lost its indentation, so the exact
	# nesting of the ``with`` blocks (and therefore where each widget is placed in
	# the layout) cannot be confirmed here; the comments below describe only the
	# event wiring that is visible in the code.
	def main():
	app = gr.Blocks()
	with app:
	# Page header and links.
	gr.HTML("<center>"
	"<h1>🌊💕🎶 VALL-E X 3秒声音克隆，支持中日英三语</h1>"
	"</center>")
	gr.Markdown("## <center>⚡ 只需3秒语音，快速复刻您喜欢的声音；Powered by [VALL-E-X](https://github.com/Plachtaa/VALL-E-X)</center>")
	gr.Markdown("### <center>更多精彩应用，尽在[滔滔AI](http://www.talktalkai.com)；滔滔AI，为爱滔滔！💕</center>")


	# Tab 1: extract a voice prompt (.npz) from an uploaded or recorded clip.
	with gr.Tab("🎶 - 提取音色"):
	gr.Markdown("请上传一段3~10秒的语音，并点击”提取音色“")
	with gr.Row():
	with gr.Column():
	# Hidden prompt name; its fixed value "prompt_1" becomes the npz file name.
	textbox2 = gr.TextArea(label="Prompt name",
	placeholder="Name your prompt here",
	value="prompt_1", elem_id=f"prompt-name", visible=False)
	# Place for language selection and transcript input
	# (hidden; an empty transcript makes make_npz_prompt fall back to Whisper).
	textbox_transcript2 = gr.TextArea(label="Transcript",
	placeholder="Write transcript here. (leave empty to use whisper)",
	value="", elem_id=f"prompt-name", visible=False)
	upload_audio_prompt_2 = gr.Audio(label='请在此上传您的语音文件', source='upload', interactive=True)
	record_audio_prompt_2 = gr.Audio(label='或者用麦克风上传您喜欢的声音', source='microphone', interactive=True)
	with gr.Column():
	text_output_2 = gr.Textbox(label="音色提取进度")
	# Hidden file component that carries the generated prompt to the cloning tab.
	prompt_output_2 = gr.File(interactive=False, visible=False)
	btn_2 = gr.Button("提取音色", variant="primary")
	btn_2.click(make_npz_prompt,
	inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
	outputs=[text_output_2, prompt_output_2])

	# Tab 2: synthesize text with the extracted voice, then optionally denoise it.
	with gr.Tab("💕 - 声音克隆"):
	gr.Markdown("现在开始奇妙的声音克隆之旅吧！输入您想合成的文本后，点击”声音克隆“即可快速复刻喜欢的声音！")
	with gr.Row():
	with gr.Column():
	textbox_4 = gr.TextArea(label="请输入您想合成的文本",
	placeholder="说点什么吧(中英皆可)...",
	elem_id=f"tts-input")

	btn_4 = gr.Button("声音克隆", variant="primary")
	btn_5 = gr.Button("去除噪音", variant="primary")

	# Hidden controls: they keep their default values and are only passed
	# through to infer_long_text.
	language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
	label='language', visible=False)
	accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
	label='accent', visible=False)
	preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='更多语音包', visible=False)
	# The prompt file is the hidden output of the extraction tab.
	prompt_file_4 = prompt_output_2
	with gr.Column():
	text_output_4 = gr.TextArea(label="Message", visible=False)
	audio_output_4 = gr.Audio(label="为您合成的专属语音", elem_id="tts-audio", type="filepath", interactive=False)


	# Denoising inputs: the synthesized audio above is fed to demo_fn together
	# with hidden noise-type / SNR dropdowns and a hidden microphone input.
	radio = gr.Radio(
	["mic", "file"], value="file", label="How would you like to upload your audio?", visible=False
	)
	mic_input = gr.Mic(label="Input", type="filepath", visible=False)
	audio_file = audio_output_4
	inputs1 = [
	audio_file,
	gr.Dropdown(
	label="Add background noise",
	choices=list(NOISES.keys()),
	value="None",
	visible=False,
	),
	gr.Dropdown(
	label="Noise Level (SNR)",
	choices=["-5", "0", "10", "20"],
	value="0",
	visible=False,
	),
	mic_input,
	]

	# Denoising outputs, in the order demo_fn returns them; only the enhanced
	# audio is visible.
	outputs1 = [
	gr.Audio(type="filepath", label="Noisy audio", visible=False),
	gr.Image(label="Noisy spectrogram", visible=False),
	gr.Audio(type="filepath", label="降噪后的专属语音"),
	gr.Image(label="Enhanced spectrogram", visible=False),
	]

	# "Clone" runs long-text synthesis; "Denoise" post-processes its audio output.
	btn_4.click(infer_long_text,
	inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
	outputs=[text_output_4, audio_output_4])
	btn_5.click(fn=demo_fn, inputs=inputs1, outputs=outputs1)

	# Usage notice, instructions and footer.
	gr.Markdown("### <center>注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。</center>")
	gr.Markdown("<center>🧸 - 如何使用此程序：在“提取音色”模块上传一段语音并提取音色之后，就可以在“声音克隆”模块一键克隆您喜欢的声音啦！</center>")
	gr.HTML('''
	<div class="footer">
	<p>🌊🏞️🎶 - 江水东流急，滔滔无尽声。明·顾璘
	</p>
	</div>
	''')
	# show_error=True is passed so errors are shown in the UI.
	app.launch(show_error=True)

	# Script entry point: set up INFO-level logging with file/line context, then
	# start the app.
	if __name__ == "__main__":
	    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
	    logging.basicConfig(level=logging.INFO, format=formatter)
	    main()