Spaces:
Running
Running
zhzluke96
committed on
Commit
·
f83b1b7
1
Parent(s):
32b2aaa
update
Browse files
- models/ChatTTS/config/decoder.yaml +2 -0
- models/ChatTTS/config/dvae.yaml +3 -1
- models/ChatTTS/config/gpt.yaml +3 -0
- models/ChatTTS/config/path.yaml +2 -0
- models/ChatTTS/config/vocos.yaml +1 -1
- models/Denoise/.gitkeep +0 -0
- models/Denoise/audio-denoiser-512-32-v1/config.json +1 -0
- models/Denoise/audio-denoiser-512-32-v1/pytorch_model.bin +3 -0
- models/put_model_here +0 -0
- models/resemble-enhance/hparams.yaml +38 -0
- models/resemble-enhance/mp_rank_00_model_states.pt +3 -0
- modules/Enhancer/ResembleEnhance.py +46 -33
- modules/generate_audio.py +2 -2
- modules/repos_static/resemble_enhance/enhancer/enhancer.py +2 -2
- modules/speaker.py +1 -1
- modules/utils/SeedContext.py +17 -6
- modules/webui/app.py +0 -5
- modules/webui/speaker/speaker_creator.py +2 -2
- modules/webui/speaker/speaker_merger.py +3 -1
- modules/webui/ssml_tab.py +1 -1
- modules/webui/tts_tab.py +3 -4
- modules/webui/webui_utils.py +25 -19
- webui.py +7 -0
models/ChatTTS/config/decoder.yaml
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
dim: 384
|
| 2 |
|
| 3 |
decoder_config:
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
dim: 384
|
| 4 |
|
| 5 |
decoder_config:
|
models/ChatTTS/config/dvae.yaml
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
dim: 512
|
| 2 |
decoder_config:
|
| 3 |
idim: ${dim}
|
|
@@ -7,6 +9,6 @@ decoder_config:
|
|
| 7 |
|
| 8 |
vq_config:
|
| 9 |
dim: 1024
|
| 10 |
-
levels: [5,
|
| 11 |
G: 2
|
| 12 |
R: 2
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
dim: 512
|
| 4 |
decoder_config:
|
| 5 |
idim: ${dim}
|
|
|
|
| 9 |
|
| 10 |
vq_config:
|
| 11 |
dim: 1024
|
| 12 |
+
levels: [5,5,5,5]
|
| 13 |
G: 2
|
| 14 |
R: 2
|
models/ChatTTS/config/gpt.yaml
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
num_audio_tokens: 626
|
| 2 |
num_text_tokens: 21178
|
| 3 |
|
|
@@ -15,3 +17,4 @@ gpt_config:
|
|
| 15 |
num_audio_tokens: 626
|
| 16 |
num_text_tokens: null
|
| 17 |
num_vq: 4
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
num_audio_tokens: 626
|
| 4 |
num_text_tokens: 21178
|
| 5 |
|
|
|
|
| 17 |
num_audio_tokens: 626
|
| 18 |
num_text_tokens: null
|
| 19 |
num_vq: 4
|
| 20 |
+
|
models/ChatTTS/config/path.yaml
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
vocos_config_path: config/vocos.yaml
|
| 2 |
vocos_ckpt_path: asset/Vocos.pt
|
| 3 |
dvae_config_path: config/dvae.yaml
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
vocos_config_path: config/vocos.yaml
|
| 4 |
vocos_ckpt_path: asset/Vocos.pt
|
| 5 |
dvae_config_path: config/dvae.yaml
|
models/ChatTTS/config/vocos.yaml
CHANGED
|
@@ -21,4 +21,4 @@ head:
|
|
| 21 |
dim: 512
|
| 22 |
n_fft: 1024
|
| 23 |
hop_length: 256
|
| 24 |
-
padding: center
|
|
|
|
| 21 |
dim: 512
|
| 22 |
n_fft: 1024
|
| 23 |
hop_length: 256
|
| 24 |
+
padding: center
|
models/Denoise/.gitkeep
ADDED
|
File without changes
|
models/Denoise/audio-denoiser-512-32-v1/config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"scaler": {"mean": -3.027921438217163, "std": 1.9317387342453003}, "in_channels": 257, "n_fft": 512, "num_frames": 32, "exp_id": "115233"}
|
models/Denoise/audio-denoiser-512-32-v1/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5079784e228d2b36496f2c72f8d06015c8fb1827a81f757ec8540ca708ada7a9
|
| 3 |
+
size 153639572
|
models/put_model_here
ADDED
|
File without changes
|
models/resemble-enhance/hparams.yaml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fg_dir: !!python/object/apply:pathlib.PosixPath
|
| 2 |
+
- data
|
| 3 |
+
- fg
|
| 4 |
+
bg_dir: !!python/object/apply:pathlib.PosixPath
|
| 5 |
+
- data
|
| 6 |
+
- bg
|
| 7 |
+
rir_dir: !!python/object/apply:pathlib.PosixPath
|
| 8 |
+
- data
|
| 9 |
+
- rir
|
| 10 |
+
load_fg_only: false
|
| 11 |
+
wav_rate: 44100
|
| 12 |
+
n_fft: 2048
|
| 13 |
+
win_size: 2048
|
| 14 |
+
hop_size: 420
|
| 15 |
+
num_mels: 128
|
| 16 |
+
stft_magnitude_min: 0.0001
|
| 17 |
+
preemphasis: 0.97
|
| 18 |
+
mix_alpha_range:
|
| 19 |
+
- 0.2
|
| 20 |
+
- 0.8
|
| 21 |
+
nj: 64
|
| 22 |
+
training_seconds: 3.0
|
| 23 |
+
batch_size_per_gpu: 32
|
| 24 |
+
min_lr: 1.0e-05
|
| 25 |
+
max_lr: 0.0001
|
| 26 |
+
warmup_steps: 1000
|
| 27 |
+
max_steps: 1000000
|
| 28 |
+
gradient_clipping: 1.0
|
| 29 |
+
cfm_solver_method: midpoint
|
| 30 |
+
cfm_solver_nfe: 64
|
| 31 |
+
cfm_time_mapping_divisor: 4
|
| 32 |
+
univnet_nc: 96
|
| 33 |
+
lcfm_latent_dim: 64
|
| 34 |
+
lcfm_training_mode: cfm
|
| 35 |
+
lcfm_z_scale: 6
|
| 36 |
+
vocoder_extra_dim: 32
|
| 37 |
+
gan_training_start_step: null
|
| 38 |
+
praat_augment_prob: 0.2
|
models/resemble-enhance/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9d035f318de3e6d919bc70cf7ad7d32b4fe92ec5cbe0b30029a27f5db07d9d6
|
| 3 |
+
size 713176232
|
modules/Enhancer/ResembleEnhance.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
-
from typing import List
|
|
|
|
| 3 |
from modules.repos_static.resemble_enhance.enhancer.enhancer import Enhancer
|
| 4 |
from modules.repos_static.resemble_enhance.enhancer.hparams import HParams
|
| 5 |
from modules.repos_static.resemble_enhance.inference import inference
|
|
@@ -25,14 +26,11 @@ def load_enhancer(device: torch.device):
|
|
| 25 |
|
| 26 |
|
| 27 |
class ResembleEnhance:
|
| 28 |
-
hparams: HParams
|
| 29 |
-
enhancer: Enhancer
|
| 30 |
-
|
| 31 |
def __init__(self, device: torch.device):
|
| 32 |
self.device = device
|
| 33 |
|
| 34 |
-
self.enhancer = None
|
| 35 |
-
self.hparams = None
|
| 36 |
|
| 37 |
def load_model(self):
|
| 38 |
hparams = HParams.load(Path(MODELS_DIR) / "resemble-enhance")
|
|
@@ -42,9 +40,7 @@ class ResembleEnhance:
|
|
| 42 |
map_location="cpu",
|
| 43 |
)["module"]
|
| 44 |
enhancer.load_state_dict(state_dict)
|
| 45 |
-
enhancer.eval()
|
| 46 |
-
enhancer.to(self.device)
|
| 47 |
-
enhancer.denoiser.to(self.device)
|
| 48 |
|
| 49 |
self.hparams = hparams
|
| 50 |
self.enhancer = enhancer
|
|
@@ -63,7 +59,7 @@ class ResembleEnhance:
|
|
| 63 |
sr,
|
| 64 |
device,
|
| 65 |
nfe=32,
|
| 66 |
-
solver="midpoint",
|
| 67 |
lambd=0.5,
|
| 68 |
tau=0.5,
|
| 69 |
) -> tuple[torch.Tensor, int]:
|
|
@@ -83,34 +79,51 @@ class ResembleEnhance:
|
|
| 83 |
|
| 84 |
if __name__ == "__main__":
|
| 85 |
import torchaudio
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
load_chat_tts()
|
| 89 |
|
| 90 |
device = torch.device("cuda")
|
| 91 |
-
ench = ResembleEnhance(device)
|
| 92 |
-
ench.load_model()
|
| 93 |
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
print(wav.shape, type(wav), sr, type(sr))
|
| 97 |
-
exit()
|
| 98 |
|
| 99 |
-
wav = wav.squeeze(0).cuda()
|
| 100 |
|
| 101 |
-
print(wav.device)
|
| 102 |
|
| 103 |
-
denoised, d_sr = ench.denoise(wav
|
| 104 |
-
denoised = denoised.unsqueeze(0)
|
| 105 |
-
print(denoised.shape)
|
| 106 |
-
torchaudio.save("denoised.wav", denoised, d_sr)
|
| 107 |
|
| 108 |
-
for solver in ("midpoint", "rk4", "euler"):
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import List, Literal
|
| 3 |
+
from modules.devices import devices
|
| 4 |
from modules.repos_static.resemble_enhance.enhancer.enhancer import Enhancer
|
| 5 |
from modules.repos_static.resemble_enhance.enhancer.hparams import HParams
|
| 6 |
from modules.repos_static.resemble_enhance.inference import inference
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
class ResembleEnhance:
|
|
|
|
|
|
|
|
|
|
| 29 |
def __init__(self, device: torch.device):
|
| 30 |
self.device = device
|
| 31 |
|
| 32 |
+
self.enhancer: HParams = None
|
| 33 |
+
self.hparams: Enhancer = None
|
| 34 |
|
| 35 |
def load_model(self):
|
| 36 |
hparams = HParams.load(Path(MODELS_DIR) / "resemble-enhance")
|
|
|
|
| 40 |
map_location="cpu",
|
| 41 |
)["module"]
|
| 42 |
enhancer.load_state_dict(state_dict)
|
| 43 |
+
enhancer.to(self.device).eval()
|
|
|
|
|
|
|
| 44 |
|
| 45 |
self.hparams = hparams
|
| 46 |
self.enhancer = enhancer
|
|
|
|
| 59 |
sr,
|
| 60 |
device,
|
| 61 |
nfe=32,
|
| 62 |
+
solver: Literal["midpoint", "rk4", "euler"] = "midpoint",
|
| 63 |
lambd=0.5,
|
| 64 |
tau=0.5,
|
| 65 |
) -> tuple[torch.Tensor, int]:
|
|
|
|
| 79 |
|
| 80 |
if __name__ == "__main__":
|
| 81 |
import torchaudio
|
| 82 |
+
import gradio as gr
|
|
|
|
|
|
|
| 83 |
|
| 84 |
device = torch.device("cuda")
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
# def enhance(file):
|
| 87 |
+
# print(file)
|
| 88 |
+
# ench = load_enhancer(device)
|
| 89 |
+
# dwav, sr = torchaudio.load(file)
|
| 90 |
+
# dwav = dwav.mean(dim=0).to(device)
|
| 91 |
+
# enhanced, e_sr = ench.enhance(dwav, sr)
|
| 92 |
+
# return e_sr, enhanced.cpu().numpy()
|
| 93 |
+
|
| 94 |
+
# # 随便一个示例
|
| 95 |
+
# gr.Interface(
|
| 96 |
+
# fn=enhance, inputs=[gr.Audio(type="filepath")], outputs=[gr.Audio()]
|
| 97 |
+
# ).launch()
|
| 98 |
+
|
| 99 |
+
# load_chat_tts()
|
| 100 |
+
|
| 101 |
+
# ench = load_enhancer(device)
|
| 102 |
+
|
| 103 |
+
# devices.torch_gc()
|
| 104 |
+
|
| 105 |
+
# wav, sr = torchaudio.load("test.wav")
|
| 106 |
|
| 107 |
+
# print(wav.shape, type(wav), sr, type(sr))
|
| 108 |
+
# # exit()
|
| 109 |
|
| 110 |
+
# wav = wav.squeeze(0).cuda()
|
| 111 |
|
| 112 |
+
# print(wav.device)
|
| 113 |
|
| 114 |
+
# denoised, d_sr = ench.denoise(wav, sr)
|
| 115 |
+
# denoised = denoised.unsqueeze(0)
|
| 116 |
+
# print(denoised.shape)
|
| 117 |
+
# torchaudio.save("denoised.wav", denoised.cpu(), d_sr)
|
| 118 |
|
| 119 |
+
# for solver in ("midpoint", "rk4", "euler"):
|
| 120 |
+
# for lambd in (0.1, 0.5, 0.9):
|
| 121 |
+
# for tau in (0.1, 0.5, 0.9):
|
| 122 |
+
# enhanced, e_sr = ench.enhance(
|
| 123 |
+
# wav, sr, solver=solver, lambd=lambd, tau=tau, nfe=128
|
| 124 |
+
# )
|
| 125 |
+
# enhanced = enhanced.unsqueeze(0)
|
| 126 |
+
# print(enhanced.shape)
|
| 127 |
+
# torchaudio.save(
|
| 128 |
+
# f"enhanced_{solver}_{lambd}_{tau}.wav", enhanced.cpu(), e_sr
|
| 129 |
+
# )
|
modules/generate_audio.py
CHANGED
|
@@ -72,7 +72,7 @@ def generate_audio_batch(
|
|
| 72 |
}
|
| 73 |
|
| 74 |
if isinstance(spk, int):
|
| 75 |
-
with SeedContext(spk):
|
| 76 |
params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
|
| 77 |
logger.info(("spk", spk))
|
| 78 |
elif isinstance(spk, Speaker):
|
|
@@ -94,7 +94,7 @@ def generate_audio_batch(
|
|
| 94 |
}
|
| 95 |
)
|
| 96 |
|
| 97 |
-
with SeedContext(infer_seed):
|
| 98 |
wavs = chat_tts.generate_audio(
|
| 99 |
texts, params_infer_code, use_decoder=use_decoder
|
| 100 |
)
|
|
|
|
| 72 |
}
|
| 73 |
|
| 74 |
if isinstance(spk, int):
|
| 75 |
+
with SeedContext(spk, True):
|
| 76 |
params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
|
| 77 |
logger.info(("spk", spk))
|
| 78 |
elif isinstance(spk, Speaker):
|
|
|
|
| 94 |
}
|
| 95 |
)
|
| 96 |
|
| 97 |
+
with SeedContext(infer_seed, True):
|
| 98 |
wavs = chat_tts.generate_audio(
|
| 99 |
texts, params_infer_code, use_decoder=use_decoder
|
| 100 |
)
|
modules/repos_static/resemble_enhance/enhancer/enhancer.py
CHANGED
|
@@ -73,8 +73,8 @@ class Enhancer(nn.Module):
|
|
| 73 |
)
|
| 74 |
self._load_pretrained(pretrained_path)
|
| 75 |
|
| 76 |
-
logger.info(f"{self.__class__.__name__} summary")
|
| 77 |
-
logger.info(f"{self.summarize()}")
|
| 78 |
|
| 79 |
def _load_pretrained(self, path):
|
| 80 |
# Clone is necessary as otherwise it holds a reference to the original model
|
|
|
|
| 73 |
)
|
| 74 |
self._load_pretrained(pretrained_path)
|
| 75 |
|
| 76 |
+
# logger.info(f"{self.__class__.__name__} summary")
|
| 77 |
+
# logger.info(f"{self.summarize()}")
|
| 78 |
|
| 79 |
def _load_pretrained(self, path):
|
| 80 |
# Clone is necessary as otherwise it holds a reference to the original model
|
modules/speaker.py
CHANGED
|
@@ -11,7 +11,7 @@ import uuid
|
|
| 11 |
|
| 12 |
def create_speaker_from_seed(seed):
|
| 13 |
chat_tts = models.load_chat_tts()
|
| 14 |
-
with SeedContext(seed):
|
| 15 |
emb = chat_tts.sample_random_speaker()
|
| 16 |
return emb
|
| 17 |
|
|
|
|
| 11 |
|
| 12 |
def create_speaker_from_seed(seed):
|
| 13 |
chat_tts = models.load_chat_tts()
|
| 14 |
+
with SeedContext(seed, True):
|
| 15 |
emb = chat_tts.sample_random_speaker()
|
| 16 |
return emb
|
| 17 |
|
modules/utils/SeedContext.py
CHANGED
|
@@ -7,15 +7,17 @@ import logging
|
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
|
| 10 |
-
def deterministic(seed=0):
|
| 11 |
random.seed(seed)
|
| 12 |
np.random.seed(seed)
|
| 13 |
torch_rn = rng.convert_np_to_torch(seed)
|
| 14 |
torch.manual_seed(torch_rn)
|
| 15 |
if torch.cuda.is_available():
|
| 16 |
torch.cuda.manual_seed_all(torch_rn)
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def is_numeric(obj):
|
|
@@ -36,7 +38,7 @@ def is_numeric(obj):
|
|
| 36 |
|
| 37 |
|
| 38 |
class SeedContext:
|
| 39 |
-
def __init__(self, seed):
|
| 40 |
assert is_numeric(seed), "Seed must be an number."
|
| 41 |
|
| 42 |
try:
|
|
@@ -45,6 +47,7 @@ class SeedContext:
|
|
| 45 |
raise ValueError(f"Seed must be an integer, but: {type(seed)}")
|
| 46 |
|
| 47 |
self.seed = seed
|
|
|
|
| 48 |
self.state = None
|
| 49 |
|
| 50 |
if isinstance(seed, str) and seed.isdigit():
|
|
@@ -57,10 +60,16 @@ class SeedContext:
|
|
| 57 |
self.seed = random.randint(0, 2**32 - 1)
|
| 58 |
|
| 59 |
def __enter__(self):
|
| 60 |
-
self.state = (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
try:
|
| 63 |
-
deterministic(self.seed)
|
| 64 |
except Exception as e:
|
| 65 |
# raise ValueError(
|
| 66 |
# f"Seed must be an integer, but: <{type(self.seed)}> {self.seed}"
|
|
@@ -73,6 +82,8 @@ class SeedContext:
|
|
| 73 |
torch.set_rng_state(self.state[0])
|
| 74 |
random.setstate(self.state[1])
|
| 75 |
np.random.set_state(self.state[2])
|
|
|
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
if __name__ == "__main__":
|
|
|
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
|
| 10 |
+
def deterministic(seed=0, cudnn_deterministic=False):
|
| 11 |
random.seed(seed)
|
| 12 |
np.random.seed(seed)
|
| 13 |
torch_rn = rng.convert_np_to_torch(seed)
|
| 14 |
torch.manual_seed(torch_rn)
|
| 15 |
if torch.cuda.is_available():
|
| 16 |
torch.cuda.manual_seed_all(torch_rn)
|
| 17 |
+
|
| 18 |
+
if cudnn_deterministic:
|
| 19 |
+
torch.backends.cudnn.deterministic = True
|
| 20 |
+
torch.backends.cudnn.benchmark = False
|
| 21 |
|
| 22 |
|
| 23 |
def is_numeric(obj):
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
class SeedContext:
|
| 41 |
+
def __init__(self, seed, cudnn_deterministic=False):
|
| 42 |
assert is_numeric(seed), "Seed must be an number."
|
| 43 |
|
| 44 |
try:
|
|
|
|
| 47 |
raise ValueError(f"Seed must be an integer, but: {type(seed)}")
|
| 48 |
|
| 49 |
self.seed = seed
|
| 50 |
+
self.cudnn_deterministic = cudnn_deterministic
|
| 51 |
self.state = None
|
| 52 |
|
| 53 |
if isinstance(seed, str) and seed.isdigit():
|
|
|
|
| 60 |
self.seed = random.randint(0, 2**32 - 1)
|
| 61 |
|
| 62 |
def __enter__(self):
|
| 63 |
+
self.state = (
|
| 64 |
+
torch.get_rng_state(),
|
| 65 |
+
random.getstate(),
|
| 66 |
+
np.random.get_state(),
|
| 67 |
+
torch.backends.cudnn.deterministic,
|
| 68 |
+
torch.backends.cudnn.benchmark,
|
| 69 |
+
)
|
| 70 |
|
| 71 |
try:
|
| 72 |
+
deterministic(self.seed, cudnn_deterministic=self.cudnn_deterministic)
|
| 73 |
except Exception as e:
|
| 74 |
# raise ValueError(
|
| 75 |
# f"Seed must be an integer, but: <{type(self.seed)}> {self.seed}"
|
|
|
|
| 82 |
torch.set_rng_state(self.state[0])
|
| 83 |
random.setstate(self.state[1])
|
| 84 |
np.random.set_state(self.state[2])
|
| 85 |
+
torch.backends.cudnn.deterministic = self.state[3]
|
| 86 |
+
torch.backends.cudnn.benchmark = self.state[4]
|
| 87 |
|
| 88 |
|
| 89 |
if __name__ == "__main__":
|
modules/webui/app.py
CHANGED
|
@@ -16,11 +16,6 @@ from modules.webui.readme_tab import create_readme_tab
|
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
-
logging.basicConfig(
|
| 20 |
-
level=os.getenv("LOG_LEVEL", "INFO"),
|
| 21 |
-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 22 |
-
)
|
| 23 |
-
|
| 24 |
|
| 25 |
def webui_init():
|
| 26 |
# fix: If the system proxy is enabled in the Windows system, you need to skip these
|
|
|
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
def webui_init():
|
| 21 |
# fix: If the system proxy is enabled in the Windows system, you need to skip these
|
modules/webui/speaker/speaker_creator.py
CHANGED
|
@@ -61,7 +61,7 @@ def create_spk_from_seed(
|
|
| 61 |
desc: str,
|
| 62 |
):
|
| 63 |
chat_tts = load_chat_tts()
|
| 64 |
-
with SeedContext(seed):
|
| 65 |
emb = chat_tts.sample_random_speaker()
|
| 66 |
spk = Speaker(seed=-2, name=name, gender=gender, describe=desc)
|
| 67 |
spk.emb = emb
|
|
@@ -118,7 +118,7 @@ def speaker_creator_ui():
|
|
| 118 |
with gr.Row():
|
| 119 |
current_seed = gr.Label(label="Current Seed", value=-1)
|
| 120 |
with gr.Column(scale=4):
|
| 121 |
-
output_audio = gr.Audio(label="Output Audio")
|
| 122 |
|
| 123 |
test_voice_btn.click(
|
| 124 |
fn=test_spk_voice,
|
|
|
|
| 61 |
desc: str,
|
| 62 |
):
|
| 63 |
chat_tts = load_chat_tts()
|
| 64 |
+
with SeedContext(seed, True):
|
| 65 |
emb = chat_tts.sample_random_speaker()
|
| 66 |
spk = Speaker(seed=-2, name=name, gender=gender, describe=desc)
|
| 67 |
spk.emb = emb
|
|
|
|
| 118 |
with gr.Row():
|
| 119 |
current_seed = gr.Label(label="Current Seed", value=-1)
|
| 120 |
with gr.Column(scale=4):
|
| 121 |
+
output_audio = gr.Audio(label="Output Audio", format="mp3")
|
| 122 |
|
| 123 |
test_voice_btn.click(
|
| 124 |
fn=test_spk_voice,
|
modules/webui/speaker/speaker_merger.py
CHANGED
|
@@ -204,7 +204,9 @@ def create_speaker_merger():
|
|
| 204 |
value="说话人合并测试 123456789 [uv_break] ok, test done [lbreak]",
|
| 205 |
)
|
| 206 |
|
| 207 |
-
output_audio = gr.Audio(
|
|
|
|
|
|
|
| 208 |
|
| 209 |
with gr.Column(scale=1):
|
| 210 |
with gr.Group():
|
|
|
|
| 204 |
value="说话人合并测试 123456789 [uv_break] ok, test done [lbreak]",
|
| 205 |
)
|
| 206 |
|
| 207 |
+
output_audio = gr.Audio(
|
| 208 |
+
label="Output Audio", format="mp3"
|
| 209 |
+
)
|
| 210 |
|
| 211 |
with gr.Column(scale=1):
|
| 212 |
with gr.Group():
|
modules/webui/ssml_tab.py
CHANGED
|
@@ -44,7 +44,7 @@ def create_ssml_interface():
|
|
| 44 |
inputs=[ssml_input],
|
| 45 |
)
|
| 46 |
|
| 47 |
-
ssml_output = gr.Audio(label="Generated Audio")
|
| 48 |
|
| 49 |
ssml_button.click(
|
| 50 |
synthesize_ssml,
|
|
|
|
| 44 |
inputs=[ssml_input],
|
| 45 |
)
|
| 46 |
|
| 47 |
+
ssml_output = gr.Audio(label="Generated Audio", format="mp3")
|
| 48 |
|
| 49 |
ssml_button.click(
|
| 50 |
synthesize_ssml,
|
modules/webui/tts_tab.py
CHANGED
|
@@ -204,7 +204,7 @@ def create_tts_interface():
|
|
| 204 |
|
| 205 |
with gr.Group():
|
| 206 |
gr.Markdown("🎨Output")
|
| 207 |
-
tts_output = gr.Audio(label="Generated Audio")
|
| 208 |
with gr.Column(scale=1):
|
| 209 |
with gr.Group():
|
| 210 |
gr.Markdown("🎶Refiner")
|
|
@@ -220,10 +220,9 @@ def create_tts_interface():
|
|
| 220 |
value=False, label="Disable Normalize"
|
| 221 |
)
|
| 222 |
|
| 223 |
-
|
| 224 |
-
with gr.Group(visible=webui_config.experimental):
|
| 225 |
gr.Markdown("💪🏼Enhance")
|
| 226 |
-
enable_enhance = gr.Checkbox(value=
|
| 227 |
enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise")
|
| 228 |
tts_button = gr.Button(
|
| 229 |
"🔊Generate Audio",
|
|
|
|
| 204 |
|
| 205 |
with gr.Group():
|
| 206 |
gr.Markdown("🎨Output")
|
| 207 |
+
tts_output = gr.Audio(label="Generated Audio", format="mp3")
|
| 208 |
with gr.Column(scale=1):
|
| 209 |
with gr.Group():
|
| 210 |
gr.Markdown("🎶Refiner")
|
|
|
|
| 220 |
value=False, label="Disable Normalize"
|
| 221 |
)
|
| 222 |
|
| 223 |
+
with gr.Group():
|
|
|
|
| 224 |
gr.Markdown("💪🏼Enhance")
|
| 225 |
+
enable_enhance = gr.Checkbox(value=True, label="Enable Enhance")
|
| 226 |
enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise")
|
| 227 |
tts_button = gr.Button(
|
| 228 |
"🔊Generate Audio",
|
modules/webui/webui_utils.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from typing import Union
|
| 2 |
import numpy as np
|
| 3 |
|
|
@@ -23,6 +24,9 @@ from modules import refiner
|
|
| 23 |
from modules.utils import audio
|
| 24 |
from modules.SentenceSplitter import SentenceSplitter
|
| 25 |
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def get_speakers():
|
| 28 |
return speaker_mgr.list_speakers()
|
|
@@ -67,22 +71,23 @@ def segments_length_limit(
|
|
| 67 |
@torch.inference_mode()
|
| 68 |
@spaces.GPU
|
| 69 |
def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
| 86 |
return audio_data, int(sr)
|
| 87 |
|
| 88 |
|
|
@@ -111,10 +116,12 @@ def synthesize_ssml(ssml: str, batch_size=4):
|
|
| 111 |
audio_segments = synthesize.synthesize_segments(segments)
|
| 112 |
combined_audio = combine_audio_segments(audio_segments)
|
| 113 |
|
| 114 |
-
|
| 115 |
|
|
|
|
| 116 |
|
| 117 |
-
|
|
|
|
| 118 |
@spaces.GPU
|
| 119 |
def tts_generate(
|
| 120 |
text,
|
|
@@ -186,7 +193,6 @@ def tts_generate(
|
|
| 186 |
audio_data, sample_rate = apply_audio_enhance(
|
| 187 |
audio_data, sample_rate, enable_denoise, enable_enhance
|
| 188 |
)
|
| 189 |
-
|
| 190 |
audio_data = audio.audio_to_int16(audio_data)
|
| 191 |
return sample_rate, audio_data
|
| 192 |
|
|
|
|
| 1 |
+
import io
|
| 2 |
from typing import Union
|
| 3 |
import numpy as np
|
| 4 |
|
|
|
|
| 24 |
from modules.utils import audio
|
| 25 |
from modules.SentenceSplitter import SentenceSplitter
|
| 26 |
|
| 27 |
+
from pydub import AudioSegment
|
| 28 |
+
import torch.profiler
|
| 29 |
+
|
| 30 |
|
| 31 |
def get_speakers():
|
| 32 |
return speaker_mgr.list_speakers()
|
|
|
|
| 71 |
@torch.inference_mode()
|
| 72 |
@spaces.GPU
|
| 73 |
def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
|
| 74 |
+
if not enable_denoise and not enable_enhance:
|
| 75 |
+
return audio_data, sr
|
| 76 |
+
|
| 77 |
+
device = devices.device
|
| 78 |
+
# NOTE: 这里很奇怪按道理得放到 device 上,但是 enhancer 做 chunk 的时候会报错...所以得 cpu()
|
| 79 |
+
tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
|
| 80 |
+
enhancer = load_enhancer(device)
|
| 81 |
+
|
| 82 |
+
if enable_enhance:
|
| 83 |
+
lambd = 0.9 if enable_denoise else 0.1
|
| 84 |
+
tensor, sr = enhancer.enhance(
|
| 85 |
+
tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
|
| 86 |
+
)
|
| 87 |
+
elif enable_denoise:
|
| 88 |
+
tensor, sr = enhancer.denoise(tensor, sr)
|
| 89 |
+
|
| 90 |
+
audio_data = tensor.cpu().numpy()
|
| 91 |
return audio_data, int(sr)
|
| 92 |
|
| 93 |
|
|
|
|
| 116 |
audio_segments = synthesize.synthesize_segments(segments)
|
| 117 |
combined_audio = combine_audio_segments(audio_segments)
|
| 118 |
|
| 119 |
+
sr, audio_data = audio.pydub_to_np(combined_audio)
|
| 120 |
|
| 121 |
+
return sr, audio_data
|
| 122 |
|
| 123 |
+
|
| 124 |
+
# @torch.inference_mode()
|
| 125 |
@spaces.GPU
|
| 126 |
def tts_generate(
|
| 127 |
text,
|
|
|
|
| 193 |
audio_data, sample_rate = apply_audio_enhance(
|
| 194 |
audio_data, sample_rate, enable_denoise, enable_enhance
|
| 195 |
)
|
|
|
|
| 196 |
audio_data = audio.audio_to_int16(audio_data)
|
| 197 |
return sample_rate, audio_data
|
| 198 |
|
webui.py
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from modules.devices import devices
|
| 3 |
from modules.utils import env
|
| 4 |
from modules.webui import webui_config
|
|
|
|
| 1 |
import os
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
# logging.basicConfig(
|
| 5 |
+
# level=os.getenv("LOG_LEVEL", "INFO"),
|
| 6 |
+
# format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 7 |
+
# )
|
| 8 |
+
|
| 9 |
from modules.devices import devices
|
| 10 |
from modules.utils import env
|
| 11 |
from modules.webui import webui_config
|