fcyai committed
Commit 01a69aa · 1 Parent(s): 623d773
ChatTTS/ChatTTS/core.py CHANGED
@@ -1,10 +1,9 @@
-
 import os
 import json
 import logging
-from functools import partial
-from typing import Literal
 import tempfile
+from functools import partial
+from typing import Literal, Optional
 
 import torch
 from omegaconf import OmegaConf
@@ -15,19 +14,19 @@ from .model.dvae import DVAE
 from .model.gpt import GPT_warpper
 from .utils.gpu_utils import select_device
 from .utils.infer_utils import count_invalid_characters, detect_language, apply_character_map, apply_half2full_map, HomophonesReplacer
-from .utils.io_utils import get_latest_modified_file
+from .utils.io import get_latest_modified_file, del_all
 from .infer.api import refine_text, infer_code
 from .utils.download import check_all_assets, download_all_assets
-
-logging.basicConfig(level = logging.INFO)
+from .utils.log import set_utils_logger
 
 
 class Chat:
-    def __init__(self, ):
+    def __init__(self, logger=logging.getLogger(__name__)):
         self.pretrain_models = {}
         self.normalizer = {}
         self.homophones_replacer = None
-        self.logger = logging.getLogger(__name__)
+        self.logger = logger
+        set_utils_logger(logger)
 
     def check_model(self, level = logging.INFO, use_decoder = False):
         not_finish = False
@@ -45,7 +44,7 @@ class Chat:
 
         if not not_finish:
             self.logger.log(level, f'All initialized.')
 
         return not not_finish
 
     def load_models(
@@ -61,8 +60,8 @@ class Chat:
            with tempfile.TemporaryDirectory() as tmp:
                download_all_assets(tmpdir=tmp)
                if not check_all_assets(update=False):
-                    logging.error("counld not satisfy all assets needed.")
-                    exit(1)
+                    self.logger.error("counld not satisfy all assets needed.")
+                    return False
        elif source == 'huggingface':
            hf_home = os.getenv('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
            try:
@@ -78,7 +77,7 @@ class Chat:
            self.logger.log(logging.INFO, f'Load from local: {custom_path}')
            download_path = custom_path
 
-        self._load(**{k: os.path.join(download_path, v) for k, v in OmegaConf.load(os.path.join(download_path, 'config', 'path.yaml')).items()}, **kwargs)
+        return self._load(**{k: os.path.join(download_path, v) for k, v in OmegaConf.load(os.path.join(download_path, 'config', 'path.yaml')).items()}, **kwargs)
 
    def _load(
        self,
@@ -91,17 +90,18 @@ class Chat:
        decoder_config_path: str = None,
        decoder_ckpt_path: str = None,
        tokenizer_path: str = None,
-        device: str = None,
+        device: Optional[torch.device] = None,
        compile: bool = True,
    ):
-        if not device:
+        if device is None:
            device = select_device(4096)
            self.logger.log(logging.INFO, f'use {device}')
-
+        self.device = device
+
        if vocos_config_path:
            vocos = Vocos.from_hparams(vocos_config_path).to(
                # vocos on mps will crash, use cpu fallback
-                "cpu" if torch.backends.mps.is_available() else device
+                "cpu" if "mps" in str(device) else device
            ).eval()
            assert vocos_ckpt_path, 'vocos_ckpt_path should not be None'
            vocos.load_state_dict(torch.load(vocos_ckpt_path))
@@ -118,14 +118,14 @@ class Chat:
 
        if gpt_config_path:
            cfg = OmegaConf.load(gpt_config_path)
-            gpt = GPT_warpper(**cfg).to(device).eval()
+            gpt = GPT_warpper(**cfg, device=device, logger=self.logger).eval()
            assert gpt_ckpt_path, 'gpt_ckpt_path should not be None'
            gpt.load_state_dict(torch.load(gpt_ckpt_path))
            if compile and 'cuda' in str(device):
                try:
                    gpt.gpt.forward = torch.compile(gpt.gpt.forward, backend='inductor', dynamic=True)
                except RuntimeError as e:
-                    logging.warning(f'Compile failed,{e}. fallback to normal mode.')
+                    self.logger.warning(f'Compile failed,{e}. fallback to normal mode.')
            self.pretrain_models['gpt'] = gpt
            spk_stat_path = os.path.join(os.path.dirname(gpt_ckpt_path), 'spk_stat.pt')
            assert os.path.exists(spk_stat_path), f'Missing spk_stat.pt: {spk_stat_path}'
@@ -146,7 +146,7 @@ class Chat:
            self.pretrain_models['tokenizer'] = tokenizer
            self.logger.log(logging.INFO, 'tokenizer loaded.')
 
-        self.check_model()
+        return self.check_model()
 
    def _infer(
        self,
@@ -179,14 +179,16 @@ class Chat:
                    self.logger.log(logging.WARNING, f'Invalid characters found! : {invalid_characters}')
                    text[i] = apply_character_map(t)
                if do_homophone_replacement and self.init_homophones_replacer():
-                    text[i] = self.homophones_replacer.replace(t)
-                    if t != text[i]:
-                        self.logger.log(logging.INFO, f'Homophones replace: {t} -> {text[i]}')
+                    text[i], replaced_words = self.homophones_replacer.replace(text[i])
+                    if replaced_words:
+                        repl_res = ', '.join([f'{_[0]}->{_[1]}' for _ in replaced_words])
+                        self.logger.log(logging.INFO, f'Homophones replace: {repl_res}')
 
        if not skip_refine_text:
            text_tokens = refine_text(
                self.pretrain_models,
                text,
+                device=self.device,
                **params_refine_text,
            )['ids']
            text_tokens = [i[i < self.pretrain_models['tokenizer'].convert_tokens_to_ids('[break_0]')] for i in text_tokens]
@@ -197,16 +199,28 @@ class Chat:
 
        text = [params_infer_code.get('prompt', '') + i for i in text]
        params_infer_code.pop('prompt', '')
-        result_gen = infer_code(self.pretrain_models, text, **params_infer_code, return_hidden=use_decoder, stream=stream)
+        result_gen = infer_code(
+            self.pretrain_models,
+            text,
+            device=self.device,
+            **params_infer_code,
+            return_hidden=use_decoder,
+            stream=stream,
+        )
        if use_decoder:
            field = 'hiddens'
            docoder_name = 'decoder'
        else:
            field = 'ids'
            docoder_name = 'dvae'
-        vocos_decode = lambda spec: [self.pretrain_models['vocos'].decode(
-            i.cpu() if torch.backends.mps.is_available() else i
-        ).cpu().numpy() for i in spec]
+        if "mps" in str(self.device):
+            vocos_decode = lambda spec: [self.pretrain_models['vocos'].decode(
+                i.cpu()
+            ).cpu().numpy() for i in spec]
+        else:
+            vocos_decode = lambda spec: [self.pretrain_models['vocos'].decode(
+                i
+            ).cpu().numpy() for i in spec]
        if stream:
 
            length = 0
@@ -220,13 +234,20 @@ class Chat:
                if not len(chunk_data):
                    continue
                self.logger.debug(f'new hidden {len(chunk_data)=}')
-                mel_spec = [self.pretrain_models[docoder_name](i[None].permute(0,2,1)) for i in [chunk_data]]
+                mel_spec = [self.pretrain_models[docoder_name](i[None].permute(0,2,1).to(self.device)) for i in [chunk_data]]
+                del_all(result)
+                del chunk_data
                wav = vocos_decode(mel_spec)
+                del_all(mel_spec)
                self.logger.debug(f'yield wav chunk {len(wav[0])=} {len(wav[0][0])=}')
                yield wav
            return
-        mel_spec = [self.pretrain_models[docoder_name](i[None].permute(0,2,1)) for i in next(result_gen)[field]]
-        yield vocos_decode(mel_spec)
+        result = next(result_gen)
+        mel_spec = [self.pretrain_models[docoder_name](i[None].permute(0,2,1).to(self.device)) for i in result[field]]
+        del_all(result)
+        wav = vocos_decode(mel_spec)
+        del_all(mel_spec)
+        yield wav
 
    def infer(
        self,
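The user-visible upshot of the core.py changes: `Chat()` now accepts an injectable logger, and `load_models()` reports failure by returning `False` (propagating `check_model()`'s result) instead of calling `exit(1)`. A minimal usage sketch of the new contract (the logger name is arbitrary):

```python
import logging
import ChatTTS

# The library no longer calls logging.basicConfig() at import time,
# so configure logging yourself and hand a logger to Chat.
logging.basicConfig(level=logging.INFO)
chat = ChatTTS.Chat(logging.getLogger("ChatTTS"))

# load_models() now returns a bool instead of exiting the process
# when assets cannot be downloaded or verified.
if not chat.load_models():
    raise RuntimeError("ChatTTS models failed to load")
```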
ChatTTS/ChatTTS/infer/api.py CHANGED
@@ -2,7 +2,10 @@
 import torch
 import torch.nn.functional as F
 from transformers.generation import TopKLogitsWarper, TopPLogitsWarper
+
 from ..utils.infer_utils import CustomRepetitionPenaltyLogitsProcessorRepeat
+from ..utils.io import del_all
+from ..model.gpt import GPT_warpper
 
 def infer_code(
     models,
@@ -14,39 +17,42 @@ def infer_code(
     repetition_penalty = 1.05,
     max_new_token = 2048,
     stream=False,
+    device="cpu",
     **kwargs
 ):
-
-    device = next(models['gpt'].parameters()).device
-
+
+    gpt: GPT_warpper = models['gpt']
+
     if not isinstance(text, list):
         text = [text]
 
     if not isinstance(temperature, list):
-        temperature = [temperature] * models['gpt'].num_vq
+        temperature = [temperature] * gpt.num_vq
 
     if spk_emb is not None:
         text = [f'[Stts][spk_emb]{i}[Ptts]' for i in text]
     else:
         text = [f'[Stts][empty_spk]{i}[Ptts]' for i in text]
 
-    text_token = models['tokenizer'](text, return_tensors='pt', add_special_tokens=False, padding=True).to(device)
-    input_ids = text_token['input_ids'][...,None].expand(-1, -1, models['gpt'].num_vq)
-    text_mask = torch.ones(text_token['input_ids'].shape, dtype=bool, device=device)
-
-    inputs = {
-        'input_ids': input_ids,
-        'text_mask': text_mask,
-        'attention_mask': text_token['attention_mask'],
-    }
+    text_token_tmp = models['tokenizer'](text, return_tensors='pt', add_special_tokens=False, padding=True)
+    text_token = text_token_tmp.to(device)
+    del text_token_tmp
+    input_ids = text_token['input_ids'][...,None].expand(-1, -1, gpt.num_vq).to(gpt.device_gpt)
+    text_mask = torch.ones(text_token['input_ids'].shape, dtype=bool, device=gpt.device_gpt)
+
+    emb = gpt.get_emb(
+        input_ids=input_ids,
+        text_mask=text_mask,
+    )
+    del text_mask
 
-    emb = models['gpt'].get_emb(**inputs)
     if spk_emb is not None:
-        emb[inputs['input_ids'][..., 0] == models['tokenizer'].convert_tokens_to_ids('[spk_emb]')] = \
-        F.normalize(spk_emb.to(device).to(emb.dtype)[None].expand(len(text), -1), p=2.0, dim=1, eps=1e-12)
-
-    num_code = models['gpt'].emb_code[0].num_embeddings - 1
-
+        n = F.normalize(spk_emb.to(emb.dtype)[None].expand(len(text), -1), p=2.0, dim=1, eps=1e-12).to(gpt.device_gpt)
+        emb[input_ids[..., 0] == models['tokenizer'].convert_tokens_to_ids('[spk_emb]')] = n
+        del n
+
+    num_code = int(gpt.emb_code[0].num_embeddings - 1)
+
     LogitsWarpers = []
     if top_P is not None:
         LogitsWarpers.append(TopPLogitsWarper(top_P, min_tokens_to_keep=3))
@@ -58,10 +64,10 @@ def infer_code(
         LogitsProcessors.append(CustomRepetitionPenaltyLogitsProcessorRepeat(\
             repetition_penalty, num_code, 16))
 
-    result = models['gpt'].generate(
-        emb, inputs['input_ids'],
+    result = gpt.generate(
+        emb, input_ids,
         temperature = torch.tensor(temperature, device=device),
-        attention_mask = inputs['attention_mask'],
+        attention_mask = text_token['attention_mask'],
         LogitsWarpers = LogitsWarpers,
         LogitsProcessors = LogitsProcessors,
         eos_token = num_code,
@@ -71,6 +77,11 @@ def infer_code(
         **kwargs
     )
 
+    del_all(text_token)
+    del emb, text_token, input_ids
+    del_all(LogitsWarpers)
+    del_all(LogitsProcessors)
+
     return result
 
 
@@ -83,11 +94,12 @@ def refine_text(
     repetition_penalty = 1.0,
     max_new_token = 384,
     prompt = '',
+    device="cpu",
     **kwargs
 ):
-
-    device = next(models['gpt'].parameters()).device
-
+
+    gpt: GPT_warpper = models['gpt']
+
     if not isinstance(text, list):
         text = [text]
 
@@ -97,11 +109,7 @@ def refine_text(
     text_token = models['tokenizer'](text, return_tensors='pt', add_special_tokens=False, padding=True).to(device)
     text_mask = torch.ones(text_token['input_ids'].shape, dtype=bool, device=device)
 
-    inputs = {
-        'input_ids': text_token['input_ids'][...,None].expand(-1, -1, models['gpt'].num_vq),
-        'text_mask': text_mask,
-        'attention_mask': text_token['attention_mask'],
-    }
+    input_ids = text_token['input_ids'][...,None].expand(-1, -1, gpt.num_vq)
 
     LogitsWarpers = []
     if top_P is not None:
@@ -112,11 +120,17 @@ def refine_text(
     LogitsProcessors = []
     if repetition_penalty is not None and repetition_penalty != 1:
         LogitsProcessors.append(CustomRepetitionPenaltyLogitsProcessorRepeat(repetition_penalty, len(models['tokenizer']), 16))
-
-    result = models['gpt'].generate(
-        models['gpt'].get_emb(**inputs), inputs['input_ids'],
+
+    emb = gpt.get_emb(
+        input_ids=input_ids,
+        text_mask=text_mask,
+    )
+    del text_mask
+
+    result = gpt.generate(
+        emb, input_ids,
         temperature = torch.tensor([temperature,], device=device),
-        attention_mask = inputs['attention_mask'],
+        attention_mask = text_token['attention_mask'],
         LogitsWarpers = LogitsWarpers,
         LogitsProcessors = LogitsProcessors,
         eos_token = torch.tensor(models['tokenizer'].convert_tokens_to_ids('[Ebreak]'), device=device)[None],
@@ -125,4 +139,10 @@ def refine_text(
         stream = False,
         **kwargs
     )
+
+    del_all(text_token)
+    del emb, text_token, input_ids
+    del_all(LogitsWarpers)
+    del_all(LogitsProcessors)
+
     return next(result)
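`infer_code` and `refine_text` no longer derive the device from the GPT's parameters; `Chat._infer` passes `self.device` explicitly, which keeps tokenized inputs on one device while the Llama backbone may sit on `gpt.device_gpt` (CPU when the selected device is MPS). A sketch of a device picker matching that convention — `pick_device` is my name, not the repo's; the real selection lives in `utils.gpu_utils.select_device`:

```python
import torch

def pick_device() -> torch.device:
    # mirrors the spirit of select_device(), minus the free-VRAM check
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

# hypothetical call shape, mirroring Chat._infer after this commit:
# result_gen = infer_code(models, texts, device=pick_device(),
#                         return_hidden=use_decoder, stream=stream)
```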
ChatTTS/ChatTTS/model/dvae.py CHANGED
@@ -1,5 +1,4 @@
 import math
-from einops import rearrange
 from vector_quantize_pytorch import GroupedResidualFSQ
 
 import torch
@@ -66,23 +65,32 @@ class GFSQ(nn.Module):
         self.G = G
         self.R = R
 
-    def _embed(self, x):
+    def _embed(self, x: torch.Tensor):
         if self.transpose:
             x = x.transpose(1,2)
+        """
         x = rearrange(
             x, "b t (g r) -> g b t r", g = self.G, r = self.R,
         )
+        """
+        x.view(-1, self.G, self.R).permute(2, 0, 1, 3)
         feat = self.quantizer.get_output_from_indices(x)
         return feat.transpose(1,2) if self.transpose else feat
 
     def forward(self, x,):
         if self.transpose:
             x = x.transpose(1,2)
         feat, ind = self.quantizer(x)
+        """
         ind = rearrange(
             ind, "g b t r ->b t (g r)",
         )
+        """
+        ind = ind.permute(1, 2, 0, 3).contiguous()
+        ind = ind.view(ind.size(0), ind.size(1), -1)
-        embed_onehot = F.one_hot(ind.long(), self.n_ind).to(x.dtype)
+        embed_onehot_tmp = F.one_hot(ind.long(), self.n_ind)
+        embed_onehot = embed_onehot_tmp.to(x.dtype)
+        del embed_onehot_tmp
         e_mean = torch.mean(embed_onehot, dim=[0,1])
         e_mean = e_mean / (e_mean.sum(dim=1) + self.eps).unsqueeze(1)
         perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + self.eps), dim=1))
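Throughout this commit the einops dependency is dropped and each `rearrange` is rewritten with `permute`/`view`, the original call being kept nearby as a docstring or comment. One caution on the `_embed` hunk above: the replacement line `x.view(-1, self.G, self.R).permute(2, 0, 1, 3)` discards its result, and a 3-D view cannot be permuted with four indices, so as written it does not appear to reproduce `"b t (g r) -> g b t r"`. A standalone sketch of the intended equivalences, assuming `x` has shape `(b, t, g*r)` as in the forward path:

```python
import torch

def split_groups(x: torch.Tensor, G: int, R: int) -> torch.Tensor:
    # einops: rearrange(x, "b t (g r) -> g b t r")
    b, t, _ = x.shape
    return x.view(b, t, G, R).permute(2, 0, 1, 3)

def merge_groups(ind: torch.Tensor) -> torch.Tensor:
    # einops: rearrange(ind, "g b t r -> b t (g r)"), as done in forward()
    g, b, t, r = ind.shape
    return ind.permute(1, 2, 0, 3).contiguous().view(b, t, g * r)

x = torch.arange(2 * 3 * 8).view(2, 3, 8)   # b=2, t=3, g*r=8
y = split_groups(x, G=2, R=4)               # shape (2, 2, 3, 4)
assert merge_groups(y).equal(x)             # round-trips exactly
```

The `forward` path (permute + contiguous + view) matches `merge_groups` above; only the `_embed` side looks suspect.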
ChatTTS/ChatTTS/model/gpt.py CHANGED
@@ -2,8 +2,10 @@ import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 import logging
+from typing import Union
+
+
 from tqdm import tqdm
-from einops import rearrange
 from transformers.cache_utils import Cache
 
 import torch
@@ -12,8 +14,10 @@ import torch.nn.functional as F
 import torch.nn.utils.parametrize as P
 from torch.nn.utils.parametrizations import weight_norm
 from transformers import LlamaModel, LlamaConfig
-
-
+
+from ..utils.io import del_all
+
+
 class LlamaMLP(nn.Module):
     def __init__(self, hidden_size, intermediate_size):
         super().__init__()
@@ -36,40 +40,67 @@ class GPT_warpper(nn.Module):
         num_audio_tokens,
         num_text_tokens,
         num_vq=4,
+        device="cpu",
+        logger=logging.getLogger(__name__)
     ):
         super().__init__()
 
-        self.logger = logging.getLogger(__name__)
-        self.gpt = self.build_model(gpt_config)
+        self.logger = logger
+        self.device = device
+        self.device_gpt = device if "mps" not in str(device) else "cpu"
+        self.num_vq = num_vq
+
+        self.gpt = self.build_model(gpt_config, self.device_gpt)
         self.model_dim = self.gpt.config.hidden_size
-
-        self.num_vq = num_vq
-        self.emb_code = nn.ModuleList([nn.Embedding(num_audio_tokens, self.model_dim) for i in range(self.num_vq)])
-        self.emb_text = nn.Embedding(num_text_tokens, self.model_dim)
-        self.head_text = weight_norm(nn.Linear(self.model_dim, num_text_tokens, bias=False), name='weight')
-        self.head_code = nn.ModuleList([weight_norm(nn.Linear(self.model_dim, num_audio_tokens, bias=False), name='weight') for i in range(self.num_vq)])
+        self.emb_code = nn.ModuleList(
+            [nn.Embedding(
+                num_audio_tokens, self.model_dim, device=self.device_gpt,
+            ) for _ in range(num_vq)],
+        )
+        self.emb_text = nn.Embedding(num_text_tokens, self.model_dim, device=self.device_gpt)
+
+        self.head_text = weight_norm(
+            nn.Linear(
+                self.model_dim, num_text_tokens, bias=False, device=device,
+            ),
+            name='weight',
+        )
+        self.head_code = nn.ModuleList(
+            [weight_norm(
+                nn.Linear(
+                    self.model_dim, num_audio_tokens, bias=False, device=device,
+                ),
+                name='weight',
+            ) for _ in range(self.num_vq)],
+        )
 
-    def build_model(self, config):
+    def build_model(self, config, device):
 
         configuration = LlamaConfig(**config)
         model = LlamaModel(configuration)
         del model.embed_tokens
 
-        return model
-
-    def get_emb(self, input_ids, text_mask, **kwargs):
+        return model.to(device)
 
-        emb_text = self.emb_text(input_ids[text_mask][:, 0])
+    def get_emb(self, input_ids, text_mask):
 
-        emb_code = [self.emb_code[i](input_ids[~text_mask][:, i]) for i in range(self.num_vq)]
+        emb_text = self.emb_text(input_ids[text_mask][:, 0].to(self.device_gpt))
+
+        text_mask_inv = ~text_mask
+        masked_input_ids = input_ids[text_mask_inv].to(self.device_gpt)
+        del text_mask_inv
+
+        emb_code = [self.emb_code[i](masked_input_ids[:, i]) for i in range(self.num_vq)]
         emb_code = torch.stack(emb_code, 2).sum(2)
 
         emb = torch.zeros((input_ids.shape[:-1])+(emb_text.shape[-1],), device=emb_text.device, dtype=emb_text.dtype)
         emb[text_mask] = emb_text
         emb[~text_mask] = emb_code.to(emb.dtype)
 
+        del emb_text, emb_code
+
         return emb
 
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
     ):
@@ -157,7 +188,7 @@ class GPT_warpper(nn.Module):
         emb,
         inputs_ids,
         temperature,
-        eos_token,
+        eos_token: Union[int, torch.Tensor],
         attention_mask = None,
         max_new_token = 2048,
         min_new_token = 0,
@@ -177,8 +208,8 @@ class GPT_warpper(nn.Module):
         start_idx, end_idx = inputs_ids.shape[1], torch.zeros(inputs_ids.shape[0], device=inputs_ids.device, dtype=torch.long)
         finish = torch.zeros(inputs_ids.shape[0], device=inputs_ids.device).bool()
 
-        temperature = temperature[None].expand(inputs_ids.shape[0], -1)
-        temperature = rearrange(temperature, "b n -> (b n) 1")
+        temperature = temperature.unsqueeze_(0).expand(inputs_ids.shape[0], -1).contiguous().view(-1, 1)
+        # temperature = rearrange(temperature, "b n -> (b n) 1")
 
         attention_mask_cache = torch.ones((inputs_ids.shape[0], inputs_ids.shape[1]+max_new_token,), dtype=torch.bool, device=inputs_ids.device)
         if attention_mask is not None:
@@ -189,7 +220,6 @@ class GPT_warpper(nn.Module):
         past_key_values = None
 
         for i in range(max_new_token):
-            pbar.update(1)
             model_input = self.prepare_inputs_for_generation(
                 inputs_ids,
                 past_key_values,
@@ -200,17 +230,26 @@ class GPT_warpper(nn.Module):
             if i == 0:
                 model_input['inputs_embeds'] = emb
             else:
+                inputs_ids_emb = model_input['input_ids'].to(self.device_gpt)
                 if infer_text:
-                    model_input['inputs_embeds'] = self.emb_text(model_input['input_ids'][:,:,0])
+                    model_input['inputs_embeds'] = self.emb_text(inputs_ids_emb[:,:,0])
                 else:
-                    code_emb = [self.emb_code[i](model_input['input_ids'][:,:,i]) for i in range(self.num_vq)]
+                    code_emb = [self.emb_code[i](inputs_ids_emb[:,:,i]) for i in range(self.num_vq)]
                     model_input['inputs_embeds'] = torch.stack(code_emb, 3).sum(3)
-
-            model_input['input_ids'] = None
-            outputs = self.gpt.forward(**model_input, output_attentions=return_attn)
-            del model_input
+                del inputs_ids_emb, model_input['input_ids']
+
+            outputs = self.gpt.forward(
+                attention_mask=model_input["attention_mask"].to(self.device_gpt),
+                position_ids=model_input["position_ids"].to(self.device_gpt),
+                past_key_values=model_input["past_key_values"],
+                inputs_embeds=model_input['inputs_embeds'].to(self.device_gpt),
+                use_cache=model_input['use_cache'],
+                output_attentions=return_attn,
+                cache_position=model_input['cache_position'].to(self.device_gpt),
+            )
+            del_all(model_input)
             attentions.append(outputs.attentions)
-            hidden_states = outputs[0] # 🐻
+            hidden_states = outputs[0].to(self.device) # 🐻
             past_key_values = outputs.past_key_values
             del outputs
             if return_hidden:
@@ -225,8 +264,14 @@ class GPT_warpper(nn.Module):
             logits = logits[:, -1].float()
 
             if not infer_text:
-                logits = rearrange(logits, "b c n -> (b n) c")
-                logits_token = rearrange(inputs_ids[:, start_idx:], "b c n -> (b n) c")
+                # logits = rearrange(logits, "b c n -> (b n) c")
+                logits = logits.permute(0, 2, 1)
+                logits = logits.reshape(-1, logits.size(2))
+                # logits_token = rearrange(inputs_ids[:, start_idx:], "b c n -> (b n) c")
+                inputs_ids_sliced = inputs_ids[:, start_idx:].permute(0, 2, 1)
+                logits_token = inputs_ids_sliced.reshape(
+                    inputs_ids_sliced.size(0)*inputs_ids_sliced.size(1), -1,
+                )
             else:
                 logits_token = inputs_ids[:, start_idx:, 0]
 
@@ -247,10 +292,11 @@ class GPT_warpper(nn.Module):
 
             del logits
 
-            idx_next = torch.multinomial(scores, num_samples=1)
+            idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
 
             if not infer_text:
-                idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
+                # idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
+                idx_next = idx_next.view(-1, self.num_vq)
                 finish_or = (idx_next == eos_token).any(1)
                 finish |= finish_or
                 del finish_or
@@ -278,9 +324,11 @@ class GPT_warpper(nn.Module):
                     'attentions': attentions,
                     'hiddens':y_hiddens,
                 }
+
             if finish.all():
                 pbar.update(max_new_token-i-1)
                 break
+            pbar.update(1)
 
         inputs_ids = [inputs_ids[idx, start_idx: start_idx+i] for idx, i in enumerate(end_idx.int())]
         inputs_ids = [i[:, 0] for i in inputs_ids] if infer_text else inputs_ids
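Each removed `rearrange` in gpt.py is replaced by an equivalent `permute`/`view` chain, with the original kept as a comment. A quick standalone check of the temperature case (the values of `b` and `num_vq` are arbitrary):

```python
import torch

b, num_vq = 3, 4
temperature = torch.rand(num_vq)

# patched code (non-in-place variant of unsqueeze_ for clarity):
t_new = temperature.unsqueeze(0).expand(b, -1).contiguous().view(-1, 1)

# what rearrange(temperature[None].expand(b, -1), "b n -> (b n) 1") produced:
t_ref = temperature.repeat(b).view(b * num_vq, 1)
assert torch.equal(t_new, t_ref) and t_new.shape == (b * num_vq, 1)
```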
ChatTTS/ChatTTS/utils/download.py CHANGED
@@ -3,10 +3,8 @@ from pathlib import Path
 import hashlib
 import requests
 from io import BytesIO
-import logging
-
-logger = logging.getLogger(__name__)
 
+from .log import logger
 
 def sha256(f) -> str:
     sha256_hash = hashlib.sha256()
ChatTTS/ChatTTS/utils/gpu_utils.py CHANGED
@@ -1,9 +1,9 @@
 
 import torch
-import logging
+
+from .log import logger
 
 def select_device(min_memory=2048):
-    logger = logging.getLogger(__name__)
     if torch.cuda.is_available():
         available_gpus = []
         for i in range(torch.cuda.device_count()):
ChatTTS/ChatTTS/utils/infer_utils.py CHANGED
@@ -2,7 +2,6 @@
 import re
 import torch
 import torch.nn.functional as F
-import os
 import json
 
 
@@ -76,12 +75,15 @@ class HomophonesReplacer:
 
     def replace(self, text):
         result = []
+        replaced_words = []
         for char in text:
             if char in self.homophones_map:
-                result.append(self.homophones_map[char])
+                repl_char = self.homophones_map[char]
+                result.append(repl_char)
+                replaced_words.append((char, repl_char))
             else:
                 result.append(char)
-        return ''.join(result)
+        return ''.join(result), replaced_words
 
 def count_invalid_characters(s):
 
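`replace` now returns the replacement pairs alongside the text, which is what lets `Chat._infer` log exactly which characters changed. A runnable illustration of the new contract, using a stub map (the real map is loaded from a JSON file by the class):

```python
homophones_map = {"的": "地"}  # stand-in for the JSON-loaded map

def replace(text: str):
    result, replaced_words = [], []
    for char in text:
        if char in homophones_map:
            repl_char = homophones_map[char]
            result.append(repl_char)
            replaced_words.append((char, repl_char))
        else:
            result.append(char)
    return ''.join(result), replaced_words

new_text, pairs = replace("我的书")
assert new_text == "我地书" and pairs == [("的", "地")]
```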
ChatTTS/ChatTTS/utils/io.py ADDED
@@ -0,0 +1,33 @@
+
+import os
+import logging
+from typing import Union
+
+from .log import logger
+
+def get_latest_modified_file(directory):
+
+    files = [os.path.join(directory, f) for f in os.listdir(directory)]
+    if not files:
+        logger.log(logging.WARNING, f'No files found in the directory: {directory}')
+        return None
+    latest_file = max(files, key=os.path.getmtime)
+
+    return latest_file
+
+def del_all(d: Union[dict, list]):
+    if isinstance(d, dict):
+        lst = list(d.keys())
+        for k in lst:
+            x = d.pop(k)
+            if isinstance(x, dict) or isinstance(x, list):
+                del_all(x)
+            del x
+        return
+    elif isinstance(d, list):
+        while len(d):
+            x = d.pop()
+            if isinstance(x, dict) or isinstance(x, list):
+                del_all(x)
+            del x
+        return
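`del_all` is the commit's recurring memory-management helper: it empties nested dicts and lists in place so the tensors they reference can be reclaimed promptly, which matters on CUDA/MPS where allocations linger while still referenced. A small demonstration of the emptying behavior:

```python
from ChatTTS.utils.io import del_all

nested = {"ids": [1, 2], "meta": {"hiddens": [3, 4]}}
del_all(nested)
assert nested == {}  # every level was emptied in place
```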
ChatTTS/ChatTTS/utils/log.py ADDED
@@ -0,0 +1,8 @@
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(Path(__file__).parent.name)
+
+def set_utils_logger(l: logging.Logger):
+    global logger
+    logger = l
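`ChatTTS.utils.log` holds one shared logger that `Chat.__init__` swaps out via `set_utils_logger`. One subtlety worth knowing: rebinding the module global only affects code that looks the logger up through the module object; a name bound earlier via `from .log import logger` keeps its original reference. Sketch:

```python
import logging
from ChatTTS.utils import log

custom = logging.getLogger("app")
log.set_utils_logger(custom)
assert log.logger is custom
# But any `from .log import logger` executed before the swap still points
# at the old Logger object -- only module-attribute lookups see the change.
```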
ChatTTS/docs/cn/README.md CHANGED
@@ -3,7 +3,7 @@
 <a href="https://trendshift.io/repositories/10489" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10489" alt="2noise%2FChatTTS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 
 # ChatTTS
-一款用于日常对话的生成式语音模型。
+一款适用于日常对话的生成式语音模型。
 
 [![Licence](https://img.shields.io/badge/LICENSE-CC%20BY--NC%204.0-green.svg?style=for-the-badge)](https://github.com/2noise/ChatTTS/blob/main/LICENSE)
 
@@ -14,6 +14,9 @@
 
 </div>
 
+> [!NOTE]
+> 注意此版本可能不是最新版,所有内容请以英文版为准。
+
 ## 简介
 
 ChatTTS 是一款专门为对话场景(例如 LLM 助手)设计的文本转语音模型。
@@ -26,7 +29,7 @@ ChatTTS 是一款专门为对话场景(例如 LLM 助手)设计的文本转
 
 ### 亮点
 
-> 你可以参考 **[Bilibili](https://www.bilibili.com/video/BV1zn4y1o7iV)** 上的这个视频了解详细的介绍.
+> 你可以参考 **[Bilibili](https://www.bilibili.com/video/BV1zn4y1o7iV)** 上的这个视频,了解本项目的详细情况。
 
 1. **对话式 TTS**: ChatTTS 针对对话式任务进行了优化,能够实现自然且富有表现力的合成语音。它支持多个说话者,便于生成互动式对话。
 2. **精细的控制**: 该模型可以预测和控制精细的韵律特征,包括笑声、停顿和插入语。
@@ -34,8 +37,8 @@ ChatTTS 是一款专门为对话场景(例如 LLM 助手)设计的文本转
 
 ### 数据集和模型
 
-- 主要模型使用 100,000+ 小时的中文和英文音频数据进行训练。
-- **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** 上的开源版本是一个在 40,000 小时数据上进行无监督微调的预训练模型。。
+- 主模型使用了 100,000+ 小时的中文和英文音频数据进行训练。
+- **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** 上的开源版本是一个在 40,000 小时数据上进行无监督微调的预训练模型。
 
 ### 路线图
 
@@ -50,7 +53,7 @@
 > [!Important]
 > 此仓库仅供学术用途。
 
-本项目旨在用于教育和研究目的,不应用于任何商业或法律目的。作者不保证信息的准确性、完整性或可靠性。此仓库中使用的信息和数据仅供学术和研究目的。数据来自公开来源,作者不声称对数据拥有任何所有权或版权。
+本项目旨在用于教育和研究目的,不适用于任何商业或法律目的。作者不保证信息的准确性、完整性和可靠性。此仓库中使用的信息和数据仅供学术和研究目的。数据来自公开来源,作者不声称对数据拥有任何所有权或版权。
 
 ChatTTS 是一款强大的文本转语音系统。但是,负责任和道德地使用这项技术非常重要。为了限制 ChatTTS 的使用,我们在 40,000 小时模型的训练过程中添加了少量高频噪声,并使用 MP3 格式尽可能压缩音频质量,以防止恶意行为者将其用于犯罪目的。同时,我们内部训练了一个检测模型,并计划在未来开源它。
 
@@ -60,7 +63,7 @@ ChatTTS 是一款强大的文本转语音系统。但是,负责任和道德地
 
 #### 合作洽谈
 
-如就模型和路线图进行合作洽谈,请发送邮件至 **[email protected]**。
+如需就模型和路线图进行合作洽谈,请发送邮件至 **[email protected]**。
 
 #### 线上讨论
 
@@ -131,7 +134,7 @@ wavs = chat.infer(texts, )
 torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)
 ```
 
-### 高级用法
+### 进阶用法
 
 ```python
 ###################################
@@ -219,10 +222,14 @@
 
 - [Awesome-ChatTTS](https://github.com/libukai/Awesome-ChatTTS) 一个 ChatTTS 的资源汇总列表。
 
-## 感谢所有贡献者的付出
+## 贡献者列表
 
 [![contributors](https://contrib.rocks/image?repo=2noise/ChatTTS)](https://github.com/2noise/ChatTTS/graphs/contributors)
 
-## Star 趋势
-
-[![Star History Chart](https://api.star-history.com/svg?repos=2noise/ChatTTS&type=Date)](https://star-history.com/#2noise/ChatTTS&Date)
+## 项目浏览量
+
+<div align="center">
+
+![counter](https://counter.seku.su/cmoe?name=chattts&theme=mbs)
+
+</div>
ChatTTS/examples/cmd/run.py CHANGED
@@ -13,6 +13,10 @@ import wave
 import ChatTTS
 from IPython.display import Audio
 
+from tools.logger import get_logger
+
+logger = get_logger("Command")
+
 def save_wav_file(wav, index):
     wav_filename = f"output_audio_{index}.wav"
     # Convert numpy array to bytes and write to WAV file
@@ -22,23 +26,26 @@ def save_wav_file(wav, index):
         wf.setsampwidth(2) # Sample width in bytes
         wf.setframerate(24000) # Sample rate in Hz
         wf.writeframes(wav_bytes)
-    print(f"Audio saved to {wav_filename}")
+    logger.info(f"Audio saved to {wav_filename}")
 
 def main():
     # Retrieve text from command line argument
     text_input = sys.argv[1] if len(sys.argv) > 1 else "<YOUR TEXT HERE>"
-    print("Received text input:", text_input)
+    logger.info("Received text input: %s", text_input)
 
-    chat = ChatTTS.Chat()
-    print("Initializing ChatTTS...")
-    chat.load_models()
-    print("Models loaded successfully.")
+    chat = ChatTTS.Chat(get_logger("ChatTTS"))
+    logger.info("Initializing ChatTTS...")
+    if chat.load_models():
+        logger.info("Models loaded successfully.")
+    else:
+        logger.error("Models load failed.")
+        sys.exit(1)
 
     texts = [text_input]
-    print("Text prepared for inference:", texts)
+    logger.info("Text prepared for inference: %s", texts)
 
     wavs = chat.infer(texts, use_decoder=True)
-    print("Inference completed. Audio generation successful.")
+    logger.info("Inference completed. Audio generation successful.")
     # Save each generated wav file to a local file
     for index, wav in enumerate(wavs):
         save_wav_file(wav, index)
@@ -46,6 +53,6 @@ def main():
     return Audio(wavs[0], rate=24_000, autoplay=True)
 
 if __name__ == "__main__":
-    print("Starting the TTS application...")
+    logger.info("Starting the TTS application...")
     main()
-    print("TTS application finished.")
+    logger.info("TTS application finished.")
ChatTTS/examples/ipynb/colab.ipynb CHANGED
@@ -1,24 +1,38 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": null,
6
  "metadata": {
7
- "colab": {
8
- "base_uri": "https://localhost:8080/"
9
- },
10
- "id": "hegwDOfffwzw",
11
- "outputId": "1e221210-152b-4f5b-f009-9b9ffec2fa9f"
12
  },
13
  "outputs": [],
14
  "source": [
 
15
  "!rm -rf /content/ChatTTS\n",
16
  "!git clone https://github.com/2noise/ChatTTS.git\n",
17
  "!pip install -r /content/ChatTTS/requirements.txt\n",
18
- "!pip install nemo_text_processing WeTextProcessing\n",
19
  "!ldconfig /usr/lib64-nvidia"
20
  ]
21
  },
 
 
 
 
 
 
 
 
 
22
  {
23
  "cell_type": "code",
24
  "execution_count": null,
@@ -28,7 +42,7 @@
28
  "outputs": [],
29
  "source": [
30
  "from dotenv import load_dotenv\n",
31
- "load_dotenv(\"sha256.env\")\n",
32
  "\n",
33
  "import torch\n",
34
  "torch._dynamo.config.cache_size_limit = 64\n",
@@ -52,35 +66,69 @@
52
  "cell_type": "code",
53
  "execution_count": null,
54
  "metadata": {
55
- "colab": {
56
- "base_uri": "https://localhost:8080/",
57
- "height": 49,
58
- "referenced_widgets": [
59
- "c365a95346ec4b09a1e6467bf313baf7",
60
- "d79fd51849fd463cb08b83fdb8e5ca0c",
61
- "d247683a0a61441b971dfb39062e1fbf",
62
- "1da23fc236034f32adcaf6bb2e0e7d80",
63
- "4b2126d97c514795ab2a90f7357a203c",
64
- "9775ce64008b417fac3edd55b9e999d9",
65
- "96c9bb2eff4043b2a5dbd1e3e65375e5",
66
- "20aa0031b7bb45bf82443b48b3694166",
67
- "67252ea545d64392a1bd6ac40852e65f",
68
- "2f920c00bcac4787a0078ee035e97b43",
69
- "ba592297ff5347aebae298770a29fb8c"
70
- ]
71
- },
72
- "id": "e0QSkngRbSrg",
73
- "outputId": "138ac28b-6a33-4c31-8fe3-8481bb213d02"
74
  },
75
  "outputs": [],
76
  "source": [
77
- "chat = ChatTTS.Chat()\n",
78
- "\n",
79
- "# Use force_redownload=True if the weights updated.\n",
80
- "chat.load_models(force_redownload=True)\n",
81
- "\n",
82
- "# If you download the weights manually, set source='locals'.\n",
83
- "# chat.load_models(source='local', local_path='YOUR LOCAL PATH')"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ]
85
  },
86
  {
@@ -105,11 +153,7 @@
105
  "cell_type": "code",
106
  "execution_count": null,
107
  "metadata": {
108
- "colab": {
109
- "base_uri": "https://localhost:8080/"
110
- },
111
- "id": "Su9FmUYAbSrh",
112
- "outputId": "7c2aa0c1-1f99-4da1-b2e5-bbcb93465d89"
113
  },
114
  "outputs": [],
115
  "source": [
@@ -123,12 +167,7 @@
123
  "cell_type": "code",
124
  "execution_count": null,
125
  "metadata": {
126
- "colab": {
127
- "base_uri": "https://localhost:8080/",
128
- "height": 76
129
- },
130
- "id": "YQRwB8lpbSri",
131
- "outputId": "62ca9282-2755-44a5-ffca-c05c5e35ce76"
132
  },
133
  "outputs": [],
134
  "source": [
@@ -139,12 +178,7 @@
139
  "cell_type": "code",
140
  "execution_count": null,
141
  "metadata": {
142
- "colab": {
143
- "base_uri": "https://localhost:8080/",
144
- "height": 76
145
- },
146
- "id": "LuFG6m7AbSri",
147
- "outputId": "d8e0e3a2-d9fe-44db-e1f4-e2596289270e"
148
  },
149
  "outputs": [],
150
  "source": [
@@ -164,11 +198,7 @@
164
  "cell_type": "code",
165
  "execution_count": null,
166
  "metadata": {
167
- "colab": {
168
- "base_uri": "https://localhost:8080/"
169
- },
170
- "id": "kma0HBEBbSrj",
171
- "outputId": "b80b9d2f-8248-41ee-f1d7-eb3bf331ee69"
172
  },
173
  "outputs": [],
174
  "source": [
@@ -183,12 +213,7 @@
183
  "cell_type": "code",
184
  "execution_count": null,
185
  "metadata": {
186
- "colab": {
187
- "base_uri": "https://localhost:8080/",
188
- "height": 76
189
- },
190
- "id": "Nl_mT9KpbSrj",
191
- "outputId": "1bfcc06a-5246-4d25-fc19-3d125362fa59"
192
  },
193
  "outputs": [],
194
  "source": [
@@ -208,11 +233,7 @@
208
  "cell_type": "code",
209
  "execution_count": null,
210
  "metadata": {
211
- "colab": {
212
- "base_uri": "https://localhost:8080/"
213
- },
214
- "id": "Qh7dcWrAbSrk",
215
- "outputId": "3b936323-170a-496b-c4c2-6caa97a8d514"
216
  },
217
  "outputs": [],
218
  "source": [
@@ -227,12 +248,7 @@
227
  "cell_type": "code",
228
  "execution_count": null,
229
  "metadata": {
230
- "colab": {
231
- "base_uri": "https://localhost:8080/",
232
- "height": 76
233
- },
234
- "id": "0ljWDWzabSrk",
235
- "outputId": "8ade2469-c226-44ae-c3a7-ff034e2bffbf"
236
  },
237
  "outputs": [],
238
  "source": [
@@ -252,11 +268,7 @@
252
  "cell_type": "code",
253
  "execution_count": null,
254
  "metadata": {
255
- "colab": {
256
- "base_uri": "https://localhost:8080/"
257
- },
258
- "id": "3hAAc0lJbSrl",
259
- "outputId": "8dc45586-fb2a-4e81-ee53-0ce6df2fc43a"
260
  },
261
  "outputs": [],
262
  "source": [
@@ -269,11 +281,7 @@
269
  "cell_type": "code",
270
  "execution_count": null,
271
  "metadata": {
272
- "colab": {
273
- "base_uri": "https://localhost:8080/"
274
- },
275
- "id": "0GVJxhd3BKQX",
276
- "outputId": "f1484519-7130-450a-b7d8-09de5fe2ffd1"
277
  },
278
  "outputs": [],
279
  "source": [
@@ -284,12 +292,7 @@
284
  "cell_type": "code",
285
  "execution_count": null,
286
  "metadata": {
287
- "colab": {
288
- "base_uri": "https://localhost:8080/",
289
- "height": 76
290
- },
291
- "id": "ngyMht74BicY",
292
- "outputId": "8c7447ad-9ac7-4264-9f53-057d47d43931"
293
  },
294
  "outputs": [],
295
  "source": [
@@ -300,11 +303,7 @@
300
  "cell_type": "code",
301
  "execution_count": null,
302
  "metadata": {
303
- "colab": {
304
- "base_uri": "https://localhost:8080/"
305
- },
306
- "id": "R2WjuVrWbSrl",
307
- "outputId": "0d644cb9-4d65-4147-bd99-d5451439be02"
308
  },
309
  "outputs": [],
310
  "source": [
@@ -316,12 +315,7 @@
316
  "cell_type": "code",
317
  "execution_count": null,
318
  "metadata": {
319
- "colab": {
320
- "base_uri": "https://localhost:8080/",
321
- "height": 76
322
- },
323
- "id": "71Y4pBdl-_Yd",
324
- "outputId": "d44fdf1a-c9e8-42ff-ab96-8712986418fa"
325
  },
326
  "outputs": [],
327
  "source": [
@@ -406,352 +400,6 @@
406
  "nbconvert_exporter": "python",
407
  "pygments_lexer": "ipython3",
408
  "version": "3.10.8"
409
- },
410
- "widgets": {
411
- "application/vnd.jupyter.widget-state+json": {
412
- "1da23fc236034f32adcaf6bb2e0e7d80": {
413
- "model_module": "@jupyter-widgets/controls",
414
- "model_module_version": "1.5.0",
415
- "model_name": "HTMLModel",
416
- "state": {
417
- "_dom_classes": [],
418
- "_model_module": "@jupyter-widgets/controls",
419
- "_model_module_version": "1.5.0",
420
- "_model_name": "HTMLModel",
421
- "_view_count": null,
422
- "_view_module": "@jupyter-widgets/controls",
423
- "_view_module_version": "1.5.0",
424
- "_view_name": "HTMLView",
425
- "description": "",
426
- "description_tooltip": null,
427
- "layout": "IPY_MODEL_2f920c00bcac4787a0078ee035e97b43",
428
- "placeholder": "​",
429
- "style": "IPY_MODEL_ba592297ff5347aebae298770a29fb8c",
430
- "value": " 11/11 [00:00&lt;00:00, 762.51it/s]"
431
- }
432
- },
433
- "20aa0031b7bb45bf82443b48b3694166": {
434
- "model_module": "@jupyter-widgets/base",
435
- "model_module_version": "1.2.0",
436
- "model_name": "LayoutModel",
437
- "state": {
438
- "_model_module": "@jupyter-widgets/base",
439
- "_model_module_version": "1.2.0",
440
- "_model_name": "LayoutModel",
441
- "_view_count": null,
442
- "_view_module": "@jupyter-widgets/base",
443
- "_view_module_version": "1.2.0",
444
- "_view_name": "LayoutView",
445
- "align_content": null,
446
- "align_items": null,
447
- "align_self": null,
448
- "border": null,
449
- "bottom": null,
450
- "display": null,
451
- "flex": null,
452
- "flex_flow": null,
453
- "grid_area": null,
454
- "grid_auto_columns": null,
455
- "grid_auto_flow": null,
456
- "grid_auto_rows": null,
457
- "grid_column": null,
458
- "grid_gap": null,
459
- "grid_row": null,
460
- "grid_template_areas": null,
461
- "grid_template_columns": null,
462
- "grid_template_rows": null,
463
- "height": null,
464
- "justify_content": null,
465
- "justify_items": null,
466
- "left": null,
467
- "margin": null,
468
- "max_height": null,
469
- "max_width": null,
470
- "min_height": null,
471
- "min_width": null,
472
- "object_fit": null,
473
- "object_position": null,
474
- "order": null,
475
- "overflow": null,
476
- "overflow_x": null,
477
- "overflow_y": null,
478
- "padding": null,
479
- "right": null,
480
- "top": null,
481
- "visibility": null,
482
- "width": null
483
- }
484
- },
485
- "2f920c00bcac4787a0078ee035e97b43": {
486
- "model_module": "@jupyter-widgets/base",
487
- "model_module_version": "1.2.0",
488
- "model_name": "LayoutModel",
489
- "state": {
490
- "_model_module": "@jupyter-widgets/base",
491
- "_model_module_version": "1.2.0",
492
- "_model_name": "LayoutModel",
493
- "_view_count": null,
494
- "_view_module": "@jupyter-widgets/base",
495
- "_view_module_version": "1.2.0",
496
- "_view_name": "LayoutView",
497
- "align_content": null,
498
- "align_items": null,
499
- "align_self": null,
500
- "border": null,
501
- "bottom": null,
502
- "display": null,
503
- "flex": null,
504
- "flex_flow": null,
505
- "grid_area": null,
506
- "grid_auto_columns": null,
507
- "grid_auto_flow": null,
508
- "grid_auto_rows": null,
509
- "grid_column": null,
510
- "grid_gap": null,
511
- "grid_row": null,
512
- "grid_template_areas": null,
513
- "grid_template_columns": null,
514
- "grid_template_rows": null,
515
- "height": null,
516
- "justify_content": null,
517
- "justify_items": null,
518
- "left": null,
519
- "margin": null,
520
- "max_height": null,
521
- "max_width": null,
522
- "min_height": null,
523
- "min_width": null,
524
- "object_fit": null,
525
- "object_position": null,
526
- "order": null,
527
- "overflow": null,
528
- "overflow_x": null,
529
- "overflow_y": null,
530
- "padding": null,
531
- "right": null,
532
- "top": null,
533
- "visibility": null,
534
- "width": null
535
- }
536
- },
537
- "4b2126d97c514795ab2a90f7357a203c": {
538
- "model_module": "@jupyter-widgets/base",
539
- "model_module_version": "1.2.0",
540
- "model_name": "LayoutModel",
541
- "state": {
542
- "_model_module": "@jupyter-widgets/base",
543
- "_model_module_version": "1.2.0",
544
- "_model_name": "LayoutModel",
545
- "_view_count": null,
546
- "_view_module": "@jupyter-widgets/base",
547
- "_view_module_version": "1.2.0",
548
- "_view_name": "LayoutView",
549
- "align_content": null,
550
- "align_items": null,
551
- "align_self": null,
552
- "border": null,
553
- "bottom": null,
554
- "display": null,
555
- "flex": null,
556
- "flex_flow": null,
557
- "grid_area": null,
558
- "grid_auto_columns": null,
559
- "grid_auto_flow": null,
560
- "grid_auto_rows": null,
561
- "grid_column": null,
562
- "grid_gap": null,
563
- "grid_row": null,
564
- "grid_template_areas": null,
565
- "grid_template_columns": null,
566
- "grid_template_rows": null,
567
- "height": null,
568
- "justify_content": null,
569
- "justify_items": null,
570
- "left": null,
571
- "margin": null,
572
- "max_height": null,
573
- "max_width": null,
574
- "min_height": null,
575
- "min_width": null,
576
- "object_fit": null,
577
- "object_position": null,
578
- "order": null,
579
- "overflow": null,
580
- "overflow_x": null,
581
- "overflow_y": null,
582
- "padding": null,
583
- "right": null,
584
- "top": null,
585
- "visibility": null,
586
- "width": null
587
- }
588
- },
589
- "67252ea545d64392a1bd6ac40852e65f": {
590
- "model_module": "@jupyter-widgets/controls",
591
- "model_module_version": "1.5.0",
592
- "model_name": "ProgressStyleModel",
593
- "state": {
594
- "_model_module": "@jupyter-widgets/controls",
595
- "_model_module_version": "1.5.0",
596
- "_model_name": "ProgressStyleModel",
597
- "_view_count": null,
598
- "_view_module": "@jupyter-widgets/base",
599
- "_view_module_version": "1.2.0",
600
- "_view_name": "StyleView",
601
- "bar_color": null,
602
- "description_width": ""
603
- }
604
- },
605
- "96c9bb2eff4043b2a5dbd1e3e65375e5": {
606
- "model_module": "@jupyter-widgets/controls",
607
- "model_module_version": "1.5.0",
608
- "model_name": "DescriptionStyleModel",
609
- "state": {
610
- "_model_module": "@jupyter-widgets/controls",
611
- "_model_module_version": "1.5.0",
612
- "_model_name": "DescriptionStyleModel",
613
- "_view_count": null,
614
- "_view_module": "@jupyter-widgets/base",
615
- "_view_module_version": "1.2.0",
616
- "_view_name": "StyleView",
617
- "description_width": ""
618
- }
619
- },
620
- "9775ce64008b417fac3edd55b9e999d9": {
621
- "model_module": "@jupyter-widgets/base",
622
- "model_module_version": "1.2.0",
623
- "model_name": "LayoutModel",
624
- "state": {
625
- "_model_module": "@jupyter-widgets/base",
626
- "_model_module_version": "1.2.0",
627
- "_model_name": "LayoutModel",
628
- "_view_count": null,
629
- "_view_module": "@jupyter-widgets/base",
630
- "_view_module_version": "1.2.0",
631
- "_view_name": "LayoutView",
632
- "align_content": null,
633
- "align_items": null,
634
- "align_self": null,
635
- "border": null,
636
- "bottom": null,
637
- "display": null,
638
- "flex": null,
639
- "flex_flow": null,
640
- "grid_area": null,
641
- "grid_auto_columns": null,
642
- "grid_auto_flow": null,
643
- "grid_auto_rows": null,
644
- "grid_column": null,
645
- "grid_gap": null,
646
- "grid_row": null,
647
- "grid_template_areas": null,
648
- "grid_template_columns": null,
649
- "grid_template_rows": null,
650
- "height": null,
651
- "justify_content": null,
652
- "justify_items": null,
653
- "left": null,
654
- "margin": null,
655
- "max_height": null,
656
- "max_width": null,
657
- "min_height": null,
658
- "min_width": null,
659
- "object_fit": null,
660
- "object_position": null,
661
- "order": null,
662
- "overflow": null,
663
- "overflow_x": null,
664
- "overflow_y": null,
665
- "padding": null,
666
- "right": null,
667
- "top": null,
668
- "visibility": null,
669
- "width": null
670
- }
671
- },
672
- "ba592297ff5347aebae298770a29fb8c": {
673
- "model_module": "@jupyter-widgets/controls",
674
- "model_module_version": "1.5.0",
675
- "model_name": "DescriptionStyleModel",
676
- "state": {
677
- "_model_module": "@jupyter-widgets/controls",
678
- "_model_module_version": "1.5.0",
679
- "_model_name": "DescriptionStyleModel",
680
- "_view_count": null,
681
- "_view_module": "@jupyter-widgets/base",
682
- "_view_module_version": "1.2.0",
683
- "_view_name": "StyleView",
684
- "description_width": ""
685
- }
686
- },
687
- "c365a95346ec4b09a1e6467bf313baf7": {
688
- "model_module": "@jupyter-widgets/controls",
689
- "model_module_version": "1.5.0",
690
- "model_name": "HBoxModel",
691
- "state": {
692
- "_dom_classes": [],
693
- "_model_module": "@jupyter-widgets/controls",
694
- "_model_module_version": "1.5.0",
695
- "_model_name": "HBoxModel",
696
- "_view_count": null,
697
- "_view_module": "@jupyter-widgets/controls",
698
- "_view_module_version": "1.5.0",
699
- "_view_name": "HBoxView",
700
- "box_style": "",
701
- "children": [
702
- "IPY_MODEL_d79fd51849fd463cb08b83fdb8e5ca0c",
703
- "IPY_MODEL_d247683a0a61441b971dfb39062e1fbf",
704
- "IPY_MODEL_1da23fc236034f32adcaf6bb2e0e7d80"
705
- ],
706
- "layout": "IPY_MODEL_4b2126d97c514795ab2a90f7357a203c"
707
- }
708
- },
709
- "d247683a0a61441b971dfb39062e1fbf": {
710
- "model_module": "@jupyter-widgets/controls",
711
- "model_module_version": "1.5.0",
712
- "model_name": "FloatProgressModel",
713
- "state": {
714
- "_dom_classes": [],
715
- "_model_module": "@jupyter-widgets/controls",
716
- "_model_module_version": "1.5.0",
717
- "_model_name": "FloatProgressModel",
718
- "_view_count": null,
719
- "_view_module": "@jupyter-widgets/controls",
720
- "_view_module_version": "1.5.0",
721
- "_view_name": "ProgressView",
722
- "bar_style": "success",
723
- "description": "",
724
- "description_tooltip": null,
725
- "layout": "IPY_MODEL_20aa0031b7bb45bf82443b48b3694166",
726
- "max": 11,
727
- "min": 0,
728
- "orientation": "horizontal",
729
- "style": "IPY_MODEL_67252ea545d64392a1bd6ac40852e65f",
730
- "value": 11
731
- }
732
- },
733
- "d79fd51849fd463cb08b83fdb8e5ca0c": {
734
- "model_module": "@jupyter-widgets/controls",
735
- "model_module_version": "1.5.0",
736
- "model_name": "HTMLModel",
737
- "state": {
738
- "_dom_classes": [],
739
- "_model_module": "@jupyter-widgets/controls",
740
- "_model_module_version": "1.5.0",
741
- "_model_name": "HTMLModel",
742
- "_view_count": null,
743
- "_view_module": "@jupyter-widgets/controls",
744
- "_view_module_version": "1.5.0",
745
- "_view_name": "HTMLView",
746
- "description": "",
747
- "description_tooltip": null,
748
- "layout": "IPY_MODEL_9775ce64008b417fac3edd55b9e999d9",
749
- "placeholder": "​",
750
- "style": "IPY_MODEL_96c9bb2eff4043b2a5dbd1e3e65375e5",
751
- "value": "Fetching 11 files: 100%"
752
- }
753
- }
754
- }
755
  }
756
  },
757
  "nbformat": 4,
 
{
 "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xYJFXKP9xhQM"
+   },
+   "source": [
+    "## Clone Repo"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "hegwDOfffwzw"
    },
    "outputs": [],
    "source": [
+    "!cd /content\n",
     "!rm -rf /content/ChatTTS\n",
     "!git clone https://github.com/2noise/ChatTTS.git\n",
     "!pip install -r /content/ChatTTS/requirements.txt\n",
     "!ldconfig /usr/lib64-nvidia"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zdzEFoknxqTH"
+   },
+   "source": [
+    "## Import Libs"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

    "outputs": [],
    "source": [
     "from dotenv import load_dotenv\n",
+    "load_dotenv(\"ChatTTS/sha256.env\")\n",
     "\n",
     "import torch\n",
     "torch._dynamo.config.cache_size_limit = 64\n",

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "e0QSkngRbSrg"
    },
    "outputs": [],
    "source": [
+    "chat = ChatTTS.Chat()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Here are three choices for loading models:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1. Load models from Hugging Face:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use force_redownload=True if the weights have been updated.\n",
+    "chat.load_models(source='huggingface', force_redownload=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2. Load models from local directories 'asset' and 'config':"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat.load_models()\n",
+    "# chat.load_models(source='local') same as above"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3. Load models from a custom path:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write the model path into custom_path\n",
+    "chat.load_models(source='custom', custom_path='YOUR CUSTOM PATH')"
    ]
   },
   {

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "Su9FmUYAbSrh"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "YQRwB8lpbSri"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "LuFG6m7AbSri"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "kma0HBEBbSrj"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "Nl_mT9KpbSrj"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "Qh7dcWrAbSrk"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "0ljWDWzabSrk"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "3hAAc0lJbSrl"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "0GVJxhd3BKQX"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "ngyMht74BicY"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "R2WjuVrWbSrl"
    },
    "outputs": [],
    "source": [

    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "id": "71Y4pBdl-_Yd"
    },
    "outputs": [],
    "source": [

    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.10.8"

  }
 },
 "nbformat": 4,
ChatTTS/examples/ipynb/example.ipynb CHANGED
@@ -13,8 +13,19 @@
     "metadata": {},
     "outputs": [],
     "source": [
+    "import os, sys\n",
+    "\n",
+    "if sys.platform == \"darwin\":\n",
+    "    os.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = \"1\"\n",
+    "\n",
+    "if not \"root_dir\" in globals():\n",
+    "    now_dir = os.getcwd()  # skip examples/ipynb\n",
+    "    root_dir = os.path.join(now_dir, \"../../\")\n",
+    "    sys.path.append(root_dir)\n",
+    "    print(\"init root dir to\", root_dir)\n",
+    "\n",
     "from dotenv import load_dotenv\n",
-    "load_dotenv(\"sha256.env\")\n",
+    "load_dotenv(os.path.join(root_dir, \"sha256.env\"))\n",
     "\n",
     "import torch\n",
     "torch._dynamo.config.cache_size_limit = 64\n",
@@ -38,14 +49,67 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "chat = ChatTTS.Chat()\n",
-    "chat.load_models()\n",
+    "os.chdir(root_dir)\n",
     "\n",
-    "# Use force_redownload=True if the weights updated.\n",
-    "# chat.load_models(force_redownload=True)\n",
-    "\n",
-    "# If you download the weights manually, set source='locals'.\n",
-    "# chat.load_models(source='local', local_path='YOUR LOCAL PATH')"
+    "chat = ChatTTS.Chat()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Here are three choices for loading models:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1. Load models from Hugging Face:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use force_redownload=True if the weights have been updated.\n",
+    "chat.load_models(source='huggingface', force_redownload=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2. Load models from local directories 'asset' and 'config':"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat.load_models()\n",
+    "# chat.load_models(source='local') same as above"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3. Load models from a custom path:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write the model path into custom_path\n",
+    "chat.load_models(source='custom', custom_path='YOUR CUSTOM PATH')"
    ]
   },
   {
@@ -70,7 +134,7 @@
    "source": [
     "texts = [\"So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with.\",]*3 \\\n",
     "    + [\"我觉得像我们这些写程序的人,他,我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话,就他们并不会轻易的开放给所有的人用。\"]*3 \n",
-    " \n",
+    "\n",
     "wavs = chat.infer(texts)"
    ]
   },
@@ -239,7 +303,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.9.6"
   }
  },
 "nbformat": 4,
ChatTTS/examples/web/funcs.py ADDED
@@ -0,0 +1,100 @@
+import random
+
+import torch
+import gradio as gr
+import numpy as np
+
+from tools.logger import get_logger
+logger = get_logger(" WebUI ")
+
+import ChatTTS
+chat = ChatTTS.Chat(get_logger("ChatTTS"))
+
+# Voice options: preset seeds for suitable voices
+voices = {
+    "默认": {"seed": 2},
+    "音色1": {"seed": 1111},
+    "音色2": {"seed": 2222},
+    "音色3": {"seed": 3333},
+    "音色4": {"seed": 4444},
+    "音色5": {"seed": 5555},
+    "音色6": {"seed": 6666},
+    "音色7": {"seed": 7777},
+    "音色8": {"seed": 8888},
+    "音色9": {"seed": 9999},
+    "音色10": {"seed": 11111},
+}
+
+def generate_seed():
+    return gr.update(value=random.randint(1, 100000000))
+
+# Return the seed that corresponds to the selected voice
+def on_voice_change(voice_selection):
+    return voices.get(voice_selection)['seed']
+
+def refine_text(text, audio_seed_input, text_seed_input, refine_text_flag):
+    if not refine_text_flag:
+        return text
+
+    global chat
+
+    torch.manual_seed(audio_seed_input)
+    params_refine_text = {'prompt': '[oral_2][laugh_0][break_6]'}
+
+    torch.manual_seed(text_seed_input)
+
+    text = chat.infer(text,
+                      skip_refine_text=False,
+                      refine_text_only=True,
+                      params_refine_text=params_refine_text,
+                      )
+    return text[0] if isinstance(text, list) else text
+
+def generate_audio(text, temperature, top_P, top_K, audio_seed_input, text_seed_input, stream):
+    if not text: return None
+
+    global chat
+
+    torch.manual_seed(audio_seed_input)
+    rand_spk = chat.sample_random_speaker()
+    params_infer_code = {
+        'spk_emb': rand_spk,
+        'temperature': temperature,
+        'top_P': top_P,
+        'top_K': top_K,
+    }
+    torch.manual_seed(text_seed_input)
+
+    wav = chat.infer(
+        text,
+        skip_refine_text=True,
+        params_infer_code=params_infer_code,
+        stream=stream,
+    )
+
+    if stream:
+        for gen in wav:
+            wavs = [np.array([[]])]
+            wavs[0] = np.hstack([wavs[0], np.array(gen[0])])
+            audio = wavs[0][0]
+
+            # normalize
+            am = np.abs(audio).max() * 32768
+            if am > 32768:
+                am = 32768 * 32768 / am
+            np.multiply(audio, am, audio)
+            audio = audio.astype(np.int16)
+
+            yield 24000, audio
+        return
+
+    audio_data = np.array(wav[0]).flatten()
+    # normalize
+    am = np.abs(audio_data).max() * 32768
+    if am > 32768:
+        am = 32768 * 32768 / am
+    np.multiply(audio_data, am, audio_data)
+    audio_data = audio_data.astype(np.int16)
+    sample_rate = 24000
+
+    yield sample_rate, audio_data
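
A note on the two normalize blocks above: they rescale the float waveform into int16 range before handing it to Gradio. A minimal self-contained sketch of the same idea, written as conventional peak-limited float-to-int16 conversion (the sine input is only a stand-in for model output, not part of the commit):

    import numpy as np

    def float_to_int16(audio: np.ndarray) -> np.ndarray:
        # scale the loudest sample to just under full scale,
        # never amplifying a signal that is already inside [-1, 1]
        peak = np.abs(audio).max()
        scale = 32767.0 / max(peak, 1.0)
        return (audio * scale).astype(np.int16)

    dummy = 0.8 * np.sin(np.linspace(0, 2 * np.pi, 24000))  # one second at 24 kHz
    pcm = float_to_int16(dummy)
    assert pcm.dtype == np.int16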
ChatTTS/examples/web/webui.py CHANGED
@@ -6,106 +6,44 @@ if sys.platform == "darwin":
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-import random
 import argparse
 
-import torch
 import gradio as gr
-import numpy as np
 
 from dotenv import load_dotenv
 load_dotenv("sha256.env")
 
-import ChatTTS
-
-# Voice options: preset seeds for suitable voices
-voices = {
-    "默认": {"seed": 2},
-    "音色1": {"seed": 1111},
-    "音色2": {"seed": 2222},
-    "音色3": {"seed": 3333},
-    "音色4": {"seed": 4444},
-    "音色5": {"seed": 5555},
-    "音色6": {"seed": 6666},
-    "音色7": {"seed": 7777},
-    "音色8": {"seed": 8888},
-    "音色9": {"seed": 9999},
-    "音色10": {"seed": 11111},
-}
-
-def generate_seed():
-    new_seed = random.randint(1, 100000000)
-    return {
-        "__type__": "update",
-        "value": new_seed
-    }
-
-# Return the seed that corresponds to the selected voice
-def on_voice_change(vocie_selection):
-    return voices.get(vocie_selection)['seed']
-
-def generate_audio(text, temperature, top_P, top_K, audio_seed_input, text_seed_input, refine_text_flag):
-
-    torch.manual_seed(audio_seed_input)
-    rand_spk = chat.sample_random_speaker()
-    params_infer_code = {
-        'spk_emb': rand_spk,
-        'temperature': temperature,
-        'top_P': top_P,
-        'top_K': top_K,
-    }
-    params_refine_text = {'prompt': '[oral_2][laugh_0][break_6]'}
-
-    torch.manual_seed(text_seed_input)
-
-    if refine_text_flag:
-        text = chat.infer(text,
-                          skip_refine_text=False,
-                          refine_text_only=True,
-                          params_refine_text=params_refine_text,
-                          params_infer_code=params_infer_code
-                          )
-
-    wav = chat.infer(text,
-                     skip_refine_text=True,
-                     params_refine_text=params_refine_text,
-                     params_infer_code=params_infer_code
-                     )
-
-    audio_data = np.array(wav[0]).flatten()
-    sample_rate = 24000
-    text_data = text[0] if isinstance(text, list) else text
-
-    return [(sample_rate, audio_data), text_data]
-
+from examples.web.funcs import *
 
 def main():
 
     with gr.Blocks() as demo:
-        gr.Markdown("# ChatTTS Webui")
-        gr.Markdown("ChatTTS Model: [2noise/ChatTTS](https://github.com/2noise/ChatTTS)")
+        gr.Markdown("# ChatTTS WebUI")
+        gr.Markdown("- **GitHub Repo**: https://github.com/2noise/ChatTTS")
+        gr.Markdown("- **HuggingFace Repo**: https://huggingface.co/2Noise/ChatTTS")
 
-        default_text = "四川美食确实以辣闻名,但也有不辣的选择。[uv_break]比如甜水面、赖汤圆、蛋烘糕、叶儿粑等,这些小吃口味温和,甜而不腻,也很受欢迎。[laugh]"
+        default_text = "四川美食确实以辣闻名,但也有不辣的选择。比如甜水面、赖汤圆、蛋烘糕、叶儿粑等,这些小吃口味温和,甜而不腻,也很受欢迎。"
         text_input = gr.Textbox(label="Input Text", lines=4, placeholder="Please Input Text...", value=default_text)
 
        with gr.Row():
            refine_text_checkbox = gr.Checkbox(label="Refine text", value=True)
-            temperature_slider = gr.Slider(minimum=0.00001, maximum=1.0, step=0.00001, value=0.3, label="Audio temperature")
-            top_p_slider = gr.Slider(minimum=0.1, maximum=0.9, step=0.05, value=0.7, label="top_P")
-            top_k_slider = gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_K")
+            temperature_slider = gr.Slider(minimum=0.00001, maximum=1.0, step=0.00001, value=0.3, label="Audio temperature", interactive=True)
+            top_p_slider = gr.Slider(minimum=0.1, maximum=0.9, step=0.05, value=0.7, label="top_P", interactive=True)
+            top_k_slider = gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_K", interactive=True)
 
        with gr.Row():
-            voice_options = {}
            voice_selection = gr.Dropdown(label="音色", choices=voices.keys(), value='默认')
            audio_seed_input = gr.Number(value=2, label="Audio Seed")
            generate_audio_seed = gr.Button("\U0001F3B2")
            text_seed_input = gr.Number(value=42, label="Text Seed")
            generate_text_seed = gr.Button("\U0001F3B2")
 
-        generate_button = gr.Button("Generate")
+        with gr.Row():
+            auto_play_checkbox = gr.Checkbox(label="Auto Play", value=False, scale=1)
+            stream_mode_checkbox = gr.Checkbox(label="Stream Mode", value=False, scale=1)
+            generate_button = gr.Button("Generate", scale=2)
 
        text_output = gr.Textbox(label="Output Text", interactive=False)
-        audio_output = gr.Audio(label="Output Audio")
 
        # Use Gradio callbacks to update the seed number inputs
        voice_selection.change(fn=on_voice_change, inputs=voice_selection, outputs=audio_seed_input)
@@ -117,10 +55,25 @@ def main():
        generate_text_seed.click(generate_seed,
                                 inputs=[],
                                 outputs=text_seed_input)
-
-        generate_button.click(generate_audio,
-                              inputs=[text_input, temperature_slider, top_p_slider, top_k_slider, audio_seed_input, text_seed_input, refine_text_checkbox],
-                              outputs=[audio_output, text_output])
+
+        generate_button.click(fn=lambda: "", outputs=text_output)
+        generate_button.click(refine_text,
+                              inputs=[text_input, audio_seed_input, text_seed_input, refine_text_checkbox],
+                              outputs=text_output)
+
+        @gr.render(inputs=[auto_play_checkbox, stream_mode_checkbox])
+        def make_audio(autoplay, stream):
+            audio_output = gr.Audio(
+                label="Output Audio",
+                value=None,
+                autoplay=autoplay,
+                streaming=stream,
+                interactive=False,
+                show_label=True,
+            )
+            text_output.change(generate_audio,
+                               inputs=[text_output, temperature_slider, top_p_slider, top_k_slider, audio_seed_input, text_seed_input, stream_mode_checkbox],
+                               outputs=audio_output)
 
        gr.Examples(
            examples=[
@@ -138,15 +91,22 @@ def main():
    parser.add_argument('--custom_path', type=str, default=None, help='the custom model path')
    args = parser.parse_args()
 
-    print("loading ChatTTS model...")
+    logger.info("loading ChatTTS model...")
+
    global chat
-    chat = ChatTTS.Chat()
 
    if args.custom_path == None:
-        chat.load_models()
+        ret = chat.load_models()
    else:
-        print('local model path:', args.custom_path)
-        chat.load_models('custom', custom_path=args.custom_path)
+        logger.info('local model path: %s', args.custom_path)
+        ret = chat.load_models('custom', custom_path=args.custom_path)
+
+    if ret:
+        logger.info("Models loaded successfully.")
+    else:
+        logger.error("Models load failed.")
+        sys.exit(1)
+
 
    demo.launch(server_name=args.server_name, server_port=args.server_port, root_path=args.root_path, inbrowser=True)
 
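Because generate_audio is now a generator of (sample_rate, int16_chunk) tuples, the audio widget can be fed incrementally when Stream Mode is checked. A rough sketch of consuming the same generator outside Gradio, assuming the funcs module above is importable and the models are loaded; the output filename is arbitrary:

    import wave

    from examples.web.funcs import chat, generate_audio

    chat.load_models()

    with wave.open("out.wav", "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)   # 16-bit samples
        wf.setframerate(24000)
        # with stream=False the generator yields exactly once
        for sample_rate, chunk in generate_audio(
            "四川美食确实以辣闻名", temperature=0.3, top_P=0.7, top_K=20,
            audio_seed_input=2, text_seed_input=42, stream=True,
        ):
            wf.writeframes(chunk.tobytes())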
ChatTTS/requirements.txt CHANGED
@@ -1,14 +1,13 @@
 numpy<2.0.0
-omegaconf~=2.3.0
-torch~=2.1.0
+omegaconf>=2.3.0
+torch>=2.1.0
 tqdm
-einops
 vector_quantize_pytorch
-transformers~=4.41.1
+transformers>=4.41.1
 vocos
 IPython
 gradio
 python-dotenv
-pynini==2.1.5
-WeTextProcessing
-nemo_text_processing
+pynini==2.1.5; sys_platform == 'linux'
+WeTextProcessing; sys_platform == 'linux'
+nemo_text_processing; sys_platform == 'linux'
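
The trailing environment markers make pip skip the pynini/WeTextProcessing/nemo_text_processing stack everywhere but Linux, where its wheels actually build. Code relying on that stack should guard the import the same way; a small sketch (the fallback branch is an assumption, not part of this commit):

    import sys

    if sys.platform == "linux":
        # provided by WeTextProcessing, installed on Linux only per requirements.txt
        from tn.chinese.normalizer import Normalizer
        normalizer = Normalizer()
    else:
        normalizer = None  # fall back to ChatTTS's built-in normalization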
ChatTTS/setup.py CHANGED
@@ -6,7 +6,6 @@ setup(name='chattts',
       install_requires=['omegaconf>=2.3.0',
                         'torch>=2.1.0',
                         'tqdm',
-                        'einops',
                         'vector_quantize_pytorch',
                         'transformers>=4.41.1',
                         'vocos',
ChatTTS/tools/logger/__init__.py ADDED
@@ -0,0 +1 @@
+from .log import get_logger
ChatTTS/tools/logger/log.py ADDED
@@ -0,0 +1,53 @@
+import platform
+import logging
+from datetime import datetime, timezone
+
+# from https://github.com/FloatTech/ZeroBot-Plugin/blob/c70766a989698452e60e5e48fb2f802a2444330d/console/console_windows.go#L89-L96
+colorCodePanic = "\x1b[1;31m"
+colorCodeFatal = "\x1b[1;31m"
+colorCodeError = "\x1b[31m"
+colorCodeWarn = "\x1b[33m"
+colorCodeInfo = "\x1b[37m"
+colorCodeDebug = "\x1b[32m"
+colorCodeTrace = "\x1b[36m"
+colorReset = "\x1b[0m"
+
+log_level_color_code = {
+    logging.DEBUG: colorCodeDebug,
+    logging.INFO: colorCodeInfo,
+    logging.WARN: colorCodeWarn,
+    logging.ERROR: colorCodeError,
+    logging.FATAL: colorCodeFatal,
+}
+
+log_level_msg_str = {
+    logging.DEBUG: "DEBU",
+    logging.INFO: "INFO",
+    logging.WARN: "WARN",
+    logging.ERROR: "ERRO",
+    logging.FATAL: "FATL",
+}
+
+class Formatter(logging.Formatter):
+    def __init__(self, color=platform.system().lower() != "windows"):
+        # https://stackoverflow.com/questions/2720319/python-figure-out-local-timezone
+        self.tz = datetime.now(timezone.utc).astimezone().tzinfo
+        self.color = color
+
+    def format(self, record: logging.LogRecord):
+        logstr = "[" + datetime.now(self.tz).strftime('%z %Y%m%d %H:%M:%S') + "] ["
+        if self.color:
+            logstr += log_level_color_code.get(record.levelno, colorCodeInfo)
+        logstr += log_level_msg_str.get(record.levelno, record.levelname)
+        if self.color:
+            logstr += colorReset
+        logstr += f"] {str(record.name)} | {str(record.msg)}"
+        return logstr
+
+def get_logger(name: str, lv = logging.INFO):
+    logger = logging.getLogger(name)
+    syslog = logging.StreamHandler()
+    syslog.setFormatter(Formatter())
+    logger.setLevel(lv)
+    logger.addHandler(syslog)
+    return logger
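
A quick usage sketch for the new logger (the logger name is arbitrary; output format as produced by the Formatter above):

    from tools.logger import get_logger

    logger = get_logger("Demo")
    logger.info("models loaded")
    logger.warning("falling back to CPU")
    # emits lines like "[+0800 20240610 12:00:00] [INFO] Demo | models loaded",
    # colorized on non-Windows terminals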
abc ADDED
@@ -0,0 +1 @@
+Subproject commit f4c8329f0d231b272b676e5e171fb9655b345f2e
chattts_webui_mix.ipynb CHANGED
@@ -4,7 +4,9 @@
  "metadata": {
   "colab": {
    "provenance": [],
-   "gpuType": "T4"
+   "gpuType": "T4",
+   "authorship_tag": "ABX9TyPWzXw++IDXf5gvuBHiHqmz",
+   "include_colab_link": true
   },
   "kernelspec": {
    "name": "python3",
@@ -18,8 +20,42 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "view-in-github",
+    "colab_type": "text"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/6drf21e/ChatTTS_colab/blob/main/chattts_webui_mix.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 260
+    },
+    "id": "-VNe1BeDO1n0",
+    "outputId": "f3ed0cc9-b8dd-4f2a-9cdd-3106e41f485d"
+   },
+   "outputs": [
+    {
+     "output_type": "display_data",
+     "data": {
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ],
+      "text/markdown": "\n### 🌟 如果你觉得 ChatTTS 和 ChatTTS_colab 项目对你有帮助,请访问以下链接给它们点个星星吧!🌟\n\n- [ChatTTS 项目](https://github.com/2noise/ChatTTS)\n\n- [ChatTTS_colab 项目](https://github.com/6drf21e/ChatTTS_colab)\n\n感谢你的支持!\n\n### 运行方法 ###\n\n- 点击菜单栏的--代码执行程序--全部运行即可\n- 执行后在下方的日志中找到类似\n\n    Running on public URL: https://**********.gradio.live <-这个就是可以访问的公网地址\n\n安装包的时候提示要重启 请点**\"否\"**\n\n\n"
+     },
+     "metadata": {}
+    }
+   ],
    "source": [
-    "> 🌟 如果你觉得 ChatTTS 和 ChatTTS_colab 项目对你有帮助,请访问以下链接给它们点个星星吧!🌟\n",
+    "from IPython.display import display, Markdown\n",
+    "\n",
+    "message = \"\"\"\n",
+    "### 🌟 如果你觉得 ChatTTS 和 ChatTTS_colab 项目对你有帮助,请访问以下链接给它们点个星星吧!🌟\n",
     "\n",
     "- [ChatTTS 项目](https://github.com/2noise/ChatTTS)\n",
     "\n",
@@ -27,18 +63,19 @@
     "\n",
     "感谢你的支持!\n",
     "\n",
-    "# 运行方法\n",
+    "### 运行方法 ###\n",
     "\n",
     "- 点击菜单栏的--代码执行程序--全部运行即可\n",
     "- 执行后在下方的日志中找到类似\n",
     "\n",
     "    Running on public URL: https://**************.gradio.live <-这个就是可以访问的公网地址\n",
     "\n",
-    "安装包的时候提示要重启 请点**\"否\"**"
-   ],
-   "metadata": {
-    "id": "Xo3k5XsTzWK6"
-   }
+    "安装包的时候提示要重启 请点**\"否\"**\n",
+    "\n",
+    "\n",
+    "\"\"\"\n",
+    "display(Markdown(message))\n"
+   ]
  },
  {
   "cell_type": "code",
@@ -47,19 +84,58 @@
    "%cd ChatTTS_colab\n",
    "!git clone -q https://github.com/2noise/ChatTTS\n",
    "%cd ChatTTS\n",
-   "!git checkout -q e6412b1\n",
+   "!git checkout -q f4c8329\n",
    "%cd ..\n",
    "!mv ChatTTS abc\n",
-   "!mv abc/* /content/ChatTTS_colab/\n",
-   "!pip install -q omegaconf vocos vector_quantize_pytorch gradio cn2an pypinyin openai jieba WeTextProcessing python-dotenv\n",
+   "!mv abc/ChatTTS ./ChatTTS\n",
+   "!pip install -q omegaconf vocos vector_quantize_pytorch gradio cn2an pypinyin openai\n",
    "# 启动 Gradio 有公网地址\n",
    "!python webui_mix.py --share\n"
   ],
   "metadata": {
-   "id": "hNDl-5muR77-"
+   "colab": {
+    "base_uri": "https://localhost:8080/"
+   },
+   "id": "hNDl-5muR77-",
+   "outputId": "9ca99a78-1354-4c4d-dfa9-30a82b1a7813"
   },
   "execution_count": null,
-  "outputs": []
+  "outputs": [
+   {
+    "output_type": "stream",
+    "name": "stdout",
+    "text": [
+     "/content/ChatTTS_colab/ChatTTS_colab\n",
+     "/content/ChatTTS_colab/ChatTTS_colab/ChatTTS\n",
+     "/content/ChatTTS_colab/ChatTTS_colab\n",
+     "Loading ChatTTS model...\n",
+     "INFO:ChatTTS.core:Load from cache: /root/.cache/huggingface/hub/models--2Noise--ChatTTS/snapshots/ce5913842aebd78e4a01a02d47244b8d62ac4ee3\n",
+     "INFO:ChatTTS.core:use cuda:0\n",
+     "INFO:ChatTTS.core:vocos loaded.\n",
+     "INFO:ChatTTS.core:dvae loaded.\n",
+     "INFO:ChatTTS.core:gpt loaded.\n",
+     "INFO:ChatTTS.core:decoder loaded.\n",
+     "INFO:ChatTTS.core:tokenizer loaded.\n",
+     "INFO:ChatTTS.core:All initialized.\n",
+     "INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version \"HTTP/1.1 200 OK\"\n",
+     "INFO:httpx:HTTP Request: GET https://checkip.amazonaws.com/ \"HTTP/1.1 200 \"\n",
+     "Running on local URL: http://127.0.0.1:7860\n",
+     "INFO:httpx:HTTP Request: GET http://127.0.0.1:7860/startup-events \"HTTP/1.1 200 OK\"\n",
+     "INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7860/ \"HTTP/1.1 200 OK\"\n",
+     "INFO:httpx:HTTP Request: GET https://api.gradio.app/v2/tunnel-request \"HTTP/1.1 200 OK\"\n",
+     "Running on public URL: https://054d1298c1303e0370.gradio.live\n",
+     "\n",
+     "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n",
+ "[' 海底二万里 。第一章\\u3000飞走的暗礁。人们一定还记得一八六六年海上发生的一件离奇的、神秘的、无法解释的怪事。且不说当时哄动沿海居民和世界舆论的各种传闻 这里只说一般航海人员特别激动的心情。欧美的进出口商人、船长和船主、各国的海军官佐以及这两大洲的各国政府都非常注意这件事。', '这事大体是这样 不久以前 好些大船在海上碰见了一一个 庞然大物 一个很长的物体 形状很像纺锤 有时发出磷光 它的体积比鲸鱼大得多 行动起来也比鲸鱼快得多。关于这个东西的出现 许多航海日志所记下的事实 如这个东西或这个生物的形状 在它运动时的难以估计的速度 它转移的惊人力量 它那种像是天生的特殊本领等等 大致是相同的。', '如果这东西是鲸鱼类动物 那么它的体积 是大大超过了生物学家曾经加以分类的鲸鱼。居维埃·拉色别德①、杜梅里②、卡特法日③ 这些生物学家一一除非看见过 也就是说 除非这些科学家本人的眼睛看见过——是不承认有这样一种怪物存在的。把多次观察的结果折中一下来看———方面丢开那些过低的估计 即这个东西只有二百英尺长 同时也不接受过于夸张的言论 即它有一英里。', '宽三英里长 ——我们可以肯定他说 晋书·阮籍传 。 其后纲维不摄 而虚无放诞之论盈于 这个奇怪的生物 如果真是存在的话 它的体积是大大超过鱼类学家所承认的体积的。这东西既然存在 而事实本身又是不可否认的 那么 由于人类好奇的心理 我们就不难理解这个怪物的出现会在全世界引起怎样的骚动。', '至于说这是荒唐无稽之谈 那是决不会有人同意的。因为 一八六六年七月二十日 加尔各答一布纳希汽船公司的喜金孙总督号 在澳大利亚海岸东边五英里 碰见了这个游动的巨大物体。巴克船长起初还以为这是没有人知道的、暗礁 他正要测定它的位置的时候 突然这个不可解释的物体喷出两道水柱 哗的一声射到空中一百五十英尺高。', '这么说 除非这座暗礁上边有间歇喷泉 不然的话 喜金孙总督号面前的东西 就是还没有人知道的一种海中哺乳类动物 它还从鼻孔中喷出有气泡的水柱呢。同年七月二十三日 西印度 太平洋汽船公司的克利斯托巴尔哥郎号 在太平洋上也碰到这样的事。喜金孙总督号看见这怪物以后三天 克利斯托巴尔哥郎号在相距七百里的地方也看见了它 由此可知义 实用性则是鉴别它们正确与否的根据。', ' 这个奇特的鲸鱼类动物能以掠人的速度从这一处转移到另一处。十五天以后 在离上面说的地点有两千里远的地方 国营轮船公司的海尔维地亚号和皇家邮船公司的山农号 在美国和欧洲之间的大西洋海面上相遇的时候 在北纬四十二度十五分、西经六十度三十五分的地方 同时看到了这个大怪物。', '根据两船同时观察得到的结果 估计这只哺乳动物的长度至少有三百五十多英尺 约一百零六米 因为山农号和海尔维地亚号两船连起来 都还比它短 两船从头至尾只有一百米长。可是 最长的鲸鱼 像常常出役于阿留申群岛的久阑马克岛和翁居里克岛①附近海面的那些鲸鱼 也只不过是五十六米 而比这再长的 从来就没有过。', '接连不断地传来的消息 横渡大西洋的贝雷尔号所做的种种观察 茵曼轮船公司的越提那号跟这个怪物的一次相碰 法国二级军舰诺曼第号军官们所写的记录 海军高级参谋弗兹一詹姆斯在克利德爵士号上所做的很精密的测算 这一切在当时的确曾经哄动一时。在民族性比较浮躁的国家里 大家都拿这件事作为谈笑资料 但在严肃和踏实的国家里 像英国、美国和德国就不同 它们对这事就非常关心。', '在各大城市里 这怪物变成了家喻户晓的事件。咖啡馆里歌唱它 报刊上嘲笑它 舞台上扮演它。谣言正好有了机会 从这怪物身上捏造出各种各样的奇闻。在一些发行量不多的报刊上派 。 出现了关于各种离奇的巨大动物的报道 从白鲸、北极海中可怕的 莫比·狄克 ①一直到庞大的 克拉肯 ②——这种怪鱼的触须可以缠住一只载重五百吨的船而把它拖到海底下去——都应有尽有。', '有些人甚至不惜引经据典 或者搬出古代的传说如亚里士多德③和蒲林尼④的见解 他们承认这类怪物的存在 或者搬出彭土皮丹主教⑤的挪威童话 保罗·埃纪德的记述 以及哈林顿的报告 这报告是不容怀疑的 他说 一八五七年 他在嘉斯第兰号上看见过一种大蛇 那种蛇以前只在那立宪号到过的海面上⑤才能看见。', '于是 在学术团体里和科学报刊中产生了相信者和怀疑者 这两派人无休止地争论着。 怪物问题 激动着人们。自以为懂科学的新闻记者和一向自以为多才的文人开起火来 他们在这次值得纪念的笔战中花费了不少的墨水 。甚至有几个人还流了两三滴血 因为有人把针对大海蛇的笔锋移向一些态度傲慢的家伙身上了。', '在六个月当中 争论继续着。彼此有理 各执一词。当时流行的小报都兴致勃勃地刊登争论的文章 它们不是攻击巴西地理学院、柏林皇家科学院、不列颠学术联合会或华盛顿斯密孙学院发表的权威论文 就是驳斥印度群岛报、摩亚诺神父的宇宙杂志、皮德曼的消息报里面的讨论和法国及其他各国大报刊的科学新闻。', '这些多才的作家故意曲解反对派也常引证的林奈①的一句话 大自然不制造蠢东西 恳求大家不要相信北海的大怪鱼、大海蛇、 莫比·狄克 和疯狂的海员们臆造出来的其它怪物的存在 不要因此而否定了大自然。最后 某一著名尖刻的讽刺报有一位最受欢迎的编辑先生草草了事地发表一篇文章物主义的一些基本范畴和基本原理。', '强调马克思主义哲学必 处理了这个怪物 他像夷包列提②那样 在大家的笑声中 给这佳物最后一次打击、把它结果了。于是机智战胜了科学。在一八六七年头几个月里 这个问题好像是人了土 不会再复洁了。但就在这个时候 人们又听说发生了一些新的事件。', '现在的问题并不是一个急待解决的科学问题 而是必须认真设法避免的一个危险。问题带了完全不同的面貌。这个怪物变成了小岛、岩石、暗礁 但它是会奔驰的、不可捉摸的、行动莫测的暗礁。一八六七年八月五日 蒙特利奥航海公司的摩拉维安号夜间驶到北纬二十七度三十分、西经七十二度十五分的地方 船右舷撞上了一座岩石 可是 任何地图也没有记载过这一带海面上有这座岩石。', '由于风力的助航和四百匹马力的推动 船的速度达到每小时十三海里。毫无疑问 如果不是船身质地优良 特别坚固 摩拉维安号被撞以后 一定要把它从加拿大载来的二百三十六名乘客一齐带到海底去。事故发生在早晨五点左右天刚破晓的时候。船上值班的海员们立即跑到船的后部 他们十分细心地观察海面。', '除了有个六百多米宽的大漩涡——好像水面受过猛烈的冲击——以外 他们什么也没有看见 只把事故发生的地点确切地记了下来。摩拉维安号继续航行 似乎并没有受到什么损伤。·它是撞上了暗礁呢 还是撞上了一只沉没的破船?。当时没有法子知道。后来到船坞检查了船底朋友?。', '这个问题是革命的首要问题。 运用马克思主义的立尝 才发现一部分龙骨折断了。这事实本身是十分严重的 可是 如果不是过了三个星期后 在相同的情况下又发生了相同的事件 它很可能跟许多其他的事件一样很快被人忘掉了。接着又发生的那一次撞船的事件 单单由于受害船的国籍和它所属公司的声望 就足以引起十分广泛的反响。', '英国著名的船主苟纳尔的名字是没有一个人不知道伪。这位精明的企业家早在一八四零年就创办了一家邮船公司 开辟了从利物浦到哈利法克斯①的航线 当时只有三艘四百匹马力、载重一千一百六十二吨的明轮木船。八年以后 公司扩大了 共有四艘六百五十匹马力、载重一千八百二十吨的船。', '再过两年 又添了两艘马力和载重量更大的船 一八五三年 苟纳尔公司继续取得装运政府邮件的特权 一连添造了阿拉伯号、波斯号、中国号、斯备脱亚号、爪哇号、俄罗斯号 这些都是头等的快船 而且是最宽大的 除了大东方号外 在海上航行的船没有能跟它们相比的。', '到一八六七年 这家公司一共有十二艘船~八艘明轮的 四艘暗轮的。我所以要把上面的情形简单地介绍一下 是要大家知道这家海运公司的重要性。它由于经营得法 是全世界都闻名的。任何航海企业 没有比这公司搞得更精明 经营得更成功的了。二十六年来学流派均是庸俗进化论的宣传者。', '实证主义者斯宾塞对其曾 苟纳尔公司的船在大西洋上航行了两千次 没有一次航行不达目的地 没有一次发生迟误 从没有遗失过一封信 损失过一个人或一只船。 因此 尽管法国竭力要抢它的生意 但是乘客们都一致愿意搭苟纳尔公司的船 这点从近年来官方的统计文献中就可以看出来。', '了解这情形以后 便没有人奇怪这家公司的一只汽船遭遇到意外事件会引起那么巨大的反响。一八六七年四月十三日 海很平静 风又是顺风 
斯备脱亚号在西经十五度十二分、北纬四十五度三十七分的海面上行驶着。它在一千匹马力的发动机推动下 速度为每小时十三海里半。', '它的机轮在海中转动 完全正常。它当时的吃水深度是六米七十厘米 排水量是六 六百八十五方米。下午四点十六分 乘客们正在大厅中吃点心的时候 在斯各脱亚号船尾、左舷机轮后面一点 似乎发生了轻微的撞击。斯各脱亚号不是撞上了什么 而是被什么撞上了。', '憧它的不是敲击的器械而是钻凿的器械。这次冲撞是十分轻微的 要不是管船舱的人员跑到甲板上来喊 船要沉了 船要沉了 。 也许船上的人谁也不会在意。旅客们起初十分惊慌 但船长安德生很快就使他们安稳下来。危险并不会立刻就发生。斯各脱亚号由防水板分为七大间 一点也不在乎个把漏洞。', '安德生船长立即跑到舱底下去。他查出第五间被海水浸人了 海水浸入十分快 证明漏洞相当大。好在这间里没有蒸汽炉 不然的话 炉火就要熄灭了。安德生船长吩咐马上停船 并且命令一个潜水员下水检查船身的损坏情形。一会儿 他知道船底有一个长两米的大洞。', '这样一个裂口是没法堵住的 斯各脱亚号尽管机轮有一半浸在水里 但也必须继续行驶。当时船离克利亚峡还有三百海里 等船驶进公司的码头 已经误了三天期 在这三天里 利物浦的人都为它惶惶不安。斯各脱亚号被架了起来 工程师们开始检查。他们眼睛所看见的情形连自己也不能相信。', '在船身吃水线下两米半的地方 露出一个很规则的等边三角形的缺口。铁皮上的伤痕十分整齐 、就是钻孔机也不能凿得这么准确 弄成这个裂口的锐利器械一定不是用普通的钢铁制的 因为 这家伙在以惊人的力量向前猛撞 凿穿了四厘米厚的铁皮以后、还能用一种很难做到的后退动作 使自己脱身逃走。', '最近这次事件的经过大致就是这样。结果这又一次使舆论哄动起来。从这时候起 所有从前原因不明的航海遇难事件 现在都算在这个怪物的账上了。这只离奇古怪的动物于是负起了所有船只沉没的责任。不幸的是船沉的数目相当大 按照统计年鉴的记载 包括帆船和汽船在内 每年的损失约有三千艘左右 至于因下落不明而断定失踪 的 每年的数目也不下两百艘 。不管有没有冤枉这怪物 人们都把船只失踪的原因算在它身上。由于它的存在 五大洲间的海上交通越来越危险了 大家都坚决要求不惜任何代价清除海上这条可怕盼鲸鱼怪。']\n",
+     "INFO:ChatTTS.core:All initialized.\n",
+     " 46% 175/384 [00:05<00:07, 29.18steps/s]\n",
+     " 73% 1501/2048 [01:18<00:28, 19.09steps/s]\n",
+     "INFO:ChatTTS.core:All initialized.\n",
+     " 62% 238/384 [00:08<00:05, 28.48steps/s]\n",
+     " 36% 736/2048 [00:28<01:07, 19.51steps/s]"
+    ]
+   }
+  ]
  }
 ]
}
config.py CHANGED
@@ -1,15 +1,13 @@
 # Description: Configuration file for the project
-llama_seed = 2581
-DEFAULT_DIR = "output"
 DEFAULT_SPEED = 5
 DEFAULT_ORAL = 2
 DEFAULT_LAUGH = 0
 DEFAULT_BK = 4
 # paragraph segmentation
-DEFAULT_SEG_LENGTH = 80
-DEFAULT_BATCH_SIZE = 3
+DEFAULT_SEG_LENGTH = 120
+DEFAULT_BATCH_SIZE = 5
 # temperature
-DEFAULT_TEMPERATURE = 0.1
+DEFAULT_TEMPERATURE = 0.3
 # top_P
 DEFAULT_TOP_P = 0.7
 # top_K
@@ -43,4 +41,4 @@ LLM_PROMPT = """
 注意: character 字段的值需要使用类似 "旁白"、"年轻男性"、"年轻女性" 等角色身份。如果有多个角色,可以使用 "年轻男性1"、"年轻男性2" 等。
 
 --故事文本--
-"""
+"""
tts_model.py CHANGED
@@ -1,15 +1,12 @@
-import datetime
-import json
+import ChatTTS
+import torch
+import numpy as np
 import os
-import re
 import time
-
-import numpy as np
-import torch
 from tqdm import tqdm
-
-import ChatTTS
+import datetime
 from config import DEFAULT_TEMPERATURE, DEFAULT_TOP_P, DEFAULT_TOP_K
+import spaces
 
 
 def load_chat_tts_model(source='huggingface', force_redownload=False, local_path=None):
@@ -22,7 +19,7 @@ def load_chat_tts_model(source='huggingface', force_redownload=False, local_path
     """
     print("Loading ChatTTS model...")
     chat = ChatTTS.Chat()
-    chat.load_models(source=source, force_redownload=force_redownload, custom_path=local_path, compile=False)
+    chat.load_models(source=source, force_redownload=force_redownload, local_path=local_path)
     return chat
 
 
@@ -47,38 +44,19 @@ def deterministic(seed=0):
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
-
-def generate_audio_for_seed(chat, seed, texts, batch_size, speed, refine_text_prompt, roleid=None,
-                            temperature=DEFAULT_TEMPERATURE,
-                            top_P=DEFAULT_TOP_P, top_K=DEFAULT_TOP_K, cur_tqdm=None, skip_save=False,
-                            skip_refine_text=False, speaker_type="seed", pt_file=None):
+@spaces.GPU
+def generate_audio_for_seed(chat, seed, texts, batch_size, speed, refine_text_prompt, temperature=DEFAULT_TEMPERATURE,
+                            top_P=DEFAULT_TOP_P, top_K=DEFAULT_TOP_K, cur_tqdm=None, skip_save=False):
     from utils import combine_audio, save_audio, batch_split
-    print(f"speaker_type: {speaker_type}")
-    if speaker_type == "seed":
-        if seed in [None, -1, 0, "", "random"]:
-            seed = np.random.randint(0, 9999)
-        deterministic(seed)
-        rnd_spk_emb = chat.sample_random_speaker()
-    elif speaker_type == "role":
-        # Read the preset voices from the JSON file
-        with open('./slct_voice_240605.json', 'r', encoding='utf-8') as json_file:
-            slct_idx_loaded = json.load(json_file)
-        # Convert the stored lists back into Tensor objects
-        for key in slct_idx_loaded:
-            tensor_list = slct_idx_loaded[key]["tensor"]
-            slct_idx_loaded[key]["tensor"] = torch.tensor(tensor_list)
-        # Pack the voice tensor into params_infer_code to pin this voice; lower the temperature
-        rnd_spk_emb = slct_idx_loaded[roleid]["tensor"]
-        # temperature = 0.001
-    elif speaker_type == "pt":
-        print(pt_file)
-        rnd_spk_emb = torch.load(pt_file)
-        print(rnd_spk_emb.shape)
-        if rnd_spk_emb.shape != (768,):
-            raise ValueError("维度应为 768。")
-    else:
-        raise ValueError(f"Invalid speaker_type: {speaker_type}. ")
+    # torch.manual_seed(seed)
+    # top_P = 0.7,
+    # top_K = 20,
+    # temperature = 0.3,
+    if seed in [None, -1, 0, "", "random"]:
+        seed = np.random.randint(0, 9999)
 
+    deterministic(seed)
+    rnd_spk_emb = chat.sample_random_speaker()
     params_infer_code = {
         'spk_emb': rnd_spk_emb,
         'prompt': f'[speed_{speed}]',
@@ -99,16 +77,13 @@ def generate_audio_for_seed(chat, seed, texts, batch_size, speed, refine_text_pr
     if not cur_tqdm:
         cur_tqdm = tqdm
 
-    if re.search(r'\[uv_break\]|\[laugh\]', ''.join(texts)) is not None:
-        if not skip_refine_text:
-            print("Detected [uv_break] or [laugh] in text, skipping refine_text")
-        skip_refine_text = True
-
     for batch in cur_tqdm(batch_split(texts, batch_size), desc=f"Inferring audio for seed={seed}"):
         flag += len(batch)
-        _params_infer_code = {**params_infer_code}
-        wavs = chat.infer(batch, params_infer_code=_params_infer_code, params_refine_text=params_refine_text,
-                          use_decoder=True, skip_refine_text=skip_refine_text)
+        # refine_text = chat.infer(batch, params_infer_code=params_infer_code, params_refine_text=params_refine_text, refine_text_only=True)
+        # print(refine_text)
+        # exit()
+        wavs = chat.infer(batch, params_infer_code=params_infer_code, params_refine_text=params_refine_text,
+                          use_decoder=True, skip_refine_text=False)
         all_wavs.extend(wavs)
         clear_cuda_cache()
         if skip_save:
@@ -118,28 +93,9 @@ def generate_audio_for_seed(chat, seed, texts, batch_size, speed, refine_text_pr
         elapsed_time = end_time - start_time
         print(f"Saving audio for seed {seed}, took {elapsed_time:.2f}s")
         timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
-        wav_filename = f"chattts-[seed_{seed}][speed_{speed}]{refine_text_prompt}[{timestamp}].wav"
-        return save_audio(wav_filename, combined_audio)
-
-
-def generate_refine_text(chat, seed, text, refine_text_prompt, temperature=DEFAULT_TEMPERATURE,
-                         top_P=DEFAULT_TOP_P, top_K=DEFAULT_TOP_K):
-    if seed in [None, -1, 0, "", "random"]:
-        seed = np.random.randint(0, 9999)
-
-    deterministic(seed)
-
-    params_refine_text = {
-        'prompt': refine_text_prompt,
-        'top_P': top_P,
-        'top_K': top_K,
-        'temperature': temperature
-    }
-    print('params_refine_text:', text)
-    print('refine_text_prompt:', refine_text_prompt)
-    refine_text = chat.infer(text, params_refine_text=params_refine_text, refine_text_only=True, skip_refine_text=False)
-    print('refine_text:', refine_text)
-    return refine_text
+        wav_filename = f"long-[seed_{seed}][speed_{speed}]{refine_text_prompt}[{timestamp}].wav"
+        save_audio(wav_filename, combined_audio)
+        return wav_filename
 
 
 def tts(chat, text_file, seed, speed, oral, laugh, bk, seg, batch, progres=None):
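
The seed handling above is what makes voices reproducible: an empty or random seed is replaced by a fresh one, and the RNGs are reseeded via deterministic(seed) before the speaker embedding is sampled, so the same seed always yields the same voice. A minimal sketch of that pattern, assuming a loaded chat instance (the helper name pick_voice is ours, not the repo's):

    import numpy as np
    import torch

    def pick_voice(chat, seed=None):
        # a random/empty seed is replaced by a fresh one, mirroring the code above
        if seed in [None, -1, 0, "", "random"]:
            seed = np.random.randint(0, 9999)
        torch.manual_seed(seed)  # reseed before sampling the speaker
        spk_emb = chat.sample_random_speaker()
        return seed, spk_emb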
utils.py CHANGED
@@ -4,16 +4,9 @@ except ImportError:
     print("The 'cn2an' module is not installed. Please install it using 'pip install cn2an'.")
     exit(1)
 
-try:
-    import jieba
-except ImportError:
-    print("The 'jieba' module is not installed. Please install it using 'pip install jieba'.")
-    exit(1)
-
 import re
 import numpy as np
 import wave
-import jieba.posseg as pseg
 
 
 def save_audio(file_name, audio, rate=24000):
@@ -24,20 +17,13 @@ def save_audio(file_name, audio, rate=24000):
     :param rate:
     :return:
     """
-    import os
-    from config import DEFAULT_DIR
     audio = (audio * 32767).astype(np.int16)
 
-    # Make sure the default output directory exists
-    if not os.path.exists(DEFAULT_DIR):
-        os.makedirs(DEFAULT_DIR)
-    full_path = os.path.join(DEFAULT_DIR, file_name)
-    with wave.open(full_path, "w") as wf:
+    with wave.open(file_name, "w") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(rate)
        wf.writeframes(audio.tobytes())
-    return full_path
 
 
 def combine_audio(wavs):
@@ -101,32 +87,16 @@ def remove_chinese_punctuation(text):
     :param text:
     :return:
     """
-    chinese_punctuation_pattern = r"[:;!(),【】『』「」《》-‘“’”:,;!\(\)\[\]><\-·]"
-    text = re.sub(chinese_punctuation_pattern, '', text)
+    chinese_punctuation_pattern = r"[:;!(),【】『』「」《》-‘“’”:,;!\(\)\[\]><\-]"
+    text = re.sub(chinese_punctuation_pattern, ' ', text)
     # Collapse runs of consecutive full stops into a single one
-    text = re.sub(r'[。,]{2,}', '。', text)
-    # Strip leading and trailing commas
-    text = re.sub(r'^,|,$', '', text)
-    return text
-
-
-def remove_english_punctuation(text):
-    """
-    Replace the listed Chinese punctuation marks with ','
-    :param text:
-    :return:
-    """
-    chinese_punctuation_pattern = r"[:;!(),【】『』「」《》-‘“’”:,;!\(\)\[\]><\-·]"
-    text = re.sub(chinese_punctuation_pattern, ',', text)
-    # Collapse runs of consecutive full stops into a single one
-    text = re.sub(r'[,\.]{2,}', '.', text)
-    # Strip leading and trailing commas
-    text = re.sub(r'^,|,$', '', text)
+    text = re.sub(r'{2,}', '', text)
     return text
 
 
 def text_normalize(text):
     """
-    Normalize the text (PaddlePaddle version)
+    Normalize the text
     :param text:
     :return:
     """
@@ -134,7 +104,14 @@ def text_normalize(text):
     # ref: https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
     tx = TextNormalizer()
     sentences = tx.normalize(text)
+    # print(sentences)
+
     _txt = ''.join(sentences)
+    # Strip every character that is not Chinese or basic Chinese punctuation
+    _txt = re.sub(
+        r"[^\u4e00-\u9fa5,。!?、]+", "", _txt
+    )
+
     return _txt
 
 
@@ -147,20 +124,6 @@ def convert_numbers_to_chinese(text):
     return cn2an.transform(text, "an2cn")
 
 
-def detect_language(sentence):
-    # ref: https://github.com/2noise/ChatTTS/blob/main/ChatTTS/utils/infer_utils.py#L55
-    chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]')
-    english_word_pattern = re.compile(r'\b[A-Za-z]+\b')
-
-    chinese_chars = chinese_char_pattern.findall(sentence)
-    english_words = english_word_pattern.findall(sentence)
-
-    if len(chinese_chars) > len(english_words):
-        return "zh"
-    else:
-        return "en"
-
-
 def split_text(text, min_length=60):
     """
     Split the text into chunks no shorter than min_length
@@ -168,63 +131,33 @@ def split_text(text, min_length=60):
     :param min_length:
     :return:
     """
-    # Sentence-level delimiters
-    sentence_delimiters = re.compile(r'([。?!\.]+)')
-    # Runs of consecutive newlines force a paragraph break
-    paragraph_delimiters = re.compile(r'(\s*\n\s*)+')
-
-    paragraphs = re.split(paragraph_delimiters, text)
-
+    sentence_delimiters = re.compile(r'([。?!\.\n]+)')
+    sentences = re.split(sentence_delimiters, text)
+    # print(sentences)
+    # exit()
     result = []
-
-    for paragraph in paragraphs:
-        if not paragraph.strip():
-            continue  # skip empty paragraphs
-        # Paragraphs below the threshold become chunks of their own
-        if len(paragraph.strip()) < min_length:
-            result.append(paragraph.strip())
-            continue
-        # Longer paragraphs are split further
-        sentences = re.split(sentence_delimiters, paragraph)
-        current_sentence = ''
-        for sentence in sentences:
-            if re.match(sentence_delimiters, sentence):
-                current_sentence += sentence.strip() + ''
-                if len(current_sentence) >= min_length:
-                    result.append(current_sentence.strip())
-                    current_sentence = ''
-            else:
-                current_sentence += sentence.strip()
-
-        if current_sentence:
-            if len(current_sentence) < min_length and len(result) > 0:
-                result[-1] += current_sentence
-            else:
-                result.append(current_sentence)
-    if detect_language(text[:1024]) == "zh":
-        result = [normalize_zh(_.strip()) for _ in result if _.strip()]
-    else:
-        result = [normalize_en(_.strip()) for _ in result if _.strip()]
+    current_sentence = ''
+    for sentence in sentences:
+        if re.match(sentence_delimiters, sentence):
+            current_sentence += sentence.strip() + '。'
+            if len(current_sentence) >= min_length:
+                result.append(current_sentence.strip())
+                current_sentence = ''
+        else:
+            current_sentence += sentence.strip()
+    if current_sentence:
+        if len(current_sentence) < min_length and len(result) > 0:
+            result[-1] += current_sentence
+        else:
+            result.append(current_sentence)
+    # result = [convert_numbers_to_chinese(remove_chinese_punctuation(_.strip())) for _ in result if _.strip()]
+    result = [normalize_zh(_.strip()) for _ in result if _.strip()]
     return result
 
 
-def normalize_en(text):
-    # Text is no longer normalized outside ChatTTS
-    # from tn.english.normalizer import Normalizer
-    # normalizer = Normalizer()
-    # text = normalizer.normalize(text)
-    # text = remove_english_punctuation(text)
-    return text
-
-
 def normalize_zh(text):
-    # Text is no longer normalized outside ChatTTS
-    # from tn.chinese.normalizer import Normalizer
-    # normalizer = Normalizer()
-    # text = normalizer.normalize(text)
-    # text = remove_chinese_punctuation(text)
-    text = process_ddd(text)
-    return text
+    # return text_normalize(remove_chinese_punctuation(text))
+    return convert_numbers_to_chinese(remove_chinese_punctuation(text))
 
 
 def batch_split(items, batch_size=5):
@@ -256,76 +189,11 @@ def read_long_text(file_path):
     raise ValueError("无法识别文件编码")
 
 
-def replace_tokens(text):
-    remove_tokens = ['UNK']
-    for token in remove_tokens:
-        text = re.sub(r'\[' + re.escape(token) + r'\]', '', text)
-
-    tokens = ['uv_break', 'laugh', 'lbreak']
-    for token in tokens:
-        text = re.sub(r'\[' + re.escape(token) + r'\]', f'uu{token}uu', text)
-    text = text.replace('_', '')
-    return text
-
-
-def restore_tokens(text):
-    tokens = ['uvbreak', 'laugh', 'UNK', 'lbreak']
-    for token in tokens:
-        text = re.sub(r'uu' + re.escape(token) + r'uu', f'[{token}]', text)
-    text = text.replace('[uvbreak]', '[uv_break]')
-    return text
-
-
-def process_ddd(text):
-    """
-    Normalize uses of the particles "地" and "得" to "的".
-    Rationale: 地/得 mostly appear around verbs and adjectives; this does not follow
-    the grammar strictly, because misuse is common in real text.
-    jieba's segmentation accuracy also means some cases slip through, e.g. 小红帽疑惑地问
-    :param text: input text
-    :return: processed text
-    """
-    word_list = [(word, flag) for word, flag in pseg.cut(text, use_paddle=False)]
-    # print(word_list)
-    processed_words = []
-    for i, (word, flag) in enumerate(word_list):
-        if word in ["地", "得"]:
-            # Check previous and next word's flag
-            # prev_flag = word_list[i - 1][1] if i > 0 else None
-            # next_flag = word_list[i + 1][1] if i + 1 < len(word_list) else None
-
-            # if prev_flag in ['v', 'a'] or next_flag in ['v', 'a']:
-            if flag in ['uv', 'ud']:
-                processed_words.append("的")
-            else:
-                processed_words.append(word)
-        else:
-            processed_words.append(word)
-
-    return ''.join(processed_words)
-
-
-def replace_space_between_chinese(text):
-    return re.sub(r'(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])', '', text)
-
-
 if __name__ == '__main__':
-    # txts = [
-    #     "快速地跑过红色的大门",
-    #     "笑得很开心,学得很好",
-    #     "小红帽疑惑地问?",
-    #     "大灰狼慌张地回答",
-    #     "哦,这是为了更好地听你说话。",
-    #     "大灰狼不耐烦地说:“为了更好地抱你。”",
-    #     "他跑得很快,工作做得非常认真,这是他努力地结果。得到",
-    # ]
-    # for txt in txts:
-    #     print(txt, '-->', process_ddd(txt))
-
     txts = [
        "电影中梁朝伟扮演的陈永仁的编号27149",
        "这块黄金重达324.75克 我们班的最高总分为583分",
        "12\~23 -1.5\~2",
-        "居维埃·拉色别德①、杜梅里②、卡特法日③,"
    ]
    for txt in txts:
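
The reverted split_text greedily accumulates sentences until each chunk reaches min_length, then runs normalize_zh over every chunk (numbers converted to Chinese, listed punctuation replaced by spaces). A small usage sketch with arbitrary example text:

    from utils import split_text

    chunks = split_text("第一句话。第二句话。第三句话比较长,用来凑足最小长度。", min_length=10)
    for c in chunks:
        print(len(c), c)  # each chunk is at least ~10 characters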
webui_mix.py CHANGED
@@ -1,7 +1,3 @@
1
- import os
2
- import sys
3
-
4
- sys.path.insert(0, os.getcwd())
5
  import argparse
6
  import re
7
  import time
@@ -10,13 +6,12 @@ import pandas
10
  import numpy as np
11
  from tqdm import tqdm
12
  import random
 
13
  import gradio as gr
14
  import json
15
- from utils import normalize_zh, batch_split, normalize_audio, combine_audio
16
- from tts_model import load_chat_tts_model, clear_cuda_cache, generate_audio_for_seed
17
- from config import DEFAULT_BATCH_SIZE, DEFAULT_SPEED, DEFAULT_TEMPERATURE, DEFAULT_TOP_K, DEFAULT_TOP_P, DEFAULT_ORAL, \
18
- DEFAULT_LAUGH, DEFAULT_BK, DEFAULT_SEG_LENGTH
19
- import torch
20
 
21
  parser = argparse.ArgumentParser(description="Gradio ChatTTS MIX")
22
  parser.add_argument("--source", type=str, default="huggingface", help="Model source: 'huggingface' or 'local'.")
@@ -45,31 +40,30 @@ if not os.path.exists(SAVED_SEEDS_FILE):
45
 
46
  chat = load_chat_tts_model(source=args.source, local_path=args.local_path)
47
  # chat = None
48
- # chat = load_chat_tts_model(source="local", local_path=r"models")
49
 
50
  # 抽卡的最大数量
51
  max_audio_components = 10
52
 
 
 
 
 
 
 
 
53
  # 加载
54
  def load_seeds():
55
  with open(SAVED_SEEDS_FILE, "r") as f:
56
  global saved_seeds
57
-
58
- seeds = json.load(f)
59
-
60
- # 兼容旧的 JSON 格式,添加 path 字段
61
- for seed in seeds:
62
- if 'path' not in seed:
63
- seed['path'] = None
64
-
65
- saved_seeds = seeds
66
  return saved_seeds
67
 
68
 
69
  def display_seeds():
70
  seeds = load_seeds()
71
  # 转换为 List[List] 的形式
72
- return [[i, s['seed'], s['name'], s['path']] for i, s in enumerate(seeds)]
73
 
74
 
75
  saved_seeds = load_seeds()
@@ -84,14 +78,13 @@ def save_seeds():
84
 
85
 
86
  # 添加 seed
87
- def add_seed(seed, name, audio_path, save=True):
88
  for s in saved_seeds:
89
  if s['seed'] == seed:
90
  return False
91
  saved_seeds.append({
92
  'seed': seed,
93
- 'name': name,
94
- 'path': audio_path
95
  })
96
  if save:
97
  save_seeds()
@@ -117,7 +110,7 @@ def delete_seed(seed, save=True):
117
  return True
118
  return False
119
 
120
-
121
  def generate_seeds(num_seeds, texts, tq):
122
  """
123
  生成随机音频种子并保存
@@ -136,7 +129,7 @@ def generate_seeds(num_seeds, texts, tq):
136
  for _ in tq(range(num_seeds), desc=f"随机音色生成中..."):
137
  seed = np.random.randint(0, 9999)
138
 
139
- filename = generate_audio_for_seed(chat, seed, texts, 1, 5, "[oral_2][laugh_0][break_4]", None, 0.3, 0.7, 20)
140
  seeds.append((filename, seed))
141
  clear_cuda_cache()
142
 
@@ -144,12 +137,11 @@

  # Save the selected audio seed
- def do_save_seed(seed, audio_path):
-     print(f"Saving seed {seed} to {audio_path}")
      seed = seed.replace('保存种子 ', '').strip()
      if not seed:
          return
-     add_seed(int(seed), seed, audio_path)
      gr.Info(f"Seed {seed} has been saved.")

@@ -181,24 +173,11 @@ def do_delete_seed(val):
      return display_seeds()


- # Play a saved seed's audio
- def do_play_seed(val):
-     # Extract the index from val by matching [(\d+)]
-     index = re.search(r'\[(\d+)\]', val)
-     if index:
-         index = int(index.group(1))
-         seed = saved_seeds[index]['seed']
-         audio_path = saved_seeds[index]['path']
-         if audio_path:
-             return gr.update(visible=True, value=audio_path)
-     return gr.update(visible=False, value=None)
-
-
  def seed_change_btn():
      global SELECTED_SEED_INDEX
      if SELECTED_SEED_INDEX == -1:
-         return ['删除', '试听']
-     return [f'删除 idx=[{SELECTED_SEED_INDEX[0]}]', f'试听 idx=[{SELECTED_SEED_INDEX[0]}]']


  def audio_interface(num_seeds, texts, progress=gr.Progress()):
@@ -215,26 +194,11 @@ def audio_interface(num_seeds, texts, progress=gr.Progress()):
      # Pad out the shortfall
      all_wavs = wavs + [None] * (max_audio_components - len(wavs))
      all_seeds = seeds + [''] * (max_audio_components - len(seeds))
-     return [item for pair in zip(all_wavs, all_seeds, all_wavs) for item in pair]
-
-
- # Keep the file paths of the seeds just generated
- audio_paths = [gr.State(value=None) for _ in range(max_audio_components)]
-
-
- def audio_interface_with_paths(num_seeds, texts, progress=gr.Progress()):
-     """
-     Like audio_interface, but also carrying each audio's path
-     """
-     results = audio_interface(num_seeds, texts, progress)
-     wavs = results[::2]  # extract the audio file paths
-     for i, wav in enumerate(wavs):
-         audio_paths[i].value = wav  # assign directly to the State components
-     return results


  def audio_interface_empty(num_seeds, texts, progress=gr.Progress(track_tqdm=True)):
-     return [None, "", None] * max_audio_components

  def update_audio_components(slider_value):
@@ -242,9 +206,8 @@ def update_audio_components(slider_value):
      k = int(slider_value)
      audios = [gr.Audio(visible=True)] * k + [gr.Audio(visible=False)] * (max_audio_components - k)
      tbs = [gr.Textbox(visible=True)] * k + [gr.Textbox(visible=False)] * (max_audio_components - k)
-     stats = [gr.State(value=None)] * max_audio_components
      print(f'k={k}, audios={len(audios)}')
-     return [item for pair in zip(audios, tbs, stats) for item in pair]

  def seed_change(evt: gr.SelectData):
@@ -253,11 +216,11 @@ def seed_change(evt: gr.SelectData):
253
  SELECTED_SEED_INDEX = evt.index
254
  return evt.index
255
 
256
-
257
  def generate_tts_audio(text_file, num_seeds, seed, speed, oral, laugh, bk, min_length, batch_size, temperature, top_P,
258
- top_K, roleid=None, refine_text=True, speaker_type="seed", pt_file=None, progress=gr.Progress()):
259
  from tts_model import generate_audio_for_seed
260
- from utils import split_text, replace_tokens, restore_tokens
261
  if seed in [0, -1, None]:
262
  seed = random.randint(1, 9999)
263
  content = ''
@@ -265,151 +228,19 @@ def generate_tts_audio(text_file, num_seeds, seed, speed, oral, laugh, bk, min_l
265
  content = ""
266
  elif isinstance(text_file, str):
267
  content = text_file
268
- # 将 [uv_break] [laugh] 替换为 _uv_break_ _laugh_ 处理后再还原
269
- content = replace_tokens(content)
270
  texts = split_text(content, min_length=min_length)
271
- for i, text in enumerate(texts):
272
- texts[i] = restore_tokens(text)
273
 
274
  if oral < 0 or oral > 9 or laugh < 0 or laugh > 2 or bk < 0 or bk > 7:
275
  raise ValueError("oral_(0-9), laugh_(0-2), break_(0-7) out of range")
276
 
277
  refine_text_prompt = f"[oral_{oral}][laugh_{laugh}][break_{bk}]"
278
  try:
279
- output_files = generate_audio_for_seed(
280
- chat=chat,
281
- seed=seed,
282
- texts=texts,
283
- batch_size=batch_size,
284
- speed=speed,
285
- refine_text_prompt=refine_text_prompt,
286
- roleid=roleid,
287
- temperature=temperature,
288
- top_P=top_P,
289
- top_K=top_K,
290
- cur_tqdm=progress.tqdm,
291
- skip_save=False,
292
- skip_refine_text=not refine_text,
293
- speaker_type=speaker_type,
294
- pt_file=pt_file,
295
- )
296
  return output_files
297
  except Exception as e:
298
- raise e
299
-
300
-
301
- def generate_tts_audio_stream(text_file, num_seeds, seed, speed, oral, laugh, bk, min_length, batch_size, temperature,
-                               top_P,
-                               top_K, roleid=None, refine_text=True, speaker_type="seed", pt_file=None,
-                               stream_mode="fake"):
-     from utils import split_text, replace_tokens, restore_tokens
-     from tts_model import deterministic
-     if seed in [0, -1, None]:
-         seed = random.randint(1, 9999)
-     content = ''
-     if os.path.isfile(text_file):
-         content = ""
-     elif isinstance(text_file, str):
-         content = text_file
-     # Swap [uv_break] [laugh] for _uv_break_ _laugh_ while processing, then restore them
-     content = replace_tokens(content)
-     # texts = [normalize_zh(_) for _ in content.split('\n') if _.strip()]
-     texts = split_text(content, min_length=min_length)
-
-     for i, text in enumerate(texts):
-         texts[i] = restore_tokens(text)
-
-     if oral < 0 or oral > 9 or laugh < 0 or laugh > 2 or bk < 0 or bk > 7:
-         raise ValueError("oral_(0-9), laugh_(0-2), break_(0-7) out of range")
-
-     refine_text_prompt = f"[oral_{oral}][laugh_{laugh}][break_{bk}]"
-
-     print(f"speaker_type: {speaker_type}")
-     if speaker_type == "seed":
-         if seed in [None, -1, 0, "", "random"]:
-             seed = np.random.randint(0, 9999)
-         deterministic(seed)
-         rnd_spk_emb = chat.sample_random_speaker()
-     elif speaker_type == "role":
-         # Read the data from the JSON file
-         with open('./slct_voice_240605.json', 'r', encoding='utf-8') as json_file:
-             slct_idx_loaded = json.load(json_file)
-         # Convert the parts holding tensor data back into Tensor objects
-         for key in slct_idx_loaded:
-             tensor_list = slct_idx_loaded[key]["tensor"]
-             slct_idx_loaded[key]["tensor"] = torch.tensor(tensor_list)
-         # Pack the voice tensor into params_infer_code so this voice is used throughout; lower the temperature
-         rnd_spk_emb = slct_idx_loaded[roleid]["tensor"]
-         # temperature = 0.001
-     elif speaker_type == "pt":
-         print(pt_file)
-         rnd_spk_emb = torch.load(pt_file)
-         print(rnd_spk_emb.shape)
-         if rnd_spk_emb.shape != (768,):
-             raise ValueError("维度应为 768。")
-     else:
-         raise ValueError(f"Invalid speaker_type: {speaker_type}. ")
-
-     params_infer_code = {
-         'spk_emb': rnd_spk_emb,
-         'prompt': f'[speed_{speed}]',
-         'top_P': top_P,
-         'top_K': top_K,
-         'temperature': temperature
-     }
-     params_refine_text = {
-         'prompt': refine_text_prompt,
-         'top_P': top_P,
-         'top_K': top_K,
-         'temperature': temperature
-     }
-
-     if stream_mode == "real":
-         for text in texts:
-             _params_infer_code = {**params_infer_code}
-             wavs_gen = chat.infer(text, params_infer_code=_params_infer_code, params_refine_text=params_refine_text,
-                                   use_decoder=True, skip_refine_text=True, stream=True)
-             for gen in wavs_gen:
-                 wavs = [np.array([[]])]
-                 wavs[0] = np.hstack([wavs[0], np.array(gen[0])])
-                 audio = wavs[0][0]
-                 yield 24000, normalize_audio(audio)
-
-             clear_cuda_cache()
-     else:
-         for text in batch_split(texts, batch_size):
-             _params_infer_code = {**params_infer_code}
-             wavs = chat.infer(text, params_infer_code=_params_infer_code, params_refine_text=params_refine_text,
-                               use_decoder=True, skip_refine_text=False, stream=False)
-             combined_audio = combine_audio(wavs)
-             yield 24000, combined_audio[0]
-
-
- def generate_refine(text_file, oral, laugh, bk, temperature, top_P, top_K, progress=gr.Progress()):
-     from tts_model import generate_refine_text
-     from utils import split_text, replace_tokens, restore_tokens, replace_space_between_chinese
-     seed = random.randint(1, 9999)
-     refine_text_prompt = f"[oral_{oral}][laugh_{laugh}][break_{bk}]"
-     content = ''
-     if os.path.isfile(text_file):
-         content = ""
-     elif isinstance(text_file, str):
-         content = text_file
-     if re.search(r'\[uv_break\]|\[laugh\]', content) is not None:
-         gr.Info("检测到 [uv_break] [laugh],不能重复 refine ")
-         # print("检测到 [uv_break] [laugh],不能重复 refine ")
-         return content
-     batch_size = 5
-
-     content = replace_tokens(content)
-     texts = split_text(content, min_length=120)
-     print(texts)
-     for i, text in enumerate(texts):
-         texts[i] = restore_tokens(text)
-     txts = []
-     for batch in progress.tqdm(batch_split(texts, batch_size), desc=f"Refine Text Please Wait ..."):
-         txts.extend(generate_refine_text(chat, seed, batch, refine_text_prompt, temperature, top_P, top_K))
-     return replace_space_between_chinese('\n\n'.join(txts))

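The removed streaming path distinguished "real" streaming, which yields each partial chunk as chat.infer produces it, from "fake" streaming, which synthesizes a whole batch and then yields it in one piece. Schematically, with infer_streaming and synthesize as hypothetical stand-ins for the chat.infer calls above:

    def stream_audio(texts, batch_size=4, mode="fake"):
        if mode == "real":
            for text in texts:
                for chunk in infer_streaming(text):   # hypothetical per-chunk generator
                    yield 24000, chunk                # (sample rate, partial audio)
        else:
            for group in batch_split(texts, batch_size):
                yield 24000, synthesize(group)        # hypothetical whole-batch synthesis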
  def generate_seed():
@@ -422,28 +253,10 @@ def generate_seed():

  def update_label(text):
      word_count = len(text)
-     return gr.update(label=f"朗读文本({word_count} 字)")
-
-
- def inser_token(text, btn):
-     if btn == "+笑声":
-         return gr.update(
-             value=text + "[laugh]"
-         )
-     elif btn == "+停顿":
-         return gr.update(
-             value=text + "[uv_break]"
-         )


  with gr.Blocks() as demo:
-     # Project link
-     gr.Markdown("""
-     <div style='text-align: center; font-size: 16px;'>
-     🌟 <a href='https://github.com/6drf21e/ChatTTS_colab'>项目地址 欢迎 start</a> 🌟
-     </div>
-     """)
-
      with gr.Tab("音色抽卡"):
          with gr.Row():
              with gr.Column(scale=1):
@@ -454,10 +267,6 @@ with gr.Blocks() as demo:
                  ]
                  # gr.Markdown("### 随机音色抽卡")
                  gr.Markdown("""
-                 免抽卡,直接找稳定音色👇
-
-                 [ModelScope ChatTTS Speaker(国内)](https://modelscope.cn/studios/ttwwwaa/ChatTTS_Speaker) | [HuggingFace ChatTTS Speaker(国外)](https://huggingface.co/spaces/taa/ChatTTS_Speaker)
-
                  在相同的 seed 和 温度等参数下,音色具有一定的一致性。点击下面的“随机音色生成”按钮将生成多个 seed。找到满意的音色后,点击音频下方“保存”按钮。
                  **注意:不同机器使用相同种子生成的音频音色可能不同,同一机器使用相同种子多次生成的音频音色也可能变化。**
                  """)
@@ -474,29 +283,21 @@ with gr.Blocks() as demo:
                  gr.Markdown("### 种子管理界面")
                  seed_list = gr.DataFrame(
                      label="种子列表",
-                     headers=["Index", "Seed", "Name", "Path"],
-                     datatype=["number", "number", "str", "str"],
                      interactive=True,
-                     col_count=(4, "fixed"),
-                     value=display_seeds
                  )
-
                  with gr.Row():
                      refresh_button = gr.Button("刷新")
                      save_button = gr.Button("保存")
                      del_button = gr.Button("删除")
-                     play_button = gr.Button("试听")
-
-                 with gr.Row():
-                     # Audio player for previously saved seeds
-                     audio_player = gr.Audio(label="播放已保存种子音频", visible=False)
-
                  # Wire the buttons to their handlers
                  refresh_button.click(display_seeds, outputs=seed_list)
-                 seed_list.select(seed_change).success(seed_change_btn, outputs=[del_button, play_button])
                  save_button.click(do_save_seeds, inputs=[seed_list], outputs=None)
                  del_button.click(do_delete_seed, inputs=del_button, outputs=seed_list)
-                 play_button.click(do_play_seed, inputs=play_button, outputs=audio_player)

              with gr.Column(scale=1):
                  audio_components = []
@@ -504,13 +305,12 @@ with gr.Blocks() as demo:
                      visible = i < num_seeds_default
                      a = gr.Audio(f"Audio {i}", visible=visible)
                      t = gr.Button(f"Seed", visible=visible)
-                     s = gr.State(value=None)
-                     t.click(do_save_seed, inputs=[t, s], outputs=None).success(display_seeds, outputs=seed_list)
                      audio_components.append(a)
                      audio_components.append(t)
-                     audio_components.append(s)

                  num_seeds.change(update_audio_components, inputs=num_seeds, outputs=audio_components)

                  # output = gr.Column()
                  # audio = gr.Audio(label="Output Audio")

@@ -530,136 +330,46 @@ with gr.Blocks() as demo:
                                                 placeholder="Please Input Text...", value=default_text)
                  # Call update_label whenever the textbox content changes
                  text_file_input.change(update_label, inputs=text_file_input, outputs=text_file_input)
-                 # Buttons that insert pause / laugh tokens
-                 with gr.Row():
-                     break_button = gr.Button("+停顿", variant="secondary")
-                     laugh_button = gr.Button("+笑声", variant="secondary")
-                     refine_button = gr.Button("Refine Text(预处理 加入停顿词、笑声等)", variant="secondary")

              with gr.Column():
                  gr.Markdown("### 配置参数")
                  with gr.Row():
-                     with gr.Column():
-                         gr.Markdown("音色选择")
-                         num_seeds_input = gr.Number(label="生成音频的数量", value=1, precision=0, visible=False)
-                         speaker_stat = gr.State(value="seed")
-                         tab_seed = gr.Tab(label="种子")
-                         with tab_seed:
-                             with gr.Row():
-                                 seed_input = gr.Number(label="指定种子", info="种子决定音色 0则随机", value=None,
-                                                        precision=0)
-                                 generate_audio_seed = gr.Button("\U0001F3B2")
-                         tab_roleid = gr.Tab(label="内置音色")
-                         with tab_roleid:
-                             roleid_input = gr.Dropdown(label="内置音色",
-                                                        choices=[("发姐", "1"),
-                                                                 ("纯情男大学生", "2"),
-                                                                 ("阳光开朗大男孩", "3"),
-                                                                 ("知心小姐姐", "4"),
-                                                                 ("电视台女主持", "5"),
-                                                                 ("魅力大叔", "6"),
-                                                                 ("优雅甜美", "7"),
-                                                                 ("贴心男宝2", "21"),
-                                                                 ("正式打工人", "8"),
-                                                                 ("贴心男宝1", "9")],
-                                                        value="1",
-                                                        info="选择音色后会覆盖种子。感谢 @QuantumDriver 提供音色")
-                         tab_pt = gr.Tab(label="上传.PT文件")
-                         with tab_pt:
-                             pt_input = gr.File(label="上传音色文件", file_types=[".pt"], height=100)

                  with gr.Row():
-                     style_select = gr.Radio(label="预设参数", info="语速部分可自行更改",
-                                             choices=["小说朗读", "对话", "中英混合", "默认"], value="默认",
-                                             interactive=True, )
-                 with gr.Row():
-                     # refine
-                     refine_text_input = gr.Checkbox(label="Refine",
-                                                     info="打开后会自动根据下方参数添加笑声/停顿等。关闭后可自行添加 [uv_break] [laugh] 或者点击下方 Refin按钮先行转换",
-                                                     value=True)
-                     speed_input = gr.Slider(label="语速", minimum=1, maximum=10, value=DEFAULT_SPEED, step=1)
-                 with gr.Row():
-                     oral_input = gr.Slider(label="口语化", minimum=0, maximum=9, value=DEFAULT_ORAL, step=1)
-                     laugh_input = gr.Slider(label="笑声", minimum=0, maximum=2, value=DEFAULT_LAUGH, step=1)
-                     bk_input = gr.Slider(label="停顿", minimum=0, maximum=7, value=DEFAULT_BK, step=1)
                  # gr.Markdown("### 文本参数")
                  with gr.Row():
-                     min_length_input = gr.Number(label="文本分段长度", info="大于这个数值进行分段",
-                                                  value=DEFAULT_SEG_LENGTH, precision=0)
-                     batch_size_input = gr.Number(label="批大小", info="越高越快 太高爆显存 4G推荐3 其他酌情",
-                                                  value=DEFAULT_BATCH_SIZE, precision=0)
                  with gr.Accordion("其他参数", open=False):
                      with gr.Row():
                          # temperature, top_P, top_K
-                         temperature_input = gr.Slider(label="温度", minimum=0.01, maximum=1.0, step=0.01,
-                                                       value=DEFAULT_TEMPERATURE)
-                         top_P_input = gr.Slider(label="top_P", minimum=0.1, maximum=0.9, step=0.05, value=DEFAULT_TOP_P)
-                         top_K_input = gr.Slider(label="top_K", minimum=1, maximum=20, step=1, value=DEFAULT_TOP_K)
                          # reset button
                          reset_button = gr.Button("重置")

                  with gr.Row():
-                     with gr.Column():
-                         generate_button = gr.Button("生成音频", variant="primary")
-                     with gr.Column():
-                         generate_button_stream = gr.Button("流式生成音频(一边播放一边推理)", variant="primary")
-                         stream_select = gr.Radio(label="流输出方式",
-                                                  info="真流式为实验功能,播放效果:卡播卡播卡播(⏳🎵⏳🎵⏳🎵);伪流式为分段推理后输出,播放效果:卡卡卡播播播播(⏳⏳🎵🎵🎵🎵)。伪流式批次建议4以上减少卡顿",
-                                                  choices=[("真", "real"), ("伪", "fake")], value="fake", interactive=True, )

                  with gr.Row():
                      output_audio = gr.Audio(label="生成的音频文件")
-                     output_audio_stream = gr.Audio(label="流式音频", value=None,
-                                                    streaming=True,
-                                                    autoplay=True,
-                                                    # disable auto play for Windows, due to https://developer.chrome.com/blog/autoplay#webaudio
-                                                    interactive=False,
-                                                    show_label=True)

      generate_audio_seed.click(generate_seed,
                                inputs=[],
                                outputs=seed_input)

-
-     def do_tab_change(evt: gr.SelectData):
-         print(evt.selected, evt.index, evt.value, evt.target)
-         kv = {
-             "种子": "seed",
-             "内置音色": "role",
-             "上传.PT文件": "pt"
-         }
-         return kv.get(evt.value, "seed")
-
-
-     tab_seed.select(do_tab_change, outputs=speaker_stat)
-     tab_roleid.select(do_tab_change, outputs=speaker_stat)
-     tab_pt.select(do_tab_change, outputs=speaker_stat)
-
-
-     def do_style_select(x):
-         if x == "小说朗读":
-             return [4, 0, 0, 2]
-         elif x == "对话":
-             return [5, 5, 1, 4]
-         elif x == "中英混合":
-             return [4, 1, 0, 3]
-         else:
-             return [DEFAULT_SPEED, DEFAULT_ORAL, DEFAULT_LAUGH, DEFAULT_BK]
-
-
-     # style_select change handler
-     style_select.change(
-         do_style_select,
-         inputs=style_select,
-         outputs=[speed_input, oral_input, laugh_input, bk_input]
-     )
-
-     # refine button
-     refine_button.click(
-         generate_refine,
-         inputs=[text_file_input, oral_input, laugh_input, bk_input, temperature_input, top_P_input, top_K_input],
-         outputs=text_file_input
-     )
      # Reset button: restores temperature and related params
      reset_button.click(
          lambda: [0.3, 0.7, 20],
@@ -682,50 +392,9 @@ with gr.Blocks() as demo:
              temperature_input,
              top_P_input,
              top_K_input,
-             roleid_input,
-             refine_text_input,
-             speaker_stat,
-             pt_input
          ],
          outputs=[output_audio]
      )
-
-     generate_button_stream.click(
-         fn=generate_tts_audio_stream,
-         inputs=[
-             text_file_input,
-             num_seeds_input,
-             seed_input,
-             speed_input,
-             oral_input,
-             laugh_input,
-             bk_input,
-             min_length_input,
-             batch_size_input,
-             temperature_input,
-             top_P_input,
-             top_K_input,
-             roleid_input,
-             refine_text_input,
-             speaker_stat,
-             pt_input,
-             stream_select
-         ],
-         outputs=[output_audio_stream]
-     )
-
-     break_button.click(
-         inser_token,
-         inputs=[text_file_input, break_button],
-         outputs=text_file_input
-     )
-
-     laugh_button.click(
-         inser_token,
-         inputs=[text_file_input, laugh_button],
-         outputs=text_file_input
-     )
-

  with gr.Tab("角色扮演"):
          def txt_2_script(text):
              lines = text.split("\n")
@@ -757,7 +426,7 @@ with gr.Blocks() as demo:
              characters = list([_["character"] for _ in lines])
              unique_characters = list(dict.fromkeys(characters))
              print([[character, 0] for character in unique_characters])
-             return [[character, 0, 5, 2, 0, 4] for character in unique_characters]


          def get_txt_characters(text):
@@ -784,7 +453,7 @@ with gr.Blocks() as demo:
              scripts = llm_operation(api_base, api_key, model, LLM_PROMPT, text, required_keys=["txt", "character"])
              return script_2_txt(scripts)

-
          def generate_script_audio(text, models_seeds, progress=gr.Progress()):
              scripts = txt_2_script(text)  # Convert the text into a script
              characters = get_characters(scripts)  # Extract the characters from the script
@@ -795,6 +464,7 @@ with gr.Blocks() as demo:
              import itertools
              from tts_model import generate_audio_for_seed
              from utils import combine_audio, save_audio, normalize_zh

              assert isinstance(models_seeds, pd.DataFrame)

@@ -807,40 +477,18 @@ with gr.Blocks() as demo:
                      break
                  yield batch

-             column_mapping = {
-                 '角色': 'character',
-                 '种子': 'seed',
-                 '语速': 'speed',
-                 '口语': 'oral',
-                 '笑声': 'laugh',
-                 '停顿': 'break'
-             }
-             # Rename the DataFrame columns via rename
-             models_seeds = models_seeds.rename(columns=column_mapping).to_dict(orient='records')
-             # models_seeds = models_seeds.to_dict(orient='records')

              # Check that every character has a corresponding seed
-             print(models_seeds)
-             seed_lookup = {seed['character']: seed for seed in models_seeds}
-
-             character_seeds = {}
-             missing_seeds = []
-             # Iterate over all characters
-             for character in characters:
-                 character_name = character[0]
-                 seed_info = seed_lookup.get(character_name)
-                 if seed_info:
-                     character_seeds[character_name] = seed_info
-                 else:
-                     missing_seeds.append(character_name)
-
-             if missing_seeds:
-                 missing_characters_str = ', '.join(missing_seeds)
-                 gr.Info(f"以下角色没有种子,请先设置种子:{missing_characters_str}")
-                 return None
-
-             print(character_seeds)
-             # return
              refine_text_prompt = "[oral_2][laugh_0][break_4]"
              all_wavs = []

@@ -854,21 +502,13 @@ with gr.Blocks() as demo:
              batch_size = 5  # batch size
              # Process per character
              for character, lines in progress.tqdm(grouped_lines.items(), desc="生成剧本音频"):
-                 info = character_seeds[character]
-                 seed = info["seed"]
-                 speed = info["speed"]
-                 orla = info["oral"]
-                 laugh = info["laugh"]
-                 bk = info["break"]
-
-                 refine_text_prompt = f"[oral_{orla}][laugh_{laugh}][break_{bk}]"
-
                  # Process in batches
                  for batch_lines in batch(lines, batch_size):
                      texts = [normalize_zh(line["txt"]) for line in batch_lines]
-                     print(f"seed={seed} t={texts} c={character} s={speed} r={refine_text_prompt}")
-                     wavs = generate_audio_for_seed(chat, int(seed), texts, DEFAULT_BATCH_SIZE, speed,
-                                                    refine_text_prompt, None, DEFAULT_TEMPERATURE, DEFAULT_TOP_P,
                                                     DEFAULT_TOP_K, skip_save=True)  # batch-process the texts
                      batch_results[character].extend(wavs)

@@ -880,7 +520,8 @@ with gr.Blocks() as demo:
              # Combine all audio
              audio = combine_audio(all_wavs)
              fname = f"script_{int(time.time())}.wav"
-             return save_audio(fname, audio)

  script_example = {
@@ -915,7 +556,7 @@ with gr.Blocks() as demo:
              "txt": "当小红帽到达奶奶家时,她发现大灰狼伪装成了奶奶。",
              "character": "旁白"
          }, {
-             "txt": "小红帽疑惑的问",
              "character": "旁白"
          }, {
              "txt": "奶奶,你的耳朵怎么这么尖?",
@@ -964,7 +605,7 @@ with gr.Blocks() as demo:
                                                 placeholder="请输入API Base URL",
                                                 value=r"https://api.openai.com/v1")
                  openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="请输入API Key",
-                                                   value="sk-xxxxxxx", type="password")
                  # AI prompt text
                  ai_text_input = gr.Textbox(label="剧情简介或者一段故事", placeholder="请输入文本...", lines=2,
                                             value=ai_text_default)
@@ -975,7 +616,7 @@ with gr.Blocks() as demo:
              with gr.Column(scale=3):
                  gr.Markdown("### 脚本")
                  gr.Markdown(
-                     "脚本可以手工编写也可以从左侧的AI脚本生成按钮生成。脚本格式 **角色::文本** 一行为一句” 注意是::")
                  script_text = "\n".join(
                      [f"{_.get('character', '')}::{_.get('txt', '')}" for _ in script_example['lines']])

@@ -987,20 +628,20 @@ with gr.Blocks() as demo:
              with gr.Column(scale=1):
                  gr.Markdown("### 角色种子")
                  # DataFrame holding the converted script
-                 # Default data [speed_5][oral_2][laugh_0][break_4]
                  default_data = [
-                     ["旁白", 2222, 3, 0, 0, 2],
-                     ["年轻女性", 2, 5, 2, 0, 2],
-                     ["中年男性", 2424, 5, 2, 0, 2]
                  ]

                  script_data = gr.DataFrame(
                      value=default_data,
                      label="角色对应的音色种子,从抽卡那获取",
-                     headers=["角色", "种子", "语速", "口语", "笑声", "停顿"],
-                     datatype=["str", "number", "number", "number", "number", "number"],
                      interactive=True,
-                     col_count=(6, "fixed"),
                  )
                  # Button to generate the audio
                  script_generate_audio = gr.Button("步骤②:生成音频")
@@ -1033,4 +674,4 @@ with gr.Blocks() as demo:
              outputs=[script_audio]
          )

- demo.launch(share=args.share, inbrowser=True)

  import argparse
  import re
  import time

  import numpy as np
  from tqdm import tqdm
  import random
+ import os
  import gradio as gr
  import json
+ from utils import combine_audio, save_audio, batch_split, normalize_zh
+ from tts_model import load_chat_tts_model, clear_cuda_cache, deterministic, generate_audio_for_seed
+ import spaces

  parser = argparse.ArgumentParser(description="Gradio ChatTTS MIX")
  parser.add_argument("--source", type=str, default="huggingface", help="Model source: 'huggingface' or 'local'.")
 
  chat = load_chat_tts_model(source=args.source, local_path=args.local_path)
  # chat = None
+ # chat = load_chat_tts_model(source="local", local_path="models")

  # Maximum number of voices per gacha draw
  max_audio_components = 10

+
+ # print("loading ChatTTS model...")
+ # chat = ChatTTS.Chat()
+ # chat.load_models(source="local", local_path="models")
+ # torch.cuda.empty_cache()
+
+
  # Load saved seeds
  def load_seeds():
      with open(SAVED_SEEDS_FILE, "r") as f:
          global saved_seeds
+         saved_seeds = json.load(f)
      return saved_seeds


  def display_seeds():
      seeds = load_seeds()
      # Convert to List[List] form
+     return [[i, s['seed'], s['name']] for i, s in enumerate(seeds)]


  saved_seeds = load_seeds()
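Given what add_seed writes below, SAVED_SEEDS_FILE is presumably a JSON list of seed/name objects, for example:

    # contents of SAVED_SEEDS_FILE (values are examples only)
    [{"seed": 2222, "name": "2222"}, {"seed": 2424, "name": "2424"}]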
 
  # Add a seed
+ def add_seed(seed, name, save=True):
      for s in saved_seeds:
          if s['seed'] == seed:
              return False
      saved_seeds.append({
          'seed': seed,
+         'name': name
      })
      if save:
          save_seeds()

          return True
      return False

+ @spaces.GPU
  def generate_seeds(num_seeds, texts, tq):
      """
      Generate random voice seeds and save them

      for _ in tq(range(num_seeds), desc=f"随机音色生成中..."):
          seed = np.random.randint(0, 9999)

+         filename = generate_audio_for_seed(chat, seed, texts, 1, 5, "[oral_2][laugh_0][break_4]", 0.3, 0.7, 20)
          seeds.append((filename, seed))
          clear_cuda_cache()

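The @spaces.GPU decorator added here comes from HuggingFace's spaces package (ZeroGPU): the Space holds a GPU only while the decorated function runs. The pattern in isolation:

    import spaces

    @spaces.GPU
    def heavy_inference(text):
        # CUDA work happens here; the GPU is released when the call returns
        ...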
 
  # Save the selected audio seed
+ def do_save_seed(seed):
      seed = seed.replace('保存种子 ', '').strip()
      if not seed:
          return
+     add_seed(int(seed), seed)
      gr.Info(f"Seed {seed} has been saved.")

      return display_seeds()


  def seed_change_btn():
      global SELECTED_SEED_INDEX
      if SELECTED_SEED_INDEX == -1:
+         return '删除'
+     return f'删除 idx=[{SELECTED_SEED_INDEX[0]}]'


  def audio_interface(num_seeds, texts, progress=gr.Progress()):
      # Pad out the shortfall
      all_wavs = wavs + [None] * (max_audio_components - len(wavs))
      all_seeds = seeds + [''] * (max_audio_components - len(seeds))
+     return [item for pair in zip(all_wavs, all_seeds) for item in pair]


  def audio_interface_empty(num_seeds, texts, progress=gr.Progress(track_tqdm=True)):
+     return [None, ""] * max_audio_components


  def update_audio_components(slider_value):
      k = int(slider_value)
      audios = [gr.Audio(visible=True)] * k + [gr.Audio(visible=False)] * (max_audio_components - k)
      tbs = [gr.Textbox(visible=True)] * k + [gr.Textbox(visible=False)] * (max_audio_components - k)
      print(f'k={k}, audios={len(audios)}')
+     return [item for pair in zip(audios, tbs) for item in pair]
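The zip-then-flatten idiom interleaves the parallel component lists into the flat order Gradio expects for outputs:

    audios = ['a0', 'a1']
    tbs = ['t0', 't1']
    print([item for pair in zip(audios, tbs) for item in pair])  # ['a0', 't0', 'a1', 't1']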


  def seed_change(evt: gr.SelectData):
      SELECTED_SEED_INDEX = evt.index
      return evt.index

+ @spaces.GPU
  def generate_tts_audio(text_file, num_seeds, seed, speed, oral, laugh, bk, min_length, batch_size, temperature, top_P,
+                        top_K, progress=gr.Progress()):
      from tts_model import generate_audio_for_seed
+     from utils import split_text
      if seed in [0, -1, None]:
          seed = random.randint(1, 9999)
      content = ''

          content = ""
      elif isinstance(text_file, str):
          content = text_file
      texts = split_text(content, min_length=min_length)
+     print(texts)

      if oral < 0 or oral > 9 or laugh < 0 or laugh > 2 or bk < 0 or bk > 7:
          raise ValueError("oral_(0-9), laugh_(0-2), break_(0-7) out of range")

      refine_text_prompt = f"[oral_{oral}][laugh_{laugh}][break_{bk}]"
      try:
+         output_files = generate_audio_for_seed(chat, seed, texts, batch_size, speed, refine_text_prompt, temperature,
+                                                top_P, top_K, progress.tqdm)
          return output_files
      except Exception as e:
+         return str(e)

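The refine_text_prompt built above is nothing more than three bracketed control tokens concatenated:

    oral, laugh, bk = 2, 0, 4
    print(f"[oral_{oral}][laugh_{laugh}][break_{bk}]")  # -> [oral_2][laugh_0][break_4]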
  def generate_seed():
 

  def update_label(text):
      word_count = len(text)
+     return gr.update(label=f"朗读文本(字数: {word_count})")


  with gr.Blocks() as demo:
      with gr.Tab("音色抽卡"):
          with gr.Row():
              with gr.Column(scale=1):
                  ]
                  # gr.Markdown("### 随机音色抽卡")
                  gr.Markdown("""
                  在相同的 seed 和 温度等参数下,音色具有一定的一致性。点击下面的“随机音色生成”按钮将生成多个 seed。找到满意的音色后,点击音频下方“保存”按钮。
                  **注意:不同机器使用相同种子生成的音频音色可能不同,同一机器使用相同种子多次生成的音频音色也可能变化。**
                  """)

                  gr.Markdown("### 种子管理界面")
                  seed_list = gr.DataFrame(
                      label="种子列表",
+                     headers=["Index", "Seed", "Name"],
+                     datatype=["number", "number", "str"],
                      interactive=True,
+                     col_count=(3, "fixed"),
+                     value=display_seeds()
                  )

                  with gr.Row():
                      refresh_button = gr.Button("刷新")
                      save_button = gr.Button("保存")
                      del_button = gr.Button("删除")

                  # Wire the buttons to their handlers
                  refresh_button.click(display_seeds, outputs=seed_list)
+                 seed_list.select(seed_change).success(seed_change_btn, outputs=[del_button])
                  save_button.click(do_save_seeds, inputs=[seed_list], outputs=None)
                  del_button.click(do_delete_seed, inputs=del_button, outputs=seed_list)

              with gr.Column(scale=1):
                  audio_components = []
                      visible = i < num_seeds_default
                      a = gr.Audio(f"Audio {i}", visible=visible)
                      t = gr.Button(f"Seed", visible=visible)
+                     t.click(do_save_seed, inputs=[t], outputs=None).success(display_seeds, outputs=seed_list)
                      audio_components.append(a)
                      audio_components.append(t)

                  num_seeds.change(update_audio_components, inputs=num_seeds, outputs=audio_components)
+
                  # output = gr.Column()
                  # audio = gr.Audio(label="Output Audio")

                                                 placeholder="Please Input Text...", value=default_text)
                  # Call update_label whenever the textbox content changes
                  text_file_input.change(update_label, inputs=text_file_input, outputs=text_file_input)

              with gr.Column():
                  gr.Markdown("### 配置参数")
+                 gr.Markdown("根据需要配置以下参数来生成音频。")
                  with gr.Row():
+                     num_seeds_input = gr.Number(label="生成音频的数量", value=1, precision=0, visible=False)
+                     seed_input = gr.Number(label="指定种子(留空则随机)", value=None, precision=0)
+                     generate_audio_seed = gr.Button("\U0001F3B2")

                  with gr.Row():
+                     speed_input = gr.Slider(label="语速", minimum=1, maximum=10, value=5, step=1)
+                     oral_input = gr.Slider(label="口语化", minimum=0, maximum=9, value=2, step=1)
+
+                     laugh_input = gr.Slider(label="笑声", minimum=0, maximum=2, value=0, step=1)
+                     bk_input = gr.Slider(label="停顿", minimum=0, maximum=7, value=4, step=1)

                  # gr.Markdown("### 文本参数")
                  with gr.Row():
+                     min_length_input = gr.Number(label="文本分段长度", info="大于这个数值进行分段", value=120,
+                                                  precision=0)
+                     batch_size_input = gr.Number(label="批大小", info="同时处理的批次 越高越快 太高爆显存", value=5,
+                                                  precision=0)
                  with gr.Accordion("其他参数", open=False):
                      with gr.Row():
                          # temperature, top_P, top_K
+                         temperature_input = gr.Slider(label="温度", minimum=0.01, maximum=1.0, step=0.01, value=0.3)
+                         top_P_input = gr.Slider(label="top_P", minimum=0.1, maximum=0.9, step=0.05, value=0.7)
+                         top_K_input = gr.Slider(label="top_K", minimum=1, maximum=20, step=1, value=20)

                  # reset button
                  reset_button = gr.Button("重置")

                  with gr.Row():
+                     generate_button = gr.Button("生成音频", variant="primary")

                  with gr.Row():
                      output_audio = gr.Audio(label="生成的音频文件")

      generate_audio_seed.click(generate_seed,
                                inputs=[],
                                outputs=seed_input)

      # Reset button: restores temperature and related params
      reset_button.click(
          lambda: [0.3, 0.7, 20],

              temperature_input,
              top_P_input,
              top_K_input,
          ],
          outputs=[output_audio]
      )

  with gr.Tab("角色扮演"):
          def txt_2_script(text):
              lines = text.split("\n")

              characters = list([_["character"] for _ in lines])
              unique_characters = list(dict.fromkeys(characters))
              print([[character, 0] for character in unique_characters])
+             return [[character, 0] for character in unique_characters]


          def get_txt_characters(text):

              scripts = llm_operation(api_base, api_key, model, LLM_PROMPT, text, required_keys=["txt", "character"])
              return script_2_txt(scripts)

+         @spaces.GPU
          def generate_script_audio(text, models_seeds, progress=gr.Progress()):
              scripts = txt_2_script(text)  # Convert the text into a script
              characters = get_characters(scripts)  # Extract the characters from the script

              import itertools
              from tts_model import generate_audio_for_seed
              from utils import combine_audio, save_audio, normalize_zh
+             from config import DEFAULT_BATCH_SIZE, DEFAULT_SPEED, DEFAULT_TEMPERATURE, DEFAULT_TOP_K, DEFAULT_TOP_P

              assert isinstance(models_seeds, pd.DataFrame)

                      break
                  yield batch

+             models_seeds = models_seeds.to_dict(orient='records')

              # Check that every character has a corresponding seed
+             for character, _ in characters:
+                 if not any(seed['Character'] == character for seed in models_seeds):
+                     gr.Info(f"角色 {character} 没有种子,请先设置种子。")
+                     return None
+
+             # Map each character to its seed
+             character_seeds = {character: [seed['Seed'] for seed in models_seeds if seed['Character'] == character][0]
+                                for character, _ in characters}
+             # todo: could be made configurable, ideally per character
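After to_dict(orient='records'), each DataFrame row becomes a plain dict keyed by the column headers, so the lookups above operate on data shaped like the defaults defined further down:

    models_seeds = [
        {'Character': '旁白', 'Seed': 2222},
        {'Character': '年轻女性', 'Seed': 2},
        {'Character': '中年男性', 'Seed': 2424},
    ]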

              refine_text_prompt = "[oral_2][laugh_0][break_4]"
              all_wavs = []

              batch_size = 5  # batch size
              # Process per character
              for character, lines in progress.tqdm(grouped_lines.items(), desc="生成剧本音频"):
+                 seed = character_seeds.get(character, 0)

                  # Process in batches
                  for batch_lines in batch(lines, batch_size):
                      texts = [normalize_zh(line["txt"]) for line in batch_lines]
+                     print(f"seed={seed} t={texts} c={character}")
+                     wavs = generate_audio_for_seed(chat, int(seed), texts, DEFAULT_BATCH_SIZE, DEFAULT_SPEED,
+                                                    refine_text_prompt, DEFAULT_TEMPERATURE, DEFAULT_TOP_P,
                                                     DEFAULT_TOP_K, skip_save=True)  # batch-process the texts
                      batch_results[character].extend(wavs)

              # Combine all audio
              audio = combine_audio(all_wavs)
              fname = f"script_{int(time.time())}.wav"
+             save_audio(fname, audio)
+             return fname
 

      script_example = {

          "txt": "当小红帽到达奶奶家时,她发现大灰狼伪装成了奶奶。",
          "character": "旁白"
      }, {
+         "txt": "小红帽疑惑地问",
          "character": "旁白"
      }, {
          "txt": "奶奶,你的耳朵怎么这么尖?",
 
                                                 placeholder="请输入API Base URL",
                                                 value=r"https://api.openai.com/v1")
                  openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="请输入API Key",
+                                                   value="sk-xxxxxxx")
                  # AI prompt text
                  ai_text_input = gr.Textbox(label="剧情简介或者一段故事", placeholder="请输入文本...", lines=2,
                                             value=ai_text_default)

              with gr.Column(scale=3):
                  gr.Markdown("### 脚本")
                  gr.Markdown(
+                     "脚本可以手工编写也可以从右侧的AI脚本生成按钮生成。脚本格式 **角色::文本** 一行为一句” 注意是::")
                  script_text = "\n".join(
                      [f"{_.get('character', '')}::{_.get('txt', '')}" for _ in script_example['lines']])

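The script format is one utterance per line, with a full-width double colon separating character from text; the example script therefore serializes into lines such as:

    旁白::当小红帽到达奶奶家时,她发现大灰狼伪装成了奶奶。
    旁白::小红帽疑惑地问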
 
              with gr.Column(scale=1):
                  gr.Markdown("### 角色种子")
                  # DataFrame holding the converted script
+                 # Default data
                  default_data = [
+                     ["旁白", 2222],
+                     ["年轻女性", 2],
+                     ["中年男性", 2424]
                  ]

                  script_data = gr.DataFrame(
                      value=default_data,
                      label="角色对应的音色种子,从抽卡那获取",
+                     headers=["Character", "Seed"],
+                     datatype=["str", "number"],
                      interactive=True,
+                     col_count=(2, "fixed"),
                  )
                  # Button to generate the audio
                  script_generate_audio = gr.Button("步骤②:生成音频")

              outputs=[script_audio]
          )

+ demo.launch(share=args.share)