Spaces:
Sleeping
Sleeping
zhzluke96
commited on
Commit
·
ebc4336
1
Parent(s):
21473c0
update
Browse files- language/zh-CN.json +3 -2
- modules/api/impl/google_api.py +12 -6
- modules/api/impl/openai_api.py +5 -1
- modules/api/impl/refiner_api.py +5 -1
- modules/api/impl/ssml_api.py +15 -5
- modules/api/impl/tts_api.py +38 -1
- modules/generate_audio.py +5 -1
- modules/normalization.py +6 -0
- modules/ssml.py +0 -242
- modules/ssml_parser/SSMLParser.py +9 -3
- modules/utils/git.py +15 -0
- modules/utils/markdown.py +7 -0
- modules/webui/localization.py +9 -3
- modules/webui/speaker/speaker_editor.py +147 -0
- modules/webui/speaker_tab.py +3 -0
- modules/webui/ssml/podcast_tab.py +38 -17
- modules/webui/tts_tab.py +3 -1
- modules/webui/webui_utils.py +1 -3
- webui.py +1 -4
language/zh-CN.json
CHANGED
|
@@ -57,8 +57,8 @@
|
|
| 57 |
"🔊Generate speaker.pt": "🔊生成 speaker.pt",
|
| 58 |
"Save .pt file": "保存.pt文件",
|
| 59 |
"Save to File": "保存到文件",
|
| 60 |
-
"🎤Test voice": "
|
| 61 |
-
"Test Voice": "
|
| 62 |
"Current Seed": "当前种子",
|
| 63 |
"Output Audio": "输出音频",
|
| 64 |
"Merger": "融合",
|
|
@@ -79,6 +79,7 @@
|
|
| 79 |
"README": "README",
|
| 80 |
"readme": "readme",
|
| 81 |
"changelog": "changelog",
|
|
|
|
| 82 |
"TTS_STYLE_GUIDE": ["后缀为 _p 表示带prompt,效果更强但是影响质量"],
|
| 83 |
"SSML_SPLITER_GUIDE": [
|
| 84 |
"- 字数限制详见README,超过部分将截断",
|
|
|
|
| 57 |
"🔊Generate speaker.pt": "🔊生成 speaker.pt",
|
| 58 |
"Save .pt file": "保存.pt文件",
|
| 59 |
"Save to File": "保存到文件",
|
| 60 |
+
"🎤Test voice": "🎤试语",
|
| 61 |
+
"Test Voice": "试语",
|
| 62 |
"Current Seed": "当前种子",
|
| 63 |
"Output Audio": "输出音频",
|
| 64 |
"Merger": "融合",
|
|
|
|
| 79 |
"README": "README",
|
| 80 |
"readme": "readme",
|
| 81 |
"changelog": "changelog",
|
| 82 |
+
"💼Speaker file": "💼音色文件",
|
| 83 |
"TTS_STYLE_GUIDE": ["后缀为 _p 表示带prompt,效果更强但是影响质量"],
|
| 84 |
"SSML_SPLITER_GUIDE": [
|
| 85 |
"- 字数限制详见README,超过部分将截断",
|
modules/api/impl/google_api.py
CHANGED
|
@@ -14,7 +14,7 @@ from modules import generate_audio as generate
|
|
| 14 |
from modules.speaker import speaker_mgr
|
| 15 |
|
| 16 |
|
| 17 |
-
from modules.
|
| 18 |
from modules.SynthesizeSegments import (
|
| 19 |
SynthesizeSegments,
|
| 20 |
combine_audio_segments,
|
|
@@ -65,6 +65,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
| 65 |
audioConfig = request.audioConfig
|
| 66 |
|
| 67 |
# 提取参数
|
|
|
|
|
|
|
| 68 |
language_code = voice.languageCode
|
| 69 |
voice_name = voice.name
|
| 70 |
infer_seed = voice.seed or 42
|
|
@@ -86,9 +88,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
| 86 |
# TODO maybe need to change the sample rate
|
| 87 |
sample_rate = 24000
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
if spk is None:
|
| 92 |
raise HTTPException(
|
| 93 |
status_code=400, detail="The specified voice name is not supported."
|
| 94 |
)
|
|
@@ -120,7 +121,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
| 120 |
|
| 121 |
elif input.ssml:
|
| 122 |
# 处理SSML合成逻辑
|
| 123 |
-
|
|
|
|
| 124 |
for seg in segments:
|
| 125 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
| 126 |
|
|
@@ -171,7 +173,11 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
| 171 |
import logging
|
| 172 |
|
| 173 |
logging.exception(e)
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
|
| 177 |
def setup(app: APIManager):
|
|
|
|
| 14 |
from modules.speaker import speaker_mgr
|
| 15 |
|
| 16 |
|
| 17 |
+
from modules.ssml_parser.SSMLParser import create_ssml_parser
|
| 18 |
from modules.SynthesizeSegments import (
|
| 19 |
SynthesizeSegments,
|
| 20 |
combine_audio_segments,
|
|
|
|
| 65 |
audioConfig = request.audioConfig
|
| 66 |
|
| 67 |
# 提取参数
|
| 68 |
+
|
| 69 |
+
# TODO 这个也许应该传给 normalizer
|
| 70 |
language_code = voice.languageCode
|
| 71 |
voice_name = voice.name
|
| 72 |
infer_seed = voice.seed or 42
|
|
|
|
| 88 |
# TODO maybe need to change the sample rate
|
| 89 |
sample_rate = 24000
|
| 90 |
|
| 91 |
+
# 虽然 calc_spk_style 可以解析 seed 形式,但是这个接口只准备支持 speakers list 中存在的 speaker
|
| 92 |
+
if speaker_mgr.get_speaker(voice_name) is None:
|
|
|
|
| 93 |
raise HTTPException(
|
| 94 |
status_code=400, detail="The specified voice name is not supported."
|
| 95 |
)
|
|
|
|
| 121 |
|
| 122 |
elif input.ssml:
|
| 123 |
# 处理SSML合成逻辑
|
| 124 |
+
parser = create_ssml_parser()
|
| 125 |
+
segments = parser.parse(input.ssml)
|
| 126 |
for seg in segments:
|
| 127 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
| 128 |
|
|
|
|
| 173 |
import logging
|
| 174 |
|
| 175 |
logging.exception(e)
|
| 176 |
+
|
| 177 |
+
if isinstance(e, HTTPException):
|
| 178 |
+
raise e
|
| 179 |
+
else:
|
| 180 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 181 |
|
| 182 |
|
| 183 |
def setup(app: APIManager):
|
modules/api/impl/openai_api.py
CHANGED
|
@@ -115,7 +115,11 @@ async def openai_speech_api(
|
|
| 115 |
import logging
|
| 116 |
|
| 117 |
logging.exception(e)
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
|
| 121 |
class TranscribeSegment(BaseModel):
|
|
|
|
| 115 |
import logging
|
| 116 |
|
| 117 |
logging.exception(e)
|
| 118 |
+
|
| 119 |
+
if isinstance(e, HTTPException):
|
| 120 |
+
raise e
|
| 121 |
+
else:
|
| 122 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 123 |
|
| 124 |
|
| 125 |
class TranscribeSegment(BaseModel):
|
modules/api/impl/refiner_api.py
CHANGED
|
@@ -42,7 +42,11 @@ async def refiner_prompt_post(request: RefineTextRequest):
|
|
| 42 |
import logging
|
| 43 |
|
| 44 |
logging.exception(e)
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def setup(api_manager: APIManager):
|
|
|
|
| 42 |
import logging
|
| 43 |
|
| 44 |
logging.exception(e)
|
| 45 |
+
|
| 46 |
+
if isinstance(e, HTTPException):
|
| 47 |
+
raise e
|
| 48 |
+
else:
|
| 49 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 50 |
|
| 51 |
|
| 52 |
def setup(api_manager: APIManager):
|
modules/api/impl/ssml_api.py
CHANGED
|
@@ -7,7 +7,7 @@ from fastapi.responses import FileResponse
|
|
| 7 |
|
| 8 |
|
| 9 |
from modules.normalization import text_normalize
|
| 10 |
-
from modules.
|
| 11 |
from modules.SynthesizeSegments import (
|
| 12 |
SynthesizeSegments,
|
| 13 |
combine_audio_segments,
|
|
@@ -34,7 +34,7 @@ async def synthesize_ssml(
|
|
| 34 |
):
|
| 35 |
try:
|
| 36 |
ssml = request.ssml
|
| 37 |
-
format = request.format
|
| 38 |
batch_size = request.batch_size
|
| 39 |
|
| 40 |
if batch_size < 1:
|
|
@@ -42,10 +42,16 @@ async def synthesize_ssml(
|
|
| 42 |
status_code=400, detail="Batch size must be greater than 0."
|
| 43 |
)
|
| 44 |
|
| 45 |
-
if not ssml:
|
| 46 |
raise HTTPException(status_code=400, detail="SSML content is required.")
|
| 47 |
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
for seg in segments:
|
| 50 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
| 51 |
|
|
@@ -63,7 +69,11 @@ async def synthesize_ssml(
|
|
| 63 |
import logging
|
| 64 |
|
| 65 |
logging.exception(e)
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
def setup(api_manager: APIManager):
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
from modules.normalization import text_normalize
|
| 10 |
+
from modules.ssml_parser.SSMLParser import create_ssml_parser
|
| 11 |
from modules.SynthesizeSegments import (
|
| 12 |
SynthesizeSegments,
|
| 13 |
combine_audio_segments,
|
|
|
|
| 34 |
):
|
| 35 |
try:
|
| 36 |
ssml = request.ssml
|
| 37 |
+
format = request.format.lower()
|
| 38 |
batch_size = request.batch_size
|
| 39 |
|
| 40 |
if batch_size < 1:
|
|
|
|
| 42 |
status_code=400, detail="Batch size must be greater than 0."
|
| 43 |
)
|
| 44 |
|
| 45 |
+
if not ssml or ssml == "":
|
| 46 |
raise HTTPException(status_code=400, detail="SSML content is required.")
|
| 47 |
|
| 48 |
+
if format not in ["mp3", "wav"]:
|
| 49 |
+
raise HTTPException(
|
| 50 |
+
status_code=400, detail="Format must be 'mp3' or 'wav'."
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
parser = create_ssml_parser()
|
| 54 |
+
segments = parser.parse(ssml)
|
| 55 |
for seg in segments:
|
| 56 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
| 57 |
|
|
|
|
| 69 |
import logging
|
| 70 |
|
| 71 |
logging.exception(e)
|
| 72 |
+
|
| 73 |
+
if isinstance(e, HTTPException):
|
| 74 |
+
raise e
|
| 75 |
+
else:
|
| 76 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 77 |
|
| 78 |
|
| 79 |
def setup(api_manager: APIManager):
|
modules/api/impl/tts_api.py
CHANGED
|
@@ -44,6 +44,39 @@ class TTSParams(BaseModel):
|
|
| 44 |
|
| 45 |
async def synthesize_tts(params: TTSParams = Depends()):
|
| 46 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
text = text_normalize(params.text, is_end=False)
|
| 48 |
|
| 49 |
calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)
|
|
@@ -87,7 +120,11 @@ async def synthesize_tts(params: TTSParams = Depends()):
|
|
| 87 |
import logging
|
| 88 |
|
| 89 |
logging.exception(e)
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
def setup(api_manager: APIManager):
|
|
|
|
| 44 |
|
| 45 |
async def synthesize_tts(params: TTSParams = Depends()):
|
| 46 |
try:
|
| 47 |
+
# Validate text
|
| 48 |
+
if not params.text.strip():
|
| 49 |
+
raise HTTPException(
|
| 50 |
+
status_code=422, detail="Text parameter cannot be empty"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# Validate temperature
|
| 54 |
+
if not (0 <= params.temperature <= 1):
|
| 55 |
+
raise HTTPException(
|
| 56 |
+
status_code=422, detail="Temperature must be between 0 and 1"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Validate top_P
|
| 60 |
+
if not (0 <= params.top_P <= 1):
|
| 61 |
+
raise HTTPException(status_code=422, detail="top_P must be between 0 and 1")
|
| 62 |
+
|
| 63 |
+
# Validate top_K
|
| 64 |
+
if params.top_K <= 0:
|
| 65 |
+
raise HTTPException(
|
| 66 |
+
status_code=422, detail="top_K must be a positive integer"
|
| 67 |
+
)
|
| 68 |
+
if params.top_K > 100:
|
| 69 |
+
raise HTTPException(
|
| 70 |
+
status_code=422, detail="top_K must be less than or equal to 100"
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# Validate format
|
| 74 |
+
if params.format not in ["mp3", "wav"]:
|
| 75 |
+
raise HTTPException(
|
| 76 |
+
status_code=422,
|
| 77 |
+
detail="Invalid format. Supported formats are mp3 and wav",
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
text = text_normalize(params.text, is_end=False)
|
| 81 |
|
| 82 |
calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)
|
|
|
|
| 120 |
import logging
|
| 121 |
|
| 122 |
logging.exception(e)
|
| 123 |
+
|
| 124 |
+
if isinstance(e, HTTPException):
|
| 125 |
+
raise e
|
| 126 |
+
else:
|
| 127 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 128 |
|
| 129 |
|
| 130 |
def setup(api_manager: APIManager):
|
modules/generate_audio.py
CHANGED
|
@@ -79,7 +79,11 @@ def generate_audio_batch(
|
|
| 79 |
params_infer_code["spk_emb"] = spk.emb
|
| 80 |
logger.info(("spk", spk.name))
|
| 81 |
else:
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
logger.info(
|
| 85 |
{
|
|
|
|
| 79 |
params_infer_code["spk_emb"] = spk.emb
|
| 80 |
logger.info(("spk", spk.name))
|
| 81 |
else:
|
| 82 |
+
logger.warn(
|
| 83 |
+
f"spk must be int or Speaker, but: <{type(spk)}> {spk}, wiil set to default voice"
|
| 84 |
+
)
|
| 85 |
+
with SeedContext(2, True):
|
| 86 |
+
params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
|
| 87 |
|
| 88 |
logger.info(
|
| 89 |
{
|
modules/normalization.py
CHANGED
|
@@ -5,6 +5,10 @@ from modules.utils.markdown import markdown_to_text
|
|
| 5 |
from modules import models
|
| 6 |
import re
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
@lru_cache(maxsize=64)
|
| 10 |
def is_chinese(text):
|
|
@@ -159,6 +163,8 @@ def replace_unk_tokens(text):
|
|
| 159 |
"""
|
| 160 |
把不在字典里的字符替换为 " , "
|
| 161 |
"""
|
|
|
|
|
|
|
| 162 |
chat_tts = models.load_chat_tts()
|
| 163 |
if "tokenizer" not in chat_tts.pretrain_models:
|
| 164 |
# 这个地方只有在 huggingface spaces 中才会触发
|
|
|
|
| 5 |
from modules import models
|
| 6 |
import re
|
| 7 |
|
| 8 |
+
# 是否关闭 unk token 检查
|
| 9 |
+
# NOTE: 单测的时候用于跳过模型加载
|
| 10 |
+
DISABLE_UNK_TOKEN_CHECK = False
|
| 11 |
+
|
| 12 |
|
| 13 |
@lru_cache(maxsize=64)
|
| 14 |
def is_chinese(text):
|
|
|
|
| 163 |
"""
|
| 164 |
把不在字典里的字符替换为 " , "
|
| 165 |
"""
|
| 166 |
+
if DISABLE_UNK_TOKEN_CHECK:
|
| 167 |
+
return text
|
| 168 |
chat_tts = models.load_chat_tts()
|
| 169 |
if "tokenizer" not in chat_tts.pretrain_models:
|
| 170 |
# 这个地方只有在 huggingface spaces 中才会触发
|
modules/ssml.py
CHANGED
|
@@ -66,245 +66,3 @@ def apply_random_seed(attrs: dict):
|
|
| 66 |
seed = random.randint(0, 2**32 - 1)
|
| 67 |
attrs["seed"] = seed
|
| 68 |
logger.info(f"random seed: {seed}")
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
class NotSupportSSML(Exception):
|
| 72 |
-
pass
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def parse_ssml(ssml: str) -> List[Dict[str, Any]]:
|
| 76 |
-
root = etree.fromstring(ssml)
|
| 77 |
-
|
| 78 |
-
ssml_version = root.get("version", "NONE")
|
| 79 |
-
if ssml_version != "0.1":
|
| 80 |
-
raise NotSupportSSML("Unsupported ssml version: {ssml_version}")
|
| 81 |
-
|
| 82 |
-
segments = []
|
| 83 |
-
|
| 84 |
-
for voice in root.findall(".//voice"):
|
| 85 |
-
voice_attrs = {
|
| 86 |
-
"spk": voice.get("spk"),
|
| 87 |
-
"style": voice.get("style"),
|
| 88 |
-
"seed": voice.get("seed"),
|
| 89 |
-
"top_p": voice.get("top_p"),
|
| 90 |
-
"top_k": voice.get("top_k"),
|
| 91 |
-
"temp": voice.get("temp"),
|
| 92 |
-
"prompt1": voice.get("prompt1"),
|
| 93 |
-
"prompt2": voice.get("prompt2"),
|
| 94 |
-
"prefix": voice.get("prefix"),
|
| 95 |
-
"normalize": voice.get("normalize"),
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
voice_attrs = {k: v for k, v in voice_attrs.items() if v is not None}
|
| 99 |
-
|
| 100 |
-
expand_spk(voice_attrs)
|
| 101 |
-
expand_style(voice_attrs)
|
| 102 |
-
|
| 103 |
-
merge_prompt(voice_attrs, voice)
|
| 104 |
-
apply_random_seed(voice_attrs)
|
| 105 |
-
|
| 106 |
-
voice_segments = []
|
| 107 |
-
|
| 108 |
-
if voice_attrs.get("temp", "") == "min":
|
| 109 |
-
# ref: https://github.com/2noise/ChatTTS/issues/123#issue-2326908144
|
| 110 |
-
voice_attrs["temp"] = 0.000000000001
|
| 111 |
-
if voice_attrs.get("temp", "") == "max":
|
| 112 |
-
voice_attrs["temp"] = 1
|
| 113 |
-
|
| 114 |
-
# 处理 voice 开头的文本
|
| 115 |
-
if voice.text and voice.text.strip():
|
| 116 |
-
voice_segments.append(
|
| 117 |
-
{"text": voice.text.strip(), "attrs": voice_attrs.copy()}
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
# 处理 voice 内部的文本和 prosody 元素
|
| 121 |
-
for node in voice.iterchildren():
|
| 122 |
-
if node.tag == "prosody":
|
| 123 |
-
prosody_attrs = voice_attrs.copy()
|
| 124 |
-
new_attrs = {
|
| 125 |
-
"rate": node.get("rate"),
|
| 126 |
-
"volume": node.get("volume"),
|
| 127 |
-
"pitch": node.get("pitch"),
|
| 128 |
-
}
|
| 129 |
-
prosody_attrs.update(
|
| 130 |
-
{k: v for k, v in new_attrs.items() if v is not None}
|
| 131 |
-
)
|
| 132 |
-
expand_style(prosody_attrs)
|
| 133 |
-
merge_prompt(prosody_attrs, node)
|
| 134 |
-
apply_random_seed(voice_attrs)
|
| 135 |
-
|
| 136 |
-
if node.text and node.text.strip():
|
| 137 |
-
voice_segments.append(
|
| 138 |
-
{"text": node.text.strip(), "attrs": prosody_attrs}
|
| 139 |
-
)
|
| 140 |
-
elif node.tag == "break":
|
| 141 |
-
time_ms = int(node.get("time", "0").replace("ms", ""))
|
| 142 |
-
segment = {"break": time_ms}
|
| 143 |
-
voice_segments.append(segment)
|
| 144 |
-
|
| 145 |
-
if node.tail and node.tail.strip():
|
| 146 |
-
voice_segments.append(
|
| 147 |
-
{"text": node.tail.strip(), "attrs": voice_attrs.copy()}
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
end_segment = voice_segments[-1]
|
| 151 |
-
end_segment["is_end"] = True
|
| 152 |
-
|
| 153 |
-
segments = segments + voice_segments
|
| 154 |
-
|
| 155 |
-
logger.info(f"collect len(segments): {len(segments)}")
|
| 156 |
-
# logger.info(f"segments: {json.dumps(segments, ensure_ascii=False)}")
|
| 157 |
-
|
| 158 |
-
return segments
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
if __name__ == "__main__":
|
| 162 |
-
# 示例 SSML 输入
|
| 163 |
-
ssml1 = """
|
| 164 |
-
<speak version="0.1">
|
| 165 |
-
<voice spk="20398768" seed="42" temp="min" top_p="0.9" top_k="20">
|
| 166 |
-
电影中梁朝伟扮演的陈永仁的
|
| 167 |
-
<prosody volume="5">
|
| 168 |
-
编号27149
|
| 169 |
-
</prosody>
|
| 170 |
-
<prosody rate="2">
|
| 171 |
-
编号27149
|
| 172 |
-
</prosody>
|
| 173 |
-
<prosody pitch="-12">
|
| 174 |
-
编号27149
|
| 175 |
-
</prosody>
|
| 176 |
-
<prosody pitch="12">
|
| 177 |
-
编号27149
|
| 178 |
-
</prosody>
|
| 179 |
-
</voice>
|
| 180 |
-
<voice spk="20398768" seed="42" speed="9">
|
| 181 |
-
编号27149
|
| 182 |
-
</voice>
|
| 183 |
-
<voice spk="20398768" seed="42">
|
| 184 |
-
电影中梁朝伟扮演的陈永仁的编号27149
|
| 185 |
-
</voice>
|
| 186 |
-
</speak>
|
| 187 |
-
"""
|
| 188 |
-
|
| 189 |
-
ssml2 = """
|
| 190 |
-
<speak version="0.1">
|
| 191 |
-
<voice spk="Bob">
|
| 192 |
-
也可以合成多角色多情感的有声 [uv_break] 书 [uv_break] ,例如:
|
| 193 |
-
</voice>
|
| 194 |
-
<voice spk="Bob">
|
| 195 |
-
黛玉冷笑道:
|
| 196 |
-
</voice>
|
| 197 |
-
<voice spk="female2">
|
| 198 |
-
我说呢,亏了绊住,不然,早就飞了来了。
|
| 199 |
-
</voice>
|
| 200 |
-
<voice spk="Bob" speed="0">
|
| 201 |
-
宝玉道:
|
| 202 |
-
</voice>
|
| 203 |
-
<voice spk="Alice">
|
| 204 |
-
“只许和你玩,替你解闷。不过偶然到他那里,就说这些闲话。”
|
| 205 |
-
</voice>
|
| 206 |
-
<voice spk="female2">
|
| 207 |
-
“好没意思的话!去不去,关我什么事儿?又没叫你替我解闷儿,还许你不理我呢”
|
| 208 |
-
</voice>
|
| 209 |
-
<voice spk="Bob">
|
| 210 |
-
说着,便赌气回房去了。
|
| 211 |
-
</voice>
|
| 212 |
-
</speak>
|
| 213 |
-
"""
|
| 214 |
-
ssml22 = """
|
| 215 |
-
<speak version="0.1">
|
| 216 |
-
<voice spk="Bob" style="narration-relaxed">
|
| 217 |
-
下面是一个 ChatTTS 用于合成多角色多情感的有声书示例
|
| 218 |
-
</voice>
|
| 219 |
-
<voice spk="Bob" style="narration-relaxed">
|
| 220 |
-
黛玉冷笑道:
|
| 221 |
-
</voice>
|
| 222 |
-
<voice spk="female2" style="angry">
|
| 223 |
-
我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了。
|
| 224 |
-
</voice>
|
| 225 |
-
<voice spk="Bob" style="narration-relaxed">
|
| 226 |
-
宝玉道:
|
| 227 |
-
</voice>
|
| 228 |
-
<voice spk="Alice" style="unfriendly">
|
| 229 |
-
“只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”
|
| 230 |
-
</voice>
|
| 231 |
-
<voice spk="female2" style="angry">
|
| 232 |
-
“好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢”
|
| 233 |
-
</voice>
|
| 234 |
-
<voice spk="Bob" style="narration-relaxed">
|
| 235 |
-
说着,便赌气回房去了。
|
| 236 |
-
</voice>
|
| 237 |
-
</speak>
|
| 238 |
-
"""
|
| 239 |
-
|
| 240 |
-
ssml3 = """
|
| 241 |
-
<speak version="0.1">
|
| 242 |
-
<voice spk="Bob" style="angry">
|
| 243 |
-
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
|
| 244 |
-
</voice>
|
| 245 |
-
<voice spk="Bob" style="assistant">
|
| 246 |
-
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
|
| 247 |
-
</voice>
|
| 248 |
-
<voice spk="Bob" style="gentle">
|
| 249 |
-
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
|
| 250 |
-
</voice>
|
| 251 |
-
</speak>
|
| 252 |
-
"""
|
| 253 |
-
|
| 254 |
-
ssml4 = """
|
| 255 |
-
<speak version="0.1">
|
| 256 |
-
<voice spk="Bob" style="narration-relaxed">
|
| 257 |
-
使用 prosody 控制生成文本的语速语调和音量,示例如下
|
| 258 |
-
|
| 259 |
-
<prosody>
|
| 260 |
-
无任何限制将会继承父级voice配置进行生成
|
| 261 |
-
</prosody>
|
| 262 |
-
<prosody rate="1.5">
|
| 263 |
-
设置 rate 大于1表示加速,小于1为减速
|
| 264 |
-
</prosody>
|
| 265 |
-
<prosody pitch="6">
|
| 266 |
-
设置 pitch 调整音调,设置为6表示提高6个半音
|
| 267 |
-
</prosody>
|
| 268 |
-
<prosody volume="2">
|
| 269 |
-
设置 volume 调整音量,设置为2表示提高2个分贝
|
| 270 |
-
</prosody>
|
| 271 |
-
|
| 272 |
-
在 voice 中无prosody包裹的文本即为默认生成状态下的语音
|
| 273 |
-
</voice>
|
| 274 |
-
</speak>
|
| 275 |
-
"""
|
| 276 |
-
|
| 277 |
-
ssml5 = """
|
| 278 |
-
<speak version="0.1">
|
| 279 |
-
<voice spk="Bob" style="narration-relaxed">
|
| 280 |
-
使用 break 标签将会简单的
|
| 281 |
-
|
| 282 |
-
<break time="500" />
|
| 283 |
-
|
| 284 |
-
插入一段空白到生成结果中
|
| 285 |
-
</voice>
|
| 286 |
-
</speak>
|
| 287 |
-
"""
|
| 288 |
-
|
| 289 |
-
ssml6 = """
|
| 290 |
-
<speak version="0.1">
|
| 291 |
-
<voice spk="Bob" style="excited">
|
| 292 |
-
temperature for sampling (may be overridden by style or speaker)
|
| 293 |
-
<break time="500" />
|
| 294 |
-
温度值用于采样,这个值有可能被 style 或者 speaker 覆盖
|
| 295 |
-
<break time="500" />
|
| 296 |
-
temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖
|
| 297 |
-
<break time="500" />
|
| 298 |
-
温度值用于采样,(may be overridden by style or speaker)
|
| 299 |
-
</voice>
|
| 300 |
-
</speak>
|
| 301 |
-
"""
|
| 302 |
-
|
| 303 |
-
segments = parse_ssml(ssml6)
|
| 304 |
-
|
| 305 |
-
print(segments)
|
| 306 |
-
|
| 307 |
-
# audio_segments = synthesize_segments(segments)
|
| 308 |
-
# combined_audio = combine_audio_segments(audio_segments)
|
| 309 |
-
|
| 310 |
-
# combined_audio.export("output.wav", format="wav")
|
|
|
|
| 66 |
seed = random.randint(0, 2**32 - 1)
|
| 67 |
attrs["seed"] = seed
|
| 68 |
logger.info(f"random seed: {seed}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/ssml_parser/SSMLParser.py
CHANGED
|
@@ -29,6 +29,12 @@ class SSMLContext(Box):
|
|
| 29 |
self.prompt2 = None
|
| 30 |
self.prefix = None
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
class SSMLSegment(Box):
|
| 34 |
def __init__(self, text: str, attrs=SSMLContext()):
|
|
@@ -84,7 +90,7 @@ def create_ssml_parser():
|
|
| 84 |
|
| 85 |
@parser.resolver("speak")
|
| 86 |
def tag_speak(element, context, segments, parser):
|
| 87 |
-
ctx =
|
| 88 |
|
| 89 |
version = element.get("version")
|
| 90 |
if version != "0.1":
|
|
@@ -95,7 +101,7 @@ def create_ssml_parser():
|
|
| 95 |
|
| 96 |
@parser.resolver("voice")
|
| 97 |
def tag_voice(element, context, segments, parser):
|
| 98 |
-
ctx =
|
| 99 |
|
| 100 |
ctx.spk = element.get("spk", ctx.spk)
|
| 101 |
ctx.style = element.get("style", ctx.style)
|
|
@@ -131,7 +137,7 @@ def create_ssml_parser():
|
|
| 131 |
|
| 132 |
@parser.resolver("prosody")
|
| 133 |
def tag_prosody(element, context, segments, parser):
|
| 134 |
-
ctx =
|
| 135 |
|
| 136 |
ctx.spk = element.get("spk", ctx.spk)
|
| 137 |
ctx.style = element.get("style", ctx.style)
|
|
|
|
| 29 |
self.prompt2 = None
|
| 30 |
self.prefix = None
|
| 31 |
|
| 32 |
+
def clone(self):
|
| 33 |
+
ctx = SSMLContext()
|
| 34 |
+
for k, v in self.items():
|
| 35 |
+
ctx[k] = v
|
| 36 |
+
return ctx
|
| 37 |
+
|
| 38 |
|
| 39 |
class SSMLSegment(Box):
|
| 40 |
def __init__(self, text: str, attrs=SSMLContext()):
|
|
|
|
| 90 |
|
| 91 |
@parser.resolver("speak")
|
| 92 |
def tag_speak(element, context, segments, parser):
|
| 93 |
+
ctx = context.clone() if context is not None else SSMLContext()
|
| 94 |
|
| 95 |
version = element.get("version")
|
| 96 |
if version != "0.1":
|
|
|
|
| 101 |
|
| 102 |
@parser.resolver("voice")
|
| 103 |
def tag_voice(element, context, segments, parser):
|
| 104 |
+
ctx = context.clone() if context is not None else SSMLContext()
|
| 105 |
|
| 106 |
ctx.spk = element.get("spk", ctx.spk)
|
| 107 |
ctx.style = element.get("style", ctx.style)
|
|
|
|
| 137 |
|
| 138 |
@parser.resolver("prosody")
|
| 139 |
def tag_prosody(element, context, segments, parser):
|
| 140 |
+
ctx = context.clone() if context is not None else SSMLContext()
|
| 141 |
|
| 142 |
ctx.spk = element.get("spk", ctx.spk)
|
| 143 |
ctx.style = element.get("style", ctx.style)
|
modules/utils/git.py
CHANGED
|
@@ -2,14 +2,25 @@ from functools import lru_cache
|
|
| 2 |
import os
|
| 3 |
import subprocess
|
| 4 |
|
|
|
|
| 5 |
from modules.utils import constants
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
git = os.environ.get("GIT", "git")
|
| 8 |
|
|
|
|
|
|
|
| 9 |
|
| 10 |
@lru_cache()
|
| 11 |
def commit_hash():
|
| 12 |
try:
|
|
|
|
|
|
|
| 13 |
return subprocess.check_output(
|
| 14 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "HEAD"],
|
| 15 |
shell=False,
|
|
@@ -22,6 +33,8 @@ def commit_hash():
|
|
| 22 |
@lru_cache()
|
| 23 |
def git_tag():
|
| 24 |
try:
|
|
|
|
|
|
|
| 25 |
return subprocess.check_output(
|
| 26 |
[git, "-C", constants.ROOT_DIR, "describe", "--tags"],
|
| 27 |
shell=False,
|
|
@@ -44,6 +57,8 @@ def git_tag():
|
|
| 44 |
@lru_cache()
|
| 45 |
def branch_name():
|
| 46 |
try:
|
|
|
|
|
|
|
| 47 |
return subprocess.check_output(
|
| 48 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "--abbrev-ref", "HEAD"],
|
| 49 |
shell=False,
|
|
|
|
| 2 |
import os
|
| 3 |
import subprocess
|
| 4 |
|
| 5 |
+
|
| 6 |
from modules.utils import constants
|
| 7 |
|
| 8 |
+
# 用于判断是否在hf spaces
|
| 9 |
+
try:
|
| 10 |
+
import spaces
|
| 11 |
+
except:
|
| 12 |
+
spaces = None
|
| 13 |
+
|
| 14 |
git = os.environ.get("GIT", "git")
|
| 15 |
|
| 16 |
+
in_hf_spaces = spaces is not None
|
| 17 |
+
|
| 18 |
|
| 19 |
@lru_cache()
|
| 20 |
def commit_hash():
|
| 21 |
try:
|
| 22 |
+
if in_hf_spaces:
|
| 23 |
+
return "<hf>"
|
| 24 |
return subprocess.check_output(
|
| 25 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "HEAD"],
|
| 26 |
shell=False,
|
|
|
|
| 33 |
@lru_cache()
|
| 34 |
def git_tag():
|
| 35 |
try:
|
| 36 |
+
if in_hf_spaces:
|
| 37 |
+
return "<hf>"
|
| 38 |
return subprocess.check_output(
|
| 39 |
[git, "-C", constants.ROOT_DIR, "describe", "--tags"],
|
| 40 |
shell=False,
|
|
|
|
| 57 |
@lru_cache()
|
| 58 |
def branch_name():
|
| 59 |
try:
|
| 60 |
+
if in_hf_spaces:
|
| 61 |
+
return "<hf>"
|
| 62 |
return subprocess.check_output(
|
| 63 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "--abbrev-ref", "HEAD"],
|
| 64 |
shell=False,
|
modules/utils/markdown.py
CHANGED
|
@@ -46,6 +46,10 @@ class PlainTextRenderer(mistune.HTMLRenderer):
|
|
| 46 |
# remove code
|
| 47 |
return ""
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def markdown_to_text(markdown_text):
|
| 51 |
renderer = PlainTextRenderer()
|
|
@@ -69,6 +73,9 @@ console.log(1)
|
|
| 69 |
- 列表项 2
|
| 70 |
- 列表项 3
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
> 这是一个引用。
|
| 73 |
|
| 74 |
`代码片段`
|
|
|
|
| 46 |
# remove code
|
| 47 |
return ""
|
| 48 |
|
| 49 |
+
def thematic_break(self) -> str:
|
| 50 |
+
# remove break
|
| 51 |
+
return "\n"
|
| 52 |
+
|
| 53 |
|
| 54 |
def markdown_to_text(markdown_text):
|
| 55 |
renderer = PlainTextRenderer()
|
|
|
|
| 73 |
- 列表项 2
|
| 74 |
- 列表项 3
|
| 75 |
|
| 76 |
+
1. 第一
|
| 77 |
+
2. 第二
|
| 78 |
+
|
| 79 |
> 这是一个引用。
|
| 80 |
|
| 81 |
`代码片段`
|
modules/webui/localization.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
|
|
|
|
| 5 |
|
| 6 |
current_translation = {}
|
| 7 |
localization_root = os.path.join(
|
|
@@ -24,11 +26,15 @@ def localization_js(filename):
|
|
| 24 |
assert isinstance(v, str) or isinstance(
|
| 25 |
v, list
|
| 26 |
), f"Value for key {k} is not a string or list"
|
|
|
|
|
|
|
| 27 |
except Exception as e:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
else:
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# current_translation = {k: 'XXX' for k in current_translation.keys()} # use this to see if all texts are covered
|
| 34 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
| 4 |
+
import logging
|
| 5 |
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
|
| 8 |
current_translation = {}
|
| 9 |
localization_root = os.path.join(
|
|
|
|
| 26 |
assert isinstance(v, str) or isinstance(
|
| 27 |
v, list
|
| 28 |
), f"Value for key {k} is not a string or list"
|
| 29 |
+
|
| 30 |
+
logger.info(f"Loaded localization file {full_name}")
|
| 31 |
except Exception as e:
|
| 32 |
+
logger.warning(str(e))
|
| 33 |
+
logger.warning(f"Failed to load localization file {full_name}")
|
| 34 |
else:
|
| 35 |
+
logger.warning(f"Localization file {full_name} does not exist")
|
| 36 |
+
else:
|
| 37 |
+
logger.warning(f"Localization file {filename} is not a string")
|
| 38 |
|
| 39 |
# current_translation = {k: 'XXX' for k in current_translation.keys()} # use this to see if all texts are covered
|
| 40 |
|
modules/webui/speaker/speaker_editor.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import torch
|
| 3 |
+
from modules.speaker import Speaker
|
| 4 |
+
from modules.hf import spaces
|
| 5 |
+
from modules.webui import webui_config
|
| 6 |
+
from modules.webui.webui_utils import tts_generate
|
| 7 |
+
|
| 8 |
+
import tempfile
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@torch.inference_mode()
|
| 12 |
+
@spaces.GPU
|
| 13 |
+
def test_spk_voice(spk_file, text: str):
|
| 14 |
+
if spk_file == "" or spk_file is None:
|
| 15 |
+
return None
|
| 16 |
+
spk = Speaker.from_file(spk_file)
|
| 17 |
+
return tts_generate(
|
| 18 |
+
spk=spk,
|
| 19 |
+
text=text,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def speaker_editor_ui():
|
| 24 |
+
def on_generate(spk_file, name, gender, desc):
|
| 25 |
+
spk: Speaker = Speaker.from_file(spk_file)
|
| 26 |
+
spk.name = name
|
| 27 |
+
spk.gender = gender
|
| 28 |
+
spk.desc = desc
|
| 29 |
+
|
| 30 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pt") as tmp_file:
|
| 31 |
+
torch.save(spk, tmp_file)
|
| 32 |
+
tmp_file_path = tmp_file.name
|
| 33 |
+
|
| 34 |
+
return tmp_file_path
|
| 35 |
+
|
| 36 |
+
def create_test_voice_card(spk_file):
|
| 37 |
+
with gr.Group():
|
| 38 |
+
gr.Markdown("🎤Test voice")
|
| 39 |
+
with gr.Row():
|
| 40 |
+
test_voice_btn = gr.Button(
|
| 41 |
+
"Test Voice", variant="secondary", interactive=False
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
with gr.Column(scale=4):
|
| 45 |
+
test_text = gr.Textbox(
|
| 46 |
+
label="Test Text",
|
| 47 |
+
placeholder="Please input test text",
|
| 48 |
+
value=webui_config.localization.DEFAULT_SPEAKER_TEST_TEXT,
|
| 49 |
+
)
|
| 50 |
+
with gr.Row():
|
| 51 |
+
with gr.Column(scale=4):
|
| 52 |
+
output_audio = gr.Audio(label="Output Audio", format="mp3")
|
| 53 |
+
|
| 54 |
+
test_voice_btn.click(
|
| 55 |
+
fn=test_spk_voice,
|
| 56 |
+
inputs=[spk_file, test_text],
|
| 57 |
+
outputs=[output_audio],
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
return test_voice_btn
|
| 61 |
+
|
| 62 |
+
has_file = gr.State(False)
|
| 63 |
+
|
| 64 |
+
# TODO 也许需要写个说明?
|
| 65 |
+
# gr.Markdown("SPEAKER_CREATOR_GUIDE")
|
| 66 |
+
|
| 67 |
+
with gr.Row():
|
| 68 |
+
with gr.Column(scale=2):
|
| 69 |
+
with gr.Group():
|
| 70 |
+
gr.Markdown("💼Speaker file")
|
| 71 |
+
spk_file = gr.File(label="*.pt file", file_types=[".pt"])
|
| 72 |
+
|
| 73 |
+
with gr.Group():
|
| 74 |
+
gr.Markdown("ℹ️Speaker info")
|
| 75 |
+
name_input = gr.Textbox(
|
| 76 |
+
label="Name",
|
| 77 |
+
placeholder="Enter speaker name",
|
| 78 |
+
value="*",
|
| 79 |
+
interactive=False,
|
| 80 |
+
)
|
| 81 |
+
gender_input = gr.Textbox(
|
| 82 |
+
label="Gender",
|
| 83 |
+
placeholder="Enter gender",
|
| 84 |
+
value="*",
|
| 85 |
+
interactive=False,
|
| 86 |
+
)
|
| 87 |
+
desc_input = gr.Textbox(
|
| 88 |
+
label="Description",
|
| 89 |
+
placeholder="Enter description",
|
| 90 |
+
value="*",
|
| 91 |
+
interactive=False,
|
| 92 |
+
)
|
| 93 |
+
with gr.Group():
|
| 94 |
+
gr.Markdown("🔊Generate speaker.pt")
|
| 95 |
+
generate_button = gr.Button("Save .pt file", interactive=False)
|
| 96 |
+
output_file = gr.File(label="Save to File")
|
| 97 |
+
with gr.Column(scale=5):
|
| 98 |
+
btn1 = create_test_voice_card(spk_file=spk_file)
|
| 99 |
+
btn2 = create_test_voice_card(spk_file=spk_file)
|
| 100 |
+
btn3 = create_test_voice_card(spk_file=spk_file)
|
| 101 |
+
btn4 = create_test_voice_card(spk_file=spk_file)
|
| 102 |
+
|
| 103 |
+
generate_button.click(
|
| 104 |
+
fn=on_generate,
|
| 105 |
+
inputs=[spk_file, name_input, gender_input, desc_input],
|
| 106 |
+
outputs=[output_file],
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
def spk_file_change(spk_file):
|
| 110 |
+
empty = spk_file is None or spk_file == ""
|
| 111 |
+
if empty:
|
| 112 |
+
return [
|
| 113 |
+
gr.Textbox(value="*", interactive=False),
|
| 114 |
+
gr.Textbox(value="*", interactive=False),
|
| 115 |
+
gr.Textbox(value="*", interactive=False),
|
| 116 |
+
gr.Button(interactive=False),
|
| 117 |
+
gr.Button(interactive=False),
|
| 118 |
+
gr.Button(interactive=False),
|
| 119 |
+
gr.Button(interactive=False),
|
| 120 |
+
gr.Button(interactive=False),
|
| 121 |
+
]
|
| 122 |
+
spk: Speaker = Speaker.from_file(spk_file)
|
| 123 |
+
return [
|
| 124 |
+
gr.Textbox(value=spk.name, interactive=True),
|
| 125 |
+
gr.Textbox(value=spk.gender, interactive=True),
|
| 126 |
+
gr.Textbox(value=spk.describe, interactive=True),
|
| 127 |
+
gr.Button(interactive=True),
|
| 128 |
+
gr.Button(interactive=True),
|
| 129 |
+
gr.Button(interactive=True),
|
| 130 |
+
gr.Button(interactive=True),
|
| 131 |
+
gr.Button(interactive=True),
|
| 132 |
+
]
|
| 133 |
+
|
| 134 |
+
spk_file.change(
|
| 135 |
+
fn=spk_file_change,
|
| 136 |
+
inputs=[spk_file],
|
| 137 |
+
outputs=[
|
| 138 |
+
name_input,
|
| 139 |
+
gender_input,
|
| 140 |
+
desc_input,
|
| 141 |
+
generate_button,
|
| 142 |
+
btn1,
|
| 143 |
+
btn2,
|
| 144 |
+
btn3,
|
| 145 |
+
btn4,
|
| 146 |
+
],
|
| 147 |
+
)
|
modules/webui/speaker_tab.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
|
|
|
| 3 |
from modules.webui.speaker.speaker_merger import create_speaker_merger
|
| 4 |
from modules.webui.speaker.speaker_creator import speaker_creator_ui
|
| 5 |
|
|
@@ -7,6 +8,8 @@ from modules.webui.speaker.speaker_creator import speaker_creator_ui
|
|
| 7 |
def create_speaker_panel():
|
| 8 |
|
| 9 |
with gr.Tabs():
|
|
|
|
|
|
|
| 10 |
with gr.TabItem("Creator"):
|
| 11 |
speaker_creator_ui()
|
| 12 |
with gr.TabItem("Merger"):
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
+
from modules.webui.speaker.speaker_editor import speaker_editor_ui
|
| 4 |
from modules.webui.speaker.speaker_merger import create_speaker_merger
|
| 5 |
from modules.webui.speaker.speaker_creator import speaker_creator_ui
|
| 6 |
|
|
|
|
| 8 |
def create_speaker_panel():
|
| 9 |
|
| 10 |
with gr.Tabs():
|
| 11 |
+
with gr.Tab("Editor"):
|
| 12 |
+
speaker_editor_ui()
|
| 13 |
with gr.TabItem("Creator"):
|
| 14 |
speaker_creator_ui()
|
| 15 |
with gr.TabItem("Merger"):
|
modules/webui/ssml/podcast_tab.py
CHANGED
|
@@ -7,45 +7,65 @@ from modules.webui import webui_utils
|
|
| 7 |
from modules.hf import spaces
|
| 8 |
|
| 9 |
podcast_default_case = [
|
| 10 |
-
[
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
[
|
| 13 |
3,
|
| 14 |
"Bob",
|
| 15 |
-
"没错,中华料理有着几千年的历史,而且每个地区都有自己的特色菜。",
|
| 16 |
-
"
|
| 17 |
],
|
| 18 |
[
|
| 19 |
4,
|
| 20 |
"female2",
|
| 21 |
-
"那我们先从最有名的川菜开始吧。川菜以其麻辣著称,是很多人的最爱。",
|
| 22 |
-
"
|
| 23 |
],
|
| 24 |
[
|
| 25 |
5,
|
| 26 |
"Alice",
|
| 27 |
-
"对,我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。",
|
| 28 |
-
"
|
| 29 |
],
|
| 30 |
[
|
| 31 |
6,
|
| 32 |
"Bob",
|
| 33 |
-
"除了川菜,粤菜也是很受欢迎的。粤菜讲究鲜美,像是白切鸡和蒸鱼都是经典。",
|
| 34 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
],
|
| 36 |
-
[7, "female2", "对啊,粤菜的烹饪方式比较清淡,更注重食材本身的味道。", "chat"],
|
| 37 |
-
[8, "Alice", "还有北京的京菜,像北京烤鸭,那可是来北京必吃的美食。", "chat"],
|
| 38 |
[
|
| 39 |
9,
|
| 40 |
"Bob",
|
| 41 |
-
"不仅如此,还有淮扬菜、湘菜、鲁菜等等,每个菜系都有其独特的风味。",
|
| 42 |
-
"
|
| 43 |
],
|
| 44 |
[
|
| 45 |
10,
|
| 46 |
"female2",
|
| 47 |
-
"对对对,像淮扬菜的狮子头,湘菜的剁椒鱼头,都是让人垂涎三尺的美味。",
|
| 48 |
-
"
|
| 49 |
],
|
| 50 |
]
|
| 51 |
|
|
@@ -111,10 +131,11 @@ def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Ta
|
|
| 111 |
script_table = gr.DataFrame(
|
| 112 |
headers=["index", "speaker", "text", "style"],
|
| 113 |
datatype=["number", "str", "str", "str"],
|
| 114 |
-
interactive=
|
| 115 |
wrap=True,
|
| 116 |
value=podcast_default_case,
|
| 117 |
row_count=(0, "dynamic"),
|
|
|
|
| 118 |
)
|
| 119 |
|
| 120 |
send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")
|
|
|
|
| 7 |
from modules.hf import spaces
|
| 8 |
|
| 9 |
podcast_default_case = [
|
| 10 |
+
[
|
| 11 |
+
1,
|
| 12 |
+
"female2",
|
| 13 |
+
"你好,欢迎收听今天的播客内容。今天我们要聊的是中华料理。 [lbreak]",
|
| 14 |
+
"podcast_p",
|
| 15 |
+
],
|
| 16 |
+
[
|
| 17 |
+
2,
|
| 18 |
+
"Alice",
|
| 19 |
+
"嗨,我特别期待这个话题!中华料理真的是博大精深。 [lbreak]",
|
| 20 |
+
"podcast_p",
|
| 21 |
+
],
|
| 22 |
[
|
| 23 |
3,
|
| 24 |
"Bob",
|
| 25 |
+
"没错,中华料理有着几千年的历史,而且每个地区都有自己的特色菜。 [lbreak]",
|
| 26 |
+
"podcast_p",
|
| 27 |
],
|
| 28 |
[
|
| 29 |
4,
|
| 30 |
"female2",
|
| 31 |
+
"那我们先从最有名的川菜开始吧。川菜以其麻辣著称,是很多人的最爱。 [lbreak]",
|
| 32 |
+
"podcast_p",
|
| 33 |
],
|
| 34 |
[
|
| 35 |
5,
|
| 36 |
"Alice",
|
| 37 |
+
"对,我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。 [lbreak]",
|
| 38 |
+
"podcast_p",
|
| 39 |
],
|
| 40 |
[
|
| 41 |
6,
|
| 42 |
"Bob",
|
| 43 |
+
"除了川菜,粤菜也是很受欢迎的。粤菜讲究鲜美,像是白切鸡和蒸鱼都是经典。 [lbreak]",
|
| 44 |
+
"podcast_p",
|
| 45 |
+
],
|
| 46 |
+
[
|
| 47 |
+
7,
|
| 48 |
+
"female2",
|
| 49 |
+
"对啊,粤菜的烹饪方式比较清淡,更注重食材本身的味道。 [lbreak]",
|
| 50 |
+
"podcast_p",
|
| 51 |
+
],
|
| 52 |
+
[
|
| 53 |
+
8,
|
| 54 |
+
"Alice",
|
| 55 |
+
"还有北京的京菜,像北京烤鸭,那可是来北京必吃的美食。 [lbreak]",
|
| 56 |
+
"podcast_p",
|
| 57 |
],
|
|
|
|
|
|
|
| 58 |
[
|
| 59 |
9,
|
| 60 |
"Bob",
|
| 61 |
+
"不仅如此,还有淮扬菜、湘菜、鲁菜等等,每个菜系都有其独特的风味。 [lbreak]",
|
| 62 |
+
"podcast_p",
|
| 63 |
],
|
| 64 |
[
|
| 65 |
10,
|
| 66 |
"female2",
|
| 67 |
+
"对对对,像淮扬菜的狮子头,湘菜的剁椒鱼头,都是让人垂涎三尺的美味。 [lbreak]",
|
| 68 |
+
"podcast_p",
|
| 69 |
],
|
| 70 |
]
|
| 71 |
|
|
|
|
| 131 |
script_table = gr.DataFrame(
|
| 132 |
headers=["index", "speaker", "text", "style"],
|
| 133 |
datatype=["number", "str", "str", "str"],
|
| 134 |
+
interactive=True,
|
| 135 |
wrap=True,
|
| 136 |
value=podcast_default_case,
|
| 137 |
row_count=(0, "dynamic"),
|
| 138 |
+
col_count=(4, "fixed"),
|
| 139 |
)
|
| 140 |
|
| 141 |
send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")
|
modules/webui/tts_tab.py
CHANGED
|
@@ -91,7 +91,9 @@ def create_tts_interface():
|
|
| 91 |
)
|
| 92 |
|
| 93 |
with gr.Tab(label="Upload"):
|
| 94 |
-
spk_file_upload = gr.File(
|
|
|
|
|
|
|
| 95 |
|
| 96 |
gr.Markdown("📝Speaker info")
|
| 97 |
infos = gr.Markdown("empty")
|
|
|
|
| 91 |
)
|
| 92 |
|
| 93 |
with gr.Tab(label="Upload"):
|
| 94 |
+
spk_file_upload = gr.File(
|
| 95 |
+
label="Speaker (Upload)", file_types=[".pt"]
|
| 96 |
+
)
|
| 97 |
|
| 98 |
gr.Markdown("📝Speaker info")
|
| 99 |
infos = gr.Markdown("empty")
|
modules/webui/webui_utils.py
CHANGED
|
@@ -93,13 +93,11 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
|
|
| 93 |
tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
|
| 94 |
enhancer = load_enhancer(device)
|
| 95 |
|
| 96 |
-
if enable_enhance:
|
| 97 |
lambd = 0.9 if enable_denoise else 0.1
|
| 98 |
tensor, sr = enhancer.enhance(
|
| 99 |
tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
|
| 100 |
)
|
| 101 |
-
elif enable_denoise:
|
| 102 |
-
tensor, sr = enhancer.denoise(tensor, sr)
|
| 103 |
|
| 104 |
audio_data = tensor.cpu().numpy()
|
| 105 |
return audio_data, int(sr)
|
|
|
|
| 93 |
tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
|
| 94 |
enhancer = load_enhancer(device)
|
| 95 |
|
| 96 |
+
if enable_enhance or enable_denoise:
|
| 97 |
lambd = 0.9 if enable_denoise else 0.1
|
| 98 |
tensor, sr = enhancer.enhance(
|
| 99 |
tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
|
| 100 |
)
|
|
|
|
|
|
|
| 101 |
|
| 102 |
audio_data = tensor.cpu().numpy()
|
| 103 |
return audio_data, int(sr)
|
webui.py
CHANGED
|
@@ -84,7 +84,6 @@ if __name__ == "__main__":
|
|
| 84 |
parser.add_argument(
|
| 85 |
"--language",
|
| 86 |
type=str,
|
| 87 |
-
default="zh-CN",
|
| 88 |
help="Set the default language for the webui",
|
| 89 |
)
|
| 90 |
args = parser.parse_args()
|
|
@@ -106,7 +105,7 @@ if __name__ == "__main__":
|
|
| 106 |
device_id = get_and_update_env(args, "device_id", None, str)
|
| 107 |
use_cpu = get_and_update_env(args, "use_cpu", [], list)
|
| 108 |
compile = get_and_update_env(args, "compile", False, bool)
|
| 109 |
-
language = get_and_update_env(args, "language",
|
| 110 |
|
| 111 |
webui_config.experimental = get_and_update_env(
|
| 112 |
args, "webui_experimental", False, bool
|
|
@@ -115,8 +114,6 @@ if __name__ == "__main__":
|
|
| 115 |
webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
|
| 116 |
webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
|
| 117 |
|
| 118 |
-
config.runtime_env_vars.language = "zh-CN"
|
| 119 |
-
|
| 120 |
webui_init()
|
| 121 |
demo = create_interface()
|
| 122 |
|
|
|
|
| 84 |
parser.add_argument(
|
| 85 |
"--language",
|
| 86 |
type=str,
|
|
|
|
| 87 |
help="Set the default language for the webui",
|
| 88 |
)
|
| 89 |
args = parser.parse_args()
|
|
|
|
| 105 |
device_id = get_and_update_env(args, "device_id", None, str)
|
| 106 |
use_cpu = get_and_update_env(args, "use_cpu", [], list)
|
| 107 |
compile = get_and_update_env(args, "compile", False, bool)
|
| 108 |
+
language = get_and_update_env(args, "language", "zh-CN", str)
|
| 109 |
|
| 110 |
webui_config.experimental = get_and_update_env(
|
| 111 |
args, "webui_experimental", False, bool
|
|
|
|
| 114 |
webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
|
| 115 |
webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
|
| 116 |
|
|
|
|
|
|
|
| 117 |
webui_init()
|
| 118 |
demo = create_interface()
|
| 119 |
|