import logging

import regex as re

from tools.classify_language import classify_language, split_alpha_nonalpha


def check_is_none(item) -> bool:
    """Return True if ``item`` is None, an empty string, or only whitespace."""
    return (
        item is None
        or (isinstance(item, str) and str(item).isspace())
        or str(item) == ""
    )


def markup_language(text: str, target_languages: list = None) -> str:
    """Insert language tags such as ``[ZH]``/``[EN]`` around each language run in ``text``."""
    # Punctuation (ASCII and fullwidth/CJK) used as sentence boundaries.
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    pre_lang = ""
    p = 0

    if target_languages is not None:
        sorted_target_languages = sorted(target_languages)
        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
            # For mixed CJK/Latin targets, also split inside each sentence at
            # alphabetic/non-alphabetic boundaries.
            new_sentences = []
            for sentence in sentences:
                new_sentences.extend(split_alpha_nonalpha(sentence))
            sentences = new_sentences

    for sentence in sentences:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        if pre_lang == "":
            # First segment: open a tag for its language.
            text = text[:p] + text[p:].replace(
                sentence, f"[{lang.upper()}]{sentence}", 1
            )
            p += len(f"[{lang.upper()}]")
        elif pre_lang != lang:
            # Language changed: close the previous tag and open a new one.
            text = text[:p] + text[p:].replace(
                sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
            )
            p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
        pre_lang = lang
        p += text[p:].index(sentence) + len(sentence)
    # Close the tag of the final segment.
    text += f"[{pre_lang.upper()}]"

    return text

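# A minimal usage sketch (assuming split_alpha_nonalpha separates the CJK and
# Latin runs and classify_language labels them "zh" / "en", as the demo under
# __main__ below suggests):
#   markup_language("你好hello", ["zh", "en"])
#   # -> "[ZH]你好[ZH][EN]hello[EN]"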

def split_by_language(text: str, target_languages: list = None) -> list:
    """Split ``text`` into ``(segment, language)`` tuples, one per contiguous language run."""
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    pre_lang = ""
    start = 0
    end = 0
    sentences_list = []

    if target_languages is not None:
        sorted_target_languages = sorted(target_languages)
        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
            # For mixed CJK/Latin targets, also split inside each sentence at
            # alphabetic/non-alphabetic boundaries.
            new_sentences = []
            for sentence in sentences:
                new_sentences.extend(split_alpha_nonalpha(sentence))
            sentences = new_sentences

    for sentence in sentences:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        end += text[end:].index(sentence)
        if pre_lang != "" and pre_lang != lang:
            # Language changed: emit the span accumulated so far.
            sentences_list.append((text[start:end], pre_lang))
            start = end
        end += len(sentence)
        pre_lang = lang
    # Emit the trailing span.
    sentences_list.append((text[start:], pre_lang))

    return sentences_list

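# A minimal usage sketch (same assumptions about classify_language labels):
#   split_by_language("你好hello", ["zh", "en"])
#   # -> [("你好", "zh"), ("hello", "en")]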

def sentence_split(text: str, max: int) -> list:
    """Split ``text`` on punctuation, then regroup the pieces into chunks of roughly ``max`` characters (cut at the first boundary past ``max``)."""
    pattern = r"[!(),—+\-.:;??。,、;:]+"
    sentences = re.split(pattern, text)
    discarded_chars = re.findall(pattern, text)

    sentences_list, count, p = [], 0, 0

    # Accumulate sentences (plus the punctuation removed by the split) until the
    # running length reaches ``max``, then cut a chunk out of the original text.
    for i, discarded in enumerate(discarded_chars):
        count += len(sentences[i]) + len(discarded)
        if count >= max:
            sentences_list.append(text[p : p + count].strip())
            p += count
            count = 0

    # Append whatever remains after the last cut.
    if p < len(text):
        sentences_list.append(text[p:])

    return sentences_list

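# A rough illustration (chunks keep their trailing punctuation; cuts happen at
# the punctuation characters listed in ``pattern``):
#   sentence_split("你好,世界。再见。", max=3)
#   # -> ["你好,", "世界。", "再见。"]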

def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
    """Split ``text`` into chunks and wrap each chunk in language tags according to ``lang``."""
    # If the speaker supports only one language, force that language.
    if speaker_lang is not None and len(speaker_lang) == 1:
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logging.debug(
                f'lang "{lang}" is not in speaker_lang {speaker_lang}, automatically setting lang={speaker_lang[0]}'
            )
            lang = speaker_lang[0]

    sentences_list = []
    if lang.upper() != "MIX":
        if max <= 0:
            # No length limit: mark up the whole text at once.
            sentences_list.append(
                markup_language(text, speaker_lang)
                if lang.upper() == "AUTO"
                else f"[{lang.upper()}]{text}[{lang.upper()}]"
            )
        else:
            for i in sentence_split(text, max):
                if check_is_none(i):
                    continue
                sentences_list.append(
                    markup_language(i, speaker_lang)
                    if lang.upper() == "AUTO"
                    else f"[{lang.upper()}]{i}[{lang.upper()}]"
                )
    else:
        # "MIX": leave the text unchanged.
        sentences_list.append(text)

    for i in sentences_list:
        logging.debug(i)

    return sentences_list

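# A minimal usage sketch (assuming classify_language labels the chunk "zh"):
#   sentence_split_and_markup("你好。", max=50, lang="auto", speaker_lang=["zh"])
#   # -> ["[ZH]你好。[ZH]"]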

if __name__ == "__main__":
    text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
    print(markup_language(text, target_languages=None))
    print(sentence_split(text, max=50))
    print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))

    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    print(split_by_language(text, ["zh", "ja", "en"]))

    text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"

    print(split_by_language(text, ["zh", "ja", "en"]))

    print(split_by_language(text, ["zh", "en"]))

    text = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
    print(split_by_language(text, ["zh", "en"]))