# NOTE(review): the three lines here were web-page residue ("Spaces:" /
# "Build error" x2) from the scrape this file was recovered from, not code.
| import re | |
| import unicodedata | |
| from g2p_en import G2p | |
| from g2p_en.expand import normalize_numbers | |
| from nltk import pos_tag | |
| from nltk.tokenize import TweetTokenizer | |
| from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors | |
| from utils.text.text_encoder import PUNCS, is_sil_phoneme | |
class EnG2p(G2p):
    """G2p variant that tokenizes with NLTK's TweetTokenizer instead of
    g2p_en's default word splitter, then converts each token to phonemes.

    Word boundaries in the output are marked with a single " " element;
    no trailing separator is emitted.
    """
    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        """Return a flat list of phonemes for *text*, with " " between words."""
        tagged = pos_tag(EnG2p.word_tokenize(text))  # [(word, POS tag), ...]
        phonemes = []
        for idx, (word, tag) in enumerate(tagged):
            if idx > 0:
                phonemes.append(" ")  # word separator
            phonemes.extend(self._word_pron(word, tag))
        return phonemes

    def _word_pron(self, word, tag):
        """Pronunciation for a single token, given its POS tag."""
        # Tokens with no lowercase letter (punctuation, digits, symbols)
        # pass through unchanged as a one-element "pronunciation".
        if re.search("[a-z]", word) is None:
            return [word]
        # Homographs: pick the reading whose POS prefix matches the tag.
        if word in self.homograph2features:
            pron1, pron2, pos1 = self.homograph2features[word]
            return pron1 if tag.startswith(pos1) else pron2
        # Known word: first CMUdict pronunciation.
        if word in self.cmu:
            return self.cmu[word][0]
        # Out-of-vocabulary: fall back to the neural predictor.
        return self.predict(word)
@register_txt_processors('en')  # NOTE(review): decorator restored — register_txt_processors
# is imported above but was otherwise unused; confirm the registry key 'en'.
class TxtProcessor(BaseTxtProcessor):
    """English text-to-phoneme processor: normalizes raw text, then runs
    g2p to attach a phoneme list to every word."""
    g2p = EnG2p()

    @staticmethod  # restored: the function takes no self/cls
    def preprocess_text(text):
        """Normalize *text* to lowercase ASCII with single-spaced,
        deduplicated punctuation, ready for g2p."""
        text = normalize_numbers(text)           # spell out digits, e.g. "2" -> "two"
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)     # drop quotes/parentheses
        text = re.sub("[-]+", " ", text)         # hyphens become spaces
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)   # keep only letters, space, PUNCS
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # "a , b" -> "a,b"
        text = re.sub(f"([{PUNCS}])+", r"\1", text)     # "!!" -> "!"
        # Fixed: the original applied this identical replace twice; the
        # second call was a no-op and has been removed.
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)  # re-space punctuation as tokens
        text = re.sub(r"\s+", " ", text)          # collapse whitespace (was rf"", f was pointless)
        return text

    @classmethod  # restored: first parameter is named cls
    def process(cls, txt, preprocess_args):
        """Return (txt_struct, txt): txt_struct is [[word, [phonemes...]], ...]
        after the base-class postprocess step."""
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        txt_struct = [[w, []] for w in txt.split(" ")]
        # g2p emits " " between words; use it to advance the word cursor.
        i_word = 0
        for p in phs:
            if p == ' ':
                i_word += 1
            else:
                txt_struct[i_word][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt