# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Based on https://github.com/NVIDIA/flowtron/blob/master/data.py
# Original license text:
###############################################################################
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################

"""adapted from https://github.com/keithito/tacotron"""

import re
from string import punctuation
from functools import reduce

import torch
import torch.utils.data

#########
# REGEX #
#########

# Regular expression matching text enclosed in curly braces for encoding
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# Regular expression separating words enclosed in curly braces for cleaning
_arpa_re = re.compile(r"{[^}]+}|\S+")


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def remove_space_before_punctuation(text):
    # Drop a space that directly precedes a punctuation mark which is itself
    # followed by whitespace or the end of the string.
    return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)


class Cleaner:
    def __init__(self, cleaner_names, phonemedict):
        self.cleaner_names = cleaner_names
        self.phonemedict = phonemedict

    def __call__(self, text):
        for cleaner_name in self.cleaner_names:
            sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
            for fn in sequence_fns:
                text = fn(text)

            # Apply word-level functions to every token except spans already
            # enclosed in curly braces (phoneme transcriptions):
            text = [
                reduce(lambda x, y: y(x), word_fns, split)
                if split[0] != "{"
                else split
                for split in _arpa_re.findall(text)
            ]
            text = " ".join(text)
        text = remove_space_before_punctuation(text)
        return text

    def get_cleaner_fns(self, cleaner_name):
        sequence_fns = [lowercase, collapse_whitespace]
        word_fns = []
        return sequence_fns, word_fns


def get_symbols():
    # Symbol inventory: punctuation, special marks, and the Ukrainian alphabet.
    _punctuation = "'.,?! "
" _special = "-+" _letters = "абвгґдежзийклмнопрстуфхцчшщьюяєії" symbols = list(_punctuation + _special + _letters) return symbols class TextProcessing: def __init__( self, symbol_set, cleaner_name, heteronyms_path, phoneme_dict_path, p_phoneme, handle_phoneme, handle_phoneme_ambiguous, prepend_space_to_text=False, append_space_to_text=False, add_bos_eos_to_text=False, encoding="latin-1", ): self.phonemedict = {} self.p_phoneme = p_phoneme self.handle_phoneme = handle_phoneme self.handle_phoneme_ambiguous = handle_phoneme_ambiguous self.symbols = get_symbols() self.cleaner_names = cleaner_name self.cleaner = Cleaner(cleaner_name, self.phonemedict) self.prepend_space_to_text = prepend_space_to_text self.append_space_to_text = append_space_to_text self.add_bos_eos_to_text = add_bos_eos_to_text if add_bos_eos_to_text: self.symbols.append("") self.symbols.append("") # Mappings from symbol to numeric ID and vice versa: self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)} def text_to_sequence(self, text): sequence = [] # Check for curly braces and treat their contents as phoneme: while len(text): m = _curly_re.match(text) if not m: sequence += self.symbols_to_sequence(text) break sequence += self.symbols_to_sequence(m.group(1)) sequence += self.phoneme_to_sequence(m.group(2)) text = m.group(3) return sequence def sequence_to_text(self, sequence): result = "" for symbol_id in sequence: if symbol_id in self.id_to_symbol: s = self.id_to_symbol[symbol_id] # Enclose phoneme back in curly braces: if len(s) > 1 and s[0] == "@": s = "{%s}" % s[1:] result += s return result.replace("}{", " ") def clean_text(self, text): text = self.cleaner(text) return text def symbols_to_sequence(self, symbols): return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id] def encode_text(self, text, return_all=False): text_clean = self.clean_text(text) text = text_clean text_encoded = self.text_to_sequence(text) if self.prepend_space_to_text: text_encoded.insert(0, self.symbol_to_id[" "]) if self.append_space_to_text: text_encoded.append(self.symbol_to_id[" "]) if self.add_bos_eos_to_text: text_encoded.insert(0, self.symbol_to_id[""]) text_encoded.append(self.symbol_to_id[""]) if return_all: return text_encoded, text_clean return text_encoded class TextProcessor(torch.utils.data.Dataset): def __init__( self, datasets, filter_length, hop_length, win_length, sampling_rate, n_mel_channels, mel_fmin, mel_fmax, f0_min, f0_max, max_wav_value, use_f0, use_energy_avg, use_log_f0, use_scaled_energy, symbol_set, cleaner_names, heteronyms_path, phoneme_dict_path, p_phoneme, handle_phoneme="word", handle_phoneme_ambiguous="ignore", speaker_ids=None, include_speakers=None, n_frames=-1, use_attn_prior_masking=True, prepend_space_to_text=True, append_space_to_text=True, add_bos_eos_to_text=False, betabinom_cache_path="", betabinom_scaling_factor=0.05, lmdb_cache_path="", dur_min=None, dur_max=None, combine_speaker_and_emotion=False, **kwargs, ): self.tp = TextProcessing( symbol_set, cleaner_names, heteronyms_path, phoneme_dict_path, p_phoneme=p_phoneme, handle_phoneme=handle_phoneme, handle_phoneme_ambiguous=handle_phoneme_ambiguous, prepend_space_to_text=prepend_space_to_text, append_space_to_text=append_space_to_text, add_bos_eos_to_text=add_bos_eos_to_text, )