txt2audio committed on
Commit 56c9694 · 1 Parent(s): c4d0fa8
This view is limited to 50 files because it contains too many changes. See the raw diff.
Files changed (50)
  1. NeuralSeq/LICENSE +0 -21
  2. NeuralSeq/README.md +0 -9
  3. NeuralSeq/configs/config_base.yaml +0 -42
  4. NeuralSeq/configs/singing/base.yaml +0 -42
  5. NeuralSeq/configs/singing/fs2.yaml +0 -3
  6. NeuralSeq/configs/tts/base.yaml +0 -95
  7. NeuralSeq/configs/tts/base_zh.yaml +0 -3
  8. NeuralSeq/configs/tts/emotion/base_text2mel.yaml +0 -17
  9. NeuralSeq/configs/tts/emotion/pre_align.py +0 -25
  10. NeuralSeq/configs/tts/fs2.yaml +0 -80
  11. NeuralSeq/configs/tts/hifigan.yaml +0 -21
  12. NeuralSeq/configs/tts/libritts/__pycache__/pre_align.cpython-38.pyc +0 -0
  13. NeuralSeq/configs/tts/libritts/base_text2mel.yaml +0 -14
  14. NeuralSeq/configs/tts/libritts/fs2.yaml +0 -3
  15. NeuralSeq/configs/tts/libritts/pre_align.py +0 -27
  16. NeuralSeq/configs/tts/libritts/pwg.yaml +0 -8
  17. NeuralSeq/configs/tts/lj/base_mel2wav.yaml +0 -3
  18. NeuralSeq/configs/tts/lj/base_text2mel.yaml +0 -13
  19. NeuralSeq/configs/tts/lj/fs2.yaml +0 -3
  20. NeuralSeq/configs/tts/lj/hifigan.yaml +0 -3
  21. NeuralSeq/configs/tts/lj/pwg.yaml +0 -3
  22. NeuralSeq/configs/tts/pwg.yaml +0 -110
  23. NeuralSeq/data_gen/tts/__pycache__/base_binarizer.cpython-38.pyc +0 -0
  24. NeuralSeq/data_gen/tts/__pycache__/base_binarizer_emotion.cpython-38.pyc +0 -0
  25. NeuralSeq/data_gen/tts/__pycache__/base_preprocess.cpython-38.pyc +0 -0
  26. NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-37.pyc +0 -0
  27. NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-38.pyc +0 -0
  28. NeuralSeq/data_gen/tts/base_binarizer.py +0 -224
  29. NeuralSeq/data_gen/tts/base_binarizer_emotion.py +0 -352
  30. NeuralSeq/data_gen/tts/base_preprocess.py +0 -254
  31. NeuralSeq/data_gen/tts/binarizer_zh.py +0 -59
  32. NeuralSeq/data_gen/tts/data_gen_utils.py +0 -357
  33. NeuralSeq/data_gen/tts/emotion/__pycache__/audio.cpython-38.pyc +0 -0
  34. NeuralSeq/data_gen/tts/emotion/__pycache__/inference.cpython-38.pyc +0 -0
  35. NeuralSeq/data_gen/tts/emotion/__pycache__/model.cpython-38.pyc +0 -0
  36. NeuralSeq/data_gen/tts/emotion/__pycache__/params_data.cpython-38.pyc +0 -0
  37. NeuralSeq/data_gen/tts/emotion/__pycache__/params_model.cpython-38.pyc +0 -0
  38. NeuralSeq/data_gen/tts/emotion/audio.py +0 -107
  39. NeuralSeq/data_gen/tts/emotion/inference.py +0 -177
  40. NeuralSeq/data_gen/tts/emotion/model.py +0 -78
  41. NeuralSeq/data_gen/tts/emotion/params_data.py +0 -29
  42. NeuralSeq/data_gen/tts/emotion/params_model.py +0 -11
  43. NeuralSeq/data_gen/tts/emotion/test_emotion.py +0 -184
  44. NeuralSeq/data_gen/tts/txt_processors/__init__.py +0 -1
  45. NeuralSeq/data_gen/tts/txt_processors/__pycache__/__init__.cpython-38.pyc +0 -0
  46. NeuralSeq/data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-38.pyc +0 -0
  47. NeuralSeq/data_gen/tts/txt_processors/__pycache__/en.cpython-38.pyc +0 -0
  48. NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py +0 -47
  49. NeuralSeq/data_gen/tts/txt_processors/en.py +0 -77
  50. NeuralSeq/data_gen/tts/txt_processors/zh.py +0 -43
NeuralSeq/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2021 Jinglin Liu
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
NeuralSeq/README.md DELETED
@@ -1,9 +0,0 @@
- ---
- title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
- emoji: 🎶
- colorFrom: purple
- colorTo: blue
- sdk: gradio
- app_file: "inference/svs/gradio/infer.py"
- pinned: false
- ---
 
NeuralSeq/configs/config_base.yaml DELETED
@@ -1,42 +0,0 @@
- # task
- binary_data_dir: ''
- work_dir: '' # experiment directory.
- infer: false # infer
- seed: 1234
- debug: false
- save_codes:
-   - configs
-   - modules
-   - tasks
-   - utils
-   - usr
-
- #############
- # dataset
- #############
- ds_workers: 1
- test_num: 100
- valid_num: 100
- endless_ds: false
- sort_by_len: true
-
- #########
- # train and eval
- #########
- load_ckpt: ''
- save_ckpt: true
- save_best: false
- num_ckpt_keep: 3
- clip_grad_norm: 0
- accumulate_grad_batches: 1
- log_interval: 100
- num_sanity_val_steps: 5 # steps of validation at the beginning
- check_val_every_n_epoch: 10
- val_check_interval: 2000
- max_epochs: 1000
- max_updates: 160000
- max_tokens: 31250
- max_sentences: 100000
- max_eval_tokens: -1
- max_eval_sentences: -1
- test_input_dir: ''
 
NeuralSeq/configs/singing/base.yaml DELETED
@@ -1,42 +0,0 @@
- base_config:
-   - configs/tts/base.yaml
-   - configs/tts/base_zh.yaml
-
-
- datasets: []
- test_prefixes: []
- test_num: 0
- valid_num: 0
-
- pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
- binarizer_cls: data_gen.singing.binarize.SingingBinarizer
- pre_align_args:
-   use_tone: false # for ZH
-   forced_align: mfa
-   use_sox: true
- hop_size: 128 # Hop size.
- fft_size: 512 # FFT size.
- win_size: 512 # FFT size.
- max_frames: 8000
- fmin: 50 # Minimum freq in mel basis calculation.
- fmax: 11025 # Maximum frequency in mel basis calculation.
- pitch_type: frame
-
- hidden_size: 256
- mel_loss: "ssim:0.5|l1:0.5"
- lambda_f0: 0.0
- lambda_uv: 0.0
- lambda_energy: 0.0
- lambda_ph_dur: 0.0
- lambda_sent_dur: 0.0
- lambda_word_dur: 0.0
- predictor_grad: 0.0
- use_spk_embed: true
- use_spk_id: false
-
- max_tokens: 20000
- max_updates: 400000
- num_spk: 100
- save_f0: true
- use_gt_dur: true
- use_gt_f0: true
 
NeuralSeq/configs/singing/fs2.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/fs2.yaml
-   - configs/singing/base.yaml
 
NeuralSeq/configs/tts/base.yaml DELETED
@@ -1,95 +0,0 @@
- # task
- base_config: configs/config_base.yaml
- task_cls: ''
- #############
- # dataset
- #############
- raw_data_dir: ''
- processed_data_dir: ''
- binary_data_dir: ''
- dict_dir: ''
- pre_align_cls: ''
- binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
- pre_align_args:
-   use_tone: true # for ZH
-   forced_align: mfa
-   use_sox: false
-   txt_processor: en
-   allow_no_txt: false
-   denoise: false
- binarization_args:
-   shuffle: false
-   with_txt: true
-   with_wav: false
-   with_align: true
-   with_spk_embed: true
-   with_f0: true
-   with_f0cwt: true
-
- loud_norm: false
- endless_ds: true
- reset_phone_dict: true
-
- test_num: 100
- valid_num: 100
- max_frames: 1550
- max_input_tokens: 1550
- audio_num_mel_bins: 80
- audio_sample_rate: 22050
- hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
- win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
- fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
- fmax: 7600 # To be increased/reduced depending on data.
- fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
- min_level_db: -100
- num_spk: 1
- mel_vmin: -6
- mel_vmax: 1.5
- ds_workers: 4
-
- #########
- # model
- #########
- dropout: 0.1
- enc_layers: 4
- dec_layers: 4
- hidden_size: 384
- num_heads: 2
- prenet_dropout: 0.5
- prenet_hidden_size: 256
- stop_token_weight: 5.0
- enc_ffn_kernel_size: 9
- dec_ffn_kernel_size: 9
- ffn_act: gelu
- ffn_padding: 'SAME'
-
-
- ###########
- # optimization
- ###########
- lr: 2.0
- warmup_updates: 8000
- optimizer_adam_beta1: 0.9
- optimizer_adam_beta2: 0.98
- weight_decay: 0
- clip_grad_norm: 1
-
-
- ###########
- # train and eval
- ###########
- max_tokens: 30000
- max_sentences: 100000
- max_eval_sentences: 1
- max_eval_tokens: 60000
- train_set_name: 'train'
- valid_set_name: 'valid'
- test_set_name: 'test'
- vocoder: pwg
- vocoder_ckpt: ''
- profile_infer: false
- out_wav_norm: false
- save_gt: false
- save_f0: false
- gen_dir_name: ''
- use_denoise: false
 
NeuralSeq/configs/tts/base_zh.yaml DELETED
@@ -1,3 +0,0 @@
- pre_align_args:
-   txt_processor: zh_g2pM
- binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer
 
NeuralSeq/configs/tts/emotion/base_text2mel.yaml DELETED
@@ -1,17 +0,0 @@
- raw_data_dir: 'data/raw/ESD'
- processed_data_dir: 'data/processed/emotion'
- binary_data_dir: 'data/binary/emotion'
- pre_align_cls: configs.tts.emotion.pre_align.EmoPreAlign
- audio_sample_rate: 16000
- binarization_args:
-   shuffle: true
- binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
- use_spk_id: true
- test_num: 200
- num_spk: 10
- pitch_type: frame
- min_frames: 128
- num_test_samples: 30
- mel_loss: "ssim:0.5|l1:0.5"
- vocoder_ckpt: ''
- use_emotion: true
 
NeuralSeq/configs/tts/emotion/pre_align.py DELETED
@@ -1,25 +0,0 @@
- import os
-
- from data_gen.tts.base_preprocess import BasePreprocessor
- import glob
- import re
-
- class EmoPreAlign(BasePreprocessor):
-
-     def meta_data(self):
-         spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
-         pattern = re.compile('[\t\n ]+')
-         for spk in spks:
-             for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'):  # open the file
-                 line = re.sub(pattern, ' ', line)
-                 if line == ' ': continue
-                 split_ = line.split(' ')
-                 txt = ' '.join(split_[1: -2])
-                 item_name = split_[0]
-                 emotion = split_[-2]
-                 wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
-                 yield item_name, wav_fn, txt, spk, emotion
-
-
- if __name__ == "__main__":
-     EmoPreAlign().process()
 
NeuralSeq/configs/tts/fs2.yaml DELETED
@@ -1,80 +0,0 @@
- base_config: configs/tts/base.yaml
- task_cls: tasks.tts.fs2.FastSpeech2Task
-
- # model
- hidden_size: 256
- dropout: 0.1
- encoder_type: fft # fft|tacotron|tacotron2|conformer
- encoder_K: 8 # for tacotron encoder
- decoder_type: fft # fft|rnn|conv|conformer
- use_pos_embed: true
-
- # duration
- predictor_hidden: -1
- predictor_kernel: 5
- predictor_layers: 2
- dur_predictor_kernel: 3
- dur_predictor_layers: 2
- predictor_dropout: 0.5
-
- # pitch and energy
- use_pitch_embed: true
- pitch_type: ph # frame|ph|cwt
- use_uv: true
- cwt_hidden_size: 128
- cwt_layers: 2
- cwt_loss: l1
- cwt_add_f0_loss: false
- cwt_std_scale: 0.8
-
- pitch_ar: false
- #pitch_embed_type: 0q
- pitch_loss: 'l1' # l1|l2|ssim
- pitch_norm: log
- use_energy_embed: false
-
- # reference encoder and speaker embedding
- use_spk_id: false
- use_split_spk_id: false
- use_spk_embed: false
- use_var_enc: false
- lambda_commit: 0.25
- ref_norm_layer: bn
- pitch_enc_hidden_stride_kernel:
-   - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
-   - 0,2,5
-   - 0,2,5
- dur_enc_hidden_stride_kernel:
-   - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
-   - 0,2,3
-   - 0,1,3
-
-
- # mel
- mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
-
- # loss lambda
- lambda_f0: 1.0
- lambda_uv: 1.0
- lambda_energy: 0.1
- lambda_ph_dur: 1.0
- lambda_sent_dur: 1.0
- lambda_word_dur: 1.0
- predictor_grad: 0.1
-
- # train and eval
- pretrain_fs_ckpt: ''
- warmup_updates: 2000
- max_tokens: 32000
- max_sentences: 100000
- max_eval_sentences: 1
- max_updates: 120000
- num_valid_plots: 5
- num_test_samples: 0
- test_ids: []
- use_gt_dur: false
- use_gt_f0: false
-
- # exp
- dur_loss: mse # huber|mol
- norm_type: gn
 
NeuralSeq/configs/tts/hifigan.yaml DELETED
@@ -1,21 +0,0 @@
- base_config: configs/tts/pwg.yaml
- task_cls: tasks.vocoder.hifigan.HifiGanTask
- resblock: "1"
- adam_b1: 0.8
- adam_b2: 0.99
- upsample_rates: [ 8,8,2,2 ]
- upsample_kernel_sizes: [ 16,16,4,4 ]
- upsample_initial_channel: 128
- resblock_kernel_sizes: [ 3,7,11 ]
- resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
-
- lambda_mel: 45.0
-
- max_samples: 8192
- max_sentences: 16
-
- generator_params:
-   lr: 0.0002 # Generator's learning rate.
-   aux_context_window: 0 # Context window size for auxiliary feature.
- discriminator_optimizer_params:
-   lr: 0.0002 # Discriminator's learning rate.
 
NeuralSeq/configs/tts/libritts/__pycache__/pre_align.cpython-38.pyc DELETED
Binary file (981 Bytes)
 
NeuralSeq/configs/tts/libritts/base_text2mel.yaml DELETED
@@ -1,14 +0,0 @@
- raw_data_dir: 'data/raw/LibriTTS'
- processed_data_dir: 'data/processed/libritts'
- binary_data_dir: 'data/binary/libritts'
- pre_align_cls: configs.tts.libritts.pre_align.LibrittsPreAlign
- binarization_args:
-   shuffle: true
- use_spk_id: true
- test_num: 200
- num_spk: 2320
- pitch_type: frame
- min_frames: 128
- num_test_samples: 30
- mel_loss: "ssim:0.5|l1:0.5"
- vocoder_ckpt: ''
 
NeuralSeq/configs/tts/libritts/fs2.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/fs2.yaml
-   - ./base_text2mel.yaml
 
NeuralSeq/configs/tts/libritts/pre_align.py DELETED
@@ -1,27 +0,0 @@
- import os
-
- from data_gen.tts.base_preprocess import BasePreprocessor
- import glob
-
-
- class LibrittsPreAlign(BasePreprocessor):
-     def meta_data(self):
-         wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav'))
-         for wav_fn in wav_fns:
-             item_name = os.path.basename(wav_fn)[:-4]
-             txt_fn = f'{wav_fn[:-4]}.normalized.txt'
-             with open(txt_fn, 'r') as f:
-                 txt = f.readlines()
-                 f.close()
-             spk = item_name.split("_")[0]
-             # Example:
-             #
-             # 'item_name': '103_1241_000000_000001'
-             # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav'
-             # 'txt': 'matthew Cuthbert is surprised'
-             # 'spk_name': '103'
-             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk}
-
-
- if __name__ == "__main__":
-     LibrittsPreAlign().process()
 
NeuralSeq/configs/tts/libritts/pwg.yaml DELETED
@@ -1,8 +0,0 @@
- base_config: egs/egs_bases/tts/vocoder/pwg.yaml
- raw_data_dir: 'data/raw/LibriTTS'
- processed_data_dir: 'data/processed/libritts'
- binary_data_dir: 'data/binary/libritts_wav'
- generator_params:
-   kernel_size: 5
- num_spk: 400
- max_samples: 20480
 
NeuralSeq/configs/tts/lj/base_mel2wav.yaml DELETED
@@ -1,3 +0,0 @@
- raw_data_dir: 'data/raw/LJSpeech-1.1'
- processed_data_dir: 'data/processed/ljspeech'
- binary_data_dir: 'data/binary/ljspeech_wav'
 
NeuralSeq/configs/tts/lj/base_text2mel.yaml DELETED
@@ -1,13 +0,0 @@
- raw_data_dir: 'data/raw/LJSpeech-1.1'
- processed_data_dir: 'data/processed/ljspeech'
- binary_data_dir: 'data/binary/ljspeech'
- pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
-
- pitch_type: cwt
- mel_loss: l1
- num_test_samples: 20
- test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
-             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
- use_energy_embed: false
- test_num: 523
- valid_num: 348
 
NeuralSeq/configs/tts/lj/fs2.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/fs2.yaml
-   - configs/tts/lj/base_text2mel.yaml
 
NeuralSeq/configs/tts/lj/hifigan.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/hifigan.yaml
-   - configs/tts/lj/base_mel2wav.yaml
 
NeuralSeq/configs/tts/lj/pwg.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/pwg.yaml
-   - configs/tts/lj/base_mel2wav.yaml
 
NeuralSeq/configs/tts/pwg.yaml DELETED
@@ -1,110 +0,0 @@
- base_config: configs/tts/base.yaml
- task_cls: tasks.vocoder.pwg.PwgTask
-
- binarization_args:
-   with_wav: true
-   with_spk_embed: false
-   with_align: false
- test_input_dir: ''
-
- ###########
- # train and eval
- ###########
- max_samples: 25600
- max_sentences: 5
- max_eval_sentences: 1
- max_updates: 1000000
- val_check_interval: 2000
-
-
- ###########################################################
- #                FEATURE EXTRACTION SETTING               #
- ###########################################################
- sampling_rate: 22050 # Sampling rate.
- fft_size: 1024 # FFT size.
- hop_size: 256 # Hop size.
- win_length: null # Window length.
- # If set to null, it will be the same as fft_size.
- window: "hann" # Window function.
- num_mels: 80 # Number of mel basis.
- fmin: 80 # Minimum freq in mel basis calculation.
- fmax: 7600 # Maximum frequency in mel basis calculation.
- format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
-
- ###########################################################
- #         GENERATOR NETWORK ARCHITECTURE SETTING          #
- ###########################################################
- generator_params:
-   in_channels: 1 # Number of input channels.
-   out_channels: 1 # Number of output channels.
-   kernel_size: 3 # Kernel size of dilated convolution.
-   layers: 30 # Number of residual block layers.
-   stacks: 3 # Number of stacks i.e., dilation cycles.
-   residual_channels: 64 # Number of channels in residual conv.
-   gate_channels: 128 # Number of channels in gated conv.
-   skip_channels: 64 # Number of channels in skip conv.
-   aux_channels: 80 # Number of channels for auxiliary feature conv.
-   # Must be the same as num_mels.
-   aux_context_window: 2 # Context window size for auxiliary feature.
-   # If set to 2, previous 2 and future 2 frames will be considered.
-   dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
-   use_weight_norm: true # Whether to use weight norm.
-   # If set to true, it will be applied to all of the conv layers.
-   upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
-   upsample_params: # Upsampling network parameters.
-     upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size.
-   use_pitch_embed: false
-
- ###########################################################
- #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
- ###########################################################
- discriminator_params:
-   in_channels: 1 # Number of input channels.
-   out_channels: 1 # Number of output channels.
-   kernel_size: 3 # Number of output channels.
-   layers: 10 # Number of conv layers.
-   conv_channels: 64 # Number of chnn layers.
-   bias: true # Whether to use bias parameter in conv.
-   use_weight_norm: true # Whether to use weight norm.
-   # If set to true, it will be applied to all of the conv layers.
-   nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
-   nonlinear_activation_params: # Nonlinear function parameters
-     negative_slope: 0.2 # Alpha in LeakyReLU.
-
- ###########################################################
- #                   STFT LOSS SETTING                     #
- ###########################################################
- stft_loss_params:
-   fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
-   hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
-   win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
-   window: "hann_window" # Window function for STFT-based loss
- use_mel_loss: false
-
- ###########################################################
- #               ADVERSARIAL LOSS SETTING                  #
- ###########################################################
- lambda_adv: 4.0 # Loss balancing coefficient.
-
- ###########################################################
- #             OPTIMIZER & SCHEDULER SETTING               #
- ###########################################################
- generator_optimizer_params:
-   lr: 0.0001 # Generator's learning rate.
-   eps: 1.0e-6 # Generator's epsilon.
-   weight_decay: 0.0 # Generator's weight decay coefficient.
- generator_scheduler_params:
-   step_size: 200000 # Generator's scheduler step size.
-   gamma: 0.5 # Generator's scheduler gamma.
-   # At each step size, lr will be multiplied by this parameter.
- generator_grad_norm: 10 # Generator's gradient norm.
- discriminator_optimizer_params:
-   lr: 0.00005 # Discriminator's learning rate.
-   eps: 1.0e-6 # Discriminator's epsilon.
-   weight_decay: 0.0 # Discriminator's weight decay coefficient.
- discriminator_scheduler_params:
-   step_size: 200000 # Discriminator's scheduler step size.
-   gamma: 0.5 # Discriminator's scheduler gamma.
-   # At each step size, lr will be multiplied by this parameter.
- discriminator_grad_norm: 1 # Discriminator's gradient norm.
- disc_start_steps: 40000 # Number of steps to start to train discriminator.
 
NeuralSeq/data_gen/tts/__pycache__/base_binarizer.cpython-38.pyc DELETED
Binary file (8.23 kB)
 
NeuralSeq/data_gen/tts/__pycache__/base_binarizer_emotion.cpython-38.pyc DELETED
Binary file (13.3 kB)
 
NeuralSeq/data_gen/tts/__pycache__/base_preprocess.cpython-38.pyc DELETED
Binary file (11.1 kB)
 
NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-37.pyc DELETED
Binary file (11 kB)
 
NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-38.pyc DELETED
Binary file (11 kB)
 
NeuralSeq/data_gen/tts/base_binarizer.py DELETED
@@ -1,224 +0,0 @@
1
- import os
2
- os.environ["OMP_NUM_THREADS"] = "1"
3
-
4
- from utils.multiprocess_utils import chunked_multiprocess_run
5
- import random
6
- import traceback
7
- import json
8
- from resemblyzer import VoiceEncoder
9
- from tqdm import tqdm
10
- from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
11
- from utils.hparams import set_hparams, hparams
12
- import numpy as np
13
- from utils.indexed_datasets import IndexedDatasetBuilder
14
- from vocoders.base_vocoder import VOCODERS
15
- import pandas as pd
16
-
17
-
18
- class BinarizationError(Exception):
19
- pass
20
-
21
-
22
- class BaseBinarizer:
23
- def __init__(self, processed_data_dir=None):
24
- if processed_data_dir is None:
25
- processed_data_dir = hparams['processed_data_dir']
26
- self.processed_data_dirs = processed_data_dir.split(",")
27
- self.binarization_args = hparams['binarization_args']
28
- self.pre_align_args = hparams['pre_align_args']
29
- self.forced_align = self.pre_align_args['forced_align']
30
- tg_dir = None
31
- if self.forced_align == 'mfa':
32
- tg_dir = 'mfa_outputs'
33
- if self.forced_align == 'kaldi':
34
- tg_dir = 'kaldi_outputs'
35
- self.item2txt = {}
36
- self.item2ph = {}
37
- self.item2wavfn = {}
38
- self.item2tgfn = {}
39
- self.item2spk = {}
40
- for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
41
- self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
42
- for r_idx, r in self.meta_df.iterrows():
43
- item_name = raw_item_name = r['item_name']
44
- if len(self.processed_data_dirs) > 1:
45
- item_name = f'ds{ds_id}_{item_name}'
46
- self.item2txt[item_name] = r['txt']
47
- self.item2ph[item_name] = r['ph']
48
- self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
49
- self.item2spk[item_name] = r.get('spk', 'SPK1')
50
- if len(self.processed_data_dirs) > 1:
51
- self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
52
- if tg_dir is not None:
53
- self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
54
- self.item_names = sorted(list(self.item2txt.keys()))
55
- if self.binarization_args['shuffle']:
56
- random.seed(1234)
57
- random.shuffle(self.item_names)
58
-
59
- @property
60
- def train_item_names(self):
61
- return self.item_names[hparams['test_num']+hparams['valid_num']:]
62
-
63
- @property
64
- def valid_item_names(self):
65
- return self.item_names[0: hparams['test_num']+hparams['valid_num']] #
66
-
67
- @property
68
- def test_item_names(self):
69
- return self.item_names[0: hparams['test_num']] # Audios for MOS testing are in 'test_ids'
70
-
71
- def build_spk_map(self):
72
- spk_map = set()
73
- for item_name in self.item_names:
74
- spk_name = self.item2spk[item_name]
75
- spk_map.add(spk_name)
76
- spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
77
- assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
78
- return spk_map
79
-
80
- def item_name2spk_id(self, item_name):
81
- return self.spk_map[self.item2spk[item_name]]
82
-
83
- def _phone_encoder(self):
84
- ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
85
- ph_set = []
86
- if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
87
- for processed_data_dir in self.processed_data_dirs:
88
- ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
89
- ph_set = sorted(set(ph_set))
90
- json.dump(ph_set, open(ph_set_fn, 'w'))
91
- else:
92
- ph_set = json.load(open(ph_set_fn, 'r'))
93
- print("| phone set: ", ph_set)
94
- return build_phone_encoder(hparams['binary_data_dir'])
95
-
96
- def meta_data(self, prefix):
97
- if prefix == 'valid':
98
- item_names = self.valid_item_names
99
- elif prefix == 'test':
100
- item_names = self.test_item_names
101
- else:
102
- item_names = self.train_item_names
103
- for item_name in item_names:
104
- ph = self.item2ph[item_name]
105
- txt = self.item2txt[item_name]
106
- tg_fn = self.item2tgfn.get(item_name)
107
- wav_fn = self.item2wavfn[item_name]
108
- spk_id = self.item_name2spk_id(item_name)
109
- yield item_name, ph, txt, tg_fn, wav_fn, spk_id
110
-
111
- def process(self):
112
- os.makedirs(hparams['binary_data_dir'], exist_ok=True)
113
- self.spk_map = self.build_spk_map()
114
- print("| spk_map: ", self.spk_map)
115
- spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
116
- json.dump(self.spk_map, open(spk_map_fn, 'w'))
117
-
118
- self.phone_encoder = self._phone_encoder()
119
- self.process_data('valid')
120
- self.process_data('test')
121
- self.process_data('train')
122
-
123
- def process_data(self, prefix):
124
- data_dir = hparams['binary_data_dir']
125
- args = []
126
- builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
127
- lengths = []
128
- f0s = []
129
- total_sec = 0
130
- if self.binarization_args['with_spk_embed']:
131
- voice_encoder = VoiceEncoder().cuda()
132
-
133
- meta_data = list(self.meta_data(prefix))
134
- for m in meta_data:
135
- args.append(list(m) + [self.phone_encoder, self.binarization_args])
136
- num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
137
- for f_id, (_, item) in enumerate(
138
- zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
139
- if item is None:
140
- continue
141
- item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
142
- if self.binarization_args['with_spk_embed'] else None
143
- if not self.binarization_args['with_wav'] and 'wav' in item:
144
- print("del wav")
145
- del item['wav']
146
- builder.add_item(item)
147
- lengths.append(item['len'])
148
- total_sec += item['sec']
149
- if item.get('f0') is not None:
150
- f0s.append(item['f0'])
151
- builder.finalize()
152
- np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
153
- if len(f0s) > 0:
154
- f0s = np.concatenate(f0s, 0)
155
- f0s = f0s[f0s != 0]
156
- np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
157
- print(f"| {prefix} total duration: {total_sec:.3f}s")
158
-
159
- @classmethod
160
- def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
161
- if hparams['vocoder'] in VOCODERS:
162
- wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
163
- else:
164
- wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
165
- res = {
166
- 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
167
- 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
168
- }
169
- try:
170
- if binarization_args['with_f0']:
171
- cls.get_pitch(wav, mel, res)
172
- if binarization_args['with_f0cwt']:
173
- cls.get_f0cwt(res['f0'], res)
174
- if binarization_args['with_txt']:
175
- try:
176
- phone_encoded = res['phone'] = encoder.encode(ph)
177
- except:
178
- traceback.print_exc()
179
- raise BinarizationError(f"Empty phoneme")
180
- if binarization_args['with_align']:
181
- cls.get_align(tg_fn, ph, mel, phone_encoded, res)
182
- except BinarizationError as e:
183
- print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
184
- return None
185
- return res
186
-
187
- @staticmethod
188
- def get_align(tg_fn, ph, mel, phone_encoded, res):
189
- if tg_fn is not None and os.path.exists(tg_fn):
190
- mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
191
- else:
192
- raise BinarizationError(f"Align not found")
193
- if mel2ph.max() - 1 >= len(phone_encoded):
194
- raise BinarizationError(
195
- f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
196
- res['mel2ph'] = mel2ph
197
- res['dur'] = dur
198
-
199
- @staticmethod
200
- def get_pitch(wav, mel, res):
201
- f0, pitch_coarse = get_pitch(wav, mel, hparams)
202
- if sum(f0) == 0:
203
- raise BinarizationError("Empty f0")
204
- res['f0'] = f0
205
- res['pitch'] = pitch_coarse
206
-
207
- @staticmethod
208
- def get_f0cwt(f0, res):
209
- from utils.cwt import get_cont_lf0, get_lf0_cwt
210
- uv, cont_lf0_lpf = get_cont_lf0(f0)
211
- logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
212
- cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
213
- Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
214
- if np.any(np.isnan(Wavelet_lf0)):
215
- raise BinarizationError("NaN CWT")
216
- res['cwt_spec'] = Wavelet_lf0
217
- res['cwt_scales'] = scales
218
- res['f0_mean'] = logf0s_mean_org
219
- res['f0_std'] = logf0s_std_org
220
-
221
-
222
- if __name__ == "__main__":
223
- set_hparams()
224
- BaseBinarizer().process()
 
NeuralSeq/data_gen/tts/base_binarizer_emotion.py DELETED
@@ -1,352 +0,0 @@
1
- import os
2
-
3
- os.environ["OMP_NUM_THREADS"] = "1"
4
- import torch
5
- from collections import Counter
6
- from utils.text_encoder import TokenTextEncoder
7
- from data_gen.tts.emotion import inference as EmotionEncoder
8
- from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
9
- from data_gen.tts.emotion.inference import preprocess_wav
10
- from utils.multiprocess_utils import chunked_multiprocess_run
11
- import random
12
- import traceback
13
- import json
14
- from resemblyzer import VoiceEncoder
15
- from tqdm import tqdm
16
- from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder, is_sil_phoneme
17
- from utils.hparams import hparams, set_hparams
18
- import numpy as np
19
- from utils.indexed_datasets import IndexedDatasetBuilder
20
- from vocoders.base_vocoder import get_vocoder_cls
21
- import pandas as pd
22
-
23
-
24
- class BinarizationError(Exception):
25
- pass
26
-
27
-
28
- class EmotionBinarizer:
29
- def __init__(self, processed_data_dir=None):
30
- if processed_data_dir is None:
31
- processed_data_dir = hparams['processed_data_dir']
32
- self.processed_data_dirs = processed_data_dir.split(",")
33
- self.binarization_args = hparams['binarization_args']
34
- self.pre_align_args = hparams['pre_align_args']
35
- self.item2txt = {}
36
- self.item2ph = {}
37
- self.item2wavfn = {}
38
- self.item2tgfn = {}
39
- self.item2spk = {}
40
- self.item2emo = {}
41
-
42
- def load_meta_data(self):
43
- for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
44
- self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
45
- for r_idx, r in tqdm(self.meta_df.iterrows(), desc='Loading meta data.'):
46
- item_name = raw_item_name = r['item_name']
47
- if len(self.processed_data_dirs) > 1:
48
- item_name = f'ds{ds_id}_{item_name}'
49
- self.item2txt[item_name] = r['txt']
50
- self.item2ph[item_name] = r['ph']
51
- self.item2wavfn[item_name] = r['wav_fn']
52
- self.item2spk[item_name] = r.get('spk_name', 'SPK1') \
53
- if self.binarization_args['with_spk_id'] else 'SPK1'
54
- if len(self.processed_data_dirs) > 1:
55
- self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
56
- self.item2tgfn[item_name] = f"{processed_data_dir}/mfa_outputs/{raw_item_name}.TextGrid"
57
- self.item2emo[item_name] = r.get('others', '"Neutral"')
58
- self.item_names = sorted(list(self.item2txt.keys()))
59
- if self.binarization_args['shuffle']:
60
- random.seed(1234)
61
- random.shuffle(self.item_names)
62
-
63
- @property
64
- def train_item_names(self):
65
- return self.item_names[hparams['test_num']:]
66
-
67
- @property
68
- def valid_item_names(self):
69
- return self.item_names[:hparams['test_num']]
70
-
71
- @property
72
- def test_item_names(self):
73
- return self.valid_item_names
74
-
75
- def build_spk_map(self):
76
- spk_map = set()
77
- for item_name in self.item_names:
78
- spk_name = self.item2spk[item_name]
79
- spk_map.add(spk_name)
80
- spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
81
- print("| #Spk: ", len(spk_map))
82
- assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
83
- return spk_map
84
-
85
- def build_emo_map(self):
86
- emo_map = set()
87
- for item_name in self.item_names:
88
- emo_name = self.item2emo[item_name]
89
- emo_map.add(emo_name)
90
- emo_map = {x: i for i, x in enumerate(sorted(list(emo_map)))}
91
- print("| #Emo: ", len(emo_map))
92
- return emo_map
93
-
94
- def item_name2spk_id(self, item_name):
95
- return self.spk_map[self.item2spk[item_name]]
96
-
97
- def item_name2emo_id(self, item_name):
98
- return self.emo_map[self.item2emo[item_name]]
99
-
100
- def _phone_encoder(self):
101
- ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
102
- ph_set = []
103
- if self.binarization_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
104
- for ph_sent in self.item2ph.values():
105
- ph_set += ph_sent.split(' ')
106
- ph_set = sorted(set(ph_set))
107
- json.dump(ph_set, open(ph_set_fn, 'w'))
108
- print("| Build phone set: ", ph_set)
109
- else:
110
- ph_set = json.load(open(ph_set_fn, 'r'))
111
- print("| Load phone set: ", ph_set)
112
- return build_phone_encoder(hparams['binary_data_dir'])
113
-
114
- def _word_encoder(self):
115
- fn = f"{hparams['binary_data_dir']}/word_set.json"
116
- word_set = []
117
- if self.binarization_args['reset_word_dict']:
118
- for word_sent in self.item2txt.values():
119
- word_set += [x for x in word_sent.split(' ') if x != '']
120
- word_set = Counter(word_set)
121
- total_words = sum(word_set.values())
122
- word_set = word_set.most_common(hparams['word_size'])
123
- num_unk_words = total_words - sum([x[1] for x in word_set])
124
- word_set = [x[0] for x in word_set]
125
- json.dump(word_set, open(fn, 'w'))
126
- print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
127
- f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
128
- else:
129
- word_set = json.load(open(fn, 'r'))
130
- print("| Load word set. Size: ", len(word_set), word_set[:10])
131
- return TokenTextEncoder(None, vocab_list=word_set, replace_oov='<UNK>')
132
-
133
- def meta_data(self, prefix):
134
- if prefix == 'valid':
135
- item_names = self.valid_item_names
136
- elif prefix == 'test':
137
- item_names = self.test_item_names
138
- else:
139
- item_names = self.train_item_names
140
- for item_name in item_names:
141
- ph = self.item2ph[item_name]
142
- txt = self.item2txt[item_name]
143
- tg_fn = self.item2tgfn.get(item_name)
144
- wav_fn = self.item2wavfn[item_name]
145
- spk_id = self.item_name2spk_id(item_name)
146
- emotion = self.item_name2emo_id(item_name)
147
- yield item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion
148
-
149
- def process(self):
150
- self.load_meta_data()
151
- os.makedirs(hparams['binary_data_dir'], exist_ok=True)
152
- self.spk_map = self.build_spk_map()
153
- print("| spk_map: ", self.spk_map)
154
- spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
155
- json.dump(self.spk_map, open(spk_map_fn, 'w'))
156
-
157
- self.emo_map = self.build_emo_map()
158
- print("| emo_map: ", self.emo_map)
159
- emo_map_fn = f"{hparams['binary_data_dir']}/emo_map.json"
160
- json.dump(self.emo_map, open(emo_map_fn, 'w'))
161
-
162
- self.phone_encoder = self._phone_encoder()
163
- self.word_encoder = None
164
- EmotionEncoder.load_model(hparams['emotion_encoder_path'])
165
-
166
- if self.binarization_args['with_word']:
167
- self.word_encoder = self._word_encoder()
168
- self.process_data('valid')
169
- self.process_data('test')
170
- self.process_data('train')
171
-
172
- def process_data(self, prefix):
173
- data_dir = hparams['binary_data_dir']
174
- args = []
175
- builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
176
- ph_lengths = []
177
- mel_lengths = []
178
- f0s = []
179
- total_sec = 0
180
- if self.binarization_args['with_spk_embed']:
181
- voice_encoder = VoiceEncoder().cuda()
182
-
183
- meta_data = list(self.meta_data(prefix))
184
- for m in meta_data:
185
- args.append(list(m) + [(self.phone_encoder, self.word_encoder), self.binarization_args])
186
- num_workers = self.num_workers
187
- for f_id, (_, item) in enumerate(
188
- zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
189
- if item is None:
190
- continue
191
- item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
192
- if self.binarization_args['with_spk_embed'] else None
193
- processed_wav = preprocess_wav(item['wav_fn'])
194
- item['emo_embed'] = Embed_utterance(processed_wav)
195
- if not self.binarization_args['with_wav'] and 'wav' in item:
196
- del item['wav']
197
- builder.add_item(item)
198
- mel_lengths.append(item['len'])
199
- if 'ph_len' in item:
200
- ph_lengths.append(item['ph_len'])
201
- total_sec += item['sec']
202
- if item.get('f0') is not None:
203
- f0s.append(item['f0'])
204
- builder.finalize()
205
- np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
206
- if len(ph_lengths) > 0:
207
- np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
208
- if len(f0s) > 0:
209
- f0s = np.concatenate(f0s, 0)
210
- f0s = f0s[f0s != 0]
211
- np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
212
- print(f"| {prefix} total duration: {total_sec:.3f}s")
213
-
214
- @classmethod
215
- def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion, encoder, binarization_args):
216
- res = {'item_name': item_name, 'txt': txt, 'ph': ph, 'wav_fn': wav_fn, 'spk_id': spk_id, 'emotion': emotion}
217
- if binarization_args['with_linear']:
218
- wav, mel, linear_stft = get_vocoder_cls(hparams).wav2spec(wav_fn) # , return_linear=True
219
- res['linear'] = linear_stft
220
- else:
221
- wav, mel = get_vocoder_cls(hparams).wav2spec(wav_fn)
222
- wav = wav.astype(np.float16)
223
- res.update({'mel': mel, 'wav': wav,
224
- 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
225
- try:
226
- if binarization_args['with_f0']:
227
- cls.get_pitch(res)
228
- if binarization_args['with_f0cwt']:
229
- cls.get_f0cwt(res)
230
- if binarization_args['with_txt']:
231
- ph_encoder, word_encoder = encoder
232
- try:
233
- res['phone'] = ph_encoder.encode(ph)
234
- res['ph_len'] = len(res['phone'])
235
- except:
236
- traceback.print_exc()
237
- raise BinarizationError(f"Empty phoneme")
238
- if binarization_args['with_align']:
239
- cls.get_align(tg_fn, res)
240
- if binarization_args['trim_eos_bos']:
241
- bos_dur = res['dur'][0]
242
- eos_dur = res['dur'][-1]
243
- res['mel'] = mel[bos_dur:-eos_dur]
244
- res['f0'] = res['f0'][bos_dur:-eos_dur]
245
- res['pitch'] = res['pitch'][bos_dur:-eos_dur]
246
- res['mel2ph'] = res['mel2ph'][bos_dur:-eos_dur]
247
- res['wav'] = wav[bos_dur * hparams['hop_size']:-eos_dur * hparams['hop_size']]
248
- res['dur'] = res['dur'][1:-1]
249
- res['len'] = res['mel'].shape[0]
250
- if binarization_args['with_word']:
251
- cls.get_word(res, word_encoder)
252
- except BinarizationError as e:
253
- print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
254
- return None
255
- except Exception as e:
256
- traceback.print_exc()
257
- print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
258
- return None
259
- return res
260
-
261
- @staticmethod
262
- def get_align(tg_fn, res):
263
- ph = res['ph']
264
- mel = res['mel']
265
- phone_encoded = res['phone']
266
- if tg_fn is not None and os.path.exists(tg_fn):
267
- mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
268
- else:
269
- raise BinarizationError(f"Align not found")
270
- if mel2ph.max() - 1 >= len(phone_encoded):
271
- raise BinarizationError(
272
- f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
273
- res['mel2ph'] = mel2ph
274
- res['dur'] = dur
275
-
276
- @staticmethod
277
- def get_pitch(res):
278
- wav, mel = res['wav'], res['mel']
279
- f0, pitch_coarse = get_pitch(wav, mel, hparams)
280
- if sum(f0) == 0:
281
- raise BinarizationError("Empty f0")
282
- res['f0'] = f0
283
- res['pitch'] = pitch_coarse
284
-
285
- @staticmethod
286
- def get_f0cwt(res):
287
- from utils.cwt import get_cont_lf0, get_lf0_cwt
288
- f0 = res['f0']
289
- uv, cont_lf0_lpf = get_cont_lf0(f0)
290
- logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
291
- cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
292
- Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
293
- if np.any(np.isnan(Wavelet_lf0)):
294
- raise BinarizationError("NaN CWT")
295
- res['cwt_spec'] = Wavelet_lf0
296
- res['cwt_scales'] = scales
297
- res['f0_mean'] = logf0s_mean_org
298
- res['f0_std'] = logf0s_std_org
299
-
300
- @staticmethod
301
- def get_word(res, word_encoder):
302
- ph_split = res['ph'].split(" ")
303
- # ph side mapping to word
304
- ph_words = [] # ['<BOS>', 'N_AW1_', ',', 'AE1_Z_|', 'AO1_L_|', 'B_UH1_K_S_|', 'N_AA1_T_|', ....]
305
- ph2word = np.zeros([len(ph_split)], dtype=int)
306
- last_ph_idx_for_word = [] # [2, 11, ...]
307
- for i, ph in enumerate(ph_split):
308
- if ph == '|':
309
- last_ph_idx_for_word.append(i)
310
- elif not ph[0].isalnum():
311
- if ph not in ['<BOS>']:
312
- last_ph_idx_for_word.append(i - 1)
313
- last_ph_idx_for_word.append(i)
314
- start_ph_idx_for_word = [0] + [i + 1 for i in last_ph_idx_for_word[:-1]]
315
- for i, (s_w, e_w) in enumerate(zip(start_ph_idx_for_word, last_ph_idx_for_word)):
316
- ph_words.append(ph_split[s_w:e_w + 1])
317
- ph2word[s_w:e_w + 1] = i
318
- ph2word = ph2word.tolist()
319
- ph_words = ["_".join(w) for w in ph_words]
320
-
321
- # mel side mapping to word
322
- mel2word = []
323
- dur_word = [0 for _ in range(len(ph_words))]
324
- for i, m2p in enumerate(res['mel2ph']):
325
- word_idx = ph2word[m2p - 1]
326
- mel2word.append(ph2word[m2p - 1])
327
- dur_word[word_idx] += 1
328
- ph2word = [x + 1 for x in ph2word] # 0预留给padding
329
- mel2word = [x + 1 for x in mel2word] # 0预留给padding
330
- res['ph_words'] = ph_words # [T_word]
331
- res['ph2word'] = ph2word # [T_ph]
332
- res['mel2word'] = mel2word # [T_mel]
333
- res['dur_word'] = dur_word # [T_word]
334
- words = [x for x in res['txt'].split(" ") if x != '']
335
- while len(words) > 0 and is_sil_phoneme(words[0]):
336
- words = words[1:]
337
- while len(words) > 0 and is_sil_phoneme(words[-1]):
338
- words = words[:-1]
339
- words = ['<BOS>'] + words + ['<EOS>']
340
- word_tokens = word_encoder.encode(" ".join(words))
341
- res['words'] = words
342
- res['word_tokens'] = word_tokens
343
- assert len(words) == len(ph_words), [words, ph_words]
344
-
345
- @property
346
- def num_workers(self):
347
- return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
348
-
349
-
350
- if __name__ == "__main__":
351
- set_hparams()
352
- EmotionBinarizer().process()
 
NeuralSeq/data_gen/tts/base_preprocess.py DELETED
@@ -1,254 +0,0 @@
1
- import json
2
- import os
3
- import random
4
- import re
5
- import traceback
6
- from collections import Counter
7
- from functools import partial
8
- import pandas as pd
9
- import librosa
10
- from tqdm import tqdm
11
- from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
12
- from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
13
- from utils.hparams import hparams
14
- from utils.multiprocess_utils import multiprocess_run_tqdm
15
- from utils.os_utils import link_file, move_file, remove_file
16
- from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
17
-
18
-
19
- class BasePreprocessor:
20
- def __init__(self):
21
- self.preprocess_args = hparams['preprocess_args']
22
- txt_processor = self.preprocess_args['txt_processor']
23
- self.txt_processor = get_txt_processor_cls(txt_processor)
24
- self.raw_data_dir = hparams['raw_data_dir']
25
- self.processed_dir = hparams['processed_data_dir']
26
- self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
27
-
28
- def meta_data(self):
29
- """
30
- :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
31
- """
32
- raise NotImplementedError
33
-
34
- def process(self):
35
- processed_dir = self.processed_dir
36
- wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
37
- remove_file(wav_processed_tmp_dir)
38
- os.makedirs(wav_processed_tmp_dir, exist_ok=True)
39
- wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
40
- remove_file(wav_processed_dir)
41
- os.makedirs(wav_processed_dir, exist_ok=True)
42
-
43
- meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
44
- item_names = [d['item_name'] for d in meta_data]
45
- assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
46
-
47
- # preprocess data
48
- phone_list = []
49
- word_list = []
50
- spk_names = set()
51
- process_item = partial(self.preprocess_first_pass,
52
- txt_processor=self.txt_processor,
53
- wav_processed_dir=wav_processed_dir,
54
- wav_processed_tmp=wav_processed_tmp_dir,
55
- preprocess_args=self.preprocess_args)
56
- items = []
57
- args = [{
58
- 'item_name': item_raw['item_name'],
59
- 'txt_raw': item_raw['txt'],
60
- 'wav_fn': item_raw['wav_fn'],
61
- 'txt_loader': item_raw.get('txt_loader'),
62
- 'others': item_raw.get('others', None)
63
- } for item_raw in meta_data]
64
- for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
65
- if item is not None:
66
- item_.update(item)
67
- item = item_
68
- if 'txt_loader' in item:
69
- del item['txt_loader']
70
- item['id'] = item_id
71
- item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
72
- item['others'] = item.get('others', None)
73
- phone_list += item['ph'].split(" ")
74
- word_list += item['word'].split(" ")
75
- spk_names.add(item['spk_name'])
76
- items.append(item)
77
-
78
- # add encoded tokens
79
- ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
80
- spk_map = self.build_spk_map(spk_names)
81
- args = [{
82
- 'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
83
- 'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
84
- } for item in items]
85
- for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
86
- items[idx].update(item_new_kv)
87
-
88
- # build mfa data
89
- if self.preprocess_args['use_mfa']:
90
- mfa_dict = set()
91
- mfa_input_dir = f'{processed_dir}/mfa_inputs'
92
- remove_file(mfa_input_dir)
93
- # group MFA inputs for better parallelism
94
- mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
95
- if self.preprocess_args['mfa_group_shuffle']:
96
- random.seed(hparams['seed'])
97
- random.shuffle(mfa_groups)
98
- args = [{
99
- 'item': item, 'mfa_input_dir': mfa_input_dir,
100
- 'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
101
- 'preprocess_args': self.preprocess_args
102
- } for item, mfa_group in zip(items, mfa_groups)]
103
- for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
104
- self.build_mfa_inputs, args, desc='Build MFA data'):
105
- items[i]['wav_align_fn'] = new_wav_align_fn
106
- for w in ph_gb_word_nosil.split(" "):
107
- mfa_dict.add(f"{w} {w.replace('_', ' ')}")
108
- mfa_dict = sorted(mfa_dict)
109
- with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
110
- f.writelines([f'{l}\n' for l in mfa_dict])
111
- with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
112
- f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
113
- remove_file(wav_processed_tmp_dir)
114
-
115
-
116
- @classmethod
117
- def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
118
- wav_fn, wav_processed_dir, wav_processed_tmp,
119
- preprocess_args, txt_loader=None, others=None):
120
- try:
121
- if txt_loader is not None:
122
- txt_raw = txt_loader(txt_raw)
123
- ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
124
- wav_fn, wav_align_fn = cls.process_wav(
125
- item_name, wav_fn,
126
- hparams['processed_data_dir'],
127
- wav_processed_tmp, preprocess_args)
128
-
129
- # wav for binarization
130
- ext = os.path.splitext(wav_fn)[1]
131
- os.makedirs(wav_processed_dir, exist_ok=True)
132
- new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
133
- move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
134
- move_link_func(wav_fn, new_wav_fn)
135
- return {
136
- 'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
137
- 'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
- 'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
- 'others': others
- }
- except:
- traceback.print_exc()
- print(f"| Error is caught. item_name: {item_name}.")
- return None
-
- @staticmethod
- def txt_to_ph(txt_processor, txt_raw, preprocess_args):
- txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
- ph = [p for w in txt_struct for p in w[1]]
- ph_gb_word = ["_".join(w[1]) for w in txt_struct]
- words = [w[0] for w in txt_struct]
- # word_id=0 is reserved for padding
- ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
- return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)
-
- @staticmethod
- def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
- processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
- processors = [k() for k in processors if k is not None]
- if len(processors) >= 1:
- sr_file = librosa.core.get_samplerate(wav_fn)
- output_fn_for_align = None
- ext = os.path.splitext(wav_fn)[1]
- input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
- link_file(wav_fn, input_fn)
- for p in processors:
- outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
- if len(outputs) == 3:
- input_fn, sr, output_fn_for_align = outputs
- else:
- input_fn, sr = outputs
- if output_fn_for_align is None:
- return input_fn, input_fn
- else:
- return input_fn, output_fn_for_align
- else:
- return wav_fn, wav_fn
-
- def _phone_encoder(self, ph_set):
- ph_set_fn = f"{self.processed_dir}/phone_set.json"
- if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
- ph_set = sorted(set(ph_set))
- json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
- print("| Build phone set: ", ph_set)
- else:
- ph_set = json.load(open(ph_set_fn, 'r'))
- print("| Load phone set: ", ph_set)
- return build_token_encoder(ph_set_fn)
-
- def _word_encoder(self, word_set):
- word_set_fn = f"{self.processed_dir}/word_set.json"
- if self.preprocess_args['reset_word_dict']:
- word_set = Counter(word_set)
- total_words = sum(word_set.values())
- word_set = word_set.most_common(hparams['word_dict_size'])
- num_unk_words = total_words - sum([x[1] for x in word_set])
- word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
- word_set = sorted(set(word_set))
- json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
- print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
- f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
- else:
- word_set = json.load(open(word_set_fn, 'r'))
- print("| Load word set. Size: ", len(word_set), word_set[:10])
- return build_token_encoder(word_set_fn)
-
- @classmethod
- def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
- word_token = word_encoder.encode(word)
- ph_token = ph_encoder.encode(ph)
- spk_id = spk_map[spk_name]
- return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
-
- def build_spk_map(self, spk_names):
- spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
- assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
- print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
- json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
- return spk_map
-
- @classmethod
- def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
- item_name = item['item_name']
- wav_align_fn = item['wav_align_fn']
- ph_gb_word = item['ph_gb_word']
- ext = os.path.splitext(wav_align_fn)[1]
- mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
- os.makedirs(mfa_input_group_dir, exist_ok=True)
- new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
- move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
- move_link_func(wav_align_fn, new_wav_align_fn)
- ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
- for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
- with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
- f_txt.write(ph_gb_word_nosil)
- return ph_gb_word_nosil, new_wav_align_fn
-
- def load_spk_map(self, base_dir):
- spk_map_fn = f"{base_dir}/spk_map.json"
- spk_map = json.load(open(spk_map_fn, 'r'))
- return spk_map
-
- def load_dict(self, base_dir):
- ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
- word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
- return ph_encoder, word_encoder
-
- @property
- def meta_csv_filename(self):
- return 'metadata'
-
- @property
- def wav_processed_dirname(self):
- return 'wav_processed'
 
NeuralSeq/data_gen/tts/binarizer_zh.py DELETED
@@ -1,59 +0,0 @@
- import os
-
- os.environ["OMP_NUM_THREADS"] = "1"
-
- from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
- from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
- from data_gen.tts.data_gen_utils import get_mel2ph
- from utils.hparams import set_hparams, hparams
- import numpy as np
-
-
- class ZhBinarizer(BaseBinarizer):
- @staticmethod
- def get_align(tg_fn, ph, mel, phone_encoded, res):
- if tg_fn is not None and os.path.exists(tg_fn):
- _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
- else:
- raise BinarizationError(f"Align not found")
- ph_list = ph.split(" ")
- assert len(dur) == len(ph_list)
- mel2ph = []
- # assign the duration of separator phonemes to the preceding final (yunmu)
- dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
- for i in range(len(dur)):
- p = ph_list[i]
- if p[0] != '<' and not p[0].isalpha():
- uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0
- j = 0
- while j < len(uv_) and not uv_[j]:
- j += 1
- dur[i - 1] += j
- dur[i] -= j
- if dur[i] < 100:
- dur[i - 1] += dur[i]
- dur[i] = 0
- # make the initial (shengmu) and the final (yunmu) equally long
- for i in range(len(dur)):
- p = ph_list[i]
- if p in ALL_SHENMU:
- p_next = ph_list[i + 1]
- if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU):
- print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
- f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.")
- continue
- total = dur[i + 1] + dur[i]
- dur[i] = total // 2
- dur[i + 1] = total - dur[i]
- for i in range(len(dur)):
- mel2ph += [i + 1] * dur[i]
- mel2ph = np.array(mel2ph)
- if mel2ph.max() - 1 >= len(phone_encoded):
- raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
- res['mel2ph'] = mel2ph
- res['dur'] = dur
-
-
- if __name__ == "__main__":
- set_hparams()
- ZhBinarizer().process()
 
NeuralSeq/data_gen/tts/data_gen_utils.py DELETED
@@ -1,357 +0,0 @@
- import warnings
-
- warnings.filterwarnings("ignore")
-
- import parselmouth
- import os
- import torch
- from skimage.transform import resize
- from utils.text_encoder import TokenTextEncoder
- from utils.pitch_utils import f0_to_coarse
- import struct
- import webrtcvad
- from scipy.ndimage.morphology import binary_dilation
- import librosa
- import numpy as np
- from utils import audio
- import pyloudnorm as pyln
- import re
- import json
- from collections import OrderedDict
-
- PUNCS = '!,.?;:'
-
- int16_max = (2 ** 15) - 1
-
-
- def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
- """
- Ensures that segments without voice in the waveform remain no longer than a
- threshold determined by the VAD parameters in params.py.
- :param wav: the raw waveform as a numpy array of floats
- :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
- :return: the same waveform with silences trimmed away (length <= original wav length)
- """
-
- ## Voice Activity Detection
- # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
- # This sets the granularity of the VAD. Should not need to be changed.
- sampling_rate = 16000
- wav_raw, sr = librosa.core.load(path, sr=sr)
-
- if norm:
- meter = pyln.Meter(sr) # create BS.1770 meter
- loudness = meter.integrated_loudness(wav_raw)
- wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
- if np.abs(wav_raw).max() > 1.0:
- wav_raw = wav_raw / np.abs(wav_raw).max()
-
- wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
-
- vad_window_length = 30 # In milliseconds
- # Number of frames to average together when performing the moving average smoothing.
- # The larger this value, the larger the VAD variations must be to not get smoothed out.
- vad_moving_average_width = 8
-
- # Compute the voice detection window size
- samples_per_window = (vad_window_length * sampling_rate) // 1000
-
- # Trim the end of the audio to have a multiple of the window size
- wav = wav[:len(wav) - (len(wav) % samples_per_window)]
-
- # Convert the float waveform to 16-bit mono PCM
- pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
-
- # Perform voice activity detection
- voice_flags = []
- vad = webrtcvad.Vad(mode=3)
- for window_start in range(0, len(wav), samples_per_window):
- window_end = window_start + samples_per_window
- voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
- sample_rate=sampling_rate))
- voice_flags = np.array(voice_flags)
-
- # Smooth the voice detection with a moving average
- def moving_average(array, width):
- array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
- ret = np.cumsum(array_padded, dtype=float)
- ret[width:] = ret[width:] - ret[:-width]
- return ret[width - 1:] / width
-
- audio_mask = moving_average(voice_flags, vad_moving_average_width)
- audio_mask = np.round(audio_mask).astype(np.bool)
-
- # Dilate the voiced regions
- audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
- audio_mask = np.repeat(audio_mask, samples_per_window)
- audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
- if return_raw_wav:
- return wav_raw, audio_mask, sr
- return wav_raw[audio_mask], audio_mask, sr
-
-
- def process_utterance(wav_path,
- fft_size=1024,
- hop_size=256,
- win_length=1024,
- window="hann",
- num_mels=80,
- fmin=80,
- fmax=7600,
- eps=1e-6,
- sample_rate=22050,
- loud_norm=False,
- min_level_db=-100,
- return_linear=False,
- trim_long_sil=False, vocoder='pwg'):
- if isinstance(wav_path, str):
- if trim_long_sil:
- wav, _, _ = trim_long_silences(wav_path, sample_rate)
- else:
- wav, _ = librosa.core.load(wav_path, sr=sample_rate)
- else:
- wav = wav_path
-
- if loud_norm:
- meter = pyln.Meter(sample_rate) # create BS.1770 meter
- loudness = meter.integrated_loudness(wav)
- wav = pyln.normalize.loudness(wav, loudness, -22.0)
- if np.abs(wav).max() > 1:
- wav = wav / np.abs(wav).max()
-
- # get amplitude spectrogram
- x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
- win_length=win_length, window=window, pad_mode="constant")
- spc = np.abs(x_stft) # (n_bins, T)
-
- # get mel basis
- fmin = 0 if fmin == -1 else fmin
- fmax = sample_rate / 2 if fmax == -1 else fmax
- mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
- mel = mel_basis @ spc
-
- if vocoder == 'pwg':
- mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
- else:
- assert False, f'"{vocoder}" is not in ["pwg"].'
-
- l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
- wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
- wav = wav[:mel.shape[1] * hop_size]
-
- if not return_linear:
- return wav, mel
- else:
- spc = audio.amp_to_db(spc)
- spc = audio.normalize(spc, {'min_level_db': min_level_db})
- return wav, mel, spc
-
-
- def get_pitch(wav_data, mel, hparams):
- """
-
- :param wav_data: [T]
- :param mel: [T, 80]
- :param hparams:
- :return:
- """
- time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
- f0_min = 80
- f0_max = 750
-
- if hparams['hop_size'] == 128:
- pad_size = 4
- elif hparams['hop_size'] == 256:
- pad_size = 2
- else:
- assert False
-
- f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
- time_step=time_step / 1000, voicing_threshold=0.6,
- pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
- lpad = pad_size * 2
- rpad = len(mel) - len(f0) - lpad
- f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
- # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
- # Attention: we find that new version of some libraries could cause ``rpad'' to be a negative value...
- # Just to be sure, we recommend users to set up the same environments as them in requirements_auto.txt (by Anaconda)
- delta_l = len(mel) - len(f0)
- assert np.abs(delta_l) <= 8
- if delta_l > 0:
- f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
- f0 = f0[:len(mel)]
- pitch_coarse = f0_to_coarse(f0)
- return f0, pitch_coarse
-
-
- def remove_empty_lines(text):
- """remove empty lines"""
- assert (len(text) > 0)
- assert (isinstance(text, list))
- text = [t.strip() for t in text]
- if "" in text:
- text.remove("")
- return text
-
-
- class TextGrid(object):
- def __init__(self, text):
- text = remove_empty_lines(text)
- self.text = text
- self.line_count = 0
- self._get_type()
- self._get_time_intval()
- self._get_size()
- self.tier_list = []
- self._get_item_list()
-
- def _extract_pattern(self, pattern, inc):
- """
- Parameters
- ----------
- pattern : regex to extract pattern
- inc : increment of line count after extraction
- Returns
- -------
- group : extracted info
- """
- try:
- group = re.match(pattern, self.text[self.line_count]).group(1)
- self.line_count += inc
- except AttributeError:
- raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
- return group
-
- def _get_type(self):
- self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
-
- def _get_time_intval(self):
- self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
- self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
-
- def _get_size(self):
- self.size = int(self._extract_pattern(r"size = (.*)", 2))
-
- def _get_item_list(self):
- """Only supports IntervalTier currently"""
- for itemIdx in range(1, self.size + 1):
- tier = OrderedDict()
- item_list = []
- tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
- tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
- if tier_class != "IntervalTier":
- raise NotImplementedError("Only IntervalTier class is supported currently")
- tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
- tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
- tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
- tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
- for i in range(int(tier_size)):
- item = OrderedDict()
- item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
- item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
- item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
- item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
- item_list.append(item)
- tier["idx"] = tier_idx
- tier["class"] = tier_class
- tier["name"] = tier_name
- tier["xmin"] = tier_xmin
- tier["xmax"] = tier_xmax
- tier["size"] = tier_size
- tier["items"] = item_list
- self.tier_list.append(tier)
-
- def toJson(self):
- _json = OrderedDict()
- _json["file_type"] = self.file_type
- _json["xmin"] = self.xmin
- _json["xmax"] = self.xmax
- _json["size"] = self.size
- _json["tiers"] = self.tier_list
- return json.dumps(_json, ensure_ascii=False, indent=2)
-
-
- def get_mel2ph(tg_fn, ph, mel, hparams):
- ph_list = ph.split(" ")
- with open(tg_fn, "r") as f:
- tg = f.readlines()
- tg = remove_empty_lines(tg)
- tg = TextGrid(tg)
- tg = json.loads(tg.toJson())
- split = np.ones(len(ph_list) + 1, np.float) * -1
- tg_idx = 0
- ph_idx = 0
- tg_align = [x for x in tg['tiers'][-1]['items']]
- tg_align_ = []
- for x in tg_align:
- x['xmin'] = float(x['xmin'])
- x['xmax'] = float(x['xmax'])
- if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
- x['text'] = ''
- if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
- tg_align_[-1]['xmax'] = x['xmax']
- continue
- tg_align_.append(x)
- tg_align = tg_align_
- tg_len = len([x for x in tg_align if x['text'] != ''])
- ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
- assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
- while tg_idx < len(tg_align) or ph_idx < len(ph_list):
- if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
- split[ph_idx] = 1e8
- ph_idx += 1
- continue
- x = tg_align[tg_idx]
- if x['text'] == '' and ph_idx == len(ph_list):
- tg_idx += 1
- continue
- assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
- ph = ph_list[ph_idx]
- if x['text'] == '' and not is_sil_phoneme(ph):
- assert False, (ph_list, tg_align)
- if x['text'] != '' and is_sil_phoneme(ph):
- ph_idx += 1
- else:
- assert (x['text'] == '' and is_sil_phoneme(ph)) \
- or x['text'].lower() == ph.lower() \
- or x['text'].lower() == 'sil', (x['text'], ph)
- split[ph_idx] = x['xmin']
- if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
- split[ph_idx - 1] = split[ph_idx]
- ph_idx += 1
- tg_idx += 1
- assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
- assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
- mel2ph = np.zeros([mel.shape[0]], np.int)
- split[0] = 0
- split[-1] = 1e8
- for i in range(len(split) - 1):
- assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
- split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
- for ph_idx in range(len(ph_list)):
- mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
- mel2ph_torch = torch.from_numpy(mel2ph)
- T_t = len(ph_list)
- dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
- dur = dur[1:].numpy()
- return mel2ph, dur
-
-
- def build_phone_encoder(data_dir):
- phone_list_file = os.path.join(data_dir, 'phone_set.json')
- phone_list = json.load(open(phone_list_file))
- return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
-
-
- def build_word_encoder(data_dir):
- word_list_file = os.path.join(data_dir, 'word_set.json')
- word_list = json.load(open(word_list_file))
- return TokenTextEncoder(None, vocab_list=word_list, replace_oov=',')
-
- def is_sil_phoneme(p):
- return not p[0].isalpha()
-
-
- def build_token_encoder(token_list_file):
- token_list = json.load(open(token_list_file))
- return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')
 
NeuralSeq/data_gen/tts/emotion/__pycache__/audio.cpython-38.pyc DELETED
Binary file (3.8 kB)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/inference.cpython-38.pyc DELETED
Binary file (7.28 kB)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/model.cpython-38.pyc DELETED
Binary file (2.53 kB)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/params_data.cpython-38.pyc DELETED
Binary file (491 Bytes)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/params_model.cpython-38.pyc DELETED
Binary file (371 Bytes)
 
NeuralSeq/data_gen/tts/emotion/audio.py DELETED
@@ -1,107 +0,0 @@
- from scipy.ndimage.morphology import binary_dilation
- from data_gen.tts.emotion.params_data import *
- from pathlib import Path
- from typing import Optional, Union
- import numpy as np
- import webrtcvad
- import librosa
- import struct
-
- int16_max = (2 ** 15) - 1
-
-
- def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
- source_sr: Optional[int] = None):
- """
- Applies the preprocessing operations used in training the Speaker Encoder to a waveform
- either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
-
- :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
- just .wav), either the waveform as a numpy array of floats.
- :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
- preprocessing. After preprocessing, the waveform's sampling rate will match the data
- hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
- this argument will be ignored.
- """
- # Load the wav from disk if needed
- if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
- wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
- else:
- wav = fpath_or_wav
-
- # Resample the wav if needed
- if source_sr is not None and source_sr != sampling_rate:
- wav = librosa.resample(wav, source_sr, sampling_rate)
-
- # Apply the preprocessing: normalize volume and shorten long silences
- wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
- wav = trim_long_silences(wav)
-
- return wav
-
-
- def wav_to_mel_spectrogram(wav):
- """
- Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
- Note: this is not a log-mel spectrogram.
- """
- frames = librosa.feature.melspectrogram(
- wav,
- sampling_rate,
- n_fft=int(sampling_rate * mel_window_length / 1000),
- hop_length=int(sampling_rate * mel_window_step / 1000),
- n_mels=mel_n_channels
- )
- return frames.astype(np.float32).T
-
-
- def trim_long_silences(wav):
- """
- Ensures that segments without voice in the waveform remain no longer than a
- threshold determined by the VAD parameters in params.py.
-
- :param wav: the raw waveform as a numpy array of floats
- :return: the same waveform with silences trimmed away (length <= original wav length)
- """
- # Compute the voice detection window size
- samples_per_window = (vad_window_length * sampling_rate) // 1000
-
- # Trim the end of the audio to have a multiple of the window size
- wav = wav[:len(wav) - (len(wav) % samples_per_window)]
-
- # Convert the float waveform to 16-bit mono PCM
- pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
-
- # Perform voice activity detection
- voice_flags = []
- vad = webrtcvad.Vad(mode=3)
- for window_start in range(0, len(wav), samples_per_window):
- window_end = window_start + samples_per_window
- voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
- sample_rate=sampling_rate))
- voice_flags = np.array(voice_flags)
-
- # Smooth the voice detection with a moving average
- def moving_average(array, width):
- array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
- ret = np.cumsum(array_padded, dtype=float)
- ret[width:] = ret[width:] - ret[:-width]
- return ret[width - 1:] / width
-
- audio_mask = moving_average(voice_flags, vad_moving_average_width)
- audio_mask = np.round(audio_mask).astype(np.bool)
-
- # Dilate the voiced regions
- audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
- audio_mask = np.repeat(audio_mask, samples_per_window)
-
- return wav[audio_mask == True]
-
-
- def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
- if increase_only and decrease_only:
- raise ValueError("Both increase only and decrease only are set")
- dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
- if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
- return wav
- return wav * (10 ** (dBFS_change / 20))
 
NeuralSeq/data_gen/tts/emotion/inference.py DELETED
@@ -1,177 +0,0 @@
- from data_gen.tts.emotion.params_data import *
- from data_gen.tts.emotion.model import EmotionEncoder
- from data_gen.tts.emotion.audio import preprocess_wav # We want to expose this function from here
- from matplotlib import cm
- from data_gen.tts.emotion import audio
- from pathlib import Path
- import matplotlib.pyplot as plt
- import numpy as np
- import torch
-
- _model = None # type: EmotionEncoder
- _device = None # type: torch.device
-
-
- def load_model(weights_fpath: Path, device=None):
- """
- Loads the model in memory. If this function is not explicitly called, it will be run on the
- first call to embed_frames() with the default weights file.
-
- :param weights_fpath: the path to saved model weights.
- :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
- model will be loaded and will run on this device. Outputs will however always be on the cpu.
- If None, will default to your GPU if it's available, otherwise your CPU.
- """
- # TODO: I think the slow loading of the encoder might have something to do with the device it
- # was saved on. Worth investigating.
- global _model, _device
- if device is None:
- _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- elif isinstance(device, str):
- _device = torch.device(device)
- _model = EmotionEncoder(_device, torch.device("cpu"))
- checkpoint = torch.load(weights_fpath)
- _model.load_state_dict(checkpoint["model_state"])
- _model.eval()
- print("Loaded encoder trained to step %d" % (checkpoint["step"]))
-
-
- def is_loaded():
- return _model is not None
-
-
- def embed_frames_batch(frames_batch):
- """
- Computes embeddings for a batch of mel spectrograms.
-
- :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
- (batch_size, n_frames, n_channels)
- :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
- """
- if _model is None:
- raise Exception("Model was not loaded. Call load_model() before inference.")
-
- frames = torch.from_numpy(frames_batch).to(_device)
- embed = _model.inference(frames).detach().cpu().numpy()
- return embed
-
-
- def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
- min_pad_coverage=0.75, overlap=0.5):
- """
- Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
- partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
- spectrogram slices are returned, so as to make each partial utterance waveform correspond to
- its spectrogram. This function assumes that the mel spectrogram parameters used are those
- defined in params_data.py.
-
- The returned ranges may be indexing further than the length of the waveform. It is
- recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
-
- :param n_samples: the number of samples in the waveform
- :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
- utterance
- :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
- enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
- then the last partial utterance will be considered, as if we padded the audio. Otherwise,
- it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
- utterance, this parameter is ignored so that the function always returns at least 1 slice.
- :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
- utterances are entirely disjoint.
- :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
- respectively the waveform and the mel spectrogram with these slices to obtain the partial
- utterances.
- """
- assert 0 <= overlap < 1
- assert 0 < min_pad_coverage <= 1
-
- samples_per_frame = int((sampling_rate * mel_window_step / 1000))
- n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
- frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
-
- # Compute the slices
- wav_slices, mel_slices = [], []
- steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
- for i in range(0, steps, frame_step):
- mel_range = np.array([i, i + partial_utterance_n_frames])
- wav_range = mel_range * samples_per_frame
- mel_slices.append(slice(*mel_range))
- wav_slices.append(slice(*wav_range))
-
- # Evaluate whether extra padding is warranted or not
- last_wav_range = wav_slices[-1]
- coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
- if coverage < min_pad_coverage and len(mel_slices) > 1:
- mel_slices = mel_slices[:-1]
- wav_slices = wav_slices[:-1]
-
- return wav_slices, mel_slices
-
-
- def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
- """
- Computes an embedding for a single utterance.
-
- # TODO: handle multiple wavs to benefit from batching on GPU
- :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
- :param using_partials: if True, then the utterance is split in partial utterances of
- <partial_utterance_n_frames> frames and the utterance embedding is computed from their
- normalized average. If False, the utterance is instead computed from feeding the entire
- spectrogram to the network.
- :param return_partials: if True, the partial embeddings will also be returned along with the
- wav slices that correspond to the partial embeddings.
- :param kwargs: additional arguments to compute_partial_slices()
- :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
- <return_partials> is True, the partial utterances as a numpy array of float32 of shape
- (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
- returned. If <using_partials> is simultaneously set to False, both these values will be None
- instead.
- """
- # Process the entire utterance if not using partials
- if not using_partials:
- frames = audio.wav_to_mel_spectrogram(wav)
- embed = embed_frames_batch(frames[None, ...])[0]
- if return_partials:
- return embed, None, None
- return embed
-
- # Compute where to split the utterance into partials and pad if necessary
- wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
- max_wave_length = wave_slices[-1].stop
- if max_wave_length >= len(wav):
- wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
-
- # Split the utterance into partials
- frames = audio.wav_to_mel_spectrogram(wav)
- frames_batch = np.array([frames[s] for s in mel_slices])
- partial_embeds = embed_frames_batch(frames_batch)
-
- # Compute the utterance embedding from the partial embeddings
- raw_embed = np.mean(partial_embeds, axis=0)
- embed = raw_embed / np.linalg.norm(raw_embed, 2)
-
- if return_partials:
- return embed, partial_embeds, wave_slices
- return embed
-
-
- def embed_speaker(wavs, **kwargs):
- raise NotImplemented()
-
-
- def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
- if ax is None:
- ax = plt.gca()
-
- if shape is None:
- height = int(np.sqrt(len(embed)))
- shape = (height, -1)
- embed = embed.reshape(shape)
-
- cmap = cm.get_cmap()
- mappable = ax.imshow(embed, cmap=cmap)
- cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
- cbar.set_clim(*color_range)
-
- ax.set_xticks([]), ax.set_yticks([])
- ax.set_title(title)
 
NeuralSeq/data_gen/tts/emotion/model.py DELETED
@@ -1,78 +0,0 @@
-
- from data_gen.tts.emotion.params_model import *
- from data_gen.tts.emotion.params_data import *
- from torch.nn.utils import clip_grad_norm_
- from scipy.optimize import brentq
- from torch import nn
- import numpy as np
- import torch
-
-
- class EmotionEncoder(nn.Module):
- def __init__(self, device, loss_device):
- super().__init__()
- self.loss_device = loss_device
-
- # Network definition
- self.lstm = nn.LSTM(input_size=mel_n_channels,
- hidden_size=model_hidden_size,
- num_layers=model_num_layers,
- batch_first=True).to(device)
- self.linear = nn.Linear(in_features=model_hidden_size,
- out_features=model_embedding_size).to(device)
- self.relu = torch.nn.ReLU().to(device)
-
-
- # Cosine similarity scaling (with fixed initial parameter values)
- self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
- self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
-
- # Loss
- self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
-
- def do_gradient_ops(self):
- # Gradient scale
- self.similarity_weight.grad *= 0.01
- self.similarity_bias.grad *= 0.01
-
- # Gradient clipping
- clip_grad_norm_(self.parameters(), 3, norm_type=2)
-
- def forward(self, utterances, hidden_init=None):
- """
- Computes the embeddings of a batch of utterance spectrograms.
-
- :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
- (batch_size, n_frames, n_channels)
- :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
- batch_size, hidden_size). Will default to a tensor of zeros if None.
- :return: the embeddings as a tensor of shape (batch_size, embedding_size)
- """
- # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
- # and the final cell state.
- out, (hidden, cell) = self.lstm(utterances, hidden_init)
-
- # We take only the hidden state of the last layer
- embeds_raw = self.relu(self.linear(hidden[-1]))
-
- # L2-normalize it
- embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
-
- return embeds
-
- def inference(self, utterances, hidden_init=None):
- """
- Computes the embeddings of a batch of utterance spectrograms.
-
- :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
- (batch_size, n_frames, n_channels)
- :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
- batch_size, hidden_size). Will default to a tensor of zeros if None.
- :return: the embeddings as a tensor of shape (batch_size, embedding_size)
- """
- # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
- # and the final cell state.
-
- out, (hidden, cell) = self.lstm(utterances, hidden_init)
-
- return hidden[-1]
 
NeuralSeq/data_gen/tts/emotion/params_data.py DELETED
@@ -1,29 +0,0 @@
-
- ## Mel-filterbank
- mel_window_length = 25 # In milliseconds
- mel_window_step = 10 # In milliseconds
- mel_n_channels = 40
-
-
- ## Audio
- sampling_rate = 16000
- # Number of spectrogram frames in a partial utterance
- partials_n_frames = 160 # 1600 ms
- # Number of spectrogram frames at inference
- inference_n_frames = 80 # 800 ms
-
-
- ## Voice Activity Detection
- # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
- # This sets the granularity of the VAD. Should not need to be changed.
- vad_window_length = 30 # In milliseconds
- # Number of frames to average together when performing the moving average smoothing.
- # The larger this value, the larger the VAD variations must be to not get smoothed out.
- vad_moving_average_width = 8
- # Maximum number of consecutive silent frames a segment can have.
- vad_max_silence_length = 6
-
-
- ## Audio volume normalization
- audio_norm_target_dBFS = -30
-
 
NeuralSeq/data_gen/tts/emotion/params_model.py DELETED
@@ -1,11 +0,0 @@
-
- ## Model parameters
- model_hidden_size = 256
- model_embedding_size = 256
- model_num_layers = 3
-
-
- ## Training parameters
- learning_rate_init = 1e-4
- speakers_per_batch = 6
- utterances_per_speaker = 20
 
NeuralSeq/data_gen/tts/emotion/test_emotion.py DELETED
@@ -1,184 +0,0 @@
- #!/usr/bin/env python3 -u
- # Copyright (c) Facebook, Inc. and its affiliates.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
- """
- Run inference for pre-processed data with a trained model.
- """
-
- import logging
- import math
- import numpy, math, pdb, sys, random
- import time, os, itertools, shutil, importlib
- import argparse
- import os
- import sys
- import glob
- from sklearn import metrics
- import soundfile as sf
- #import sentencepiece as spm
- import torch
- import inference as encoder
- import torch.nn as nn
- import torch.nn.functional as F
- from pathlib import Path
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.INFO)
- from resemblyzer import VoiceEncoder, preprocess_wav
-
-
- def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
- fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
- fnr = 1 - tpr
-
- fnr = fnr * 100
- fpr = fpr * 100
-
- tunedThreshold = [];
- if target_fr:
- for tfr in target_fr:
- idx = numpy.nanargmin(numpy.absolute((tfr - fnr)))
- tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
-
- for tfa in target_fa:
- idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1]
- tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
-
- idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
- eer = max(fpr[idxE], fnr[idxE])
-
- return (tunedThreshold, eer, fpr, fnr);
-
-
- def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
- # Maximum audio length
- max_audio = max_frames * 160 + 240
-
- # Read wav file and convert to torch tensor
- audio,sample_rate = sf.read(filename)
-
- feats_v0 = torch.from_numpy(audio).float()
- audiosize = audio.shape[0]
-
- if audiosize <= max_audio:
- shortage = math.floor((max_audio - audiosize + 1) / 2)
- audio = numpy.pad(audio, (shortage, shortage), 'constant', constant_values=0)
- audiosize = audio.shape[0]
-
- if evalmode:
- startframe = numpy.linspace(0, audiosize - max_audio, num=num_eval)
- else:
- startframe = numpy.array([numpy.int64(random.random() * (audiosize - max_audio))])
- feats = []
- if evalmode and max_frames == 0:
- feats.append(audio)
- else:
- for asf in startframe:
- feats.append(audio[int(asf):int(asf) + max_audio])
- feat = numpy.stack(feats, axis=0)
- feat = torch.FloatTensor(feat)
- return feat;
-
- def evaluateFromList(listfilename, print_interval=100, test_path='', multi=False):
-
- lines = []
- files = []
- feats = {}
- tstart = time.time()
-
- ## Read all lines
- with open(listfilename) as listfile:
- while True:
- line = listfile.readline();
- if (not line):
- break;
-
- data = line.split();
-
- ## Append random label if missing
- if len(data) == 2: data = [random.randint(0,1)] + data
-
- files.append(data[1])
- files.append(data[2])
- lines.append(line)
-
- setfiles = list(set(files))
- setfiles.sort()
- ## Save all features to file
- for idx, file in enumerate(setfiles):
- # preprocessed_wav = encoder.preprocess_wav(os.path.join(test_path,file))
- # embed = encoder.embed_utterance(preprocessed_wav)
- processed_wav = preprocess_wav(os.path.join(test_path,file))
- embed = voice_encoder.embed_utterance(processed_wav)
-
- torch.cuda.empty_cache()
- ref_feat = torch.from_numpy(embed).unsqueeze(0)
-
- feats[file] = ref_feat
-
- telapsed = time.time() - tstart
-
- if idx % print_interval == 0:
- sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d"%(idx,len(setfiles),idx/telapsed,ref_feat.size()[1]));
-
- print('')
- all_scores = [];
- all_labels = [];
- all_trials = [];
- tstart = time.time()
-
- ## Read files and compute all scores
- for idx, line in enumerate(lines):
-
- data = line.split();
- ## Append random label if missing
- if len(data) == 2: data = [random.randint(0,1)] + data
-
- ref_feat = feats[data[1]]
- com_feat = feats[data[2]]
- ref_feat = ref_feat.cuda()
- com_feat = com_feat.cuda()
- # normalize feats
- ref_feat = F.normalize(ref_feat, p=2, dim=1)
- com_feat = F.normalize(com_feat, p=2, dim=1)
-
- dist = F.pairwise_distance(ref_feat.unsqueeze(-1), com_feat.unsqueeze(-1)).detach().cpu().numpy();
-
- score = -1 * numpy.mean(dist);
-
- all_scores.append(score);
- all_labels.append(int(data[0]));
- all_trials.append(data[1]+" "+data[2])
-
- if idx % print_interval == 0:
- telapsed = time.time() - tstart
- sys.stdout.write("\rComputing %d of %d: %.2f Hz"%(idx,len(lines),idx/telapsed));
- sys.stdout.flush();
-
- print('\n')
-
- return (all_scores, all_labels, all_trials);
-
-
-
- if __name__ == '__main__':
-
- parser = argparse.ArgumentParser("baseline")
- parser.add_argument("--data_root", type=str, help="", required=True)
- parser.add_argument("--list", type=str, help="", required=True)
- parser.add_argument("--model_dir", type=str, help="model parameters for AudioEncoder", required=True)
-
- args = parser.parse_args()
-
-
- # Load the models one by one.
- print("Preparing the encoder...")
- # encoder.load_model(Path(args.model_dir))
- print("Insert the wav file name...")
- voice_encoder = VoiceEncoder().cuda()
-
- sc, lab, trials = evaluateFromList(args.list, print_interval=100, test_path=args.data_root)
- result = tuneThresholdfromScore(sc, lab, [1, 0.1]);
- print('EER %2.4f'%result[1])
 
NeuralSeq/data_gen/tts/txt_processors/__init__.py DELETED
@@ -1 +0,0 @@
- from . import en
 
 
NeuralSeq/data_gen/tts/txt_processors/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (218 Bytes)
 
NeuralSeq/data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-38.pyc DELETED
Binary file (1.9 kB)
 
NeuralSeq/data_gen/tts/txt_processors/__pycache__/en.cpython-38.pyc DELETED
Binary file (2.87 kB)
 
NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py DELETED
@@ -1,47 +0,0 @@
- from data_gen.tts.data_gen_utils import is_sil_phoneme
-
- REGISTERED_TEXT_PROCESSORS = {}
-
- def register_txt_processors(name):
- def _f(cls):
- REGISTERED_TEXT_PROCESSORS[name] = cls
- return cls
-
- return _f
-
-
- def get_txt_processor_cls(name):
- return REGISTERED_TEXT_PROCESSORS.get(name, None)
-
-
- class BaseTxtProcessor:
- @staticmethod
- def sp_phonemes():
- return ['|']
-
- @classmethod
- def process(cls, txt, preprocess_args):
- raise NotImplementedError
-
- @classmethod
- def postprocess(cls, txt_struct, preprocess_args):
- # remove sil phoneme in head and tail
- while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
- txt_struct = txt_struct[1:]
- while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
- txt_struct = txt_struct[:-1]
- if preprocess_args['with_phsep']:
- txt_struct = cls.add_bdr(txt_struct)
- if preprocess_args['add_eos_bos']:
- txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
- return txt_struct
-
- @classmethod
- def add_bdr(cls, txt_struct):
- txt_struct_ = []
- for i, ts in enumerate(txt_struct):
- txt_struct_.append(ts)
- if i != len(txt_struct) - 1 and \
- not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
- txt_struct_.append(['|', ['|']])
- return txt_struct_
 
NeuralSeq/data_gen/tts/txt_processors/en.py DELETED
@@ -1,77 +0,0 @@
- import re
- import unicodedata
-
- from g2p_en import G2p
- from g2p_en.expand import normalize_numbers
- from nltk import pos_tag
- from nltk.tokenize import TweetTokenizer
-
- from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
- from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
-
- class EnG2p(G2p):
- word_tokenize = TweetTokenizer().tokenize
-
- def __call__(self, text):
- # preprocessing
- words = EnG2p.word_tokenize(text)
- tokens = pos_tag(words) # tuples of (word, tag)
-
- # steps
- prons = []
- for word, pos in tokens:
- if re.search("[a-z]", word) is None:
- pron = [word]
-
- elif word in self.homograph2features: # Check homograph
- pron1, pron2, pos1 = self.homograph2features[word]
- if pos.startswith(pos1):
- pron = pron1
- else:
- pron = pron2
- elif word in self.cmu: # lookup CMU dict
- pron = self.cmu[word][0]
- else: # predict for oov
- pron = self.predict(word)
-
- prons.extend(pron)
- prons.extend([" "])
-
- return prons[:-1]
-
-
- @register_txt_processors('en')
- class TxtProcessor(BaseTxtProcessor):
- g2p = EnG2p()
-
- @staticmethod
- def preprocess_text(text):
- text = normalize_numbers(text)
- text = ''.join(char for char in unicodedata.normalize('NFD', text)
- if unicodedata.category(char) != 'Mn') # Strip accents
- text = text.lower()
- text = re.sub("[\'\"()]+", "", text)
- text = re.sub("[-]+", " ", text)
- text = re.sub(f"[^ a-z{PUNCS}]", "", text)
- text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> !
- text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
- text = text.replace("i.e.", "that is")
- text = text.replace("i.e.", "that is")
- text = text.replace("etc.", "etc")
- text = re.sub(f"([{PUNCS}])", r" \1 ", text)
- text = re.sub(rf"\s+", r" ", text)
- return text
-
- @classmethod
- def process(cls, txt, preprocess_args):
- txt = cls.preprocess_text(txt).strip()
- phs = cls.g2p(txt)
- txt_struct = [[w, []] for w in txt.split(" ")]
- i_word = 0
- for p in phs:
- if p == ' ':
- i_word += 1
- else:
- txt_struct[i_word][1].append(p)
- txt_struct = cls.postprocess(txt_struct, preprocess_args)
- return txt_struct, txt
 
NeuralSeq/data_gen/tts/txt_processors/zh.py DELETED
@@ -1,43 +0,0 @@
- import re
- import jieba
- from pypinyin import pinyin, Style
- from data_gen.tts.data_gen_utils import PUNCS
- from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
- from utils.text_norm import NSWNormalizer
-
-
- class TxtProcessor(BaseTxtProcessor):
- table = {ord(f): ord(t) for f, t in zip(
- u':,。!?【】()%#@&1234567890',
- u':,.!?[]()%#@&1234567890')}
-
- @staticmethod
- def preprocess_text(text):
- text = text.translate(TxtProcessor.table)
- text = NSWNormalizer(text).normalize(remove_punc=False)
- text = re.sub("[\'\"()]+", "", text)
- text = re.sub("[-]+", " ", text)
- text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
- text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
- text = re.sub(f"([{PUNCS}])", r" \1 ", text)
- text = re.sub(rf"\s+", r"", text)
- text = re.sub(rf"[A-Za-z]+", r"$", text)
- return text
-
- @classmethod
- def process(cls, txt, pre_align_args):
- txt = cls.preprocess_text(txt)
- shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403
- yunmu_finals = pinyin(txt, style=Style.FINALS)
- yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3)
- yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \
- if pre_align_args['use_tone'] else yunmu_finals
-
- assert len(shengmu) == len(yunmu)
- phs = ["|"]
- for a, b, c in zip(shengmu, yunmu, yunmu_finals):
- if a[0] == c[0]:
- phs += [a[0], "|"]
- else:
- phs += [a[0], b[0], "|"]
- return phs, txt