txt2audio committed on
Commit 56c9694 · 1 Parent(s): c4d0fa8
This view is limited to 50 files because it contains too many changes. See the raw diff.
Files changed (50)
  1. NeuralSeq/LICENSE +0 -21
  2. NeuralSeq/README.md +0 -9
  3. NeuralSeq/configs/config_base.yaml +0 -42
  4. NeuralSeq/configs/singing/base.yaml +0 -42
  5. NeuralSeq/configs/singing/fs2.yaml +0 -3
  6. NeuralSeq/configs/tts/base.yaml +0 -95
  7. NeuralSeq/configs/tts/base_zh.yaml +0 -3
  8. NeuralSeq/configs/tts/emotion/base_text2mel.yaml +0 -17
  9. NeuralSeq/configs/tts/emotion/pre_align.py +0 -25
  10. NeuralSeq/configs/tts/fs2.yaml +0 -80
  11. NeuralSeq/configs/tts/hifigan.yaml +0 -21
  12. NeuralSeq/configs/tts/libritts/__pycache__/pre_align.cpython-38.pyc +0 -0
  13. NeuralSeq/configs/tts/libritts/base_text2mel.yaml +0 -14
  14. NeuralSeq/configs/tts/libritts/fs2.yaml +0 -3
  15. NeuralSeq/configs/tts/libritts/pre_align.py +0 -27
  16. NeuralSeq/configs/tts/libritts/pwg.yaml +0 -8
  17. NeuralSeq/configs/tts/lj/base_mel2wav.yaml +0 -3
  18. NeuralSeq/configs/tts/lj/base_text2mel.yaml +0 -13
  19. NeuralSeq/configs/tts/lj/fs2.yaml +0 -3
  20. NeuralSeq/configs/tts/lj/hifigan.yaml +0 -3
  21. NeuralSeq/configs/tts/lj/pwg.yaml +0 -3
  22. NeuralSeq/configs/tts/pwg.yaml +0 -110
  23. NeuralSeq/data_gen/tts/__pycache__/base_binarizer.cpython-38.pyc +0 -0
  24. NeuralSeq/data_gen/tts/__pycache__/base_binarizer_emotion.cpython-38.pyc +0 -0
  25. NeuralSeq/data_gen/tts/__pycache__/base_preprocess.cpython-38.pyc +0 -0
  26. NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-37.pyc +0 -0
  27. NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-38.pyc +0 -0
  28. NeuralSeq/data_gen/tts/base_binarizer.py +0 -224
  29. NeuralSeq/data_gen/tts/base_binarizer_emotion.py +0 -352
  30. NeuralSeq/data_gen/tts/base_preprocess.py +0 -254
  31. NeuralSeq/data_gen/tts/binarizer_zh.py +0 -59
  32. NeuralSeq/data_gen/tts/data_gen_utils.py +0 -357
  33. NeuralSeq/data_gen/tts/emotion/__pycache__/audio.cpython-38.pyc +0 -0
  34. NeuralSeq/data_gen/tts/emotion/__pycache__/inference.cpython-38.pyc +0 -0
  35. NeuralSeq/data_gen/tts/emotion/__pycache__/model.cpython-38.pyc +0 -0
  36. NeuralSeq/data_gen/tts/emotion/__pycache__/params_data.cpython-38.pyc +0 -0
  37. NeuralSeq/data_gen/tts/emotion/__pycache__/params_model.cpython-38.pyc +0 -0
  38. NeuralSeq/data_gen/tts/emotion/audio.py +0 -107
  39. NeuralSeq/data_gen/tts/emotion/inference.py +0 -177
  40. NeuralSeq/data_gen/tts/emotion/model.py +0 -78
  41. NeuralSeq/data_gen/tts/emotion/params_data.py +0 -29
  42. NeuralSeq/data_gen/tts/emotion/params_model.py +0 -11
  43. NeuralSeq/data_gen/tts/emotion/test_emotion.py +0 -184
  44. NeuralSeq/data_gen/tts/txt_processors/__init__.py +0 -1
  45. NeuralSeq/data_gen/tts/txt_processors/__pycache__/__init__.cpython-38.pyc +0 -0
  46. NeuralSeq/data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-38.pyc +0 -0
  47. NeuralSeq/data_gen/tts/txt_processors/__pycache__/en.cpython-38.pyc +0 -0
  48. NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py +0 -47
  49. NeuralSeq/data_gen/tts/txt_processors/en.py +0 -77
  50. NeuralSeq/data_gen/tts/txt_processors/zh.py +0 -43
NeuralSeq/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2021 Jinglin Liu
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
NeuralSeq/README.md DELETED
@@ -1,9 +0,0 @@
- ---
- title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
- emoji: 🎶
- colorFrom: purple
- colorTo: blue
- sdk: gradio
- app_file: "inference/svs/gradio/infer.py"
- pinned: false
- ---
 
NeuralSeq/configs/config_base.yaml DELETED
@@ -1,42 +0,0 @@
- # task
- binary_data_dir: ''
- work_dir: '' # experiment directory.
- infer: false # infer
- seed: 1234
- debug: false
- save_codes:
-   - configs
-   - modules
-   - tasks
-   - utils
-   - usr
-
- #############
- # dataset
- #############
- ds_workers: 1
- test_num: 100
- valid_num: 100
- endless_ds: false
- sort_by_len: true
-
- #########
- # train and eval
- #########
- load_ckpt: ''
- save_ckpt: true
- save_best: false
- num_ckpt_keep: 3
- clip_grad_norm: 0
- accumulate_grad_batches: 1
- log_interval: 100
- num_sanity_val_steps: 5 # steps of validation at the beginning
- check_val_every_n_epoch: 10
- val_check_interval: 2000
- max_epochs: 1000
- max_updates: 160000
- max_tokens: 31250
- max_sentences: 100000
- max_eval_tokens: -1
- max_eval_sentences: -1
- test_input_dir: ''
 
NeuralSeq/configs/singing/base.yaml DELETED
@@ -1,42 +0,0 @@
- base_config:
-   - configs/tts/base.yaml
-   - configs/tts/base_zh.yaml
-
-
- datasets: []
- test_prefixes: []
- test_num: 0
- valid_num: 0
-
- pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
- binarizer_cls: data_gen.singing.binarize.SingingBinarizer
- pre_align_args:
-   use_tone: false # for ZH
-   forced_align: mfa
-   use_sox: true
- hop_size: 128 # Hop size.
- fft_size: 512 # FFT size.
- win_size: 512 # FFT size.
- max_frames: 8000
- fmin: 50 # Minimum freq in mel basis calculation.
- fmax: 11025 # Maximum frequency in mel basis calculation.
- pitch_type: frame
-
- hidden_size: 256
- mel_loss: "ssim:0.5|l1:0.5"
- lambda_f0: 0.0
- lambda_uv: 0.0
- lambda_energy: 0.0
- lambda_ph_dur: 0.0
- lambda_sent_dur: 0.0
- lambda_word_dur: 0.0
- predictor_grad: 0.0
- use_spk_embed: true
- use_spk_id: false
-
- max_tokens: 20000
- max_updates: 400000
- num_spk: 100
- save_f0: true
- use_gt_dur: true
- use_gt_f0: true
 
NeuralSeq/configs/singing/fs2.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/fs2.yaml
-   - configs/singing/base.yaml
 
NeuralSeq/configs/tts/base.yaml DELETED
@@ -1,95 +0,0 @@
- # task
- base_config: configs/config_base.yaml
- task_cls: ''
- #############
- # dataset
- #############
- raw_data_dir: ''
- processed_data_dir: ''
- binary_data_dir: ''
- dict_dir: ''
- pre_align_cls: ''
- binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
- pre_align_args:
-   use_tone: true # for ZH
-   forced_align: mfa
-   use_sox: false
-   txt_processor: en
-   allow_no_txt: false
-   denoise: false
- binarization_args:
-   shuffle: false
-   with_txt: true
-   with_wav: false
-   with_align: true
-   with_spk_embed: true
-   with_f0: true
-   with_f0cwt: true
-
- loud_norm: false
- endless_ds: true
- reset_phone_dict: true
-
- test_num: 100
- valid_num: 100
- max_frames: 1550
- max_input_tokens: 1550
- audio_num_mel_bins: 80
- audio_sample_rate: 22050
- hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
- win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
- fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
- fmax: 7600 # To be increased/reduced depending on data.
- fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
- min_level_db: -100
- num_spk: 1
- mel_vmin: -6
- mel_vmax: 1.5
- ds_workers: 4
-
- #########
- # model
- #########
- dropout: 0.1
- enc_layers: 4
- dec_layers: 4
- hidden_size: 384
- num_heads: 2
- prenet_dropout: 0.5
- prenet_hidden_size: 256
- stop_token_weight: 5.0
- enc_ffn_kernel_size: 9
- dec_ffn_kernel_size: 9
- ffn_act: gelu
- ffn_padding: 'SAME'
-
-
- ###########
- # optimization
- ###########
- lr: 2.0
- warmup_updates: 8000
- optimizer_adam_beta1: 0.9
- optimizer_adam_beta2: 0.98
- weight_decay: 0
- clip_grad_norm: 1
-
-
- ###########
- # train and eval
- ###########
- max_tokens: 30000
- max_sentences: 100000
- max_eval_sentences: 1
- max_eval_tokens: 60000
- train_set_name: 'train'
- valid_set_name: 'valid'
- test_set_name: 'test'
- vocoder: pwg
- vocoder_ckpt: ''
- profile_infer: false
- out_wav_norm: false
- save_gt: false
- save_f0: false
- gen_dir_name: ''
- use_denoise: false
 
NeuralSeq/configs/tts/base_zh.yaml DELETED
@@ -1,3 +0,0 @@
- pre_align_args:
-   txt_processor: zh_g2pM
- binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer
 
NeuralSeq/configs/tts/emotion/base_text2mel.yaml DELETED
@@ -1,17 +0,0 @@
- raw_data_dir: 'data/raw/ESD'
- processed_data_dir: 'data/processed/emotion'
- binary_data_dir: 'data/binary/emotion'
- pre_align_cls: configs.tts.emotion.pre_align.EmoPreAlign
- audio_sample_rate: 16000
- binarization_args:
-   shuffle: true
- binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
- use_spk_id: true
- test_num: 200
- num_spk: 10
- pitch_type: frame
- min_frames: 128
- num_test_samples: 30
- mel_loss: "ssim:0.5|l1:0.5"
- vocoder_ckpt: ''
- use_emotion: true
 
NeuralSeq/configs/tts/emotion/pre_align.py DELETED
@@ -1,25 +0,0 @@
- import os
-
- from data_gen.tts.base_preprocess import BasePreprocessor
- import glob
- import re
-
- class EmoPreAlign(BasePreprocessor):
-
-     def meta_data(self):
-         spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
-         pattern = re.compile('[\t\n ]+')
-         for spk in spks:
-             for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'):  # open the file
-                 line = re.sub(pattern, ' ', line)
-                 if line == ' ': continue
-                 split_ = line.split(' ')
-                 txt = ' '.join(split_[1: -2])
-                 item_name = split_[0]
-                 emotion = split_[-2]
-                 wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
-                 yield item_name, wav_fn, txt, spk, emotion
-
-
- if __name__ == "__main__":
-     EmoPreAlign().process()
 
NeuralSeq/configs/tts/fs2.yaml DELETED
@@ -1,80 +0,0 @@
- base_config: configs/tts/base.yaml
- task_cls: tasks.tts.fs2.FastSpeech2Task
-
- # model
- hidden_size: 256
- dropout: 0.1
- encoder_type: fft # fft|tacotron|tacotron2|conformer
- encoder_K: 8 # for tacotron encoder
- decoder_type: fft # fft|rnn|conv|conformer
- use_pos_embed: true
-
- # duration
- predictor_hidden: -1
- predictor_kernel: 5
- predictor_layers: 2
- dur_predictor_kernel: 3
- dur_predictor_layers: 2
- predictor_dropout: 0.5
-
- # pitch and energy
- use_pitch_embed: true
- pitch_type: ph # frame|ph|cwt
- use_uv: true
- cwt_hidden_size: 128
- cwt_layers: 2
- cwt_loss: l1
- cwt_add_f0_loss: false
- cwt_std_scale: 0.8
-
- pitch_ar: false
- #pitch_embed_type: 0q
- pitch_loss: 'l1' # l1|l2|ssim
- pitch_norm: log
- use_energy_embed: false
-
- # reference encoder and speaker embedding
- use_spk_id: false
- use_split_spk_id: false
- use_spk_embed: false
- use_var_enc: false
- lambda_commit: 0.25
- ref_norm_layer: bn
- pitch_enc_hidden_stride_kernel:
-   - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
-   - 0,2,5
-   - 0,2,5
- dur_enc_hidden_stride_kernel:
-   - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
-   - 0,2,3
-   - 0,1,3
-
-
- # mel
- mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
-
- # loss lambda
- lambda_f0: 1.0
- lambda_uv: 1.0
- lambda_energy: 0.1
- lambda_ph_dur: 1.0
- lambda_sent_dur: 1.0
- lambda_word_dur: 1.0
- predictor_grad: 0.1
-
- # train and eval
- pretrain_fs_ckpt: ''
- warmup_updates: 2000
- max_tokens: 32000
- max_sentences: 100000
- max_eval_sentences: 1
- max_updates: 120000
- num_valid_plots: 5
- num_test_samples: 0
- test_ids: []
- use_gt_dur: false
- use_gt_f0: false
-
- # exp
- dur_loss: mse # huber|mol
- norm_type: gn
 
NeuralSeq/configs/tts/hifigan.yaml DELETED
@@ -1,21 +0,0 @@
- base_config: configs/tts/pwg.yaml
- task_cls: tasks.vocoder.hifigan.HifiGanTask
- resblock: "1"
- adam_b1: 0.8
- adam_b2: 0.99
- upsample_rates: [ 8,8,2,2 ]
- upsample_kernel_sizes: [ 16,16,4,4 ]
- upsample_initial_channel: 128
- resblock_kernel_sizes: [ 3,7,11 ]
- resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
-
- lambda_mel: 45.0
-
- max_samples: 8192
- max_sentences: 16
-
- generator_params:
-   lr: 0.0002 # Generator's learning rate.
-   aux_context_window: 0 # Context window size for auxiliary feature.
- discriminator_optimizer_params:
-   lr: 0.0002 # Discriminator's learning rate.
 
NeuralSeq/configs/tts/libritts/__pycache__/pre_align.cpython-38.pyc DELETED
Binary file (981 Bytes)
 
NeuralSeq/configs/tts/libritts/base_text2mel.yaml DELETED
@@ -1,14 +0,0 @@
- raw_data_dir: 'data/raw/LibriTTS'
- processed_data_dir: 'data/processed/libritts'
- binary_data_dir: 'data/binary/libritts'
- pre_align_cls: configs.tts.libritts.pre_align.LibrittsPreAlign
- binarization_args:
-   shuffle: true
- use_spk_id: true
- test_num: 200
- num_spk: 2320
- pitch_type: frame
- min_frames: 128
- num_test_samples: 30
- mel_loss: "ssim:0.5|l1:0.5"
- vocoder_ckpt: ''
 
NeuralSeq/configs/tts/libritts/fs2.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/fs2.yaml
-   - ./base_text2mel.yaml
 
NeuralSeq/configs/tts/libritts/pre_align.py DELETED
@@ -1,27 +0,0 @@
- import os
-
- from data_gen.tts.base_preprocess import BasePreprocessor
- import glob
-
-
- class LibrittsPreAlign(BasePreprocessor):
-     def meta_data(self):
-         wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav'))
-         for wav_fn in wav_fns:
-             item_name = os.path.basename(wav_fn)[:-4]
-             txt_fn = f'{wav_fn[:-4]}.normalized.txt'
-             with open(txt_fn, 'r') as f:
-                 txt = f.readlines()
-                 f.close()
-             spk = item_name.split("_")[0]
-             # Example:
-             #
-             # 'item_name': '103_1241_000000_000001'
-             # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav'
-             # 'txt': 'matthew Cuthbert is surprised'
-             # 'spk_name': '103'
-             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk}
-
-
- if __name__ == "__main__":
-     LibrittsPreAlign().process()
 
NeuralSeq/configs/tts/libritts/pwg.yaml DELETED
@@ -1,8 +0,0 @@
- base_config: egs/egs_bases/tts/vocoder/pwg.yaml
- raw_data_dir: 'data/raw/LibriTTS'
- processed_data_dir: 'data/processed/libritts'
- binary_data_dir: 'data/binary/libritts_wav'
- generator_params:
-   kernel_size: 5
- num_spk: 400
- max_samples: 20480
 
NeuralSeq/configs/tts/lj/base_mel2wav.yaml DELETED
@@ -1,3 +0,0 @@
- raw_data_dir: 'data/raw/LJSpeech-1.1'
- processed_data_dir: 'data/processed/ljspeech'
- binary_data_dir: 'data/binary/ljspeech_wav'
 
NeuralSeq/configs/tts/lj/base_text2mel.yaml DELETED
@@ -1,13 +0,0 @@
- raw_data_dir: 'data/raw/LJSpeech-1.1'
- processed_data_dir: 'data/processed/ljspeech'
- binary_data_dir: 'data/binary/ljspeech'
- pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
-
- pitch_type: cwt
- mel_loss: l1
- num_test_samples: 20
- test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
-             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
- use_energy_embed: false
- test_num: 523
- valid_num: 348
 
NeuralSeq/configs/tts/lj/fs2.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/fs2.yaml
-   - configs/tts/lj/base_text2mel.yaml
 
NeuralSeq/configs/tts/lj/hifigan.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/hifigan.yaml
-   - configs/tts/lj/base_mel2wav.yaml
 
NeuralSeq/configs/tts/lj/pwg.yaml DELETED
@@ -1,3 +0,0 @@
- base_config:
-   - configs/tts/pwg.yaml
-   - configs/tts/lj/base_mel2wav.yaml
 
NeuralSeq/configs/tts/pwg.yaml DELETED
@@ -1,110 +0,0 @@
- base_config: configs/tts/base.yaml
- task_cls: tasks.vocoder.pwg.PwgTask
-
- binarization_args:
-   with_wav: true
-   with_spk_embed: false
-   with_align: false
- test_input_dir: ''
-
- ###########
- # train and eval
- ###########
- max_samples: 25600
- max_sentences: 5
- max_eval_sentences: 1
- max_updates: 1000000
- val_check_interval: 2000
-
-
- ###########################################################
- #                FEATURE EXTRACTION SETTING               #
- ###########################################################
- sampling_rate: 22050 # Sampling rate.
- fft_size: 1024 # FFT size.
- hop_size: 256 # Hop size.
- win_length: null # Window length.
- # If set to null, it will be the same as fft_size.
- window: "hann" # Window function.
- num_mels: 80 # Number of mel basis.
- fmin: 80 # Minimum freq in mel basis calculation.
- fmax: 7600 # Maximum frequency in mel basis calculation.
- format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
-
- ###########################################################
- #         GENERATOR NETWORK ARCHITECTURE SETTING          #
- ###########################################################
- generator_params:
-   in_channels: 1 # Number of input channels.
-   out_channels: 1 # Number of output channels.
-   kernel_size: 3 # Kernel size of dilated convolution.
-   layers: 30 # Number of residual block layers.
-   stacks: 3 # Number of stacks i.e., dilation cycles.
-   residual_channels: 64 # Number of channels in residual conv.
-   gate_channels: 128 # Number of channels in gated conv.
-   skip_channels: 64 # Number of channels in skip conv.
-   aux_channels: 80 # Number of channels for auxiliary feature conv.
-   # Must be the same as num_mels.
-   aux_context_window: 2 # Context window size for auxiliary feature.
-   # If set to 2, previous 2 and future 2 frames will be considered.
-   dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
-   use_weight_norm: true # Whether to use weight norm.
-   # If set to true, it will be applied to all of the conv layers.
-   upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
-   upsample_params: # Upsampling network parameters.
-     upsample_scales: [4, 4, 4, 4] # Upsampling scales. Prodcut of these must be the same as hop size.
-   use_pitch_embed: false
-
- ###########################################################
- #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
- ###########################################################
- discriminator_params:
-   in_channels: 1 # Number of input channels.
-   out_channels: 1 # Number of output channels.
-   kernel_size: 3 # Number of output channels.
-   layers: 10 # Number of conv layers.
-   conv_channels: 64 # Number of chnn layers.
-   bias: true # Whether to use bias parameter in conv.
-   use_weight_norm: true # Whether to use weight norm.
-   # If set to true, it will be applied to all of the conv layers.
-   nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
-   nonlinear_activation_params: # Nonlinear function parameters
-     negative_slope: 0.2 # Alpha in LeakyReLU.
-
- ###########################################################
- #                   STFT LOSS SETTING                     #
- ###########################################################
- stft_loss_params:
-   fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
-   hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
-   win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
-   window: "hann_window" # Window function for STFT-based loss
- use_mel_loss: false
-
- ###########################################################
- #               ADVERSARIAL LOSS SETTING                  #
- ###########################################################
- lambda_adv: 4.0 # Loss balancing coefficient.
-
- ###########################################################
- #             OPTIMIZER & SCHEDULER SETTING               #
- ###########################################################
- generator_optimizer_params:
-   lr: 0.0001 # Generator's learning rate.
-   eps: 1.0e-6 # Generator's epsilon.
-   weight_decay: 0.0 # Generator's weight decay coefficient.
- generator_scheduler_params:
-   step_size: 200000 # Generator's scheduler step size.
-   gamma: 0.5 # Generator's scheduler gamma.
-   # At each step size, lr will be multiplied by this parameter.
- generator_grad_norm: 10 # Generator's gradient norm.
- discriminator_optimizer_params:
-   lr: 0.00005 # Discriminator's learning rate.
-   eps: 1.0e-6 # Discriminator's epsilon.
-   weight_decay: 0.0 # Discriminator's weight decay coefficient.
- discriminator_scheduler_params:
-   step_size: 200000 # Discriminator's scheduler step size.
-   gamma: 0.5 # Discriminator's scheduler gamma.
-   # At each step size, lr will be multiplied by this parameter.
- discriminator_grad_norm: 1 # Discriminator's gradient norm.
- disc_start_steps: 40000 # Number of steps to start to train discriminator.
 
NeuralSeq/data_gen/tts/__pycache__/base_binarizer.cpython-38.pyc DELETED
Binary file (8.23 kB)
 
NeuralSeq/data_gen/tts/__pycache__/base_binarizer_emotion.cpython-38.pyc DELETED
Binary file (13.3 kB)
 
NeuralSeq/data_gen/tts/__pycache__/base_preprocess.cpython-38.pyc DELETED
Binary file (11.1 kB)
 
NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-37.pyc DELETED
Binary file (11 kB)
 
NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-38.pyc DELETED
Binary file (11 kB)
 
NeuralSeq/data_gen/tts/base_binarizer.py DELETED
@@ -1,224 +0,0 @@
1
- import os
2
- os.environ["OMP_NUM_THREADS"] = "1"
3
-
4
- from utils.multiprocess_utils import chunked_multiprocess_run
5
- import random
6
- import traceback
7
- import json
8
- from resemblyzer import VoiceEncoder
9
- from tqdm import tqdm
10
- from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
11
- from utils.hparams import set_hparams, hparams
12
- import numpy as np
13
- from utils.indexed_datasets import IndexedDatasetBuilder
14
- from vocoders.base_vocoder import VOCODERS
15
- import pandas as pd
16
-
17
-
18
- class BinarizationError(Exception):
19
- pass
20
-
21
-
22
- class BaseBinarizer:
23
- def __init__(self, processed_data_dir=None):
24
- if processed_data_dir is None:
25
- processed_data_dir = hparams['processed_data_dir']
26
- self.processed_data_dirs = processed_data_dir.split(",")
27
- self.binarization_args = hparams['binarization_args']
28
- self.pre_align_args = hparams['pre_align_args']
29
- self.forced_align = self.pre_align_args['forced_align']
30
- tg_dir = None
31
- if self.forced_align == 'mfa':
32
- tg_dir = 'mfa_outputs'
33
- if self.forced_align == 'kaldi':
34
- tg_dir = 'kaldi_outputs'
35
- self.item2txt = {}
36
- self.item2ph = {}
37
- self.item2wavfn = {}
38
- self.item2tgfn = {}
39
- self.item2spk = {}
40
- for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
41
- self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
42
- for r_idx, r in self.meta_df.iterrows():
43
- item_name = raw_item_name = r['item_name']
44
- if len(self.processed_data_dirs) > 1:
45
- item_name = f'ds{ds_id}_{item_name}'
46
- self.item2txt[item_name] = r['txt']
47
- self.item2ph[item_name] = r['ph']
48
- self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
49
- self.item2spk[item_name] = r.get('spk', 'SPK1')
50
- if len(self.processed_data_dirs) > 1:
51
- self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
52
- if tg_dir is not None:
53
- self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
54
- self.item_names = sorted(list(self.item2txt.keys()))
55
- if self.binarization_args['shuffle']:
56
- random.seed(1234)
57
- random.shuffle(self.item_names)
58
-
59
- @property
60
- def train_item_names(self):
61
- return self.item_names[hparams['test_num']+hparams['valid_num']:]
62
-
63
- @property
64
- def valid_item_names(self):
65
- return self.item_names[0: hparams['test_num']+hparams['valid_num']] #
66
-
67
- @property
68
- def test_item_names(self):
69
- return self.item_names[0: hparams['test_num']] # Audios for MOS testing are in 'test_ids'
70
-
71
- def build_spk_map(self):
72
- spk_map = set()
73
- for item_name in self.item_names:
74
- spk_name = self.item2spk[item_name]
75
- spk_map.add(spk_name)
76
- spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
77
- assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
78
- return spk_map
79
-
80
- def item_name2spk_id(self, item_name):
81
- return self.spk_map[self.item2spk[item_name]]
82
-
83
- def _phone_encoder(self):
84
- ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
85
- ph_set = []
86
- if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
87
- for processed_data_dir in self.processed_data_dirs:
88
- ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
89
- ph_set = sorted(set(ph_set))
90
- json.dump(ph_set, open(ph_set_fn, 'w'))
91
- else:
92
- ph_set = json.load(open(ph_set_fn, 'r'))
93
- print("| phone set: ", ph_set)
94
- return build_phone_encoder(hparams['binary_data_dir'])
95
-
96
- def meta_data(self, prefix):
97
- if prefix == 'valid':
98
- item_names = self.valid_item_names
99
- elif prefix == 'test':
100
- item_names = self.test_item_names
101
- else:
102
- item_names = self.train_item_names
103
- for item_name in item_names:
104
- ph = self.item2ph[item_name]
105
- txt = self.item2txt[item_name]
106
- tg_fn = self.item2tgfn.get(item_name)
107
- wav_fn = self.item2wavfn[item_name]
108
- spk_id = self.item_name2spk_id(item_name)
109
- yield item_name, ph, txt, tg_fn, wav_fn, spk_id
110
-
111
- def process(self):
112
- os.makedirs(hparams['binary_data_dir'], exist_ok=True)
113
- self.spk_map = self.build_spk_map()
114
- print("| spk_map: ", self.spk_map)
115
- spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
116
- json.dump(self.spk_map, open(spk_map_fn, 'w'))
117
-
118
- self.phone_encoder = self._phone_encoder()
119
- self.process_data('valid')
120
- self.process_data('test')
121
- self.process_data('train')
122
-
123
- def process_data(self, prefix):
124
- data_dir = hparams['binary_data_dir']
125
- args = []
126
- builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
127
- lengths = []
128
- f0s = []
129
- total_sec = 0
130
- if self.binarization_args['with_spk_embed']:
131
- voice_encoder = VoiceEncoder().cuda()
132
-
133
- meta_data = list(self.meta_data(prefix))
134
- for m in meta_data:
135
- args.append(list(m) + [self.phone_encoder, self.binarization_args])
136
- num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
137
- for f_id, (_, item) in enumerate(
138
- zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
139
- if item is None:
140
- continue
141
- item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
142
- if self.binarization_args['with_spk_embed'] else None
143
- if not self.binarization_args['with_wav'] and 'wav' in item:
144
- print("del wav")
145
- del item['wav']
146
- builder.add_item(item)
147
- lengths.append(item['len'])
148
- total_sec += item['sec']
149
- if item.get('f0') is not None:
150
- f0s.append(item['f0'])
151
- builder.finalize()
152
- np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
153
- if len(f0s) > 0:
154
- f0s = np.concatenate(f0s, 0)
155
- f0s = f0s[f0s != 0]
156
- np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
157
- print(f"| {prefix} total duration: {total_sec:.3f}s")
158
-
159
- @classmethod
160
- def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
161
- if hparams['vocoder'] in VOCODERS:
162
- wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
163
- else:
164
- wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
165
- res = {
166
- 'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
167
- 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
168
- }
169
- try:
170
- if binarization_args['with_f0']:
171
- cls.get_pitch(wav, mel, res)
172
- if binarization_args['with_f0cwt']:
173
- cls.get_f0cwt(res['f0'], res)
174
- if binarization_args['with_txt']:
175
- try:
176
- phone_encoded = res['phone'] = encoder.encode(ph)
177
- except:
178
- traceback.print_exc()
179
- raise BinarizationError(f"Empty phoneme")
180
- if binarization_args['with_align']:
181
- cls.get_align(tg_fn, ph, mel, phone_encoded, res)
182
- except BinarizationError as e:
183
- print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
184
- return None
185
- return res
186
-
187
- @staticmethod
188
- def get_align(tg_fn, ph, mel, phone_encoded, res):
189
- if tg_fn is not None and os.path.exists(tg_fn):
190
- mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
191
- else:
192
- raise BinarizationError(f"Align not found")
193
- if mel2ph.max() - 1 >= len(phone_encoded):
194
- raise BinarizationError(
195
- f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
196
- res['mel2ph'] = mel2ph
197
- res['dur'] = dur
198
-
199
- @staticmethod
200
- def get_pitch(wav, mel, res):
201
- f0, pitch_coarse = get_pitch(wav, mel, hparams)
202
- if sum(f0) == 0:
203
- raise BinarizationError("Empty f0")
204
- res['f0'] = f0
205
- res['pitch'] = pitch_coarse
206
-
207
- @staticmethod
208
- def get_f0cwt(f0, res):
209
- from utils.cwt import get_cont_lf0, get_lf0_cwt
210
- uv, cont_lf0_lpf = get_cont_lf0(f0)
211
- logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
212
- cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
213
- Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
214
- if np.any(np.isnan(Wavelet_lf0)):
215
- raise BinarizationError("NaN CWT")
216
- res['cwt_spec'] = Wavelet_lf0
217
- res['cwt_scales'] = scales
218
- res['f0_mean'] = logf0s_mean_org
219
- res['f0_std'] = logf0s_std_org
220
-
221
-
222
- if __name__ == "__main__":
223
- set_hparams()
224
- BaseBinarizer().process()
 
NeuralSeq/data_gen/tts/base_binarizer_emotion.py DELETED
@@ -1,352 +0,0 @@
1
- import os
2
-
3
- os.environ["OMP_NUM_THREADS"] = "1"
4
- import torch
5
- from collections import Counter
6
- from utils.text_encoder import TokenTextEncoder
7
- from data_gen.tts.emotion import inference as EmotionEncoder
8
- from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
9
- from data_gen.tts.emotion.inference import preprocess_wav
10
- from utils.multiprocess_utils import chunked_multiprocess_run
11
- import random
12
- import traceback
13
- import json
14
- from resemblyzer import VoiceEncoder
15
- from tqdm import tqdm
16
- from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder, is_sil_phoneme
17
- from utils.hparams import hparams, set_hparams
18
- import numpy as np
19
- from utils.indexed_datasets import IndexedDatasetBuilder
20
- from vocoders.base_vocoder import get_vocoder_cls
21
- import pandas as pd
22
-
23
-
24
- class BinarizationError(Exception):
25
- pass
26
-
27
-
28
- class EmotionBinarizer:
29
- def __init__(self, processed_data_dir=None):
30
- if processed_data_dir is None:
31
- processed_data_dir = hparams['processed_data_dir']
32
- self.processed_data_dirs = processed_data_dir.split(",")
33
- self.binarization_args = hparams['binarization_args']
34
- self.pre_align_args = hparams['pre_align_args']
35
- self.item2txt = {}
36
- self.item2ph = {}
37
- self.item2wavfn = {}
38
- self.item2tgfn = {}
39
- self.item2spk = {}
40
- self.item2emo = {}
41
-
42
- def load_meta_data(self):
43
- for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
44
- self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
45
- for r_idx, r in tqdm(self.meta_df.iterrows(), desc='Loading meta data.'):
46
- item_name = raw_item_name = r['item_name']
47
- if len(self.processed_data_dirs) > 1:
48
- item_name = f'ds{ds_id}_{item_name}'
49
- self.item2txt[item_name] = r['txt']
50
- self.item2ph[item_name] = r['ph']
51
- self.item2wavfn[item_name] = r['wav_fn']
52
- self.item2spk[item_name] = r.get('spk_name', 'SPK1') \
53
- if self.binarization_args['with_spk_id'] else 'SPK1'
54
- if len(self.processed_data_dirs) > 1:
55
- self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
56
- self.item2tgfn[item_name] = f"{processed_data_dir}/mfa_outputs/{raw_item_name}.TextGrid"
57
- self.item2emo[item_name] = r.get('others', '"Neutral"')
58
- self.item_names = sorted(list(self.item2txt.keys()))
59
- if self.binarization_args['shuffle']:
60
- random.seed(1234)
61
- random.shuffle(self.item_names)
62
-
63
- @property
64
- def train_item_names(self):
65
- return self.item_names[hparams['test_num']:]
66
-
67
- @property
68
- def valid_item_names(self):
69
- return self.item_names[:hparams['test_num']]
70
-
71
- @property
72
- def test_item_names(self):
73
- return self.valid_item_names
74
-
75
- def build_spk_map(self):
76
- spk_map = set()
77
- for item_name in self.item_names:
78
- spk_name = self.item2spk[item_name]
79
- spk_map.add(spk_name)
80
- spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
81
- print("| #Spk: ", len(spk_map))
82
- assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
83
- return spk_map
84
-
85
- def build_emo_map(self):
86
- emo_map = set()
87
- for item_name in self.item_names:
88
- emo_name = self.item2emo[item_name]
89
- emo_map.add(emo_name)
90
- emo_map = {x: i for i, x in enumerate(sorted(list(emo_map)))}
91
- print("| #Emo: ", len(emo_map))
92
- return emo_map
93
-
94
- def item_name2spk_id(self, item_name):
95
- return self.spk_map[self.item2spk[item_name]]
96
-
97
- def item_name2emo_id(self, item_name):
98
- return self.emo_map[self.item2emo[item_name]]
99
-
100
- def _phone_encoder(self):
101
- ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
102
- ph_set = []
103
- if self.binarization_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
104
- for ph_sent in self.item2ph.values():
105
- ph_set += ph_sent.split(' ')
106
- ph_set = sorted(set(ph_set))
107
- json.dump(ph_set, open(ph_set_fn, 'w'))
108
- print("| Build phone set: ", ph_set)
109
- else:
110
- ph_set = json.load(open(ph_set_fn, 'r'))
111
- print("| Load phone set: ", ph_set)
112
- return build_phone_encoder(hparams['binary_data_dir'])
113
-
114
- def _word_encoder(self):
115
- fn = f"{hparams['binary_data_dir']}/word_set.json"
116
- word_set = []
117
- if self.binarization_args['reset_word_dict']:
118
- for word_sent in self.item2txt.values():
119
- word_set += [x for x in word_sent.split(' ') if x != '']
120
- word_set = Counter(word_set)
121
- total_words = sum(word_set.values())
122
- word_set = word_set.most_common(hparams['word_size'])
123
- num_unk_words = total_words - sum([x[1] for x in word_set])
124
- word_set = [x[0] for x in word_set]
125
- json.dump(word_set, open(fn, 'w'))
126
- print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
127
- f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
128
- else:
129
- word_set = json.load(open(fn, 'r'))
130
- print("| Load word set. Size: ", len(word_set), word_set[:10])
131
- return TokenTextEncoder(None, vocab_list=word_set, replace_oov='<UNK>')
132
-
133
- def meta_data(self, prefix):
134
- if prefix == 'valid':
135
- item_names = self.valid_item_names
136
- elif prefix == 'test':
137
- item_names = self.test_item_names
138
- else:
139
- item_names = self.train_item_names
140
- for item_name in item_names:
141
- ph = self.item2ph[item_name]
142
- txt = self.item2txt[item_name]
143
- tg_fn = self.item2tgfn.get(item_name)
144
- wav_fn = self.item2wavfn[item_name]
145
- spk_id = self.item_name2spk_id(item_name)
146
- emotion = self.item_name2emo_id(item_name)
147
- yield item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion
148
-
149
- def process(self):
150
- self.load_meta_data()
151
- os.makedirs(hparams['binary_data_dir'], exist_ok=True)
152
- self.spk_map = self.build_spk_map()
153
- print("| spk_map: ", self.spk_map)
154
- spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
155
- json.dump(self.spk_map, open(spk_map_fn, 'w'))
156
-
157
- self.emo_map = self.build_emo_map()
158
- print("| emo_map: ", self.emo_map)
159
- emo_map_fn = f"{hparams['binary_data_dir']}/emo_map.json"
160
- json.dump(self.emo_map, open(emo_map_fn, 'w'))
161
-
162
- self.phone_encoder = self._phone_encoder()
163
- self.word_encoder = None
164
- EmotionEncoder.load_model(hparams['emotion_encoder_path'])
165
-
166
- if self.binarization_args['with_word']:
167
- self.word_encoder = self._word_encoder()
168
- self.process_data('valid')
169
- self.process_data('test')
170
- self.process_data('train')
171
-
172
- def process_data(self, prefix):
173
- data_dir = hparams['binary_data_dir']
174
- args = []
175
- builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
176
- ph_lengths = []
177
- mel_lengths = []
178
- f0s = []
179
- total_sec = 0
180
- if self.binarization_args['with_spk_embed']:
181
- voice_encoder = VoiceEncoder().cuda()
182
-
183
- meta_data = list(self.meta_data(prefix))
184
- for m in meta_data:
185
- args.append(list(m) + [(self.phone_encoder, self.word_encoder), self.binarization_args])
186
- num_workers = self.num_workers
187
- for f_id, (_, item) in enumerate(
188
- zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
189
- if item is None:
190
- continue
191
- item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
192
- if self.binarization_args['with_spk_embed'] else None
193
- processed_wav = preprocess_wav(item['wav_fn'])
194
- item['emo_embed'] = Embed_utterance(processed_wav)
195
- if not self.binarization_args['with_wav'] and 'wav' in item:
196
- del item['wav']
197
- builder.add_item(item)
198
- mel_lengths.append(item['len'])
199
- if 'ph_len' in item:
200
- ph_lengths.append(item['ph_len'])
201
- total_sec += item['sec']
202
- if item.get('f0') is not None:
203
- f0s.append(item['f0'])
204
- builder.finalize()
205
- np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
206
- if len(ph_lengths) > 0:
207
- np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
208
- if len(f0s) > 0:
209
- f0s = np.concatenate(f0s, 0)
210
- f0s = f0s[f0s != 0]
211
- np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
212
- print(f"| {prefix} total duration: {total_sec:.3f}s")
213
-
214
- @classmethod
215
- def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion, encoder, binarization_args):
216
- res = {'item_name': item_name, 'txt': txt, 'ph': ph, 'wav_fn': wav_fn, 'spk_id': spk_id, 'emotion': emotion}
217
- if binarization_args['with_linear']:
218
- wav, mel, linear_stft = get_vocoder_cls(hparams).wav2spec(wav_fn) # , return_linear=True
219
- res['linear'] = linear_stft
220
- else:
221
- wav, mel = get_vocoder_cls(hparams).wav2spec(wav_fn)
222
- wav = wav.astype(np.float16)
223
- res.update({'mel': mel, 'wav': wav,
224
- 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
225
- try:
226
- if binarization_args['with_f0']:
227
- cls.get_pitch(res)
228
- if binarization_args['with_f0cwt']:
229
- cls.get_f0cwt(res)
230
- if binarization_args['with_txt']:
231
- ph_encoder, word_encoder = encoder
232
- try:
233
- res['phone'] = ph_encoder.encode(ph)
234
- res['ph_len'] = len(res['phone'])
235
- except:
236
- traceback.print_exc()
237
- raise BinarizationError(f"Empty phoneme")
238
- if binarization_args['with_align']:
239
- cls.get_align(tg_fn, res)
240
- if binarization_args['trim_eos_bos']:
241
- bos_dur = res['dur'][0]
242
- eos_dur = res['dur'][-1]
243
- res['mel'] = mel[bos_dur:-eos_dur]
244
- res['f0'] = res['f0'][bos_dur:-eos_dur]
245
- res['pitch'] = res['pitch'][bos_dur:-eos_dur]
246
- res['mel2ph'] = res['mel2ph'][bos_dur:-eos_dur]
247
- res['wav'] = wav[bos_dur * hparams['hop_size']:-eos_dur * hparams['hop_size']]
248
- res['dur'] = res['dur'][1:-1]
249
- res['len'] = res['mel'].shape[0]
250
- if binarization_args['with_word']:
251
- cls.get_word(res, word_encoder)
252
- except BinarizationError as e:
253
- print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
254
- return None
255
- except Exception as e:
256
- traceback.print_exc()
257
- print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
258
- return None
259
- return res
260
-
261
- @staticmethod
262
- def get_align(tg_fn, res):
263
- ph = res['ph']
264
- mel = res['mel']
265
- phone_encoded = res['phone']
266
- if tg_fn is not None and os.path.exists(tg_fn):
267
- mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
268
- else:
269
- raise BinarizationError(f"Align not found")
270
- if mel2ph.max() - 1 >= len(phone_encoded):
271
- raise BinarizationError(
272
- f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
273
- res['mel2ph'] = mel2ph
274
- res['dur'] = dur
275
-
276
- @staticmethod
277
- def get_pitch(res):
278
- wav, mel = res['wav'], res['mel']
279
- f0, pitch_coarse = get_pitch(wav, mel, hparams)
280
- if sum(f0) == 0:
281
- raise BinarizationError("Empty f0")
282
- res['f0'] = f0
283
- res['pitch'] = pitch_coarse
284
-
285
- @staticmethod
286
- def get_f0cwt(res):
287
- from utils.cwt import get_cont_lf0, get_lf0_cwt
288
- f0 = res['f0']
289
- uv, cont_lf0_lpf = get_cont_lf0(f0)
290
- logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
291
- cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
292
- Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
293
- if np.any(np.isnan(Wavelet_lf0)):
294
- raise BinarizationError("NaN CWT")
295
- res['cwt_spec'] = Wavelet_lf0
296
- res['cwt_scales'] = scales
297
- res['f0_mean'] = logf0s_mean_org
298
- res['f0_std'] = logf0s_std_org
299
-
300
- @staticmethod
301
- def get_word(res, word_encoder):
302
- ph_split = res['ph'].split(" ")
303
- # ph side mapping to word
304
- ph_words = [] # ['<BOS>', 'N_AW1_', ',', 'AE1_Z_|', 'AO1_L_|', 'B_UH1_K_S_|', 'N_AA1_T_|', ....]
305
- ph2word = np.zeros([len(ph_split)], dtype=int)
306
- last_ph_idx_for_word = [] # [2, 11, ...]
307
- for i, ph in enumerate(ph_split):
308
- if ph == '|':
309
- last_ph_idx_for_word.append(i)
310
- elif not ph[0].isalnum():
311
- if ph not in ['<BOS>']:
312
- last_ph_idx_for_word.append(i - 1)
313
- last_ph_idx_for_word.append(i)
314
- start_ph_idx_for_word = [0] + [i + 1 for i in last_ph_idx_for_word[:-1]]
315
- for i, (s_w, e_w) in enumerate(zip(start_ph_idx_for_word, last_ph_idx_for_word)):
316
- ph_words.append(ph_split[s_w:e_w + 1])
317
- ph2word[s_w:e_w + 1] = i
318
- ph2word = ph2word.tolist()
319
- ph_words = ["_".join(w) for w in ph_words]
320
-
321
- # mel side mapping to word
322
- mel2word = []
323
- dur_word = [0 for _ in range(len(ph_words))]
324
- for i, m2p in enumerate(res['mel2ph']):
325
- word_idx = ph2word[m2p - 1]
326
- mel2word.append(ph2word[m2p - 1])
327
- dur_word[word_idx] += 1
328
- ph2word = [x + 1 for x in ph2word] # 0预留给padding
329
- mel2word = [x + 1 for x in mel2word] # 0预留给padding
330
- res['ph_words'] = ph_words # [T_word]
331
- res['ph2word'] = ph2word # [T_ph]
332
- res['mel2word'] = mel2word # [T_mel]
333
- res['dur_word'] = dur_word # [T_word]
334
- words = [x for x in res['txt'].split(" ") if x != '']
335
- while len(words) > 0 and is_sil_phoneme(words[0]):
336
- words = words[1:]
337
- while len(words) > 0 and is_sil_phoneme(words[-1]):
338
- words = words[:-1]
339
- words = ['<BOS>'] + words + ['<EOS>']
340
- word_tokens = word_encoder.encode(" ".join(words))
341
- res['words'] = words
342
- res['word_tokens'] = word_tokens
343
- assert len(words) == len(ph_words), [words, ph_words]
344
-
345
- @property
346
- def num_workers(self):
347
- return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
348
-
349
-
350
- if __name__ == "__main__":
351
- set_hparams()
352
- EmotionBinarizer().process()
 
NeuralSeq/data_gen/tts/base_preprocess.py DELETED
@@ -1,254 +0,0 @@
1
- import json
2
- import os
3
- import random
4
- import re
5
- import traceback
6
- from collections import Counter
7
- from functools import partial
8
- import pandas as pd
9
- import librosa
10
- from tqdm import tqdm
11
- from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
12
- from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
13
- from utils.hparams import hparams
14
- from utils.multiprocess_utils import multiprocess_run_tqdm
15
- from utils.os_utils import link_file, move_file, remove_file
16
- from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
17
-
18
-
19
- class BasePreprocessor:
20
- def __init__(self):
21
- self.preprocess_args = hparams['preprocess_args']
22
- txt_processor = self.preprocess_args['txt_processor']
23
- self.txt_processor = get_txt_processor_cls(txt_processor)
24
- self.raw_data_dir = hparams['raw_data_dir']
25
- self.processed_dir = hparams['processed_data_dir']
26
- self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
27
-
28
- def meta_data(self):
29
- """
30
- :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
31
- """
32
- raise NotImplementedError
33
-
34
- def process(self):
35
- processed_dir = self.processed_dir
36
- wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
37
- remove_file(wav_processed_tmp_dir)
38
- os.makedirs(wav_processed_tmp_dir, exist_ok=True)
39
- wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
40
- remove_file(wav_processed_dir)
41
- os.makedirs(wav_processed_dir, exist_ok=True)
42
-
43
- meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
44
- item_names = [d['item_name'] for d in meta_data]
45
- assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
46
-
47
- # preprocess data
48
- phone_list = []
49
- word_list = []
50
- spk_names = set()
51
- process_item = partial(self.preprocess_first_pass,
52
- txt_processor=self.txt_processor,
53
- wav_processed_dir=wav_processed_dir,
54
- wav_processed_tmp=wav_processed_tmp_dir,
55
- preprocess_args=self.preprocess_args)
56
- items = []
57
- args = [{
58
- 'item_name': item_raw['item_name'],
59
- 'txt_raw': item_raw['txt'],
60
- 'wav_fn': item_raw['wav_fn'],
61
- 'txt_loader': item_raw.get('txt_loader'),
62
- 'others': item_raw.get('others', None)
63
- } for item_raw in meta_data]
64
- for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
65
- if item is not None:
66
- item_.update(item)
67
- item = item_
68
- if 'txt_loader' in item:
69
- del item['txt_loader']
70
- item['id'] = item_id
71
- item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
72
- item['others'] = item.get('others', None)
73
- phone_list += item['ph'].split(" ")
74
- word_list += item['word'].split(" ")
75
- spk_names.add(item['spk_name'])
76
- items.append(item)
77
-
78
- # add encoded tokens
79
- ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
80
- spk_map = self.build_spk_map(spk_names)
81
- args = [{
82
- 'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
83
- 'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
84
- } for item in items]
85
- for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
86
- items[idx].update(item_new_kv)
87
-
88
- # build mfa data
89
- if self.preprocess_args['use_mfa']:
90
- mfa_dict = set()
91
- mfa_input_dir = f'{processed_dir}/mfa_inputs'
92
- remove_file(mfa_input_dir)
93
- # group MFA inputs for better parallelism
94
- mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
95
- if self.preprocess_args['mfa_group_shuffle']:
96
- random.seed(hparams['seed'])
97
- random.shuffle(mfa_groups)
98
- args = [{
99
- 'item': item, 'mfa_input_dir': mfa_input_dir,
100
- 'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
101
- 'preprocess_args': self.preprocess_args
102
- } for item, mfa_group in zip(items, mfa_groups)]
103
- for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
104
- self.build_mfa_inputs, args, desc='Build MFA data'):
105
- items[i]['wav_align_fn'] = new_wav_align_fn
106
- for w in ph_gb_word_nosil.split(" "):
107
- mfa_dict.add(f"{w} {w.replace('_', ' ')}")
108
- mfa_dict = sorted(mfa_dict)
109
- with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
110
- f.writelines([f'{l}\n' for l in mfa_dict])
111
- with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
112
- f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
113
- remove_file(wav_processed_tmp_dir)
114
-
115
-
116
- @classmethod
117
- def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
118
- wav_fn, wav_processed_dir, wav_processed_tmp,
119
- preprocess_args, txt_loader=None, others=None):
120
- try:
121
- if txt_loader is not None:
122
- txt_raw = txt_loader(txt_raw)
123
- ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
124
- wav_fn, wav_align_fn = cls.process_wav(
125
- item_name, wav_fn,
126
- hparams['processed_data_dir'],
127
- wav_processed_tmp, preprocess_args)
128
-
129
- # wav for binarization
130
- ext = os.path.splitext(wav_fn)[1]
131
- os.makedirs(wav_processed_dir, exist_ok=True)
132
- new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
133
- move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
134
- move_link_func(wav_fn, new_wav_fn)
135
- return {
136
- 'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
137
- 'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
- 'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
- 'others': others
- }
- except:
- traceback.print_exc()
- print(f"| Error is caught. item_name: {item_name}.")
- return None
-
- @staticmethod
- def txt_to_ph(txt_processor, txt_raw, preprocess_args):
- txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
- ph = [p for w in txt_struct for p in w[1]]
- ph_gb_word = ["_".join(w[1]) for w in txt_struct]
- words = [w[0] for w in txt_struct]
- # word_id=0 is reserved for padding
- ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
- return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)
-
- @staticmethod
- def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
- processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
- processors = [k() for k in processors if k is not None]
- if len(processors) >= 1:
- sr_file = librosa.core.get_samplerate(wav_fn)
- output_fn_for_align = None
- ext = os.path.splitext(wav_fn)[1]
- input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
- link_file(wav_fn, input_fn)
- for p in processors:
- outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
- if len(outputs) == 3:
- input_fn, sr, output_fn_for_align = outputs
- else:
- input_fn, sr = outputs
- if output_fn_for_align is None:
- return input_fn, input_fn
- else:
- return input_fn, output_fn_for_align
- else:
- return wav_fn, wav_fn
-
- def _phone_encoder(self, ph_set):
- ph_set_fn = f"{self.processed_dir}/phone_set.json"
- if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
- ph_set = sorted(set(ph_set))
- json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
- print("| Build phone set: ", ph_set)
- else:
- ph_set = json.load(open(ph_set_fn, 'r'))
- print("| Load phone set: ", ph_set)
- return build_token_encoder(ph_set_fn)
-
- def _word_encoder(self, word_set):
- word_set_fn = f"{self.processed_dir}/word_set.json"
- if self.preprocess_args['reset_word_dict']:
- word_set = Counter(word_set)
- total_words = sum(word_set.values())
- word_set = word_set.most_common(hparams['word_dict_size'])
- num_unk_words = total_words - sum([x[1] for x in word_set])
- word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
- word_set = sorted(set(word_set))
- json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
- print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
- f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
- else:
- word_set = json.load(open(word_set_fn, 'r'))
- print("| Load word set. Size: ", len(word_set), word_set[:10])
- return build_token_encoder(word_set_fn)
-
- @classmethod
- def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
- word_token = word_encoder.encode(word)
- ph_token = ph_encoder.encode(ph)
- spk_id = spk_map[spk_name]
- return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
-
- def build_spk_map(self, spk_names):
- spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
- assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
- print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
- json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
- return spk_map
-
- @classmethod
- def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
- item_name = item['item_name']
- wav_align_fn = item['wav_align_fn']
- ph_gb_word = item['ph_gb_word']
- ext = os.path.splitext(wav_align_fn)[1]
- mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
- os.makedirs(mfa_input_group_dir, exist_ok=True)
- new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
- move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
- move_link_func(wav_align_fn, new_wav_align_fn)
- ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
- for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
- with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
- f_txt.write(ph_gb_word_nosil)
- return ph_gb_word_nosil, new_wav_align_fn
-
- def load_spk_map(self, base_dir):
- spk_map_fn = f"{base_dir}/spk_map.json"
- spk_map = json.load(open(spk_map_fn, 'r'))
- return spk_map
-
- def load_dict(self, base_dir):
- ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
- word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
- return ph_encoder, word_encoder
-
- @property
- def meta_csv_filename(self):
- return 'metadata'
-
- @property
- def wav_processed_dirname(self):
- return 'wav_processed'
 
NeuralSeq/data_gen/tts/binarizer_zh.py DELETED
@@ -1,59 +0,0 @@
- import os
-
- os.environ["OMP_NUM_THREADS"] = "1"
-
- from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
- from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
- from data_gen.tts.data_gen_utils import get_mel2ph
- from utils.hparams import set_hparams, hparams
- import numpy as np
-
-
- class ZhBinarizer(BaseBinarizer):
- @staticmethod
- def get_align(tg_fn, ph, mel, phone_encoded, res):
- if tg_fn is not None and os.path.exists(tg_fn):
- _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
- else:
- raise BinarizationError(f"Align not found")
- ph_list = ph.split(" ")
- assert len(dur) == len(ph_list)
- mel2ph = []
- # assign the duration of separator phonemes to the preceding final (yunmu)
- dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
- for i in range(len(dur)):
- p = ph_list[i]
- if p[0] != '<' and not p[0].isalpha():
- uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0
- j = 0
- while j < len(uv_) and not uv_[j]:
- j += 1
- dur[i - 1] += j
- dur[i] -= j
- if dur[i] < 100:
- dur[i - 1] += dur[i]
- dur[i] = 0
- # make the initial (shengmu) and the final (yunmu) equally long
- for i in range(len(dur)):
- p = ph_list[i]
- if p in ALL_SHENMU:
- p_next = ph_list[i + 1]
- if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU):
- print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
- f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.")
- continue
- total = dur[i + 1] + dur[i]
- dur[i] = total // 2
- dur[i + 1] = total - dur[i]
- for i in range(len(dur)):
- mel2ph += [i + 1] * dur[i]
- mel2ph = np.array(mel2ph)
- if mel2ph.max() - 1 >= len(phone_encoded):
- raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
- res['mel2ph'] = mel2ph
- res['dur'] = dur
-
-
- if __name__ == "__main__":
- set_hparams()
- ZhBinarizer().process()
 
NeuralSeq/data_gen/tts/data_gen_utils.py DELETED
@@ -1,357 +0,0 @@
- import warnings
-
- warnings.filterwarnings("ignore")
-
- import parselmouth
- import os
- import torch
- from skimage.transform import resize
- from utils.text_encoder import TokenTextEncoder
- from utils.pitch_utils import f0_to_coarse
- import struct
- import webrtcvad
- from scipy.ndimage.morphology import binary_dilation
- import librosa
- import numpy as np
- from utils import audio
- import pyloudnorm as pyln
- import re
- import json
- from collections import OrderedDict
-
- PUNCS = '!,.?;:'
-
- int16_max = (2 ** 15) - 1
-
-
- def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
- """
- Ensures that segments without voice in the waveform remain no longer than a
- threshold determined by the VAD parameters in params.py.
- :param wav: the raw waveform as a numpy array of floats
- :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
- :return: the same waveform with silences trimmed away (length <= original wav length)
- """
-
- ## Voice Activity Detection
- # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
- # This sets the granularity of the VAD. Should not need to be changed.
- sampling_rate = 16000
- wav_raw, sr = librosa.core.load(path, sr=sr)
-
- if norm:
- meter = pyln.Meter(sr) # create BS.1770 meter
- loudness = meter.integrated_loudness(wav_raw)
- wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
- if np.abs(wav_raw).max() > 1.0:
- wav_raw = wav_raw / np.abs(wav_raw).max()
-
- wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
-
- vad_window_length = 30 # In milliseconds
- # Number of frames to average together when performing the moving average smoothing.
- # The larger this value, the larger the VAD variations must be to not get smoothed out.
- vad_moving_average_width = 8
-
- # Compute the voice detection window size
- samples_per_window = (vad_window_length * sampling_rate) // 1000
-
- # Trim the end of the audio to have a multiple of the window size
- wav = wav[:len(wav) - (len(wav) % samples_per_window)]
-
- # Convert the float waveform to 16-bit mono PCM
- pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
-
- # Perform voice activity detection
- voice_flags = []
- vad = webrtcvad.Vad(mode=3)
- for window_start in range(0, len(wav), samples_per_window):
- window_end = window_start + samples_per_window
- voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
- sample_rate=sampling_rate))
- voice_flags = np.array(voice_flags)
-
- # Smooth the voice detection with a moving average
- def moving_average(array, width):
- array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
- ret = np.cumsum(array_padded, dtype=float)
- ret[width:] = ret[width:] - ret[:-width]
- return ret[width - 1:] / width
-
- audio_mask = moving_average(voice_flags, vad_moving_average_width)
- audio_mask = np.round(audio_mask).astype(np.bool)
-
- # Dilate the voiced regions
- audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
- audio_mask = np.repeat(audio_mask, samples_per_window)
- audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
- if return_raw_wav:
- return wav_raw, audio_mask, sr
- return wav_raw[audio_mask], audio_mask, sr
-
-
- def process_utterance(wav_path,
- fft_size=1024,
- hop_size=256,
- win_length=1024,
- window="hann",
- num_mels=80,
- fmin=80,
- fmax=7600,
- eps=1e-6,
- sample_rate=22050,
- loud_norm=False,
- min_level_db=-100,
- return_linear=False,
- trim_long_sil=False, vocoder='pwg'):
- if isinstance(wav_path, str):
- if trim_long_sil:
- wav, _, _ = trim_long_silences(wav_path, sample_rate)
- else:
- wav, _ = librosa.core.load(wav_path, sr=sample_rate)
- else:
- wav = wav_path
-
- if loud_norm:
- meter = pyln.Meter(sample_rate) # create BS.1770 meter
- loudness = meter.integrated_loudness(wav)
- wav = pyln.normalize.loudness(wav, loudness, -22.0)
- if np.abs(wav).max() > 1:
- wav = wav / np.abs(wav).max()
-
- # get amplitude spectrogram
- x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
- win_length=win_length, window=window, pad_mode="constant")
- spc = np.abs(x_stft) # (n_bins, T)
-
- # get mel basis
- fmin = 0 if fmin == -1 else fmin
- fmax = sample_rate / 2 if fmax == -1 else fmax
- mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
- mel = mel_basis @ spc
-
- if vocoder == 'pwg':
- mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
- else:
- assert False, f'"{vocoder}" is not in ["pwg"].'
-
- l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
- wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
- wav = wav[:mel.shape[1] * hop_size]
-
- if not return_linear:
- return wav, mel
- else:
- spc = audio.amp_to_db(spc)
- spc = audio.normalize(spc, {'min_level_db': min_level_db})
- return wav, mel, spc
-
-
- def get_pitch(wav_data, mel, hparams):
- """
-
- :param wav_data: [T]
- :param mel: [T, 80]
- :param hparams:
- :return:
- """
- time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
- f0_min = 80
- f0_max = 750
-
- if hparams['hop_size'] == 128:
- pad_size = 4
- elif hparams['hop_size'] == 256:
- pad_size = 2
- else:
- assert False
-
- f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
- time_step=time_step / 1000, voicing_threshold=0.6,
- pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
- lpad = pad_size * 2
- rpad = len(mel) - len(f0) - lpad
- f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
- # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
- # Attention: we find that new version of some libraries could cause ``rpad'' to be a negative value...
- # Just to be sure, we recommend users to set up the same environments as them in requirements_auto.txt (by Anaconda)
- delta_l = len(mel) - len(f0)
- assert np.abs(delta_l) <= 8
- if delta_l > 0:
- f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
- f0 = f0[:len(mel)]
- pitch_coarse = f0_to_coarse(f0)
- return f0, pitch_coarse
-
-
- def remove_empty_lines(text):
- """remove empty lines"""
- assert (len(text) > 0)
- assert (isinstance(text, list))
- text = [t.strip() for t in text]
- if "" in text:
- text.remove("")
- return text
-
-
- class TextGrid(object):
- def __init__(self, text):
- text = remove_empty_lines(text)
- self.text = text
- self.line_count = 0
- self._get_type()
- self._get_time_intval()
- self._get_size()
- self.tier_list = []
- self._get_item_list()
-
- def _extract_pattern(self, pattern, inc):
- """
- Parameters
- ----------
- pattern : regex to extract pattern
- inc : increment of line count after extraction
- Returns
- -------
- group : extracted info
- """
- try:
- group = re.match(pattern, self.text[self.line_count]).group(1)
- self.line_count += inc
- except AttributeError:
- raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
- return group
-
- def _get_type(self):
- self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
-
- def _get_time_intval(self):
- self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
- self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
-
- def _get_size(self):
- self.size = int(self._extract_pattern(r"size = (.*)", 2))
-
- def _get_item_list(self):
- """Only supports IntervalTier currently"""
- for itemIdx in range(1, self.size + 1):
- tier = OrderedDict()
- item_list = []
- tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
- tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
- if tier_class != "IntervalTier":
- raise NotImplementedError("Only IntervalTier class is supported currently")
- tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
- tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
- tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
- tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
- for i in range(int(tier_size)):
- item = OrderedDict()
- item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
- item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
- item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
- item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
- item_list.append(item)
- tier["idx"] = tier_idx
- tier["class"] = tier_class
- tier["name"] = tier_name
- tier["xmin"] = tier_xmin
- tier["xmax"] = tier_xmax
- tier["size"] = tier_size
- tier["items"] = item_list
- self.tier_list.append(tier)
-
- def toJson(self):
- _json = OrderedDict()
- _json["file_type"] = self.file_type
- _json["xmin"] = self.xmin
- _json["xmax"] = self.xmax
- _json["size"] = self.size
- _json["tiers"] = self.tier_list
- return json.dumps(_json, ensure_ascii=False, indent=2)
-
-
- def get_mel2ph(tg_fn, ph, mel, hparams):
- ph_list = ph.split(" ")
- with open(tg_fn, "r") as f:
- tg = f.readlines()
- tg = remove_empty_lines(tg)
- tg = TextGrid(tg)
- tg = json.loads(tg.toJson())
- split = np.ones(len(ph_list) + 1, np.float) * -1
- tg_idx = 0
- ph_idx = 0
- tg_align = [x for x in tg['tiers'][-1]['items']]
- tg_align_ = []
- for x in tg_align:
- x['xmin'] = float(x['xmin'])
- x['xmax'] = float(x['xmax'])
- if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
- x['text'] = ''
- if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
- tg_align_[-1]['xmax'] = x['xmax']
- continue
- tg_align_.append(x)
- tg_align = tg_align_
- tg_len = len([x for x in tg_align if x['text'] != ''])
- ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
- assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
- while tg_idx < len(tg_align) or ph_idx < len(ph_list):
- if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
- split[ph_idx] = 1e8
- ph_idx += 1
- continue
- x = tg_align[tg_idx]
- if x['text'] == '' and ph_idx == len(ph_list):
- tg_idx += 1
- continue
- assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
- ph = ph_list[ph_idx]
- if x['text'] == '' and not is_sil_phoneme(ph):
- assert False, (ph_list, tg_align)
- if x['text'] != '' and is_sil_phoneme(ph):
- ph_idx += 1
- else:
- assert (x['text'] == '' and is_sil_phoneme(ph)) \
- or x['text'].lower() == ph.lower() \
- or x['text'].lower() == 'sil', (x['text'], ph)
- split[ph_idx] = x['xmin']
- if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
- split[ph_idx - 1] = split[ph_idx]
- ph_idx += 1
- tg_idx += 1
- assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
- assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
- mel2ph = np.zeros([mel.shape[0]], np.int)
- split[0] = 0
- split[-1] = 1e8
- for i in range(len(split) - 1):
- assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
- split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
- for ph_idx in range(len(ph_list)):
- mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
- mel2ph_torch = torch.from_numpy(mel2ph)
- T_t = len(ph_list)
- dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
- dur = dur[1:].numpy()
- return mel2ph, dur
-
-
- def build_phone_encoder(data_dir):
- phone_list_file = os.path.join(data_dir, 'phone_set.json')
- phone_list = json.load(open(phone_list_file))
- return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
-
-
- def build_word_encoder(data_dir):
- word_list_file = os.path.join(data_dir, 'word_set.json')
- word_list = json.load(open(word_list_file))
- return TokenTextEncoder(None, vocab_list=word_list, replace_oov=',')
-
- def is_sil_phoneme(p):
- return not p[0].isalpha()
-
-
- def build_token_encoder(token_list_file):
- token_list = json.load(open(token_list_file))
- return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')
 
NeuralSeq/data_gen/tts/emotion/__pycache__/audio.cpython-38.pyc DELETED
Binary file (3.8 kB)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/inference.cpython-38.pyc DELETED
Binary file (7.28 kB)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/model.cpython-38.pyc DELETED
Binary file (2.53 kB)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/params_data.cpython-38.pyc DELETED
Binary file (491 Bytes)
 
NeuralSeq/data_gen/tts/emotion/__pycache__/params_model.cpython-38.pyc DELETED
Binary file (371 Bytes)
 
NeuralSeq/data_gen/tts/emotion/audio.py DELETED
@@ -1,107 +0,0 @@
- from scipy.ndimage.morphology import binary_dilation
- from data_gen.tts.emotion.params_data import *
- from pathlib import Path
- from typing import Optional, Union
- import numpy as np
- import webrtcvad
- import librosa
- import struct
-
- int16_max = (2 ** 15) - 1
-
-
- def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
- source_sr: Optional[int] = None):
- """
- Applies the preprocessing operations used in training the Speaker Encoder to a waveform
- either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
-
- :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
- just .wav), either the waveform as a numpy array of floats.
- :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
- preprocessing. After preprocessing, the waveform's sampling rate will match the data
- hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
- this argument will be ignored.
- """
- # Load the wav from disk if needed
- if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
- wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
- else:
- wav = fpath_or_wav
-
- # Resample the wav if needed
- if source_sr is not None and source_sr != sampling_rate:
- wav = librosa.resample(wav, source_sr, sampling_rate)
-
- # Apply the preprocessing: normalize volume and shorten long silences
- wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
- wav = trim_long_silences(wav)
-
- return wav
-
-
- def wav_to_mel_spectrogram(wav):
- """
- Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
- Note: this is not a log-mel spectrogram.
- """
- frames = librosa.feature.melspectrogram(
- wav,
- sampling_rate,
- n_fft=int(sampling_rate * mel_window_length / 1000),
- hop_length=int(sampling_rate * mel_window_step / 1000),
- n_mels=mel_n_channels
- )
- return frames.astype(np.float32).T
-
-
- def trim_long_silences(wav):
- """
- Ensures that segments without voice in the waveform remain no longer than a
- threshold determined by the VAD parameters in params.py.
-
- :param wav: the raw waveform as a numpy array of floats
- :return: the same waveform with silences trimmed away (length <= original wav length)
- """
- # Compute the voice detection window size
- samples_per_window = (vad_window_length * sampling_rate) // 1000
-
- # Trim the end of the audio to have a multiple of the window size
- wav = wav[:len(wav) - (len(wav) % samples_per_window)]
-
- # Convert the float waveform to 16-bit mono PCM
- pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
-
- # Perform voice activity detection
- voice_flags = []
- vad = webrtcvad.Vad(mode=3)
- for window_start in range(0, len(wav), samples_per_window):
- window_end = window_start + samples_per_window
- voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
- sample_rate=sampling_rate))
- voice_flags = np.array(voice_flags)
-
- # Smooth the voice detection with a moving average
- def moving_average(array, width):
- array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
- ret = np.cumsum(array_padded, dtype=float)
- ret[width:] = ret[width:] - ret[:-width]
- return ret[width - 1:] / width
-
- audio_mask = moving_average(voice_flags, vad_moving_average_width)
- audio_mask = np.round(audio_mask).astype(np.bool)
-
- # Dilate the voiced regions
- audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
- audio_mask = np.repeat(audio_mask, samples_per_window)
-
- return wav[audio_mask == True]
-
-
- def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
- if increase_only and decrease_only:
- raise ValueError("Both increase only and decrease only are set")
- dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
- if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
- return wav
- return wav * (10 ** (dBFS_change / 20))
 
NeuralSeq/data_gen/tts/emotion/inference.py DELETED
@@ -1,177 +0,0 @@
- from data_gen.tts.emotion.params_data import *
- from data_gen.tts.emotion.model import EmotionEncoder
- from data_gen.tts.emotion.audio import preprocess_wav # We want to expose this function from here
- from matplotlib import cm
- from data_gen.tts.emotion import audio
- from pathlib import Path
- import matplotlib.pyplot as plt
- import numpy as np
- import torch
-
- _model = None # type: EmotionEncoder
- _device = None # type: torch.device
-
-
- def load_model(weights_fpath: Path, device=None):
- """
- Loads the model in memory. If this function is not explicitly called, it will be run on the
- first call to embed_frames() with the default weights file.
-
- :param weights_fpath: the path to saved model weights.
- :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
- model will be loaded and will run on this device. Outputs will however always be on the cpu.
- If None, will default to your GPU if it's available, otherwise your CPU.
- """
- # TODO: I think the slow loading of the encoder might have something to do with the device it
- # was saved on. Worth investigating.
- global _model, _device
- if device is None:
- _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- elif isinstance(device, str):
- _device = torch.device(device)
- _model = EmotionEncoder(_device, torch.device("cpu"))
- checkpoint = torch.load(weights_fpath)
- _model.load_state_dict(checkpoint["model_state"])
- _model.eval()
- print("Loaded encoder trained to step %d" % (checkpoint["step"]))
-
-
- def is_loaded():
- return _model is not None
-
-
- def embed_frames_batch(frames_batch):
- """
- Computes embeddings for a batch of mel spectrograms.
-
- :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
- (batch_size, n_frames, n_channels)
- :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
- """
- if _model is None:
- raise Exception("Model was not loaded. Call load_model() before inference.")
-
- frames = torch.from_numpy(frames_batch).to(_device)
- embed = _model.inference(frames).detach().cpu().numpy()
- return embed
-
-
- def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
- min_pad_coverage=0.75, overlap=0.5):
- """
- Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
- partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
- spectrogram slices are returned, so as to make each partial utterance waveform correspond to
- its spectrogram. This function assumes that the mel spectrogram parameters used are those
- defined in params_data.py.
-
- The returned ranges may be indexing further than the length of the waveform. It is
- recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
-
- :param n_samples: the number of samples in the waveform
- :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
- utterance
- :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
- enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
- then the last partial utterance will be considered, as if we padded the audio. Otherwise,
- it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
- utterance, this parameter is ignored so that the function always returns at least 1 slice.
- :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
- utterances are entirely disjoint.
- :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
- respectively the waveform and the mel spectrogram with these slices to obtain the partial
- utterances.
- """
- assert 0 <= overlap < 1
- assert 0 < min_pad_coverage <= 1
-
- samples_per_frame = int((sampling_rate * mel_window_step / 1000))
- n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
- frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
-
- # Compute the slices
- wav_slices, mel_slices = [], []
- steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
- for i in range(0, steps, frame_step):
- mel_range = np.array([i, i + partial_utterance_n_frames])
- wav_range = mel_range * samples_per_frame
- mel_slices.append(slice(*mel_range))
- wav_slices.append(slice(*wav_range))
-
- # Evaluate whether extra padding is warranted or not
- last_wav_range = wav_slices[-1]
- coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
- if coverage < min_pad_coverage and len(mel_slices) > 1:
- mel_slices = mel_slices[:-1]
- wav_slices = wav_slices[:-1]
-
- return wav_slices, mel_slices
-
-
- def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
- """
- Computes an embedding for a single utterance.
-
- # TODO: handle multiple wavs to benefit from batching on GPU
- :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
- :param using_partials: if True, then the utterance is split in partial utterances of
- <partial_utterance_n_frames> frames and the utterance embedding is computed from their
- normalized average. If False, the utterance is instead computed from feeding the entire
- spectrogram to the network.
- :param return_partials: if True, the partial embeddings will also be returned along with the
- wav slices that correspond to the partial embeddings.
- :param kwargs: additional arguments to compute_partial_slices()
- :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
- <return_partials> is True, the partial utterances as a numpy array of float32 of shape
- (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
- returned. If <using_partials> is simultaneously set to False, both these values will be None
- instead.
- """
- # Process the entire utterance if not using partials
- if not using_partials:
- frames = audio.wav_to_mel_spectrogram(wav)
- embed = embed_frames_batch(frames[None, ...])[0]
- if return_partials:
- return embed, None, None
- return embed
-
- # Compute where to split the utterance into partials and pad if necessary
- wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
- max_wave_length = wave_slices[-1].stop
- if max_wave_length >= len(wav):
- wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
-
- # Split the utterance into partials
- frames = audio.wav_to_mel_spectrogram(wav)
- frames_batch = np.array([frames[s] for s in mel_slices])
- partial_embeds = embed_frames_batch(frames_batch)
-
- # Compute the utterance embedding from the partial embeddings
- raw_embed = np.mean(partial_embeds, axis=0)
- embed = raw_embed / np.linalg.norm(raw_embed, 2)
-
- if return_partials:
- return embed, partial_embeds, wave_slices
- return embed
-
-
- def embed_speaker(wavs, **kwargs):
- raise NotImplemented()
-
-
- def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
- if ax is None:
- ax = plt.gca()
-
- if shape is None:
- height = int(np.sqrt(len(embed)))
- shape = (height, -1)
- embed = embed.reshape(shape)
-
- cmap = cm.get_cmap()
- mappable = ax.imshow(embed, cmap=cmap)
- cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
- cbar.set_clim(*color_range)
-
- ax.set_xticks([]), ax.set_yticks([])
- ax.set_title(title)
 
NeuralSeq/data_gen/tts/emotion/model.py DELETED
@@ -1,78 +0,0 @@
-
- from data_gen.tts.emotion.params_model import *
- from data_gen.tts.emotion.params_data import *
- from torch.nn.utils import clip_grad_norm_
- from scipy.optimize import brentq
- from torch import nn
- import numpy as np
- import torch
-
-
- class EmotionEncoder(nn.Module):
- def __init__(self, device, loss_device):
- super().__init__()
- self.loss_device = loss_device
-
- # Network definition
- self.lstm = nn.LSTM(input_size=mel_n_channels,
- hidden_size=model_hidden_size,
- num_layers=model_num_layers,
- batch_first=True).to(device)
- self.linear = nn.Linear(in_features=model_hidden_size,
- out_features=model_embedding_size).to(device)
- self.relu = torch.nn.ReLU().to(device)
-
-
- # Cosine similarity scaling (with fixed initial parameter values)
- self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
- self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
-
- # Loss
- self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
-
- def do_gradient_ops(self):
- # Gradient scale
- self.similarity_weight.grad *= 0.01
- self.similarity_bias.grad *= 0.01
-
- # Gradient clipping
- clip_grad_norm_(self.parameters(), 3, norm_type=2)
-
- def forward(self, utterances, hidden_init=None):
- """
- Computes the embeddings of a batch of utterance spectrograms.
-
- :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
- (batch_size, n_frames, n_channels)
- :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
- batch_size, hidden_size). Will default to a tensor of zeros if None.
- :return: the embeddings as a tensor of shape (batch_size, embedding_size)
- """
- # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
- # and the final cell state.
- out, (hidden, cell) = self.lstm(utterances, hidden_init)
-
- # We take only the hidden state of the last layer
- embeds_raw = self.relu(self.linear(hidden[-1]))
-
- # L2-normalize it
- embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
-
- return embeds
-
- def inference(self, utterances, hidden_init=None):
- """
- Computes the embeddings of a batch of utterance spectrograms.
-
- :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
- (batch_size, n_frames, n_channels)
- :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
- batch_size, hidden_size). Will default to a tensor of zeros if None.
- :return: the embeddings as a tensor of shape (batch_size, embedding_size)
- """
- # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
- # and the final cell state.
-
- out, (hidden, cell) = self.lstm(utterances, hidden_init)
-
- return hidden[-1]
 
NeuralSeq/data_gen/tts/emotion/params_data.py DELETED
@@ -1,29 +0,0 @@
-
- ## Mel-filterbank
- mel_window_length = 25 # In milliseconds
- mel_window_step = 10 # In milliseconds
- mel_n_channels = 40
-
-
- ## Audio
- sampling_rate = 16000
- # Number of spectrogram frames in a partial utterance
- partials_n_frames = 160 # 1600 ms
- # Number of spectrogram frames at inference
- inference_n_frames = 80 # 800 ms
-
-
- ## Voice Activity Detection
- # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
- # This sets the granularity of the VAD. Should not need to be changed.
- vad_window_length = 30 # In milliseconds
- # Number of frames to average together when performing the moving average smoothing.
- # The larger this value, the larger the VAD variations must be to not get smoothed out.
- vad_moving_average_width = 8
- # Maximum number of consecutive silent frames a segment can have.
- vad_max_silence_length = 6
-
-
- ## Audio volume normalization
- audio_norm_target_dBFS = -30
-
 
NeuralSeq/data_gen/tts/emotion/params_model.py DELETED
@@ -1,11 +0,0 @@
-
- ## Model parameters
- model_hidden_size = 256
- model_embedding_size = 256
- model_num_layers = 3
-
-
- ## Training parameters
- learning_rate_init = 1e-4
- speakers_per_batch = 6
- utterances_per_speaker = 20
 
NeuralSeq/data_gen/tts/emotion/test_emotion.py DELETED
@@ -1,184 +0,0 @@
- #!/usr/bin/env python3 -u
- # Copyright (c) Facebook, Inc. and its affiliates.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
- """
- Run inference for pre-processed data with a trained model.
- """
-
- import logging
- import math
- import numpy, math, pdb, sys, random
- import time, os, itertools, shutil, importlib
- import argparse
- import os
- import sys
- import glob
- from sklearn import metrics
- import soundfile as sf
- #import sentencepiece as spm
- import torch
- import inference as encoder
- import torch.nn as nn
- import torch.nn.functional as F
- from pathlib import Path
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.INFO)
- from resemblyzer import VoiceEncoder, preprocess_wav
-
-
- def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
- fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
- fnr = 1 - tpr
-
- fnr = fnr * 100
- fpr = fpr * 100
-
- tunedThreshold = [];
- if target_fr:
- for tfr in target_fr:
- idx = numpy.nanargmin(numpy.absolute((tfr - fnr)))
- tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
-
- for tfa in target_fa:
- idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1]
- tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
-
- idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
- eer = max(fpr[idxE], fnr[idxE])
-
- return (tunedThreshold, eer, fpr, fnr);
-
-
- def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
- # Maximum audio length
- max_audio = max_frames * 160 + 240
-
- # Read wav file and convert to torch tensor
- audio,sample_rate = sf.read(filename)
-
- feats_v0 = torch.from_numpy(audio).float()
- audiosize = audio.shape[0]
-
- if audiosize <= max_audio:
- shortage = math.floor((max_audio - audiosize + 1) / 2)
- audio = numpy.pad(audio, (shortage, shortage), 'constant', constant_values=0)
- audiosize = audio.shape[0]
-
- if evalmode:
- startframe = numpy.linspace(0, audiosize - max_audio, num=num_eval)
- else:
- startframe = numpy.array([numpy.int64(random.random() * (audiosize - max_audio))])
- feats = []
- if evalmode and max_frames == 0:
- feats.append(audio)
- else:
- for asf in startframe:
- feats.append(audio[int(asf):int(asf) + max_audio])
- feat = numpy.stack(feats, axis=0)
- feat = torch.FloatTensor(feat)
- return feat;
-
- def evaluateFromList(listfilename, print_interval=100, test_path='', multi=False):
-
- lines = []
- files = []
- feats = {}
- tstart = time.time()
-
- ## Read all lines
- with open(listfilename) as listfile:
- while True:
- line = listfile.readline();
- if (not line):
- break;
-
- data = line.split();
-
- ## Append random label if missing
- if len(data) == 2: data = [random.randint(0,1)] + data
-
- files.append(data[1])
- files.append(data[2])
- lines.append(line)
-
- setfiles = list(set(files))
- setfiles.sort()
- ## Save all features to file
- for idx, file in enumerate(setfiles):
- # preprocessed_wav = encoder.preprocess_wav(os.path.join(test_path,file))
- # embed = encoder.embed_utterance(preprocessed_wav)
- processed_wav = preprocess_wav(os.path.join(test_path,file))
- embed = voice_encoder.embed_utterance(processed_wav)
-
- torch.cuda.empty_cache()
- ref_feat = torch.from_numpy(embed).unsqueeze(0)
-
- feats[file] = ref_feat
-
- telapsed = time.time() - tstart
-
- if idx % print_interval == 0:
- sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d"%(idx,len(setfiles),idx/telapsed,ref_feat.size()[1]));
-
- print('')
- all_scores = [];
- all_labels = [];
- all_trials = [];
- tstart = time.time()
-
- ## Read files and compute all scores
- for idx, line in enumerate(lines):
-
- data = line.split();
- ## Append random label if missing
- if len(data) == 2: data = [random.randint(0,1)] + data
-
- ref_feat = feats[data[1]]
- com_feat = feats[data[2]]
- ref_feat = ref_feat.cuda()
- com_feat = com_feat.cuda()
- # normalize feats
- ref_feat = F.normalize(ref_feat, p=2, dim=1)
- com_feat = F.normalize(com_feat, p=2, dim=1)
-
- dist = F.pairwise_distance(ref_feat.unsqueeze(-1), com_feat.unsqueeze(-1)).detach().cpu().numpy();
-
- score = -1 * numpy.mean(dist);
-
- all_scores.append(score);
- all_labels.append(int(data[0]));
- all_trials.append(data[1]+" "+data[2])
-
- if idx % print_interval == 0:
- telapsed = time.time() - tstart
- sys.stdout.write("\rComputing %d of %d: %.2f Hz"%(idx,len(lines),idx/telapsed));
- sys.stdout.flush();
-
- print('\n')
-
- return (all_scores, all_labels, all_trials);
-
-
-
- if __name__ == '__main__':
-
- parser = argparse.ArgumentParser("baseline")
- parser.add_argument("--data_root", type=str, help="", required=True)
- parser.add_argument("--list", type=str, help="", required=True)
- parser.add_argument("--model_dir", type=str, help="model parameters for AudioEncoder", required=True)
-
- args = parser.parse_args()
-
-
- # Load the models one by one.
- print("Preparing the encoder...")
- # encoder.load_model(Path(args.model_dir))
- print("Insert the wav file name...")
- voice_encoder = VoiceEncoder().cuda()
-
- sc, lab, trials = evaluateFromList(args.list, print_interval=100, test_path=args.data_root)
- result = tuneThresholdfromScore(sc, lab, [1, 0.1]);
- print('EER %2.4f'%result[1])
 
NeuralSeq/data_gen/tts/txt_processors/__init__.py DELETED
@@ -1 +0,0 @@
- from . import en
 
 
NeuralSeq/data_gen/tts/txt_processors/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (218 Bytes)
 
NeuralSeq/data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-38.pyc DELETED
Binary file (1.9 kB)
 
NeuralSeq/data_gen/tts/txt_processors/__pycache__/en.cpython-38.pyc DELETED
Binary file (2.87 kB)
 
NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py DELETED
@@ -1,47 +0,0 @@
- from data_gen.tts.data_gen_utils import is_sil_phoneme
-
- REGISTERED_TEXT_PROCESSORS = {}
-
- def register_txt_processors(name):
- def _f(cls):
- REGISTERED_TEXT_PROCESSORS[name] = cls
- return cls
-
- return _f
-
-
- def get_txt_processor_cls(name):
- return REGISTERED_TEXT_PROCESSORS.get(name, None)
-
-
- class BaseTxtProcessor:
- @staticmethod
- def sp_phonemes():
- return ['|']
-
- @classmethod
- def process(cls, txt, preprocess_args):
- raise NotImplementedError
-
- @classmethod
- def postprocess(cls, txt_struct, preprocess_args):
- # remove sil phoneme in head and tail
- while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
- txt_struct = txt_struct[1:]
- while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
- txt_struct = txt_struct[:-1]
- if preprocess_args['with_phsep']:
- txt_struct = cls.add_bdr(txt_struct)
- if preprocess_args['add_eos_bos']:
- txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
- return txt_struct
-
- @classmethod
- def add_bdr(cls, txt_struct):
- txt_struct_ = []
- for i, ts in enumerate(txt_struct):
- txt_struct_.append(ts)
- if i != len(txt_struct) - 1 and \
- not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
- txt_struct_.append(['|', ['|']])
- return txt_struct_
 
NeuralSeq/data_gen/tts/txt_processors/en.py DELETED
@@ -1,77 +0,0 @@
- import re
- import unicodedata
-
- from g2p_en import G2p
- from g2p_en.expand import normalize_numbers
- from nltk import pos_tag
- from nltk.tokenize import TweetTokenizer
-
- from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
- from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
-
- class EnG2p(G2p):
- word_tokenize = TweetTokenizer().tokenize
-
- def __call__(self, text):
- # preprocessing
- words = EnG2p.word_tokenize(text)
- tokens = pos_tag(words) # tuples of (word, tag)
-
- # steps
- prons = []
- for word, pos in tokens:
- if re.search("[a-z]", word) is None:
- pron = [word]
-
- elif word in self.homograph2features: # Check homograph
- pron1, pron2, pos1 = self.homograph2features[word]
- if pos.startswith(pos1):
- pron = pron1
- else:
- pron = pron2
- elif word in self.cmu: # lookup CMU dict
- pron = self.cmu[word][0]
- else: # predict for oov
- pron = self.predict(word)
-
- prons.extend(pron)
- prons.extend([" "])
-
- return prons[:-1]
-
-
- @register_txt_processors('en')
- class TxtProcessor(BaseTxtProcessor):
- g2p = EnG2p()
-
- @staticmethod
- def preprocess_text(text):
- text = normalize_numbers(text)
- text = ''.join(char for char in unicodedata.normalize('NFD', text)
- if unicodedata.category(char) != 'Mn') # Strip accents
- text = text.lower()
- text = re.sub("[\'\"()]+", "", text)
- text = re.sub("[-]+", " ", text)
- text = re.sub(f"[^ a-z{PUNCS}]", "", text)
- text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> !
- text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
- text = text.replace("i.e.", "that is")
- text = text.replace("i.e.", "that is")
- text = text.replace("etc.", "etc")
- text = re.sub(f"([{PUNCS}])", r" \1 ", text)
- text = re.sub(rf"\s+", r" ", text)
- return text
-
- @classmethod
- def process(cls, txt, preprocess_args):
- txt = cls.preprocess_text(txt).strip()
- phs = cls.g2p(txt)
- txt_struct = [[w, []] for w in txt.split(" ")]
- i_word = 0
- for p in phs:
- if p == ' ':
- i_word += 1
- else:
- txt_struct[i_word][1].append(p)
- txt_struct = cls.postprocess(txt_struct, preprocess_args)
- return txt_struct, txt
 
NeuralSeq/data_gen/tts/txt_processors/zh.py DELETED
@@ -1,43 +0,0 @@
- import re
- import jieba
- from pypinyin import pinyin, Style
- from data_gen.tts.data_gen_utils import PUNCS
- from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
- from utils.text_norm import NSWNormalizer
-
-
- class TxtProcessor(BaseTxtProcessor):
- table = {ord(f): ord(t) for f, t in zip(
- u':,。!?【】()%#@&1234567890',
- u':,.!?[]()%#@&1234567890')}
-
- @staticmethod
- def preprocess_text(text):
- text = text.translate(TxtProcessor.table)
- text = NSWNormalizer(text).normalize(remove_punc=False)
- text = re.sub("[\'\"()]+", "", text)
- text = re.sub("[-]+", " ", text)
- text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
- text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
- text = re.sub(f"([{PUNCS}])", r" \1 ", text)
- text = re.sub(rf"\s+", r"", text)
- text = re.sub(rf"[A-Za-z]+", r"$", text)
- return text
-
- @classmethod
- def process(cls, txt, pre_align_args):
- txt = cls.preprocess_text(txt)
- shengmu = pinyin(txt, style=Style.INITIALS) # https://blog.csdn.net/zhoulei124/article/details/89055403
- yunmu_finals = pinyin(txt, style=Style.FINALS)
- yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3)
- yunmu = [[t[0] + '5'] if t[0] == f[0] else t for f, t in zip(yunmu_finals, yunmu_tone3)] \
- if pre_align_args['use_tone'] else yunmu_finals
-
- assert len(shengmu) == len(yunmu)
- phs = ["|"]
- for a, b, c in zip(shengmu, yunmu, yunmu_finals):
- if a[0] == c[0]:
- phs += [a[0], "|"]
- else:
- phs += [a[0], b[0], "|"]
- return phs, txt