ONNX
Splend1dchan committed on
Commit
94dc36a
·
1 Parent(s): 33bdd47

upload models

Browse files
campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-to-speech"}
cosyvoice.yaml ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 22050
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1024
11
+ llm_output_size: 1024
12
+ spk_embed_dim: 192
13
+
14
+ # model params
15
+ # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users may find all corresponding classes/functions from one single yaml.
16
+ # for system/third_party class/function, we do not require this.
17
+ llm: !new:cosyvoice.llm.llm.TransformerLM
18
+ text_encoder_input_size: !ref <text_encoder_input_size>
19
+ llm_input_size: !ref <llm_input_size>
20
+ llm_output_size: !ref <llm_output_size>
21
+ text_token_size: 51866
22
+ speech_token_size: 4096
23
+ length_normalized_loss: True
24
+ lsm_weight: 0
25
+ spk_embed_dim: !ref <spk_embed_dim>
26
+ text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
27
+ input_size: !ref <text_encoder_input_size>
28
+ output_size: 1024
29
+ attention_heads: 16
30
+ linear_units: 4096
31
+ num_blocks: 6
32
+ dropout_rate: 0.1
33
+ positional_dropout_rate: 0.1
34
+ attention_dropout_rate: 0
35
+ normalize_before: True
36
+ input_layer: 'linear'
37
+ pos_enc_layer_type: 'rel_pos_espnet'
38
+ selfattention_layer_type: 'rel_selfattn'
39
+ use_cnn_module: False
40
+ macaron_style: False
41
+ use_dynamic_chunk: False
42
+ use_dynamic_left_chunk: False
43
+ static_chunk_size: 1
44
+ llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
45
+ input_size: !ref <llm_input_size>
46
+ output_size: !ref <llm_output_size>
47
+ attention_heads: 16
48
+ linear_units: 4096
49
+ num_blocks: 14
50
+ dropout_rate: 0.1
51
+ positional_dropout_rate: 0.1
52
+ attention_dropout_rate: 0
53
+ input_layer: 'linear_legacy'
54
+ pos_enc_layer_type: 'rel_pos_espnet'
55
+ selfattention_layer_type: 'rel_selfattn'
56
+ static_chunk_size: 1
57
+
58
+ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
59
+ input_size: 512
60
+ output_size: 80
61
+ spk_embed_dim: !ref <spk_embed_dim>
62
+ output_type: 'mel'
63
+ vocab_size: 4096
64
+ input_frame_rate: 50
65
+ only_mask_loss: True
66
+ encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
67
+ output_size: 512
68
+ attention_heads: 8
69
+ linear_units: 2048
70
+ num_blocks: 6
71
+ dropout_rate: 0.1
72
+ positional_dropout_rate: 0.1
73
+ attention_dropout_rate: 0.1
74
+ normalize_before: True
75
+ input_layer: 'linear'
76
+ pos_enc_layer_type: 'rel_pos_espnet'
77
+ selfattention_layer_type: 'rel_selfattn'
78
+ input_size: 512
79
+ use_cnn_module: False
80
+ macaron_style: False
81
+ length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
82
+ channels: 80
83
+ sampling_ratios: [1, 1, 1, 1]
84
+ decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
85
+ in_channels: 240
86
+ n_spks: 1
87
+ spk_emb_dim: 80
88
+ cfm_params: !new:omegaconf.DictConfig
89
+ content:
90
+ sigma_min: 1e-06
91
+ solver: 'euler'
92
+ t_scheduler: 'cosine'
93
+ training_cfg_rate: 0.2
94
+ inference_cfg_rate: 0.7
95
+ reg_loss_type: 'l1'
96
+ estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
97
+ in_channels: 320
98
+ out_channels: 80
99
+ channels: [256, 256]
100
+ dropout: 0
101
+ attention_head_dim: 64
102
+ n_blocks: 4
103
+ num_mid_blocks: 12
104
+ num_heads: 8
105
+ act_fn: 'gelu'
106
+
107
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
108
+ in_channels: 80
109
+ base_channels: 512
110
+ nb_harmonics: 8
111
+ sampling_rate: !ref <sample_rate>
112
+ nsf_alpha: 0.1
113
+ nsf_sigma: 0.003
114
+ nsf_voiced_threshold: 10
115
+ upsample_rates: [8, 8]
116
+ upsample_kernel_sizes: [16, 16]
117
+ istft_params:
118
+ n_fft: 16
119
+ hop_len: 4
120
+ resblock_kernel_sizes: [3, 7, 11]
121
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
122
+ source_resblock_kernel_sizes: [7, 11]
123
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
124
+ lrelu_slope: 0.1
125
+ audio_limit: 0.99
126
+ f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
127
+ num_class: 1
128
+ in_channels: 80
129
+ cond_channels: 512
130
+
131
+ # processor functions
132
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
133
+ get_tokenizer: !name:whisper.tokenizer.get_tokenizer
134
+ multilingual: True
135
+ num_languages: 100
136
+ language: 'en'
137
+ task: 'transcribe'
138
+ allowed_special: 'all'
139
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
140
+ get_tokenizer: !ref <get_tokenizer>
141
+ allowed_special: !ref <allowed_special>
142
+ filter: !name:cosyvoice.dataset.processor.filter
143
+ max_length: 40960
144
+ min_length: 0
145
+ token_max_length: 200
146
+ token_min_length: 1
147
+ resample: !name:cosyvoice.dataset.processor.resample
148
+ resample_rate: !ref <sample_rate>
149
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
150
+ n_fft: 1024
151
+ num_mels: 80
152
+ sampling_rate: !ref <sample_rate>
153
+ hop_size: 256
154
+ win_size: 1024
155
+ fmin: 0
156
+ fmax: 8000
157
+ center: False
158
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
159
+ feat_extractor: !ref <feat_extractor>
160
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
161
+ normalize: True
162
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
163
+ shuffle_size: 1000
164
+ sort: !name:cosyvoice.dataset.processor.sort
165
+ sort_size: 500 # sort_size should be less than shuffle_size
166
+ batch: !name:cosyvoice.dataset.processor.batch
167
+ batch_type: 'dynamic'
168
+ max_frames_in_batch: 2000
169
+ padding: !name:cosyvoice.dataset.processor.padding
170
+ use_spk_embedding: False # change to True during sft
171
+
172
+ # dataset processor pipeline
173
+ data_pipeline: [
174
+ !ref <parquet_opener>,
175
+ !ref <tokenize>,
176
+ !ref <filter>,
177
+ !ref <resample>,
178
+ !ref <compute_fbank>,
179
+ !ref <parse_embedding>,
180
+ !ref <shuffle>,
181
+ !ref <sort>,
182
+ !ref <batch>,
183
+ !ref <padding>,
184
+ ]
185
+
186
+ # train conf
187
+ train_conf:
188
+ optim: adam
189
+ optim_conf:
190
+ lr: 0.001 # change to 1e-5 during sft
191
+ scheduler: warmuplr # change to constantlr during sft
192
+ scheduler_conf:
193
+ warmup_steps: 2500
194
+ max_epoch: 200
195
+ grad_clip: 5
196
+ accum_grad: 2
197
+ log_interval: 100
198
+ save_per_step: -1
flow.decoder.estimator.fp32.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:482e27304e8242dc3d7bc9989bad84ec7835394ddf9e78826337d9484a4ee3ee
3
+ size 328627300
flow.encoder.fp32.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:637f9ef66ba7ecd677b4e1d3d0b1af1e0c6d744485782553ca5bc1ecfa4cf0f7
3
+ size 103558803
flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd80b089444a95e52956c57cdf177d7f6017a5af13b8a697717628a1d2be6b55
3
+ size 419900943
hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012
3
+ size 81896716
llm.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:310c74da4315729c6069c0d0efdfb5bc9b433dc75c890f6ebaf0d18373bda52f
3
+ size 1243027089
speech_tokenizer_v1.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e
3
+ size 522624269
spk2info.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:652d571b2efec1be6dc14345c2bae52eb41affe4b5d3fa4174548e059bd633b4
3
+ size 1317821