| flow: !new:cosyvoice2.flow.flow.CausalMaskedDiffWithXvec | |
| input_size: 512 | |
| output_size: 80 | |
| spk_embed_dim: 192 | |
| output_type: 'mel' | |
| vocab_size: 6561 | |
| encoder: !new:cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2 | |
| input_size: 512 | |
| output_size: 512 | |
| input_layer: 'linear' | |
| pre_lookahead_len: 3 | |
| num_blocks: 6 | |
| num_up_blocks: 4 | |
| up_stride: 2 | |
| up_scale_factor: 2 | |
| attention_heads: 8 | |
| pos_enc_layer_type: 'rel_pos_espnet' | |
| selfattention_layer_type: 'rel_selfattn' | |
| key_bias: true | |
| linear_units: 2048 | |
| dropout_rate: 0.1 | |
| positional_dropout_rate: 0.1 | |
| attention_dropout_rate: 0.1 | |
| normalize_before: True | |
| decoder: !new:cosyvoice2.flow.flow_matching.CausalConditionalCFM | |
| inference_cfg_rate: 0.7 | |
| estimator: !new:cosyvoice2.flow.decoder_dit.DiT | |
| in_channels: 320 | |
| out_channels: 80 | |
| mlp_ratio: 4.0 | |
| depth: 16 | |
| num_heads: 8 | |
| head_dim: 64 | |
| hidden_size: 512 | |