{
  "model_type": "TaDiCodec",
  "preprocess": {
    "hop_size": 480,
    "sample_rate": 24000,
    "n_fft": 1920,
    "num_mels": 128,
    "win_size": 1920,
    "fmin": 0,
    "fmax": 12000,
    "mel_var": 8.14,
    "mel_mean": -4.92
  },
  "model": {
    "tadicodec": {
      "mel_dim": 128,
      "in_dim": 128,
      "hidden_size": 1024,
      "encoder_num_layers": 8,
      "decoder_num_layers": 16,
      "num_heads": 16,
      "cond_drop_p": 0.2,
      "context_drop_p": 0.2,
      "down_sample_factor": 8,
      "vq_emb_dim": 14,
      "use_text_cond": true,
      "text_vocab_size": 32100,
      "cond_dim": 1024,
      "cond_scale_factor": 1,
      "sigma": 1e-5,
      "time_scheduler": "linear",
      "vq_type": "bsq"
    },
    "vocos": {
      "input_channels": 128,
      "dim": 1024,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 1920,
      "hop_size": 480,
      "padding": "same"
    }
  }
}