model_type: spark-tts
architectures:
  - SparkTTSModel
auto_map:
  AutoConfig: configuration_spark_tts.SparkTTSConfig
  AutoModel: modeling_spark_tts.SparkTTSModel
  AutoProcessor: processing_spark_tts.SparkTTSProcessor
processor_class: processing_spark_tts.SparkTTSProcessor
llm_model_name_or_path: ./LLM
bicodec_model_name_or_path: ./BiCodec
wav2vec2_model_name_or_path: ./wav2vec2-large-xlsr-53
sample_rate: 16000
highpass_cutoff_freq: 40
latent_hop_length: 320
ref_segment_duration: 6.0
volume_normalize: true
torch_dtype: bfloat16
transformers_version: "4.50.3"
_commit_hash: null
bicodec_config:
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    mel_fmax: null
    num_mels: 128
  encoder_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    sample_ratios: [1, 1]
  decoder_config:
    input_channel: 1024
    channels: 1536
    rates: [8, 5, 4, 2]
    kernel_sizes: [16, 11, 8, 4]
  quantizer_config:
    input_dim: 1024
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    decay: 0.99
    threshold_ema_dead_code: 0.2
  speaker_encoder_config:
    input_dim: 128
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1
  prenet_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    condition_dim: 1024
    sample_ratios: [1, 1]
    use_tanh_at_final: false
  postnet_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: false
|