Upload 7 files
Browse files- zipvoice/zipvoice_base.json +26 -0
- zipvoice_dialog/zipvoice_base.json +26 -0
- zipvoice_dialog_opendialog/zipvoice_base.json +26 -0
- zipvoice_dialog_stereo/zipvoice_base.json +26 -0
- zipvoice_distill/zipvoice_base.json +26 -0
- zipvoice_distill_libritts/zipvoice_base.json +26 -0
- zipvoice_libritts/zipvoice_base.json +26 -0
zipvoice/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|
zipvoice_dialog/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|
zipvoice_dialog_opendialog/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|
zipvoice_dialog_stereo/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|
zipvoice_distill/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|
zipvoice_distill_libritts/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|
zipvoice_libritts/zipvoice_base.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model" : {
|
| 3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
| 4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
| 5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
| 6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
| 7 |
+
"fm_decoder_num_heads" : 4,
|
| 8 |
+
"fm_decoder_dim" : 512,
|
| 9 |
+
"text_encoder_num_layers" : 4,
|
| 10 |
+
"text_encoder_feedforward_dim" : 512,
|
| 11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
| 12 |
+
"text_encoder_num_heads" : 4,
|
| 13 |
+
"text_encoder_dim" : 192,
|
| 14 |
+
"query_head_dim" : 32,
|
| 15 |
+
"value_head_dim" : 12,
|
| 16 |
+
"pos_head_dim" : 4,
|
| 17 |
+
"pos_dim" : 48,
|
| 18 |
+
"time_embed_dim" : 192,
|
| 19 |
+
"text_embed_dim" : 192,
|
| 20 |
+
"feat_dim": 100
|
| 21 |
+
},
|
| 22 |
+
"feature" : {
|
| 23 |
+
"sampling_rate": 24000,
|
| 24 |
+
"type": "vocos"
|
| 25 |
+
}
|
| 26 |
+
}
|