k2-fsa
/

ZipVoice

@@ -1,13 +1,12 @@
 {
   "model" : {
-    "fm_decoder_downsampling_factor" : "1,2,4,2,1",
-    "fm_decoder_num_layers" : "2,2,4,4,4",
-    "fm_decoder_cnn_module_kernel" : "31,15,7,15,31",
     "fm_decoder_feedforward_dim" : 1536,
     "fm_decoder_num_heads" : 4,
     "fm_decoder_dim" : 512,
-    "text_encoder_downsampling_factor" : "1",
-    "text_encoder_num_layers" : "4",
     "text_encoder_feedforward_dim" : 512,
     "text_encoder_cnn_module_kernel" : 9,
     "text_encoder_num_heads" : 4,
@@ -17,12 +16,11 @@
     "pos_head_dim" : 4,
     "pos_dim" : 48,
     "time_embed_dim" : 192,
-    "text_embed_dim" : 192
   },
   "feature" : {
     "sampling_rate": 24000,
-    "feat_dim": 100,
-    "n_fft" : 1024,
-    "hop_length" : 256
   }
 }

 {
   "model" : {
+    "fm_decoder_downsampling_factor" : [1,2,4,2,1],
+    "fm_decoder_num_layers" : [2,2,4,4,4],
+    "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
     "fm_decoder_feedforward_dim" : 1536,
     "fm_decoder_num_heads" : 4,
     "fm_decoder_dim" : 512,
+    "text_encoder_num_layers" : 4,
     "text_encoder_feedforward_dim" : 512,
     "text_encoder_cnn_module_kernel" : 9,
     "text_encoder_num_heads" : 4,
     "pos_head_dim" : 4,
     "pos_dim" : 48,
     "time_embed_dim" : 192,
+    "text_embed_dim" : 192,
+    "feat_dim": 100
   },
   "feature" : {
     "sampling_rate": 24000,
+    "type": "vocos"
   }
 }