File size: 4,126 Bytes
f47fe9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# pytorch_lightning==1.8.6
# Fixed seed value for the run, so repeated launches start from the same
# seed. Presumably consumed by the CLI's seed_everything hook - confirm.
seed_everything: 3407
# Trainer section: settings for the training loop. Many entries are explicit
# null/false values; the comments below call out the ones that are set.
trainer:
  # Logger is given as class_path + init_args; here a TensorBoardLogger.
  logger:
    class_path: pytorch_lightning.loggers.TensorBoardLogger
    init_args:
      # save_dir and name are the only location fields set; version, sub_dir
      # and logdir are null. Presumably logs end up under
      # ./result/lightning_logs/ - confirm against TensorBoardLogger.
      save_dir: ./result/
      name: lightning_logs
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      logdir: null
      comment: ''
      purge_step: null
      # NOTE(review): max_queue, flush_secs, filename_suffix, write_to_disk
      # and comet_config look like pass-through options for the underlying
      # summary writer rather than logger options - confirm.
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
      write_to_disk: true
      comet_config:
        disabled: true
  enable_checkpointing: true
  # Callback list: three pytorch_lightning callbacks plus one project callback.
  callbacks:
  # Learning-rate monitor; logging_interval is left null and momentum logging
  # is off.
  - class_path: pytorch_lightning.callbacks.LearningRateMonitor
    init_args:
      logging_interval: null
      log_momentum: false
  # Model summary limited to max_depth 2.
  - class_path: pytorch_lightning.callbacks.ModelSummary
    init_args:
      max_depth: 2
  # Checkpointing: monitors val_loss with mode min, save_top_k 10, save_last
  # true, and a cadence of every_n_train_steps 1000 (every_n_epochs and
  # train_time_interval are null). dirpath is null, so the output directory
  # is not chosen in this file.
  # NOTE(review): the cadence is in train steps while the monitored key is
  # val_loss - confirm val_loss is available when these checkpoints fire.
  - class_path: pytorch_lightning.callbacks.ModelCheckpoint
    init_args:
      dirpath: null
      # Filename template with epoch, step and val_loss (4 decimal places)
      # placeholders.
      filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
      monitor: val_loss
      verbose: false
      save_last: true
      save_top_k: 10
      save_weights_only: false
      mode: min
      auto_insert_metric_name: true
      every_n_train_steps: 1000
      train_time_interval: null
      every_n_epochs: null
      save_on_train_epoch_end: null
  # Project-local callback, configured with no init_args.
  - class_path: inspiremusic.wavtokenizer.decoder.helpers.GradNormCallback
  default_root_dir: null
  # Gradient clipping is not configured (both fields null).
  gradient_clip_val: null
  gradient_clip_algorithm: null
  num_nodes: 1
  num_processes: null
  # devices is -1 with accelerator gpu (set further below); presumably
  # "use all visible GPUs" - confirm against the Trainer docs.
  devices: -1
  gpus: null
  auto_select_gpus: false
  tpu_cores: null
  ipus: null
  enable_progress_bar: true
  overfit_batches: 0.0
  track_grad_norm: -1
  check_val_every_n_epoch: 1
  fast_dev_run: false
  accumulate_grad_batches: null
  # max_steps is the only stopping limit set here: max_epochs, min_epochs,
  # min_steps and max_time are all null.
  max_epochs: null
  min_epochs: null
  max_steps: 20000000
  min_steps: null
  max_time: null
  limit_train_batches: null
  # Integer value (not a fraction); presumably caps validation at 100
  # batches - confirm.
  limit_val_batches: 100
  limit_test_batches: null
  limit_predict_batches: null
  val_check_interval: null
  log_every_n_steps: 1000
  # Hardware/precision: gpu accelerator, ddp strategy, precision 32.
  accelerator: gpu
  strategy: ddp
  sync_batchnorm: false
  precision: 32
  enable_model_summary: true
  num_sanity_val_steps: 2
  resume_from_checkpoint: null
  profiler: null
  benchmark: null
  deterministic: null
  reload_dataloaders_every_n_epochs: 0
  auto_lr_find: false
  replace_sampler_ddp: true
  detect_anomaly: false
  auto_scale_batch_size: false
  plugins: null
  amp_backend: native
  amp_level: null
  move_metrics_to_cpu: false
  multiple_trainloader_mode: max_size_cycle
  inference_mode: true
# Top-level checkpoint path is null. Resuming is configured separately through
# the resume / resume_config / resume_model entries under model.init_args.
ckpt_path: null
# Data module: a project class configured with separate train and validation
# parameter sets.
data:
  class_path: inspiremusic.wavtokenizer.decoder.dataset.VocosDataModule
  init_args:
    # Training set: file list train.scp (relative path), batch size 38.
    train_params:
      filelist_path: train.scp
      sampling_rate: 24000
      # 72000 / 24000 = 3; presumably 3-second crops if num_samples counts
      # audio samples per item - confirm against the dataset class.
      num_samples: 72000
      batch_size: 38
      num_workers: 8
    # Validation set: file list test.scp (relative path), batch size 10;
    # sampling_rate, num_samples and num_workers match the training values.
    val_params:
      filelist_path: test.scp
      sampling_rate: 24000
      num_samples: 72000
      batch_size: 10
      num_workers: 8
# Model: project WavTokenizer experiment class, assembled from three nested
# components (feature_extractor, backbone, head) plus training options.
model:
  class_path: inspiremusic.wavtokenizer.decoder.experiment.WavTokenizer
  init_args:
    # Feature extractor component (EncodecFeatures).
    feature_extractor:
      class_path: inspiremusic.wavtokenizer.decoder.feature_extractors.EncodecFeatures
      init_args:
        encodec_model: encodec_24khz
        # Four entries, all 6.6.
        bandwidths:
        - 6.6
        - 6.6
        - 6.6
        - 6.6
        train_codebooks: true
        num_quantizers: 1
        # NOTE(review): key is spelled "dowmsamples" (sic). Presumably it
        # matches the parameter name in EncodecFeatures, so do not rename it
        # here without checking the class.
        # The factors multiply to 8*5*4*2 = 320, equal to head hop_length.
        dowmsamples:
        - 8
        - 5
        - 4
        - 2
        vq_bins: 4096
        vq_kmeans: 200
    # Backbone component (VocosBackbone).
    backbone:
      class_path: inspiremusic.wavtokenizer.decoder.models.VocosBackbone
      init_args:
        input_channels: 512
        # dim 768 is also the dim given to the head below;
        # intermediate_dim 2304 = 3 * 768.
        dim: 768
        intermediate_dim: 2304
        num_layers: 12
        layer_scale_init_value: null
        # 4 equals the number of bandwidths entries above; presumably one
        # embedding per bandwidth - confirm.
        adanorm_num_embeddings: 4
    # Head component (ISTFTHead).
    head:
      class_path: inspiremusic.wavtokenizer.decoder.heads.ISTFTHead
      init_args:
        dim: 768
        # n_fft 1280 = 4 * hop_length 320.
        n_fft: 1280
        hop_length: 320
        padding: same
    # Resume settings: resume is true (last key of this mapping) and both
    # paths are relative; how they are resolved is up to the consuming class
    # and is not visible here.
    resume_config: config.yaml
    resume_model: last.ckpt
    # Same value as data.*.sampling_rate (24000).
    sample_rate: 24000
    initial_learning_rate: 0.0001
    num_warmup_steps: 0
    # Loss weights: mel 45.0, mrd 1.0.
    mel_loss_coeff: 45.0
    mrd_loss_coeff: 1.0
    pretrain_mel_steps: 0
    decay_mel_coeff: false
    # Evaluation toggles: utmos off, pesq on, periodicity on.
    evaluate_utmos: false
    evaluate_pesq: true
    # NOTE(review): key is spelled "evaluate_periodicty" (sic); presumably it
    # matches the parameter name in the experiment class - verify before
    # renaming.
    evaluate_periodicty: true
    resume: true