Saripudin committed on
Commit da52ea8 · verified · 1 parent: 9ebf9a0

Upload indonesian_language_gpt_v1.yml with huggingface_hub
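The commit message indicates the file was pushed with the huggingface_hub client. A minimal sketch of the kind of call involved (the repo id below is a placeholder, not taken from this commit):

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="indonesian_language_gpt_v1.yml",  # local path of the config being uploaded
    path_in_repo="indonesian_language_gpt_v1.yml",     # destination path inside the Hub repo
    repo_id="<username>/<repo-name>",                  # placeholder: the target repository on the Hub
    commit_message="Upload indonesian_language_gpt_v1.yml with huggingface_hub",
)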

Files changed (1)
  1. indonesian_language_gpt_v1.yml +170 -0
indonesian_language_gpt_v1.yml ADDED
@@ -0,0 +1,170 @@
+ name: indonesian_language_gpt_v1
+ model: extensibletrainer
+ scale: 1
+ gpu_ids: [0] # <-- use this unless you have multiple GPUs
+ start_step: 0 # -1 causes 0.pth to be saved!
+ checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
+ fp16: false # TODO: why does enabling this together with 8-bit slow down performance?
+ use_8bit: true
+ wandb: true # <-- enable to log to wandb. tensorboard logging is always enabled.
+ wandb_project_name: tortoise
+ use_tb_logger: true
+
+ datasets:
+   train:
+     name: train_dataset
+     n_workers: 8 # number of dataloader worker processes
+     batch_size: 128 # This leads to ~16 GB of VRAM usage on my 3090.
+     mode: paired_voice_audio
+     path: ../../dataset-v1/train.txt
+     fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format (see the example after this block)
+     phase: train
+     max_wav_length: 255995
+     max_text_length: 200
+     sample_rate: 22050
+     load_conditioning: True
+     num_conditioning_candidates: 2
+     conditioning_length: 44000
+     use_bpe_tokenizer: True
+     load_aligned_codes: False
+     tokenizer_vocab: ../indonesia_tokenizer_v1.json
+   val:
+     name: val_dataset
+     n_workers: 8
+     batch_size: 128 # this could probably be higher
+     mode: paired_voice_audio
+     path: ../../dataset-v1/val.txt
+     fetcher_mode: ['lj']
+     phase: val # validation phase (possibly broken)
+     max_wav_length: 255995
+     max_text_length: 200
+     sample_rate: 22050
+     load_conditioning: True
+     num_conditioning_candidates: 2
+     conditioning_length: 44000
+     use_bpe_tokenizer: True
+     load_aligned_codes: False
+     tokenizer_vocab: ../indonesia_tokenizer_v1.json
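+   # The 'lj' fetcher reads LJSpeech-style metadata: each line of train.txt / val.txt is
+   # expected to be a pipe-delimited audio path and transcript, roughly like:
+   #   wavs/clip_0001.wav|Transcript text for this clip.
+   # (illustrative example only; the exact columns depend on your DLAS fetcher version)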
+
+ steps:
+   gpt_train:
+     training: gpt
+     loss_log_buffer: 500 # size of the rolling loss-logging buffer
+
+     # Generally follows the recipe from the DALL-E paper.
+     optimizer: adamw # this should be adamw_zero if you're using distributed training
+     #optimizer: lion
+     optimizer_params:
+       lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because this is fine-tuning, but **you should experiment with this value**
+       #lr: !!float 2e-6 # USE LOWER LR for LION
+       triton: false # ONLY RELEVANT FOR LION
+       weight_decay: !!float 1e-2
+       beta1: 0.9
+       beta2: 0.96
+     clip_grad_eps: 4
+
+     injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
+       paired_to_mel:
+         type: torch_mel_spectrogram
+         mel_norm_file: ../experiments/clips_mel_norms.pth
+         in: wav
+         out: paired_mel
+       paired_cond_to_mel:
+         type: for_each
+         subtype: torch_mel_spectrogram
+         mel_norm_file: ../experiments/clips_mel_norms.pth
+         in: conditioning
+         out: paired_conditioning_mel
+       to_codes:
+         type: discrete_token
+         in: paired_mel
+         out: paired_mel_codes
+         dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
+       paired_fwd_text:
+         type: generator
+         generator: gpt
+         in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
+         out: [loss_text_ce, loss_mel_ce, logits]
+     losses:
+       text_ce:
+         type: direct
+         weight: .01
+         key: loss_text_ce
+       mel_ce:
+         type: direct
+         weight: 1
+         key: loss_mel_ce
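+       # The step optimizes the weighted sum of the direct losses above,
+       # i.e. roughly total_loss = 0.01 * loss_text_ce + 1.0 * loss_mel_ce.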
+
+ networks:
+   gpt:
+     type: generator
+     which_model_G: unified_voice2 # none of the unified_voice*.py files exactly matches the tortoise inference code: 3 and 4 have an "alignment_head" (purpose unclear), and 2 lacks the types=1 parameter.
+     kwargs:
+       layers: 30 # WAS 8
+       model_dim: 1024 # WAS 512
+       heads: 16 # WAS 8
+       max_text_tokens: 402 # WAS 120
+       max_mel_tokens: 604 # WAS 250
+       max_conditioning_inputs: 2 # WAS 1
+       mel_length_compression: 1024
+       number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
+       number_mel_codes: 8194
+       start_mel_token: 8192
+       stop_mel_token: 8193
+       start_text_token: 255
+       train_solo_embeddings: False # missing in uv3/4
+       use_mel_codes_as_input: True # ditto
+       checkpointing: True
+       tortoise_compat: True
+       #types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
+       #only_alignment_head: False # uv3/4
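+       # Rough sanity check on these sizes: max_wav_length 255995 samples is ~11.6 s at
+       # 22050 Hz, which at mel_length_compression 1024 is ~250 mel codes, well under
+       # max_mel_tokens 604; conditioning_length 44000 samples is ~2 s per conditioning clip.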
+
+ path:
+   pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
+   strict_load: true
+   #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
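+   # The cached tortoise autoregressive model is typically downloaded to something like
+   # ~/.cache/tortoise/models/autoregressive.pth (the exact path may differ depending on your tortoise-tts install).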
+
+ # As far as I know, all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
+ train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
+   niter: 50000
+   warmup_iter: -1
+   mega_batch_factor: 1 # <-- Gradient accumulation factor. If you run out of memory (OOM), increase this to 2, 4, or 8.
+   val_freq: 318 # TODO: set this to epoch size * something
+
+   default_lr_scheme: MultiStepLR
+   gen_lr_steps: [5000, 10000, 15000, 20000] #[50000, 100000, 140000, 180000]
+   lr_gamma: 0.5
+   ema_enabled: false
+   #manual_seed: 1337 # add this if you want reproducibility
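+   # Assuming standard MultiStepLR behaviour, the learning rate is multiplied by lr_gamma (0.5)
+   # at each milestone in gen_lr_steps, so roughly:
+   # 1e-5 -> 5e-6 @ 5k -> 2.5e-6 @ 10k -> 1.25e-6 @ 15k -> 6.25e-7 @ 20k steps.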
+
+ eval:
+   pure: true # see train.py
+
+ logger:
+   print_freq: 318 # TODO: set this to epoch size
+   save_checkpoint_freq: 500 # CHANGEME: you should probably increase this; saving checkpoints is really slow
+   visuals: [gen, mel] # TODO: figure this out
+   visual_debug_rate: 500
+   is_mel_spectrogram: true
+   disable_state_saving: true # CHANGEME if you plan to halt training in between and resume it later
+
+ upgrades:
+   # Variable: number_of_checkpoints_to_save
+   # Description: how many checkpoints to keep on disk (1 checkpoint = one .pth file, roughly 6.8 GB)
+   # Type: integer
+   # Value: should match number_of_states_to_save;
+   #   a value smaller than 1 turns this option off; there is no maximum value.
+   #   For Colab, use 1 or 2 for gDrive and 5 for the instance drive.
+   #   1 == keep the last saved checkpoint + the last saved state (about 6.8 GB).
+   #   2 == keep the last 2 saved checkpoints + states (about 2 x 6.8 GB, roughly 13.6 GB).
+   number_of_checkpoints_to_save: 1
+   # Variable: number_of_states_to_save
+   # Description: how many training states to keep on disk (1 state is roughly 3.4 GB);
+   #   if disable_state_saving is set to true, this option is inactive.
+   # Type: integer
+   # Value: should match number_of_checkpoints_to_save;
+   #   a value smaller than 1 turns this option off; there is no maximum value.
+   #   For Colab, use 1 or 2 for gDrive and 5 for the instance drive.
+   #   1 == keep the last saved state (about 3.4 GB).
+   #   2 == keep the last 2 saved states (about 2 x 3.4 GB, roughly 6.8 GB).
+   number_of_states_to_save: 1
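+   # Rough disk math: with save_checkpoint_freq 500 and niter 50000, up to ~100 checkpoints could be
+   # written over a full run; keeping number_of_checkpoints_to_save: 1 prunes that back to about one
+   # ~6.8 GB checkpoint (plus ~3.4 GB of state if state saving is enabled).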