Text2Text Generation
speechbrain
English
G2P
Grapheme-to-Phoneme
speechbrainteam committed on
Commit
45e4cc9
·
verified ·
1 Parent(s): 074c0fe

Delete hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +0 -533
hyperparams.yaml DELETED
@@ -1,533 +0,0 @@
1
- # Generated 2022-07-09 from:
2
- # /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
3
- # yamllint disable
4
- # ################################
5
- # Model: LSTM (encoder) + GRU (decoder) (tokenized)
6
- # Authors:
7
- # Loren Lugosch & Mirco Ravanelli 2020
8
- # Artem Ploujnikov 2021
9
- # ################################
10
-
11
- # Seed needs to be set at top of yaml, before objects with parameters are made
12
- seed: 1234
13
- __set_seed: !apply:torch.manual_seed [1234]
14
-
15
-
16
- # Tokenizers
17
- char_tokenize: false
18
- char_token_type: unigram # ["unigram", "bpe", "char"]
19
- char_token_output: 512
20
- char_token_wordwise: true
21
- phn_tokenize: false
22
- phn_token_type: unigram # ["unigram", "bpe", "char"]
23
- phn_token_output: 512 # index(blank/eos/bos/unk) = 0
24
- phn_token_wordwise: true
25
- character_coverage: 1.0
26
-
27
-
28
- phonemes_count: 43
29
- graphemes_count: 31
30
- phonemes_enable_space: true
31
-
32
- # Training Parameters
33
- lexicon_epochs: 50
34
- lexicon_ctc_epochs: 10
35
- lexicon_limit_to_stop: 50 # No stopping by default, can override
36
- lexicon_limit_warmup: 50 # No stopping by default, can override
37
- sentence_epochs: 13
38
- sentence_ctc_epochs: 10
39
- sentence_limit_to_stop: 3
40
- sentence_limit_warmup: 3
41
- homograph_epochs: 50
42
- homograph_ctc_epochs: 10
43
- homograph_limit_to_stop: 5
44
- homograph_limit_warmup: 10
45
- lexicon_batch_size: 1024
46
- sentence_batch_size: 32
47
- homograph_batch_size: 32
48
- ctc_weight: 0.5
49
- homograph_loss_weight: 2.0
50
- lr: 0.002
51
- save_for_pretrained: true
52
-
53
- # Model parameters
54
- output_neurons: &id004 !apply:speechbrain.utils.hparams.choice
55
-
56
- value: false
57
- choices:
58
- true: 513
59
- false: 43
60
-
61
- enc_num_embeddings: &id005 !apply:speechbrain.utils.hparams.choice
62
- value: false
63
- choices:
64
- true: 513
65
- false: 31
66
-
67
- enc_dropout: 0.5
68
- enc_neurons: 512
69
- enc_num_layers: 4
70
- dec_dropout: 0.5
71
- dec_neurons: 512
72
- dec_att_neurons: 256
73
- dec_num_layers: 4
74
- embedding_dim: 512
75
-
76
- # Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
77
- # Available modes:
78
- # raw: no BOS/EOS tokens are added
79
- # bos: a beginning-of-sequence token is added
80
- # eos: an end-of-sequence token is added
81
- grapheme_sequence_mode: bos
82
- phoneme_sequence_mode: bos
83
-
84
-
85
- # Special Token information
86
- bos_index: 0
87
- eos_index: 1
88
- blank_index: 2
89
- unk_index: 2
90
- token_space_index: 512
91
-
92
-
93
- # Language Model
94
- lm_emb_dim: 256 # dimension of the embeddings
95
- lm_rnn_size: 512 # dimension of hidden layers
96
- lm_layers: 2 # number of hidden layers
97
- lm_output_neurons: 43
98
-
99
- # Beam Searcher
100
- use_language_model: false
101
- beam_search_min_decode_ratio: 0
102
- beam_search_max_decode_ratio: 1.0
103
- beam_search_beam_size: 16
104
- beam_search_beam_size_valid: 16
105
- beam_search_eos_threshold: 10.0
106
- beam_search_using_max_attn_shift: false
107
- beam_search_max_attn_shift: 10
108
- beam_search_coverage_penalty: 5.0
109
- beam_search_lm_weight: 0.5
110
- beam_search_ctc_weight_decode: 0.4
111
- beam_search_temperature: 1.25
112
- beam_search_temperature_lm: 1.0
113
-
114
- # Word embeddings
115
- use_word_emb: true
116
- word_emb_model: bert-base-uncased
117
- word_emb_dim: 768
118
- word_emb_enc_dim: 256
119
- word_emb_norm_type: batch
120
-
121
- graphemes: &id028
122
- - A
123
- - B
124
- - C
125
- - D
126
- - E
127
- - F
128
- - G
129
- - H
130
- - I
131
- - J
132
- - K
133
- - L
134
- - M
135
- - N
136
- - O
137
- - P
138
- - Q
139
- - R
140
- - S
141
- - T
142
- - U
143
- - V
144
- - W
145
- - X
146
- - Y
147
- - Z
148
- - "'"
149
- - ' '
150
-
151
- phonemes: &id001
152
-
153
-
154
- - AA
155
- - AE
156
- - AH
157
- - AO
158
- - AW
159
- - AY
160
- - B
161
- - CH
162
- - D
163
- - DH
164
- - EH
165
- - ER
166
- - EY
167
- - F
168
- - G
169
- - HH
170
- - IH
171
- - IY
172
- - JH
173
- - K
174
- - L
175
- - M
176
- - N
177
- - NG
178
- - OW
179
- - OY
180
- - P
181
- - R
182
- - S
183
- - SH
184
- - T
185
- - TH
186
- - UH
187
- - UW
188
- - V
189
- - W
190
- - Y
191
- - Z
192
- - ZH
193
- - ' '
194
-
195
- enc_input_dim: &id003 !apply:speechbrain.lobes.models.g2p.model.input_dim
196
- use_word_emb: true
197
- word_emb_enc_dim: 256
198
- embedding_dim: 512
199
-
200
-
201
- phn_char_map: &id002 !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
202
-
203
-
204
- # Models
205
- tokens: *id001
206
- char_phn_map: &id023 !apply:speechbrain.lobes.models.g2p.dataio.flip_map
207
- map_dict: *id002
208
- enc: &id006 !new:speechbrain.nnet.RNN.LSTM
209
- input_shape: [null, null, *id003]
210
- bidirectional: true
211
- hidden_size: 512
212
- num_layers: 4
213
- dropout: 0.5
214
-
215
- lin: &id010 !new:speechbrain.nnet.linear.Linear
216
- input_size: 512
217
- n_neurons: *id004
218
- bias: false
219
-
220
- ctc_lin: &id013 !new:speechbrain.nnet.linear.Linear
221
- input_size: 1024
222
- n_neurons: *id004
223
- encoder_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
224
- num_embeddings: *id005
225
- embedding_dim: 512
226
-
227
- emb: &id008 !new:speechbrain.nnet.embedding.Embedding
228
- num_embeddings: *id004
229
- embedding_dim: 512
230
-
231
- dec: &id009 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
232
- enc_dim: 1024
233
- input_size: 512
234
- rnn_type: gru
235
- attn_type: content
236
- dropout: 0.5
237
- hidden_size: 512
238
- attn_dim: 256
239
- num_layers: 4
240
-
241
- word_emb_enc: &id012 !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
242
-
243
- word_emb_dim: 768
244
- word_emb_enc_dim: 256
245
- norm_type: batch
246
-
247
- word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
248
- init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
249
- model: bert-base-uncased
250
-
251
- log_softmax: &id011 !new:speechbrain.nnet.activations.Softmax
252
- apply_log: true
253
-
254
- modules:
255
- model: &id014 !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
256
- enc: *id006
257
- encoder_emb: *id007
258
- emb: *id008
259
- dec: *id009
260
- lin: *id010
261
- out: *id011
262
- use_word_emb: true
263
- word_emb_enc: *id012
264
- enc: *id006
265
- encoder_emb: *id007
266
- emb: *id008
267
- dec: *id009
268
- lin: *id010
269
- ctc_lin: *id013
270
- out: *id011
271
- word_emb:
272
- word_emb_enc: *id012
273
- model: *id014
274
-
275
- lm_model: &id015 !new:speechbrain.lobes.models.RNNLM.RNNLM
276
- embedding_dim: 256
277
- rnn_layers: 2
278
- rnn_neurons: 512
279
- output_neurons: 43
280
- return_hidden: true
281
-
282
- opt_class: !name:torch.optim.Adam
283
- lr: 0.002
284
-
285
- # Scorer
286
- ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
287
- eos_index: 1
288
- blank_index: 2
289
- ctc_fc: *id013
290
-
291
- rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
292
- language_model: !ref <lm_model>
293
- temperature: 1.0
294
-
295
- coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
296
- vocab_size: !ref <output_neurons>
297
-
298
- scorer_train: !new:speechbrain.decoders.scorer.ScorerBuilder
299
- full_scorers: [!ref <coverage_scorer>,
300
- !ref <ctc_scorer>]
301
- weights:
302
- coverage: 5.0
303
- ctc: 0.4
304
-
305
- beam_searcher: &id029 !new:speechbrain.decoders.S2SRNNBeamSearcher
306
- embedding: *id008
307
- decoder: *id009
308
- linear: *id010
309
- bos_index: 0
310
- eos_index: 1
311
- min_decode_ratio: 0
312
- max_decode_ratio: 1.0
313
- beam_size: 16
314
- eos_threshold: 10.0
315
- using_max_attn_shift: false
316
- max_attn_shift: 10
317
- scorer: !ref <scorer_train>
318
-
319
- scorer_valid: !new:speechbrain.decoders.scorer.ScorerBuilder
320
- full_scorers: [!ref <coverage_scorer>,
321
- !ref <ctc_scorer>]
322
- weights:
323
- coverage: 5.0
324
- ctc: 0.
325
-
326
- beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
327
- embedding: *id008
328
- decoder: *id009
329
- linear: *id010
330
- bos_index: 0
331
- eos_index: 1
332
- min_decode_ratio: 0
333
- max_decode_ratio: 1.0
334
- beam_size: 16
335
- eos_threshold: 10.0
336
- using_max_attn_shift: false
337
- max_attn_shift: 10
338
- scorer: !ref <scorer_valid>
339
-
340
-
341
- scorer_test: !new:speechbrain.decoders.scorer.ScorerBuilder
342
- full_scorers: [!ref <coverage_scorer>,
343
- !ref <rnnlm_scorer>,
344
- !ref <ctc_scorer>]
345
- weights:
346
- coverage: 5.0
347
- rnnlm: 0.5
348
- ctc: 0.4
349
-
350
- beam_searcher_lm: !new:speechbrain.decoders.seq2seq.S2SRNNBeamSearcher
351
- embedding: *id008
352
- decoder: *id009
353
- linear: *id010
354
- bos_index: 0
355
- eos_index: 1
356
- min_decode_ratio: 0
357
- max_decode_ratio: 1.0
358
- beam_size: 16
359
- eos_threshold: 10.0
360
- using_max_attn_shift: false
361
- max_attn_shift: 10
362
- temperature: 1.25
363
- scorer: !ref <scorer_test>
364
-
365
-
366
- lr_annealing: &id018 !new:speechbrain.nnet.schedulers.NewBobScheduler
367
- initial_value: 0.002
368
- improvement_threshold: 0.0
369
- annealing_factor: 0.8
370
- patient: 0
371
-
372
- homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
373
-
374
- seq_cost: &id016 !name:speechbrain.nnet.losses.nll_loss
375
-
376
- label_smoothing: 0.1
377
-
378
- ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
379
- blank_index: 2
380
-
381
- seq_cost_metric: &id017 !name:speechbrain.nnet.losses.nll_loss
382
-
383
- label_smoothing: 0.1
384
- reduction: batch
385
-
386
- homograph_cost: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceLoss
387
- seq_cost: *id016
388
- seq_stats: !name:speechbrain.utils.metric_stats.MetricStats
389
- metric: *id017
390
- seq_stats_homograph: !name:speechbrain.utils.metric_stats.MetricStats
391
- metric: *id017
392
- classification_stats_homograph: !name:speechbrain.utils.metric_stats.ClassificationStats
393
-
394
- per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
395
- per_stats_homograph: !name:speechbrain.utils.metric_stats.ErrorRateStats
396
-
397
-
398
- model_output_keys:
399
- - p_seq
400
- - char_lens
401
- - encoder_out
402
-
403
- grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
404
- phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
405
-
406
-
407
- grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
408
- init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
409
- model_dir: grapheme_tokenizer
410
- bos_id: 0
411
- eos_id: 1
412
- unk_id: 2
413
- vocab_size: 512
414
- annotation_train: tokenizer_annotation_train.json
415
- annotation_read: char
416
- model_type: unigram # ["unigram", "bpe", "char"]
417
- character_coverage: 1.0
418
- annotation_format: json
419
- text_file: grapheme_annotations.txt
420
-
421
- phoneme_tokenizer: &id022 !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
422
- init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
423
- model_dir: phoneme_tokenizer
424
- bos_id: 0
425
- eos_id: 1
426
- unk_id: 2
427
- vocab_size: 512
428
- annotation_train: tokenizer_annotation_train.json
429
- annotation_read: phn
430
- model_type: unigram # ["unigram", "bpe", "char"]
431
- character_coverage: 1.0
432
- annotation_list_to_check: [tokenizer_annotation_valid.json]
433
- annotation_format: json
434
- text_file: phoneme_annotations.txt
435
-
436
- out_phoneme_decoder_tok: &id025 !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
437
- tokenizer: *id022
438
- char_map: *id023
439
- token_space_index: 512
440
- wordwise: true
441
-
442
- out_phoneme_decoder_raw: &id026 !name:speechbrain.lobes.models.g2p.dataio.text_decode
443
-
444
- encoder: *id024
445
- out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
446
- value: false
447
- choices:
448
- true: *id025
449
- false: *id026
450
- encode_pipeline:
451
- batch: false
452
- use_padded_data: true
453
- output_keys:
454
- - grapheme_list
455
- - grapheme_encoded_list
456
- - grapheme_encoded
457
- - word_emb
458
- init:
459
- - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
460
- encoder: *id027
461
- tokens: *id028
462
- bos_index: 0
463
- eos_index: 1
464
- - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
465
- encoder: *id024
466
- tokens: *id001
467
- bos_index: 0
468
- eos_index: 1
469
- steps:
470
- - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
471
- graphemes: *id028
472
- takes: txt
473
- provides: txt_cleaned
474
- - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
475
- grapheme_encoder: *id027
476
- takes: txt_cleaned
477
- provides:
478
- - grapheme_list
479
- - grapheme_encoded_list
480
- - grapheme_encoded_raw
481
-
482
- - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
483
- encoder: *id027
484
- takes: grapheme_encoded_list
485
- provides:
486
- - grapheme_encoded
487
- - grapheme_len
488
- - grapheme_encoded_eos
489
- - grapheme_len_eos
490
- - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
491
- word_emb: !ref <word_emb>
492
- grapheme_encoder: !ref <grapheme_encoder>
493
- use_word_emb: !ref <use_word_emb>
494
- takes:
495
- - txt
496
- - grapheme_encoded
497
- - grapheme_len
498
- provides: word_emb
499
-
500
- decode_pipeline:
501
- batch: true
502
- output_keys:
503
- - phonemes
504
- steps:
505
- - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
506
- beam_searcher: *id029
507
- takes:
508
- - char_lens
509
- - encoder_out
510
- provides:
511
- - hyps
512
- - scores
513
- - func: !apply:speechbrain.utils.hparams.choice
514
- value: false
515
- choices:
516
- true: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
517
- tokenizer: *id022
518
- char_map: *id023
519
- token_space_index: 512
520
- wordwise: true
521
- false: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
522
- phoneme_encoder: *id024
523
- takes:
524
- - hyps
525
- provides:
526
- - phonemes
527
-
528
-
529
- pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
530
- loadables:
531
- model: *id014
532
- ctc_lin: *id013
533
-