rtakeda commited on
Commit
d6b2b3e
·
verified ·
1 Parent(s): 8cc0c23

Upload 7 files

Browse files
README.md CHANGED
@@ -1,3 +1,70 @@
1
- ---
2
- license: cc-by-nc-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ ---
4
+ ## ESPnet2 Streaming Katakana ASR model
5
+
6
+ ### `ouktlab/espnet_streaming_csj_asr_train_asr_transformer_lm_rnn`
7
+
8
+ This model was trained using the CSJ recipe in [espnet](https://github.com/espnet/espnet/).
9
+
10
+ ### Citing ESPnet
11
+
12
+ ```bibtex
13
+ @inproceedings{watanabe2018espnet,
14
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
15
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
16
+ year={2018},
17
+ booktitle={Proceedings of Interspeech},
18
+ pages={2207--2211},
19
+ doi={10.21437/Interspeech.2018-1456},
20
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
21
+ }
22
+
23
+ ```
24
+
25
+ or arXiv:
26
+
27
+ ```bibtex
28
+ @misc{watanabe2018espnet,
29
+ title={ESPnet: End-to-End Speech Processing Toolkit},
30
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
31
+ year={2018},
32
+ eprint={1804.00015},
33
+ archivePrefix={arXiv},
34
+ primaryClass={cs.CL}
35
+ }
36
+ ```
37
+
38
+ ### Training data recipe of this model (rev.+bgn.)
39
+
40
+ ```bibtex
41
+ @inproceedings{rtakeda2023:apsipa,
42
+ author={Ryu Takeda and Yui Sudo and Kazunori Komatani},
43
+ title={Flexible Evidence Model to Reduce Uncertainty Mismatch Between Speech Enhancement and ASR Based on Encoder-Decoder Architecture},
44
+ year={2023},
45
+ booktitle={Proceedings of Asia Pacific Signal and Information Processing Association (APSIPA)},
46
+ pages={1830-1837}
47
+ }
+ ```
48
+
49
+ ### Katakana model
50
+ ```bibtex
51
+ @inproceedings{rtakeda2024:iwsds,
52
+ author={Ryu Takeda and Kazunori Komatani},
53
+ title={Toward OOV-word Acquisition during Spoken Dialogue using Syllable-based ASR and Word Segmentation},
54
+ year={2024},
55
+ booktitle={Proceedings of International Workshop on Spoken Dialogue Systems Technology (IWSDS)},
56
+ }
57
+
58
+ @inproceedings{oshio2023:apsipa,
59
+ author={Miki Oshio and Hokuto Munakata and Ryu Takeda and Kazunori Komatani},
60
+ title={Out-Of-Vocabulary Word Detection in Spoken Dialogues Based on Joint Decoding with User Response Patterns},
61
+ year={2023},
62
+ booktitle={Proceedings of Asia Pacific Signal and Information Processing Association (APSIPA)},
63
+ pages={1753-1759}
64
+ }
65
+
66
+ ```
67
+
68
+
69
+ license: cc-by-nc-4.0
70
+
exp/asr_stats_raw_jp_char_sp/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:621b82abd629a39b062a42ff8bcd32be2286b8f70b7a992254dfcb129a3d9de8
3
+ size 1402
exp/asr_train_asr_streaming_transformer_ja_raw_jp_char_sp/config.yaml ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_asr_streaming_transformer_ja.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/asr_train_asr_streaming_transformer_ja_raw_jp_char_sp
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 3
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 49623
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - acc
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 6
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 15000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/asr_stats_raw_jp_char_sp/train/speech_shape
72
+ - exp/asr_stats_raw_jp_char_sp/train/text_shape.char
73
+ valid_shape_file:
74
+ - exp/asr_stats_raw_jp_char_sp/valid/speech_shape
75
+ - exp/asr_stats_raw_jp_char_sp/valid/text_shape.char
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 80000
80
+ - 150
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train_nodup_sp/wav.scp
89
+ - speech
90
+ - sound
91
+ - - dump/raw/train_nodup_sp/text
92
+ - text
93
+ - text
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/train_dev/wav.scp
96
+ - speech
97
+ - sound
98
+ - - dump/raw/train_dev/text
99
+ - text
100
+ - text
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adam
106
+ optim_conf:
107
+ lr: 0.002
108
+ scheduler: warmuplr
109
+ scheduler_conf:
110
+ warmup_steps: 25000
111
+ token_list:
112
+ - <blank>
113
+ - <unk>
114
+ - ー
115
+ - ン
116
+ - イ
117
+ - ト
118
+ - ノ
119
+ - シ
120
+ - カ
121
+ - テ
122
+ - デ
123
+ - タ
124
+ - ス
125
+ - マ
126
+ - ッ
127
+ - ナ
128
+ - コ
129
+ - オ
130
+ - ニ
131
+ - エ
132
+ - ワ
133
+ - ク
134
+ - ア
135
+ - ガ
136
+ - モ
137
+ - キ
138
+ - ル
139
+ - ジ
140
+ - レ
141
+ - リ
142
+ - ョ
143
+ - ソ
144
+ - ラ
145
+ - ツ
146
+ - チ
147
+ - ケ
148
+ - ユ
149
+ - ド
150
+ - サ
151
+ - セ
152
+ - ヨ
153
+ - ダ
154
+ - ュ
155
+ - ヒ
156
+ - ウ
157
+ - ネ
158
+ - ホ
159
+ - ハ
160
+ - ミ
161
+ - ゴ
162
+ - ロ
163
+ - ブ
164
+ - バ
165
+ - ヤ
166
+ - メ
167
+ - ャ
168
+ - フ
169
+ - ズ
170
+ - ゲ
171
+ - ム
172
+ - ギ
173
+ - グ
174
+ - パ
175
+ - ベ
176
+ - ゼ
177
+ - ビ
178
+ - ザ
179
+ - ヘ
180
+ - ボ
181
+ - ィ
182
+ - ゾ
183
+ - プ
184
+ - ピ
185
+ - ヌ
186
+ - ポ
187
+ - ペ
188
+ - ェ
189
+ - ァ
190
+ - ゥ
191
+ - ォ
192
+ - ヴ
193
+ - ヮ
194
+ - ヅ
195
+ - <sos/eos>
196
+ init: xavier_uniform
197
+ input_size: null
198
+ ctc_conf:
199
+ dropout_rate: 0.0
200
+ ctc_type: builtin
201
+ reduce: true
202
+ ignore_nan_grad: null
203
+ zero_infinity: true
204
+ joint_net_conf: null
205
+ use_preprocessor: true
206
+ token_type: char
207
+ bpemodel: null
208
+ non_linguistic_symbols: null
209
+ cleaner: null
210
+ g2p: null
211
+ speech_volume_normalize: null
212
+ rir_scp: null
213
+ rir_apply_prob: 1.0
214
+ noise_scp: null
215
+ noise_apply_prob: 1.0
216
+ noise_db_range: '13_15'
217
+ short_noise_thres: 0.5
218
+ frontend: default
219
+ frontend_conf:
220
+ fs: 16k
221
+ specaug: specaug
222
+ specaug_conf:
223
+ apply_time_warp: true
224
+ time_warp_window: 5
225
+ time_warp_mode: bicubic
226
+ apply_freq_mask: true
227
+ freq_mask_width_range:
228
+ - 0
229
+ - 30
230
+ num_freq_mask: 2
231
+ apply_time_mask: true
232
+ time_mask_width_range:
233
+ - 0
234
+ - 40
235
+ num_time_mask: 2
236
+ normalize: global_mvn
237
+ normalize_conf:
238
+ stats_file: exp/asr_stats_raw_jp_char_sp/train/feats_stats.npz
239
+ model: espnet
240
+ model_conf:
241
+ ctc_weight: 0.3
242
+ lsm_weight: 0.1
243
+ length_normalized_loss: false
244
+ preencoder: null
245
+ preencoder_conf: {}
246
+ encoder: contextual_block_transformer
247
+ encoder_conf:
248
+ output_size: 512
249
+ attention_heads: 8
250
+ linear_units: 2048
251
+ num_blocks: 18
252
+ dropout_rate: 0.1
253
+ positional_dropout_rate: 0.1
254
+ attention_dropout_rate: 0.1
255
+ input_layer: conv2d6
256
+ normalize_before: true
257
+ block_size: 40
258
+ hop_size: 16
259
+ look_ahead: 16
260
+ init_average: true
261
+ ctx_pos_enc: true
262
+ postencoder: null
263
+ postencoder_conf: {}
264
+ decoder: transformer
265
+ decoder_conf:
266
+ attention_heads: 8
267
+ linear_units: 2048
268
+ num_blocks: 6
269
+ dropout_rate: 0.1
270
+ positional_dropout_rate: 0.1
271
+ self_attention_dropout_rate: 0.1
272
+ src_attention_dropout_rate: 0.1
273
+ preprocessor: default
274
+ preprocessor_conf: {}
275
+ required:
276
+ - output_dir
277
+ - token_list
278
+ version: '202211'
279
+ distributed: true
exp/asr_train_asr_streaming_transformer_ja_raw_jp_char_sp/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ffa823a00bc1b5e8b1161e327f3306ff8a68ba2c5a18a389d6d028eab1fc89a
3
+ size 367467429
exp/lm_train_lm_ja_char/config.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_lm.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/lm_train_lm_jp_char
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 3
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 54257
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 40
28
+ patience: 3
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 256
67
+ valid_batch_size: null
68
+ batch_bins: 1000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/lm_stats_jp_char/train/text_shape.char
72
+ valid_shape_file:
73
+ - exp/lm_stats_jp_char/valid/text_shape.char
74
+ batch_type: folded
75
+ valid_batch_type: null
76
+ fold_length:
77
+ - 150
78
+ sort_in_batch: descending
79
+ sort_batch: descending
80
+ multiple_iterator: false
81
+ chunk_length: 500
82
+ chunk_shift_ratio: 0.5
83
+ num_cache_chunks: 1024
84
+ train_data_path_and_name_and_type:
85
+ - - dump/raw/lm_train.txt
86
+ - text
87
+ - text
88
+ valid_data_path_and_name_and_type:
89
+ - - dump/raw/train_dev/text
90
+ - text
91
+ - text
92
+ allow_variable_data_keys: false
93
+ max_cache_size: 0.0
94
+ max_cache_fd: 32
95
+ valid_max_cache_size: null
96
+ optim: sgd
97
+ optim_conf: {}
98
+ scheduler: null
99
+ scheduler_conf: {}
100
+ token_list:
101
+ - <blank>
102
+ - <unk>
103
+ - ー
104
+ - ン
105
+ - イ
106
+ - ト
107
+ - ノ
108
+ - シ
109
+ - カ
110
+ - テ
111
+ - デ
112
+ - タ
113
+ - ス
114
+ - マ
115
+ - ッ
116
+ - ナ
117
+ - コ
118
+ - オ
119
+ - ニ
120
+ - エ
121
+ - ワ
122
+ - ク
123
+ - ア
124
+ - ガ
125
+ - モ
126
+ - キ
127
+ - ル
128
+ - ジ
129
+ - レ
130
+ - リ
131
+ - ョ
132
+ - ソ
133
+ - ラ
134
+ - ツ
135
+ - チ
136
+ - ケ
137
+ - ユ
138
+ - ド
139
+ - サ
140
+ - セ
141
+ - ヨ
142
+ - ダ
143
+ - ュ
144
+ - ヒ
145
+ - ウ
146
+ - ネ
147
+ - ホ
148
+ - ハ
149
+ - ミ
150
+ - ゴ
151
+ - ロ
152
+ - ブ
153
+ - バ
154
+ - ヤ
155
+ - メ
156
+ - ャ
157
+ - フ
158
+ - ズ
159
+ - ゲ
160
+ - ム
161
+ - ギ
162
+ - グ
163
+ - パ
164
+ - ベ
165
+ - ゼ
166
+ - ビ
167
+ - ザ
168
+ - ヘ
169
+ - ボ
170
+ - ィ
171
+ - ゾ
172
+ - プ
173
+ - ピ
174
+ - ヌ
175
+ - ポ
176
+ - ペ
177
+ - ェ
178
+ - ァ
179
+ - ゥ
180
+ - ォ
181
+ - ヴ
182
+ - ヮ
183
+ - ヅ
184
+ - <sos/eos>
185
+ init: null
186
+ model_conf:
187
+ ignore_id: 0
188
+ use_preprocessor: true
189
+ token_type: char
190
+ bpemodel: null
191
+ non_linguistic_symbols: null
192
+ cleaner: null
193
+ g2p: null
194
+ lm: seq_rnn
195
+ lm_conf:
196
+ rnn_type: lstm
197
+ nlayers: 2
198
+ unit: 650
199
+ required:
200
+ - output_dir
201
+ - token_list
202
+ version: '202211'
203
+ distributed: true
exp/lm_train_lm_ja_char/valid.loss.ave.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ded470555649ab4dec4c8403de325e81743c443006a0dadf18399836145df3
3
+ size 27520991
meta.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ files:
2
+ asr_model_file: exp/asr_train_asr_streaming_transformer_ja_raw_jp_char_sp/valid.acc.ave_10best.pth
3
+ lm_file: exp/lm_train_lm_ja_char/valid.loss.ave.pth
4
+ yaml_files:
5
+ asr_train_config: exp/asr_train_asr_streaming_transformer_ja_raw_jp_char_sp/config.yaml
6
+ lm_train_config: exp/lm_train_lm_ja_char/config.yaml