Update README.md
Browse files
README.md
CHANGED
@@ -2,14 +2,13 @@
|
|
2 |
tags:
|
3 |
- espnet
|
4 |
- audio
|
5 |
-
- automatic-speech-recognition
|
6 |
language: en
|
7 |
datasets:
|
8 |
- swbd
|
9 |
license: cc-by-4.0
|
10 |
---
|
11 |
|
12 |
-
## ESPnet2
|
13 |
|
14 |
### `espnet/Turn_taking_prediction_SWBD`
|
15 |
|
@@ -28,6 +27,17 @@ cd egs2/swbd/asr1
|
|
28 |
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Turn_taking_prediction_SWBD
|
29 |
```
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
# RESULTS
|
32 |
|
33 |
## asr_train_asr_whisper_turn_taking_target_raw_en_word
|
@@ -259,6 +269,16 @@ distributed: true
|
|
259 |
### Citing ESPnet
|
260 |
|
261 |
```BibTex
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
@inproceedings{watanabe2018espnet,
|
263 |
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
264 |
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
@@ -269,22 +289,4 @@ distributed: true
|
|
269 |
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
270 |
}
|
271 |
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
```
|
278 |
-
|
279 |
-
or arXiv:
|
280 |
-
|
281 |
-
```bibtex
|
282 |
-
@misc{watanabe2018espnet,
|
283 |
-
title={ESPnet: End-to-End Speech Processing Toolkit},
|
284 |
-
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
285 |
-
year={2018},
|
286 |
-
eprint={1804.00015},
|
287 |
-
archivePrefix={arXiv},
|
288 |
-
primaryClass={cs.CL}
|
289 |
-
}
|
290 |
```
|
|
|
2 |
tags:
|
3 |
- espnet
|
4 |
- audio
|
|
|
5 |
language: en
|
6 |
datasets:
|
7 |
- swbd
|
8 |
license: cc-by-4.0
|
9 |
---
|
10 |
|
11 |
+
## ESPnet2 Turn-taking model
|
12 |
|
13 |
### `espnet/Turn_taking_prediction_SWBD`
|
14 |
|
|
|
27 |
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Turn_taking_prediction_SWBD
|
28 |
```
|
29 |
|
30 |
+
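Use the following Python code to run inference and obtain the probability of a turn-taking event every 40 milliseconds. In the snippet, `key` is a placeholder: set it to the path of the audio file you want to process before running the code.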
|
31 |
+
```python
|
32 |
+
import soundfile
|
33 |
+
import os
|
34 |
+
import sys
|
35 |
+
from espnet2.bin.asr_inference import Speech2Text
|
36 |
+
speech2text = Speech2Text("exp/asr_train_asr_whisper_turn_taking_raw_en_word/config.yaml", "exp/asr_train_asr_whisper_turn_taking_raw_en_word/valid.loss.ave.pth",device="cuda", run_chunk=True)
|
37 |
+
audio, rate = soundfile.read(key)
|
38 |
+
print(speech2text(audio)[0][0])
|
39 |
+
```
|
40 |
+
|
41 |
# RESULTS
|
42 |
|
43 |
## asr_train_asr_whisper_turn_taking_target_raw_en_word
|
|
|
269 |
### Citing ESPnet
|
270 |
|
271 |
```BibTex
|
272 |
+
|
273 |
+
@inproceedings{
|
274 |
+
arora2025talking,
|
275 |
+
title={Talking Turns: Benchmarking Audio Foundation Models on Turn-Taking Dynamics},
|
276 |
+
author={Siddhant Arora and Zhiyun Lu and Chung-Cheng Chiu and Ruoming Pang and Shinji Watanabe},
|
277 |
+
booktitle={The Thirteenth International Conference on Learning Representations},
|
278 |
+
year={2025},
|
279 |
+
url={https://openreview.net/forum?id=2e4ECh0ikn}
|
280 |
+
}
|
281 |
+
|
282 |
@inproceedings{watanabe2018espnet,
|
283 |
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
284 |
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
|
|
289 |
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
290 |
}
|
291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
```
|