|
--- |
|
tags: |
|
- espnet |
|
- audio |
|
language: en |
|
datasets: |
|
- swbd |
|
license: cc-by-4.0 |
|
--- |
|
|
|
## ESPnet2 Turn-Taking Model
|
|
|
### `espnet/Turn_taking_prediction_SWBD` |
|
|
|
This model was trained by "siddhu001" using the swbd recipe in [espnet](https://github.com/espnet/espnet/).
|
|
|
### Demo: How to use in ESPnet2 |
|
|
|
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
|
if you haven't done that already. |
|
|
|
```bash |
|
cd espnet |
|
git checkout cea64abdeea5fa4f3da1a898be396e8c95c6e3ae |
|
pip install -e . |
|
cd egs2/swbd/asr1 |
|
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Turn_taking_prediction_SWBD |
|
``` |
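
The Python demo below loads the config and checkpoint from the experiment directory that `run.sh` downloads. As a quick sanity check, you can first confirm that both files are in place; the directory name below is taken from the demo and may differ for your download, so adjust it if needed.

```bash
# Sanity check (assumed paths, matching the Python demo below; adjust if your
# download landed in a different exp/ subdirectory).
ls exp/asr_train_asr_whisper_turn_taking_raw_en_word/config.yaml \
   exp/asr_train_asr_whisper_turn_taking_raw_en_word/valid.loss.ave.pth
```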
|
|
|
Use the following Python code to run inference and obtain the probability of a turn-taking event every 40 milliseconds. |
|
```python
import soundfile

from espnet2.bin.asr_inference import Speech2Text

# Load the turn-taking model from the downloaded config and averaged checkpoint.
speech2text = Speech2Text(
    "exp/asr_train_asr_whisper_turn_taking_raw_en_word/config.yaml",
    "exp/asr_train_asr_whisper_turn_taking_raw_en_word/valid.loss.ave.pth",
    device="cuda",
    run_chunk=True,
)

# Read the input conversation audio (replace "speech.wav" with your own recording).
audio, rate = soundfile.read("speech.wav")

# Print the turn-taking predictions (one output every 40 ms of audio).
print(speech2text(audio)[0][0])
```
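
To turn the per-frame output into time-stamped events, continue from the snippet above with the minimal sketch below. It assumes the call returns one turn-taking probability per 40 ms frame; the exact output structure may differ for this checkpoint, so inspect it first. The 0.5 threshold is only illustrative and not part of the recipe.

```python
# Minimal sketch, continuing from the snippet above.
# Assumption: `probs` is a sequence with one turn-taking probability per 40 ms frame;
# verify the real output structure of speech2text() for this checkpoint.
FRAME_SHIFT_S = 0.040  # one prediction every 40 ms, as stated above

probs = speech2text(audio)[0][0]
for i, p in enumerate(probs):
    if p > 0.5:  # illustrative threshold, not from the recipe
        print(f"possible turn-taking event at {i * FRAME_SHIFT_S:.2f} s (p={p:.2f})")
```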
|
|
|
# RESULTS |
|
|
|
## asr_train_asr_whisper_turn_taking_target_raw_en_word |
|
### ROC-AUC (%)
|
|
|
|dataset|Continuation|Backchannel|Turn change|Interruption|Silence|Overall| |
|
|---|---|---|---|---|---|---| |
|
|decode_asr_chunk_asr_model_valid.loss.ave/test|93.3|89.4|90.8|91.3|95.1|92.0| |
|
|
|
## ASR config |
|
|
|
<details><summary>expand</summary> |
|
|
|
``` |
|
config: conf/train_asr_whisper_3_uselast.yaml |
|
print_config: false |
|
log_level: INFO |
|
drop_last_iter: false |
|
dry_run: false |
|
iterator_type: sequence |
|
valid_iterator_type: null |
|
output_dir: exp/asr_train_asr_whisper_3_uselast_raw_en_word |
|
ngpu: 1 |
|
seed: 0 |
|
num_workers: 1 |
|
num_att_plot: 3 |
|
dist_backend: nccl |
|
dist_init_method: env:// |
|
dist_world_size: 8 |
|
dist_rank: 0 |
|
local_rank: 0 |
|
dist_master_addr: localhost |
|
dist_master_port: 33429 |
|
dist_launcher: null |
|
multiprocessing_distributed: true |
|
unused_parameters: true |
|
sharded_ddp: false |
|
cudnn_enabled: true |
|
cudnn_benchmark: false |
|
cudnn_deterministic: true |
|
collect_stats: false |
|
write_collected_feats: false |
|
max_epoch: 32 |
|
patience: null |
|
val_scheduler_criterion: |
|
- valid |
|
- loss |
|
early_stopping_criterion: |
|
- valid |
|
- loss |
|
- min |
|
best_model_criterion: |
|
- - valid

  - loss

  - min
|
keep_nbest_models: 10 |
|
nbest_averaging_interval: 0 |
|
grad_clip: 5.0 |
|
grad_clip_type: 2.0 |
|
grad_noise: false |
|
accum_grad: 1 |
|
no_forward_run: false |
|
resume: true |
|
train_dtype: float32 |
|
use_amp: false |
|
log_interval: null |
|
use_matplotlib: true |
|
use_tensorboard: true |
|
create_graph_in_tensorboard: false |
|
use_wandb: false |
|
wandb_project: null |
|
wandb_id: null |
|
wandb_entity: null |
|
wandb_name: null |
|
wandb_model_log_interval: -1 |
|
detect_anomaly: false |
|
use_adapter: false |
|
adapter: lora |
|
save_strategy: all |
|
adapter_conf: {} |
|
pretrain_path: null |
|
init_param: [] |
|
ignore_init_mismatch: false |
|
freeze_param: |
|
- encoder |
|
num_iters_per_epoch: 750 |
|
batch_size: 4000 |
|
valid_batch_size: null |
|
batch_bins: 1000000 |
|
valid_batch_bins: null |
|
train_shape_file: |
|
- exp/asr_stats_raw_en_word/train/speech_shape |
|
- exp/asr_stats_raw_en_word/train/text_shape.word |
|
valid_shape_file: |
|
- exp/asr_stats_raw_en_word/valid/speech_shape |
|
- exp/asr_stats_raw_en_word/valid/text_shape.word |
|
batch_type: folded |
|
valid_batch_type: null |
|
fold_length: |
|
- 80000 |
|
- 150 |
|
sort_in_batch: descending |
|
shuffle_within_batch: false |
|
sort_batch: descending |
|
multiple_iterator: false |
|
chunk_length: 500 |
|
chunk_shift_ratio: 0.5 |
|
num_cache_chunks: 1024 |
|
chunk_excluded_key_prefixes: [] |
|
chunk_default_fs: null |
|
train_data_path_and_name_and_type:

- - dump/raw/train/wav.scp

  - speech

  - kaldi_ark

- - dump/raw/train/text

  - text

  - text

valid_data_path_and_name_and_type:

- - dump/raw/valid/wav.scp

  - speech

  - kaldi_ark

- - dump/raw/valid/text

  - text

  - text
|
allow_variable_data_keys: false |
|
max_cache_size: 0.0 |
|
max_cache_fd: 32 |
|
allow_multi_rates: false |
|
valid_max_cache_size: null |
|
exclude_weight_decay: false |
|
exclude_weight_decay_conf: {} |
|
optim: adam |
|
optim_conf:

    lr: 0.0005

scheduler: warmuplr

scheduler_conf:

    warmup_steps: 500
|
token_list: |
|
- <blank> |
|
- <unk> |
|
- C |
|
- NA |
|
- I |
|
- BC |
|
- T |
|
- <sos/eos> |
|
init: null |
|
input_size: 1 |
|
ctc_conf:

    dropout_rate: 0.0

    ctc_type: builtin

    reduce: true

    ignore_nan_grad: null

    zero_infinity: true

    brctc_risk_strategy: exp

    brctc_group_strategy: end

    brctc_risk_factor: 0.0
|
joint_net_conf: null |
|
use_preprocessor: true |
|
use_lang_prompt: false |
|
use_nlp_prompt: false |
|
token_type: word |
|
bpemodel: null |
|
non_linguistic_symbols: null |
|
cleaner: null |
|
g2p: null |
|
speech_volume_normalize: null |
|
rir_scp: null |
|
rir_apply_prob: 1.0 |
|
noise_scp: null |
|
noise_apply_prob: 1.0 |
|
noise_db_range: '13_15' |
|
short_noise_thres: 0.5 |
|
aux_ctc_tasks: [] |
|
frontend: null |
|
frontend_conf: {} |
|
specaug: null |
|
specaug_conf: {} |
|
normalize: null |
|
normalize_conf: {} |
|
model: espnet |
|
model_conf:

    ctc_weight: 0.0

    lsm_weight: 0.1

    length_normalized_loss: false

    superb_setup: true

    num_class: 5

    ssl_input_size: 1024

    extract_feats_in_collect_stats: false

    use_only_last_correct: true
|
preencoder: null |
|
preencoder_conf: {} |
|
encoder: whisper |
|
encoder_conf:

    whisper_model: medium

    dropout_rate: 0.0

    use_specaug: false

    specaug_conf:

        apply_time_warp: true

        time_warp_window: 5

        time_warp_mode: bicubic

        apply_freq_mask: true

        freq_mask_width_range:

        - 0

        - 40

        num_freq_mask: 2

        apply_time_mask: true

        time_mask_width_ratio_range:

        - 0.0

        - 0.12

        num_time_mask: 5
|
postencoder: null |
|
postencoder_conf: {} |
|
decoder: null |
|
decoder_conf: {} |
|
preprocessor: default |
|
preprocessor_conf: {} |
|
required: |
|
- output_dir |
|
- token_list |
|
version: '202402' |
|
distributed: true |
|
``` |
|
|
|
</details> |
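
The config above is the dump produced during training. If you want to retrain with it rather than use the released checkpoint, a hypothetical invocation is sketched below; `--asr_config` follows the usual egs2/asr1 flag naming, but verify the exact flags against `run.sh` in this recipe.

```bash
# Hypothetical retraining command (verify flag names against run.sh in egs2/swbd/asr1).
./run.sh --skip_data_prep false --skip_train false \
    --asr_config conf/train_asr_whisper_3_uselast.yaml
```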
|
|
|
|
|
|
|
### Citing ESPnet |
|
|
|
```BibTeX
|
|
|
@inproceedings{arora2025talking,
|
title={Talking Turns: Benchmarking Audio Foundation Models on Turn-Taking Dynamics}, |
|
author={Siddhant Arora and Zhiyun Lu and Chung-Cheng Chiu and Ruoming Pang and Shinji Watanabe}, |
|
booktitle={The Thirteenth International Conference on Learning Representations}, |
|
year={2025}, |
|
url={https://openreview.net/forum?id=2e4ECh0ikn} |
|
} |
|
|
|
@inproceedings{watanabe2018espnet, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
|
year={2018}, |
|
booktitle={Proceedings of Interspeech}, |
|
pages={2207--2211}, |
|
doi={10.21437/Interspeech.2018-1456}, |
|
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
|
} |
|
|
|
``` |
|
|