Update README.md
Browse files
README.md
CHANGED
@@ -27,62 +27,54 @@ Specifically, it supports the following speech-to-text tasks:
|
|
27 |
- Language identification
|
28 |
|
29 |
|
30 |
-
|
|
|
|
|
31 |
|
32 |
```BibTex
|
33 |
-
@
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
40 |
}
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
}
|
48 |
-
@inproceedings{
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
}
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
61 |
year={2023},
|
62 |
-
|
63 |
-
|
64 |
-
@InProceedings{pmlr-v162-peng22a,
|
65 |
-
title = {Branchformer: Parallel {MLP}-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding},
|
66 |
-
author = {Peng, Yifan and Dalmia, Siddharth and Lane, Ian and Watanabe, Shinji},
|
67 |
-
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
|
68 |
-
pages = {17627--17643},
|
69 |
-
year = {2022},
|
70 |
-
editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
|
71 |
-
volume = {162},
|
72 |
-
series = {Proceedings of Machine Learning Research},
|
73 |
-
month = {17--23 Jul},
|
74 |
-
publisher = {PMLR},
|
75 |
-
pdf = {https://proceedings.mlr.press/v162/peng22a/peng22a.pdf},
|
76 |
-
url = {https://proceedings.mlr.press/v162/peng22a.html},
|
77 |
-
abstract = {Conformer has proven to be effective in many speech processing tasks. It combines the benefits of extracting local dependencies using convolutions and global dependencies using self-attention. Inspired by this, we propose a more flexible, interpretable and customizable encoder alternative, Branchformer, with parallel branches for modeling various ranged dependencies in end-to-end speech processing. In each encoder layer, one branch employs self-attention or its variant to capture long-range dependencies, while the other branch utilizes an MLP module with convolutional gating (cgMLP) to extract local relationships. We conduct experiments on several speech recognition and spoken language understanding benchmarks. Results show that our model outperforms both Transformer and cgMLP. It also matches with or outperforms state-of-the-art results achieved by Conformer. Furthermore, we show various strategies to reduce computation thanks to the two-branch architecture, including the ability to have variable inference complexity in a single trained model. The weights learned for merging branches indicate how local and global dependencies are utilized in different layers, which benefits model designing.}
|
78 |
-
}
|
79 |
-
@inproceedings{watanabe2018espnet,
|
80 |
-
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
81 |
-
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
82 |
-
year={2018},
|
83 |
-
booktitle={Proceedings of Interspeech},
|
84 |
-
pages={2207--2211},
|
85 |
-
doi={10.21437/Interspeech.2018-1456},
|
86 |
-
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
87 |
}
|
88 |
```
|
|
|
27 |
- Language identification
|
28 |
|
29 |
|
30 |
+
## Citations
|
31 |
+
|
32 |
+
#### OWSM-CTC
|
33 |
|
34 |
```bibtex
|
35 |
+
@inproceedings{owsm-ctc,
|
36 |
+
title = "{OWSM}-{CTC}: An Open Encoder-Only Speech Foundation Model for Speech Recognition, Translation, and Language Identification",
|
37 |
+
author = "Peng, Yifan and
|
38 |
+
Sudo, Yui and
|
39 |
+
Shakeel, Muhammad and
|
40 |
+
Watanabe, Shinji",
|
41 |
+
booktitle = "Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)",
|
42 |
+
year = "2024",
|
43 |
+
month = aug,
|
44 |
+
url = "https://aclanthology.org/2024.acl-long.549",
|
45 |
}
|
46 |
+
```
|
47 |
+
|
48 |
+
#### OWSM v3.1 and v3.2
|
49 |
+
|
50 |
+
```bibtex
|
51 |
+
@inproceedings{owsm-v32,
|
52 |
+
title={On the Effects of Heterogeneous Data Sources on Speech-to-Text Foundation Models},
|
53 |
+
author={Jinchuan Tian and Yifan Peng and William Chen and Kwanghee Choi and Karen Livescu and Shinji Watanabe},
|
54 |
+
booktitle={Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH)},
|
55 |
+
year={2024},
|
56 |
+
month = sep,
|
57 |
+
pdf = {https://arxiv.org/pdf/2406.09282},
|
58 |
}
|
59 |
+
@inproceedings{owsm-v31,
|
60 |
+
title={{OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer}},
|
61 |
+
author={Yifan Peng and Jinchuan Tian and William Chen and Siddhant Arora and Brian Yan and Yui Sudo and Muhammad Shakeel and Kwanghee Choi and Jiatong Shi and Xuankai Chang and Jee-weon Jung and Shinji Watanabe},
|
62 |
+
booktitle={Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH)},
|
63 |
+
year={2024},
|
64 |
+
month = sep,
|
65 |
+
pdf = {https://arxiv.org/pdf/2401.16658},
|
66 |
}
|
67 |
+
```
|
68 |
+
|
69 |
+
#### Initial OWSM (v1, v2, v3)
|
70 |
+
|
71 |
+
```bibtex
|
72 |
+
@inproceedings{owsm,
|
73 |
+
title={Reproducing Whisper-Style Training Using An Open-Source Toolkit And Publicly Available Data},
|
74 |
+
author={Yifan Peng and Jinchuan Tian and Brian Yan and Dan Berrebbi and Xuankai Chang and Xinjian Li and Jiatong Shi and Siddhant Arora and William Chen and Roshan Sharma and Wangyou Zhang and Yui Sudo and Muhammad Shakeel and Jee-weon Jung and Soumi Maiti and Shinji Watanabe},
|
75 |
+
booktitle={Proceedings of the IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
|
76 |
year={2023},
|
77 |
+
month = dec,
|
78 |
+
pdf = {https://arxiv.org/pdf/2309.13876},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
}
|
80 |
```
|