Spaces:
Runtime error
Runtime error
File size: 5,645 Bytes
0558aa4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
% Shen et al., ICASSP 2018: TTS synthesis by conditioning WaveNet on mel spectrogram predictions.
@inproceedings{shen2018natural,
  author       = {Shen, Jonathan and Pang, Ruoming and Weiss, Ron J and Schuster, Mike and Jaitly, Navdeep and Yang, Zongheng and Chen, Zhifeng and Zhang, Yu and Wang, Yuxuan and Skerry-Ryan, RJ and others},
  title        = {Natural {TTS} synthesis by conditioning {WaveNet} on mel spectrogram predictions},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4779--4783},
  year         = {2018},
  organization = {IEEE}
}
% Lancucki, ICASSP 2021: FastPitch, parallel text-to-speech with pitch prediction.
@inproceedings{lancucki2021fastpitch,
  author       = {{\L}a{\'n}cucki, Adrian},
  title        = {{FastPitch}: Parallel text-to-speech with pitch prediction},
  booktitle    = {ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {6588--6592},
  year         = {2021},
  organization = {IEEE}
}
% Tatanov et al., ICASSP 2022: Mixer-TTS, a non-autoregressive text-to-speech model.
@inproceedings{tatanov2022mixer,
  author       = {Tatanov, Oktai and Beliaev, Stanislav and Ginsburg, Boris},
  title        = {{Mixer-TTS}: non-autoregressive, fast and compact text-to-speech model conditioned on language model embeddings},
  booktitle    = {ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year         = {2022},
  pages        = {7482--7486},
  organization = {IEEE}
}
% Shih et al., ICML 2021 workshop: RAD-TTS, parallel flow-based TTS.
@inproceedings{shih2021rad,
  author    = {Shih, Kevin J and Valle, Rafael and Badlani, Rohan and {\L}a{\'n}cucki, Adrian and Ping, Wei and Catanzaro, Bryan},
  title     = {{RAD-TTS}: Parallel flow-based {TTS} with robust alignment learning and diverse synthesis},
  booktitle = {ICML Workshop on Invertible Neural Networks, Normalizing Flows, and Explicit Likelihood Models},
  year      = {2021}
}
% Kong et al., NeurIPS vol. 33 (2020): HiFi-GAN speech synthesis.
@article{kong2020hifi,
  author  = {Kong, Jungil and Kim, Jaehyeon and Bae, Jaekyoung},
  title   = {{HiFi-GAN}: Generative adversarial networks for efficient and high fidelity speech synthesis},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2020},
  volume  = {33},
  pages   = {17022--17033}
}
% Prenger et al., ICASSP 2019: WaveGlow, a flow-based generative network for speech synthesis.
@inproceedings{prenger2019waveglow,
  author       = {Prenger, Ryan and Valle, Rafael and Catanzaro, Bryan},
  title        = {{WaveGlow}: A flow-based generative network for speech synthesis},
  booktitle    = {ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {3617--3621},
  year         = {2019},
  organization = {IEEE}
}
% Jang et al., Interspeech 2021: UnivNet neural vocoder.
@inproceedings{jang21_interspeech,
  author    = {Jang, Won and Lim, Dan and Yoon, Jaesam and Kim, Bongwan and Kim, Juntae},
  title     = {{UnivNet}: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation},
  booktitle = {Proc. Interspeech 2021},
  pages     = {2207--2211},
  year      = {2021},
  doi       = {10.21437/Interspeech.2021-1016}
}
% Badlani et al., ICASSP 2022: a single alignment approach for TTS models.
@inproceedings{badlani2022one,
  author       = {Badlani, Rohan and {\L}a{\'n}cucki, Adrian and Shih, Kevin J and Valle, Rafael and Ping, Wei and Catanzaro, Bryan},
  title        = {One {TTS} alignment to rule them all},
  booktitle    = {ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year         = {2022},
  pages        = {6092--6096},
  organization = {IEEE}
}
% Xue et al., arXiv 2021: ByT5, pre-trained byte-to-byte models.
@article{xue2021byt5,
  author  = {Xue, Linting and Barua, Aditya and Constant, Noah and Al-Rfou, Rami and Narang, Sharan and Kale, Mihir and Roberts, Adam and Raffel, Colin},
  title   = {{ByT5}: Towards a token-free future with pre-trained byte-to-byte models},
  journal = {arXiv preprint arXiv:2105.13626},
  year    = {2021}
}
% Rezackova et al., 2021: T5G2P, grapheme-to-phoneme conversion with a text-to-text transformer.
@inproceedings{vrezavckova2021t5g2p,
  author       = {{\v{R}}ez{\'a}{\v{c}}kov{\'a}, Mark{\'e}ta and {\v{S}}vec, Jan and Tihelka, Daniel},
  title        = {{T5G2P}: Using text-to-text transfer transformer for grapheme-to-phoneme conversion},
  booktitle    = {Proc. Interspeech 2021},
  year         = {2021},
  organization = {International Speech Communication Association}
}
% Zhu et al., arXiv 2022: ByT5 for massively multilingual grapheme-to-phoneme conversion.
@article{zhu2022byt5,
  author  = {Zhu, Jian and Zhang, Cong and Jurgens, David},
  title   = {{ByT5} model for massively multilingual grapheme-to-phoneme conversion},
  journal = {arXiv preprint arXiv:2204.03067},
  year    = {2022}
}
% Gulati et al., arXiv 2020: Conformer for speech recognition.
@article{ggulati2020conformer,
  author  = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and others},
  title   = {Conformer: Convolution-augmented transformer for speech recognition},
  year    = {2020},
  journal = {arXiv preprint arXiv:2005.08100}
}
% Gorman et al., LREC 2018: homograph disambiguation with supervised machine learning.
@inproceedings{gorman2018improving,
  author    = {Gorman, Kyle and Mazovetskiy, Gleb and Nikolaev, Vitaly},
  title     = {Improving homograph disambiguation with supervised machine learning},
  year      = {2018},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}
}
% Kim et al., ICML 2021: conditional VAE with adversarial learning for end-to-end text-to-speech.
@inproceedings{kim2021conditional,
  author       = {Kim, Jaehyeon and Kong, Jungil and Son, Juhee},
  title        = {Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech},
  booktitle    = {International Conference on Machine Learning},
  year         = {2021},
  pages        = {5530--5540},
  organization = {PMLR}
}
% Zeghidour et al., IEEE/ACM TASLP vol. 30 (2022): SoundStream neural audio codec.
@article{zeghidour2022soundstream,
  author  = {Zeghidour, Neil and Luebs, Alejandro and Omran, Ahmed and Skoglund, Jan and Tagliasacchi, Marco},
  title   = {{SoundStream}: An End-to-End Neural Audio Codec},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  year    = {2022},
  volume  = {30},
  pages   = {495--507},
  doi     = {10.1109/TASLP.2021.3129994}
}
% Defossez et al., arXiv 2022: high fidelity neural audio compression.
@article{defossez2022encodec,
  author  = {D{\'e}fossez, Alexandre and Copet, Jade and Synnaeve, Gabriel and Adi, Yossi},
  title   = {High fidelity neural audio compression},
  year    = {2022},
  journal = {arXiv preprint arXiv:2210.13438}
}
% Mentzer et al., arXiv 2023: finite scalar quantization.
@article{mentzer2023finite,
  author  = {Mentzer, Fabian and Minnen, David and Agustsson, Eirikur and Tschannen, Michael},
  title   = {Finite scalar quantization: {VQ-VAE} made simple},
  journal = {arXiv preprint arXiv:2309.15505},
  year    = {2023}
}