github-actions[bot] committed on
Commit
09eaf7c
·
0 Parent(s):

Deploy snapshot for HF Space (LFS pointers, heavy tests removed)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +11 -0
  3. .gitignore +167 -0
  4. .gradio/certificate.pem +31 -0
  5. .vscode/settings.json +5 -0
  6. LICENSE +201 -0
  7. README.md +383 -0
  8. README_zh.md +371 -0
  9. apt.txt +3 -0
  10. colab_webui.ipynb +0 -0
  11. font/SimHei.ttf +3 -0
  12. gui.py +95 -0
  13. packages.txt +3 -0
  14. requirements.txt +136 -0
  15. requirements_module.txt +4 -0
  16. runtime.txt +1 -0
  17. scripts/download_models.sh +6 -0
  18. scripts/huggingface_download.py +21 -0
  19. scripts/modelscope_download.py +21 -0
  20. submodules/TTS/CITATION.cff +20 -0
  21. submodules/TTS/CODE_OF_CONDUCT.md +133 -0
  22. submodules/TTS/CODE_OWNERS.rst +75 -0
  23. submodules/TTS/CONTRIBUTING.md +162 -0
  24. submodules/TTS/Dockerfile +19 -0
  25. submodules/TTS/LICENSE.txt +373 -0
  26. submodules/TTS/MANIFEST.in +15 -0
  27. submodules/TTS/Makefile +78 -0
  28. submodules/TTS/README.md +407 -0
  29. submodules/TTS/TTS/.models.json +938 -0
  30. submodules/TTS/TTS/VERSION +1 -0
  31. submodules/TTS/TTS/__init__.py +6 -0
  32. submodules/TTS/TTS/api.py +458 -0
  33. submodules/TTS/TTS/bin/__init__.py +0 -0
  34. submodules/TTS/TTS/bin/collect_env_info.py +48 -0
  35. submodules/TTS/TTS/bin/compute_attention_masks.py +165 -0
  36. submodules/TTS/TTS/bin/compute_embeddings.py +197 -0
  37. submodules/TTS/TTS/bin/compute_statistics.py +96 -0
  38. submodules/TTS/TTS/bin/eval_encoder.py +88 -0
  39. submodules/TTS/TTS/bin/extract_tts_spectrograms.py +287 -0
  40. submodules/TTS/TTS/bin/find_unique_chars.py +45 -0
  41. submodules/TTS/TTS/bin/find_unique_phonemes.py +74 -0
  42. submodules/TTS/TTS/bin/remove_silence_using_vad.py +124 -0
  43. submodules/TTS/TTS/bin/resample.py +90 -0
  44. submodules/TTS/TTS/bin/synthesize.py +494 -0
  45. submodules/TTS/TTS/bin/train_encoder.py +332 -0
  46. submodules/TTS/TTS/bin/train_tts.py +71 -0
  47. submodules/TTS/TTS/bin/train_vocoder.py +77 -0
  48. submodules/TTS/TTS/bin/tune_wavegrad.py +103 -0
  49. submodules/TTS/TTS/config/__init__.py +135 -0
  50. submodules/TTS/TTS/config/shared_configs.py +268 -0
.DS_Store ADDED
Binary file (10.2 kB). View file
 
.gitattributes ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.jpg filter=lfs diff=lfs merge=lfs -text
3
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
4
+ *.gif filter=lfs diff=lfs merge=lfs -text
5
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
6
+ *.wav filter=lfs diff=lfs merge=lfs -text
7
+ *.flac filter=lfs diff=lfs merge=lfs -text
8
+ *.npy filter=lfs diff=lfs merge=lfs -text
9
+ *.pt filter=lfs diff=lfs merge=lfs -text
10
+ *.pth filter=lfs diff=lfs merge=lfs -text
11
+ *.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ .idea/
163
+ models
164
+ videos
165
+ temp
166
+ tmp
167
+ .env
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.vscode/settings.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "python-envs.defaultEnvManager": "ms-python.python:conda",
3
+ "python-envs.defaultPackageManager": "ms-python.python:conda",
4
+ "python-envs.pythonProjects": []
5
+ }
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: BosonAI Hackathon
3
+ emoji: 🏃
4
+ colorFrom: pink
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.49.1
8
+ app_file: webui.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: BosonAI_Hackathon
12
+ ---
13
+
14
+ # Intelligent Multi-language AI Dubbing/Translation Tool - Linly-Dubbing — "AI Empowerment, Language Without Borders"
15
+
16
+ <div align="center">
17
+ <h1>Linly-Dubbing WebUI</h1>
18
+
19
+ [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/Kedreamix/Linly-Dubbing)
20
+ <img src="docs/linly_logo.png" /><br>
21
+
22
+ [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb)
23
+ [![Licence](https://img.shields.io/badge/LICENSE-Apache-red.svg?style=for-the-badge)](https://github.com/Kedreamix/Linly-Dubbing/blob/main/LICENSE)
24
+
25
+ [**English**](./README.md) | [**中文简体**](./README_zh.md)
26
+
27
+ </div>
28
+
29
+ ---
30
+
31
+ <details open>
32
+ <summary>Table of Contents</summary>
33
+
34
+ <!-- TOC -->
35
+ - [Intelligent Multi-language AI Dubbing/Translation Tool - Linly-Dubbing — "AI Empowerment, Language Without Borders"](#intelligent-multi-language-ai-dubbingtranslation-tool---linly-dubbing--ai-empowerment-language-without-borders)
36
+ - [Introduction](#introduction)
37
+ - [TO DO LIST](#to-do-list)
38
+ - [Examples](#examples)
39
+ - [Installation and Usage Guide](#installation-and-usage-guide)
40
+ - [Test Environment](#test-environment)
41
+ - [1. Clone the Repository](#1-clone-the-repository)
42
+ - [2. Install Dependencies](#2-install-dependencies)
43
+ - [3. Configure Environment Variables](#3-configure-environment-variables)
44
+ - [4. Run the Application](#4-run-the-application)
45
+ - [Detailed Features and Technical Details](#detailed-features-and-technical-details)
46
+ - [Automatic Video Download](#automatic-video-download)
47
+ - [Vocal Separation](#vocal-separation)
48
+ - [Demucs](#demucs)
49
+ - [UVR5](#uvr5)
50
+ - [AI Speech Recognition](#ai-speech-recognition)
51
+ - [WhisperX](#whisperx)
52
+ - [FunASR](#funasr)
53
+ - [Large Language Model Translation](#large-language-model-translation)
54
+ - [OpenAI API](#openai-api)
55
+ - [Qwen](#qwen)
56
+ - [Google Translate](#google-translate)
57
+ - [AI-Powered Speech Synthesis](#ai-powered-speech-synthesis)
58
+ - [Edge TTS](#edge-tts)
59
+ - [XTTS](#xtts)
60
+ - [CosyVoice](#cosyvoice)
61
+ - [GPT-SoVITS](#gpt-sovits)
62
+ - [Video Processing](#video-processing)
63
+ - [Digital Human Lip-Sync Technology](#digital-human-lip-sync-technology)
64
+ - [License](#license)
65
+ - [References](#references)
66
+ - [Star History](#star-history)
67
+
68
+ <!-- /TOC -->
69
+ </details>
70
+
71
+ ## Introduction
72
+
73
+ `Linly-Dubbing` is an intelligent multi-language AI dubbing and translation tool inspired by [`YouDub-webui`](https://github.com/liuzhao1225/YouDub-webui) and further extended and optimized. We aim to offer diverse and high-quality dubbing options by integrating [`Linly-Talker`](https://github.com/Kedreamix/Linly-Talker)’s digital human lip-sync technology, creating a more natural multi-language video experience.
74
+
75
+ Leveraging cutting-edge AI technologies, `Linly-Dubbing` sets new standards in naturalness and accuracy for multi-language dubbing, making it ideal for international education, global content localization, and more. It helps teams extend their reach and share high-quality content worldwide.
76
+
77
+ Key features include:
78
+
79
+ - **Multi-language Support**: Offers dubbing and subtitle translation in Chinese and many other languages to meet global needs.
80
+ - **AI Speech Recognition**: Employs advanced AI for precise speech-to-text conversion and speaker recognition.
81
+ - **Large Language Model Translation**: Uses leading language models like GPT for fast and accurate translations, ensuring professional quality.
82
+ - **AI Voice Cloning**: Utilizes cutting-edge voice cloning to generate speech closely matching the original video's tone and emotion.
83
+ - **Digital Human Lip-Sync Technology**: Synchronizes dubbing with video visuals, enhancing realism and interactivity.
84
+ - **Flexible Upload and Translation**: Users can upload videos and choose the translation languages and standards, ensuring personalization and flexibility.
85
+ - **Regular Updates**: Continuously introduces the latest models to stay at the forefront of dubbing and translation technology.
86
+
87
+ Our mission is to provide seamless, high-quality multi-language dubbing and translation services, empowering content creators and businesses to thrive in global markets.
88
+
89
+ ---
90
+
91
+ ## TO DO LIST
92
+
93
+ - [x] Implement basic AI dubbing and smart translation features.
94
+ - [x] Integrate CosyVoice’s AI voice cloning for high-quality audio translation.
95
+ - [x] Add FunASR AI speech recognition algorithm with optimized Chinese support.
96
+ - [x] Utilize Qwen large language model for multi-language translation.
97
+ - [x] Develop Linly-Dubbing WebUI for easy one-click video generation with customizable parameters.
98
+ - [ ] Integrate UVR5 for voice/accompaniment separation and reverb removal, referencing GPTSoVITS.
99
+ - [ ] Improve voice cloning naturalness using GPTSoVITS for fine-tuning.
100
+ - [ ] Implement and optimize digital human lip-sync technology for better dubbing and visual coherence.
101
+
102
+ ---
103
+
104
+ ## Examples
105
+
106
+ | Original Video | Linly-Dubbing |
107
+ | ------------------------------------------------------------ | ------------------------------------------------------------ |
108
+ | <video src="https://github.com/user-attachments/assets/87ac52c1-0d67-4145-810a-d74147051026"> | <video src="https://github.com/user-attachments/assets/3d5c8346-3363-43f6-b8a4-80dc08f3eca4"> |
109
+
110
+ ---
111
+
112
+ ## Installation and Usage Guide
113
+
114
+ ### Test Environment
115
+
116
+ This guide applies to the following test environments:
117
+
118
+ - Python 3.10, PyTorch 2.3.1, CUDA 12.1
119
+ - Python 3.10, PyTorch 2.3.1, CUDA 11.8
120
+
121
+ Follow the steps below to install and configure `Linly-Dubbing`.
122
+
123
+ > [!NOTE]
124
+ >
125
+ > A Colab script is also available for an online experience: [Linly-Dubbing Colab](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb).
126
+
127
+ ### 1. Clone the Repository
128
+
129
+ First, clone the `Linly-Dubbing` repository to your local machine and initialize submodules.
130
+
131
+ ```bash
132
+ # Clone the project to your local machine
133
+ git clone https://github.com/Kedreamix/Linly-Dubbing.git --depth 1
134
+
135
+ # Navigate to the project directory
136
+ cd Linly-Dubbing
137
+
138
+ # Initialize and update submodules like CosyVoice
139
+ git submodule update --init --recursive
140
+ ```
141
+
142
+ ### 2. Install Dependencies
143
+
144
+ Before proceeding, please create a new Python environment and install the required dependencies.
145
+
146
+ ```bash
147
+ # Create a conda environment named 'linly_dubbing' and specify Python version 3.10
148
+ conda create -n linly_dubbing python=3.10 -y
149
+
150
+ # Activate the newly created environment
151
+ conda activate linly_dubbing
152
+
153
+ # Navigate to the project directory
154
+ cd Linly-Dubbing/
155
+
156
+ # Install the ffmpeg tool
157
+ # Install ffmpeg using conda
158
+ conda install ffmpeg==7.0.2 -c conda-forge
159
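+ # Alternatively, install ffmpeg from the Tsinghua mirror (use this instead of the command above if conda-forge is slow, e.g. in mainland China)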
160
+ conda install ffmpeg==7.0.2 -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
161
+
162
+ # Upgrade pip to the latest version
163
+ python -m pip install --upgrade pip
164
+
165
+ # Change the PyPI source to speed up package downloads
166
+ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
167
+ ```
168
+
169
+ Depending on your CUDA version, install PyTorch and related libraries using the following commands:
170
+
171
+ ```bash
172
+ # For CUDA 11.8
173
+ pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
174
+
175
+ # For CUDA 12.1
176
+ pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
177
+ ```
178
+
179
+ If you prefer to install PyTorch via conda, you can use the following commands:
180
+
181
+ ```bash
182
+ # For CUDA 11.8
183
+ conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia
184
+
185
+ # For CUDA 12.1
186
+ conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
187
+ ```
188
+
189
+ > [!NOTE]
190
+ >
191
+ > The installation process is very slow.
192
+
193
+ Next, install the remaining project dependencies:
194
+
195
+ ```bash
196
+ # Install the required Python packages for the project
197
+ # pynini is required by WeTextProcessing, so use conda to install it as it works across all platforms.
198
+ conda install -y pynini==2.1.5 -c conda-forge
199
+ # -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
200
+
201
+ pip install -r requirements.txt
202
+ # Install dependencies for submodules
203
+ pip install -r requirements_module.txt
204
+ ```
205
+
206
+ > [!TIP]
207
+ >
208
+ > If you encounter an error during installation that says "Could not load library libcudnn_ops_infer.so.8," follow these steps to fix it:
209
+ >
210
+ > ```bash
211
+ > # Set LD_LIBRARY_PATH to include the correct cuDNN library path
212
+ > export LD_LIBRARY_PATH=`python3 -c 'import os; import torch; print(os.path.dirname(os.path.dirname(torch.__file__)) +"/nvidia/cudnn/lib")'`:$LD_LIBRARY_PATH
213
+ > ```
214
+ >
215
+
216
+ ### 3. Configure Environment Variables
217
+
218
+ Before running the program, you need to configure the necessary environment variables. In the root directory of the project, create a `.env` file by renaming `env.example` and filling in the following variables:
219
+
220
+ - `OPENAI_API_KEY`: Your OpenAI API key, usually formatted as `sk-xxx`.
221
+ - `MODEL_NAME`: The name of the model you are using, such as `gpt-4` or `gpt-3.5-turbo`.
222
+ - `OPENAI_API_BASE`: If you are using a self-hosted OpenAI model, provide the corresponding API base URL here.
223
+ - `HF_TOKEN`: Your Hugging Face API token, used to access and download models.
224
+ - `HF_ENDPOINT`: A custom Hugging Face endpoint, which can be specified if you encounter issues with model downloading.
225
+ - `APPID` and `ACCESS_TOKEN`: Credentials for using the Bytedance TTS engine.
226
+ - `BAIDU_API_KEY` and `BAIDU_SECRET_KEY`: Used for Baidu's Ernie Bot API.
227
+
228
+ > [!NOTE]
229
+ >
230
+ > In most cases, you only need to configure `MODEL_NAME` and `HF_TOKEN`.
231
+ >
232
+ > By default, `MODEL_NAME` is set to `Qwen/Qwen1.5-4B-Chat`, so you do not need to configure the `OPENAI_API_KEY`.
233
+
234
+ > [!TIP]
235
+ >
236
+ > Because the output quality of the default large language model can be limited, it is recommended to use a larger model or a better API. I personally recommend OpenAI's API. If cost is a concern, you can try Baidu's Ernie Bot API, which offers free API access: simply apply for an API key and add it to your environment variables. See [https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1](https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1).
237
+ >
238
+ > You can obtain your `HF_TOKEN` from [Hugging Face](https://huggingface.co/settings/tokens). If you wish to use the **speaker separation feature**, make sure to request access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1). Otherwise, you can opt not to enable this feature.
239
+
240
+ ### 4. Run the Application
241
+
242
+ Before launching the application, run the following commands to automatically download the required models (including Qwen, XTTSv2, and faster-whisper-large-v3):
243
+
244
+ ```bash
245
+ # For Linux
246
+ bash scripts/download_models.sh
247
+
248
+ # For Windows
249
+ python scripts/modelscope_download.py
250
+ # Download the wav2vec2_fairseq_base_ls960_asr_ls960.pth file and place it in the models/ASR/whisper folder
251
+ wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \
252
+ -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth
253
+ ```
254
+
255
+ ![Download models](docs/download.png)
256
+
257
+ Once the download is complete, launch the WebUI interface using the following command:
258
+
259
+ ```bash
260
+ python webui.py
261
+ ```
262
+
263
+ After starting, you will see an interface like the one below. You can open [http://127.0.0.1:6006](http://127.0.0.1:6006) to explore the application:
264
+
265
+ ![Linly-Dubbing](docs/webui.png)
266
+
267
+
268
+
269
+ ## Detailed Features and Technical Details
270
+
271
+ ### Automatic Video Download
272
+
273
+ **yt-dlp** is a powerful open-source command-line tool designed for downloading video and audio from YouTube and other websites. This tool offers a wide range of parameter options, allowing users to customize download behavior to their needs. Whether choosing specific formats, resolutions, or extracting audio, yt-dlp provides flexible solutions. It also supports extensive post-processing features, such as automatically adding metadata and renaming files. For more details on parameters and usage, refer to the [yt-dlp official repository](https://github.com/yt-dlp/yt-dlp).
274
+
275
+ ### Vocal Separation
276
+
277
+ #### Demucs
278
+
279
+ **Demucs** is an advanced sound separation model developed by the Facebook research team, designed to separate different sound sources from mixed audio. Although its architecture is simple, Demucs is powerful enough to isolate instruments, voices, and background noise, making it easier for users to perform post-processing and editing. Its user-friendly design has made it a preferred tool for many audio processing applications, including music production and post-production in films. More information can be found on the [Demucs project page](https://github.com/facebookresearch/demucs).
280
+
281
+ #### UVR5
282
+
283
+ **UVR5 (Ultimate Vocal Remover)** is one of the best tools for vocal and accompaniment separation. It excels in generating high-quality accompaniments and vocal extractions, outperforming tools like RX9, RipX, and SpectraLayers 9. The extracted accompaniments are nearly indistinguishable from the original stereo tracks, and UVR5 is both open-source and free. Find the source code at: [https://github.com/Anjok07/ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui).
284
+
285
+ WebUI reference: [https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5](https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5)
286
+
287
+ Model weights reference: [https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights)
288
+
289
+ ### AI Speech Recognition
290
+
291
+ #### WhisperX
292
+
293
+ **WhisperX** is an extension of OpenAI's Whisper speech recognition system, specifically designed for generating and aligning subtitles for videos. Unlike traditional speech recognition systems, WhisperX not only accurately transcribes spoken content into text but also aligns it with video frames to generate timestamped subtitle files. This precise alignment makes video editing and subtitle generation more efficient and intuitive. WhisperX also supports multi-speaker recognition, providing detailed speaker information for richer, easier-to-understand subtitles.
294
+
295
+ #### FunASR
296
+
297
+ **FunASR** is a comprehensive speech recognition toolkit offering a wide range of speech processing features, including Automatic Speech Recognition (ASR), Voice Activity Detection (VAD), punctuation restoration, language modeling, speaker verification, speaker separation, and multi-speaker dialogue recognition. FunASR is particularly optimized for Chinese speech and offers pre-trained models with easy fine-tuning interfaces. It’s a significant tool in the field of speech recognition, widely used in voice assistants, automatic subtitle generation, and more. For more information, visit the [FunASR project](https://github.com/alibaba-damo-academy/FunASR).
298
+
299
+ ### Large Language Model Translation
300
+
301
+ #### OpenAI API
302
+
303
+ `Linly-Dubbing` uses OpenAI's large language models, such as GPT-4 and GPT-3.5-turbo, to perform high-quality translations via API. OpenAI's models are renowned for their natural language understanding and high-precision text generation capabilities, commonly used in tasks like dialogue generation and text analysis. You can find more details about the models and usage in the [OpenAI official documentation](https://platform.openai.com/docs/models).
304
+
305
+ #### Qwen
306
+
307
+ **Qwen** is a localized large language model that supports multi-language translation. Although its performance may not match OpenAI's top models, its open-source nature and local execution make it a cost-effective option. Qwen is capable of handling text translations across various languages and serves as a powerful open-source alternative. More details can be found on the [Qwen project page](https://github.com/QwenLM/Qwen).
308
+
309
+ #### Google Translate
310
+
311
+ As a supplement to the translation features, `Linly-Dubbing` also integrates [Google Translate](https://py-googletrans.readthedocs.io/en/latest/). Google Translate offers broad language support and good translation quality, making it suitable for quickly obtaining approximate translations.
312
+
313
+ ### AI-Powered Speech Synthesis
314
+
315
+ #### Edge TTS
316
+
317
+ **Edge TTS** is a high-quality text-to-speech conversion service provided by Microsoft. It supports multiple languages and voice styles, capable of generating natural and fluent voice output. With Edge TTS, `Linly-Dubbing` can generate high-quality speech from text, making content more lively and understandable. For more information, refer to the [Edge TTS official documentation](https://github.com/rany2/edge-tts).
318
+
319
+ #### XTTS
320
+
321
+ **Coqui XTTS** is an advanced deep learning text-to-speech toolkit focused on voice cloning and multi-language speech synthesis. XTTS can achieve voice cloning using brief audio snippets and generate realistic speech output. It offers a variety of pre-trained models and development tools, supporting training and fine-tuning of new models. Users can explore XTTS's capabilities online at [Hugging Face](https://huggingface.co/spaces/coqui/xtts) or visit the [official GitHub repository](https://github.com/coqui-ai/TTS) for more technical details.
322
+
323
+ - Try XTTS online: [Hugging Face](https://huggingface.co/spaces/coqui/xtts)
324
+ - Official GitHub repository: [Coqui TTS](https://github.com/coqui-ai/TTS)
325
+
326
+ #### CosyVoice
327
+
328
+ **CosyVoice** is a multi-language speech understanding and synthesis model developed by Alibaba's Tongyi Lab, supporting Chinese, English, Japanese, Cantonese, Korean, and more. CosyVoice is trained on over 150,000 hours of voice data and enables high-quality speech synthesis and cross-lingual voice cloning. It excels at generating natural and coherent speech across languages, with support for one-shot voice cloning, needing only 3 to 10 seconds of original audio to generate a similar voice. For more information and model details, visit the [CosyVoice project](https://github.com/FunAudioLLM/CosyVoice).
329
+
330
+ Main features and characteristics:
331
+
332
+ 1. **Multi-language support**: Handles speech synthesis tasks in various languages.
333
+ 2. **Multi-style speech synthesis**: Controls the emotion and tone of speech through commands.
334
+ 3. **Streaming inference support**: Future plans include real-time streaming inference support.
335
+
336
+ #### GPT-SoVITS
337
+
338
+ Thanks to the contributions of the open-source community, AI speech synthesis also benefits from the open-source voice cloning model `GPT-SoVITS`. **GPT** is a transformer-based natural language processing model with strong text generation capabilities, while **SoVITS** is a deep learning-based voice conversion technology capable of converting one person's voice into another’s. By combining these two technologies, **GPT-SoVITS** can generate highly realistic speech that matches the given text content.
339
+
340
+ The project can be found at [https://github.com/RVC-Boss/GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS). Key features include:
341
+
342
+ 1. **Zero-shot Text-to-Speech (TTS):** Input a 5-second voice sample to experience instant text-to-speech conversion.
343
+ 2. **Few-shot TTS:** Fine-tune the model with just 1 minute of training data to improve voice similarity and realism.
344
+ 3. **Cross-lingual support:** Inference across languages different from the training dataset, currently supporting English, Japanese, and Chinese.
345
+ 4. **WebUI tools:** Integrated tools include voice accompaniment separation, automatic dataset splitting, Chinese automatic speech recognition (ASR), and text annotation to help beginners create training datasets and GPT/SoVITS models.
346
+
347
+ ### Video Processing
348
+
349
+ In terms of video processing, `Linly-Dubbing` provides robust functionality support. Users can easily add subtitles, insert background music, adjust background music volume, and modify overall playback speed. With these features, users can customize video content to make it more engaging and personalized.
350
+
351
+ ### Digital Human Lip-Sync Technology
352
+
353
+ Inspired by `Linly-Talker`, this project focuses on digital human lip-sync technology. By combining advanced computer vision and speech recognition technologies, `Linly-Talker` allows digital characters' lip movements to match voiceovers precisely, achieving highly natural synchronization. This technology is not only applicable to animated characters but can also be used in scenarios such as virtual presenters or educators in instructional videos. `Linly-Talker` enhances digital character performance with accurate lip-sync and vivid facial expressions, providing a more immersive experience for the audience. This advanced digital human lip-sync technology significantly improves the professionalism and viewing experience of video content. For more information, refer to [https://github.com/Kedreamix/Linly-Talker](https://github.com/Kedreamix/Linly-Talker).
354
+
355
+ ---
356
+
357
+ ## License
358
+
359
+ > [!Caution]
360
+ >
361
+ > When using this tool, please comply with relevant laws, including copyright, data protection, and privacy laws. Do not use this tool without permission from the original author and/or rights holder.
362
+
363
+ `Linly-Dubbing` follows the Apache License 2.0. When using this tool, please comply with relevant laws, including copyright, data protection, and privacy laws. Do not use this tool without permission from the original author and/or rights holder.
364
+
365
+ ---
366
+
367
+ ## References
368
+
369
+ In developing this project, I referenced and drew inspiration from several outstanding open-source projects and related resources. Special thanks to the developers and contributors of these projects and the open-source community. Below are the main projects we referenced:
370
+
371
+ - [YouDub-webui](https://github.com/liuzhao1225/YouDub-webui): Provides a feature-rich web interface for downloading and processing YouTube videos, from which we drew much inspiration and technical implementation details.
372
+
373
+ - [Coqui TTS](https://github.com/coqui-ai/TTS)
374
+ - [Qwen](https://github.com/QwenLM/Qwen)
375
+ - [FunASR](https://github.com/alibaba-damo-academy/FunASR)
376
+ - [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
377
+ - [Linly-Talker](https://github.com/Kedreamix/Linly-Talker)
378
+
379
+ ---
380
+
381
+ ## Star History
382
+
383
+ <img src="https://api.star-history.com/svg?repos=Kedreamix/Linly-Dubbing&type=Date" alt="Star History Chart" style="zoom:200%;" />
README_zh.md ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 智能视频多语言AI配音/翻译工具 - Linly-Dubbing — “AI赋能,语言无界”
2
+
3
+ <div align="center">
4
+ <h1>Linly-Dubbing WebUI</h1>
5
+
6
+ [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/Kedreamix/Linly-Dubbing)
7
+ <img src="docs/linly_logo.png" /><br>
8
+
9
+ [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb)
10
+ [![Licence](https://img.shields.io/badge/LICENSE-Apache-red.svg?style=for-the-badge)](https://github.com/Kedreamix/Linly-Dubbing/blob/main/LICENSE)
11
+
12
+ [**English**](./README.md) | [**中文简体**](./README_zh.md)
13
+
14
+ </div>
15
+
16
+ ---
17
+
18
+ <details open>
19
+ <summary>目录</summary>
20
+ <!-- TOC -->
21
+
22
+ - [智能视频多语言AI配音/翻译工具 - Linly-Dubbing — “AI赋能,语言无界”](#智能视频多语言ai配音翻译工具---linly-dubbing--ai赋能语言无界)
23
+ - [介绍](#介绍)
24
+ - [TO DO LIST](#to-do-list)
25
+ - [示例](#示例)
26
+ - [安装与使用指南](#安装与使用指南)
27
+ - [测试环境](#测试环境)
28
+ - [1. 克隆代码仓库](#1-克隆代码仓库)
29
+ - [2. 安装依赖环境](#2-安装依赖环境)
30
+ - [3. 配置环境变量](#3-配置环境变量)
31
+ - [4. 运行程序](#4-运行程序)
32
+ - [详细功能和技术细节](#详细功能和技术细节)
33
+ - [自动下载视频](#自动下载视频)
34
+ - [人声分离](#人声分离)
35
+ - [Demucs](#demucs)
36
+ - [UVR5](#uvr5)
37
+ - [AI 智能语音识别](#ai-智能语音识别)
38
+ - [WhisperX](#whisperx)
39
+ - [FunASR](#funasr)
40
+ - [大型语言模型字幕翻译](#大型语言模型字幕翻译)
41
+ - [OpenAI API](#openai-api)
42
+ - [Qwen](#qwen)
43
+ - [Google Translate](#google-translate)
44
+ - [AI 语音合成](#ai-语音合成)
45
+ - [Edge TTS](#edge-tts)
46
+ - [XTTS](#xtts)
47
+ - [CosyVoice](#cosyvoice)
48
+ - [GPT-SoVITS](#gpt-sovits)
49
+ - [视频处理](#视频处理)
50
+ - [数字人对口型技术](#数字人对口型技术)
51
+ - [许可协议](#许可协议)
52
+ - [参考](#参考)
53
+ - [Star History](#star-history)
54
+
55
+ <!-- /TOC -->
56
+ </details>
57
+
58
+ ## 介绍
59
+
60
+ `Linly-Dubbing` 是一个智能视频多语言AI配音和翻译工具,它融合了[`YouDub-webui`](https://github.com/liuzhao1225/YouDub-webui)的灵感,并在此基础上进行了拓展和优化。我们致力于提供更加多样化和高质量的配音选择,通过集成[`Linly-Talker`](https://github.com/Kedreamix/Linly-Talker)的数字人对口型技术,为用户带来更加自然的多语言视频体验。
61
+
62
+ 通过整合最新的AI技术,`Linly-Dubbing` 在多语言配音的自然性和准确性方面达到了新的高度,适用于国际教育、全球娱乐内容本地化等多种场景,帮助团队将优质内容传播到全球各地。
63
+
64
+ 主要特点包括:
65
+
66
+ - **多语言支持**: 支持中文及多种其他语言的配音和字幕翻译,满足国际化需求。
67
+ - **AI 智能语音识别**: 使用先进的AI技术进行语音识别,提供精确的语音到文本转换和说话者识别。
68
+ - **大型语言模型翻译**: 结合领先的本地化大型语言模型(如GPT),快速且准确地进行翻译,确保专业性和自然性。
69
+ - **AI 声音克隆**: 利用尖端的声音克隆技术,生成与原视频配音高度相似的语音,保持情感和语调的连贯性。
70
+ - **数字人对口型技术**: 通过对口型技术,使配音与视频画面高度契合,提升真实性和互动性。
71
+ - **灵活上传与翻译**: 用户可以上传视频,自主选择翻译语言和标准,确保个性化和灵活性。
72
+ - **定期更新**: 持续引入最新模型,保持配音和翻译的领先地位。
73
+
74
+ 我们旨在为用户提供无缝、高质量的多语言视频配音和翻译服务,为内容创作者和企业在全球市场中提供有力支持。
75
+
76
+ ---
77
+
78
+ ## TO DO LIST
79
+
80
+ - [x] 完成AI配音和智能翻译功能的基础实现
81
+ - [x] 集成CosyVoice的AI声音克隆算法,实现高质量音频翻译
82
+ - [x] 增加FunASR的AI语音识别算法,特别优化对中文的支持
83
+ - [x] 利用Qwen大语言模型实现多语言翻译
84
+ - [x] 开发Linly-Dubbing WebUI,提供一键生成最终视频的便捷功能,并支持多种参数配置
85
+ - [ ] 加入UVR5进行人声/伴奏分离和混响移除,参考GPTSoVits
86
+ - [ ] 提升声音克隆的自然度,考虑使用GPTSoVits进行微调,加入GPTSoVits
87
+ - [ ] 实现并优化数字人对口型技术,提升配音与画面的契合度
88
+
89
+ ---
90
+
91
+ ## 示例
92
+
93
+ | 原视频 | Linly-Dubbing |
94
+ | ------------------------------------------------------------ | ------------------------------------------------------------ |
95
+ | <video src="https://github.com/user-attachments/assets/87ac52c1-0d67-4145-810a-d74147051026"> | <video src="https://github.com/user-attachments/assets/3d5c8346-3363-43f6-b8a4-80dc08f3eca4"> |
96
+
97
+ ---
98
+
99
+ ## 安装与使用指南
100
+
101
+ ### 测试环境
102
+
103
+ 本指南适用于以下测试环境:
104
+
105
+ - Python 3.10, PyTorch 2.3.1, CUDA 12.1
106
+ - Python 3.10, PyTorch 2.3.1, CUDA 11.8
107
+
108
+ 请按照以下步骤进行`Linly-Dubbing`的安装与配置。
109
+
110
+ > [!NOTE]
111
+ >
112
+ > 此外,我还提供了一个Colab脚本,您可以点击 [Linly-Dubbing Colab](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb) 进行在线体验。
113
+
114
+ ### 1. 克隆代码仓库
115
+
116
+ 首先,您需要将`Linly-Dubbing`项目的代码克隆到本地,并初始化子模块。以下是具体操作步骤:
117
+
118
+ ```bash
119
+ # 克隆项目代码到本地
120
+ git clone https://github.com/Kedreamix/Linly-Dubbing.git --depth 1
121
+
122
+ # 进入项目目录
123
+ cd Linly-Dubbing
124
+
125
+ # 初始化并更新子模块,如CosyVoice等
126
+ git submodule update --init --recursive
127
+ ```
128
+
129
+ ### 2. 安装依赖环境
130
+
131
+ 在继续之前,请创建一个新的Python环境,并安装所需的依赖项。
132
+
133
+ ```bash
134
+ # 创建名为 'linly_dubbing' 的conda环境,并指定Python版本为3.10
135
+ conda create -n linly_dubbing python=3.10 -y
136
+
137
+ # 激活新创建的环境
138
+ conda activate linly_dubbing
139
+
140
+ # 进入项目目录
141
+ cd Linly-Dubbing/
142
+
143
+ # 安装ffmpeg工具
144
+ # 使用conda安装ffmpeg
145
+ conda install ffmpeg==7.0.2 -c conda-forge
146
+ # 使用国内镜像安装ffmpeg
147
+ conda install ffmpeg==7.0.2 -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
148
+
149
+ # 升级pip到最新版本
150
+ python -m pip install --upgrade pip
151
+
152
+ # 更改PyPI源地址以加快包的下载速度
153
+ pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
154
+ ```
155
+
156
+ 根据您的CUDA版本,使用以下命令安装PyTorch及相关库:
157
+
158
+ ```bash
159
+ # 对于CUDA 11.8
160
+ pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
161
+
162
+ # 对于CUDA 12.1
163
+ pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
164
+ ```
165
+
166
+ 如果您倾向于通过conda安装PyTorch,可以选择以下命令:
167
+
168
+ ```bash
169
+ # 对于CUDA 11.8
170
+ conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia
171
+
172
+ # 对于CUDA 12.1
173
+ conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
174
+ ```
175
+
176
+ > [!NOTE]
177
+ >
178
+ > 安装过程可能耗时很长。
179
+
180
+ 然后,安装项目的其他依赖项:
181
+
182
+ ```bash
183
+ # 安装项目所需的Python包
184
+ # pynini is required by WeTextProcessing; use conda to install it, as it works on all platforms.
185
+ conda install -y pynini==2.1.5 -c conda-forge
186
+ # -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
187
+
188
+ pip install -r requirements.txt
189
+ # 安装submodules 下的依赖
190
+ pip install -r requirements_module.txt
191
+ ```
192
+
193
+ > [!TIP]
194
+ >
195
+ > 如在安装过程中遇到错误提示“Could not load library libcudnn_ops_infer.so.8”,请按以下步骤修复:
196
+ >
197
+ > ```bash
198
+ > # 设置LD_LIBRARY_PATH以包含正确的cuDNN库路径
199
+ > export LD_LIBRARY_PATH=`python3 -c 'import os; import torch; print(os.path.dirname(os.path.dirname(torch.__file__)) +"/nvidia/cudnn/lib")'`:$LD_LIBRARY_PATH
200
+ > ```
201
+
202
+ ### 3. 配置环境变量
203
+
204
+ 在运行程序前,您需要配置必要的环境变量。请先将项目根目录下的 `env.example` 文件重命名为 `.env`,然后在其中填写以下环境变量:
205
+
206
+ - `OPENAI_API_KEY`: 您的OpenAI API密钥,格式通常为 `sk-xxx`。
207
+ - `MODEL_NAME`: 使用的模型名称,如 `gpt-4` 或 `gpt-3.5-turbo`。
208
+ - `OPENAI_API_BASE`: 如使用自部署的OpenAI模型,请填写对应的API基础URL。
209
+ - `HF_TOKEN`: Hugging Face的API Token,用于访问和下载模型。
210
+ - `HF_ENDPOINT`: 当遇到模型下载问题时,可指定自定义的Hugging Face端点。
211
+ - `APPID` 和 `ACCESS_TOKEN`: 用于火山引擎TTS的凭据。
212
+ - `BAIDU_API_KEY` 和 `BAIDU_SECRET_KEY`: 用于百度文心一言API的凭据。
213
+
214
+ > [!NOTE]
215
+ >
216
+ > 通常,您只需配置 `MODEL_NAME` 和 `HF_TOKEN` 即可。
217
+ >
218
+ > 默认情况下,`MODEL_NAME` 设为 `Qwen/Qwen1.5-4B-Chat`,因此无需额外配置 `OPENAI_API_KEY`。
219
+
220
+ > [!TIP]
221
+ >
222
+ > 由于正常情况下大模型效果有限,所以建议可以使用规模较大的模型或者说使用较好的API,个人推荐可以选择OpenAI的api,如果考虑到收费问题,可以尝试百度的文心一言的API,免费申请API,填入到环境变量即可,[https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1](https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1)
223
+ >
224
+ > 可以在 [Hugging Face](https://huggingface.co/settings/tokens) 获取 `HF_TOKEN`。若需使用**说话人分离功能**,务必在[pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)申请访问权限。否则,可以选择不启用该功能。
225
+
226
+ ### 4. 运行程序
227
+
228
+ 在启动程序前,先通过以下命令自动下载所需的模型(包括Qwen,XTTSv2,和faster-whisper-large-v3模型):
229
+
230
+ ```bash
231
+ # Linux 终端运行
232
+ bash scripts/download_models.sh
233
+
234
+ # Windows
235
+ python scripts/modelscope_download.py
236
+ # 下载wav2vec2_fairseq_base_ls960_asr_ls960.pth文件放在models/ASR/whisper文件夹下
237
+ wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \
238
+ -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth
239
+ ```
240
+
241
+ ![下载模型](docs/download.png)
242
+
243
+ 下载完成后,使用以下命令启动WebUI用户界面:
244
+
245
+ ```bash
246
+ python webui.py
247
+ ```
248
+
249
+ 启动后,您将看到如下图所示的界面,可以打开 [http://127.0.0.1:6006](http://127.0.0.1:6006) 进行体验:
250
+
251
+ ![Linly-Dubbing](docs/webui.png)
252
+
253
+ ---
254
+
255
+ ## 详细功能和技术细节
256
+
257
+ ### 自动下载视频
258
+
259
+ **yt-dlp** 是一款强大的开源命令行工具,专为从 YouTube 和其他网站下载视频和音频而设计。该工具具有广泛的参数选项,允许用户根据需求精细地定制下载行为。无论是选择特定的格式、分辨率,还是提取音频,yt-dlp 都能提供灵活的解决方案。此外,yt-dlp 支持丰富的后处理功能,如自动添加元数据、自动重命名文件等。有关详细的参数和使用方法,请参考 [yt-dlp 的官方仓库](https://github.com/yt-dlp/yt-dlp)。
260
+
261
+ ### 人声分离
262
+
263
+ #### Demucs
264
+
265
+ **Demucs** 是由 Facebook 研究团队开发的一个先进的声音分离模型,旨在从混合音频中分离出不同的声音源。Demucs 的架构简单,但功能强大,它能够将乐器、声音和背景音分离开来,使用户能够更方便地进行后期处理和编辑。其简单易用的设计使得它成为许多声音处理应用的首选工具,广泛用于音乐制作、影视后期等领域。更多信息可以参见 [Demucs 的项目页面](https://github.com/facebookresearch/demucs)。
266
+
267
+ #### UVR5
268
+
269
+ UVR5 (Ultimate Vocal Remover)是目前最优秀的人声伴奏分离工具之一,是一款功能强大的伴奏制作/人声提取工具,其表现不仅优于RX9、RipX和SpectraLayers 9等同类工具,而且它提取出来的伴奏已经无限接近原版立体声,而且开源免费,开源地址:[https://github.com/Anjok07/ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)。
270
+
271
+ WebUI参考:[https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5](https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5)
272
+
273
+ 权重文件参考:[https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights)
274
+
275
+ ### AI 智能语音识别
276
+
277
+ #### WhisperX
278
+
279
+ **WhisperX** 是 OpenAI 开发的 Whisper 语音识别系统的扩展版本,专注于生成和对齐视频字幕。与传统语音识别系统不同,WhisperX 不仅能够将语音内容精确地转录为文字,还能与视频帧进行精确对齐,生成带有时间戳的字幕文件。这种精准的对齐功能使视频编辑和字幕生成变得更加高效和直观。WhisperX 还支持多说话者识别,提供详尽的说话者信息,使得字幕内容更加丰富和易于理解。
280
+
281
+ #### FunASR
282
+
283
+ **FunASR** 是一个综合性的语音识别工具包,提供广泛的语音处理功能,包括语音识别(ASR)、语音活动检测(VAD)、标点符号恢复、语言模型、说话人验证、说话人分离以及多说话者对话识别等。FunASR 尤其针对中文语音进行了优化,提供了预训练模型及其微调的便捷接口。它是语音识别领域中的重要工具,广泛应用于语音助手、自动字幕生成等场景。详细信息可参考 [FunASR 项目](https://github.com/alibaba-damo-academy/FunASR)。
284
+
285
+ ### 大型语言模型字幕翻译
286
+
287
+ #### OpenAI API
288
+
289
+ `Linly-Dubbing` 采用 OpenAI 提供的多种大型语言模型,如 GPT-4 和 GPT-3.5-turbo,通过 API 接口进行高质量的翻译。OpenAI 的这些模型以其自然语言理解能力和高精度的生成文本能力著称,广泛用于对话生成、文本分析等任务。用户可以访问 [OpenAI 官方文档](https://platform.openai.com/docs/models) 了解更多模型信息和使用细节。
290
+
291
+ #### Qwen
292
+
293
+ **Qwen** 是一个本地化的大型语言模型,支持多语言翻译。虽然其性能可能不如 OpenAI 的顶级模型,但其开放源码和本地运行的特性使得它成为一个经济高效的选择。Qwen 能够处理多种语言的文本翻译,是一个强大的开源替代方案。详情请参见 [Qwen 项目](https://github.com/QwenLM/Qwen)。
294
+
295
+ #### Google Translate
296
+
297
+ 作为翻译功能的补充,`Linly-Dubbing` 还集成了 [Google Translate](https://py-googletrans.readthedocs.io/en/latest/) 的翻译服务。Google Translate 提供广泛的语言支持和良好的翻译质量,特别适合快速获取大致翻译内容。
298
+
299
+ ### AI 语音合成
300
+
301
+ #### Edge TTS
302
+
303
+ **Edge TTS** 是微软提供的高质量文本到语音转换服务。它支持多种语言和声音样式,能够生成自然流畅的语音输出。通过 Edge TTS,`Linly-Dubbing` 可以实现从文本生成高质量的语音,使内容更加生动和易于理解。更多信息和使用方法请参见 [Edge TTS 官方文档](https://github.com/rany2/edge-tts)。
304
+
305
+ #### XTTS
306
+
307
+ **Coqui XTTS** 是一个先进的深度学习文本到语音工具包,专注于声音克隆和多语言语音合成。XTTS 能够通过短时间的音频片段实现声音克隆,并生成逼真的语音输出。它提供了丰富的预训练模型和开发工具,支持新模型的训练和微调。用户可以通过 [Hugging Face](https://huggingface.co/spaces/coqui/xtts) 在线体验和测试 XTTS 的功能,或者访问 [官方 GitHub 库](https://github.com/coqui-ai/TTS) 了解更多技术细节。
308
+
309
+ - 在线体验 XTTS: [Hugging Face](https://huggingface.co/spaces/coqui/xtts)
310
+ - 官方 GitHub 库: [Coqui TTS](https://github.com/coqui-ai/TTS)
311
+
312
+ #### CosyVoice
313
+
314
+ **CosyVoice** 是阿里通义实验室开发的多语言语音理解和合成模型,支持中文、英语、日语、粤语、韩语等多种语言。CosyVoice 经过超过 15 万小时的语音数据训练,能够实现高质量的语音合成和跨语言音色克隆。它特别擅长在不同语言之间生成自然、连贯的语音,支持 one-shot 音色克隆,仅需 3 至 10 秒的原始音频即可生成模拟音色。更多信息和模型详情请访问 [CosyVoice 项目](https://github.com/FunAudioLLM/CosyVoice)。
315
+
316
+ 主要功能和特性:
317
+ 1. **多语言支持**:处理多种语言的语音合成任务。
318
+ 2. **多风格语音合成**:通过指令控制语音的情感和语气。
319
+ 3. **流式推理支持**:计划未来支持实时流式推理。
320
+
321
+ #### GPT-SoVITS
322
+
323
+ 感谢大家的开源贡献,AI语音合成还借鉴了当前开源的语音克隆模型 `GPT-SoVITS`,**GPT**是一种基于Transformer的自然语言处理模型,具有很强的文本生成能力。 **SoVITS**则是一种基于深度学习的语音转换技术,可以将一个人的语音转换成另一个人的语音。 通过将这两种技术结合起来,**GPT**-**SoVITS**可以生成高度逼真的语音,且语音内容与给定的文本内容一致。
324
+
325
+ 我认为效果是相当不错的,项目地址可参考[https://github.com/RVC-Boss/GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS),主要功能如下:
326
+
327
+ 1. **零样本文本到语音(TTS):** 输入 5 秒的声音样本,即刻体验文本到语音转换。
328
+ 2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型,提升声音相似度和真实感。
329
+ 3. **跨语言支持:** 支持与训练数据集不同语言的推理,目前支持英语、日语和中文。
330
+ 4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注,协助初学者创建训练数据集和 GPT/SoVITS 模型。
331
+
332
+ ### 视频处理
333
+
334
+ 在视频处理方面,`Linly-Dubbing` 提供了强大的功能支持。用户可以轻松添加字幕、插入背景音乐,并调整背景音乐的音量和整体播放速度等。通过这些功能,用户能够自定义视频内容,使之更具吸引力和个性化。
335
+
336
+ ### 数字人对口型技术
337
+
338
+ 借鉴于`Linly-Talker`,专注于实现数字人的对口型技术。通过结合先进的计算机视觉和语音识别技术,`Linly-Talker` 能够使数字人角色的口型与配音精确匹配,从而实现高度自然的同步效果。这项技术不仅适用于动画角色,还可以应用于虚拟主播、教育视频中的讲解员等多种场景。`Linly-Talker` 通过精确的口型匹配和生动的面部表情,使得虚拟人物的表现更加生动逼真,为观众提供更加沉浸的体验。这种先进的数字人对口型技术大大提升了视频内容的专业性和观赏价值。可参考[https://github.com/Kedreamix/Linly-Talker](https://github.com/Kedreamix/Linly-Talker)
339
+
340
+ ---
341
+
342
+ ## 许可协议
343
+
344
+ > [!Caution]
345
+ >
346
+ > 在使用本工具时,请遵守相关法律,包括版权法、数据保护法和隐私法。未经原作者和/或版权所有者许可,请勿使用本工具。
347
+
348
+ `Linly-Dubbing` 遵循 Apache License 2.0。在使用本工具时,请遵守相关法律,包括版权法、数据保护法和隐私法。未经原作者和/或版权所有者许可,请勿使用本工具。
349
+
350
+ ---
351
+
352
+ ## 参考
353
+
354
+ 在开发过程中,我参考并借鉴了多个优秀的开源项目及相关资源。特别感谢这些项目的开发者和开源社区的贡献,以下是我们参考的主要项目:
355
+
356
+ - [YouDub-webui](https://github.com/liuzhao1225/YouDub-webui):提供了一个功能丰富的 Web 用户界面,用于 YouTube 视频的下载和处理,我们从中汲取了不少灵感和技术实现细节。
357
+ - [Coqui TTS](https://github.com/coqui-ai/TTS)
358
+
359
+ - [Qwen](https://github.com/QwenLM/Qwen)
360
+ - [FunASR](https://github.com/alibaba-damo-academy/FunASR)
361
+ - [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
362
+ - [Linly-Talker](https://github.com/Kedreamix/Linly-Talker)
363
+
364
+ ---
365
+
366
+ ## Star History
367
+
368
+ [![Star History Chart](https://api.star-history.com/svg?repos=Kedreamix/Linly-Dubbing&type=Date)](https://star-history.com/#Kedreamix/Linly-Dubbing&Date)
369
+
370
+ ---
371
+
apt.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ffmpeg
2
+ libsndfile1
3
+ libgl1
colab_webui.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
font/SimHei.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:336a838f4a78e150826be608dae69de59d50948c3d2b71760e096ae764154bdc
3
+ size 9751960
gui.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from PySide6.QtWidgets import QApplication, QMainWindow, QTabWidget
3
+ from PySide6.QtCore import Qt
4
+
5
+ # Ensure required modules are importable
6
+ try:
7
+ # UI components (side-effects may register widgets/styles)
8
+ from ui_components import (
9
+ CustomSlider, # noqa: F401
10
+ FloatSlider, # noqa: F401
11
+ RadioButtonGroup, # noqa: F401
12
+ AudioSelector, # noqa: F401
13
+ VideoPlayer, # noqa: F401
14
+ )
15
+
16
+ # Feature tabs
17
+ from tabs.full_auto_tab import FullAutoTab
18
+ from tabs.settings_tab import SettingsTab
19
+ from tabs.download_tab import DownloadTab
20
+ from tabs.demucs_tab import DemucsTab
21
+ from tabs.asr_tab import ASRTab
22
+ from tabs.translation_tab import TranslationTab
23
+ from tabs.tts_tab import TTSTab
24
+ from tabs.video_tab import SynthesizeVideoTab
25
+ from tabs.linly_talker_tab import LinlyTalkerTab
26
+
27
+ # Optional heavy tools (app still runs without them)
28
+ try:
29
+ from tools.step000_video_downloader import download_from_url # noqa: F401
30
+ from tools.step010_demucs_vr import separate_all_audio_under_folder # noqa: F401
31
+ from tools.step020_asr import transcribe_all_audio_under_folder # noqa: F401
32
+ from tools.step030_translation import translate_all_transcript_under_folder # noqa: F401
33
+ from tools.step040_tts import generate_all_wavs_under_folder # noqa: F401
34
+ from tools.step050_synthesize_video import synthesize_all_video_under_folder # noqa: F401
35
+ from tools.do_everything import do_everything # noqa: F401
36
+ from tools.utils import SUPPORT_VOICE # noqa: F401
37
+ except ImportError as e:
38
+ print(f"Warning: some tool modules could not be imported: {e}")
39
+ SUPPORT_VOICE = [
40
+ "zh-CN-XiaoxiaoNeural",
41
+ "zh-CN-YunxiNeural",
42
+ "en-US-JennyNeural",
43
+ "ja-JP-NanamiNeural",
44
+ ]
45
+
46
+ except ImportError as e:
47
+ print(f"Error: failed to initialize application: {e}")
48
+ sys.exit(1)
49
+
50
+
51
class MainWindow(QMainWindow):
    """Top-level window that hosts every Linly-Dubbing feature tab."""

    def __init__(self):
        super().__init__()
        self.setWindowTitle("Linly-Dubbing — Smart Multilingual Video Dubbing/Translation")
        self.resize(1024, 768)

        tab_widget = QTabWidget()

        # These two tabs are kept as attributes because they are wired
        # together below.
        self.full_auto_tab = FullAutoTab()
        self.settings_tab = SettingsTab()

        # Forward settings changes to the One-Click tab.
        self.settings_tab.config_changed.connect(self.full_auto_tab.update_config)

        tab_widget.addTab(self.full_auto_tab, "One-Click")
        tab_widget.addTab(self.settings_tab, "Settings")

        # The remaining tabs need no cross-wiring: each one is instantiated
        # right before it is added, in display order.
        standalone_tabs = (
            (DownloadTab, "Auto Download"),
            (DemucsTab, "Vocal Separation"),
            (ASRTab, "ASR Speech Recognition"),
            (TranslationTab, "Subtitle Translation"),
            (TTSTab, "TTS Synthesis"),
            (SynthesizeVideoTab, "Video Composition"),
            (LinlyTalkerTab, "Linly-Talker Lip-Sync (WIP)"),
        )
        for tab_class, label in standalone_tabs:
            tab_widget.addTab(tab_class(), label)

        self.setCentralWidget(tab_widget)
78
+
79
+
80
def main():
    """Create the Qt application, show the main window, and run the event loop.

    The process exits with the status code returned by the event loop.
    """
    # Qt 6 (PySide6) always enables high-DPI scaling and high-DPI pixmaps.
    # The Qt 5-era AA_EnableHighDpiScaling / AA_UseHighDpiPixmaps attributes
    # are deprecated there and have no effect, so they are intentionally
    # not set.
    app = QApplication(sys.argv)
    app.setStyle("Fusion")  # consistent cross-platform look

    window = MainWindow()
    window.show()

    sys.exit(app.exec())


if __name__ == "__main__":
    main()
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ffmpeg
2
+ libsndfile1
3
+ espeak-ng
requirements.txt ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Core runtime / serving ---
2
+ gradio>=4.0
3
+ fastapi>=0.115
4
+ uvicorn[standard]>=0.30
5
+ huggingface_hub>=0.24
6
+ python-dotenv>=1.0
7
+
8
+ # --- PyTorch trio (CPU-friendly) ---
9
+ torch==2.3.1
10
+ torchaudio==2.3.1
11
+ torchvision==0.18.1
12
+
13
+ # --- Scientific / ML backbone ---
14
+ numpy>=1.26,<3
15
+ scipy>=1.11
16
+ scikit-learn>=1.3
17
+ pandas>=2.2
18
+ matplotlib>=3.8
19
+ numba>=0.59
20
+ llvmlite>=0.43
21
+ tqdm>=4.66
22
+ einops>=0.7
23
+ protobuf>=4.24
24
+ safetensors>=0.4.3
25
+
26
+ # --- NLP / Transformers / Whisper ---
27
+ transformers==4.55.4
28
+ tokenizers>=0.15
29
+ sentencepiece>=0.1.99
30
+ #faster-whisper==1.2.0
31
+ #whisperx==3.7.4
32
+ #whisper==1.1.10
33
+ nltk>=3.9
34
+ regex>=2024.9.11
35
+
36
+ # --- Audio / TTS / ASR toolchain ---
37
+ soundfile>=0.12
38
+ audioread>=3.0.1
39
+ soxr>=0.3.7
40
+ ffmpeg-python>=0.2.0
41
+ pydub>=0.25.1
42
+ audiostretchy==1.3.5
43
+ demucs==4.0.1
44
+ openunmix==1.3.0
45
+ asteroid-filterbanks==0.4.0
46
+ pytorch-wpe==0.0.1
47
+ speechbrain==1.0.3
48
+ coqui-tts==0.27.2
49
+ coqpit-config==0.2.1
50
+ ttsfrd==0.1.0
51
+ edge-tts==7.2.3
52
+ librosa==0.11.0
53
+ soxr>=0.3.7
54
+
55
+ # --- Diffusion / audio codecs (if you actually use them) ---
56
+ diffusers==0.27.2
57
+ encodec==0.1.1
58
+
59
+ # --- Model management / orchestration ---
60
+ accelerate==1.11.0
61
+ hydra-core==1.3.2
62
+ omegaconf==2.3.0
63
+ lightning==2.5.5
64
+ pytorch-lightning==2.5.5
65
+ torchmetrics==1.3.2
66
+
67
+ # --- PyAnnote (diarization) ---
68
+ pyannote.audio==3.4.0
69
+ pyannote.core==5.0.0
70
+ pyannote.database==5.1.3
71
+ pyannote.metrics==3.2.1
72
+ pyannote.pipeline==3.0.1
73
+
74
+ # --- OpenAI / HTTP clients ---
75
+ openai==1.55.3
76
+ httpx>=0.27
77
+ requests>=2.31
78
+ urllib3>=2.2
79
+
80
+ # --- Data / storage / utils ---
81
+ orjson>=3.10
82
+ ujson>=5.9
83
+ pyarrow>=16.0
84
+ zstandard>=0.22
85
+ cloudpickle>=3.0
86
+ joblib>=1.3
87
+ filelock>=3.12
88
+ rich>=13.7
89
+ tabulate>=0.9
90
+ Unidecode>=1.3
91
+
92
+ # --- Text processing / multi-language ---
93
+ pypinyin>=0.49
94
+ jieba>=0.42.1
95
+ jaconv>=0.4
96
+ SudachiPy>=0.6.8
97
+ SudachiDict-core>=20240109
98
+ WeTextProcessing>=1.0.3
99
+ pysbd>=0.3.4
100
+ bangla==0.0.5
101
+ bnunicodenormalizer==0.1.7
102
+ bnnumerizer==0.0.2
103
+
104
+ # --- Misc ML / optimization ---
105
+ optuna>=3.6
106
+ pytorch-metric-learning>=2.3
107
+ umap-learn>=0.5.5
108
+ pynndescent>=0.5.10
109
+
110
+ # --- Media / video ---
111
+ moviepy>=1.0.3
112
+ imageio>=2.34
113
+ imageio-ffmpeg>=0.4.9
114
+ av>=10.0
115
+
116
+ # --- Small helpers (stable) ---
117
+ loguru>=0.7
118
+ fire>=0.6
119
+ packaging>=23.2
120
+ typing_extensions>=4.9
121
+ python-dateutil>=2.8.2
122
+ pytz>=2024.1
123
+ tzlocal>=5.2
124
+ PyYAML>=6.0.1
125
+ regex>=2024.9.11
126
+
127
+ # --- Optional (comment out if not needed) ---
128
+ onnxruntime==1.23.1
129
+ translators==6.0.1
130
+ gdown==5.1.0
131
+ yt-dlp>=2024.5.27
132
+ openunmix==1.3.0
133
+
134
+ demucs>=4.0.0
135
+
136
+ TTS
requirements_module.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ submodules/demucs
2
+ submodules/whisper
3
+ submodules/whisperX
4
+ submodules/TTS
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.10
scripts/download_models.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Download the wav2vec2 model to the target path; skip the download if the file already exists.
# `&&` (not `&`) makes wget wait until the directory has been created; with a
# single `&`, mkdir ran in the background and wget could start before
# models/ASR/whisper existed.
mkdir -p models/ASR/whisper && wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \
    -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth

# Run the model download script.
python scripts/modelscope_download.py
scripts/huggingface_download.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# pip install huggingface_hub
from huggingface_hub import snapshot_download

# (repo id, local target directory) pairs, downloaded in this order.
# Each repo id corresponds to https://huggingface.co/<repo id>.
_MODEL_TARGETS = [
    ('coqui/XTTS-v2', 'models/TTS/XTTS-v2'),
    # ('FunAudioLLM/CosyVoice-300M', 'models/TTS/CosyVoice-300M'),
    ('Qwen/Qwen1.5-4B-Chat', 'models/LLM/Qwen1.5-4B-Chat'),
    ('Qwen/Qwen1.5-1.8B-Chat', 'models/LLM/Qwen1.5-1.8B-Chat'),
    ('Systran/faster-whisper-large-v3', 'models/ASR/whisper/faster-whisper-large-v3'),
    # Access has to be requested before this one can be downloaded automatically:
    # ('pyannote/speaker-diarization-3.1', 'models/ASR/whisper/speaker-diarization-3.1'),
]

for _repo_id, _target_dir in _MODEL_TARGETS:
    snapshot_download(_repo_id, local_dir=_target_dir, resume_download=True, local_dir_use_symlinks=False)
scripts/modelscope_download.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# pip install modelscope
from modelscope import snapshot_download

# (model id, local target directory) pairs, downloaded in this order.
# Each model id corresponds to https://modelscope.cn/models/<model id>.
_MODEL_TARGETS = [
    ('AI-ModelScope/XTTS-v2', 'models/TTS/XTTS-v2'),
    # ('iic/CosyVoice-300M', 'models/TTS/CosyVoice-300M'),
    ('qwen/Qwen1.5-4B-Chat', 'models/LLM/Qwen1.5-4B-Chat'),
    # ('qwen/Qwen1.5-1.8B-Chat', 'models/LLM/Qwen1.5-1.8B-Chat'),
    ('keepitsimple/faster-whisper-large-v3', 'models/ASR/whisper/faster-whisper-large-v3'),
    # Access has to be requested before this one can be downloaded automatically:
    # ('mirror013/speaker-diarization-3.1', 'models/ASR/whisper/speaker-diarization-3.1'),
]

for _model_id, _target_dir in _MODEL_TARGETS:
    snapshot_download(_model_id, local_dir=_target_dir)
submodules/TTS/CITATION.cff ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
3
+ title: "Coqui TTS"
4
+ abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
5
+ date-released: 2021-01-01
6
+ authors:
7
+ - family-names: "Eren"
8
+ given-names: "Gölge"
9
+ - name: "The Coqui TTS Team"
10
+ version: 1.4
11
+ doi: 10.5281/zenodo.6334862
12
+ license: "MPL-2.0"
13
+ url: "https://www.coqui.ai"
14
+ repository-code: "https://github.com/coqui-ai/TTS"
15
+ keywords:
16
+ - machine learning
17
+ - deep learning
18
+ - artificial intelligence
19
+ - text to speech
20
+ - TTS
submodules/TTS/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Contributor Covenant Code of Conduct
3
+
4
+ ## Our Pledge
5
+
6
+ We as members, contributors, and leaders pledge to make participation in our
7
+ community a harassment-free experience for everyone, regardless of age, body
8
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
9
+ identity and expression, level of experience, education, socio-economic status,
10
+ nationality, personal appearance, race, caste, color, religion, or sexual identity
11
+ and orientation.
12
+
13
+ We pledge to act and interact in ways that contribute to an open, welcoming,
14
+ diverse, inclusive, and healthy community.
15
+
16
+ ## Our Standards
17
+
18
+ Examples of behavior that contributes to a positive environment for our
19
+ community include:
20
+
21
+ * Demonstrating empathy and kindness toward other people
22
+ * Being respectful of differing opinions, viewpoints, and experiences
23
+ * Giving and gracefully accepting constructive feedback
24
+ * Accepting responsibility and apologizing to those affected by our mistakes,
25
+ and learning from the experience
26
+ * Focusing on what is best not just for us as individuals, but for the
27
+ overall community
28
+
29
+ Examples of unacceptable behavior include:
30
+
31
+ * The use of sexualized language or imagery, and sexual attention or
32
+ advances of any kind
33
+ * Trolling, insulting or derogatory comments, and personal or political attacks
34
+ * Public or private harassment
35
+ * Publishing others' private information, such as a physical or email
36
+ address, without their explicit permission
37
+ * Other conduct which could reasonably be considered inappropriate in a
38
+ professional setting
39
+
40
+ ## Enforcement Responsibilities
41
+
42
+ Community leaders are responsible for clarifying and enforcing our standards of
43
+ acceptable behavior and will take appropriate and fair corrective action in
44
+ response to any behavior that they deem inappropriate, threatening, offensive,
45
+ or harmful.
46
+
47
+ Community leaders have the right and responsibility to remove, edit, or reject
48
+ comments, commits, code, wiki edits, issues, and other contributions that are
49
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
50
+ decisions when appropriate.
51
+
52
+ ## Scope
53
+
54
+ This Code of Conduct applies within all community spaces, and also applies when
55
+ an individual is officially representing the community in public spaces.
56
+ Examples of representing our community include using an official e-mail address,
57
+ posting via an official social media account, or acting as an appointed
58
+ representative at an online or offline event.
59
+
60
+ ## Enforcement
61
+
62
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
63
+ reported to the community leaders responsible for enforcement at
64
65
+ All complaints will be reviewed and investigated promptly and fairly.
66
+
67
+ All community leaders are obligated to respect the privacy and security of the
68
+ reporter of any incident.
69
+
70
+ ## Enforcement Guidelines
71
+
72
+ Community leaders will follow these Community Impact Guidelines in determining
73
+ the consequences for any action they deem in violation of this Code of Conduct:
74
+
75
+ ### 1. Correction
76
+
77
+ **Community Impact**: Use of inappropriate language or other behavior deemed
78
+ unprofessional or unwelcome in the community.
79
+
80
+ **Consequence**: A private, written warning from community leaders, providing
81
+ clarity around the nature of the violation and an explanation of why the
82
+ behavior was inappropriate. A public apology may be requested.
83
+
84
+ ### 2. Warning
85
+
86
+ **Community Impact**: A violation through a single incident or series
87
+ of actions.
88
+
89
+ **Consequence**: A warning with consequences for continued behavior. No
90
+ interaction with the people involved, including unsolicited interaction with
91
+ those enforcing the Code of Conduct, for a specified period of time. This
92
+ includes avoiding interactions in community spaces as well as external channels
93
+ like social media. Violating these terms may lead to a temporary or
94
+ permanent ban.
95
+
96
+ ### 3. Temporary Ban
97
+
98
+ **Community Impact**: A serious violation of community standards, including
99
+ sustained inappropriate behavior.
100
+
101
+ **Consequence**: A temporary ban from any sort of interaction or public
102
+ communication with the community for a specified period of time. No public or
103
+ private interaction with the people involved, including unsolicited interaction
104
+ with those enforcing the Code of Conduct, is allowed during this period.
105
+ Violating these terms may lead to a permanent ban.
106
+
107
+ ### 4. Permanent Ban
108
+
109
+ **Community Impact**: Demonstrating a pattern of violation of community
110
+ standards, including sustained inappropriate behavior, harassment of an
111
+ individual, or aggression toward or disparagement of classes of individuals.
112
+
113
+ **Consequence**: A permanent ban from any sort of public interaction within
114
+ the community.
115
+
116
+ ## Attribution
117
+
118
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119
+ version 2.0, available at
120
+ [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
121
+
122
+ Community Impact Guidelines were inspired by
123
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
124
+
125
+ For answers to common questions about this code of conduct, see the FAQ at
126
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available
127
+ at [https://www.contributor-covenant.org/translations][translations].
128
+
129
+ [homepage]: https://www.contributor-covenant.org
130
+ [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
131
+ [Mozilla CoC]: https://github.com/mozilla/diversity
132
+ [FAQ]: https://www.contributor-covenant.org/faq
133
+ [translations]: https://www.contributor-covenant.org/translations
submodules/TTS/CODE_OWNERS.rst ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TTS code owners / governance system
2
+ ==========================================
3
+
4
+ TTS is run under a governance system inspired by (and partially copied from) the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
5
+
6
+ Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners.
7
+
8
+ Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However, “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the TTS project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely.
9
+
10
+ The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as setting the direction for the project as a whole.
11
+
12
+ This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
13
+
14
+ There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person.
15
+
16
+ Global owners
17
+ ----------------
18
+
19
+ These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
20
+
21
+ - Eren Gölge (@erogol)
22
+ - Reuben Morais (@reuben)
23
+
24
+ Training, feeding
25
+ -----------------
26
+
27
+ - Eren Gölge (@erogol)
28
+
29
+ Model exporting
30
+ ---------------
31
+
32
+ - Eren Gölge (@erogol)
33
+
34
+ Multi-Speaker TTS
35
+ -----------------
36
+
37
+ - Eren Gölge (@erogol)
38
+ - Edresson Casanova (@edresson)
39
+
40
+ TTS
41
+ ---
42
+
43
+ - Eren Gölge (@erogol)
44
+
45
+ Vocoders
46
+ --------
47
+
48
+ - Eren Gölge (@erogol)
49
+
50
+ Speaker Encoder
51
+ ---------------
52
+
53
+ - Eren Gölge (@erogol)
54
+
55
+ Testing & CI
56
+ ------------
57
+
58
+ - Eren Gölge (@erogol)
59
+ - Reuben Morais (@reuben)
60
+
61
+ Python bindings
62
+ ---------------
63
+
64
+ - Eren Gölge (@erogol)
65
+ - Reuben Morais (@reuben)
66
+
67
+ Documentation
68
+ -------------
69
+
70
+ - Eren Gölge (@erogol)
71
+
72
+ Third party bindings
73
+ --------------------
74
+
75
+ Owned by the author.
submodules/TTS/CONTRIBUTING.md ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contribution guidelines
2
+
3
+ Welcome to the 🐸TTS!
4
+
5
+ This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md).
6
+
7
+ ## Where to start.
8
+ We welcome everyone who likes to contribute to 🐸TTS.
9
+
10
+ You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.
11
+
12
+ If you would like to contribute code or squash a bug but don't know where to start, here are some pointers.
13
+
14
+ - [Development Road Map](https://github.com/coqui-ai/TTS/issues/378)
15
+
16
+ You can pick something out of our road map. We keep the progress of the project in this simple issue thread. It has new model proposals or developmental updates etc.
17
+
18
+ - [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues)
19
+
20
+ This is a place to find feature requests, bugs.
21
+
22
+ Issues with the ```good first issue``` tag are a good place for beginners to start.
23
+
24
+ - ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag.
25
+
26
+ We list all the target improvements for the next version. You can pick one of them and start contributing.
27
+
28
+ - Also feel free to suggest new features, ideas and models. We're always open for new things.
29
+
30
+ ## Call for sharing language models
31
+ If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified.
32
+
33
+ This model can be shared in two ways:
34
+ 1. Share the model files with us and we serve them with the next 🐸 TTS release.
35
+ 2. Upload your models on GDrive and share the link.
36
+
37
+ Models are served under `.models.json` file and any model is available under TTS CLI or Server end points.
38
+
39
+ Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930).
40
+
41
+ ## Sending a ✨**PR**✨
42
+
43
+ If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.
44
+ Please use the following steps to send a ✨**PR**✨.
45
+ Let us know if you encounter a problem along the way.
46
+
47
+ The following steps are tested on an Ubuntu system.
48
+
49
+ 1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
50
+
51
+ 2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
52
+
53
+ ```bash
54
+ $ git clone git@github.com:<your Github name>/TTS.git
55
+ $ cd TTS
56
+ $ git remote add upstream https://github.com/coqui-ai/TTS.git
57
+ ```
58
+
59
+ 3. Install 🐸TTS for development.
60
+
61
+ ```bash
62
+ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
63
+ $ make install
64
+ ```
65
+
66
+ 4. Create a new branch with an informative name for your goal.
67
+
68
+ ```bash
69
+ $ git checkout -b an_informative_name_for_my_branch
70
+ ```
71
+
72
+ 5. Implement your changes on your new branch.
73
+
74
+ 6. Explain your code using [Google Style](https://google.github.io/styleguide/pyguide.html#381-docstrings) docstrings.
75
+
76
+ 7. Add your tests to our test suite under ```tests``` folder. It is important to show that your code works, edge cases are considered, and inform others about the intended use.
77
+
78
+ 8. Run the tests to see how your updates work with the rest of the project. You can repeat this step multiple times as you implement your changes to make sure you are heading in the right direction.
79
+
80
+ ```bash
81
+ $ make test # stop at the first error
82
+ $ make test_all # run all the tests, report all the errors
83
+ ```
84
+
85
+ 9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting.
86
+
87
+ ```bash
88
+ $ make style
89
+ ```
90
+
91
+ 10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.
92
+
93
+ ```bash
94
+ $ make lint
95
+ ```
96
+
97
+ 11. When things are good, add new files and commit your changes.
98
+
99
+ ```bash
100
+ $ git add my_file1.py my_file2.py ...
101
+ $ git commit
102
+ ```
103
+
104
+ It's a good practice to regularly sync your local copy of the project with the upstream code to keep up with the recent updates.
105
+
106
+ ```bash
107
+ $ git fetch upstream
108
+ $ git rebase upstream/master
109
+ # or for the development version
110
+ $ git rebase upstream/dev
111
+ ```
112
+
113
+ 12. Send a PR to ```dev``` branch.
114
+
115
+ Push your branch to your fork.
116
+
117
+ ```bash
118
+ $ git push -u origin an_informative_name_for_my_branch
119
+ ```
120
+
121
+ Then go to your fork's Github page and click on 'Pull request' to send your ✨**PR**✨.
122
+
123
+ Please set ✨**PR**✨'s target branch to ```dev``` as we use ```dev``` to work on the next version.
124
+
125
+ 13. Let's discuss until it is perfect. 💪
126
+
127
+ We might ask you for certain changes that would appear in the ✨**PR**✨'s page under [🐸TTS](https://github.com/coqui-ai/TTS/pulls).
128
+
129
+ 14. Once things look perfect, we merge it to the ```dev``` branch and make it ready for the next version.
130
+
131
+ ## Development in Docker container
132
+
133
+ If you prefer working within a Docker container as your development environment, you can do the following:
134
+
135
+ 1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
136
+
137
+ 2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
138
+
139
+ ```bash
140
+ $ git clone git@github.com:<your Github name>/TTS.git
141
+ $ cd TTS
142
+ $ git remote add upstream https://github.com/coqui-ai/TTS.git
143
+ ```
144
+
145
+ 3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
146
+
147
+ ```
148
+ docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev .
149
+ ```
150
+
151
+ 4. Run the container with GPU support:
152
+
153
+ ```
154
+ docker run -it --gpus all tts-dev:latest /bin/bash
155
+ ```
156
+
157
+ Feel free to ping us at any step you need help using our communication channels.
158
+
159
+ If you are new to Github or open-source contribution, these are good resources.
160
+
161
+ - [Github Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests)
162
+ - [First-Contribution](https://github.com/firstcontributions/first-contributions)
submodules/TTS/Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image is overridable at build time (e.g. a CPU-only image) via --build-arg BASE=...
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}

# Refresh the package index, upgrade, and install in ONE layer: if `apt-get update`
# lives in its own cached layer, a later change to the install list would run
# against a stale index. The apt lists are removed in the same layer so they do
# not end up in the image.
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev \
    && rm -rf /var/lib/apt/lists/*
# --no-cache-dir keeps pip's download cache out of the layer; deleting the cache
# in a later RUN (as before) would not shrink the earlier layers.
RUN pip3 install --no-cache-dir llvmlite --ignore-installed

# Install Dependencies:
RUN pip3 install --no-cache-dir torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118

# Copy TTS repository contents:
WORKDIR /root
COPY . /root

RUN make install

# `docker run <image>` prints the CLI help; any arguments replace "--help".
ENTRYPOINT ["tts"]
CMD ["--help"]
submodules/TTS/LICENSE.txt ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Mozilla Public License Version 2.0
2
+ ==================================
3
+
4
+ 1. Definitions
5
+ --------------
6
+
7
+ 1.1. "Contributor"
8
+ means each individual or legal entity that creates, contributes to
9
+ the creation of, or owns Covered Software.
10
+
11
+ 1.2. "Contributor Version"
12
+ means the combination of the Contributions of others (if any) used
13
+ by a Contributor and that particular Contributor's Contribution.
14
+
15
+ 1.3. "Contribution"
16
+ means Covered Software of a particular Contributor.
17
+
18
+ 1.4. "Covered Software"
19
+ means Source Code Form to which the initial Contributor has attached
20
+ the notice in Exhibit A, the Executable Form of such Source Code
21
+ Form, and Modifications of such Source Code Form, in each case
22
+ including portions thereof.
23
+
24
+ 1.5. "Incompatible With Secondary Licenses"
25
+ means
26
+
27
+ (a) that the initial Contributor has attached the notice described
28
+ in Exhibit B to the Covered Software; or
29
+
30
+ (b) that the Covered Software was made available under the terms of
31
+ version 1.1 or earlier of the License, but not also under the
32
+ terms of a Secondary License.
33
+
34
+ 1.6. "Executable Form"
35
+ means any form of the work other than Source Code Form.
36
+
37
+ 1.7. "Larger Work"
38
+ means a work that combines Covered Software with other material, in
39
+ a separate file or files, that is not Covered Software.
40
+
41
+ 1.8. "License"
42
+ means this document.
43
+
44
+ 1.9. "Licensable"
45
+ means having the right to grant, to the maximum extent possible,
46
+ whether at the time of the initial grant or subsequently, any and
47
+ all of the rights conveyed by this License.
48
+
49
+ 1.10. "Modifications"
50
+ means any of the following:
51
+
52
+ (a) any file in Source Code Form that results from an addition to,
53
+ deletion from, or modification of the contents of Covered
54
+ Software; or
55
+
56
+ (b) any new file in Source Code Form that contains any Covered
57
+ Software.
58
+
59
+ 1.11. "Patent Claims" of a Contributor
60
+ means any patent claim(s), including without limitation, method,
61
+ process, and apparatus claims, in any patent Licensable by such
62
+ Contributor that would be infringed, but for the grant of the
63
+ License, by the making, using, selling, offering for sale, having
64
+ made, import, or transfer of either its Contributions or its
65
+ Contributor Version.
66
+
67
+ 1.12. "Secondary License"
68
+ means either the GNU General Public License, Version 2.0, the GNU
69
+ Lesser General Public License, Version 2.1, the GNU Affero General
70
+ Public License, Version 3.0, or any later versions of those
71
+ licenses.
72
+
73
+ 1.13. "Source Code Form"
74
+ means the form of the work preferred for making modifications.
75
+
76
+ 1.14. "You" (or "Your")
77
+ means an individual or a legal entity exercising rights under this
78
+ License. For legal entities, "You" includes any entity that
79
+ controls, is controlled by, or is under common control with You. For
80
+ purposes of this definition, "control" means (a) the power, direct
81
+ or indirect, to cause the direction or management of such entity,
82
+ whether by contract or otherwise, or (b) ownership of more than
83
+ fifty percent (50%) of the outstanding shares or beneficial
84
+ ownership of such entity.
85
+
86
+ 2. License Grants and Conditions
87
+ --------------------------------
88
+
89
+ 2.1. Grants
90
+
91
+ Each Contributor hereby grants You a world-wide, royalty-free,
92
+ non-exclusive license:
93
+
94
+ (a) under intellectual property rights (other than patent or trademark)
95
+ Licensable by such Contributor to use, reproduce, make available,
96
+ modify, display, perform, distribute, and otherwise exploit its
97
+ Contributions, either on an unmodified basis, with Modifications, or
98
+ as part of a Larger Work; and
99
+
100
+ (b) under Patent Claims of such Contributor to make, use, sell, offer
101
+ for sale, have made, import, and otherwise transfer either its
102
+ Contributions or its Contributor Version.
103
+
104
+ 2.2. Effective Date
105
+
106
+ The licenses granted in Section 2.1 with respect to any Contribution
107
+ become effective for each Contribution on the date the Contributor first
108
+ distributes such Contribution.
109
+
110
+ 2.3. Limitations on Grant Scope
111
+
112
+ The licenses granted in this Section 2 are the only rights granted under
113
+ this License. No additional rights or licenses will be implied from the
114
+ distribution or licensing of Covered Software under this License.
115
+ Notwithstanding Section 2.1(b) above, no patent license is granted by a
116
+ Contributor:
117
+
118
+ (a) for any code that a Contributor has removed from Covered Software;
119
+ or
120
+
121
+ (b) for infringements caused by: (i) Your and any other third party's
122
+ modifications of Covered Software, or (ii) the combination of its
123
+ Contributions with other software (except as part of its Contributor
124
+ Version); or
125
+
126
+ (c) under Patent Claims infringed by Covered Software in the absence of
127
+ its Contributions.
128
+
129
+ This License does not grant any rights in the trademarks, service marks,
130
+ or logos of any Contributor (except as may be necessary to comply with
131
+ the notice requirements in Section 3.4).
132
+
133
+ 2.4. Subsequent Licenses
134
+
135
+ No Contributor makes additional grants as a result of Your choice to
136
+ distribute the Covered Software under a subsequent version of this
137
+ License (see Section 10.2) or under the terms of a Secondary License (if
138
+ permitted under the terms of Section 3.3).
139
+
140
+ 2.5. Representation
141
+
142
+ Each Contributor represents that the Contributor believes its
143
+ Contributions are its original creation(s) or it has sufficient rights
144
+ to grant the rights to its Contributions conveyed by this License.
145
+
146
+ 2.6. Fair Use
147
+
148
+ This License is not intended to limit any rights You have under
149
+ applicable copyright doctrines of fair use, fair dealing, or other
150
+ equivalents.
151
+
152
+ 2.7. Conditions
153
+
154
+ Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155
+ in Section 2.1.
156
+
157
+ 3. Responsibilities
158
+ -------------------
159
+
160
+ 3.1. Distribution of Source Form
161
+
162
+ All distribution of Covered Software in Source Code Form, including any
163
+ Modifications that You create or to which You contribute, must be under
164
+ the terms of this License. You must inform recipients that the Source
165
+ Code Form of the Covered Software is governed by the terms of this
166
+ License, and how they can obtain a copy of this License. You may not
167
+ attempt to alter or restrict the recipients' rights in the Source Code
168
+ Form.
169
+
170
+ 3.2. Distribution of Executable Form
171
+
172
+ If You distribute Covered Software in Executable Form then:
173
+
174
+ (a) such Covered Software must also be made available in Source Code
175
+ Form, as described in Section 3.1, and You must inform recipients of
176
+ the Executable Form how they can obtain a copy of such Source Code
177
+ Form by reasonable means in a timely manner, at a charge no more
178
+ than the cost of distribution to the recipient; and
179
+
180
+ (b) You may distribute such Executable Form under the terms of this
181
+ License, or sublicense it under different terms, provided that the
182
+ license for the Executable Form does not attempt to limit or alter
183
+ the recipients' rights in the Source Code Form under this License.
184
+
185
+ 3.3. Distribution of a Larger Work
186
+
187
+ You may create and distribute a Larger Work under terms of Your choice,
188
+ provided that You also comply with the requirements of this License for
189
+ the Covered Software. If the Larger Work is a combination of Covered
190
+ Software with a work governed by one or more Secondary Licenses, and the
191
+ Covered Software is not Incompatible With Secondary Licenses, this
192
+ License permits You to additionally distribute such Covered Software
193
+ under the terms of such Secondary License(s), so that the recipient of
194
+ the Larger Work may, at their option, further distribute the Covered
195
+ Software under the terms of either this License or such Secondary
196
+ License(s).
197
+
198
+ 3.4. Notices
199
+
200
+ You may not remove or alter the substance of any license notices
201
+ (including copyright notices, patent notices, disclaimers of warranty,
202
+ or limitations of liability) contained within the Source Code Form of
203
+ the Covered Software, except that You may alter any license notices to
204
+ the extent required to remedy known factual inaccuracies.
205
+
206
+ 3.5. Application of Additional Terms
207
+
208
+ You may choose to offer, and to charge a fee for, warranty, support,
209
+ indemnity or liability obligations to one or more recipients of Covered
210
+ Software. However, You may do so only on Your own behalf, and not on
211
+ behalf of any Contributor. You must make it absolutely clear that any
212
+ such warranty, support, indemnity, or liability obligation is offered by
213
+ You alone, and You hereby agree to indemnify every Contributor for any
214
+ liability incurred by such Contributor as a result of warranty, support,
215
+ indemnity or liability terms You offer. You may include additional
216
+ disclaimers of warranty and limitations of liability specific to any
217
+ jurisdiction.
218
+
219
+ 4. Inability to Comply Due to Statute or Regulation
220
+ ---------------------------------------------------
221
+
222
+ If it is impossible for You to comply with any of the terms of this
223
+ License with respect to some or all of the Covered Software due to
224
+ statute, judicial order, or regulation then You must: (a) comply with
225
+ the terms of this License to the maximum extent possible; and (b)
226
+ describe the limitations and the code they affect. Such description must
227
+ be placed in a text file included with all distributions of the Covered
228
+ Software under this License. Except to the extent prohibited by statute
229
+ or regulation, such description must be sufficiently detailed for a
230
+ recipient of ordinary skill to be able to understand it.
231
+
232
+ 5. Termination
233
+ --------------
234
+
235
+ 5.1. The rights granted under this License will terminate automatically
236
+ if You fail to comply with any of its terms. However, if You become
237
+ compliant, then the rights granted under this License from a particular
238
+ Contributor are reinstated (a) provisionally, unless and until such
239
+ Contributor explicitly and finally terminates Your grants, and (b) on an
240
+ ongoing basis, if such Contributor fails to notify You of the
241
+ non-compliance by some reasonable means prior to 60 days after You have
242
+ come back into compliance. Moreover, Your grants from a particular
243
+ Contributor are reinstated on an ongoing basis if such Contributor
244
+ notifies You of the non-compliance by some reasonable means, this is the
245
+ first time You have received notice of non-compliance with this License
246
+ from such Contributor, and You become compliant prior to 30 days after
247
+ Your receipt of the notice.
248
+
249
+ 5.2. If You initiate litigation against any entity by asserting a patent
250
+ infringement claim (excluding declaratory judgment actions,
251
+ counter-claims, and cross-claims) alleging that a Contributor Version
252
+ directly or indirectly infringes any patent, then the rights granted to
253
+ You by any and all Contributors for the Covered Software under Section
254
+ 2.1 of this License shall terminate.
255
+
256
+ 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257
+ end user license agreements (excluding distributors and resellers) which
258
+ have been validly granted by You or Your distributors under this License
259
+ prior to termination shall survive termination.
260
+
261
+ ************************************************************************
262
+ * *
263
+ * 6. Disclaimer of Warranty *
264
+ * ------------------------- *
265
+ * *
266
+ * Covered Software is provided under this License on an "as is" *
267
+ * basis, without warranty of any kind, either expressed, implied, or *
268
+ * statutory, including, without limitation, warranties that the *
269
+ * Covered Software is free of defects, merchantable, fit for a *
270
+ * particular purpose or non-infringing. The entire risk as to the *
271
+ * quality and performance of the Covered Software is with You. *
272
+ * Should any Covered Software prove defective in any respect, You *
273
+ * (not any Contributor) assume the cost of any necessary servicing, *
274
+ * repair, or correction. This disclaimer of warranty constitutes an *
275
+ * essential part of this License. No use of any Covered Software is *
276
+ * authorized under this License except under this disclaimer. *
277
+ * *
278
+ ************************************************************************
279
+
280
+ ************************************************************************
281
+ * *
282
+ * 7. Limitation of Liability *
283
+ * -------------------------- *
284
+ * *
285
+ * Under no circumstances and under no legal theory, whether tort *
286
+ * (including negligence), contract, or otherwise, shall any *
287
+ * Contributor, or anyone who distributes Covered Software as *
288
+ * permitted above, be liable to You for any direct, indirect, *
289
+ * special, incidental, or consequential damages of any character *
290
+ * including, without limitation, damages for lost profits, loss of *
291
+ * goodwill, work stoppage, computer failure or malfunction, or any *
292
+ * and all other commercial damages or losses, even if such party *
293
+ * shall have been informed of the possibility of such damages. This *
294
+ * limitation of liability shall not apply to liability for death or *
295
+ * personal injury resulting from such party's negligence to the *
296
+ * extent applicable law prohibits such limitation. Some *
297
+ * jurisdictions do not allow the exclusion or limitation of *
298
+ * incidental or consequential damages, so this exclusion and *
299
+ * limitation may not apply to You. *
300
+ * *
301
+ ************************************************************************
302
+
303
+ 8. Litigation
304
+ -------------
305
+
306
+ Any litigation relating to this License may be brought only in the
307
+ courts of a jurisdiction where the defendant maintains its principal
308
+ place of business and such litigation shall be governed by laws of that
309
+ jurisdiction, without reference to its conflict-of-law provisions.
310
+ Nothing in this Section shall prevent a party's ability to bring
311
+ cross-claims or counter-claims.
312
+
313
+ 9. Miscellaneous
314
+ ----------------
315
+
316
+ This License represents the complete agreement concerning the subject
317
+ matter hereof. If any provision of this License is held to be
318
+ unenforceable, such provision shall be reformed only to the extent
319
+ necessary to make it enforceable. Any law or regulation which provides
320
+ that the language of a contract shall be construed against the drafter
321
+ shall not be used to construe this License against a Contributor.
322
+
323
+ 10. Versions of the License
324
+ ---------------------------
325
+
326
+ 10.1. New Versions
327
+
328
+ Mozilla Foundation is the license steward. Except as provided in Section
329
+ 10.3, no one other than the license steward has the right to modify or
330
+ publish new versions of this License. Each version will be given a
331
+ distinguishing version number.
332
+
333
+ 10.2. Effect of New Versions
334
+
335
+ You may distribute the Covered Software under the terms of the version
336
+ of the License under which You originally received the Covered Software,
337
+ or under the terms of any subsequent version published by the license
338
+ steward.
339
+
340
+ 10.3. Modified Versions
341
+
342
+ If you create software not governed by this License, and you want to
343
+ create a new license for such software, you may create and use a
344
+ modified version of this License if you rename the license and remove
345
+ any references to the name of the license steward (except to note that
346
+ such modified license differs from this License).
347
+
348
+ 10.4. Distributing Source Code Form that is Incompatible With Secondary
349
+ Licenses
350
+
351
+ If You choose to distribute Source Code Form that is Incompatible With
352
+ Secondary Licenses under the terms of this version of the License, the
353
+ notice described in Exhibit B of this License must be attached.
354
+
355
+ Exhibit A - Source Code Form License Notice
356
+ -------------------------------------------
357
+
358
+ This Source Code Form is subject to the terms of the Mozilla Public
359
+ License, v. 2.0. If a copy of the MPL was not distributed with this
360
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
361
+
362
+ If it is not possible or desirable to put the notice in a particular
363
+ file, then You may include the notice in a location (such as a LICENSE
364
+ file in a relevant directory) where a recipient would be likely to look
365
+ for such a notice.
366
+
367
+ You may add additional accurate notices of copyright ownership.
368
+
369
+ Exhibit B - "Incompatible With Secondary Licenses" Notice
370
+ ---------------------------------------------------------
371
+
372
+ This Source Code Form is "Incompatible With Secondary Licenses", as
373
+ defined by the Mozilla Public License, v. 2.0.
submodules/TTS/MANIFEST.in ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include README.md
2
+ include LICENSE.txt
3
+ include requirements.*.txt
4
+ include *.cff
5
+ include requirements.txt
6
+ include TTS/VERSION
7
+ recursive-include TTS *.json
8
+ recursive-include TTS *.html
9
+ recursive-include TTS *.png
10
+ recursive-include TTS *.md
11
+ recursive-include TTS *.py
12
+ recursive-include TTS *.pyx
13
+ recursive-include images *.png
14
+ recursive-exclude tests *
15
+ prune tests*
submodules/TTS/Makefile ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DEFAULT_GOAL := help
2
+ .PHONY: test system-deps dev-deps deps style lint install help docs
3
+
4
+ help:
5
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
6
+
7
+ target_dirs := tests TTS notebooks recipes
8
+
9
+ test_all: ## run tests and don't stop on an error.
10
+ nose2 --with-coverage --coverage TTS tests
11
+ ./run_bash_tests.sh
12
+
13
+ test: ## run tests.
14
+ nose2 -F -v -B --with-coverage --coverage TTS tests
15
+
16
+ test_vocoder: ## run vocoder tests.
17
+ nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
18
+
19
+ test_tts: ## run tts tests.
20
+ nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
21
+
22
+ test_tts2: ## run tts tests.
23
+ nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
24
+
25
+ test_xtts:
26
+ nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
27
+
28
+ test_aux: ## run aux tests.
29
+ nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
30
+ ./run_bash_tests.sh
31
+
32
+ test_zoo: ## run zoo tests.
33
+ nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
34
+
35
+ inference_tests: ## run inference tests.
36
+ nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
37
+
38
+ data_tests: ## run data tests.
39
+ nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
40
+
41
+ test_text: ## run text tests.
42
+ nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
43
+
44
+ test_failed: ## only run tests failed the last time.
45
+ nose2 -F -v -B --with-coverage --coverage TTS tests
46
+
47
+ style: ## update code style.
48
+ black ${target_dirs}
49
+ isort ${target_dirs}
50
+
51
+ lint: ## run pylint linter.
52
+ pylint ${target_dirs}
53
+ black ${target_dirs} --check
54
+ isort ${target_dirs} --check-only
55
+
56
+ system-deps: ## install linux system deps
57
+ sudo apt-get install -y libsndfile1-dev
58
+
59
+ dev-deps: ## install development deps
60
+ pip install -r requirements.dev.txt
61
+
62
+ doc-deps: ## install docs dependencies
63
+ pip install -r docs/requirements.txt
64
+
65
+ build-docs: ## build the docs
66
+ cd docs && make clean && make build
67
+
68
+ hub-deps: ## install deps for torch hub use
69
+ pip install -r requirements.hub.txt
70
+
71
+ deps: ## install 🐸 requirements.
72
+ pip install -r requirements.txt
73
+
74
+ install: ## install 🐸 TTS for development.
75
+ pip install -e .[all]
76
+
77
+ docs: ## build the docs
78
+ $(MAKE) -C docs clean && $(MAKE) -C docs html
submodules/TTS/README.md ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## 🐸Coqui.ai News
3
+ - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
4
+ - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
5
+ - 📣 ⓍTTS can now stream with <200ms latency.
6
+ - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
7
+ - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
8
+ - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
9
+ - 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
10
+
11
+ <div align="center">
12
+ <img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
13
+
14
+ ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
15
+
16
+
17
+ **🐸TTS is a library for advanced Text-to-Speech generation.**
18
+
19
+ 🚀 Pretrained models in +1100 languages.
20
+
21
+ 🛠️ Tools for training new models and fine-tuning existing models in any language.
22
+
23
+ 📚 Utilities for dataset analysis and curation.
24
+ ______________________________________________________________________
25
+
26
+ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
27
+ [![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
28
+ [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
29
+ [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
30
+ [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts)
31
+ [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440)
32
+
33
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg)
34
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg)
35
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg)
36
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg)
37
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg)
38
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
39
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
40
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
41
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg)
42
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg)
43
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg)
44
+ [![Docs](<https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic>)](https://tts.readthedocs.io/en/latest/)
45
+
46
+ </div>
47
+
48
+ ______________________________________________________________________
49
+
50
+ ## 💬 Where to ask questions
51
+ Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.
52
+
53
+ | Type | Platforms |
54
+ | ------------------------------- | --------------------------------------- |
55
+ | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
56
+ | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
57
+ | 👩‍💻 **Usage Questions** | [GitHub Discussions] |
58
+ | 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
59
+
60
+ [github issue tracker]: https://github.com/coqui-ai/tts/issues
61
+ [github discussions]: https://github.com/coqui-ai/TTS/discussions
62
+ [discord]: https://discord.gg/5eXr5seRrv
63
+ [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
64
+
65
+
66
+ ## 🔗 Links and Resources
67
+ | Type | Links |
68
+ | ------------------------------- | --------------------------------------- |
69
+ | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)|
70
+ | 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
71
+ | 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
72
+ | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)|
73
+ | 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
74
+ | 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
75
+
76
+
77
+ ## 🥇 TTS Performance
78
+ <p align="center"><img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/TTS-performance.png" width="800" /></p>
79
+
80
+ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices.
81
+
82
+ ## Features
83
+ - High-performance Deep Learning models for Text2Speech tasks.
84
+ - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
85
+ - Speaker Encoder to compute speaker embeddings efficiently.
86
+ - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN)
87
+ - Fast and efficient model training.
88
+ - Detailed training logs on the terminal and Tensorboard.
89
+ - Support for Multi-speaker TTS.
90
+ - Efficient, flexible, lightweight but feature complete `Trainer API`.
91
+ - Released and ready-to-use models.
92
+ - Tools to curate Text2Speech datasets under ```dataset_analysis```.
93
+ - Utilities to use and test your models.
94
+ - Modular (but not too much) code base enabling easy implementation of new ideas.
95
+
96
+ ## Model Implementations
97
+ ### Spectrogram models
98
+ - Tacotron: [paper](https://arxiv.org/abs/1703.10135)
99
+ - Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
100
+ - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
101
+ - Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
102
+ - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
103
+ - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
104
+ - FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
105
+ - FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
106
+ - SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
107
+ - Capacitron: [paper](https://arxiv.org/abs/1906.03402)
108
+ - OverFlow: [paper](https://arxiv.org/abs/2211.06892)
109
+ - Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
110
+ - Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)
111
+
112
+ ### End-to-End Models
113
+ - ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts)
114
+ - VITS: [paper](https://arxiv.org/pdf/2106.06103)
115
+ - 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
116
+ - 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
117
+ - 🐶 Bark: [orig. repo](https://github.com/suno-ai/bark)
118
+
119
+ ### Attention Methods
120
+ - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
121
+ - Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
122
+ - Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
123
+ - Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
124
+ - Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
125
+ - Alignment Network: [paper](https://arxiv.org/abs/2108.10447)
126
+
127
+ ### Speaker Encoder
128
+ - GE2E: [paper](https://arxiv.org/abs/1710.10467)
129
+ - Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)
130
+
131
+ ### Vocoders
132
+ - MelGAN: [paper](https://arxiv.org/abs/1910.06711)
133
+ - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
134
+ - ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
135
+ - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
136
+ - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
137
+ - WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
138
+ - HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
139
+ - UnivNet: [paper](https://arxiv.org/abs/2106.07889)
140
+
141
+ ### Voice Conversion
142
+ - FreeVC: [paper](https://arxiv.org/abs/2210.15418)
143
+
144
+ You can also help us implement more models.
145
+
146
+ ## Installation
147
+ 🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12**.
148
+
149
+ If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
150
+
151
+ ```bash
152
+ pip install TTS
153
+ ```
154
+
155
+ If you plan to code or train models, clone 🐸TTS and install it locally.
156
+
157
+ ```bash
158
+ git clone https://github.com/coqui-ai/TTS
159
+ pip install -e .[all,dev,notebooks] # Select the relevant extras
160
+ ```
161
+
162
+ If you are on Ubuntu (Debian), you can also run the following commands for installation.
163
+
164
+ ```bash
165
+ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
166
+ $ make install
167
+ ```
168
+
169
+ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
170
+
171
+
172
+ ## Docker Image
173
+ You can also try TTS without installing it by using the Docker image.
174
+ Simply run the following command and you will be able to run TTS without installing it.
175
+
176
+ ```bash
177
+ docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
178
+ python3 TTS/server/server.py --list_models #To get the list of available models
179
+ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
180
+ ```
181
+
182
+ You can then enjoy the TTS server [here](http://[::1]:5002/)
183
+ More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
184
+
185
+
186
+ ## Synthesizing speech by 🐸TTS
187
+
188
+ ### 🐍 Python API
189
+
190
+ #### Running a multi-speaker and multi-lingual model
191
+
192
+ ```python
193
+ import torch
194
+ from TTS.api import TTS
195
+
196
+ # Get device
197
+ device = "cuda" if torch.cuda.is_available() else "cpu"
198
+
199
+ # List available 🐸TTS models
200
+ print(TTS().list_models())
201
+
202
+ # Init TTS
203
+ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
204
+
205
+ # Run TTS
206
+ # ❗ Since this model is a multi-lingual voice cloning model, we must set the target speaker_wav and language
207
+ # Text to speech, returning a list of amplitude values as output
208
+ wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
209
+ # Text to speech to a file
210
+ tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
211
+ ```
212
+
213
+ #### Running a single speaker model
214
+
215
+ ```python
216
+ # Init TTS with the target model name
217
+ tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device)
218
+
219
+ # Run TTS
220
+ tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
221
+
222
+ # Example voice cloning with YourTTS in English, French and Portuguese
223
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
224
+ tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
225
+ tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
226
+ tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
227
+ ```
228
+
229
+ #### Example voice conversion
230
+
231
+ Converting the voice in `source_wav` to the voice of `target_wav`
232
+
233
+ ```python
234
+ tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")
235
+ tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
236
+ ```
237
+
238
+ #### Example voice cloning together with the voice conversion model.
239
+ This way, you can clone voices by using any model in 🐸TTS.
240
+
241
+ ```python
242
+
243
+ tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
244
+ tts.tts_with_vc_to_file(
245
+ "Wie sage ich auf Italienisch, dass ich dich liebe?",
246
+ speaker_wav="target/speaker.wav",
247
+ file_path="output.wav"
248
+ )
249
+ ```
250
+
251
+ #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
252
+ For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
253
+ You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
254
+ and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
255
+
256
+ ```python
257
+ # TTS with on the fly voice conversion
258
+ api = TTS("tts_models/deu/fairseq/vits")
259
+ api.tts_with_vc_to_file(
260
+ "Wie sage ich auf Italienisch, dass ich dich liebe?",
261
+ speaker_wav="target/speaker.wav",
262
+ file_path="output.wav"
263
+ )
264
+ ```
265
+
266
+ ### Command-line `tts`
267
+
268
+ <!-- begin-tts-readme -->
269
+
270
+ Synthesize speech on command line.
271
+
272
+ You can either use your trained model or choose a model from the provided list.
273
+
274
+ If you don't specify any models, then it uses the LJSpeech-based English model.
275
+
276
+ #### Single Speaker Models
277
+
278
+ - List provided models:
279
+
280
+ ```
281
+ $ tts --list_models
282
+ ```
283
+
284
+ - Get model info (for both tts_models and vocoder_models):
285
+
286
+ - Query by type/name:
287
+ The model_info_by_name uses the name as it appears in the output of --list_models.
288
+ ```
289
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
290
+ ```
291
+ For example:
292
+ ```
293
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
294
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
295
+ ```
296
+ - Query by type/idx:
297
+ The model_query_idx uses the corresponding idx from --list_models.
298
+
299
+ ```
300
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
301
+ ```
302
+
303
+ For example:
304
+
305
+ ```
306
+ $ tts --model_info_by_idx tts_models/3
307
+ ```
308
+
309
+ - Query model info by full name:
310
+ ```
311
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
312
+ ```
313
+
314
+ - Run TTS with default models:
315
+
316
+ ```
317
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
318
+ ```
319
+
320
+ - Run TTS and pipe out the generated TTS wav file data:
321
+
322
+ ```
323
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
324
+ ```
325
+
326
+ - Run a TTS model with its default vocoder model:
327
+
328
+ ```
329
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
330
+ ```
331
+
332
+ For example:
333
+
334
+ ```
335
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
336
+ ```
337
+
338
+ - Run with specific TTS and vocoder models from the list:
339
+
340
+ ```
341
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
342
+ ```
343
+
344
+ For example:
345
+
346
+ ```
347
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
348
+ ```
349
+
350
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
351
+
352
+ ```
353
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
354
+ ```
355
+
356
+ - Run your own TTS and Vocoder models:
357
+
358
+ ```
359
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
360
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
361
+ ```
362
+
363
+ #### Multi-speaker Models
364
+
365
+ - List the available speakers and choose a <speaker_id> among them:
366
+
367
+ ```
368
+ $ tts --model_name "<model_type>/<language>/<dataset>/<model_name>" --list_speaker_idxs
369
+ ```
370
+
371
+ - Run the multi-speaker TTS model with the target speaker ID:
372
+
373
+ ```
374
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<model_type>/<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
375
+ ```
376
+
377
+ - Run your own multi-speaker TTS model:
378
+
379
+ ```
380
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
381
+ ```
382
+
383
+ ### Voice Conversion Models
384
+
385
+ ```
386
+ $ tts --out_path output/path/speech.wav --model_name "<model_type>/<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
387
+ ```
388
+
389
+ <!-- end-tts-readme -->
390
+
391
+ ## Directory Structure
392
+ ```
393
+ |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
394
+ |- utils/ (common utilities.)
395
+ |- TTS
396
+ |- bin/ (folder for all the executables.)
397
+ |- train*.py (train your target model.)
398
+ |- ...
399
+ |- tts/ (text to speech models)
400
+ |- layers/ (model layer definitions)
401
+ |- models/ (model definitions)
402
+ |- utils/ (model specific utilities.)
403
+ |- speaker_encoder/ (Speaker Encoder models.)
404
+ |- (same)
405
+ |- vocoder/ (Vocoder models.)
406
+ |- (same)
407
+ ```
submodules/TTS/TTS/.models.json ADDED
@@ -0,0 +1,938 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_models": {
3
+ "multilingual": {
4
+ "multi-dataset": {
5
+ "xtts_v2": {
6
+ "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
7
+ "hf_url": [
8
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
9
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
10
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
11
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
12
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
13
+ ],
14
+ "model_hash": "10f92b55c512af7a8d39d650547a15a7",
15
+ "default_vocoder": null,
16
+ "commit": "480a6cdf7",
17
+ "license": "CPML",
18
+ "contact": "[email protected]",
19
+ "tos_required": true
20
+ },
21
+ "xtts_v1.1": {
22
+ "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
23
+ "hf_url": [
24
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
25
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
26
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
27
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
28
+ ],
29
+ "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
30
+ "default_vocoder": null,
31
+ "commit": "82910a63",
32
+ "license": "CPML",
33
+ "contact": "[email protected]",
34
+ "tos_required": true
35
+ },
36
+ "your_tts": {
37
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
38
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
39
+ "default_vocoder": null,
40
+ "commit": "e9a1953e",
41
+ "license": "CC BY-NC-ND 4.0",
42
+ "contact": "[email protected]"
43
+ },
44
+ "bark": {
45
+ "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
46
+ "hf_url": [
47
+ "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
48
+ "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
49
+ "https://coqui.gateway.scarf.sh/hf/text_2.pt",
50
+ "https://coqui.gateway.scarf.sh/hf/bark/config.json",
51
+ "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
52
+ "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
53
+ ],
54
+ "default_vocoder": null,
55
+ "commit": "e9a1953e",
56
+ "license": "MIT",
57
+ "contact": "https://www.suno.ai/"
58
+ }
59
+ }
60
+ },
61
+ "bg": {
62
+ "cv": {
63
+ "vits": {
64
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
65
+ "default_vocoder": null,
66
+ "commit": null,
67
+ "author": "@NeonGeckoCom",
68
+ "license": "bsd-3-clause"
69
+ }
70
+ }
71
+ },
72
+ "cs": {
73
+ "cv": {
74
+ "vits": {
75
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
76
+ "default_vocoder": null,
77
+ "commit": null,
78
+ "author": "@NeonGeckoCom",
79
+ "license": "bsd-3-clause"
80
+ }
81
+ }
82
+ },
83
+ "da": {
84
+ "cv": {
85
+ "vits": {
86
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
87
+ "default_vocoder": null,
88
+ "commit": null,
89
+ "author": "@NeonGeckoCom",
90
+ "license": "bsd-3-clause"
91
+ }
92
+ }
93
+ },
94
+ "et": {
95
+ "cv": {
96
+ "vits": {
97
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
98
+ "default_vocoder": null,
99
+ "commit": null,
100
+ "author": "@NeonGeckoCom",
101
+ "license": "bsd-3-clause"
102
+ }
103
+ }
104
+ },
105
+ "ga": {
106
+ "cv": {
107
+ "vits": {
108
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
109
+ "default_vocoder": null,
110
+ "commit": null,
111
+ "author": "@NeonGeckoCom",
112
+ "license": "bsd-3-clause"
113
+ }
114
+ }
115
+ },
116
+ "en": {
117
+ "ek1": {
118
+ "tacotron2": {
119
+ "description": "EK1 en-rp tacotron2 by NMStoker",
120
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
121
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
122
+ "commit": "c802255",
123
+ "license": "apache 2.0"
124
+ }
125
+ },
126
+ "ljspeech": {
127
+ "tacotron2-DDC": {
128
+ "description": "Tacotron2 with Double Decoder Consistency.",
129
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
130
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
131
+ "commit": "bae2ad0f",
132
+ "author": "Eren Gölge @erogol",
133
+ "license": "apache 2.0",
134
+ "contact": "[email protected]"
135
+ },
136
+ "tacotron2-DDC_ph": {
137
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
138
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
139
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
140
+ "commit": "3900448",
141
+ "author": "Eren Gölge @erogol",
142
+ "license": "apache 2.0",
143
+ "contact": "[email protected]"
144
+ },
145
+ "glow-tts": {
146
+ "description": "",
147
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
148
+ "stats_file": null,
149
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
150
+ "commit": "",
151
+ "author": "Eren Gölge @erogol",
152
+ "license": "MPL",
153
+ "contact": "[email protected]"
154
+ },
155
+ "speedy-speech": {
156
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
157
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
158
+ "stats_file": null,
159
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
160
+ "commit": "4581e3d",
161
+ "author": "Eren Gölge @erogol",
162
+ "license": "apache 2.0",
163
+ "contact": "[email protected]"
164
+ },
165
+ "tacotron2-DCA": {
166
+ "description": "",
167
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
168
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
169
+ "commit": "",
170
+ "author": "Eren Gölge @erogol",
171
+ "license": "MPL",
172
+ "contact": "[email protected]"
173
+ },
174
+ "vits": {
175
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
176
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
177
+ "default_vocoder": null,
178
+ "commit": "3900448",
179
+ "author": "Eren Gölge @erogol",
180
+ "license": "apache 2.0",
181
+ "contact": "[email protected]"
182
+ },
183
+ "vits--neon": {
184
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
185
+ "default_vocoder": null,
186
+ "author": "@NeonGeckoCom",
187
+ "license": "bsd-3-clause",
188
+ "contact": null,
189
+ "commit": null
190
+ },
191
+ "fast_pitch": {
192
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
193
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
194
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
195
+ "commit": "b27b3ba",
196
+ "author": "Eren Gölge @erogol",
197
+ "license": "apache 2.0",
198
+ "contact": "[email protected]"
199
+ },
200
+ "overflow": {
201
+ "description": "Overflow model trained on LJSpeech",
202
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
203
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
204
+ "commit": "3b1a28f",
205
+ "author": "Eren Gölge @erogol",
206
+ "license": "apache 2.0",
207
+ "contact": "[email protected]"
208
+ },
209
+ "neural_hmm": {
210
+ "description": "Neural HMM model trained on LJSpeech",
211
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
212
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
213
+ "commit": "3b1a28f",
214
+ "author": "Shivam Mehta @shivammehta25",
215
+ "license": "apache 2.0",
216
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
217
+ }
218
+ },
219
+ "vctk": {
220
+ "vits": {
221
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
222
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
223
+ "default_vocoder": null,
224
+ "commit": "3900448",
225
+ "author": "Eren @erogol",
226
+ "license": "apache 2.0",
227
+ "contact": "[email protected]"
228
+ },
229
+ "fast_pitch": {
230
+ "description": "FastPitch model trained on VCTK dataset.",
231
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
232
+ "default_vocoder": null,
233
+ "commit": "bdab788d",
234
+ "author": "Eren @erogol",
235
+ "license": "CC BY-NC-ND 4.0",
236
+ "contact": "[email protected]"
237
+ }
238
+ },
239
+ "sam": {
240
+ "tacotron-DDC": {
241
+ "description": "Tacotron2 with Double Decoder Consistency trained with Accenture's Sam dataset.",
242
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
243
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
244
+ "commit": "bae2ad0f",
245
+ "author": "Eren Gölge @erogol",
246
+ "license": "apache 2.0",
247
+ "contact": "[email protected]"
248
+ }
249
+ },
250
+ "blizzard2013": {
251
+ "capacitron-t2-c50": {
252
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
253
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
254
+ "commit": "d6284e7",
255
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
256
+ "author": "Adam Froghyar @a-froghyar",
257
+ "license": "apache 2.0",
258
+ "contact": "[email protected]"
259
+ },
260
+ "capacitron-t2-c150_v2": {
261
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
262
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
263
+ "commit": "a67039d",
264
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
265
+ "author": "Adam Froghyar @a-froghyar",
266
+ "license": "apache 2.0",
267
+ "contact": "[email protected]"
268
+ }
269
+ },
270
+ "multi-dataset": {
271
+ "tortoise-v2": {
272
+ "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
273
+ "github_rls_url": [
274
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
275
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
276
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
277
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
278
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
279
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
280
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
281
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
282
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
283
+ ],
284
+ "commit": "c1875f6",
285
+ "default_vocoder": null,
286
+ "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
287
+ "license": "apache 2.0"
288
+ }
289
+ },
290
+ "jenny": {
291
+ "jenny": {
292
+ "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
293
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
294
+ "default_vocoder": null,
295
+ "commit": "ba40a1c",
296
+ "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
297
+ "author": "@noml4u"
298
+ }
299
+ }
300
+ },
301
+ "es": {
302
+ "mai": {
303
+ "tacotron2-DDC": {
304
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
305
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
306
+ "commit": "",
307
+ "author": "Eren Gölge @erogol",
308
+ "license": "MPL",
309
+ "contact": "[email protected]"
310
+ }
311
+ },
312
+ "css10": {
313
+ "vits": {
314
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
315
+ "default_vocoder": null,
316
+ "commit": null,
317
+ "author": "@NeonGeckoCom",
318
+ "license": "bsd-3-clause"
319
+ }
320
+ }
321
+ },
322
+ "fr": {
323
+ "mai": {
324
+ "tacotron2-DDC": {
325
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
326
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
327
+ "commit": null,
328
+ "author": "Eren Gölge @erogol",
329
+ "license": "MPL",
330
+ "contact": "[email protected]"
331
+ }
332
+ },
333
+ "css10": {
334
+ "vits": {
335
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
336
+ "default_vocoder": null,
337
+ "commit": null,
338
+ "author": "@NeonGeckoCom",
339
+ "license": "bsd-3-clause"
340
+ }
341
+ }
342
+ },
343
+ "uk": {
344
+ "mai": {
345
+ "glow-tts": {
346
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
347
+ "author": "@robinhad",
348
+ "commit": "bdab788d",
349
+ "license": "MIT",
350
+ "contact": "",
351
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
352
+ },
353
+ "vits": {
354
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
355
+ "default_vocoder": null,
356
+ "commit": null,
357
+ "author": "@NeonGeckoCom",
358
+ "license": "bsd-3-clause"
359
+ }
360
+ }
361
+ },
362
+ "zh-CN": {
363
+ "baker": {
364
+ "tacotron2-DDC-GST": {
365
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
366
+ "commit": "unknown",
367
+ "author": "@kirianguiller",
368
+ "license": "apache 2.0",
369
+ "default_vocoder": null
370
+ }
371
+ }
372
+ },
373
+ "nl": {
374
+ "mai": {
375
+ "tacotron2-DDC": {
376
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
377
+ "author": "@r-dh",
378
+ "license": "apache 2.0",
379
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
380
+ "stats_file": null,
381
+ "commit": "540d811"
382
+ }
383
+ },
384
+ "css10": {
385
+ "vits": {
386
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
387
+ "default_vocoder": null,
388
+ "commit": null,
389
+ "author": "@NeonGeckoCom",
390
+ "license": "bsd-3-clause"
391
+ }
392
+ }
393
+ },
394
+ "de": {
395
+ "thorsten": {
396
+ "tacotron2-DCA": {
397
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
398
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
399
+ "author": "@thorstenMueller",
400
+ "license": "apache 2.0",
401
+ "commit": "unknown"
402
+ },
403
+ "vits": {
404
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
405
+ "default_vocoder": null,
406
+ "author": "@thorstenMueller",
407
+ "license": "apache 2.0",
408
+ "commit": "unknown"
409
+ },
410
+ "tacotron2-DDC": {
411
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
412
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
413
+ "description": "Thorsten-Dec2021-22k-DDC",
414
+ "author": "@thorstenMueller",
415
+ "license": "apache 2.0",
416
+ "commit": "unknown"
417
+ }
418
+ },
419
+ "css10": {
420
+ "vits-neon": {
421
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
422
+ "default_vocoder": null,
423
+ "author": "@NeonGeckoCom",
424
+ "license": "bsd-3-clause",
425
+ "commit": null
426
+ }
427
+ }
428
+ },
429
+ "ja": {
430
+ "kokoro": {
431
+ "tacotron2-DDC": {
432
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
433
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
434
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
435
+ "author": "@kaiidams",
436
+ "license": "apache 2.0",
437
+ "commit": "401fbd89"
438
+ }
439
+ }
440
+ },
441
+ "tr": {
442
+ "common-voice": {
443
+ "glow-tts": {
444
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
445
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
446
+ "license": "MIT",
447
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
448
+ "author": "Fatih Akademi",
449
+ "commit": null
450
+ }
451
+ }
452
+ },
453
+ "it": {
454
+ "mai_female": {
455
+ "glow-tts": {
456
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
457
+ "default_vocoder": null,
458
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
459
+ "author": "@nicolalandro",
460
+ "license": "apache 2.0",
461
+ "commit": null
462
+ },
463
+ "vits": {
464
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
465
+ "default_vocoder": null,
466
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
467
+ "author": "@nicolalandro",
468
+ "license": "apache 2.0",
469
+ "commit": null
470
+ }
471
+ },
472
+ "mai_male": {
473
+ "glow-tts": {
474
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
475
+ "default_vocoder": null,
476
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
477
+ "author": "@nicolalandro",
478
+ "license": "apache 2.0",
479
+ "commit": null
480
+ },
481
+ "vits": {
482
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
483
+ "default_vocoder": null,
484
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
485
+ "author": "@nicolalandro",
486
+ "license": "apache 2.0",
487
+ "commit": null
488
+ }
489
+ }
490
+ },
491
+ "ewe": {
492
+ "openbible": {
493
+ "vits": {
494
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
495
+ "default_vocoder": null,
496
+ "license": "CC-BY-SA 4.0",
497
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
498
+ "author": "@coqui_ai",
499
+ "commit": "1b22f03"
500
+ }
501
+ }
502
+ },
503
+ "hau": {
504
+ "openbible": {
505
+ "vits": {
506
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
507
+ "default_vocoder": null,
508
+ "license": "CC-BY-SA 4.0",
509
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
510
+ "author": "@coqui_ai",
511
+ "commit": "1b22f03"
512
+ }
513
+ }
514
+ },
515
+ "lin": {
516
+ "openbible": {
517
+ "vits": {
518
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
519
+ "default_vocoder": null,
520
+ "license": "CC-BY-SA 4.0",
521
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
522
+ "author": "@coqui_ai",
523
+ "commit": "1b22f03"
524
+ }
525
+ }
526
+ },
527
+ "tw_akuapem": {
528
+ "openbible": {
529
+ "vits": {
530
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
531
+ "default_vocoder": null,
532
+ "license": "CC-BY-SA 4.0",
533
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
534
+ "author": "@coqui_ai",
535
+ "commit": "1b22f03"
536
+ }
537
+ }
538
+ },
539
+ "tw_asante": {
540
+ "openbible": {
541
+ "vits": {
542
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
543
+ "default_vocoder": null,
544
+ "license": "CC-BY-SA 4.0",
545
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
546
+ "author": "@coqui_ai",
547
+ "commit": "1b22f03"
548
+ }
549
+ }
550
+ },
551
+ "yor": {
552
+ "openbible": {
553
+ "vits": {
554
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
555
+ "default_vocoder": null,
556
+ "license": "CC-BY-SA 4.0",
557
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
558
+ "author": "@coqui_ai",
559
+ "commit": "1b22f03"
560
+ }
561
+ }
562
+ },
563
+ "hu": {
564
+ "css10": {
565
+ "vits": {
566
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
567
+ "default_vocoder": null,
568
+ "commit": null,
569
+ "author": "@NeonGeckoCom",
570
+ "license": "bsd-3-clause"
571
+ }
572
+ }
573
+ },
574
+ "el": {
575
+ "cv": {
576
+ "vits": {
577
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
578
+ "default_vocoder": null,
579
+ "commit": null,
580
+ "author": "@NeonGeckoCom",
581
+ "license": "bsd-3-clause"
582
+ }
583
+ }
584
+ },
585
+ "fi": {
586
+ "css10": {
587
+ "vits": {
588
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
589
+ "default_vocoder": null,
590
+ "commit": null,
591
+ "author": "@NeonGeckoCom",
592
+ "license": "bsd-3-clause"
593
+ }
594
+ }
595
+ },
596
+ "hr": {
597
+ "cv": {
598
+ "vits": {
599
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
600
+ "default_vocoder": null,
601
+ "commit": null,
602
+ "author": "@NeonGeckoCom",
603
+ "license": "bsd-3-clause"
604
+ }
605
+ }
606
+ },
607
+ "lt": {
608
+ "cv": {
609
+ "vits": {
610
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
611
+ "default_vocoder": null,
612
+ "commit": null,
613
+ "author": "@NeonGeckoCom",
614
+ "license": "bsd-3-clause"
615
+ }
616
+ }
617
+ },
618
+ "lv": {
619
+ "cv": {
620
+ "vits": {
621
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
622
+ "default_vocoder": null,
623
+ "commit": null,
624
+ "author": "@NeonGeckoCom",
625
+ "license": "bsd-3-clause"
626
+ }
627
+ }
628
+ },
629
+ "mt": {
630
+ "cv": {
631
+ "vits": {
632
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
633
+ "default_vocoder": null,
634
+ "commit": null,
635
+ "author": "@NeonGeckoCom",
636
+ "license": "bsd-3-clause"
637
+ }
638
+ }
639
+ },
640
+ "pl": {
641
+ "mai_female": {
642
+ "vits": {
643
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
644
+ "default_vocoder": null,
645
+ "commit": null,
646
+ "author": "@NeonGeckoCom",
647
+ "license": "bsd-3-clause"
648
+ }
649
+ }
650
+ },
651
+ "pt": {
652
+ "cv": {
653
+ "vits": {
654
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
655
+ "default_vocoder": null,
656
+ "commit": null,
657
+ "author": "@NeonGeckoCom",
658
+ "license": "bsd-3-clause"
659
+ }
660
+ }
661
+ },
662
+ "ro": {
663
+ "cv": {
664
+ "vits": {
665
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
666
+ "default_vocoder": null,
667
+ "commit": null,
668
+ "author": "@NeonGeckoCom",
669
+ "license": "bsd-3-clause"
670
+ }
671
+ }
672
+ },
673
+ "sk": {
674
+ "cv": {
675
+ "vits": {
676
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
677
+ "default_vocoder": null,
678
+ "commit": null,
679
+ "author": "@NeonGeckoCom",
680
+ "license": "bsd-3-clause"
681
+ }
682
+ }
683
+ },
684
+ "sl": {
685
+ "cv": {
686
+ "vits": {
687
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
688
+ "default_vocoder": null,
689
+ "commit": null,
690
+ "author": "@NeonGeckoCom",
691
+ "license": "bsd-3-clause"
692
+ }
693
+ }
694
+ },
695
+ "sv": {
696
+ "cv": {
697
+ "vits": {
698
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
699
+ "default_vocoder": null,
700
+ "commit": null,
701
+ "author": "@NeonGeckoCom",
702
+ "license": "bsd-3-clause"
703
+ }
704
+ }
705
+ },
706
+ "ca": {
707
+ "custom": {
708
+ "vits": {
709
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
710
+ "default_vocoder": null,
711
+ "commit": null,
712
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
713
+ "author": "@gullabi",
714
+ "license": "CC-BY-4.0"
715
+ }
716
+ }
717
+ },
718
+ "fa": {
719
+ "custom": {
720
+ "glow-tts": {
721
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
722
+ "default_vocoder": null,
723
+ "commit": null,
724
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
725
+ "author": "@karim23657",
726
+ "license": "CC-BY-4.0"
727
+ }
728
+ }
729
+ },
730
+ "bn": {
731
+ "custom": {
732
+ "vits-male": {
733
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
734
+ "default_vocoder": null,
735
+ "commit": null,
736
+ "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
737
+ "author": "@mobassir94",
738
+ "license": "Apache 2.0"
739
+ },
740
+ "vits-female": {
741
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
742
+ "default_vocoder": null,
743
+ "commit": null,
744
+ "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
745
+ "author": "@mobassir94",
746
+ "license": "Apache 2.0"
747
+ }
748
+ }
749
+ },
750
+ "be": {
751
+ "common-voice": {
752
+ "glow-tts":{
753
+ "description": "Belarusian GlowTTS model created by @alex73 (Github).",
754
+ "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
755
+ "default_vocoder": "vocoder_models/be/common-voice/hifigan",
756
+ "commit": "c0aabb85",
757
+ "license": "CC-BY-SA 4.0",
758
+ "contact": "[email protected]"
759
+ }
760
+ }
761
+ }
762
+ },
763
+ "vocoder_models": {
764
+ "universal": {
765
+ "libri-tts": {
766
+ "wavegrad": {
767
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
768
+ "commit": "ea976b0",
769
+ "author": "Eren Gölge @erogol",
770
+ "license": "MPL",
771
+ "contact": "[email protected]"
772
+ },
773
+ "fullband-melgan": {
774
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
775
+ "commit": "4132240",
776
+ "author": "Eren Gölge @erogol",
777
+ "license": "MPL",
778
+ "contact": "[email protected]"
779
+ }
780
+ }
781
+ },
782
+ "en": {
783
+ "ek1": {
784
+ "wavegrad": {
785
+ "description": "EK1 en-rp wavegrad by NMStoker",
786
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
787
+ "commit": "c802255",
788
+ "license": "apache 2.0"
789
+ }
790
+ },
791
+ "ljspeech": {
792
+ "multiband-melgan": {
793
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
794
+ "commit": "ea976b0",
795
+ "author": "Eren Gölge @erogol",
796
+ "license": "MPL",
797
+ "contact": "[email protected]"
798
+ },
799
+ "hifigan_v2": {
800
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
801
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
802
+ "commit": "bae2ad0f",
803
+ "author": "@erogol",
804
+ "license": "apache 2.0",
805
+ "contact": "[email protected]"
806
+ },
807
+ "univnet": {
808
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
809
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
810
+ "commit": "4581e3d",
811
+ "author": "Eren @erogol",
812
+ "license": "apache 2.0",
813
+ "contact": "[email protected]"
814
+ }
815
+ },
816
+ "blizzard2013": {
817
+ "hifigan_v2": {
818
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
819
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
820
+ "commit": "d6284e7",
821
+ "author": "Adam Froghyar @a-froghyar",
822
+ "license": "apache 2.0",
823
+ "contact": "[email protected]"
824
+ }
825
+ },
826
+ "vctk": {
827
+ "hifigan_v2": {
828
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
829
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
830
+ "commit": "2f07160",
831
+ "author": "Edresson Casanova",
832
+ "license": "apache 2.0",
833
+ "contact": ""
834
+ }
835
+ },
836
+ "sam": {
837
+ "hifigan_v2": {
838
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron-DDC",
839
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
840
+ "commit": "2f07160",
841
+ "author": "Eren Gölge @erogol",
842
+ "license": "apache 2.0",
843
+ "contact": "[email protected]"
844
+ }
845
+ }
846
+ },
847
+ "nl": {
848
+ "mai": {
849
+ "parallel-wavegan": {
850
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
851
+ "author": "@r-dh",
852
+ "license": "apache 2.0",
853
+ "commit": "unknown"
854
+ }
855
+ }
856
+ },
857
+ "de": {
858
+ "thorsten": {
859
+ "wavegrad": {
860
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
861
+ "author": "@thorstenMueller",
862
+ "license": "apache 2.0",
863
+ "commit": "unknown"
864
+ },
865
+ "fullband-melgan": {
866
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
867
+ "author": "@thorstenMueller",
868
+ "license": "apache 2.0",
869
+ "commit": "unknown"
870
+ },
871
+ "hifigan_v1": {
872
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
873
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
874
+ "author": "@thorstenMueller",
875
+ "license": "apache 2.0",
876
+ "commit": "unknown"
877
+ }
878
+ }
879
+ },
880
+ "ja": {
881
+ "kokoro": {
882
+ "hifigan_v1": {
883
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
884
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
885
+ "author": "@kaiidams",
886
+ "license": "apache 2.0",
887
+ "commit": "3900448"
888
+ }
889
+ }
890
+ },
891
+ "uk": {
892
+ "mai": {
893
+ "multiband-melgan": {
894
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
895
+ "author": "@robinhad",
896
+ "commit": "bdab788d",
897
+ "license": "MIT",
898
+ "contact": ""
899
+ }
900
+ }
901
+ },
902
+ "tr": {
903
+ "common-voice": {
904
+ "hifigan": {
905
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
906
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
907
+ "author": "Fatih Akademi",
908
+ "license": "MIT",
909
+ "commit": null
910
+ }
911
+ }
912
+ },
913
+ "be": {
914
+ "common-voice": {
915
+ "hifigan": {
916
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
917
+ "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
918
+ "author": "@alex73",
919
+ "license": "CC-BY-SA 4.0",
920
+ "commit": "c0aabb85"
921
+ }
922
+ }
923
+ }
924
+ },
925
+ "voice_conversion_models": {
926
+ "multilingual": {
927
+ "vctk": {
928
+ "freevc24": {
929
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
930
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
931
+ "author": "Jing-Yi Li @OlaWod",
932
+ "license": "MIT",
933
+ "commit": null
934
+ }
935
+ }
936
+ }
937
+ }
938
+ }
submodules/TTS/TTS/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.22.0
submodules/TTS/TTS/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import os
2
+
3
# The release number lives in the plain-text VERSION file stored next to this
# module; resolve it relative to __file__ so the lookup does not depend on the
# current working directory.
with open(
    os.path.join(os.path.dirname(__file__), "VERSION"),
    "r",
    encoding="utf-8",
) as f:
    # Both names are bound to the same stripped string; `version` is kept
    # as a module attribute alongside `__version__`.
    version = __version__ = f.read().strip()
submodules/TTS/TTS/api.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import warnings
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ from torch import nn
8
+
9
+ from TTS.utils.audio.numpy_transforms import save_wav
10
+ from TTS.utils.manage import ModelManager
11
+ from TTS.utils.synthesizer import Synthesizer
12
+ from TTS.config import load_config
13
+
14
+
15
+ class TTS(nn.Module):
16
+ """TODO: Add voice conversion and Capacitron support."""
17
+
18
+ def __init__(
19
+ self,
20
+ model_name: str = "",
21
+ model_path: str = None,
22
+ config_path: str = None,
23
+ vocoder_path: str = None,
24
+ vocoder_config_path: str = None,
25
+ progress_bar: bool = True,
26
+ gpu=False,
27
+ ):
28
+ """🐸TTS python interface that allows to load and use the released models.
29
+
30
+ Example with a multi-speaker model:
31
+ >>> from TTS.api import TTS
32
+ >>> tts = TTS(TTS.list_models()[0])
33
+ >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
34
+ >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
35
+
36
+ Example with a single-speaker model:
37
+ >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
38
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
39
+
40
+ Example loading a model from a path:
41
+ >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
42
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
43
+
44
+ Example voice cloning with YourTTS in English, French and Portuguese:
45
+ >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
46
+ >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
47
+ >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
48
+ >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
49
+
50
+ Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
51
+ >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
52
+ >>> tts.tts_to_file("This is a test.", file_path="output.wav")
53
+
54
+ Args:
55
+ model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
56
+ model_path (str, optional): Path to the model checkpoint. Defaults to None.
57
+ config_path (str, optional): Path to the model config. Defaults to None.
58
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
59
+ vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
60
+ progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
61
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
62
+ """
63
+ super().__init__()
64
+ self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
65
+ self.config = load_config(config_path) if config_path else None
66
+ self.synthesizer = None
67
+ self.voice_converter = None
68
+ self.model_name = ""
69
+ if gpu:
70
+ warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
71
+
72
+ if model_name is not None and len(model_name) > 0:
73
+ if "tts_models" in model_name:
74
+ self.load_tts_model_by_name(model_name, gpu)
75
+ elif "voice_conversion_models" in model_name:
76
+ self.load_vc_model_by_name(model_name, gpu)
77
+ else:
78
+ self.load_model_by_name(model_name, gpu)
79
+
80
+ if model_path:
81
+ self.load_tts_model_by_path(
82
+ model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
83
+ )
84
+
85
+ @property
86
+ def models(self):
87
+ return self.manager.list_tts_models()
88
+
89
+ @property
90
+ def is_multi_speaker(self):
91
+ if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
92
+ return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
93
+ return False
94
+
95
+ @property
96
+ def is_multi_lingual(self):
97
+ # Not sure what sets this to None, but applied a fix to prevent crashing.
98
+ if (
99
+ isinstance(self.model_name, str)
100
+ and "xtts" in self.model_name
101
+ or self.config
102
+ and ("xtts" in self.config.model or len(self.config.languages) > 1)
103
+ ):
104
+ return True
105
+ if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
106
+ return self.synthesizer.tts_model.language_manager.num_languages > 1
107
+ return False
108
+
109
+ @property
110
+ def speakers(self):
111
+ if not self.is_multi_speaker:
112
+ return None
113
+ return self.synthesizer.tts_model.speaker_manager.speaker_names
114
+
115
+ @property
116
+ def languages(self):
117
+ if not self.is_multi_lingual:
118
+ return None
119
+ return self.synthesizer.tts_model.language_manager.language_names
120
+
121
+ @staticmethod
122
+ def get_models_file_path():
123
+ return Path(__file__).parent / ".models.json"
124
+
125
+ def list_models(self):
126
+ return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
127
+
128
+ def download_model_by_name(self, model_name: str):
129
+ model_path, config_path, model_item = self.manager.download_model(model_name)
130
+ if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
131
+ # return model directory if there are multiple files
132
+ # we assume that the model knows how to load itself
133
+ return None, None, None, None, model_path
134
+ if model_item.get("default_vocoder") is None:
135
+ return model_path, config_path, None, None, None
136
+ vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
137
+ return model_path, config_path, vocoder_path, vocoder_config_path, None
138
+
139
+ def load_model_by_name(self, model_name: str, gpu: bool = False):
140
+ """Load one of the 🐸TTS models by name.
141
+
142
+ Args:
143
+ model_name (str): Model name to load. You can list models by ```tts.models```.
144
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
145
+ """
146
+ self.load_tts_model_by_name(model_name, gpu)
147
+
148
+ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
149
+ """Load one of the voice conversion models by name.
150
+
151
+ Args:
152
+ model_name (str): Model name to load. You can list models by ```tts.models```.
153
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
154
+ """
155
+ self.model_name = model_name
156
+ model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
157
+ self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
158
+
159
+ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
160
+ """Load one of 🐸TTS models by name.
161
+
162
+ Args:
163
+ model_name (str): Model name to load. You can list models by ```tts.models```.
164
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
165
+
166
+ TODO: Add tests
167
+ """
168
+ self.synthesizer = None
169
+ self.model_name = model_name
170
+
171
+ model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
172
+ model_name
173
+ )
174
+
175
+ # init synthesizer
176
+ # None values are fetch from the model
177
+ self.synthesizer = Synthesizer(
178
+ tts_checkpoint=model_path,
179
+ tts_config_path=config_path,
180
+ tts_speakers_file=None,
181
+ tts_languages_file=None,
182
+ vocoder_checkpoint=vocoder_path,
183
+ vocoder_config=vocoder_config_path,
184
+ encoder_checkpoint=None,
185
+ encoder_config=None,
186
+ model_dir=model_dir,
187
+ use_cuda=gpu,
188
+ )
189
+
190
+ def load_tts_model_by_path(
191
+ self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
192
+ ):
193
+ """Load a model from a path.
194
+
195
+ Args:
196
+ model_path (str): Path to the model checkpoint.
197
+ config_path (str): Path to the model config.
198
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
199
+ vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
200
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
201
+ """
202
+
203
+ self.synthesizer = Synthesizer(
204
+ tts_checkpoint=model_path,
205
+ tts_config_path=config_path,
206
+ tts_speakers_file=None,
207
+ tts_languages_file=None,
208
+ vocoder_checkpoint=vocoder_path,
209
+ vocoder_config=vocoder_config,
210
+ encoder_checkpoint=None,
211
+ encoder_config=None,
212
+ use_cuda=gpu,
213
+ )
214
+
215
+ def _check_arguments(
216
+ self,
217
+ speaker: str = None,
218
+ language: str = None,
219
+ speaker_wav: str = None,
220
+ emotion: str = None,
221
+ speed: float = None,
222
+ **kwargs,
223
+ ) -> None:
224
+ """Check if the arguments are valid for the model."""
225
+ # check for the coqui tts models
226
+ if self.is_multi_speaker and (speaker is None and speaker_wav is None):
227
+ raise ValueError("Model is multi-speaker but no `speaker` is provided.")
228
+ if self.is_multi_lingual and language is None:
229
+ raise ValueError("Model is multi-lingual but no `language` is provided.")
230
+ if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
231
+ raise ValueError("Model is not multi-speaker but `speaker` is provided.")
232
+ if not self.is_multi_lingual and language is not None:
233
+ raise ValueError("Model is not multi-lingual but `language` is provided.")
234
+ if not emotion is None and not speed is None:
235
+ raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
236
+
237
+ def tts(
238
+ self,
239
+ text: str,
240
+ speaker: str = None,
241
+ language: str = None,
242
+ speaker_wav: str = None,
243
+ emotion: str = None,
244
+ speed: float = None,
245
+ split_sentences: bool = True,
246
+ **kwargs,
247
+ ):
248
+ """Convert text to speech.
249
+
250
+ Args:
251
+ text (str):
252
+ Input text to synthesize.
253
+ speaker (str, optional):
254
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
255
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
256
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
257
+ supported by `XTTS` model.
258
+ speaker_wav (str, optional):
259
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
260
+ Defaults to None.
261
+ emotion (str, optional):
262
+ Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
263
+ speed (float, optional):
264
+ Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
265
+ Defaults to None.
266
+ split_sentences (bool, optional):
267
+ Split text into sentences, synthesize them separately and concatenate the file audio.
268
+ Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
269
+ applicable to the 🐸TTS models. Defaults to True.
270
+ kwargs (dict, optional):
271
+ Additional arguments for the model.
272
+ """
273
+ self._check_arguments(
274
+ speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
275
+ )
276
+ wav = self.synthesizer.tts(
277
+ text=text,
278
+ speaker_name=speaker,
279
+ language_name=language,
280
+ speaker_wav=speaker_wav,
281
+ reference_wav=None,
282
+ style_wav=None,
283
+ style_text=None,
284
+ reference_speaker_name=None,
285
+ split_sentences=split_sentences,
286
+ **kwargs,
287
+ )
288
+ return wav
289
+
290
+ def tts_to_file(
291
+ self,
292
+ text: str,
293
+ speaker: str = None,
294
+ language: str = None,
295
+ speaker_wav: str = None,
296
+ emotion: str = None,
297
+ speed: float = 1.0,
298
+ pipe_out=None,
299
+ file_path: str = "output.wav",
300
+ split_sentences: bool = True,
301
+ **kwargs,
302
+ ):
303
+ """Convert text to speech.
304
+
305
+ Args:
306
+ text (str):
307
+ Input text to synthesize.
308
+ speaker (str, optional):
309
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
310
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
311
+ language (str, optional):
312
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
313
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
314
+ speaker_wav (str, optional):
315
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
316
+ Defaults to None.
317
+ emotion (str, optional):
318
+ Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
319
+ speed (float, optional):
320
+ Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
321
+ pipe_out (BytesIO, optional):
322
+ Flag to stdout the generated TTS wav file for shell pipe.
323
+ file_path (str, optional):
324
+ Output file path. Defaults to "output.wav".
325
+ split_sentences (bool, optional):
326
+ Split text into sentences, synthesize them separately and concatenate the file audio.
327
+ Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
328
+ applicable to the 🐸TTS models. Defaults to True.
329
+ kwargs (dict, optional):
330
+ Additional arguments for the model.
331
+ """
332
+ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
333
+
334
+ wav = self.tts(
335
+ text=text,
336
+ speaker=speaker,
337
+ language=language,
338
+ speaker_wav=speaker_wav,
339
+ split_sentences=split_sentences,
340
+ **kwargs,
341
+ )
342
+ self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
343
+ return file_path
344
+
345
+ def voice_conversion(
346
+ self,
347
+ source_wav: str,
348
+ target_wav: str,
349
+ ):
350
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
351
+
352
+ Args:``
353
+ source_wav (str):
354
+ Path to the source wav file.
355
+ target_wav (str):`
356
+ Path to the target wav file.
357
+ """
358
+ wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
359
+ return wav
360
+
361
+ def voice_conversion_to_file(
362
+ self,
363
+ source_wav: str,
364
+ target_wav: str,
365
+ file_path: str = "output.wav",
366
+ ):
367
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
368
+
369
+ Args:
370
+ source_wav (str):
371
+ Path to the source wav file.
372
+ target_wav (str):
373
+ Path to the target wav file.
374
+ file_path (str, optional):
375
+ Output file path. Defaults to "output.wav".
376
+ """
377
+ wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
378
+ save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
379
+ return file_path
380
+
381
+ def tts_with_vc(
382
+ self,
383
+ text: str,
384
+ language: str = None,
385
+ speaker_wav: str = None,
386
+ speaker: str = None,
387
+ split_sentences: bool = True,
388
+ ):
389
+ """Convert text to speech with voice conversion.
390
+
391
+ It combines tts with voice conversion to fake voice cloning.
392
+
393
+ - Convert text to speech with tts.
394
+ - Convert the output wav to target speaker with voice conversion.
395
+
396
+ Args:
397
+ text (str):
398
+ Input text to synthesize.
399
+ language (str, optional):
400
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
401
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
402
+ speaker_wav (str, optional):
403
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
404
+ Defaults to None.
405
+ speaker (str, optional):
406
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
407
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
408
+ split_sentences (bool, optional):
409
+ Split text into sentences, synthesize them separately and concatenate the file audio.
410
+ Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
411
+ applicable to the 🐸TTS models. Defaults to True.
412
+ """
413
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
414
+ # Lazy code... save it to a temp file to resample it while reading it for VC
415
+ self.tts_to_file(
416
+ text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
417
+ )
418
+ if self.voice_converter is None:
419
+ self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
420
+ wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
421
+ return wav
422
+
423
+ def tts_with_vc_to_file(
424
+ self,
425
+ text: str,
426
+ language: str = None,
427
+ speaker_wav: str = None,
428
+ file_path: str = "output.wav",
429
+ speaker: str = None,
430
+ split_sentences: bool = True,
431
+ ):
432
+ """Convert text to speech with voice conversion and save to file.
433
+
434
+ Check `tts_with_vc` for more details.
435
+
436
+ Args:
437
+ text (str):
438
+ Input text to synthesize.
439
+ language (str, optional):
440
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
441
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
442
+ speaker_wav (str, optional):
443
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
444
+ Defaults to None.
445
+ file_path (str, optional):
446
+ Output file path. Defaults to "output.wav".
447
+ speaker (str, optional):
448
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
449
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
450
+ split_sentences (bool, optional):
451
+ Split text into sentences, synthesize them separately and concatenate the file audio.
452
+ Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
453
+ applicable to the 🐸TTS models. Defaults to True.
454
+ """
455
+ wav = self.tts_with_vc(
456
+ text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
457
+ )
458
+ save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
submodules/TTS/TTS/bin/__init__.py ADDED
File without changes
submodules/TTS/TTS/bin/collect_env_info.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Get detailed info about the working environment."""
2
+ import os
3
+ import platform
4
+ import sys
5
+
6
+ import numpy
7
+ import torch
8
+
9
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
+ import json
11
+
12
+ import TTS
13
+
14
+
15
def system_info():
    """Describe the host: OS name, architecture, OS version, processor and Python version."""
    host = {}
    host["OS"] = platform.system()
    host["architecture"] = platform.architecture()
    host["version"] = platform.version()
    host["processor"] = platform.processor()
    host["python"] = platform.python_version()
    return host
23
+
24
+
25
def cuda_info():
    """Report the visible CUDA devices, whether CUDA is available, and torch's CUDA version."""
    device_names = []
    for device_index in range(torch.cuda.device_count()):
        device_names.append(torch.cuda.get_device_name(device_index))
    cuda_available = torch.cuda.is_available()
    cuda_version = torch.version.cuda
    return {"GPU": device_names, "available": cuda_available, "version": cuda_version}
31
+
32
+
33
def package_info():
    """Collect the versions of numpy, PyTorch (plus its debug flag) and TTS."""
    versions = {}
    versions["numpy"] = numpy.__version__
    versions["PyTorch_version"] = torch.__version__
    versions["PyTorch_debug"] = torch.version.debug
    versions["TTS"] = TTS.__version__
    return versions
40
+
41
+
42
def main():
    """Print the system, CUDA and package details as key-sorted, indented JSON."""
    report = {}
    report["System"] = system_info()
    report["CUDA"] = cuda_info()
    report["Packages"] = package_info()
    rendered = json.dumps(report, indent=4, sort_keys=True)
    print(rendered)


# Only produce the report when executed as a script, not on import.
if __name__ == "__main__":
    main()
submodules/TTS/TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import importlib
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+ from tqdm import tqdm
10
+
11
+ from TTS.config import load_config
12
+ from TTS.tts.datasets.TTSDataset import TTSDataset
13
+ from TTS.tts.models import setup_model
14
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.io import load_checkpoint
17
+
18
def str2bool(value):
    """Parse a command-line boolean.

    `argparse` with `type=bool` treats every non-empty string (including
    "False") as True, so flags that take a value need an explicit parser.

    Args:
        value: A bool, or a string such as "true"/"false", "1"/"0", "yes"/"no".

    Returns:
        bool: The parsed value.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("0", "false", "f", "no", "n", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
        """Each attention mask is written next to the input wav file with the "_attn.npy" suffix.
(e.g. path/bla.wav (wav file) --> path/bla_attn.npy (attention mask))\n"""
        """
Example run:
    CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
        --dataset_metafile metadata.csv
        --data_path /root/LJSpeech-1.1/
        --batch_size 32
        --dataset ljspeech
        --use_cuda True
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to Tacotron/Tacotron2 config file.",
    )
    # Required arguments need no default: argparse exits before it would be used.
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Target dataset formatter name from TTS.tts.datasets.formatters.",
    )

    parser.add_argument(
        "--dataset_metafile",
        type=str,
        required=True,
        help="Dataset metafile inclusing file paths with transcripts.",
    )
    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
    # `type=bool` would turn "--use_cuda False" into True; parse the text instead.
    parser.add_argument("--use_cuda", type=str2bool, default=False, help="enable/disable cuda.")

    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
    )
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    # NOTE(review): `num_chars` is computed but not used further down; kept as-is.
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(C)
    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

    # data loader
    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = TTSDataset(
        model.decoder.r,
        C.text_cleaner,
        compute_linear_spec=False,
        ap=ap,
        meta_data=meta_data,
        characters=C.characters if "characters" in C.keys() else None,
        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
        use_phonemes=C.use_phonemes,
        phoneme_cache_path=C.phoneme_cache_path,
        phoneme_language=C.phoneme_language,
        enable_eos_bos=C.enable_eos_bos_chars,
    )

    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        drop_last=False,
    )

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data (only the batch fields used below are unpacked)
            text_input = data[0]
            text_lengths = data[1]
            mel_input = data[4]
            mel_lengths = data[5]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            model_outputs = model.forward(text_input, text_lengths, mel_input)

            alignments = model_outputs["alignments"].detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = (
                    torch.nn.functional.interpolate(
                        alignment.transpose(0, 1).unsqueeze(0),
                        size=None,
                        scale_factor=model.decoder.r,
                        mode="nearest",
                        align_corners=None,
                        recompute_scale_factor=None,
                    )
                    .squeeze(0)
                    .transpose(0, 1)
                )
                # remove paddings
                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
                # set file paths
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
                # Swap only the final path component; a plain str.replace would
                # also rewrite directory names that contain the wav file name.
                file_path = os.path.join(os.path.dirname(item_idx), align_file_name)
                # save output
                wav_file_abs_path = os.path.abspath(item_idx)
                file_abs_path = os.path.abspath(file_path)
                file_paths.append([wav_file_abs_path, file_abs_path])
                np.save(file_path, alignment)

    # output metafile
    metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

    with open(metafile, "w", encoding="utf-8") as f:
        for p in file_paths:
            f.write(f"{p[0]}|{p[1]}\n")
    print(f" >> Metafile created: {metafile}")
submodules/TTS/TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+
8
+ from TTS.config import load_config
9
+ from TTS.config.shared_configs import BaseDatasetConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.utils.managers import save_file
12
+ from TTS.tts.utils.speakers import SpeakerManager
13
+
14
+
15
def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter_name=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    """Compute an embedding for every sample of a dataset and save the mapping.

    The dataset is described either by a config file (`config_dataset_path`)
    or by the individual `formatter_name` / `dataset_name` / `dataset_path`
    (and optional meta file) arguments. Each entry of the saved mapping is keyed
    by the sample's `audio_unique_name` and holds its class name and embedding.

    Args:
        model_path: Encoder checkpoint handed to `SpeakerManager`.
        config_path: Encoder config handed to `SpeakerManager`.
        output_path: Output file, or an existing directory in which
            `speakers.pth` is created.
        old_speakers_file: Existing embedding file whose clips are reused
            instead of being recomputed.
        old_append: When True (and `old_speakers_file` is given), start from the
            old file's embeddings and add the new ones to them.
        config_dataset_path: Dataset config file; takes precedence over the
            individual dataset arguments.
        formatter_name: Formatter assigned to the ad-hoc dataset config.
        dataset_name: Dataset name assigned to the ad-hoc dataset config.
        dataset_path: Dataset path assigned to the ad-hoc dataset config.
        meta_file_train: Optional training meta file for the ad-hoc config.
        meta_file_val: Optional validation meta file for the ad-hoc config.
        disable_cuda: Force CPU even when CUDA is available.
        no_eval: Do not request an evaluation split when loading samples.
    """
    cuda_enabled = torch.cuda.is_available() and not disable_cuda
    want_eval_split = not no_eval

    if config_dataset_path is None:
        # No dataset config file: assemble one from the individual arguments.
        dataset_cfg = BaseDatasetConfig()
        dataset_cfg.formatter = formatter_name
        dataset_cfg.dataset_name = dataset_name
        dataset_cfg.path = dataset_path
        if meta_file_train is not None:
            dataset_cfg.meta_file_train = meta_file_train
        if meta_file_val is not None:
            dataset_cfg.meta_file_val = meta_file_val
        train_items, eval_items = load_tts_samples(dataset_cfg, eval_split=want_eval_split)
    else:
        dataset_cfg = load_config(config_dataset_path)
        train_items, eval_items = load_tts_samples(dataset_cfg.datasets, eval_split=want_eval_split)

    samples = train_items if eval_items is None else train_items + eval_items

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=cuda_enabled,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key

    # compute speaker embeddings
    have_old_file = old_speakers_file is not None
    speaker_mapping = encoder_manager.embeddings if have_old_file and old_append else {}

    for sample in tqdm(samples):
        label = sample[class_name_key]
        wav_path = sample["audio_file"]
        clip_id = sample["audio_unique_name"]

        # Only update the speaker name when the embedding is already in the old file.
        if clip_id in speaker_mapping:
            speaker_mapping[clip_id]["name"] = label
            continue

        if have_old_file and clip_id in encoder_manager.clip_ids:
            # get the embedding from the old file
            vector = encoder_manager.get_embedding_by_clip(clip_id)
        else:
            # extract the embedding
            vector = encoder_manager.compute_embedding_from_clip(wav_path)

        # register the new entry in the mapping
        speaker_mapping[clip_id] = {"name": label, "embedding": vector}

    if not speaker_mapping:
        # Nothing was collected, so there is nothing to write.
        return

    # A directory target receives the default file name.
    if os.path.isdir(output_path):
        mapping_file_path = os.path.join(output_path, "speakers.pth")
    else:
        mapping_file_path = output_path

    parent_dir = os.path.dirname(mapping_file_path)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    save_file(speaker_mapping, mapping_file_path)
    print("Speaker embeddings saved at:", mapping_file_path)
100
+
101
+
102
def str2bool(value):
    """Convert a command-line token such as ``"true"``, ``"False"`` or ``"0"`` to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) is parsed as ``True``. This converter
    interprets the text instead.

    Args:
        value: the raw option value (a string, or an already-converted bool).

    Returns:
        The boolean the token spells.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: parse the options and forward them to compute_embeddings().
    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
        """
        Example runs:
        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json

        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--model_path",
        type=str,
        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to model config file. It defaults to the released speaker encoder config.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    )
    parser.add_argument(
        "--config_dataset_path",
        type=str,
        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
        default=None,
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path for output `pth` or `json` file.",
        default="speakers.pth",
    )
    parser.add_argument(
        "--old_file",
        type=str,
        help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
        default=None,
    )
    parser.add_argument(
        "--old_append",
        help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
        default=False,
        action="store_true",
    )
    # `type=bool` would turn "--disable_cuda False" into True; parse the text instead.
    # nargs="?" with const=True additionally accepts the bare flag form "--disable_cuda".
    parser.add_argument(
        "--disable_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        help="Flag to disable cuda.",
        default=False,
    )
    parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
    parser.add_argument(
        "--formatter_name",
        type=str,
        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_train",
        type=str,
        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_val",
        type=str,
        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    args = parser.parse_args()

    compute_embeddings(
        args.model_path,
        args.config_path,
        args.output_path,
        old_speakers_file=args.old_file,
        old_append=args.old_append,
        config_dataset_path=args.config_dataset_path,
        formatter_name=args.formatter_name,
        dataset_name=args.dataset_name,
        dataset_path=args.dataset_path,
        meta_file_train=args.meta_file_train,
        meta_file_val=args.meta_file_val,
        disable_cuda=args.disable_cuda,
        no_eval=args.no_eval,
    )
submodules/TTS/TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ # from TTS.utils.io import load_config
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import load_tts_samples
14
+ from TTS.utils.audio import AudioProcessor
15
+
16
+
17
def main():
    """Compute per-bin mean and standard deviation of mel and linear spectrograms.

    Parses ``config_path``, ``out_path`` and the optional ``--data_path`` from the
    command line, runs every audio file through an ``AudioProcessor`` built from the
    config (with signal normalisation and pre-defined stats disabled), accumulates
    running sums, and saves the resulting statistics together with the audio config
    to ``out_path`` via ``np.save``.

    Raises:
        RuntimeError: if no spectrogram frames were accumulated (e.g. no audio files
            were found), instead of failing later with a bare ``ZeroDivisionError``.
    """
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # items are plain paths when globbed from --data_path, sample dicts otherwise
        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # accumulate frame count and per-bin first/second moments
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    if N == 0:
        raise RuntimeError(" [!] No spectrogram frames were computed; check the dataset or --data_path.")

    mel_mean = mel_sum / N
    # E[x^2] - E[x]^2 can dip marginally below zero through rounding; clamp so sqrt never yields NaN.
    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0.0))
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(np.maximum(linear_square_sum / N - linear_mean**2, 0.0))

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")


if __name__ == "__main__":
    main()
submodules/TTS/TTS/bin/eval_encoder.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from argparse import RawTextHelpFormatter
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+ from TTS.config import load_config
8
+ from TTS.tts.datasets import load_tts_samples
9
+ from TTS.tts.utils.speakers import SpeakerManager
10
+
11
+
12
def compute_encoder_accuracy(dataset_items, encoder_manager):
    """Print the per-class and class-averaged accuracy of an encoder's classifier head.

    For every sample an embedding is computed from its ``audio_file``; the class id
    predicted by ``encoder_manager.encoder_criterion.softmax.inference`` is mapped to a
    class name through the config's ``map_classid_to_classname`` and compared with the
    sample's label (read from the key named by ``encoder_config.class_name_key``).

    Args:
        dataset_items: iterable of sample dicts holding ``"audio_file"`` and the label key.
        encoder_manager: manager exposing ``encoder_config``, ``encoder_criterion``,
            ``use_cuda`` and ``compute_embedding_from_clip``.

    Raises:
        RuntimeError: if a label or a prediction is ``None`` (no criterion or no
            id-to-name map), or if there were no samples to evaluate.
    """
    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

    class_acc_dict = {}

    # compute embeddings for all wav_files
    for item in tqdm(dataset_items):
        class_name = item[class_name_key]
        wav_file = item["audio_file"]

        # extract the embedding
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
            embedding = torch.FloatTensor(embedd).unsqueeze(0)
            if encoder_manager.use_cuda:
                embedding = embedding.cuda()

            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
            # the id-to-name map is indexed by the string form of the class id
            predicted_label = map_classid_to_classname[str(class_id)]
        else:
            predicted_label = None

        if class_name is None or predicted_label is None:
            raise RuntimeError("Error: class_name or/and predicted_label are None")

        class_acc_dict.setdefault(class_name, []).append(int(class_name == predicted_label))

    # Without this guard an empty dataset ends in a ZeroDivisionError on the average below.
    if not class_acc_dict:
        raise RuntimeError("Error: no samples were evaluated, dataset_items is empty")

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    # unweighted mean over classes, not over samples
    print("Average Accuracy:", acc_avg / len(class_acc_dict))
51
+
52
+
53
def str2bool(value):
    """Convert a command-line token such as ``"true"``, ``"False"`` or ``"0"`` to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) is parsed as ``True``. This converter
    interprets the text instead.

    Args:
        value: the raw option value (a string, or an already-converted bool).

    Returns:
        The boolean the token spells.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: load the dataset and the encoder, then report accuracy.
    parser = argparse.ArgumentParser(
        description="""Compute the accuracy of the encoder.\n\n"""
        """
        Example runs:
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    # Both options default to True; with `type=bool` passing "False" still produced True,
    # so they could never be switched off. Parse the text instead.
    parser.add_argument("--use_cuda", type=str2bool, nargs="?", const=True, help="flag to set cuda.", default=True)
    parser.add_argument("--eval", type=str2bool, nargs="?", const=True, help="compute eval.", default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
    items = meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(
        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
    )

    compute_encoder_accuracy(items, enc_manager)
submodules/TTS/TTS/bin/extract_tts_spectrograms.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.audio.numpy_transforms import quantize
19
+ from TTS.utils.generic_utils import count_parameters
20
+
21
+ use_cuda = torch.cuda.is_available()
22
+
23
+
24
def setup_loader(ap, r, verbose=False):
    """Create a sequential (non-shuffled) ``DataLoader`` over the module-level samples.

    Relies on the module globals ``c`` (config), ``meta_data`` (samples) and
    ``speaker_manager``.

    Args:
        ap: audio processor handed to ``TTSDataset``.
        r: forwarded to ``TTSDataset`` as ``outputs_per_step``.
        verbose: forwarded to ``TTSDataset``.

    Returns:
        The ``DataLoader`` wrapping the dataset.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    # Speaker lookup tables are only handed over when the config enables them.
    id_mapping = speaker_manager.name_to_id if c.use_speaker_embedding else None
    embedding_mapping = speaker_manager.embeddings if c.use_d_vector_file else None

    dataset_options = dict(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        speaker_id_mapping=id_mapping,
        d_vector_mapping=embedding_mapping,
    )
    dataset = TTSDataset(**dataset_options)

    wants_phoneme_cache = c.use_phonemes and c.compute_input_seq_cache
    if wants_phoneme_cache:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
61
+
62
+
63
def set_filename(wav_path, out_path):
    """Derive the output paths for one audio file and create the output sub-folders.

    Args:
        wav_path: path of the source audio file; only its base name is used.
        out_path: root output directory. The ``quant``, ``mel``, ``wav_gl`` and ``wav``
            sub-directories are created under it if missing.

    Returns:
        Tuple ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``. ``wavq_path``
        and ``mel_path`` carry no extension; the two wav paths end in ``.wav``.
    """
    # Strip only the final extension. Splitting on the first "." collapsed names such
    # as "spk.001.wav" and "spk.002.wav" to the same stem, so their outputs overwrote
    # each other.
    file_name = os.path.splitext(os.path.basename(wav_path))[0]
    for sub_dir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)
    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
75
+
76
+
77
def format_data(data):
    """Unpack a collated batch dict and move its tensors to the GPU when CUDA is enabled.

    Args:
        data: batch dict with the keys ``token_id``, ``token_id_lengths``, ``mel``,
            ``mel_lengths``, ``item_idxs``, ``d_vectors``, ``speaker_ids`` and ``attns``.

    Returns:
        Tuple ``(text_input, text_lengths, mel_input, mel_lengths, speaker_ids,
        d_vectors, avg_text_length, avg_spec_length, attn_mask, item_idx)``.
    """
    text_input, text_lengths = data["token_id"], data["token_id_lengths"]
    mel_input, mel_lengths = data["mel"], data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors = data["d_vectors"]
    speaker_ids = data["speaker_ids"]
    attn_mask = data["attns"]

    # batch-level averages, computed before any device transfer
    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    # dispatch data to GPU (``use_cuda`` is the module-level flag)
    if use_cuda:
        text_input, text_lengths, mel_input, mel_lengths = (
            tensor.cuda(non_blocking=True) for tensor in (text_input, text_lengths, mel_input, mel_lengths)
        )
        # the optional entries may be None and are then left untouched
        speaker_ids = speaker_ids if speaker_ids is None else speaker_ids.cuda(non_blocking=True)
        d_vectors = d_vectors if d_vectors is None else d_vectors.cuda(non_blocking=True)
        attn_mask = attn_mask if attn_mask is None else attn_mask.cuda(non_blocking=True)

    batch = (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
    return batch
114
+
115
+
116
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run a teacher-forced pass through the model and return its output as a NumPy array.

    Args:
        model_name: one of ``"glow_tts"``, ``"tacotron"`` or ``"tacotron2"``.
        model: the model; ``inference_with_MAS`` is used for Glow-TTS, a plain call for Tacotron.
        ap: audio processor; only used for ``"tacotron"`` to convert the linear output to mel.
        text_input: batch of token ids.
        text_lengths: length of each token sequence.
        mel_input: batch of ground-truth mel spectrograms.
        mel_lengths: length of each mel spectrogram.
        speaker_ids: optional speaker ids.
        d_vectors: optional speaker embeddings.

    Returns:
        ``outputs["model_outputs"]`` moved to the CPU as a NumPy array (converted to mel
        through ``ap.out_linear_to_mel`` for ``"tacotron"``).

    Raises:
        ValueError: if ``model_name`` is not supported. Previously such names fell
            through every branch and failed with ``UnboundLocalError`` on return.
    """
    if model_name not in ("glow_tts", "tacotron", "tacotron2"):
        raise ValueError(
            f"Unsupported model '{model_name}': expected 'glow_tts', 'tacotron' or 'tacotron2'."
        )

    if model_name == "glow_tts":
        # speaker ids take precedence over d-vectors as the conditioning input
        speaker_c = None
        if speaker_ids is not None:
            speaker_c = speaker_ids
        elif d_vectors is not None:
            speaker_c = d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        return outputs["model_outputs"].detach().cpu().numpy()

    aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
    outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
    postnet_outputs = outputs["model_outputs"]

    if model_name == "tacotron2":
        return postnet_outputs.detach().cpu().numpy()

    # "tacotron": the output is converted to mel item by item
    postnet_outputs = postnet_outputs.data.cpu().numpy()
    mel_specs = [
        torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T) for postnet_output in postnet_outputs
    ]
    return torch.stack(mel_specs).cpu().numpy()
160
+
161
+
162
def extract_spectrograms(
    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over every batch and save one mel spectrogram per audio file.

    Args:
        data_loader: loader yielding collated batches accepted by ``format_data``.
        model: the TTS model; it is switched to eval mode first.
        ap: audio processor used to load and save audio and to invert mels in debug mode.
        output_path: root directory for the outputs and for the metadata file.
        quantize_bits: when greater than zero, also save the quantized waveform.
        save_audio: when true, also save the loaded waveform under ``wav``.
        debug: when true, also save audio reconstructed from the predicted mel.
        metada_name: file name of the metadata listing written to ``output_path``.
    """
    model.eval()
    exported = []
    for batch in tqdm(data_loader, total=len(data_loader)):
        # format data (the three averaged/attention entries are not needed here)
        (
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
            *_,
            item_idx,
        ) = format_data(batch)

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        batch_size = text_input.shape[0]
        for idx in range(batch_size):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantize_bits > 0:
                np.save(wavq_path, quantize(wav, quantize_bits))

            # save TTS mel, cut to the item's own length and transposed
            mel = model_output[idx][: mel_lengths[idx], :].T
            np.save(mel_path, mel)

            exported.append([wav_file_path, mel_path])
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                wav = ap.inv_melspectrogram(mel)
                ap.save_wav(wav, wav_gl_path)

    # one "source wav|saved mel" line per item; the ".npy" suffix matches the np.save output
    listing_path = os.path.join(output_path, metada_name)
    with open(listing_path, "w", encoding="utf-8") as listing:
        for wav_file, mel_file in exported:
            listing.write(f"{wav_file}|{mel_file}.npy\n")
222
+
223
+
224
def main(args):  # pylint: disable=redefined-outer-name
    """Load data and model from the module-level config ``c`` and extract spectrograms.

    Sets the module globals ``meta_data`` and ``speaker_manager``, which
    ``setup_loader`` reads afterwards.

    Args:
        args: parsed command-line arguments (``eval``, ``checkpoint_path``,
            ``output_path``, ``quantize_bits``, ``save_audio``, ``debug``).
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances and use both the training and the eval partitions
    train_samples, eval_samples = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    meta_data = train_samples + eval_samples

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model and restore the checkpoint
    model = setup_model(c)
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    print(f"\n > Model has {count_parameters(model)} parameters", flush=True)

    # set r
    if c.model.lower() == "glow_tts":
        r = 1
    else:
        r = model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantize_bits=args.quantize_bits,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
272
+
273
+
274
def str2bool(value):
    """Convert a command-line token such as ``"true"``, ``"False"`` or ``"0"`` to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) is parsed as ``True``. This converter
    interprets the text instead.

    Args:
        value: the raw option value (a string, or an already-converted bool).

    Returns:
        The boolean the token spells.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: parse options, load the config into the global `c`, run main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
    # The default is True; with `type=bool`, "--eval False" still produced True.
    parser.add_argument("--eval", type=str2bool, nargs="?", const=True, help="compute eval.", default=True)
    args = parser.parse_args()

    c = load_config(args.config_path)
    c.audio.trim_silence = False
    main(args)
submodules/TTS/TTS/bin/find_unique_chars.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ from TTS.config import load_config
6
+ from TTS.tts.datasets import load_tts_samples
7
+
8
+
9
def main():
    """Print the unique characters found in the ``text`` field of all dataset samples.

    Reads ``--config_path`` from the command line, loads the train and eval samples of
    the configured datasets, and prints the character count, the characters, the
    lower-case subset and the set obtained by lower-casing everything.
    """
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = parser.parse_args()

    config = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )
    samples = train_items + eval_items

    chars = set("".join([sample["text"] for sample in samples]))
    lower_chars = [ch for ch in chars if ch.islower()]
    chars_force_lower = {ch.lower() for ch in chars}

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")


if __name__ == "__main__":
    main()
submodules/TTS/TTS/bin/find_unique_phonemes.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique phonemes in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from tqdm.contrib.concurrent import process_map
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.text.phonemizers import Gruut
11
+
12
+
13
def compute_phonemes(item):
    """Return the set of symbols in the phonemized text of one sample.

    Uses the module-level ``phonemizer`` that ``main`` creates.

    Args:
        item: sample dict with a ``"text"`` entry.

    Returns:
        Set of the characters in the phonemized text, with ``"|"`` removed.
    """
    phonemized = phonemizer.phonemize(item["text"])
    without_separators = phonemized.replace("|", "")
    return set(without_separators)
17
+
18
+
19
def main():
    """Print the unique phoneme symbols found in the samples of a dataset.

    Reads ``--config_path`` from the command line, loads the train and eval samples,
    checks that a single phoneme language is defined, phonemizes every sample in
    worker processes with Gruut, and prints the resulting symbol sets.

    Raises:
        ValueError: if the config has no ``phoneme_language``, if any sample has no
            language, or if the samples use more than one language.
    """
    # pylint: disable=W0601
    global c, phonemizer
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = parser.parse_args()

    c = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    language_list = [sample["language"] for sample in items]
    every_item_has_language = all(language_list)

    if not (c.phoneme_language and every_item_has_language):
        raise ValueError("Phoneme language must be defined in config.")

    # every sample must share the language of the first one
    if language_list.count(language_list[0]) != len(language_list):
        raise ValueError(
            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
        )

    phonemizer = Gruut(language=language_list[0], keep_puncs=True)

    per_item_sets = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
    phones = set()
    for symbols in per_item_sets:
        phones.update(symbols)

    lower_phones = [ph for ph in phones if ph.islower()]
    phones_force_lower = {ph.lower() for ph in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")


if __name__ == "__main__":
    main()
submodules/TTS/TTS/bin/remove_silence_using_vad.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import multiprocessing
4
+ import os
5
+ import pathlib
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
11
+
12
+ torch.set_num_threads(1)
13
+
14
+
15
def adjust_path_and_remove_silence(audio_path):
    """Map an input audio path into the output directory and strip its silence.

    Reads the module globals ``args`` (parsed CLI options) and ``model_and_utils``.

    Args:
        audio_path: path of the source audio file, located under ``args.input_dir``.

    Returns:
        Tuple ``(output_path, is_speech)``. When the output file already exists and
        ``--force`` is not set, the file is skipped and ``is_speech`` is ``False``.
    """
    input_root = os.path.join(args.input_dir, "")
    output_root = os.path.join(args.output_dir, "")
    # Swap only the leading dataset root. An unbounded replace() also rewrote any
    # nested directory that happened to repeat the root's name (e.g. "wav/spk/wav/a.wav").
    output_path = audio_path.replace(input_root, output_root, 1)

    # ignore if the file exists
    # NOTE(review): skipped files are reported with is_speech=False, so preprocess_audios
    # lists them in filtered_files.txt alongside files without speech - confirm this is intended.
    if os.path.exists(output_path) and not args.force:
        return output_path, False

    # create all directory structure
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # remove the silence and save the audio
    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    return output_path, is_speech
32
+
33
+
34
def preprocess_audios():
    """Remove silence from every file matching the glob and record the files without speech.

    Reads the module global ``args``. Files are handled by a process pool when
    ``args.num_processes`` is greater than one, otherwise sequentially. The output
    paths reported with ``is_speech`` false are written, one per line, to
    ``filtered_files.txt`` in ``args.output_dir``.
    """
    audio_paths = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(audio_paths))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    print(
        "> Trimming just the beginning and the end with nonspeech parts."
        if args.trim_just_beginning_and_end
        else "> Trimming all nonspeech parts."
    )

    if not audio_paths:
        print("> No files Found !")
        return

    if args.num_processes > 1:
        with multiprocessing.Pool(processes=args.num_processes) as pool:
            progress = tqdm(
                pool.imap_unordered(adjust_path_and_remove_silence, audio_paths),
                total=len(audio_paths),
                desc="Processing audio files",
            )
            outcomes = list(progress)
    else:
        outcomes = [adjust_path_and_remove_silence(path) for path in tqdm(audio_paths)]

    without_speech = [output_path for output_path, is_speech in outcomes if not is_speech]

    # write files that do not have speech
    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as report:
        report.writelines(f"{path}\n" for path in without_speech)
75
+
76
+
77
def str2bool(value):
    """Convert a command-line token such as ``"true"``, ``"False"`` or ``"0"`` to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) is parsed as ``True``. This converter
    interprets the text instead.

    Args:
        value: the raw option value (a string, or an already-converted bool).

    Returns:
        The boolean the token spells.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: parse options into the global `args`, load the VAD model
    # into the global `model_and_utils`, then process the files.
    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    # The three boolean options below used `type=bool`, under which "False" parsed as True,
    # so "--trim_just_beginning_and_end False" could never select full trimming.
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=str2bool,
        nargs="?",
        const=True,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="If True use cuda",
    )
    parser.add_argument(
        "--use_onnx",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="If True use onnx",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    args = parser.parse_args()

    # an empty output dir means the files are processed in place
    if args.output_dir == "":
        args.output_dir = args.input_dir

    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
submodules/TTS/TTS/bin/resample.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from multiprocessing import Pool
6
+ from shutil import copytree
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
def resample_file(func_args):
    """Resample a single audio file and overwrite it in place.

    Args:
        func_args: ``(filename, output_sr)`` pair, packed into one argument so the
            function can be mapped over a process pool.
    """
    audio_path, target_sr = func_args
    samples, loaded_sr = librosa.load(audio_path, sr=target_sr)
    sf.write(audio_path, samples, loaded_sr)
17
+
18
+
19
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    """Resample every matching audio file under a folder using a process pool.

    Args:
        input_dir: folder searched recursively for audio files.
        output_sr: sample rate handed to ``resample_file`` for every file.
        output_dir: when given, ``input_dir`` is first copied there and the copy is
            resampled; otherwise the files are resampled in place.
        file_ext: extension of the files to resample.
        n_jobs: number of worker processes in the pool.
    """
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        input_dir = output_dir

    print("Resampling the audio files...")
    pattern = os.path.join(input_dir, f"**/*.{file_ext}")
    matches = glob.glob(pattern, recursive=True)
    print(f"Found {len(matches)} files...")

    # each job carries its own target rate because the worker takes a single argument
    jobs = [(path, output_sr) for path in matches]
    with Pool(processes=n_jobs) as pool, tqdm(total=len(jobs)) as progress:
        for _ in pool.imap_unordered(resample_file, jobs):
            progress.update()

    print("Done !")
35
+
36
+
37
if __name__ == "__main__":
    # Command-line entry point: declare the options from a table, then run resample_files().
    parser = argparse.ArgumentParser(
        description="""Resample a folder recusively with librosa
                       Can be used in place or create a copy of the folder as an output.\n\n
                       Example run:
                            python TTS/bin/resample.py
                                --input_dir /root/LJSpeech-1.1/
                                --output_sr 22050
                                --output_dir /root/resampled_LJSpeech-1.1/
                                --file_ext wav
                                --n_jobs 24
                    """,
        formatter_class=RawTextHelpFormatter,
    )

    option_table = (
        (
            "--input_dir",
            dict(
                type=str,
                default=None,
                required=True,
                help="Path of the folder containing the audio files to resample",
            ),
        ),
        (
            "--output_sr",
            dict(
                type=int,
                default=22050,
                required=False,
                help="Samlple rate to which the audio files should be resampled",
            ),
        ),
        (
            "--output_dir",
            dict(
                type=str,
                default=None,
                required=False,
                help="Path of the destination folder. If not defined, the operation is done in place",
            ),
        ),
        (
            "--file_ext",
            dict(
                type=str,
                default="wav",
                required=False,
                help="Extension of the audio files to resample",
            ),
        ),
        (
            "--n_jobs",
            dict(type=int, default=None, help="Number of threads to use, by default it uses all cores"),
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
submodules/TTS/TTS/bin/synthesize.py ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import contextlib
6
+ import sys
7
+ from argparse import RawTextHelpFormatter
8
+
9
+ # pylint: disable=redefined-outer-name, unused-argument
10
+ from pathlib import Path
11
+
12
+ description = """
13
+ Synthesize speech on command line.
14
+
15
+ You can either use your trained model or choose a model from the provided list.
16
+
17
+ If you don't specify any models, then it uses LJSpeech based English model.
18
+
19
+ #### Single Speaker Models
20
+
21
+ - List provided models:
22
+
23
+ ```
24
+ $ tts --list_models
25
+ ```
26
+
27
+ - Get model info (for both tts_models and vocoder_models):
28
+
29
+ - Query by type/name:
30
+ The model_info_by_name uses the name as it from the --list_models.
31
+ ```
32
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
33
+ ```
34
+ For example:
35
+ ```
36
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
37
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
38
+ ```
39
+ - Query by type/idx:
40
+ The model_query_idx uses the corresponding idx from --list_models.
41
+
42
+ ```
43
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
44
+ ```
45
+
46
+ For example:
47
+
48
+ ```
49
+ $ tts --model_info_by_idx tts_models/3
50
+ ```
51
+
52
+ - Query info for model info by full name:
53
+ ```
54
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
55
+ ```
56
+
57
+ - Run TTS with default models:
58
+
59
+ ```
60
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
61
+ ```
62
+
63
+ - Run TTS and pipe out the generated TTS wav file data:
64
+
65
+ ```
66
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
67
+ ```
68
+
69
+ - Run a TTS model with its default vocoder model:
70
+
71
+ ```
72
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
73
+ ```
74
+
75
+ For example:
76
+
77
+ ```
78
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
79
+ ```
80
+
81
+ - Run with specific TTS and vocoder models from the list:
82
+
83
+ ```
84
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
85
+ ```
86
+
87
+ For example:
88
+
89
+ ```
90
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
91
+ ```
92
+
93
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
94
+
95
+ ```
96
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
97
+ ```
98
+
99
+ - Run your own TTS and Vocoder models:
100
+
101
+ ```
102
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
103
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
104
+ ```
105
+
106
+ #### Multi-speaker Models
107
+
108
+ - List the available speakers and choose a <speaker_id> among them:
109
+
110
+ ```
111
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
112
+ ```
113
+
114
+ - Run the multi-speaker TTS model with the target speaker ID:
115
+
116
+ ```
117
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
118
+ ```
119
+
120
+ - Run your own multi-speaker TTS model:
121
+
122
+ ```
123
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
124
+ ```
125
+
126
+ ### Voice Conversion Models
127
+
128
+ ```
129
+ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
130
+ ```
131
+ """
132
+
133
+
134
def str2bool(v):
    """Convert a command-line token into a boolean (usable as an argparse ``type``).

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string.

    Returns:
        ``True`` for "yes"/"true"/"t"/"y"/"1" and ``False`` for
        "no"/"false"/"f"/"n"/"0"; the comparison ignores case.

    Raises:
        argparse.ArgumentTypeError: If the string is none of the accepted spellings.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    accepted = (
        (True, ("yes", "true", "t", "y", "1")),
        (False, ("no", "false", "f", "n", "0")),
    )
    for outcome, spellings in accepted:
        if lowered in spellings:
            return outcome

    raise argparse.ArgumentTypeError("Boolean value expected.")
142
+
143
+
144
+ def main():
145
+ parser = argparse.ArgumentParser(
146
+ description=description.replace(" ```\n", ""),
147
+ formatter_class=RawTextHelpFormatter,
148
+ )
149
+
150
+ parser.add_argument(
151
+ "--list_models",
152
+ type=str2bool,
153
+ nargs="?",
154
+ const=True,
155
+ default=False,
156
+ help="list available pre-trained TTS and vocoder models.",
157
+ )
158
+
159
+ parser.add_argument(
160
+ "--model_info_by_idx",
161
+ type=str,
162
+ default=None,
163
+ help="model info using query format: <model_type>/<model_query_idx>",
164
+ )
165
+
166
+ parser.add_argument(
167
+ "--model_info_by_name",
168
+ type=str,
169
+ default=None,
170
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
171
+ )
172
+
173
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
174
+
175
+ # Args for running pre-trained TTS models.
176
+ parser.add_argument(
177
+ "--model_name",
178
+ type=str,
179
+ default="tts_models/en/ljspeech/tacotron2-DDC",
180
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
181
+ )
182
+ parser.add_argument(
183
+ "--vocoder_name",
184
+ type=str,
185
+ default=None,
186
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
187
+ )
188
+
189
+ # Args for running custom models
190
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
191
+ parser.add_argument(
192
+ "--model_path",
193
+ type=str,
194
+ default=None,
195
+ help="Path to model file.",
196
+ )
197
+ parser.add_argument(
198
+ "--out_path",
199
+ type=str,
200
+ default="tts_output.wav",
201
+ help="Output wav file path.",
202
+ )
203
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
204
+ parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
205
+ parser.add_argument(
206
+ "--vocoder_path",
207
+ type=str,
208
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
209
+ default=None,
210
+ )
211
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
212
+ parser.add_argument(
213
+ "--encoder_path",
214
+ type=str,
215
+ help="Path to speaker encoder model file.",
216
+ default=None,
217
+ )
218
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
219
+ parser.add_argument(
220
+ "--pipe_out",
221
+ help="stdout the generated TTS wav file for shell pipe.",
222
+ type=str2bool,
223
+ nargs="?",
224
+ const=True,
225
+ default=False,
226
+ )
227
+
228
+ # args for multi-speaker synthesis
229
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
230
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
231
+ parser.add_argument(
232
+ "--speaker_idx",
233
+ type=str,
234
+ help="Target speaker ID for a multi-speaker TTS model.",
235
+ default=None,
236
+ )
237
+ parser.add_argument(
238
+ "--language_idx",
239
+ type=str,
240
+ help="Target language ID for a multi-lingual TTS model.",
241
+ default=None,
242
+ )
243
+ parser.add_argument(
244
+ "--speaker_wav",
245
+ nargs="+",
246
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
247
+ default=None,
248
+ )
249
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
250
+ parser.add_argument(
251
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
252
+ )
253
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
254
+ parser.add_argument(
255
+ "--list_speaker_idxs",
256
+ help="List available speaker ids for the defined multi-speaker model.",
257
+ type=str2bool,
258
+ nargs="?",
259
+ const=True,
260
+ default=False,
261
+ )
262
+ parser.add_argument(
263
+ "--list_language_idxs",
264
+ help="List available language ids for the defined multi-lingual model.",
265
+ type=str2bool,
266
+ nargs="?",
267
+ const=True,
268
+ default=False,
269
+ )
270
+ # aux args
271
+ parser.add_argument(
272
+ "--save_spectogram",
273
+ type=bool,
274
+ help="If true save raw spectogram for further (vocoder) processing in out_path.",
275
+ default=False,
276
+ )
277
+ parser.add_argument(
278
+ "--reference_wav",
279
+ type=str,
280
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
281
+ default=None,
282
+ )
283
+ parser.add_argument(
284
+ "--reference_speaker_idx",
285
+ type=str,
286
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
287
+ default=None,
288
+ )
289
+ parser.add_argument(
290
+ "--progress_bar",
291
+ type=str2bool,
292
+ help="If true shows a progress bar for the model download. Defaults to True",
293
+ default=True,
294
+ )
295
+
296
+ # voice conversion args
297
+ parser.add_argument(
298
+ "--source_wav",
299
+ type=str,
300
+ default=None,
301
+ help="Original audio file to convert in the voice of the target_wav",
302
+ )
303
+ parser.add_argument(
304
+ "--target_wav",
305
+ type=str,
306
+ default=None,
307
+ help="Target audio file to convert in the voice of the source_wav",
308
+ )
309
+
310
+ parser.add_argument(
311
+ "--voice_dir",
312
+ type=str,
313
+ default=None,
314
+ help="Voice dir for tortoise model",
315
+ )
316
+
317
+ args = parser.parse_args()
318
+
319
+ # print the description if either text or list_models is not set
320
+ check_args = [
321
+ args.text,
322
+ args.list_models,
323
+ args.list_speaker_idxs,
324
+ args.list_language_idxs,
325
+ args.reference_wav,
326
+ args.model_info_by_idx,
327
+ args.model_info_by_name,
328
+ args.source_wav,
329
+ args.target_wav,
330
+ ]
331
+ if not any(check_args):
332
+ parser.parse_args(["-h"])
333
+
334
+ pipe_out = sys.stdout if args.pipe_out else None
335
+
336
+ with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
337
+ # Late-import to make things load faster
338
+ from TTS.api import TTS
339
+ from TTS.utils.manage import ModelManager
340
+ from TTS.utils.synthesizer import Synthesizer
341
+
342
+ # load model manager
343
+ path = Path(__file__).parent / "../.models.json"
344
+ manager = ModelManager(path, progress_bar=args.progress_bar)
345
+ api = TTS()
346
+
347
+ tts_path = None
348
+ tts_config_path = None
349
+ speakers_file_path = None
350
+ language_ids_file_path = None
351
+ vocoder_path = None
352
+ vocoder_config_path = None
353
+ encoder_path = None
354
+ encoder_config_path = None
355
+ vc_path = None
356
+ vc_config_path = None
357
+ model_dir = None
358
+
359
+ # CASE1 #list : list pre-trained TTS models
360
+ if args.list_models:
361
+ manager.list_models()
362
+ sys.exit()
363
+
364
+ # CASE2 #info : model info for pre-trained TTS models
365
+ if args.model_info_by_idx:
366
+ model_query = args.model_info_by_idx
367
+ manager.model_info_by_idx(model_query)
368
+ sys.exit()
369
+
370
+ if args.model_info_by_name:
371
+ model_query_full_name = args.model_info_by_name
372
+ manager.model_info_by_full_name(model_query_full_name)
373
+ sys.exit()
374
+
375
+ # CASE3: load pre-trained model paths
376
+ if args.model_name is not None and not args.model_path:
377
+ model_path, config_path, model_item = manager.download_model(args.model_name)
378
+ # tts model
379
+ if model_item["model_type"] == "tts_models":
380
+ tts_path = model_path
381
+ tts_config_path = config_path
382
+ if "default_vocoder" in model_item:
383
+ args.vocoder_name = (
384
+ model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
385
+ )
386
+
387
+ # voice conversion model
388
+ if model_item["model_type"] == "voice_conversion_models":
389
+ vc_path = model_path
390
+ vc_config_path = config_path
391
+
392
+ # tts model with multiple files to be loaded from the directory path
393
+ if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
394
+ model_dir = model_path
395
+ tts_path = None
396
+ tts_config_path = None
397
+ args.vocoder_name = None
398
+
399
+ # load vocoder
400
+ if args.vocoder_name is not None and not args.vocoder_path:
401
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
402
+
403
+ # CASE4: set custom model paths
404
+ if args.model_path is not None:
405
+ tts_path = args.model_path
406
+ tts_config_path = args.config_path
407
+ speakers_file_path = args.speakers_file_path
408
+ language_ids_file_path = args.language_ids_file_path
409
+
410
+ if args.vocoder_path is not None:
411
+ vocoder_path = args.vocoder_path
412
+ vocoder_config_path = args.vocoder_config_path
413
+
414
+ if args.encoder_path is not None:
415
+ encoder_path = args.encoder_path
416
+ encoder_config_path = args.encoder_config_path
417
+
418
+ device = args.device
419
+ if args.use_cuda:
420
+ device = "cuda"
421
+
422
+ # load models
423
+ synthesizer = Synthesizer(
424
+ tts_path,
425
+ tts_config_path,
426
+ speakers_file_path,
427
+ language_ids_file_path,
428
+ vocoder_path,
429
+ vocoder_config_path,
430
+ encoder_path,
431
+ encoder_config_path,
432
+ vc_path,
433
+ vc_config_path,
434
+ model_dir,
435
+ args.voice_dir,
436
+ ).to(device)
437
+
438
+ # query speaker ids of a multi-speaker model.
439
+ if args.list_speaker_idxs:
440
+ print(
441
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
442
+ )
443
+ print(synthesizer.tts_model.speaker_manager.name_to_id)
444
+ return
445
+
446
+ # query langauge ids of a multi-lingual model.
447
+ if args.list_language_idxs:
448
+ print(
449
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
450
+ )
451
+ print(synthesizer.tts_model.language_manager.name_to_id)
452
+ return
453
+
454
+ # check the arguments against a multi-speaker model.
455
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
456
+ print(
457
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
458
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
459
+ )
460
+ return
461
+
462
+ # RUN THE SYNTHESIS
463
+ if args.text:
464
+ print(" > Text: {}".format(args.text))
465
+
466
+ # kick it
467
+ if tts_path is not None:
468
+ wav = synthesizer.tts(
469
+ args.text,
470
+ speaker_name=args.speaker_idx,
471
+ language_name=args.language_idx,
472
+ speaker_wav=args.speaker_wav,
473
+ reference_wav=args.reference_wav,
474
+ style_wav=args.capacitron_style_wav,
475
+ style_text=args.capacitron_style_text,
476
+ reference_speaker_name=args.reference_speaker_idx,
477
+ )
478
+ elif vc_path is not None:
479
+ wav = synthesizer.voice_conversion(
480
+ source_wav=args.source_wav,
481
+ target_wav=args.target_wav,
482
+ )
483
+ elif model_dir is not None:
484
+ wav = synthesizer.tts(
485
+ args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
486
+ )
487
+
488
+ # save the results
489
+ print(" > Saving output to {}".format(args.out_path))
490
+ synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
491
+
492
+
493
+ if __name__ == "__main__":
494
+ main()
submodules/TTS/TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.io import copy_model_files, save_best_model, save_checkpoint
12
+ from trainer.torch import NoamLR
13
+ from trainer.trainer_utils import get_optimizer
14
+
15
+ from TTS.encoder.dataset import EncoderDataset
16
+ from TTS.encoder.utils.generic_utils import setup_encoder_model
17
+ from TTS.encoder.utils.training import init_training
18
+ from TTS.encoder.utils.visual import plot_embeddings
19
+ from TTS.tts.datasets import load_tts_samples
20
+ from TTS.utils.audio import AudioProcessor
21
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
22
+ from TTS.utils.samplers import PerfectBatchSampler
23
+ from TTS.utils.training import check_update
24
+
25
# Import-time setup: these statements run (and print) as soon as the module is loaded.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
# Fixed seed for torch's random number generator.
torch.manual_seed(54321)
# Read by the functions below to decide whether tensors/models are moved to the GPU.
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    """Build the data loader for either the training or the evaluation split.

    Relies on the module globals ``c`` (config), ``meta_data_train`` /
    ``meta_data_eval`` and, for the evaluation split, ``train_classes``.

    Args:
        ap: Audio processor handed to the dataset.
        is_val: Select the evaluation split and its ``eval_*`` batch settings.
        verbose: Forwarded to the dataset.

    Returns:
        Tuple ``(loader, classes, class-id-to-class-name map)``.

    Raises:
        RuntimeError: If the split has fewer classes than one batch requires.
    """
    # Everything that differs between the two splits is picked in one place.
    if is_val:
        utters_per_class = c.eval_num_utter_per_class
        classes_per_batch = c.eval_num_classes_in_batch
        samples = meta_data_eval
        augmentation = None  # no augmentation during evaluation
    else:
        utters_per_class = c.num_utter_per_class
        classes_per_batch = c.num_classes_in_batch
        samples = meta_data_train
        augmentation = c.audio_augmentation

    dataset = EncoderDataset(
        c,
        ap,
        samples,
        voice_len=c.voice_len,
        num_utter_per_class=utters_per_class,
        num_classes_in_batch=classes_per_batch,
        verbose=verbose,
        augmentation_config=augmentation,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    # get classes list
    classes = dataset.get_class_list()

    total_batch_size = classes_per_batch * utters_per_class
    sampler = PerfectBatchSampler(
        dataset.items,
        classes,
        batch_size=total_batch_size,
        num_classes_in_batch=classes_per_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    if len(classes) < classes_per_batch:
        option, split = ("eval_num_classes_in_batch", "Eval") if is_val else ("num_classes_in_batch", "Train")
        raise RuntimeError(
            f"config.{option} ({classes_per_batch}) need to be <= {len(classes)} (Number total of Classes in the {split} dataset) !"
        )

    # use the training class list for evaluation too, so class ids stay aligned
    # when the two splits do not contain the same number of classes
    if is_val:
        dataset.set_classes(train_classes)

    loader = DataLoader(
        dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=sampler,
        collate_fn=dataset.collate_fn,
    )
    return loader, classes, dataset.get_map_classid_to_classname()
83
+
84
+
85
def evaluation(model, criterion, data_loader, global_step):
    """Compute the average loss over the evaluation loader and log it.

    Uses the module globals ``c`` (config), ``use_cuda`` and ``dashboard_logger``.
    The caller is responsible for switching the model to eval mode.

    Args:
        model: Encoder model; called as ``model(inputs)``.
        criterion: Loss; called with the outputs regrouped per class and the labels.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.
        global_step: Step index under which stats and figures are logged.

    Returns:
        The loss averaged over all batches of ``data_loader``.

    Raises:
        RuntimeError: If ``data_loader`` yields no batch (previously this surfaced
            as a ZeroDivisionError).
    """
    eval_loss = 0
    outputs = None
    with torch.no_grad():
        for inputs, labels in data_loader:
            # regroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            labels = torch.transpose(
                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
            ).reshape(labels.shape)
            inputs = torch.transpose(
                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
            ).reshape(inputs.shape)

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
            )

            eval_loss += loss.item()

    if outputs is None:
        raise RuntimeError(" [!] The evaluation data loader did not produce any batch.")

    eval_avg_loss = eval_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch in the evaluation; the batch was built with the eval
    # class count, so that is the count the plot has to be given
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.eval_num_classes_in_batch),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss
124
+
125
+
126
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    """Run the full training loop for ``c.epochs`` epochs.

    Uses the module globals ``c`` (config), ``use_cuda``, ``dashboard_logger``
    and ``OUT_PATH``.

    Args:
        model: Encoder model to optimize.
        optimizer: Optimizer stepping the model parameters.
        scheduler: LR scheduler; only stepped when ``c.lr_decay`` is set.
        criterion: Loss module; its ``state_dict()`` is stored in checkpoints.
        data_loader: Training loader yielding ``(inputs, labels)`` batches.
        eval_data_loader: Loader passed to ``evaluation`` when ``c.run_eval`` is set.
        global_step: Step counter to continue from.

    Returns:
        Tuple ``(best_loss, global_step)``. ``best_loss`` is only updated when
        ``c.run_eval`` is set.
    """
    model.train()
    best_loss = {"train_loss": None, "eval_loss": float("inf")}
    avg_loader_time = 0
    end_time = time.time()
    for epoch in range(c.epochs):
        tot_loss = 0
        epoch_time = 0
        for _, data in enumerate(data_loader):
            start_time = time.time()

            # setup input data
            inputs, labels = data
            # regroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
                labels.shape
            )
            inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
                inputs.shape
            )
            # ToDo: move it to a unit test
            # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
            # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
            # idx = 0
            # for j in range(0, c.num_classes_in_batch, 1):
            #     for i in range(j, len(labels), c.num_classes_in_batch):
            #         if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
            #             print("Invalid")
            #             print(labels)
            #             exit()
            #         idx += 1
            # labels = labels_converted
            # inputs = inputs_converted

            # time since the previous step finished, i.e. spent waiting for this batch
            loader_time = time.time() - end_time
            global_step += 1

            # setup lr (note: the scheduler is stepped before optimizer.step())
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
            )
            loss.backward()
            # presumably clips gradients to c.grad_clip and returns their norm -- verify in check_update
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            # accumulate the total epoch loss
            tot_loss += loss.item()

            # Averaged Loader Time: running average in which each new measurement
            # gets weight 1 / num_loader_workers (the first measurement is taken as is)
            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
            avg_loader_time = (
                1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
                if avg_loader_time != 0
                else loader_time
            )
            current_lr = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # Plot Training Epoch Stats
                train_stats = {
                    "loss": loss.item(),
                    "lr": current_lr,
                    "grad_norm": grad_norm,
                    "step_time": step_time,
                    "avg_loader_time": avg_loader_time,
                }
                dashboard_logger.train_epoch_stats(global_step, train_stats)
                figures = {
                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
                }
                dashboard_logger.train_figures(global_step, figures)

            if global_step % c.print_step == 0:
                print(
                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # save model
                # NOTE(review): the meaning of the positional `None` depends on
                # save_checkpoint's signature -- confirm against trainer.io.
                save_checkpoint(
                    c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
                )

            end_time = time.time()

        # end-of-epoch summary; grad_norm is the value from the last step of the epoch
        print("")
        print(
            ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
            ),
            flush=True,
        )
        # evaluation
        if c.run_eval:
            model.eval()
            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
            print("\n\n")
            print("--> EVAL PERFORMANCE")
            print(
                " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
                flush=True,
            )
            # save the best checkpoint
            best_loss = save_best_model(
                {"train_loss": None, "eval_loss": eval_loss},
                best_loss,
                c,
                model,
                optimizer,
                None,
                global_step,
                epoch,
                OUT_PATH,
                criterion=criterion.state_dict(),
            )
            # back to training mode for the next epoch
            model.train()

    return best_loss, global_step
265
+
266
+
267
def main(args):  # pylint: disable=redefined-outer-name
    """Set up model, data, loss and optimizer for the encoder, then call ``train``.

    Reads the module globals ``c``, ``OUT_PATH`` and ``use_cuda``, and publishes
    ``meta_data_train``, ``meta_data_eval`` and ``train_classes`` as globals
    because ``setup_loader`` reads them.

    Args:
        args: Parsed training arguments; ``restore_path`` is read and
            ``restore_step`` is written.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    # The training loader must be built first: the evaluation loader reuses `train_classes`.
    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
    else:
        eval_data_loader = None

    num_classes = len(train_classes)
    criterion = model.get_criterion(c, num_classes)

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        # store the class-id mapping in the config before the config is copied out
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH, new_fields={})

    if args.restore_path:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)
    else:
        args.restore_step = 0

    if c.lr_decay:
        # `last_epoch` is derived from the restored step so the schedule resumes from there
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    global_step = args.restore_step
    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
316
+
317
+
318
if __name__ == "__main__":
    # These names become the module globals that the functions above read.
    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

    try:
        main(args)
    except KeyboardInterrupt:
        # Interrupted by the user: clean up the experiment folder and exit with status 0.
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            # force the process to terminate immediately
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        # Any other failure: clean up, show the traceback and exit with status 1.
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
submodules/TTS/TTS/bin/train_tts.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.models import setup_model
9
+
10
+
11
@dataclass
class TrainTTSArgs(TrainerArgs):
    # Trainer arguments extended with the path of the config file to train from.
    # Defaults to None; main() then falls back to `continue_path` or to console arguments.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
+
15
+
16
def main():
    """Train a `tts` model described by a `config.json` file or by console arguments.

    The config comes from ``--config_path`` when given, otherwise from the
    ``config.json`` inside ``--continue_path``, otherwise it is built from the
    remaining command-line arguments.
    """
    # trainer arguments are filled in from the command line; everything the
    # parser does not know is kept as config overrides
    train_args = TrainTTSArgs()
    arg_parser = train_args.init_argparse(arg_prefix="")
    args, config_overrides = arg_parser.parse_known_args()
    train_args.parse_args(args)

    # decide which config file (if any) to start from
    if args.config_path:
        config_file = args.config_path
    elif args.continue_path:
        # continue from a prev experiment
        config_file = os.path.join(args.continue_path, "config.json")
    else:
        config_file = None

    if config_file:
        config = load_config(config_file)
        if config_overrides:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and start training
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
submodules/TTS/TTS/bin/train_vocoder.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.utils.audio import AudioProcessor
8
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
+ from TTS.vocoder.models import setup_model
10
+
11
+
12
@dataclass
class TrainVocoderArgs(TrainerArgs):
    """`TrainerArgs` extended with the path of the config file used for vocoder training."""

    # Read back as `args.config_path` after the argparse parser built from this dataclass has run.
    config_path: str = field(metadata={"help": "Path to the config file."}, default=None)
15
+
16
+
17
def main():
    """Train a vocoder model from a `config.json` file, a previous run, or console arguments."""
    # Build the trainer arguments and the argparse parser derived from them.
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # Recognised command-line flags update the trainer args; the remainder is kept as config overrides.
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # An explicit config file wins over the config stored in a previous experiment folder.
    if args.config_path:
        config_file = args.config_path
    elif args.continue_path:
        config_file = os.path.join(args.continue_path, "config.json")
    else:
        config_file = None

    if config_file is not None:
        config = load_config(config_file)
        if config_overrides:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # No file available: the model name has to come from the console arguments.
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # Both loaders hand back the evaluation samples first and the training samples second.
    if "feature_path" in config and config.feature_path:
        # pre-computed features are available next to the wav files
        print(f" > Loading features from: {config.feature_path}")
        split = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # only raw wav files
        split = load_wav_data(config.data_path, config.eval_split_size)
    eval_samples, train_samples = split

    # The audio processor is passed to the trainer as a training asset.
    ap = AudioProcessor(**config.audio)

    model = setup_model(config)

    # Hand everything to the trainer and start fitting 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
submodules/TTS/TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+
10
+ from TTS.config import load_config
11
+ from TTS.utils.audio import AudioProcessor
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data
13
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
+ from TTS.vocoder.models import setup_model
15
+
16
if __name__ == "__main__":
    # Command-line interface of the noise-schedule search.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
    parser.add_argument("--config_path", type=str, help="Path to model config file.")
    parser.add_argument("--data_path", type=str, help="Path to data directory.")
    parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
    parser.add_argument(
        "--num_iter",
        type=int,
        help="Number of model inference iterations that you like to optimize noise schedule for.",
    )
    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
    parser.add_argument(
        "--search_depth",
        type=int,
        default=3,
        help="Search granularity. Increasing this increases the run-time exponentially.",
    )

    # load config
    args = parser.parse_args()
    config = load_config(args.config_path)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # load dataset: an eval split size of 0 is requested and only the first `num_samples` items are kept
    _, train_data = load_wav_data(args.data_path, 0)
    train_data = train_data[: args.num_samples]
    dataset = WaveGradDataset(
        ap=ap,
        items=train_data,
        seq_len=-1,
        hop_len=ap.hop_length,
        pad_short=config.pad_short,
        conv_pad=config.conv_pad,
        is_training=True,
        return_segments=False,
        use_noise_augment=False,
        use_cache=False,
        verbose=True,
    )
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collate_full_clips,
        drop_last=False,
        num_workers=config.num_loader_workers,
        pin_memory=False,
    )

    # setup the model
    model = setup_model(config)
    if args.model_path:
        # `--model_path` used to be parsed and then ignored, so every schedule was scored on a model
        # that had never been given the checkpoint weights.
        # NOTE(review): assumes the vocoder model exposes `load_checkpoint(config, path, eval=...)` - confirm.
        model.load_checkpoint(config, args.model_path, eval=True)
    if args.use_cuda:
        model.cuda()

    # setup optimization parameters
    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
    print(f" > base values: {base_values}")
    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
    best_error = float("inf")
    best_schedule = None  # pylint: disable=C0103
    # every combination of `num_iter` base values is tried, hence the exponential cost
    total_search_iter = len(base_values) ** args.num_iter
    for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
        beta = exponents * base
        model.compute_noise_level(beta)
        for data in loader:
            mel, _ = data
            y_hat = model.inference(mel.cuda() if args.use_cuda else mel)

            if args.use_cuda:
                y_hat = y_hat.cpu()
            y_hat = y_hat.numpy()

            # re-compute the spectrogram of each generated clip; the last frame is dropped
            mel_hat = torch.stack(
                [torch.from_numpy(ap.melspectrogram(y_hat[i, 0])[:, :-1]) for i in range(y_hat.shape[0])]
            )
            error = torch.sum((mel - mel_hat) ** 2).mean().item()
            if error < best_error:
                best_error = error
                best_schedule = {"beta": beta}
                print(f" > Found a better schedule. - MSE: {error}")
                # the best schedule so far is written out immediately
                np.save(args.output_path, best_schedule)
submodules/TTS/TTS/config/__init__.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
def read_json_with_comments(json_path):
    """Load a JSON file that may contain `//` and `/* ... */` comments (kept for backward compat.).

    Args:
        json_path (str): Path of the file; it is opened through ``fsspec.open``.

    Returns:
        The object produced by ``json.loads`` once the comments have been removed.
    """
    with fsspec.open(json_path, "r", encoding="utf-8") as f:
        input_str = f.read()
    # The alternation tries string literals first and puts them back untouched, so a `//` inside a
    # value (e.g. a URL) is never taken for a comment. Block comments may span several lines, hence
    # `[\s\S]`; both comment forms are replaced by nothing.
    input_str = re.sub(
        r"(\"(?:[^\"\\]|\\.)*\")|(/\*[\s\S]*?\*/)|(//.*)",
        lambda m: m.group(1) or "",
        input_str,
    )
    return json.loads(input_str)
22
+
23
def register_config(model_name: str) -> Coqpit:
    """Look up the config class that belongs to a model name.

    Args:
        model_name (str): Model name; the module searched for is ``<model_name>_config``.

    Raises:
        ModuleNotFoundError: None of the config packages provides a matching config.

    Returns:
        Coqpit: config class.
    """
    target_module = model_name + "_config"
    found = None

    # TODO: fix this
    if model_name == "xtts":
        from TTS.tts.configs.xtts_config import XttsConfig

        found = XttsConfig

    # Every package is probed in turn; a later hit replaces an earlier one.
    for package in ("TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"):
        try:
            found = find_module(package, target_module)
        except ModuleNotFoundError:
            continue

    if found is not None:
        return found
    raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
52
+
53
+
54
def _process_model_name(config_dict: Dict) -> str:
    """Return the model name stored in a config dict, stripped of the old `vocoder` suffixes.

    Args:
        config_dict (Dict): A dictionary including the config fields. `model` is used when present,
            otherwise `generator_model`.

    Returns:
        str: The name without any `_generator` / `_discriminator` part.
    """
    if "model" in config_dict:
        name = config_dict["model"]
    else:
        name = config_dict["generator_model"]
    for suffix in ("_generator", "_discriminator"):
        name = name.replace(suffix, "")
    return name
66
+
67
+
68
def load_config(config_path: str) -> Coqpit:
    """Build a TTS config object from a `json` or `yaml` file.

    The file is read into a `dict`, the model name stored in it selects the Config class, and an
    instance of that class is filled from the `dict`.

    Args:
        config_path (str): path to the config file.

    Raises:
        TypeError: given config file has an unknown type.

    Returns:
        Coqpit: TTS config object.
    """
    ext = os.path.splitext(config_path)[1]
    if ext == ".json":
        try:
            with fsspec.open(config_path, "r", encoding="utf-8") as stream:
                data = json.load(stream)
        except json.decoder.JSONDecodeError:
            # backwards compat.: retry with the comment-tolerant reader
            data = read_json_with_comments(config_path)
    elif ext in (".yml", ".yaml"):
        with fsspec.open(config_path, "r", encoding="utf-8") as stream:
            data = yaml.safe_load(stream)
    else:
        raise TypeError(f" [!] Unknown config file type {ext}")

    config_dict = {}
    config_dict.update(data)
    # the lower-cased model name selects the config class, which is instantiated with its defaults
    config = register_config(_process_model_name(config_dict).lower())()
    config.from_dict(config_dict)
    return config
101
+
102
+
103
def check_config_and_model_args(config, arg_name, value):
    """Tell whether `arg_name` equals `value`, looking in `config.model_args` first and `config` second.

    Returns False when the argument exists in neither place. This patches up the compatibility
    between models with and without `model_args`.

    TODO: Remove this in the future with a unified approach.
    """
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name] == value
    if not hasattr(config, arg_name):
        return False
    return config[arg_name] == value
118
+
119
+
120
def get_from_config_or_model_args(config, arg_name):
    """Return `arg_name` from `config.model_args` when it is listed there, else from `config` itself."""
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name]
    return config[arg_name]
126
+
127
+
128
def get_from_config_or_model_args_with_default(config, arg_name, def_val):
    """Return `arg_name` from `config.model_args`, else from `config`, else `def_val`."""
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name]
    return config[arg_name] if hasattr(config, arg_name) else def_val
submodules/TTS/TTS/config/shared_configs.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
@dataclass
class BaseAudioConfig(Coqpit):
    """Audio processing parameters; the fields are used to initialize
    ```TTS.utils.audio.AudioProcessor```.

    Args:
        fft_size (int):
            Number of STFT frequency levels, i.e. the size of a linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Length of the window applied to each audio frame; the frame is then zero-padded to
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Alternative to ```hop_length```, expressed in milliseconds and resolved with the sampling rate.
            Defaults to None.

        frame_length_ms (int):
            Alternative to ```win_length```, expressed in milliseconds and resolved with the sampling rate.
            Defaults to None.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int):
            Reference dB level used to rebase the audio signal; levels below it are ignored. 20 dB is
            assumed to be the sound of air. Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples.
            Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to dB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip.
            Defaults to ```True```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            Enable / Disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (float, optional):
            dB level used for RMS normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce
            the artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin-Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame length of each mel-spectrogram frame.
            Defaults to 80.

        mel_fmin (float):
            Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
            Defaults to None.

        spec_gain (int):
            Gain applied when converting amplitude to dB. Defaults to 20.

        do_amp_to_db_linear (bool, optional):
            Enable / Disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            Enable / Disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```1```.

        signal_norm (bool):
            Enable / Disable signal normalization. Defaults to True.

        min_level_db (int):
            Minimum dB threshold for the computed mel-spectrograms. Defaults to -100.

        symmetric_norm (bool):
            Enable / Disable symmetric normalization. If True the normalization range is [-k, k],
            otherwise [0, k]. Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            Enable / Disable clipping of out-of-range values in the normalized audio signal.
            Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # --- STFT ---
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # --- general audio processing ---
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # --- silence trimming ---
    do_trim_silence: bool = True
    trim_db: int = 45
    # --- RMS volume normalization ---
    do_rms_norm: bool = False
    db_level: float = None
    # --- Griffin-Lim ---
    power: float = 1.5
    griffin_lim_iters: int = 60
    # --- mel-spectrogram ---
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # --- F0 ---
    pitch_fmax: float = 640.0
    pitch_fmin: float = 1.0
    # --- normalization ---
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Run ``check_argument`` on the audio fields, one rule per field, in a fixed order."""
        c = asdict(self)
        # (field name, extra keyword arguments); every rule is checked with restricted=True
        rules = (
            ("num_mels", {"min_val": 10, "max_val": 2056}),
            # NOTE(review): 4058 may have been meant as 4096 - confirm before changing the limit.
            ("fft_size", {"min_val": 128, "max_val": 4058}),
            ("sample_rate", {"min_val": 512, "max_val": 100000}),
            ("frame_length_ms", {"min_val": 10, "max_val": 1000, "alternative": "win_length"}),
            ("frame_shift_ms", {"min_val": 1, "max_val": 1000, "alternative": "hop_length"}),
            ("preemphasis", {"min_val": 0, "max_val": 1}),
            ("min_level_db", {"min_val": -1000, "max_val": 10}),
            ("ref_level_db", {"min_val": 0, "max_val": 1000}),
            ("power", {"min_val": 1, "max_val": 5}),
            ("griffin_lim_iters", {"min_val": 10, "max_val": 1000}),
            # normalization parameters
            ("signal_norm", {}),
            ("symmetric_norm", {}),
            ("max_norm", {"min_val": 0.1, "max_val": 1000}),
            ("clip_norm", {}),
            ("mel_fmin", {"min_val": 0.0, "max_val": 1000}),
            ("mel_fmax", {"min_val": 500.0, "allow_none": True}),
            ("spec_gain", {"min_val": 1, "max_val": 100}),
            ("do_trim_silence", {}),
            ("trim_db", {}),
        )
        for field_name, limits in rules:
            check_argument(field_name, c, restricted=True, **limits)
189
+
190
+
191
@dataclass
class BaseDatasetConfig(Coqpit):
    """Description of one TTS dataset.

    Args:
        formatter (str):
            Name of the formatter in ```TTS.tts.datasets.formatter``` used for this dataset.
            Defaults to `""`.

        dataset_name (str):
            Unique name for the dataset. Defaults to `""`.

        path (str):
            Root path to the dataset files. Defaults to `""`.

        meta_file_train (str):
            Name of the dataset meta file. Or a list of speakers to be ignored at training for
            multi-speaker datasets. Defaults to `""`.

        ignored_speakers (List):
            List of speaker IDs that are not used at the training. Defaults to None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

        phonemizer (str):
            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`.
            Defaults to `""`.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation.
            Defaults to `""`.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require
            attention masks to train the duration predictor. Defaults to `""`.
    """

    formatter: str = ""
    dataset_name: str = ""
    path: str = ""
    meta_file_train: str = ""
    ignored_speakers: List[str] = None
    language: str = ""
    phonemizer: str = ""
    meta_file_val: str = ""
    meta_file_attn_mask: str = ""

    def check_values(
        self,
    ):
        """Run ``check_argument`` on the dataset fields; only the first three are restricted."""
        c = asdict(self)
        # (field name, restricted flag), checked in this order
        rules = (
            ("formatter", True),
            ("path", True),
            ("meta_file_train", True),
            ("meta_file_val", False),
            ("meta_file_attn_mask", False),
        )
        for field_name, is_restricted in rules:
            check_argument(field_name, c, restricted=is_restricted)
246
+
247
+
248
@dataclass
class BaseTrainingConfig(TrainerConfig):
    """Basic 🐸TTS training parameters shared among all the models. It extends the
    ```TrainerConfig``` imported from ```trainer```.

    Args:
        model (str):
            Name of the model that is used in the training. Defaults to None.

        num_loader_workers (int):
            Number of workers for training time dataloader. Defaults to 0.

        num_eval_loader_workers (int):
            Number of workers for evaluation time dataloader. Defaults to 0.

        use_noise_augment (bool):
            Noise augmentation switch; the code that reads it is outside this file. Defaults to False.
    """

    model: str = None
    # data loading
    num_loader_workers: int = 0
    num_eval_loader_workers: int = 0
    use_noise_augment: bool = False