diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..c67b46086c3677b541ea329d2cfaa853e61954f4
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,11 @@
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.ttf filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a035a3557968108caf446148e5b6b684f5f9d622
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,167 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+models
+videos
+temp
+tmp
+.env
diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem
new file mode 100644
index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3
--- /dev/null
+++ b/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8c200329368ddfdbe180fdbc2deda24ed7a9ce4
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+ "python-envs.defaultEnvManager": "ms-python.python:conda",
+ "python-envs.defaultPackageManager": "ms-python.python:conda",
+ "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..31755ff88b91debf5a477a814a25f36e190a97f7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,383 @@
+---
+title: BosonAI Hackathon
+emoji: 🏃
+colorFrom: pink
+colorTo: indigo
+sdk: gradio
+sdk_version: 5.49.1
+app_file: webui.py
+pinned: false
+license: apache-2.0
+short_description: BosonAI_Hackathon
+---
+
+# Intelligent Multi-language AI Dubbing/Translation Tool - Linly-Dubbing — "AI Empowerment, Language Without Borders"
+
+
+
+Linly-Dubbing WebUI
+
+[GitHub: Kedreamix/Linly-Dubbing](https://github.com/Kedreamix/Linly-Dubbing)
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb)
+[License: Apache 2.0](https://github.com/Kedreamix/Linly-Dubbing/blob/main/LICENSE)
+
+[**English**](./README.md) | [**中文简体**](./README_zh.md)
+
+
+
+---
+
+
+Table of Contents
+
+
+- [Intelligent Multi-language AI Dubbing/Translation Tool - Linly-Dubbing — "AI Empowerment, Language Without Borders"](#intelligent-multi-language-ai-dubbingtranslation-tool---linly-dubbing--ai-empowerment-language-without-borders)
+ - [Introduction](#introduction)
+ - [TO DO LIST](#to-do-list)
+ - [Examples](#examples)
+ - [Installation and Usage Guide](#installation-and-usage-guide)
+ - [Test Environment](#test-environment)
+ - [1. Clone the Repository](#1-clone-the-repository)
+ - [2. Install Dependencies](#2-install-dependencies)
+ - [3. Configure Environment Variables](#3-configure-environment-variables)
+ - [4. Run the Application](#4-run-the-application)
+ - [Detailed Features and Technical Details](#detailed-features-and-technical-details)
+ - [Automatic Video Download](#automatic-video-download)
+ - [Vocal Separation](#vocal-separation)
+ - [Demucs](#demucs)
+ - [UVR5](#uvr5)
+ - [AI Speech Recognition](#ai-speech-recognition)
+ - [WhisperX](#whisperx)
+ - [FunASR](#funasr)
+ - [Large Language Model Translation](#large-language-model-translation)
+ - [OpenAI API](#openai-api)
+ - [Qwen](#qwen)
+ - [Google Translate](#google-translate)
+ - [AI-Powered Speech Synthesis](#ai-powered-speech-synthesis)
+ - [Edge TTS](#edge-tts)
+ - [XTTS](#xtts)
+ - [CosyVoice](#cosyvoice)
+ - [GPT-SoVITS](#gpt-sovits)
+ - [Video Processing](#video-processing)
+ - [Digital Human Lip-Sync Technology](#digital-human-lip-sync-technology)
+ - [License](#license)
+ - [References](#references)
+ - [Star History](#star-history)
+
+
+
+
+## Introduction
+
+`Linly-Dubbing` is an intelligent multi-language AI dubbing and translation tool inspired by [`YouDub-webui`](https://github.com/liuzhao1225/YouDub-webui) and further extended and optimized. We aim to offer diverse and high-quality dubbing options by integrating [`Linly-Talker`](https://github.com/Kedreamix/Linly-Talker)’s digital human lip-sync technology, creating a more natural multi-language video experience.
+
+Leveraging cutting-edge AI technologies, `Linly-Dubbing` sets new standards in naturalness and accuracy for multi-language dubbing, making it ideal for international education, global content localization, and more. It helps teams extend their reach and share high-quality content worldwide.
+
+Key features include:
+
+- **Multi-language Support**: Offers dubbing and subtitle translation in Chinese and many other languages to meet global needs.
+- **AI Speech Recognition**: Employs advanced AI for precise speech-to-text conversion and speaker recognition.
+- **Large Language Model Translation**: Uses leading language models like GPT for fast and accurate translations, ensuring professional quality.
+- **AI Voice Cloning**: Utilizes cutting-edge voice cloning to generate speech closely matching the original video's tone and emotion.
+- **Digital Human Lip-Sync Technology**: Synchronizes dubbing with video visuals, enhancing realism and interactivity.
+- **Flexible Upload and Translation**: Users can upload their own videos and choose the translation language and quality settings, ensuring personalization and flexibility.
+- **Regular Updates**: Continuously introduces the latest models to stay at the forefront of dubbing and translation technology.
+
+Our mission is to provide seamless, high-quality multi-language dubbing and translation services, empowering content creators and businesses to thrive in global markets.
+
+---
+
+## TO DO LIST
+
+- [x] Implement basic AI dubbing and smart translation features.
+- [x] Integrate CosyVoice’s AI voice cloning for high-quality audio translation.
+- [x] Add FunASR AI speech recognition algorithm with optimized Chinese support.
+- [x] Utilize Qwen large language model for multi-language translation.
+- [x] Develop Linly-Dubbing WebUI for easy one-click video generation with customizable parameters.
+- [ ] Integrate UVR5 for vocal/accompaniment separation and reverb removal, referencing GPT-SoVITS.
+- [ ] Improve voice cloning naturalness by fine-tuning with GPT-SoVITS.
+- [ ] Implement and optimize digital human lip-sync technology for better dubbing and visual coherence.
+
+---
+
+## Examples
+
+| Original Video | Linly-Dubbing |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| | |
+
+---
+
+## Installation and Usage Guide
+
+### Test Environment
+
+This guide applies to the following test environments:
+
+- Python 3.10, PyTorch 2.3.1, CUDA 12.1
+- Python 3.10, PyTorch 2.3.1, CUDA 11.8
+
+Follow the steps below to install and configure `Linly-Dubbing`.
+
+> [!NOTE]
+>
+> A Colab script is also available for an online experience: [Linly-Dubbing Colab](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb).
+
+### 1. Clone the Repository
+
+First, clone the `Linly-Dubbing` repository to your local machine and initialize submodules.
+
+```bash
+# Clone the project to your local machine
+git clone https://github.com/Kedreamix/Linly-Dubbing.git --depth 1
+
+# Navigate to the project directory
+cd Linly-Dubbing
+
+# Initialize and update submodules like CosyVoice
+git submodule update --init --recursive
+```
+
+### 2. Install Dependencies
+
+Before proceeding, please create a new Python environment and install the required dependencies.
+
+```bash
+# Create a conda environment named 'linly_dubbing' and specify Python version 3.10
+conda create -n linly_dubbing python=3.10 -y
+
+# Activate the newly created environment
+conda activate linly_dubbing
+
+# Navigate to the project directory
+cd Linly-Dubbing/
+
+# Install the ffmpeg tool
+# Install ffmpeg using conda
+conda install ffmpeg==7.0.2 -c conda-forge
+# Install ffmpeg using a domestic mirror
+conda install ffmpeg==7.0.2 -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
+
+# Upgrade pip to the latest version
+python -m pip install --upgrade pip
+
+# Change the PyPI source to speed up package downloads
+pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+Depending on your CUDA version, install PyTorch and related libraries using the following commands:
+
+```bash
+# For CUDA 11.8
+pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
+
+# For CUDA 12.1
+pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
+```
+
+If you prefer to install PyTorch via conda, you can use the following commands:
+
+```bash
+# For CUDA 11.8
+conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia
+
+# For CUDA 12.1
+conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
+```
+
+> [!NOTE]
+>
+> This installation step may take a long time.
+
+Next, install the remaining project dependencies:
+
+```bash
+# Install the required Python packages for the project
+# pynini is required by WeTextProcessing, so use conda to install it as it works across all platforms.
+conda install -y pynini==2.1.5 -c conda-forge
+# -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
+
+pip install -r requirements.txt
+# Install dependencies for submodules
+pip install -r requirements_module.txt
+```
+
+> [!TIP]
+>
+> If you encounter an error during installation that says "Could not load library libcudnn_ops_infer.so.8," follow these steps to fix it:
+>
+> ```bash
+> # Set LD_LIBRARY_PATH to include the correct cuDNN library path
+> export LD_LIBRARY_PATH=`python3 -c 'import os; import torch; print(os.path.dirname(os.path.dirname(torch.__file__)) +"/nvidia/cudnn/lib")'`:$LD_LIBRARY_PATH
+> ```
+>
+
+### 3. Configure Environment Variables
+
+Before running the program, you need to configure the necessary environment variables. In the root directory of the project, create a `.env` file by renaming `env.example` and filling in the following variables:
+
+- `OPENAI_API_KEY`: Your OpenAI API key, usually formatted as `sk-xxx`.
+- `MODEL_NAME`: The name of the model you are using, such as `gpt-4` or `gpt-3.5-turbo`.
+- `OPENAI_API_BASE`: If you are using a self-hosted OpenAI model, provide the corresponding API base URL here.
+- `HF_TOKEN`: Your Hugging Face API token, used to access and download models.
+- `HF_ENDPOINT`: A custom Hugging Face endpoint, which can be specified if you encounter issues with model downloading.
+- `APPID` and `ACCESS_TOKEN`: Credentials for the ByteDance Volcano Engine TTS service.
+- `BAIDU_API_KEY` and `BAIDU_SECRET_KEY`: Used for Baidu's Ernie Bot API.
+
+> [!NOTE]
+>
+> In most cases, you only need to configure `MODEL_NAME` and `HF_TOKEN`.
+>
+> By default, `MODEL_NAME` is set to `Qwen/Qwen1.5-4B-Chat`, so you do not need to configure the `OPENAI_API_KEY`.
+
+> [!TIP]
+>
+> Since smaller local models offer limited translation quality, it is recommended to use a larger model or a better API. I personally recommend OpenAI's API; if cost is a concern, you can try Baidu's Ernie Bot API, which offers free API access. Simply apply for the API and add the credentials to your environment variables: https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1.
+>
+> You can obtain your `HF_TOKEN` from [Hugging Face](https://huggingface.co/settings/tokens). If you wish to use the **speaker separation feature**, make sure to request access to [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1). Otherwise, you can opt not to enable this feature.
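+
+For reference, a minimal `.env` combining the variables above might look like this (the values are placeholders, not real credentials):
+
+```bash
+# Translation backend: the default local model needs no OpenAI key
+MODEL_NAME=Qwen/Qwen1.5-4B-Chat
+# Uncomment these only when using the OpenAI API instead of a local model
+# OPENAI_API_KEY=sk-xxx
+# OPENAI_API_BASE=https://api.openai.com/v1
+
+# Hugging Face token, needed for gated models such as pyannote/speaker-diarization-3.1
+HF_TOKEN=hf_xxx
+```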
+
+### 4. Run the Application
+
+Before launching the application, run the following commands to automatically download the required models (including Qwen, XTTSv2, and faster-whisper-large-v3):
+
+```bash
+# For Linux
+bash scripts/download_models.sh
+
+# For Windows
+python scripts/modelscope_download.py
+# Download the wav2vec2_fairseq_base_ls960_asr_ls960.pth file and place it in the models/ASR/whisper folder
+wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \
+ -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth
+```
+
+
+
+Once the download is complete, launch the WebUI interface using the following command:
+
+```bash
+python webui.py
+```
+
+After starting, you will see an interface like the one below. You can open [http://127.0.0.1:6006](http://127.0.0.1:6006) to explore the application:
+
+
+
+
+
+## Detailed Features and Technical Details
+
+### Automatic Video Download
+
+**yt-dlp** is a powerful open-source command-line tool designed for downloading video and audio from YouTube and other websites. This tool offers a wide range of parameter options, allowing users to customize download behavior to their needs. Whether choosing specific formats, resolutions, or extracting audio, yt-dlp provides flexible solutions. It also supports extensive post-processing features, such as automatically adding metadata and renaming files. For more details on parameters and usage, refer to the [yt-dlp official repository](https://github.com/yt-dlp/yt-dlp).
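+
+As a quick illustration (these are generic yt-dlp commands, not necessarily the exact options `Linly-Dubbing` uses internally):
+
+```bash
+# Download the best available video+audio, naming the file after the video title
+yt-dlp -f "bestvideo+bestaudio/best" -o "%(title)s.%(ext)s" "https://www.youtube.com/watch?v=VIDEO_ID"
+
+# Extract only the audio track, converted to mp3
+yt-dlp -x --audio-format mp3 "https://www.youtube.com/watch?v=VIDEO_ID"
+```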
+
+### Vocal Separation
+
+#### Demucs
+
+**Demucs** is an advanced sound separation model developed by the Facebook research team, designed to separate different sound sources from mixed audio. Although its architecture is simple, Demucs is powerful enough to isolate instruments, voices, and background noise, making it easier for users to perform post-processing and editing. Its user-friendly design has made it a preferred tool for many audio processing applications, including music production and post-production in films. More information can be found on the [Demucs project page](https://github.com/facebookresearch/demucs).
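+
+For a quick feel of how this step works, Demucs can be driven from the command line; a minimal sketch (assuming the `demucs` pip package is installed):
+
+```bash
+# Split a track into "vocals" and "no_vocals" stems; results are written under ./separated/
+demucs --two-stems=vocals input_audio.mp3
+```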
+
+#### UVR5
+
+**UVR5 (Ultimate Vocal Remover)** is one of the best tools for vocal and accompaniment separation. It excels in generating high-quality accompaniments and vocal extractions, outperforming tools like RX9, RipX, and SpectraLayers 9. The extracted accompaniments are nearly indistinguishable from the original stereo tracks, and UVR5 is both open-source and free. Find the source code at: [https://github.com/Anjok07/ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui).
+
+WebUI reference: [https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5](https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5)
+
+Model weights reference: [https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights)
+
+### AI Speech Recognition
+
+#### WhisperX
+
+**WhisperX** is an extension of OpenAI's Whisper speech recognition system, specifically designed for generating and aligning subtitles for videos. Unlike traditional speech recognition systems, WhisperX not only accurately transcribes spoken content into text but also aligns it with video frames to generate timestamped subtitle files. This precise alignment makes video editing and subtitle generation more efficient and intuitive. WhisperX also supports multi-speaker recognition, providing detailed speaker information for richer, easier-to-understand subtitles.
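+
+A typical command-line invocation might look like the following (a sketch only; exact flags can vary between WhisperX versions, and `--diarize` requires the `HF_TOKEN` and pyannote access described in the environment-variable section above):
+
+```bash
+# Transcribe, word-align, and optionally label speakers
+whisperx input_audio.mp3 --model large-v3 --language en --diarize --hf_token $HF_TOKEN --output_dir transcripts
+```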
+
+#### FunASR
+
+**FunASR** is a comprehensive speech recognition toolkit offering a wide range of speech processing features, including Automatic Speech Recognition (ASR), Voice Activity Detection (VAD), punctuation restoration, language modeling, speaker verification, speaker separation, and multi-speaker dialogue recognition. FunASR is particularly optimized for Chinese speech and offers pre-trained models with easy fine-tuning interfaces. It’s a significant tool in the field of speech recognition, widely used in voice assistants, automatic subtitle generation, and more. For more information, visit the [FunASR project](https://github.com/alibaba-damo-academy/FunASR).
+
+### Large Language Model Translation
+
+#### OpenAI API
+
+`Linly-Dubbing` uses OpenAI's large language models, such as GPT-4 and GPT-3.5-turbo, to perform high-quality translations via API. OpenAI's models are renowned for their natural language understanding and high-precision text generation capabilities, commonly used in tasks like dialogue generation and text analysis. You can find more details about the models and usage in the [OpenAI official documentation](https://platform.openai.com/docs/models).
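+
+Because this is a standard chat-completions HTTP API, a single translation request can be sketched with `curl`, using the `OPENAI_API_KEY` configured earlier (the prompt below is illustrative, not the project's actual prompt):
+
+```bash
+curl https://api.openai.com/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -d '{
+        "model": "gpt-3.5-turbo",
+        "messages": [
+          {"role": "system", "content": "Translate the user subtitle line into Chinese."},
+          {"role": "user", "content": "Hello, welcome to Linly-Dubbing."}
+        ]
+      }'
+```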
+
+#### Qwen
+
+**Qwen** is a localized large language model that supports multi-language translation. Although its performance may not match OpenAI's top models, its open-source nature and local execution make it a cost-effective option. Qwen is capable of handling text translations across various languages and serves as a powerful open-source alternative. More details can be found on the [Qwen project page](https://github.com/QwenLM/Qwen).
+
+#### Google Translate
+
+As a supplement to the translation features, `Linly-Dubbing` also integrates [Google Translate](https://py-googletrans.readthedocs.io/en/latest/). Google Translate offers broad language support and good translation quality, making it suitable for quickly obtaining approximate translations.
+
+### AI-Powered Speech Synthesis
+
+#### Edge TTS
+
+**Edge TTS** is a high-quality text-to-speech conversion service provided by Microsoft. It supports multiple languages and voice styles, capable of generating natural and fluent voice output. With Edge TTS, `Linly-Dubbing` can generate high-quality speech from text, making content more lively and understandable. For more information, refer to the [Edge TTS official documentation](https://github.com/rany2/edge-tts).
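+
+The open-source `edge-tts` command-line client gives a quick feel for the service; a small sketch:
+
+```bash
+# List available voices, then synthesize a sentence with a specific voice
+edge-tts --list-voices | head
+edge-tts --voice en-US-AriaNeural --text "Hello, welcome to Linly-Dubbing." --write-media hello.mp3
+```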
+
+#### XTTS
+
+**Coqui XTTS** is an advanced deep learning text-to-speech toolkit focused on voice cloning and multi-language speech synthesis. XTTS can achieve voice cloning using brief audio snippets and generate realistic speech output. It offers a variety of pre-trained models and development tools, supporting training and fine-tuning of new models. Users can explore XTTS's capabilities online at [Hugging Face](https://huggingface.co/spaces/coqui/xtts) or visit the [official GitHub repository](https://github.com/coqui-ai/TTS) for more technical details.
+
+- Try XTTS online: [Hugging Face](https://huggingface.co/spaces/coqui/xtts)
+- Official GitHub repository: [Coqui TTS](https://github.com/coqui-ai/TTS)
+
+#### CosyVoice
+
+**CosyVoice** is a multi-language speech understanding and synthesis model developed by Alibaba's Tongyi Lab, supporting Chinese, English, Japanese, Cantonese, Korean, and more. CosyVoice is trained on over 150,000 hours of voice data and enables high-quality speech synthesis and cross-lingual voice cloning. It excels at generating natural and coherent speech across languages, with support for one-shot voice cloning, needing only 3 to 10 seconds of original audio to generate a similar voice. For more information and model details, visit the [CosyVoice project](https://github.com/FunAudioLLM/CosyVoice).
+
+Main features and characteristics:
+
+1. **Multi-language support**: Handles speech synthesis tasks in various languages.
+2. **Multi-style speech synthesis**: Controls the emotion and tone of speech through commands.
+3. **Streaming inference support**: Future plans include real-time streaming inference support.
+
+#### GPT-SoVITS
+
+Thanks to the contributions of the open-source community, AI speech synthesis also benefits from the open-source voice cloning model `GPT-SoVITS`. **GPT** is a transformer-based natural language processing model with strong text generation capabilities, while **SoVITS** is a deep learning-based voice conversion technology capable of converting one person's voice into another’s. By combining these two technologies, **GPT-SoVITS** can generate highly realistic speech that matches the given text content.
+
+The project can be found at [https://github.com/RVC-Boss/GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS). Key features include:
+
+1. **Zero-shot Text-to-Speech (TTS):** Input a 5-second voice sample to experience instant text-to-speech conversion.
+2. **Few-shot TTS:** Fine-tune the model with just 1 minute of training data to improve voice similarity and realism.
+3. **Cross-lingual support:** Inference across languages different from the training dataset, currently supporting English, Japanese, and Chinese.
+4. **WebUI tools:** Integrated tools include voice accompaniment separation, automatic dataset splitting, Chinese automatic speech recognition (ASR), and text annotation to help beginners create training datasets and GPT/SoVITS models.
+
+### Video Processing
+
+In terms of video processing, `Linly-Dubbing` provides robust functionality support. Users can easily add subtitles, insert background music, adjust background music volume, and modify overall playback speed. With these features, users can customize video content to make it more engaging and personalized.
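+
+This kind of processing maps naturally onto ffmpeg; the commands below are only sketches of the individual steps, not the project's exact pipeline:
+
+```bash
+# Burn subtitles into the video
+ffmpeg -i dubbed.mp4 -vf "subtitles=subtitles.srt" with_subs.mp4
+
+# Mix in background music at 30% volume, keeping the original video stream untouched
+ffmpeg -i dubbed.mp4 -i bgm.mp3 \
+  -filter_complex "[1:a]volume=0.3[bg];[0:a][bg]amix=inputs=2:duration=first[aout]" \
+  -map 0:v -map "[aout]" -c:v copy with_bgm.mp4
+
+# Speed the whole video up by 1.25x (video and audio together)
+ffmpeg -i with_bgm.mp4 \
+  -filter_complex "[0:v]setpts=PTS/1.25[v];[0:a]atempo=1.25[a]" \
+  -map "[v]" -map "[a]" faster.mp4
+```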
+
+### Digital Human Lip-Sync Technology
+
+Inspired by `Linly-Talker`, this project focuses on digital human lip-sync technology. By combining advanced computer vision and speech recognition technologies, `Linly-Talker` allows digital characters' lip movements to match voiceovers precisely, achieving highly natural synchronization. This technology is not only applicable to animated characters but can also be used in scenarios such as virtual presenters or educators in instructional videos. `Linly-Talker` enhances digital character performance with accurate lip-sync and vivid facial expressions, providing a more immersive experience for the audience. This advanced digital human lip-sync technology significantly improves the professionalism and viewing experience of video content. For more information, refer to [https://github.com/Kedreamix/Linly-Talker](https://github.com/Kedreamix/Linly-Talker).
+
+---
+
+## License
+
+> [!CAUTION]
+>
+> When using this tool, please comply with relevant laws, including copyright, data protection, and privacy laws. Do not use this tool without permission from the original author and/or rights holder.
+
+`Linly-Dubbing` is released under the Apache License 2.0. When using this tool, please comply with relevant laws, including copyright, data protection, and privacy laws, and do not use it without permission from the original author and/or rights holder.
+
+---
+
+## References
+
+In developing this project, I referenced and drew inspiration from several outstanding open-source projects and related resources. Special thanks to the developers and contributors of those projects and to the open-source community. The main projects referenced are listed below:
+
+- [YouDub-webui](https://github.com/liuzhao1225/YouDub-webui): Provides a feature-rich web interface for downloading and processing YouTube videos, from which we drew much inspiration and technical implementation details.
+
+- [Coqui TTS](https://github.com/coqui-ai/TTS)
+- [Qwen](https://github.com/QwenLM/Qwen)
+- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
+- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
+- [Linly-Talker](https://github.com/Kedreamix/Linly-Talker)
+
+---
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=Kedreamix/Linly-Dubbing&type=Date)](https://star-history.com/#Kedreamix/Linly-Dubbing&Date)
diff --git a/README_zh.md b/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..7aa769149f28a90047310016537f172a2e64f6fc
--- /dev/null
+++ b/README_zh.md
@@ -0,0 +1,371 @@
+# 智能视频多语言AI配音/翻译工具 - Linly-Dubbing — “AI赋能,语言无界”
+
+
+
+Linly-Dubbing WebUI
+
+[GitHub: Kedreamix/Linly-Dubbing](https://github.com/Kedreamix/Linly-Dubbing)
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb)
+[License: Apache 2.0](https://github.com/Kedreamix/Linly-Dubbing/blob/main/LICENSE)
+
+[**English**](./README.md) | [**中文简体**](./README_zh.md)
+
+
+
+---
+
+
+目录
+
+
+- [智能视频多语言AI配音/翻译工具 - Linly-Dubbing — “AI赋能,语言无界”](#智能视频多语言ai配音翻译工具---linly-dubbing--ai赋能语言无界)
+ - [介绍](#介绍)
+ - [TO DO LIST](#to-do-list)
+ - [示例](#示例)
+ - [安装与使用指南](#安装与使用指南)
+ - [测试环境](#测试环境)
+ - [1. 克隆代码仓库](#1-克隆代码仓库)
+ - [2. 安装依赖环境](#2-安装依赖环境)
+ - [3. 配置环境变量](#3-配置环境变量)
+ - [4. 运行程序](#4-运行程序)
+ - [详细功能和技术细节](#详细功能和技术细节)
+ - [自动下载视频](#自动下载视频)
+ - [人声分离](#人声分离)
+ - [Demucs](#demucs)
+ - [UVR5](#uvr5)
+ - [AI 智能语音识别](#ai-智能语音识别)
+ - [WhisperX](#whisperx)
+ - [FunASR](#funasr)
+ - [大型语言模型字幕翻译](#大型语言模型字幕翻译)
+ - [OpenAI API](#openai-api)
+ - [Qwen](#qwen)
+ - [Google Translate](#google-translate)
+ - [AI 语音合成](#ai-语音合成)
+ - [Edge TTS](#edge-tts)
+ - [XTTS](#xtts)
+ - [CosyVoice](#cosyvoice)
+ - [GPT-SoVITS](#gpt-sovits)
+ - [视频处理](#视频处理)
+ - [数字人对口型技术](#数字人对口型技术)
+ - [许可协议](#许可协议)
+ - [参考](#参考)
+ - [Star History](#star-history)
+
+
+
+
+## 介绍
+
+`Linly-Dubbing` 是一个智能视频多语言AI配音和翻译工具,它融合了[`YouDub-webui`](https://github.com/liuzhao1225/YouDub-webui)的灵感,并在此基础上进行了拓展和优化。我们致力于提供更加多样化和高质量的配音选择,通过集成[`Linly-Talker`](https://github.com/Kedreamix/Linly-Talker)的数字人对口型技术,为用户带来更加自然的多语言视频体验。
+
+通过整合最新的AI技术,`Linly-Dubbing` 在多语言配音的自然性和准确性方面达到了新的高度,适用于国际教育、全球娱乐内容本地化等多种场景,帮助团队将优质内容传播到全球各地。
+
+主要特点包括:
+
+- **多语言支持**: 支持中文及多种其他语言的配音和字幕翻译,满足国际化需求。
+- **AI 智能语音识别**: 使用先进的AI技术进行语音识别,提供精确的语音到文本转换和说话者识别。
+- **大型语言模型翻译**: 结合领先的本地化大型语言模型(如GPT),快速且准确地进行翻译,确保专业性和自然性。
+- **AI 声音克隆**: 利用尖端的声音克隆技术,生成与原视频配音高度相似的语音,保持情感和语调的连贯性。
+- **数字人对口型技术**: 通过对口型技术,使配音与视频画面高度契合,提升真实性和互动性。
+- **灵活上传与翻译**: 用户可以上传视频,自主选择翻译语言和标准,确保个性化和灵活性。
+- **定期更新**: 持续引入最新模型,保持配音和翻译的领先地位。
+
+我们旨在为用户提供无缝、高质量的多语言视频配音和翻译服务,为内容创作者和企业在全球市场中提供有力支持。
+
+---
+
+## TO DO LIST
+
+- [x] 完成AI配音和智能翻译功能的基础实现
+- [x] 集成CosyVoice的AI声音克隆算法,实现高质量音频翻译
+- [x] 增加FunASR的AI语音识别算法,特别优化对中文的支持
+- [x] 利用Qwen大语言模型实现多语言翻译
+- [x] 开发Linly-Dubbing WebUI,提供一键生成最终视频的便捷功能,并支持多种参数配置
+- [ ] 加入UVR5进行人声/伴奏分离和混响移除,参考GPTSoVits
+- [ ] 提升声音克隆的自然度,考虑使用GPTSoVits进行微调,加入GPTSoVits
+- [ ] 实现并优化数字人对口型技术,提升配音与画面的契合度
+
+---
+
+## 示例
+
+| 原视频 | Linly-Dubbing |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| | |
+
+---
+
+## 安装与使用指南
+
+### 测试环境
+
+本指南适用于以下测试环境:
+
+- Python 3.10, PyTorch 2.3.1, CUDA 12.1
+- Python 3.10, PyTorch 2.3.1, CUDA 11.8
+
+请按照以下步骤进行`Linly-Dubbing`的安装与配置。
+
+> [!NOTE]
+>
+> 此外,我还提供了一个Colab脚本,您可以点击 [Linly-Dubbing Colab](https://colab.research.google.com/github/Kedreamix/Linly-Dubbing/blob/main/colab_webui.ipynb) 进行在线体验。
+
+### 1. 克隆代码仓库
+
+首先,您需要将`Linly-Dubbing`项目的代码克隆到本地,并初始化子模块。以下是具体操作步骤:
+
+```bash
+# 克隆项目代码到本地
+git clone https://github.com/Kedreamix/Linly-Dubbing.git --depth 1
+
+# 进入项目目录
+cd Linly-Dubbing
+
+# 初始化并更新子模块,如CosyVoice等
+git submodule update --init --recursive
+```
+
+### 2. 安装依赖环境
+
+在继续之前,请创建一个新的Python环境,并安装所需的依赖项。
+
+```bash
+# 创建名为 'linly_dubbing' 的conda环境,并指定Python版本为3.10
+conda create -n linly_dubbing python=3.10 -y
+
+# 激活新创建的环境
+conda activate linly_dubbing
+
+# 进入项目目录
+cd Linly-Dubbing/
+
+# 安装ffmpeg工具
+# 使用conda安装ffmpeg
+conda install ffmpeg==7.0.2 -c conda-forge
+# 使用国内镜像安装ffmpeg
+conda install ffmpeg==7.0.2 -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
+
+# 升级pip到最新版本
+python -m pip install --upgrade pip
+
+# 更改PyPI源地址以加快包的下载速度
+pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+根据您的CUDA版本,使用以下命令安装PyTorch及相关库:
+
+```bash
+# 对于CUDA 11.8
+pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
+
+# 对于CUDA 12.1
+pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
+```
+
+如果您倾向于通过conda安装PyTorch,可以选择以下命令:
+
+```bash
+# 对于CUDA 11.8
+conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia
+
+# 对于CUDA 12.1
+conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
+```
+
+> [!NOTE]
+>
+> 安装过程可能耗时很长。
+
+然后,安装项目的其他依赖项:
+
+```bash
+# 安装项目所需的Python包
+# pynini is required by WeTextProcessing; use conda to install it, as the conda package works on all platforms.
+conda install -y pynini==2.1.5 -c conda-forge
+# -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
+
+pip install -r requirements.txt
+# 安装submodules 下的依赖
+pip install -r requirements_module.txt
+```
+
+> [!TIP]
+>
+> 如在安装过程中遇到错误提示“Could not load library libcudnn_ops_infer.so.8”,请按以下步骤修复:
+>
+> ```bash
+> # 设置LD_LIBRARY_PATH以包含正确的cuDNN库路径
+> export LD_LIBRARY_PATH=`python3 -c 'import os; import torch; print(os.path.dirname(os.path.dirname(torch.__file__)) +"/nvidia/cudnn/lib")'`:$LD_LIBRARY_PATH
+> ```
+
+### 3. 配置环境变量
+
+在运行程序前,您需要配置必要的环境变量。请先将项目根目录下的 `env.example` 改名为 `.env`,并填入以下环境变量:
+
+- `OPENAI_API_KEY`: 您的OpenAI API密钥,格式通常为 `sk-xxx`。
+- `MODEL_NAME`: 使用的模型名称,如 `gpt-4` 或 `gpt-3.5-turbo`。
+- `OPENAI_API_BASE`: 如使用自部署的OpenAI模型,请填写对应的API基础URL。
+- `HF_TOKEN`: Hugging Face的API Token,用于访问和下载模型。
+- `HF_ENDPOINT`: 当遇到模型下载问题时,可指定自定义的Hugging Face端点。
+- `APPID` 和 `ACCESS_TOKEN`: 用于火山引擎TTS的凭据。
+- `BAIDU_API_KEY`和`BAIDU_SECRET_KEY`: 用于百度文心一言的API
+
+> [!NOTE]
+>
+> 通常,您只需配置 `MODEL_NAME` 和 `HF_TOKEN` 即可。
+>
+> 默认情况下,`MODEL_NAME` 设为 `Qwen/Qwen1.5-4B-Chat`,因此无需额外配置 `OPENAI_API_KEY`。
+
+> [!TIP]
+>
+> 由于普通本地大模型的效果有限,建议使用规模较大的模型或更好的 API。个人推荐 OpenAI 的 API;如果考虑费用,可以尝试百度文心一言的免费 API,申请后填入环境变量即可:[https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1](https://console.bce.baidu.com/qianfan/ais/console/applicationConsole/application/v1)
+>
+> 可以在 [Hugging Face](https://huggingface.co/settings/tokens) 获取 `HF_TOKEN`。若需使用**说话人分离功能**,务必在[pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)申请访问权限。否则,可以选择不启用该功能。
+
+### 4. 运行程序
+
+在启动程序前,先通过以下命令自动下载所需的模型(包括Qwen,XTTSv2,和faster-whisper-large-v3模型):
+
+```bash
+# Linux 终端运行
+bash scripts/download_models.sh
+
+# Windows
+python scripts/modelscope_download.py
+# 下载wav2vec2_fairseq_base_ls960_asr_ls960.pth文件放在models/ASR/whisper文件夹下
+wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \
+ -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth
+```
+
+
+
+下载完成后,使用以下命令启动WebUI用户界面:
+
+```bash
+python webui.py
+```
+
+启动后,您将看到如下图所示的界面,可以打开 [http://127.0.0.1:6006](http://127.0.0.1:6006) 进行体验:
+
+
+
+---
+
+## 详细功能和技术细节
+
+### 自动下载视频
+
+**yt-dlp** 是一款强大的开源命令行工具,专为从 YouTube 和其他网站下载视频和音频而设计。该工具具有广泛的参数选项,允许用户根据需求精细地定制下载行为。无论是选择特定的格式、分辨率,还是提取音频,yt-dlp 都能提供灵活的解决方案。此外,yt-dlp 支持丰富的后处理功能,如自动添加元数据、自动重命名文件等。有关详细的参数和使用方法,请参考 [yt-dlp 的官方仓库](https://github.com/yt-dlp/yt-dlp)。
+
+### 人声分离
+
+#### Demucs
+
+**Demucs** 是由 Facebook 研究团队开发的一个先进的声音分离模型,旨在从混合音频中分离出不同的声音源。Demucs 的架构简单,但功能强大,它能够将乐器、声音和背景音分离开来,使用户能够更方便地进行后期处理和编辑。其简单易用的设计使得它成为许多声音处理应用的首选工具,广泛用于音乐制作、影视后期等领域。更多信息可以参见 [Demucs 的项目页面](https://github.com/facebookresearch/demucs)。
+
+#### UVR5
+
+**UVR5**(Ultimate Vocal Remover)是目前最优秀的人声/伴奏分离工具之一,其表现优于 RX9、RipX 和 SpectraLayers 9 等同类工具,提取出的伴奏非常接近原版立体声,而且开源免费。开源地址:[https://github.com/Anjok07/ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)。
+
+WebUI参考:[https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5](https://github.com/RVC-Boss/GPT-SoVITS/tree/main/tools/uvr5)
+
+权重文件参考:[https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights)
+
+### AI 智能语音识别
+
+#### WhisperX
+
+**WhisperX** 是 OpenAI 开发的 Whisper 语音识别系统的扩展版本,专注于生成和对齐视频字幕。与传统语音识别系统不同,WhisperX 不仅能够将语音内容精确地转录为文字,还能与视频帧进行精确对齐,生成带有时间戳的字幕文件。这种精准的对齐功能使视频编辑和字幕生成变得更加高效和直观。WhisperX 还支持多说话者识别,提供详尽的说话者信息,使得字幕内容更加丰富和易于理解。
+
+#### FunASR
+
+**FunASR** 是一个综合性的语音识别工具包,提供广泛的语音处理功能,包括语音识别(ASR)、语音活动检测(VAD)、标点符号恢复、语言模型、说话人验证、说话人分离以及多说话者对话识别等。FunASR 尤其针对中文语音进行了优化,提供了预训练模型及其微调的便捷接口。它是语音识别领域中的重要工具,广泛应用于语音助手、自动字幕生成等场景。详细信息可参考 [FunASR 项目](https://github.com/alibaba-damo-academy/FunASR)。
+
+### 大型语言模型字幕翻译
+
+#### OpenAI API
+
+`Linly-Dubbing` 采用 OpenAI 提供的多种大型语言模型,如 GPT-4 和 GPT-3.5-turbo,通过 API 接口进行高质量的翻译。OpenAI 的这些模型以其自然语言理解能力和高精度的生成文本能力著称,广泛用于对话生成、文本分析等任务。用户可以访问 [OpenAI 官方文档](https://platform.openai.com/docs/models) 了解更多模型信息和使用细节。
+
+#### Qwen
+
+**Qwen** 是一个本地化的大型语言模型,支持多语言翻译。虽然其性能可能不如 OpenAI 的顶级模型,但其开放源码和本地运行的特性使得它成为一个经济高效的选择。Qwen 能够处理多种语言的文本翻译,是一个强大的开源替代方案。详情请参见 [Qwen 项目](https://github.com/QwenLM/Qwen)。
+
+#### Google Translate
+
+作为翻译功能的补充,`Linly-Dubbing` 还集成了 [Google Translate](https://py-googletrans.readthedocs.io/en/latest/) 的翻译服务。Google Translate 提供广泛的语言支持和良好的翻译质量,特别适合快速获取大致翻译内容。
+
+### AI 语音合成
+
+#### Edge TTS
+
+**Edge TTS** 是微软提供的高质量文本到语音转换服务。它支持多种语言和声音样式,能够生成自然流畅的语音输出。通过 Edge TTS,`Linly-Dubbing` 可以实现从文本生成高质量的语音,使内容更加生动和易于理解。更多信息和使用方法请参见 [Edge TTS 官方文档](https://github.com/rany2/edge-tts)。
+
+#### XTTS
+
+**Coqui XTTS** 是一个先进的深度学习文本到语音工具包,专注于声音克隆和多语言语音合成。XTTS 能够通过短时间的音频片段实现声音克隆,并生成逼真的语音输出。它提供了丰富的预训练模型和开发工具,支持新模型的训练和微调。用户可以通过 [Hugging Face](https://huggingface.co/spaces/coqui/xtts) 在线体验和测试 XTTS 的功能,或者访问 [官方 GitHub 库](https://github.com/coqui-ai/TTS) 了解更多技术细节。
+
+- 在线体验 XTTS: [Hugging Face](https://huggingface.co/spaces/coqui/xtts)
+- 官方 GitHub 库: [Coqui TTS](https://github.com/coqui-ai/TTS)
+
+#### CosyVoice
+
+**CosyVoice** 是阿里通义实验室开发的多语言语音理解和合成模型,支持中文、英语、日语、粤语、韩语等多种语言。CosyVoice 经过超过 15 万小时的语音数据训练,能够实现高质量的语音合成和跨语言音色克隆。它特别擅长在不同语言之间生成自然、连贯的语音,支持 one-shot 音色克隆,仅需 3 至 10 秒的原始音频即可生成模拟音色。更多信息和模型详情请访问 [CosyVoice 项目](https://github.com/FunAudioLLM/CosyVoice)。
+
+主要功能和特性:
+1. **多语言支持**:处理多种语言的语音合成任务。
+2. **多风格语音合成**:通过指令控制语音的情感和语气。
+3. **流式推理支持**:计划未来支持实时流式推理。
+
+#### GPT-SoVITS
+
+感谢大家的开源贡献,AI 语音合成还借鉴了当前开源的语音克隆模型 **GPT-SoVITS**。**GPT** 是一种基于 Transformer 的自然语言处理模型,具有很强的文本生成能力;**SoVITS** 则是一种基于深度学习的语音转换技术,可以将一个人的语音转换成另一个人的语音。将这两种技术结合起来,**GPT-SoVITS** 可以生成与给定文本内容一致、高度逼真的语音。
+
+我认为效果是相当不错的,项目地址可参考[https://github.com/RVC-Boss/GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS),主要功能如下:
+
+1. **零样本文本到语音(TTS):** 输入 5 秒的声音样本,即刻体验文本到语音转换。
+2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型,提升声音相似度和真实感。
+3. **跨语言支持:** 支持与训练数据集不同语言的推理,目前支持英语、日语和中文。
+4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注,协助初学者创建训练数据集和 GPT/SoVITS 模型。
+
+### 视频处理
+
+在视频处理方面,`Linly-Dubbing` 提供了强大的功能支持。用户可以轻松添加字幕、插入背景音乐,并调整背景音乐的音量和整体播放速度等。通过这些功能,用户能够自定义视频内容,使之更具吸引力和个性化。
+
+### 数字人对口型技术
+
+本项目借鉴 `Linly-Talker`,专注于实现数字人的对口型技术。通过结合先进的计算机视觉和语音识别技术,`Linly-Talker` 能够使数字人角色的口型与配音精确匹配,从而实现高度自然的同步效果。这项技术不仅适用于动画角色,还可以应用于虚拟主播、教育视频中的讲解员等多种场景。`Linly-Talker` 通过精确的口型匹配和生动的面部表情,使虚拟人物的表现更加生动逼真,为观众提供更加沉浸的体验。这种先进的数字人对口型技术大大提升了视频内容的专业性和观赏价值。可参考 [https://github.com/Kedreamix/Linly-Talker](https://github.com/Kedreamix/Linly-Talker)。
+
+---
+
+## 许可协议
+
+> [!CAUTION]
+>
+> 在使用本工具时,请遵守相关法律,包括版权法、数据保护法和隐私法。未经原作者和/或版权所有者许可,请勿使用本工具。
+
+`Linly-Dubbing` 遵循 Apache License 2.0。在使用本工具时,请遵守相关法律,包括版权法、数据保护法和隐私法。未经原作者和/或版权所有者许可,请勿使用本工具。
+
+---
+
+## 参考
+
+在开发过程中,我参考并借鉴了多个优秀的开源项目及相关资源。特别感谢这些项目的开发者和开源社区的贡献,以下是我们参考的主要项目:
+
+- [YouDub-webui](https://github.com/liuzhao1225/YouDub-webui):提供了一个功能丰富的 Web 用户界面,用于 YouTube 视频的下载和处理,我们从中汲取了不少灵感和技术实现细节。
+- [Coqui TTS](https://github.com/coqui-ai/TTS)
+
+- [Qwen](https://github.com/QwenLM/Qwen)
+- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
+- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
+- [Linly-Talker](https://github.com/Kedreamix/Linly-Talker)
+
+---
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=Kedreamix/Linly-Dubbing&type=Date)](https://star-history.com/#Kedreamix/Linly-Dubbing&Date)
+
+---
+
diff --git a/apt.txt b/apt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..47fc5ada90cd0698316a6b577ad6e8cf2bfea982
--- /dev/null
+++ b/apt.txt
@@ -0,0 +1,3 @@
+ffmpeg
+libsndfile1
+libgl1
diff --git a/colab_webui.ipynb b/colab_webui.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..582dabd5e684673de7a15d9b3ede45b493faf12c
--- /dev/null
+++ b/colab_webui.ipynb
@@ -0,0 +1,3483 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "NPgmiqOJQMHQ"
+ },
+ "source": [
+ "# Build environment 环境配置"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "geSxiwseXpRf",
+ "outputId": "7d06a6d2-4e78-44a6-95fc-06b5f0b0230c"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mon Aug 19 18:58:47 2024 \n",
+ "+---------------------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n",
+ "|-----------------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|=========================================+======================+======================|\n",
+ "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
+ "| N/A 48C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n",
+ "| | | N/A |\n",
+ "+-----------------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+---------------------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=======================================================================================|\n",
+ "| No running processes found |\n",
+ "+---------------------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 查看当前使用GPU的CUDA版本\n",
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "A3ra_nCWQfXe"
+ },
+ "source": [
+ "# Git Clone 克隆代码"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1qucvAztXzom",
+ "outputId": "f43252c7-8cbc-43fb-ac3f-c059dd28c59f"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content\n",
+ "Cloning into 'Linly-Dubbing'...\n",
+ "remote: Enumerating objects: 1018, done.\u001b[K\n",
+ "remote: Counting objects: 100% (1018/1018), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (846/846), done.\u001b[K\n",
+ "remote: Total 1018 (delta 117), reused 991 (delta 112), pack-reused 0 (from 0)\u001b[K\n",
+ "Receiving objects: 100% (1018/1018), 39.74 MiB | 15.66 MiB/s, done.\n",
+ "Resolving deltas: 100% (117/117), done.\n",
+ "/content/Linly-Dubbing\n",
+ "Submodule 'CosyVoice' (https://github.com/FunAudioLLM/CosyVoice.git) registered for path 'CosyVoice'\n",
+ "Cloning into '/content/Linly-Dubbing/CosyVoice'...\n",
+ "Submodule path 'CosyVoice': checked out '6be8d0fc367f48c46a671edc16a99b9728038cb7'\n",
+ "Submodule 'third_party/Matcha-TTS' (https://github.com/shivammehta25/Matcha-TTS.git) registered for path 'CosyVoice/third_party/Matcha-TTS'\n",
+ "Cloning into '/content/Linly-Dubbing/CosyVoice/third_party/Matcha-TTS'...\n",
+ "Submodule path 'CosyVoice/third_party/Matcha-TTS': checked out 'dd9105b34bf2be2230f4aa1e4769fb586a3c824e'\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd /content/\n",
+ "\n",
+ "# 克隆项目代码到本地\n",
+ "!git clone https://github.com/Kedreamix/Linly-Dubbing.git --depth 1\n",
+ "\n",
+ "# 进入项目目录\n",
+ "%cd Linly-Dubbing\n",
+ "\n",
+ "# 初始化并更新子模块,如CosyVoice等\n",
+ "!git submodule update --init --recursive"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "B-nNsyEIQa7I",
+ "outputId": "fbf63667-f63d-4edb-dc6a-115bcbf870f3"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Name: torch\n",
+ "Version: 2.3.1+cu121\n",
+ "Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration\n",
+ "Home-page: https://pytorch.org/\n",
+ "Author: PyTorch Team\n",
+ "Author-email: packages@pytorch.org\n",
+ "License: BSD-3\n",
+ "Location: /usr/local/lib/python3.10/dist-packages\n",
+ "Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions\n",
+ "Required-by: accelerate, fastai, torchaudio, torchtext, torchvision\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 查看torch版本\n",
+ "!pip show torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NViDFNOMX67d",
+ "outputId": "f545b99e-e710-4591-e083-112c93207697"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Reading package lists... Done\n",
+ "Building dependency tree... Done\n",
+ "Reading state information... Done\n",
+ "ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
+ "0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.\n",
+ "ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers\n",
+ "built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)\n",
+ "configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n",
+ "libavutil 56. 70.100 / 56. 70.100\n",
+ "libavcodec 58.134.100 / 58.134.100\n",
+ "libavformat 58. 76.100 / 58. 76.100\n",
+ "libavdevice 58. 13.100 / 58. 13.100\n",
+ "libavfilter 7.110.100 / 7.110.100\n",
+ "libswscale 5. 9.100 / 5. 9.100\n",
+ "libswresample 3. 9.100 / 3. 9.100\n",
+ "libpostproc 55. 9.100 / 55. 9.100\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 安装ffmpeg\n",
+ "!apt-get install ffmpeg\n",
+ "!ffmpeg -version"
+ ]
+ },
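+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional check (not part of the original notebook): verify that ffmpeg is on\n",
+ "# PATH, since the later audio/video processing steps shell out to it.\n",
+ "import shutil\n",
+ "print(shutil.which('ffmpeg'))"
+ ]
+ },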
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "SmEIaKn1X1Hy",
+ "outputId": "e54ad3e7-0292-4a1d-ef50-eac827827c46"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting pynini==2.1.5\n",
+ " Downloading pynini-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)\n",
+ "Requirement already satisfied: Cython>=0.29 in /usr/local/lib/python3.10/dist-packages (from pynini==2.1.5) (3.0.11)\n",
+ "Downloading pynini-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m161.3/161.3 MB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: pynini\n",
+ "Successfully installed pynini-2.1.5\n",
+ "Collecting numpy==1.26.3 (from -r requirements.txt (line 8))\n",
+ " Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.2/61.2 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting transformers==4.39.3 (from -r requirements.txt (line 9))\n",
+ " Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting translators (from -r requirements.txt (line 10))\n",
+ " Downloading translators-5.9.2-py3-none-any.whl.metadata (68 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.8/68.8 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting gradio (from -r requirements.txt (line 12))\n",
+ " Downloading gradio-4.41.0-py3-none-any.whl.metadata (15 kB)\n",
+ "Collecting loguru (from -r requirements.txt (line 13))\n",
+ " Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)\n",
+ "Collecting yt-dlp (from -r requirements.txt (line 14))\n",
+ " Downloading yt_dlp-2024.8.6-py3-none-any.whl.metadata (170 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m170.1/170.1 kB\u001b[0m \u001b[31m382.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 15)) (1.13.1)\n",
+ "Collecting python-dotenv (from -r requirements.txt (line 16))\n",
+ " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
+ "Collecting openai (from -r requirements.txt (line 17))\n",
+ " Downloading openai-1.41.0-py3-none-any.whl.metadata (22 kB)\n",
+ "Collecting audiostretchy (from -r requirements.txt (line 18))\n",
+ " Downloading audiostretchy-1.3.5-py3-none-any.whl.metadata (8.8 kB)\n",
+ "Collecting modelscope (from -r requirements.txt (line 19))\n",
+ " Downloading modelscope-1.17.1-py3-none-any.whl.metadata (40 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.6/40.6 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting funasr (from -r requirements.txt (line 24))\n",
+ " Downloading funasr-1.1.5-py3-none-any.whl.metadata (30 kB)\n",
+ "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 29)) (0.32.1)\n",
+ "Collecting HyperPyYAML==1.2.2 (from -r requirements.txt (line 32))\n",
+ " Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)\n",
+ "Collecting librosa==0.10.2 (from -r requirements.txt (line 33))\n",
+ " Downloading librosa-0.10.2-py3-none-any.whl.metadata (8.6 kB)\n",
+ "Collecting WeTextProcessing==1.0.3 (from -r requirements.txt (line 34))\n",
+ " Downloading WeTextProcessing-1.0.3-py3-none-any.whl.metadata (7.2 kB)\n",
+ "Collecting wget==3.2 (from -r requirements.txt (line 35))\n",
+ " Downloading wget-3.2.zip (10 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting diffusers==0.27.2 (from -r requirements.txt (line 38))\n",
+ " Downloading diffusers-0.27.2-py3-none-any.whl.metadata (18 kB)\n",
+ "Requirement already satisfied: gdown==5.1.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 39)) (5.1.0)\n",
+ "Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 40)) (14.0.2)\n",
+ "Collecting conformer==0.3.2 (from -r requirements.txt (line 41))\n",
+ " Downloading conformer-0.3.2-py3-none-any.whl.metadata (631 bytes)\n",
+ "Collecting lightning==2.2.4 (from -r requirements.txt (line 42))\n",
+ " Downloading lightning-2.2.4-py3-none-any.whl.metadata (53 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.4/53.4 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (3.15.4)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (0.23.5)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (6.0.2)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (2024.5.15)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (2.32.3)\n",
+ "Collecting tokenizers<0.19,>=0.14 (from transformers==4.39.3->-r requirements.txt (line 9))\n",
+ " Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (0.4.4)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.39.3->-r requirements.txt (line 9)) (4.66.5)\n",
+ "Collecting ruamel.yaml>=0.17.28 (from HyperPyYAML==1.2.2->-r requirements.txt (line 32))\n",
+ " Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)\n",
+ "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (3.0.1)\n",
+ "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (1.3.2)\n",
+ "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (1.4.2)\n",
+ "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (4.4.2)\n",
+ "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (0.60.0)\n",
+ "Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (0.12.1)\n",
+ "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (1.8.2)\n",
+ "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (0.4.0)\n",
+ "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (4.12.2)\n",
+ "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (0.4)\n",
+ "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2->-r requirements.txt (line 33)) (1.0.8)\n",
+ "Requirement already satisfied: pynini==2.1.5 in /usr/local/lib/python3.10/dist-packages (from WeTextProcessing==1.0.3->-r requirements.txt (line 34)) (2.1.5)\n",
+ "Requirement already satisfied: importlib-resources in /usr/local/lib/python3.10/dist-packages (from WeTextProcessing==1.0.3->-r requirements.txt (line 34)) (6.4.2)\n",
+ "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.10/dist-packages (from diffusers==0.27.2->-r requirements.txt (line 38)) (8.2.0)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from diffusers==0.27.2->-r requirements.txt (line 38)) (9.4.0)\n",
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown==5.1.0->-r requirements.txt (line 39)) (4.12.3)\n",
+ "Requirement already satisfied: einops>=0.6.1 in /usr/local/lib/python3.10/dist-packages (from conformer==0.3.2->-r requirements.txt (line 41)) (0.8.0)\n",
+ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from conformer==0.3.2->-r requirements.txt (line 41)) (2.3.1+cu121)\n",
+ "Requirement already satisfied: fsspec<2025.0,>=2022.5.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (2024.6.1)\n",
+ "Collecting lightning-utilities<2.0,>=0.8.0 (from lightning==2.2.4->-r requirements.txt (line 42))\n",
+ " Downloading lightning_utilities-0.11.6-py3-none-any.whl.metadata (5.2 kB)\n",
+ "Collecting torchmetrics<3.0,>=0.7.0 (from lightning==2.2.4->-r requirements.txt (line 42))\n",
+ " Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)\n",
+ "Collecting pytorch-lightning (from lightning==2.2.4->-r requirements.txt (line 42))\n",
+ " Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)\n",
+ "Requirement already satisfied: Cython>=0.29 in /usr/local/lib/python3.10/dist-packages (from pynini==2.1.5->WeTextProcessing==1.0.3->-r requirements.txt (line 34)) (3.0.11)\n",
+ "Collecting PyExecJS>=1.5.1 (from translators->-r requirements.txt (line 10))\n",
+ " Downloading PyExecJS-1.5.1.tar.gz (13 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: lxml>=4.9.1 in /usr/local/lib/python3.10/dist-packages (from translators->-r requirements.txt (line 10)) (4.9.4)\n",
+ "Collecting pathos>=0.2.9 (from translators->-r requirements.txt (line 10))\n",
+ " Downloading pathos-0.3.2-py3-none-any.whl.metadata (11 kB)\n",
+ "Requirement already satisfied: cryptography>=42.0.4 in /usr/local/lib/python3.10/dist-packages (from translators->-r requirements.txt (line 10)) (42.0.8)\n",
+ "Collecting aiofiles<24.0,>=22.0 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
+ "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (3.7.1)\n",
+ "Collecting fastapi (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading fastapi-0.112.1-py3-none-any.whl.metadata (27 kB)\n",
+ "Collecting ffmpy (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)\n",
+ "Collecting gradio-client==1.3.0 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)\n",
+ "Collecting httpx>=0.24.1 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)\n",
+ "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (3.1.4)\n",
+ "Requirement already satisfied: markupsafe~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (2.1.5)\n",
+ "Requirement already satisfied: matplotlib~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (3.7.1)\n",
+ "Collecting orjson~=3.0 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.4/50.4 kB\u001b[0m \u001b[31m193.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (2.1.4)\n",
+ "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (2.8.2)\n",
+ "Collecting pydub (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
+ "Collecting python-multipart>=0.0.9 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)\n",
+ "Collecting ruff>=0.2.2 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading ruff-0.6.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
+ "Collecting semantic-version~=2.0 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)\n",
+ "Collecting tomlkit==0.12.0 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)\n",
+ "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (0.12.3)\n",
+ "Requirement already satisfied: urllib3~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio->-r requirements.txt (line 12)) (2.0.7)\n",
+ "Collecting uvicorn>=0.14.0 (from gradio->-r requirements.txt (line 12))\n",
+ " Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)\n",
+ "Collecting websockets<13.0,>=10.0 (from gradio-client==1.3.0->gradio->-r requirements.txt (line 12))\n",
+ " Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+ "Collecting brotli (from yt-dlp->-r requirements.txt (line 14))\n",
+ " Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)\n",
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from yt-dlp->-r requirements.txt (line 14)) (2024.7.4)\n",
+ "Collecting mutagen (from yt-dlp->-r requirements.txt (line 14))\n",
+ " Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)\n",
+ "Collecting pycryptodomex (from yt-dlp->-r requirements.txt (line 14))\n",
+ " Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
+ "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai->-r requirements.txt (line 17)) (1.7.0)\n",
+ "Collecting jiter<1,>=0.4.0 (from openai->-r requirements.txt (line 17))\n",
+ " Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n",
+ "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai->-r requirements.txt (line 17)) (1.3.1)\n",
+ "Collecting fire>=0.5.0 (from audiostretchy->-r requirements.txt (line 18))\n",
+ " Downloading fire-0.6.0.tar.gz (88 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.4/88.4 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting jamo (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)\n",
+ "Collecting kaldiio>=2.17.0 (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading kaldiio-2.18.0-py3-none-any.whl.metadata (13 kB)\n",
+ "Collecting torch-complex (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading torch_complex-0.4.4-py3-none-any.whl.metadata (3.1 kB)\n",
+ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from funasr->-r requirements.txt (line 24)) (0.1.99)\n",
+ "Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from funasr->-r requirements.txt (line 24)) (0.42.1)\n",
+ "Collecting pytorch-wpe (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading pytorch_wpe-0.0.1-py3-none-any.whl.metadata (242 bytes)\n",
+ "Requirement already satisfied: editdistance>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from funasr->-r requirements.txt (line 24)) (0.8.1)\n",
+ "Collecting oss2 (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading oss2-2.18.6.tar.gz (283 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.8/283.8 kB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting umap-learn (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)\n",
+ "Collecting jaconv (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading jaconv-0.4.0.tar.gz (17 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting hydra-core>=1.3.2 (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)\n",
+ "Collecting tensorboardX (from funasr->-r requirements.txt (line 24))\n",
+ " Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate->-r requirements.txt (line 29)) (5.9.5)\n",
+ "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio->-r requirements.txt (line 12)) (3.7)\n",
+ "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio->-r requirements.txt (line 12)) (1.2.2)\n",
+ "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=42.0.4->translators->-r requirements.txt (line 10)) (1.17.0)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from fire>=0.5.0->audiostretchy->-r requirements.txt (line 18)) (1.16.0)\n",
+ "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from fire>=0.5.0->audiostretchy->-r requirements.txt (line 18)) (2.4.0)\n",
+ "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (3.10.3)\n",
+ "Collecting httpcore==1.* (from httpx>=0.24.1->gradio->-r requirements.txt (line 12))\n",
+ " Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)\n",
+ "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.24.1->gradio->-r requirements.txt (line 12))\n",
+ " Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n",
+ "Collecting omegaconf<2.4,>=2.2 (from hydra-core>=1.3.2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)\n",
+ "Collecting antlr4-python3-runtime==4.9.* (from hydra-core>=1.3.2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from lightning-utilities<2.0,>=0.8.0->lightning==2.2.4->-r requirements.txt (line 42)) (71.0.4)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio->-r requirements.txt (line 12)) (1.2.1)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio->-r requirements.txt (line 12)) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio->-r requirements.txt (line 12)) (4.53.1)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio->-r requirements.txt (line 12)) (1.4.5)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio->-r requirements.txt (line 12)) (3.1.2)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio->-r requirements.txt (line 12)) (2.8.2)\n",
+ "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa==0.10.2->-r requirements.txt (line 33)) (0.43.0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio->-r requirements.txt (line 12)) (2024.1)\n",
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio->-r requirements.txt (line 12)) (2024.1)\n",
+ "Collecting ppft>=1.7.6.8 (from pathos>=0.2.9->translators->-r requirements.txt (line 10))\n",
+ " Downloading ppft-1.7.6.8-py3-none-any.whl.metadata (12 kB)\n",
+ "Collecting dill>=0.3.8 (from pathos>=0.2.9->translators->-r requirements.txt (line 10))\n",
+ " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
+ "Collecting pox>=0.3.4 (from pathos>=0.2.9->translators->-r requirements.txt (line 10))\n",
+ " Downloading pox-0.3.4-py3-none-any.whl.metadata (8.0 kB)\n",
+ "Collecting multiprocess>=0.70.16 (from pathos>=0.2.9->translators->-r requirements.txt (line 10))\n",
+ " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
+ "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa==0.10.2->-r requirements.txt (line 33)) (4.2.2)\n",
+ "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio->-r requirements.txt (line 12)) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.20.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio->-r requirements.txt (line 12)) (2.20.1)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.39.3->-r requirements.txt (line 9)) (3.3.2)\n",
+ "Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->HyperPyYAML==1.2.2->-r requirements.txt (line 32))\n",
+ " Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (2.2 kB)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa==0.10.2->-r requirements.txt (line 33)) (3.5.0)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->conformer==0.3.2->-r requirements.txt (line 41)) (1.13.2)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->conformer==0.3.2->-r requirements.txt (line 41)) (3.3)\n",
+ "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
+ "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
+ "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
+ "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
+ "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
+ "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
+ "Collecting nvidia-curand-cu12==10.3.2.106 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
+ "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
+ "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n",
+ "Collecting nvidia-nccl-cu12==2.20.5 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n",
+ "Collecting nvidia-nvtx-cu12==12.1.105 (from torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n",
+ "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->conformer==0.3.2->-r requirements.txt (line 41)) (2.3.1)\n",
+ "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch->conformer==0.3.2->-r requirements.txt (line 41))\n",
+ " Using cached nvidia_nvjitlink_cu12-12.6.20-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+ "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio->-r requirements.txt (line 12)) (8.1.7)\n",
+ "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio->-r requirements.txt (line 12)) (1.5.4)\n",
+ "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio->-r requirements.txt (line 12)) (13.7.1)\n",
+ "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown==5.1.0->-r requirements.txt (line 39)) (2.6)\n",
+ "Collecting starlette<0.39.0,>=0.37.2 (from fastapi->gradio->-r requirements.txt (line 12))\n",
+ " Downloading starlette-0.38.2-py3-none-any.whl.metadata (5.9 kB)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata->diffusers==0.27.2->-r requirements.txt (line 38)) (3.20.0)\n",
+ "Collecting crcmod>=1.7 (from oss2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading crcmod-1.7.tar.gz (89 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.7/89.7 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting pycryptodome>=3.4.7 (from oss2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
+ "Collecting aliyun-python-sdk-kms>=2.4.1 (from oss2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading aliyun_python_sdk_kms-2.16.4-py2.py3-none-any.whl.metadata (1.5 kB)\n",
+ "Collecting aliyun-python-sdk-core>=2.13.12 (from oss2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading aliyun-python-sdk-core-2.15.1.tar.gz (443 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m443.1/443.1 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==5.1.0->-r requirements.txt (line 39)) (1.7.1)\n",
+ "Requirement already satisfied: protobuf>=3.20 in /usr/local/lib/python3.10/dist-packages (from tensorboardX->funasr->-r requirements.txt (line 24)) (3.20.3)\n",
+ "Collecting pynndescent>=0.5 (from umap-learn->funasr->-r requirements.txt (line 24))\n",
+ " Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (2.3.5)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (6.0.5)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (1.9.4)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2025.0,>=2022.5.0->lightning==2.2.4->-r requirements.txt (line 42)) (4.0.3)\n",
+ "Collecting jmespath<1.0.0,>=0.9.3 (from aliyun-python-sdk-core>=2.13.12->oss2->funasr->-r requirements.txt (line 24))\n",
+ " Downloading jmespath-0.10.0-py2.py3-none-any.whl.metadata (8.0 kB)\n",
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=42.0.4->translators->-r requirements.txt (line 10)) (2.22)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio->-r requirements.txt (line 12)) (3.0.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio->-r requirements.txt (line 12)) (2.16.1)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->conformer==0.3.2->-r requirements.txt (line 41)) (1.3.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio->-r requirements.txt (line 12)) (0.1.2)\n",
+ "Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.2/18.2 MB\u001b[0m \u001b[31m78.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading transformers-4.39.3-py3-none-any.whl (8.8 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m73.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)\n",
+ "Downloading librosa-0.10.2-py3-none-any.whl (260 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m260.0/260.0 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading WeTextProcessing-1.0.3-py3-none-any.whl (2.0 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading diffusers-0.27.2-py3-none-any.whl (2.0 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m78.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading conformer-0.3.2-py3-none-any.whl (4.3 kB)\n",
+ "Downloading lightning-2.2.4-py3-none-any.whl (2.0 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m74.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading translators-5.9.2-py3-none-any.whl (56 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.4/56.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading gradio-4.41.0-py3-none-any.whl (12.6 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.6/12.6 MB\u001b[0m \u001b[31m91.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading gradio_client-1.3.0-py3-none-any.whl (318 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.7/318.7 kB\u001b[0m \u001b[31m27.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading tomlkit-0.12.0-py3-none-any.whl (37 kB)\n",
+ "Downloading loguru-0.7.2-py3-none-any.whl (62 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.5/62.5 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading yt_dlp-2024.8.6-py3-none-any.whl (3.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m82.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
+ "Downloading openai-1.41.0-py3-none-any.whl (362 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m362.4/362.4 kB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading audiostretchy-1.3.5-py3-none-any.whl (105 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.5/105.5 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading modelscope-1.17.1-py3-none-any.whl (5.7 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.7/5.7 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading funasr-1.1.5-py3-none-any.whl (649 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m649.0/649.0 kB\u001b[0m \u001b[31m43.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
+ "Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading hydra_core-1.3.2-py3-none-any.whl (154 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.5/154.5 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (318 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.9/318.9 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading kaldiio-2.18.0-py3-none-any.whl (28 kB)\n",
+ "Downloading lightning_utilities-0.11.6-py3-none-any.whl (26 kB)\n",
+ "Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (141 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m141.9/141.9 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pathos-0.3.2-py3-none-any.whl (82 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.1/82.1 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading python_multipart-0.0.9-py3-none-any.whl (22 kB)\n",
+ "Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.8/117.8 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading ruff-0.6.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.2/10.2 MB\u001b[0m \u001b[31m96.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
+ "Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m89.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hUsing cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
+ "Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
+ "Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
+ "Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
+ "Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n",
+ "Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
+ "Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
+ "Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
+ "Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
+ "Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n",
+ "Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
+ "Downloading torchmetrics-1.4.1-py3-none-any.whl (866 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m866.2/866.2 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading uvicorn-0.30.6-py3-none-any.whl (62 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.8/62.8 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading fastapi-0.112.1-py3-none-any.whl (93 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.2/93.2 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading ffmpy-0.4.0-py3-none-any.whl (5.8 kB)\n",
+ "Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)\n",
+ "Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.4/194.4 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m43.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
+ "Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m815.2/815.2 kB\u001b[0m \u001b[31m28.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pytorch_wpe-0.0.1-py3-none-any.whl (8.1 kB)\n",
+ "Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.7/101.7 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading torch_complex-0.4.4-py3-none-any.whl (9.1 kB)\n",
+ "Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.7/85.7 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading aliyun_python_sdk_kms-2.16.4-py2.py3-none-any.whl (98 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.4/98.4 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pox-0.3.4-py3-none-any.whl (29 kB)\n",
+ "Downloading ppft-1.7.6.8-py3-none-any.whl (56 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.8/56.8 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m50.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.9/56.9 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m526.7/526.7 kB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading starlette-0.38.2-py3-none-any.whl (72 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n",
+ "Using cached nvidia_nvjitlink_cu12-12.6.20-py3-none-manylinux2014_x86_64.whl (19.7 MB)\n",
+ "Building wheels for collected packages: wget, fire, antlr4-python3-runtime, PyExecJS, jaconv, oss2, aliyun-python-sdk-core, crcmod\n",
+ " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=5045e4fb8797a415e1a604b151af0684cbea317c19085ff0b5b25a446666c710\n",
+ " Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769\n",
+ " Building wheel for fire (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for fire: filename=fire-0.6.0-py2.py3-none-any.whl size=117030 sha256=4f94cad48021bb1236c0589dff5b0a93e77ebdc51c595f0dbc255db1409506d1\n",
+ " Stored in directory: /root/.cache/pip/wheels/d6/6d/5d/5b73fa0f46d01a793713f8859201361e9e581ced8c75e5c6a3\n",
+ " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=a0618844504294e40cd05366d1bbabe8eb9a670970c18f2afc361e4ea2317377\n",
+ " Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n",
+ " Building wheel for PyExecJS (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for PyExecJS: filename=PyExecJS-1.5.1-py3-none-any.whl size=14576 sha256=d6357e26fc289436ca1e65d2e032a041e4699d5d86dab72150799d4e574eb9dc\n",
+ " Stored in directory: /root/.cache/pip/wheels/9d/91/30/28e6da53d4f44dc445349b2ffad581968447e4cbc9dd7991b8\n",
+ " Building wheel for jaconv (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for jaconv: filename=jaconv-0.4.0-py3-none-any.whl size=18228 sha256=4e7ef68322311dd5be52280657dade8620766ef47644800952edac4ff01351c2\n",
+ " Stored in directory: /root/.cache/pip/wheels/20/95/99/94e8d7545125181756857f6b1fc085ed4e0811ad9be7321af7\n",
+ " Building wheel for oss2 (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for oss2: filename=oss2-2.18.6-py3-none-any.whl size=118355 sha256=08e18c1e51601bba1bd822253c0999f7988c4be253bb28305fad629ba41ca567\n",
+ " Stored in directory: /root/.cache/pip/wheels/e9/1c/df/6256a3d22097f6e1a30edd892de172054fd27875e0a349b4a4\n",
+ " Building wheel for aliyun-python-sdk-core (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for aliyun-python-sdk-core: filename=aliyun_python_sdk_core-2.15.1-py3-none-any.whl size=535322 sha256=ba8d4940debd82315d0272c51ad1b741872e0180e306cf8bb0dadcf5ec48414b\n",
+ " Stored in directory: /root/.cache/pip/wheels/69/4b/8e/0a28e00f4cf43b273c18cce083804738d41013e017da922ce4\n",
+ " Building wheel for crcmod (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for crcmod: filename=crcmod-1.7-cp310-cp310-linux_x86_64.whl size=31405 sha256=bbb62de93186408bb4bc948333361e34795606ab74e601b7ac460ee0c89d68e2\n",
+ " Stored in directory: /root/.cache/pip/wheels/85/4c/07/72215c529bd59d67e3dac29711d7aba1b692f543c808ba9e86\n",
+ "Successfully built wget fire antlr4-python3-runtime PyExecJS jaconv oss2 aliyun-python-sdk-core crcmod\n",
+ "Installing collected packages: wget, pydub, jamo, jaconv, crcmod, brotli, antlr4-python3-runtime, websockets, tomlkit, semantic-version, ruff, ruamel.yaml.clib, python-multipart, python-dotenv, PyExecJS, pycryptodomex, pycryptodome, ppft, pox, orjson, omegaconf, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, mutagen, loguru, lightning-utilities, jmespath, jiter, h11, fire, ffmpy, dill, aiofiles, yt-dlp, WeTextProcessing, uvicorn, torch-complex, tensorboardX, starlette, ruamel.yaml, pytorch-wpe, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, modelscope, kaldiio, hydra-core, httpcore, audiostretchy, tokenizers, pathos, nvidia-cusolver-cu12, HyperPyYAML, httpx, fastapi, diffusers, aliyun-python-sdk-core, translators, transformers, pynndescent, openai, librosa, gradio-client, aliyun-python-sdk-kms, umap-learn, torchmetrics, oss2, gradio, conformer, pytorch-lightning, funasr, lightning\n",
+ " Attempting uninstall: tomlkit\n",
+ " Found existing installation: tomlkit 0.13.2\n",
+ " Uninstalling tomlkit-0.13.2:\n",
+ " Successfully uninstalled tomlkit-0.13.2\n",
+ " Attempting uninstall: numpy\n",
+ " Found existing installation: numpy 1.26.4\n",
+ " Uninstalling numpy-1.26.4:\n",
+ " Successfully uninstalled numpy-1.26.4\n",
+ " Attempting uninstall: tokenizers\n",
+ " Found existing installation: tokenizers 0.19.1\n",
+ " Uninstalling tokenizers-0.19.1:\n",
+ " Successfully uninstalled tokenizers-0.19.1\n",
+ " Attempting uninstall: transformers\n",
+ " Found existing installation: transformers 4.42.4\n",
+ " Uninstalling transformers-4.42.4:\n",
+ " Successfully uninstalled transformers-4.42.4\n",
+ " Attempting uninstall: librosa\n",
+ " Found existing installation: librosa 0.10.2.post1\n",
+ " Uninstalling librosa-0.10.2.post1:\n",
+ " Successfully uninstalled librosa-0.10.2.post1\n",
+ "Successfully installed HyperPyYAML-1.2.2 PyExecJS-1.5.1 WeTextProcessing-1.0.3 aiofiles-23.2.1 aliyun-python-sdk-core-2.15.1 aliyun-python-sdk-kms-2.16.4 antlr4-python3-runtime-4.9.3 audiostretchy-1.3.5 brotli-1.1.0 conformer-0.3.2 crcmod-1.7 diffusers-0.27.2 dill-0.3.8 fastapi-0.112.1 ffmpy-0.4.0 fire-0.6.0 funasr-1.1.5 gradio-4.41.0 gradio-client-1.3.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 hydra-core-1.3.2 jaconv-0.4.0 jamo-0.4.1 jiter-0.5.0 jmespath-0.10.0 kaldiio-2.18.0 librosa-0.10.2 lightning-2.2.4 lightning-utilities-0.11.6 loguru-0.7.2 modelscope-1.17.1 multiprocess-0.70.16 mutagen-1.47.0 numpy-1.26.3 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.6.20 nvidia-nvtx-cu12-12.1.105 omegaconf-2.3.0 openai-1.41.0 orjson-3.10.7 oss2-2.18.6 pathos-0.3.2 pox-0.3.4 ppft-1.7.6.8 pycryptodome-3.20.0 pycryptodomex-3.20.0 pydub-0.25.1 pynndescent-0.5.13 python-dotenv-1.0.1 python-multipart-0.0.9 pytorch-lightning-2.4.0 pytorch-wpe-0.0.1 ruamel.yaml-0.18.6 ruamel.yaml.clib-0.2.8 ruff-0.6.1 semantic-version-2.10.0 starlette-0.38.2 tensorboardX-2.6.2.2 tokenizers-0.15.2 tomlkit-0.12.0 torch-complex-0.4.4 torchmetrics-1.4.1 transformers-4.39.3 translators-5.9.2 umap-learn-0.5.6 uvicorn-0.30.6 websockets-12.0 wget-3.2 yt-dlp-2024.8.6\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "application/vnd.colab-display-data+json": {
+ "pip_warning": {
+ "packages": [
+ "pydevd_plugins"
+ ]
+ },
+ "id": "fa4c4da3a6364a2cb6b2b480d11a2a32"
+ }
+ },
+ "metadata": {}
+ }
+ ],
+ "source": [
+ "# 安装项目所需的Python包\n",
+ "# pynini is required by WeTextProcessing, use conda to install it as it can be executed on all platform.\n",
+ "# !conda install -y pynini==2.1.5 -c conda-forge\n",
+ "# -c https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/\n",
+ "\n",
+ "!pip install pynini==2.1.5\n",
+ "\n",
+ "!pip install -r requirements.txt"
+ ]
+ },
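+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional check (not part of the original notebook): the requirements pin\n",
+ "# specific versions (e.g. transformers 4.39.3, numpy 1.26.3), so confirm what\n",
+ "# pip actually resolved before moving on.\n",
+ "!pip show transformers numpy | grep -E 'Name|Version'"
+ ]
+ },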
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vGgQQjJlRcK8",
+ "outputId": "b7f21d0d-6fc4-4542-d809-77cac030a0ae"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Processing ./submodules/demucs\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Processing ./submodules/whisper\n",
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ "Processing ./submodules/whisperX\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Processing ./submodules/TTS\n",
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting dora-search (from demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading dora_search-0.1.12.tar.gz (87 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.1/87.1 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: einops in /usr/local/lib/python3.10/dist-packages (from demucs==4.1.0a2->-r requirements_module.txt (line 1)) (0.8.0)\n",
+ "Collecting julius>=0.2.3 (from demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading julius-0.2.7.tar.gz (59 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.6/59.6 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting lameenc>=1.2 (from demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading lameenc-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (803 bytes)\n",
+ "Collecting openunmix (from demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading openunmix-1.3.0-py3-none-any.whl.metadata (17 kB)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from demucs==4.1.0a2->-r requirements_module.txt (line 1)) (6.0.2)\n",
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from demucs==4.1.0a2->-r requirements_module.txt (line 1)) (4.66.5)\n",
+ "Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20231117->-r requirements_module.txt (line 2)) (0.60.0)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20231117->-r requirements_module.txt (line 2)) (1.26.3)\n",
+ "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20231117->-r requirements_module.txt (line 2)) (10.3.0)\n",
+ "Collecting tiktoken (from openai-whisper==20231117->-r requirements_module.txt (line 2))\n",
+ " Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+ "Requirement already satisfied: triton<3,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20231117->-r requirements_module.txt (line 2)) (2.3.1)\n",
+ "Collecting faster-whisper==1.0.0 (from whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading faster_whisper-1.0.0-py3-none-any.whl.metadata (14 kB)\n",
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1->-r requirements_module.txt (line 3)) (4.39.3)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.1.4)\n",
+ "Requirement already satisfied: setuptools>=65 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1->-r requirements_module.txt (line 3)) (71.0.4)\n",
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1->-r requirements_module.txt (line 3)) (3.8.1)\n",
+ "Collecting pyannote.audio==3.1.1 (from whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl.metadata (9.3 kB)\n",
+ "Collecting av==11.* (from faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)\n",
+ "Collecting ctranslate2<5,>=4.0 (from faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n",
+ "Requirement already satisfied: huggingface-hub>=0.13 in /usr/local/lib/python3.10/dist-packages (from faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.23.5)\n",
+ "Requirement already satisfied: tokenizers<0.16,>=0.13 in /usr/local/lib/python3.10/dist-packages (from faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.15.2)\n",
+ "Collecting onnxruntime<2,>=1.14 (from faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading onnxruntime-1.19.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)\n",
+ "Collecting asteroid-filterbanks>=0.4 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)\n",
+ "Requirement already satisfied: lightning>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.2.4)\n",
+ "Requirement already satisfied: omegaconf<3.0,>=2.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.3.0)\n",
+ "Collecting pyannote.core>=5.0.0 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)\n",
+ "Collecting pyannote.database>=5.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pyannote.database-5.1.0-py3-none-any.whl.metadata (1.2 kB)\n",
+ "Collecting pyannote.metrics>=3.2 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)\n",
+ "Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)\n",
+ "Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl.metadata (17 kB)\n",
+ "Requirement already satisfied: rich>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (13.7.1)\n",
+ "Collecting semver>=3.0.0 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading semver-3.0.2-py3-none-any.whl.metadata (5.0 kB)\n",
+ "Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.12.1)\n",
+ "Collecting speechbrain>=0.5.14 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading speechbrain-1.0.0-py3-none-any.whl.metadata (23 kB)\n",
+ "Requirement already satisfied: tensorboardX>=2.6 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.6.2.2)\n",
+ "Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.3.1+cu121)\n",
+ "Collecting torch-audiomentations>=0.11.0 (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading torch_audiomentations-0.11.1-py3-none-any.whl.metadata (14 kB)\n",
+ "Requirement already satisfied: torchaudio>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.3.1+cu121)\n",
+ "Requirement already satisfied: torchmetrics>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (1.4.1)\n",
+ "Requirement already satisfied: cython>=0.29.30 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (3.0.11)\n",
+ "Requirement already satisfied: scipy>=1.11.2 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (1.13.1)\n",
+ "Requirement already satisfied: librosa>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (0.10.2)\n",
+ "Requirement already satisfied: scikit-learn>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (1.3.2)\n",
+ "Requirement already satisfied: inflect>=5.6.0 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (7.3.1)\n",
+ "Collecting anyascii>=0.3.0 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)\n",
+ "Requirement already satisfied: fsspec>=2023.6.0 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (2024.6.1)\n",
+ "Requirement already satisfied: aiohttp>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (3.10.3)\n",
+ "Requirement already satisfied: packaging>=23.1 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (24.1)\n",
+ "Requirement already satisfied: mutagen==1.47.0 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (1.47.0)\n",
+ "Requirement already satisfied: flask>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (2.2.5)\n",
+ "Collecting pysbd>=0.3.4 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)\n",
+ "Requirement already satisfied: umap-learn>=0.5.1 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (0.5.6)\n",
+ "Collecting pandas (from whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
+ "Requirement already satisfied: matplotlib>=3.7.0 in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (3.7.1)\n",
+ "Collecting trainer>=0.0.36 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)\n",
+ "Collecting coqpit>=0.0.16 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)\n",
+ "Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (0.42.1)\n",
+ "Collecting pypinyin (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading pypinyin-0.52.0-py2.py3-none-any.whl.metadata (12 kB)\n",
+ "Collecting hangul-romanize (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)\n",
+ "Collecting gruut==2.2.3 (from gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading gruut-2.2.3.tar.gz (73 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.5/73.5 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: jamo in /usr/local/lib/python3.10/dist-packages (from TTS==0.22.0->-r requirements_module.txt (line 4)) (0.4.1)\n",
+ "Collecting g2pkk>=0.1.1 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading g2pkk-0.1.2-py3-none-any.whl.metadata (2.0 kB)\n",
+ "Collecting bangla (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading bangla-0.0.2-py2.py3-none-any.whl.metadata (4.5 kB)\n",
+ "Collecting bnnumerizer (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading bnnumerizer-0.0.2.tar.gz (4.7 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting bnunicodenormalizer (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading bnunicodenormalizer-0.1.7-py3-none-any.whl.metadata (22 kB)\n",
+ "Collecting encodec>=0.1.1 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading encodec-0.1.1.tar.gz (3.7 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting unidecode>=1.3.2 (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)\n",
+ "Collecting num2words (from TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)\n",
+ "Requirement already satisfied: spacy>=3 in /usr/local/lib/python3.10/dist-packages (from spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.7.5)\n",
+ "Requirement already satisfied: Babel<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.16.0)\n",
+ "Collecting dateparser~=1.1.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading dateparser-1.1.8-py2.py3-none-any.whl.metadata (27 kB)\n",
+ "Collecting gruut-ipa<1.0,>=0.12.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading gruut-ipa-0.13.0.tar.gz (101 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.6/101.6 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting gruut_lang_en~=2.0.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading gruut_lang_en-2.0.1.tar.gz (15.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.3/15.3 MB\u001b[0m \u001b[31m96.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting jsonlines~=1.2.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading jsonlines-1.2.0-py2.py3-none-any.whl.metadata (1.3 kB)\n",
+ "Collecting networkx<3.0.0,>=2.5.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)\n",
+ "Collecting python-crfsuite~=0.9.7 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)\n",
+ "Collecting gruut_lang_de~=2.0.0 (from gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading gruut_lang_de-2.0.1.tar.gz (18.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.1/18.1 MB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting gruut_lang_fr~=2.0.0 (from gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading gruut_lang_fr-2.0.2.tar.gz (10.9 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.9/10.9 MB\u001b[0m \u001b[31m111.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting gruut_lang_es~=2.0.0 (from gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading gruut_lang_es-2.0.1.tar.gz (31.4 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m31.4/31.4 MB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.3.5)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.4.1)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (6.0.5)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.9.4)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (4.0.3)\n",
+ "Requirement already satisfied: Werkzeug>=2.2.2 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.0.3)\n",
+ "Requirement already satisfied: Jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.1.4)\n",
+ "Requirement already satisfied: itsdangerous>=2.0 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.2.0)\n",
+ "Requirement already satisfied: click>=8.0 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (8.1.7)\n",
+ "Requirement already satisfied: typeguard>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from inflect>=5.6.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (4.3.0)\n",
+ "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.0.1)\n",
+ "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.4.2)\n",
+ "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (4.4.2)\n",
+ "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.8.2)\n",
+ "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.4.0)\n",
+ "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (4.12.2)\n",
+ "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.4)\n",
+ "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.0.8)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.2.1)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (4.53.1)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.4.5)\n",
+ "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (9.4.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.1.2)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.8.2)\n",
+ "Collecting docopt>=0.6.2 (from num2words->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading docopt-0.6.2.tar.gz (25 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->openai-whisper==20231117->-r requirements_module.txt (line 2)) (0.43.0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2024.1)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.3.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.5.0)\n",
+ "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (1.17.0)\n",
+ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.0.12)\n",
+ "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.0.5)\n",
+ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.0.10)\n",
+ "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.0.8)\n",
+ "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.0.9)\n",
+ "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (8.2.5)\n",
+ "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.1.3)\n",
+ "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.4.8)\n",
+ "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.0.10)\n",
+ "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.4.1)\n",
+ "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.12.3)\n",
+ "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.32.3)\n",
+ "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.8.2)\n",
+ "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.4.0)\n",
+ "Collecting sudachipy!=0.6.1,>=0.5.2 (from spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading SudachiPy-0.6.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+ "Collecting sudachidict-core>=20211220 (from spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4))\n",
+ " Downloading SudachiDict_core-20240716-py3-none-any.whl.metadata (2.5 kB)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from trainer>=0.0.36->TTS==0.22.0->-r requirements_module.txt (line 4)) (5.9.5)\n",
+ "Requirement already satisfied: tensorboard in /usr/local/lib/python3.10/dist-packages (from trainer>=0.0.36->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.17.0)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1->-r requirements_module.txt (line 3)) (3.15.4)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2024.5.15)\n",
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.4.4)\n",
+ "Requirement already satisfied: pynndescent>=0.5 in /usr/local/lib/python3.10/dist-packages (from umap-learn>=0.5.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.5.13)\n",
+ "Collecting retrying (from dora-search->demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)\n",
+ "Collecting submitit (from dora-search->demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading submitit-1.5.1-py3-none-any.whl.metadata (8.0 kB)\n",
+ "Collecting treetable (from dora-search->demucs==4.1.0a2->-r requirements_module.txt (line 1))\n",
+ " Downloading treetable-0.2.5.tar.gz (10 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.22)\n",
+ "Requirement already satisfied: tzlocal in /usr/local/lib/python3.10/dist-packages (from dateparser~=1.1.0->gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4)) (5.2)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from Jinja2>=3.0->flask>=2.0.1->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.1.5)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from jsonlines~=1.2.0->gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.16.0)\n",
+ "Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.10/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.2.0)\n",
+ "Requirement already satisfied: lightning-utilities<2.0,>=0.8.0 in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.11.6)\n",
+ "Requirement already satisfied: pytorch-lightning in /usr/local/lib/python3.10/dist-packages (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.4.0)\n",
+ "Requirement already satisfied: antlr4-python3-runtime==4.9.* in /usr/local/lib/python3.10/dist-packages (from omegaconf<3.0,>=2.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (4.9.3)\n",
+ "Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n",
+ "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3)) (24.3.25)\n",
+ "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3)) (3.20.3)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3)) (1.13.2)\n",
+ "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa>=0.10.0->TTS==0.22.0->-r requirements_module.txt (line 4)) (4.2.2)\n",
+ "Requirement already satisfied: sortedcontainers>=2.0.4 in /usr/local/lib/python3.10/dist-packages (from pyannote.core>=5.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.4.0)\n",
+ "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.9.0)\n",
+ "Collecting optuna>=3.1 (from pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)\n",
+ "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.20.1 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.20.1)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.7)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (2024.7.4)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (3.0.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.16.1)\n",
+ "Requirement already satisfied: hyperpyyaml in /usr/local/lib/python3.10/dist-packages (from speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (1.2.2)\n",
+ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.1.99)\n",
+ "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.7.11)\n",
+ "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.1.5)\n",
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.1.105)\n",
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.1.105)\n",
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.1.105)\n",
+ "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (8.9.2.26)\n",
+ "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.1.3.1)\n",
+ "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (11.0.2.54)\n",
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (10.3.2.106)\n",
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (11.4.5.107)\n",
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.1.0.106)\n",
+ "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.20.5)\n",
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.1.105)\n",
+ "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=2.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (12.6.20)\n",
+ "Collecting torch-pitch-shift>=1.2.2 (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading torch_pitch_shift-1.2.4-py3-none-any.whl.metadata (2.5 kB)\n",
+ "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.5.4)\n",
+ "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.18.1)\n",
+ "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (7.0.4)\n",
+ "Requirement already satisfied: cloudpickle>=1.2.1 in /usr/local/lib/python3.10/dist-packages (from submitit->dora-search->demucs==4.1.0a2->-r requirements_module.txt (line 1)) (2.2.1)\n",
+ "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.36->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.4.0)\n",
+ "Requirement already satisfied: grpcio>=1.48.2 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.36->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.64.1)\n",
+ "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.36->TTS==0.22.0->-r requirements_module.txt (line 4)) (3.6)\n",
+ "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.36->TTS==0.22.0->-r requirements_module.txt (line 4)) (0.7.2)\n",
+ "Requirement already satisfied: marisa-trie>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.2.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.1.2)\n",
+ "Collecting alembic>=1.5.0 (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)\n",
+ "Collecting colorlog (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n",
+ "Requirement already satisfied: sqlalchemy>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (2.0.32)\n",
+ "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy>=3->spacy[ja]>=3->TTS==0.22.0->-r requirements_module.txt (line 4)) (1.16.0)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3)) (1.3.0)\n",
+ "Collecting primePy>=1.3 (from torch-pitch-shift>=1.2.2->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading primePy-1.3-py3-none-any.whl.metadata (4.8 kB)\n",
+ "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n",
+ "Requirement already satisfied: ruamel.yaml>=0.17.28 in /usr/local/lib/python3.10/dist-packages (from hyperpyyaml->speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.18.6)\n",
+ "Collecting Mako (from alembic>=1.5.0->optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3))\n",
+ " Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)\n",
+ "Requirement already satisfied: ruamel.yaml.clib>=0.2.7 in /usr/local/lib/python3.10/dist-packages (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (0.2.8)\n",
+ "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.3.0->optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1->-r requirements_module.txt (line 3)) (3.0.3)\n",
+ "Downloading faster_whisper-1.0.0-py3-none-any.whl (1.5 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m66.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pyannote.audio-3.1.1-py2.py3-none-any.whl (208 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m208.7/208.7 kB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m32.9/32.9 MB\u001b[0m \u001b[31m57.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading anyascii-0.3.2-py3-none-any.whl (289 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m289.9/289.9 kB\u001b[0m \u001b[31m22.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading coqpit-0.0.17-py3-none-any.whl (13 kB)\n",
+ "Downloading g2pkk-0.1.2-py3-none-any.whl (25 kB)\n",
+ "Downloading lameenc-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (239 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.8/239.8 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading num2words-0.5.13-py3-none-any.whl (143 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.3/143.3 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m111.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pysbd-0.3.4-py3-none-any.whl (71 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading trainer-0.0.36-py3-none-any.whl (51 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.2/51.2 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading Unidecode-1.3.8-py3-none-any.whl (235 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.5/235.5 kB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading bangla-0.0.2-py2.py3-none-any.whl (6.2 kB)\n",
+ "Downloading bnunicodenormalizer-0.1.7-py3-none-any.whl (23 kB)\n",
+ "Downloading hangul_romanize-0.1.0-py3-none-any.whl (4.6 kB)\n",
+ "Downloading openunmix-1.3.0-py3-none-any.whl (40 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.0/40.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pypinyin-0.52.0-py2.py3-none-any.whl (833 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m833.7/833.7 kB\u001b[0m \u001b[31m46.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m55.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)\n",
+ "Downloading ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (192.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m192.3/192.3 MB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading dateparser-1.1.8-py2.py3-none-any.whl (293 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m293.8/293.8 kB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading jsonlines-1.2.0-py2.py3-none-any.whl (7.6 kB)\n",
+ "Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m68.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading onnxruntime-1.19.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.2 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.2/13.2 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pyannote.core-5.0.0-py3-none-any.whl (58 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pyannote.database-5.1.0-py3-none-any.whl (48 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.1/48.1 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pyannote.metrics-3.2.1-py3-none-any.whl (51 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.4/51.4 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pyannote.pipeline-3.0.1-py3-none-any.whl (31 kB)\n",
+ "Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m56.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading pytorch_metric_learning-2.6.0-py3-none-any.whl (119 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.3/119.3 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading semver-3.0.2-py3-none-any.whl (17 kB)\n",
+ "Downloading speechbrain-1.0.0-py3-none-any.whl (760 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m760.1/760.1 kB\u001b[0m \u001b[31m44.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading SudachiDict_core-20240716-py3-none-any.whl (72.0 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 MB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading SudachiPy-0.6.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m85.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading torch_audiomentations-0.11.1-py3-none-any.whl (50 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.1/50.1 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)\n",
+ "Downloading submitit-1.5.1-py3-none-any.whl (74 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.7/74.7 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.1/380.1 kB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading torch_pitch_shift-1.2.4-py3-none-any.whl (4.9 kB)\n",
+ "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.0/233.0 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading primePy-1.3-py3-none-any.whl (4.0 kB)\n",
+ "Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n",
+ "Downloading Mako-1.3.5-py3-none-any.whl (78 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hBuilding wheels for collected packages: demucs, openai-whisper, whisperx, TTS, gruut, encodec, julius, bnnumerizer, dora-search, docopt, gruut-ipa, gruut_lang_de, gruut_lang_en, gruut_lang_es, gruut_lang_fr, treetable\n",
+ " Building wheel for demucs (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for demucs: filename=demucs-4.1.0a2-py3-none-any.whl size=83512 sha256=d95118457ade550d112160a5d8cf08d771db44bb39a03c63e25abbd313cb34ca\n",
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-e5wc77tl/wheels/f1/13/88/6aa247fe78860dad92037cbfda07111f894f08f5fcc4fdcb21\n",
+ " Building wheel for openai-whisper (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for openai-whisper: filename=openai_whisper-20231117-py3-none-any.whl size=802819 sha256=a3689dfccbb73153aafc23dd6926ee85355a9e9ad94674f3f981945249bfdf88\n",
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-e5wc77tl/wheels/af/71/a6/c355411f740299490f85e086e415fe4d667c3422ee184df8da\n",
+ " Building wheel for whisperx (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for whisperx: filename=whisperx-3.1.1-py3-none-any.whl size=38421 sha256=7ac59bc72dfe30fef00fd19fc21467986563ffd6d45579e3b7bbf0dbb2d9f701\n",
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-e5wc77tl/wheels/4a/46/90/0c78a1fb0c47d85b134cf681ac680aed5eea3c571cc74c02ee\n",
+ " Building wheel for TTS (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for TTS: filename=TTS-0.22.0-cp310-cp310-linux_x86_64.whl size=904107 sha256=725f1c985725809efc648f796ed828770c0219b869b3edf1ecf081a21900559f\n",
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-e5wc77tl/wheels/3f/bb/f4/ef859b8202177e3c3ccc56febe1c61ff7c747d123c1f6c3fb9\n",
+ " Building wheel for gruut (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for gruut: filename=gruut-2.2.3-py3-none-any.whl size=75787 sha256=44dcd47695e35e54fc7d4b4a0281c579fdb48ce7e349b09f68cc3e9745ff22db\n",
+ " Stored in directory: /root/.cache/pip/wheels/fc/57/a8/f9de532daf5214f53644f20f3a9e6f69269453c87df9c0a817\n",
+ " Building wheel for encodec (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45759 sha256=2d13dcbc234044eea0ab35bb01dc4d0e7c016ee7afc00b3774e413ec99874bcb\n",
+ " Stored in directory: /root/.cache/pip/wheels/fc/36/cb/81af8b985a5f5e0815312d5e52b41263237af07b977e6bcbf3\n",
+ " Building wheel for julius (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for julius: filename=julius-0.2.7-py3-none-any.whl size=21869 sha256=c5408f9b4a958c3103a905b64f7ea3bd7f115dcc0d83c5cb1d153c579a43ce66\n",
+ " Stored in directory: /root/.cache/pip/wheels/b9/b2/05/f883527ffcb7f2ead5438a2c23439aa0c881eaa9a4c80256f4\n",
+ " Building wheel for bnnumerizer (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for bnnumerizer: filename=bnnumerizer-0.0.2-py3-none-any.whl size=5260 sha256=c69e65eee86426c1cc6789bd4f596564fd69176c417cbaec39b9215f6102e715\n",
+ " Stored in directory: /root/.cache/pip/wheels/59/6b/e8/223172e7d5c9f72df3ea1a0d9258f3a8ab5b28e827728edef5\n",
+ " Building wheel for dora-search (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for dora-search: filename=dora_search-0.1.12-py3-none-any.whl size=75092 sha256=b2cfabf51a9ca8b911f8bf4e570e109ae39c30f6e5e9eae8b8eabf77f95f436e\n",
+ " Stored in directory: /root/.cache/pip/wheels/b1/c2/c0/bea5cc405497284d584b958f293ef32c23bad42ae5e44d973c\n",
+ " Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=4729498729a7da14fb21ba32af9f40118cf83cb57f383d7f0a7506575448184d\n",
+ " Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac\n",
+ " Building wheel for gruut-ipa (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for gruut-ipa: filename=gruut_ipa-0.13.0-py3-none-any.whl size=104872 sha256=2639e03bc36500b62c9ff719da0d9581722ccc32f61d13db1579f9762a6e3b28\n",
+ " Stored in directory: /root/.cache/pip/wheels/7b/18/49/e4f500ecdf0babe757953f844e4d7cd1ea81c5503c09bfe984\n",
+ " Building wheel for gruut_lang_de (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for gruut_lang_de: filename=gruut_lang_de-2.0.1-py3-none-any.whl size=18498314 sha256=50160e490e56cb1bd108cda95494aa78845a3ace03b9bfa2c44c3d4e8f908f84\n",
+ " Stored in directory: /root/.cache/pip/wheels/83/80/5f/775b357ae61d7cb68793327c7470d848715cbc60bb373af8dd\n",
+ " Building wheel for gruut_lang_en (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for gruut_lang_en: filename=gruut_lang_en-2.0.1-py3-none-any.whl size=15326858 sha256=7061be2143ae1e41176fb517a95ba47251a354dfa3dd3c08d0167a263467a3af\n",
+ " Stored in directory: /root/.cache/pip/wheels/64/8d/b7/d484d224facd899ed188e00374f25dd3f19d1a3f53da6517bd\n",
+ " Building wheel for gruut_lang_es (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for gruut_lang_es: filename=gruut_lang_es-2.0.1-py3-none-any.whl size=32173928 sha256=02ab773c980e51b6803fbda70a03598216ad5f73d772cfa12a9f0d6a6b51d9ed\n",
+ " Stored in directory: /root/.cache/pip/wheels/ab/bd/96/5ddde14e8e6932a96f12c5ab5de62b619d39e2507d7daf5188\n",
+ " Building wheel for gruut_lang_fr (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for gruut_lang_fr: filename=gruut_lang_fr-2.0.2-py3-none-any.whl size=10968767 sha256=e1d7d3870f6c28e15abdd4bee0bbe559509d9a61dbcf7371bda0161bcbfcec79\n",
+ " Stored in directory: /root/.cache/pip/wheels/db/21/be/d0436e3f1cf9bf38b9bb9b4a476399c77a1ab19f7172b45e19\n",
+ " Building wheel for treetable (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for treetable: filename=treetable-0.2.5-py3-none-any.whl size=7332 sha256=7cee2e53cbdb0b733494e26e1f73ea1e939dcf4ae9122a2142e72b8476aa041b\n",
+ " Stored in directory: /root/.cache/pip/wheels/72/55/0e/91c3655bdb162446f8a7cd477579397544454a63ae7c599c0c\n",
+ "Successfully built demucs openai-whisper whisperx TTS gruut encodec julius bnnumerizer dora-search docopt gruut-ipa gruut_lang_de gruut_lang_en gruut_lang_es gruut_lang_fr treetable\n",
+ "Installing collected packages: sudachipy, python-crfsuite, primePy, lameenc, hangul-romanize, gruut_lang_fr, gruut_lang_es, gruut_lang_en, gruut_lang_de, docopt, bnunicodenormalizer, bnnumerizer, bangla, unidecode, treetable, sudachidict-core, submitit, semver, retrying, pysbd, pypinyin, num2words, networkx, Mako, jsonlines, humanfriendly, gruut-ipa, ctranslate2, coqpit, colorlog, av, anyascii, tiktoken, pyannote.core, pandas, g2pkk, dateparser, coloredlogs, alembic, optuna, openai-whisper, onnxruntime, gruut, trainer, pytorch-metric-learning, pyannote.database, julius, faster-whisper, dora-search, asteroid-filterbanks, torch-pitch-shift, speechbrain, pyannote.pipeline, pyannote.metrics, openunmix, encodec, torch-audiomentations, demucs, TTS, pyannote.audio, whisperx\n",
+ " Attempting uninstall: networkx\n",
+ " Found existing installation: networkx 3.3\n",
+ " Uninstalling networkx-3.3:\n",
+ " Successfully uninstalled networkx-3.3\n",
+ " Attempting uninstall: pandas\n",
+ " Found existing installation: pandas 2.1.4\n",
+ " Uninstalling pandas-2.1.4:\n",
+ " Successfully uninstalled pandas-2.1.4\n",
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "cudf-cu12 24.4.1 requires pandas<2.2.2dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.\n",
+ "google-colab 1.0.0 requires pandas==2.1.4, but you have pandas 1.5.3 which is incompatible.\n",
+ "xarray 2024.6.0 requires pandas>=2.0, but you have pandas 1.5.3 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[0mSuccessfully installed Mako-1.3.5 TTS-0.22.0 alembic-1.13.2 anyascii-0.3.2 asteroid-filterbanks-0.4.0 av-11.0.0 bangla-0.0.2 bnnumerizer-0.0.2 bnunicodenormalizer-0.1.7 coloredlogs-15.0.1 colorlog-6.8.2 coqpit-0.0.17 ctranslate2-4.3.1 dateparser-1.1.8 demucs-4.1.0a2 docopt-0.6.2 dora-search-0.1.12 encodec-0.1.1 faster-whisper-1.0.0 g2pkk-0.1.2 gruut-2.2.3 gruut-ipa-0.13.0 gruut_lang_de-2.0.1 gruut_lang_en-2.0.1 gruut_lang_es-2.0.1 gruut_lang_fr-2.0.2 hangul-romanize-0.1.0 humanfriendly-10.0 jsonlines-1.2.0 julius-0.2.7 lameenc-1.7.0 networkx-2.8.8 num2words-0.5.13 onnxruntime-1.19.0 openai-whisper-20231117 openunmix-1.3.0 optuna-3.6.1 pandas-1.5.3 primePy-1.3 pyannote.audio-3.1.1 pyannote.core-5.0.0 pyannote.database-5.1.0 pyannote.metrics-3.2.1 pyannote.pipeline-3.0.1 pypinyin-0.52.0 pysbd-0.3.4 python-crfsuite-0.9.10 pytorch-metric-learning-2.6.0 retrying-1.3.4 semver-3.0.2 speechbrain-1.0.0 submitit-1.5.1 sudachidict-core-20240716 sudachipy-0.6.8 tiktoken-0.7.0 torch-audiomentations-0.11.1 torch-pitch-shift-1.2.4 trainer-0.0.36 treetable-0.2.5 unidecode-1.3.8 whisperx-3.1.1\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 安装submodules 下的依赖\n",
+ "!pip install -r requirements_module.txt"
+ ]
+ },
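+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note: pip reports version conflicts above (for example, pandas is downgraded to 1.5.3 while google-colab pins 2.1.4). The next cell is a minimal optional sketch, not part of the original workflow, that prints the versions actually installed so those conflicts can be checked before continuing; the package list is an assumption taken from the install log."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical sanity check (not part of the original workflow):\n",
+ "# print the installed versions of a few packages named in the pip log above.\n",
+ "from importlib.metadata import version, PackageNotFoundError\n",
+ "\n",
+ "for pkg in [\"TTS\", \"whisperx\", \"faster-whisper\", \"pandas\", \"networkx\"]:\n",
+ "    try:\n",
+ "        print(f\"{pkg}: {version(pkg)}\")\n",
+ "    except PackageNotFoundError:\n",
+ "        print(f\"{pkg}: not installed\")"
+ ]
+ },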
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "frCAHzRRQ3op"
+ },
+ "source": [
+ "# Download pretrained models 下载预训练模型"
+ ]
+ },
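+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The download cell further below fetches the pretrained checkpoints: a direct wget of the wav2vec2 ASR checkpoint into models/ASR/whisper/, plus Hugging Face Hub fetches of the TTS checkpoint files (dvae.pth, model.pth, speakers_xtts.pth, vocab.json, reference samples) and further model files (safetensors shards, tokenizer files). For reference, the next cell is a minimal sketch of how such a Hub fetch is typically done with huggingface_hub.snapshot_download; the repo id and destination folder in it are assumptions, not values taken from this repository, and the notebook's own download commands are what actually run."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch only -- the notebook's own download cell below is what actually runs.\n",
+ "# repo_id and local_dir are assumptions, not taken from this repository's code.\n",
+ "from huggingface_hub import snapshot_download\n",
+ "\n",
+ "snapshot_download(\n",
+ "    repo_id=\"coqui/XTTS-v2\",         # assumed source of dvae.pth / model.pth / speakers_xtts.pth\n",
+ "    local_dir=\"models/TTS/XTTS-v2\",  # assumed destination folder\n",
+ ")"
+ ]
+ },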
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Bu1gpyeAQ4zx",
+ "outputId": "4bce27e7-efe3-4e63-b875-367e02c58011"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "--2024-08-19 19:03:10-- https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth\n",
+ "Resolving download.pytorch.org (download.pytorch.org)... 18.172.170.8, 18.172.170.43, 18.172.170.53, ...\n",
+ "Connecting to download.pytorch.org (download.pytorch.org)|18.172.170.8|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 377664473 (360M) [application/x-www-form-urlencoded]\n",
+ "Saving to: ‘models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth’\n",
+ "\n",
+ "models/ASR/whisper/ 100%[===================>] 360.17M 184MB/s in 2.0s \n",
+ "\n",
+ "2024-08-19 19:03:12 (184 MB/s) - ‘models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth’ saved [377664473/377664473]\n",
+ "\n",
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n",
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1194: UserWarning: `local_dir_use_symlinks` parameter is deprecated and will be ignored. The process to download files to a local folder has been updated and do not rely on symlinks anymore. You only need to pass a destination folder as`local_dir`.\n",
+ "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.\n",
+ " warnings.warn(\n",
+ "Fetching 18 files: 0% 0/18 [00:00, ?it/s]\n",
+ ".gitattributes: 100% 1.52k/1.52k [00:00<00:00, 10.7MB/s]\n",
+ "Fetching 18 files: 6% 1/18 [00:00<00:03, 5.03it/s]\n",
+ "LICENSE.txt: 100% 4.01k/4.01k [00:00<00:00, 32.0MB/s]\n",
+ "\n",
+ "mel_stats.pth: 100% 1.07k/1.07k [00:00<00:00, 7.80MB/s]\n",
+ "\n",
+ "README.md: 100% 4.26k/4.26k [00:00<00:00, 21.1MB/s]\n",
+ "\n",
+ "hash.md5: 100% 32.0/32.0 [00:00<00:00, 258kB/s]\n",
+ "\n",
+ "config.json: 100% 4.37k/4.37k [00:00<00:00, 23.4MB/s]\n",
+ "Fetching 18 files: 22% 4/18 [00:00<00:01, 12.83it/s]\n",
+ "dvae.pth: 0% 0.00/211M [00:00, ?B/s]\u001b[A\n",
+ "\n",
+ "model.pth: 0% 0.00/1.87G [00:00, ?B/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "samples/de_sample.wav: 0% 0.00/299k [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/en_sample.wav: 0% 0.00/299k [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/fr_sample.wav: 0% 0.00/432k [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "dvae.pth: 5% 10.5M/211M [00:00<00:02, 71.0MB/s]\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/de_sample.wav: 100% 299k/299k [00:00<00:00, 3.28MB/s]\n",
+ "\n",
+ "\n",
+ "samples/es_sample.wav: 100% 330k/330k [00:00<00:00, 6.73MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/fr_sample.wav: 100% 432k/432k [00:00<00:00, 4.21MB/s]\n",
+ "\n",
+ "dvae.pth: 10% 21.0M/211M [00:00<00:02, 81.7MB/s]\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/ja-sample.wav: 0% 0.00/285k [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 1% 21.0M/1.87G [00:00<00:24, 75.6MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 15% 31.5M/211M [00:00<00:02, 86.4MB/s]\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/en_sample.wav: 100% 299k/299k [00:00<00:00, 1.28MB/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "samples/pt_sample.wav: 100% 445k/445k [00:00<00:00, 3.44MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/en_sample.wav: 100% 299k/299k [00:00<00:00, 1.23MB/s]\n",
+ "samples/pt_sample.wav: 100% 445k/445k [00:00<00:00, 3.13MB/s]\n",
+ "samples/tr_sample.wav: 100% 299k/299k [00:00<00:00, 33.5MB/s]\n",
+ "\n",
+ "\n",
+ "model.pth: 2% 31.5M/1.87G [00:00<00:22, 82.3MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "speakers_xtts.pth: 0% 0.00/7.75M [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/zh-cn-sample.wav: 0% 0.00/383k [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/ja-sample.wav: 100% 285k/285k [00:00<00:00, 1.48MB/s]\n",
+ "\n",
+ "dvae.pth: 20% 41.9M/211M [00:00<00:01, 86.3MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 2% 41.9M/1.87G [00:00<00:21, 86.8MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "vocab.json: 0% 0.00/361k [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "samples/zh-cn-sample.wav: 100% 383k/383k [00:00<00:00, 2.19MB/s]\n",
+ "vocab.json: 100% 361k/361k [00:00<00:00, 16.4MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "speakers_xtts.pth: 100% 7.75M/7.75M [00:00<00:00, 37.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "speakers_xtts.pth: 100% 7.75M/7.75M [00:00<00:00, 29.5MB/s]\n",
+ "\n",
+ "dvae.pth: 30% 62.9M/211M [00:00<00:01, 94.6MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 3% 62.9M/1.87G [00:00<00:22, 78.8MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 35% 73.4M/211M [00:00<00:01, 91.9MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 4% 73.4M/1.87G [00:00<00:21, 85.3MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 40% 83.9M/211M [00:00<00:01, 94.6MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 5% 94.4M/1.87G [00:01<00:18, 95.7MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 50% 105M/211M [00:01<00:01, 99.0MB/s] \u001b[A\n",
+ "\n",
+ "model.pth: 6% 115M/1.87G [00:01<00:18, 94.8MB/s] \u001b[A\u001b[A\n",
+ "dvae.pth: 60% 126M/211M [00:01<00:00, 95.3MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 7% 126M/1.87G [00:01<00:18, 95.5MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 65% 136M/211M [00:01<00:00, 95.6MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 7% 136M/1.87G [00:01<00:17, 97.6MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 8% 147M/1.87G [00:01<00:18, 94.1MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 75% 157M/211M [00:01<00:00, 94.3MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 9% 168M/1.87G [00:01<00:17, 95.3MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 85% 178M/211M [00:01<00:00, 99.4MB/s]\u001b[A\n",
+ "dvae.pth: 90% 189M/211M [00:01<00:00, 98.6MB/s]\u001b[A\n",
+ "\n",
+ "model.pth: 10% 178M/1.87G [00:01<00:17, 93.9MB/s]\u001b[A\u001b[A\n",
+ "dvae.pth: 100% 210M/211M [00:02<00:00, 94.7MB/s]\u001b[A\n",
+ "\n",
+ "dvae.pth: 100% 211M/211M [00:02<00:00, 91.9MB/s]\n",
+ "Fetching 18 files: 33% 6/18 [00:02<00:06, 1.84it/s]\n",
+ "\n",
+ "model.pth: 12% 220M/1.87G [00:02<00:15, 107MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 13% 241M/1.87G [00:02<00:13, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 14% 262M/1.87G [00:02<00:11, 135MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 15% 283M/1.87G [00:02<00:10, 146MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 16% 304M/1.87G [00:02<00:12, 125MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 18% 336M/1.87G [00:03<00:10, 153MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 20% 367M/1.87G [00:03<00:08, 178MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 21% 398M/1.87G [00:03<00:07, 198MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 23% 430M/1.87G [00:03<00:06, 207MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 25% 461M/1.87G [00:03<00:07, 200MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 26% 482M/1.87G [00:03<00:06, 202MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 27% 503M/1.87G [00:03<00:06, 203MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 28% 524M/1.87G [00:03<00:06, 201MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 29% 545M/1.87G [00:04<00:06, 202MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 30% 566M/1.87G [00:04<00:06, 202MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 31% 587M/1.87G [00:04<00:06, 198MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 33% 608M/1.87G [00:04<00:06, 197MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 34% 629M/1.87G [00:04<00:06, 186MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 35% 650M/1.87G [00:04<00:07, 171MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 36% 671M/1.87G [00:04<00:06, 180MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 37% 692M/1.87G [00:04<00:06, 186MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 38% 713M/1.87G [00:04<00:06, 180MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 39% 734M/1.87G [00:05<00:06, 178MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 40% 755M/1.87G [00:05<00:06, 173MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 42% 776M/1.87G [00:05<00:06, 172MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 43% 797M/1.87G [00:05<00:06, 172MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 44% 818M/1.87G [00:05<00:06, 175MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 45% 839M/1.87G [00:05<00:05, 179MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 46% 860M/1.87G [00:05<00:06, 166MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 47% 881M/1.87G [00:08<00:41, 23.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 49% 912M/1.87G [00:08<00:25, 37.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 51% 944M/1.87G [00:08<00:17, 53.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 52% 975M/1.87G [00:08<00:12, 72.3MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 53% 996M/1.87G [00:08<00:10, 85.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 54% 1.02G/1.87G [00:09<00:08, 98.3MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 56% 1.05G/1.87G [00:09<00:06, 121MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 58% 1.08G/1.87G [00:09<00:05, 142MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 59% 1.10G/1.87G [00:09<00:04, 154MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 61% 1.13G/1.87G [00:09<00:04, 171MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 62% 1.15G/1.87G [00:09<00:04, 178MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 63% 1.18G/1.87G [00:09<00:03, 193MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 65% 1.22G/1.87G [00:10<00:03, 201MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 67% 1.25G/1.87G [00:10<00:04, 145MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 68% 1.28G/1.87G [00:10<00:03, 166MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 70% 1.31G/1.87G [00:10<00:03, 185MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 72% 1.34G/1.87G [00:10<00:02, 188MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 74% 1.37G/1.87G [00:10<00:02, 198MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 75% 1.41G/1.87G [00:11<00:02, 209MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 77% 1.44G/1.87G [00:11<00:01, 218MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 79% 1.47G/1.87G [00:11<00:01, 224MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 80% 1.50G/1.87G [00:11<00:01, 213MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 82% 1.53G/1.87G [00:11<00:01, 209MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 84% 1.56G/1.87G [00:11<00:01, 211MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 85% 1.59G/1.87G [00:14<00:08, 33.3MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 87% 1.63G/1.87G [00:14<00:05, 45.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 89% 1.66G/1.87G [00:14<00:03, 59.7MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 90% 1.69G/1.87G [00:14<00:02, 77.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 92% 1.72G/1.87G [00:15<00:01, 95.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 94% 1.75G/1.87G [00:15<00:01, 111MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 95% 1.77G/1.87G [00:15<00:00, 123MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 96% 1.79G/1.87G [00:15<00:00, 136MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 97% 1.81G/1.87G [00:15<00:00, 148MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 99% 1.85G/1.87G [00:15<00:00, 167MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model.pth: 100% 1.87G/1.87G [00:15<00:00, 118MB/s]\n",
+ "Fetching 18 files: 100% 18/18 [00:16<00:00, 1.11it/s]\n",
+ "Fetching 12 files: 0% 0/12 [00:00, ?it/s]\n",
+ "config.json: 100% 662/662 [00:00<00:00, 5.66MB/s]\n",
+ "\n",
+ "generation_config.json: 100% 206/206 [00:00<00:00, 1.88MB/s]\n",
+ "\n",
+ "LICENSE: 100% 7.28k/7.28k [00:00<00:00, 51.2MB/s]\n",
+ "\n",
+ "merges.txt: 0% 0.00/1.67M [00:00, ?B/s]\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 0% 0.00/3.91G [00:00, ?B/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ ".gitattributes: 100% 1.52k/1.52k [00:00<00:00, 11.8MB/s]\n",
+ "Fetching 12 files: 8% 1/12 [00:00<00:02, 4.39it/s]\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 0% 0.00/3.99G [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "model.safetensors.index.json: 100% 39.6k/39.6k [00:00<00:00, 88.3MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "tokenizer_config.json: 100% 1.29k/1.29k [00:00<00:00, 9.42MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "tokenizer.json: 0% 0.00/7.03M [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 0% 10.5M/3.99G [00:00<00:59, 66.6MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "vocab.json: 0% 0.00/2.78M [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "README.md: 100% 4.27k/4.27k [00:00<00:00, 15.4MB/s]\n",
+ "Fetching 12 files: 25% 3/12 [00:00<00:01, 6.15it/s]\n",
+ "merges.txt: 100% 1.67M/1.67M [00:00<00:00, 4.92MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 1% 31.5M/3.99G [00:00<00:38, 103MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 0% 10.5M/3.91G [00:00<02:14, 29.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 1% 52.4M/3.99G [00:00<00:30, 129MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "tokenizer.json: 100% 7.03M/7.03M [00:00<00:00, 21.0MB/s]\n",
+ "\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 1% 31.5M/3.91G [00:00<00:59, 65.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "vocab.json: 100% 2.78M/2.78M [00:00<00:00, 7.82MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 2% 73.4M/3.99G [00:00<00:29, 133MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 1% 52.4M/3.91G [00:00<00:46, 82.4MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 2% 62.9M/3.91G [00:00<00:45, 84.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 2% 94.4M/3.99G [00:00<00:34, 114MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 2% 73.4M/3.91G [00:00<00:43, 87.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 3% 115M/3.99G [00:01<00:34, 112MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 2% 94.4M/3.91G [00:01<00:38, 99.8MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 3% 136M/3.99G [00:01<00:34, 112MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 3% 115M/3.91G [00:01<00:56, 67.5MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 3% 126M/3.91G [00:01<00:51, 72.8MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 4% 157M/3.99G [00:01<00:50, 75.3MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 4% 178M/3.99G [00:01<00:44, 86.5MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 4% 147M/3.91G [00:01<00:44, 84.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 5% 199M/3.99G [00:01<00:38, 99.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 4% 168M/3.91G [00:02<00:39, 94.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 5% 178M/3.91G [00:02<00:39, 94.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 6% 220M/3.99G [00:02<00:36, 104MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 5% 199M/3.91G [00:02<00:36, 101MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 6% 241M/3.99G [00:02<00:35, 106MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 6% 220M/3.91G [00:02<00:34, 107MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 7% 262M/3.99G [00:02<00:33, 111MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 6% 241M/3.91G [00:02<00:32, 111MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 7% 283M/3.99G [00:02<00:33, 111MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 7% 262M/3.91G [00:02<00:32, 114MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 8% 304M/3.99G [00:02<00:31, 118MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 8% 325M/3.99G [00:03<00:30, 120MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 7% 283M/3.91G [00:03<00:32, 113MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 8% 304M/3.91G [00:03<00:32, 112MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 9% 346M/3.99G [00:03<00:33, 110MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 8% 325M/3.91G [00:03<00:32, 111MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 9% 367M/3.99G [00:03<00:32, 113MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 10% 388M/3.99G [00:03<00:30, 119MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 9% 346M/3.91G [00:03<00:31, 114MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 10% 409M/3.99G [00:03<00:29, 121MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 9% 367M/3.91G [00:03<00:30, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 10% 388M/3.91G [00:03<00:27, 126MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 11% 430M/3.99G [00:03<00:28, 125MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 11% 451M/3.99G [00:04<00:26, 134MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 10% 409M/3.91G [00:04<00:28, 124MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 12% 472M/3.99G [00:04<00:25, 139MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 11% 430M/3.91G [00:04<00:26, 131MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 12% 493M/3.99G [00:04<00:24, 145MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 12% 451M/3.91G [00:04<00:27, 127MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 13% 514M/3.99G [00:04<00:22, 152MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 12% 472M/3.91G [00:04<00:26, 129MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 13% 535M/3.99G [00:04<00:22, 155MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 14% 556M/3.99G [00:04<00:21, 160MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 13% 493M/3.91G [00:04<00:26, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 14% 577M/3.99G [00:04<00:20, 163MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 13% 514M/3.91G [00:04<00:24, 137MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 15% 598M/3.99G [00:05<00:25, 134MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 14% 535M/3.91G [00:05<00:29, 115MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 16% 619M/3.99G [00:05<00:26, 127MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 14% 556M/3.91G [00:05<00:28, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 16% 640M/3.99G [00:05<00:24, 134MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 15% 577M/3.91G [00:05<00:27, 122MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 17% 661M/3.99G [00:05<00:22, 146MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 17% 682M/3.99G [00:05<00:23, 142MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 15% 598M/3.91G [00:05<00:29, 114MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 18% 703M/3.99G [00:05<00:21, 150MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 16% 619M/3.91G [00:05<00:27, 120MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 16% 640M/3.91G [00:05<00:26, 125MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 18% 724M/3.99G [00:05<00:24, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 17% 661M/3.91G [00:06<00:25, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 19% 744M/3.99G [00:06<00:24, 130MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 17% 682M/3.91G [00:06<00:25, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 19% 765M/3.99G [00:06<00:25, 124MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 18% 703M/3.91G [00:06<00:25, 127MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 20% 786M/3.99G [00:06<00:24, 130MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 18% 724M/3.91G [00:06<00:26, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 20% 807M/3.99G [00:06<00:27, 116MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 19% 744M/3.91G [00:06<00:25, 126MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 21% 828M/3.99G [00:06<00:24, 128MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 20% 765M/3.91G [00:06<00:24, 127MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 21% 849M/3.99G [00:06<00:24, 128MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 20% 786M/3.91G [00:07<00:25, 121MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 22% 870M/3.99G [00:07<00:26, 120MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 22% 891M/3.99G [00:07<00:22, 136MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 21% 807M/3.91G [00:07<00:25, 121MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 23% 912M/3.99G [00:07<00:21, 141MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 21% 828M/3.91G [00:07<00:26, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 23% 933M/3.99G [00:07<00:21, 140MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 24% 954M/3.99G [00:07<00:21, 144MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 22% 849M/3.91G [00:07<00:29, 105MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 24% 975M/3.99G [00:07<00:21, 143MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 22% 870M/3.91G [00:07<00:28, 107MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 25% 996M/3.99G [00:07<00:19, 152MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 23% 891M/3.91G [00:08<00:26, 112MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 25% 1.02G/3.99G [00:08<00:21, 141MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 23% 912M/3.91G [00:08<00:25, 118MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 26% 1.04G/3.99G [00:08<00:21, 138MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 24% 933M/3.91G [00:08<00:26, 113MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 27% 1.06G/3.99G [00:08<00:23, 126MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 24% 954M/3.91G [00:08<00:24, 123MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 27% 1.08G/3.99G [00:08<00:36, 79.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 27% 1.09G/3.99G [00:10<01:52, 25.7MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 25% 975M/3.91G [00:10<01:47, 27.4MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 25% 996M/3.91G [00:10<01:18, 36.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 28% 1.11G/3.99G [00:10<01:25, 33.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 26% 1.02G/3.91G [00:10<00:59, 48.3MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 28% 1.12G/3.99G [00:11<01:17, 37.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 27% 1.04G/3.91G [00:11<00:46, 61.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 28% 1.13G/3.99G [00:11<01:06, 42.9MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 27% 1.06G/3.91G [00:11<00:39, 72.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 29% 1.15G/3.99G [00:11<00:49, 57.7MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 28% 1.08G/3.91G [00:11<00:35, 79.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 29% 1.17G/3.99G [00:11<00:38, 74.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 28% 1.10G/3.91G [00:11<00:31, 88.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 30% 1.20G/3.99G [00:11<00:37, 74.6MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 29% 1.12G/3.91G [00:11<00:27, 102MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 29% 1.14G/3.91G [00:11<00:24, 113MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 30% 1.22G/3.99G [00:11<00:33, 81.7MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 30% 1.16G/3.91G [00:12<00:21, 125MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 31% 1.23G/3.99G [00:12<00:36, 75.9MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 30% 1.18G/3.91G [00:12<00:20, 131MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 31% 1.21G/3.91G [00:12<00:18, 146MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 31% 1.24G/3.99G [00:12<00:37, 73.5MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 32% 1.26G/3.99G [00:12<00:29, 93.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 31% 1.23G/3.91G [00:12<00:18, 147MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 32% 1.28G/3.99G [00:12<00:24, 109MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 32% 1.25G/3.91G [00:12<00:18, 145MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 33% 1.30G/3.99G [00:12<00:22, 122MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 32% 1.27G/3.91G [00:12<00:18, 145MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 33% 1.32G/3.99G [00:12<00:19, 137MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 33% 1.29G/3.91G [00:12<00:19, 135MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 34% 1.34G/3.99G [00:12<00:19, 137MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 34% 1.31G/3.91G [00:13<00:19, 136MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 34% 1.36G/3.99G [00:13<00:17, 148MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 35% 1.38G/3.99G [00:13<00:16, 158MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 34% 1.33G/3.91G [00:13<00:18, 137MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 35% 1.41G/3.99G [00:13<00:16, 161MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 35% 1.35G/3.91G [00:13<00:18, 137MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 36% 1.43G/3.99G [00:13<00:17, 148MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 35% 1.37G/3.91G [00:13<00:19, 130MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 36% 1.45G/3.99G [00:13<00:16, 154MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 36% 1.39G/3.91G [00:13<00:19, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 37% 1.47G/3.99G [00:13<00:16, 151MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 37% 1.49G/3.99G [00:13<00:17, 145MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 36% 1.42G/3.91G [00:13<00:20, 120MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 37% 1.44G/3.91G [00:14<00:20, 123MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 38% 1.51G/3.99G [00:14<00:19, 129MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 37% 1.46G/3.91G [00:14<00:20, 118MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 38% 1.53G/3.99G [00:14<00:20, 120MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 39% 1.55G/3.99G [00:14<00:19, 123MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 38% 1.48G/3.91G [00:14<00:22, 110MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 39% 1.57G/3.99G [00:14<00:19, 122MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 38% 1.50G/3.91G [00:14<00:21, 113MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 40% 1.59G/3.99G [00:14<00:19, 122MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 39% 1.52G/3.91G [00:14<00:20, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 39% 1.54G/3.91G [00:15<00:19, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 40% 1.61G/3.99G [00:15<00:20, 118MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 40% 1.56G/3.91G [00:15<00:20, 114MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 41% 1.64G/3.99G [00:15<00:20, 115MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 42% 1.66G/3.99G [00:15<00:18, 124MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 40% 1.58G/3.91G [00:15<00:19, 120MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 42% 1.68G/3.99G [00:15<00:18, 123MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 41% 1.60G/3.91G [00:15<00:19, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 43% 1.70G/3.99G [00:15<00:19, 117MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 42% 1.63G/3.91G [00:15<00:20, 112MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 43% 1.72G/3.99G [00:15<00:19, 115MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 42% 1.65G/3.91G [00:15<00:20, 110MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 44% 1.74G/3.99G [00:16<00:19, 117MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 43% 1.67G/3.91G [00:16<00:19, 114MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 44% 1.76G/3.99G [00:16<00:19, 114MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 43% 1.69G/3.91G [00:16<00:20, 109MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 45% 1.78G/3.99G [00:16<00:19, 113MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 44% 1.71G/3.91G [00:16<00:20, 106MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 45% 1.80G/3.99G [00:16<00:19, 112MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 44% 1.73G/3.91G [00:16<00:19, 109MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 46% 1.82G/3.99G [00:16<00:19, 113MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 46% 1.85G/3.99G [00:16<00:16, 127MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 45% 1.75G/3.91G [00:17<00:26, 82.7MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 47% 1.88G/3.99G [00:17<00:14, 145MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 45% 1.77G/3.91G [00:17<00:22, 94.7MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 48% 1.90G/3.99G [00:17<00:14, 147MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 48% 1.92G/3.99G [00:17<00:13, 149MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 46% 1.79G/3.91G [00:17<00:21, 97.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 49% 1.94G/3.99G [00:17<00:14, 139MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 46% 1.81G/3.91G [00:17<00:20, 100MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 49% 1.96G/3.99G [00:17<00:17, 115MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 47% 1.84G/3.91G [00:17<00:21, 97.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 47% 1.85G/3.91G [00:18<00:21, 97.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 50% 1.98G/3.99G [00:17<00:17, 116MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 50% 2.00G/3.99G [00:18<00:15, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 48% 1.87G/3.91G [00:18<00:18, 110MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 48% 1.89G/3.91G [00:18<00:16, 124MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 51% 2.03G/3.99G [00:18<00:12, 156MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 52% 2.06G/3.99G [00:18<00:12, 160MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 49% 1.91G/3.91G [00:18<00:15, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 52% 2.08G/3.99G [00:18<00:11, 169MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 49% 1.93G/3.91G [00:18<00:14, 138MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 53% 2.10G/3.99G [00:18<00:12, 156MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 50% 1.95G/3.91G [00:23<02:17, 14.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 53% 2.12G/3.99G [00:23<02:01, 15.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 50% 1.97G/3.91G [00:23<01:37, 19.8MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 51% 1.99G/3.91G [00:23<01:10, 27.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 51% 2.01G/3.91G [00:23<00:51, 36.6MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 54% 2.14G/3.99G [00:23<01:33, 19.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 52% 2.03G/3.91G [00:23<00:39, 47.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 54% 2.15G/3.99G [00:23<01:23, 22.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 53% 2.06G/3.91G [00:23<00:31, 59.8MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 53% 2.08G/3.91G [00:23<00:24, 74.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 54% 2.17G/3.99G [00:23<00:59, 30.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 54% 2.10G/3.91G [00:23<00:22, 81.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 55% 2.19G/3.99G [00:23<00:47, 38.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 54% 2.12G/3.91G [00:24<00:18, 94.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 55% 2.21G/3.99G [00:24<00:36, 49.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 55% 2.14G/3.91G [00:24<00:16, 107MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 56% 2.23G/3.99G [00:24<00:28, 61.9MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 55% 2.16G/3.91G [00:24<00:15, 116MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 57% 2.25G/3.99G [00:24<00:23, 75.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 56% 2.18G/3.91G [00:24<00:14, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 57% 2.28G/3.99G [00:24<00:19, 86.9MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 56% 2.20G/3.91G [00:24<00:13, 124MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 58% 2.30G/3.99G [00:24<00:16, 105MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 57% 2.22G/3.91G [00:24<00:12, 130MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 58% 2.32G/3.99G [00:24<00:14, 118MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 59% 2.34G/3.99G [00:24<00:12, 135MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 57% 2.24G/3.91G [00:24<00:12, 133MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 59% 2.36G/3.99G [00:25<00:10, 151MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 58% 2.26G/3.91G [00:25<00:11, 137MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 60% 2.38G/3.99G [00:25<00:09, 163MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 60% 2.40G/3.99G [00:25<00:09, 171MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 58% 2.29G/3.91G [00:25<00:11, 140MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 61% 2.42G/3.99G [00:25<00:10, 150MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 59% 2.31G/3.91G [00:25<00:12, 129MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 60% 2.33G/3.91G [00:29<01:32, 17.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 61% 2.44G/3.99G [00:29<01:29, 17.3MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 62% 2.46G/3.99G [00:29<01:04, 23.6MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 60% 2.35G/3.91G [00:29<01:07, 23.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 62% 2.49G/3.99G [00:29<00:47, 31.5MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 61% 2.37G/3.91G [00:29<00:49, 31.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 63% 2.51G/3.99G [00:29<00:36, 40.7MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 61% 2.39G/3.91G [00:29<00:38, 39.7MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 63% 2.53G/3.99G [00:29<00:30, 48.3MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 62% 2.41G/3.91G [00:29<00:31, 47.6MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 64% 2.55G/3.99G [00:29<00:24, 58.5MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 62% 2.43G/3.91G [00:30<00:25, 57.7MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 64% 2.57G/3.99G [00:30<00:20, 70.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 63% 2.45G/3.91G [00:30<00:21, 68.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 65% 2.59G/3.99G [00:30<00:17, 80.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 63% 2.47G/3.91G [00:30<00:19, 75.4MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 65% 2.61G/3.99G [00:30<00:15, 91.9MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 64% 2.50G/3.91G [00:30<00:16, 84.8MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 66% 2.63G/3.99G [00:30<00:13, 99.9MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 64% 2.52G/3.91G [00:30<00:14, 98.3MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 66% 2.65G/3.99G [00:30<00:11, 111MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 65% 2.54G/3.91G [00:30<00:12, 113MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 67% 2.67G/3.99G [00:30<00:10, 121MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 65% 2.56G/3.91G [00:31<00:11, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 68% 2.69G/3.99G [00:31<00:09, 131MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 66% 2.58G/3.91G [00:31<00:10, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 68% 2.72G/3.99G [00:31<00:09, 135MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 66% 2.60G/3.91G [00:31<00:10, 127MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 69% 2.74G/3.99G [00:31<00:09, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 67% 2.62G/3.91G [00:31<00:10, 129MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 69% 2.76G/3.99G [00:31<00:08, 145MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 70% 2.78G/3.99G [00:31<00:08, 147MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 68% 2.64G/3.91G [00:31<00:09, 133MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 70% 2.80G/3.99G [00:31<00:08, 149MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 68% 2.66G/3.91G [00:31<00:09, 129MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 71% 2.82G/3.99G [00:31<00:07, 156MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 69% 2.68G/3.91G [00:31<00:09, 135MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 71% 2.84G/3.99G [00:31<00:07, 153MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 69% 2.71G/3.91G [00:32<00:08, 136MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 72% 2.86G/3.99G [00:32<00:07, 155MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 70% 2.73G/3.91G [00:32<00:08, 134MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 72% 2.88G/3.99G [00:32<00:07, 155MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 70% 2.75G/3.91G [00:32<00:08, 134MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 73% 2.90G/3.99G [00:32<00:07, 154MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 71% 2.77G/3.91G [00:32<00:08, 139MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 73% 2.93G/3.99G [00:32<00:07, 138MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 71% 2.79G/3.91G [00:32<00:07, 146MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 74% 2.95G/3.99G [00:32<00:07, 143MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 72% 2.81G/3.91G [00:32<00:07, 140MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 74% 2.97G/3.99G [00:32<00:07, 141MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 72% 2.83G/3.91G [00:33<00:08, 135MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 75% 2.99G/3.99G [00:33<00:06, 149MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 73% 2.85G/3.91G [00:33<00:07, 133MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 75% 3.01G/3.99G [00:33<00:06, 146MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 73% 2.87G/3.91G [00:35<00:36, 28.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 76% 3.03G/3.99G [00:35<00:34, 28.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 74% 2.89G/3.91G [00:35<00:26, 38.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 75% 2.92G/3.91G [00:35<00:19, 50.2MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 76% 3.05G/3.99G [00:35<00:26, 35.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 75% 2.94G/3.91G [00:35<00:15, 63.7MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 77% 3.07G/3.99G [00:35<00:20, 45.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 76% 2.96G/3.91G [00:35<00:12, 75.5MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 78% 3.09G/3.99G [00:35<00:15, 57.6MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 76% 2.98G/3.91G [00:35<00:10, 85.0MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 78% 3.11G/3.99G [00:35<00:12, 69.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 77% 3.00G/3.91G [00:36<00:10, 90.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 79% 3.14G/3.99G [00:36<00:10, 83.7MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 79% 3.16G/3.99G [00:36<00:08, 98.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 77% 3.02G/3.91G [00:36<00:09, 96.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 80% 3.18G/3.99G [00:36<00:07, 112MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 78% 3.04G/3.91G [00:36<00:08, 105MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 80% 3.20G/3.99G [00:36<00:06, 119MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 78% 3.06G/3.91G [00:36<00:07, 111MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 81% 3.22G/3.99G [00:36<00:06, 125MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 79% 3.08G/3.91G [00:36<00:06, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 81% 3.24G/3.99G [00:36<00:05, 134MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 82% 3.26G/3.99G [00:36<00:04, 148MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 79% 3.10G/3.91G [00:36<00:06, 125MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 80% 3.12G/3.91G [00:37<00:05, 132MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 82% 3.28G/3.99G [00:37<00:04, 145MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 83% 3.30G/3.99G [00:37<00:04, 146MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 80% 3.15G/3.91G [00:37<00:05, 131MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 83% 3.32G/3.99G [00:37<00:04, 151MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 81% 3.17G/3.91G [00:37<00:05, 140MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 84% 3.34G/3.99G [00:37<00:04, 153MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 82% 3.19G/3.91G [00:37<00:05, 135MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 84% 3.37G/3.99G [00:37<00:04, 149MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 82% 3.21G/3.91G [00:37<00:05, 137MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 85% 3.39G/3.99G [00:37<00:04, 149MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 83% 3.23G/3.91G [00:37<00:05, 130MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 85% 3.41G/3.99G [00:37<00:04, 139MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 83% 3.25G/3.91G [00:38<00:04, 135MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 84% 3.27G/3.91G [00:38<00:04, 130MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 86% 3.43G/3.99G [00:38<00:04, 117MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 86% 3.45G/3.99G [00:38<00:04, 124MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 84% 3.29G/3.91G [00:38<00:04, 128MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 87% 3.47G/3.99G [00:38<00:03, 131MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 85% 3.31G/3.91G [00:38<00:04, 132MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 88% 3.49G/3.99G [00:38<00:03, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 85% 3.33G/3.91G [00:38<00:04, 127MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 88% 3.51G/3.99G [00:38<00:03, 131MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 86% 3.36G/3.91G [00:38<00:04, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 89% 3.53G/3.99G [00:38<00:03, 143MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 86% 3.38G/3.91G [00:39<00:04, 121MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 89% 3.55G/3.99G [00:39<00:03, 139MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 87% 3.40G/3.91G [00:39<00:04, 122MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 90% 3.58G/3.99G [00:39<00:03, 127MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 87% 3.42G/3.91G [00:39<00:04, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 90% 3.60G/3.99G [00:39<00:02, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 88% 3.44G/3.91G [00:39<00:04, 112MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 91% 3.62G/3.99G [00:39<00:03, 116MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 88% 3.46G/3.91G [00:39<00:04, 109MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 91% 3.64G/3.99G [00:39<00:02, 117MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 89% 3.48G/3.91G [00:40<00:03, 108MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 92% 3.66G/3.99G [00:40<00:02, 112MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 90% 3.50G/3.91G [00:40<00:03, 106MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 92% 3.68G/3.99G [00:40<00:02, 108MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 90% 3.52G/3.91G [00:40<00:03, 110MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 93% 3.70G/3.99G [00:40<00:02, 111MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 91% 3.54G/3.91G [00:40<00:03, 108MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 93% 3.72G/3.99G [00:40<00:02, 104MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 91% 3.57G/3.91G [00:40<00:03, 101MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 94% 3.74G/3.99G [00:40<00:02, 98.5MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 91% 3.58G/3.91G [00:40<00:03, 101MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 94% 3.76G/3.99G [00:41<00:02, 103MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 92% 3.59G/3.91G [00:41<00:04, 75.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 95% 3.79G/3.99G [00:41<00:02, 99.6MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 92% 3.61G/3.91G [00:41<00:03, 84.1MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 95% 3.81G/3.99G [00:41<00:01, 105MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 93% 3.63G/3.91G [00:41<00:02, 95.9MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 96% 3.83G/3.99G [00:41<00:01, 112MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 93% 3.65G/3.91G [00:41<00:02, 107MB/s] \u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 96% 3.85G/3.99G [00:41<00:01, 122MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 94% 3.67G/3.91G [00:41<00:02, 108MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 97% 3.87G/3.99G [00:41<00:01, 110MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 94% 3.69G/3.91G [00:42<00:01, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 98% 3.89G/3.99G [00:42<00:00, 106MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 95% 3.71G/3.91G [00:42<00:01, 111MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 98% 3.91G/3.99G [00:42<00:00, 112MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 95% 3.73G/3.91G [00:42<00:01, 115MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 99% 3.93G/3.99G [00:42<00:00, 112MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 96% 3.75G/3.91G [00:42<00:01, 117MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 99% 3.95G/3.99G [00:42<00:00, 117MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 97% 3.77G/3.91G [00:42<00:01, 119MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 100% 3.97G/3.99G [00:42<00:00, 114MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 97% 3.80G/3.91G [00:42<00:00, 116MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model-00001-of-00002.safetensors: 100% 3.99G/3.99G [00:43<00:00, 92.7MB/s]\n",
+ "Fetching 12 files: 58% 7/12 [00:43<00:36, 7.32s/it]\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 98% 3.82G/3.91G [00:43<00:00, 120MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 98% 3.84G/3.91G [00:43<00:00, 131MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 99% 3.86G/3.91G [00:43<00:00, 145MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 99% 3.88G/3.91G [00:43<00:00, 152MB/s]\u001b[A\u001b[A\n",
+ "\n",
+ "model-00002-of-00002.safetensors: 100% 3.91G/3.91G [00:43<00:00, 89.5MB/s]\n",
+ "Fetching 12 files: 100% 12/12 [00:43<00:00, 3.66s/it]\n",
+ "Fetching 10 files: 0% 0/10 [00:00, ?it/s]\n",
+ "config.json: 100% 662/662 [00:00<00:00, 5.97MB/s]\n",
+ "\n",
+ "tokenizer.json: 0% 0.00/7.03M [00:00, ?B/s]\u001b[A\n",
+ "\n",
+ "merges.txt: 0% 0.00/1.67M [00:00, ?B/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ ".gitattributes: 100% 1.52k/1.52k [00:00<00:00, 13.2MB/s]\n",
+ "Fetching 10 files: 10% 1/10 [00:00<00:01, 5.07it/s]\n",
+ "\n",
+ "\n",
+ "generation_config.json: 100% 206/206 [00:00<00:00, 1.80MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "LICENSE: 100% 7.28k/7.28k [00:00<00:00, 29.3MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "README.md: 100% 4.28k/4.28k [00:00<00:00, 25.1MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "merges.txt: 100% 1.67M/1.67M [00:00<00:00, 18.4MB/s]\n",
+ "\n",
+ "\n",
+ "tokenizer_config.json: 100% 1.29k/1.29k [00:00<00:00, 8.89MB/s]\n",
+ "\n",
+ "\n",
+ "vocab.json: 0% 0.00/2.78M [00:00, ?B/s]\u001b[A\u001b[A\n",
+ "tokenizer.json: 100% 7.03M/7.03M [00:00<00:00, 34.6MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "model.safetensors: 0% 10.5M/3.67G [00:00<00:56, 64.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 1% 31.5M/3.67G [00:00<00:31, 117MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 1% 52.4M/3.67G [00:00<00:25, 140MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "vocab.json: 100% 2.78M/2.78M [00:00<00:00, 7.02MB/s]\n",
+ "\n",
+ "\n",
+ "\n",
+ "model.safetensors: 2% 73.4M/3.67G [00:00<00:22, 161MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 3% 105M/3.67G [00:00<00:19, 181MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 3% 126M/3.67G [00:00<00:18, 189MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 4% 147M/3.67G [00:00<00:18, 191MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 5% 178M/3.67G [00:01<00:16, 208MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 5% 199M/3.67G [00:01<00:16, 208MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 6% 231M/3.67G [00:01<00:16, 212MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 7% 262M/3.67G [00:01<00:15, 216MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 8% 294M/3.67G [00:01<00:16, 211MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 9% 325M/3.67G [00:01<00:15, 222MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 10% 357M/3.67G [00:01<00:14, 225MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 11% 388M/3.67G [00:01<00:14, 229MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 11% 419M/3.67G [00:02<00:14, 227MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 12% 451M/3.67G [00:02<00:14, 218MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 13% 482M/3.67G [00:02<00:14, 214MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 14% 514M/3.67G [00:02<00:14, 212MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 15% 545M/3.67G [00:02<00:14, 214MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 16% 577M/3.67G [00:02<00:13, 225MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 17% 608M/3.67G [00:02<00:13, 227MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 17% 640M/3.67G [00:03<00:13, 218MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 18% 671M/3.67G [00:03<00:14, 214MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 19% 703M/3.67G [00:03<00:13, 220MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 20% 734M/3.67G [00:05<01:09, 42.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 21% 765M/3.67G [00:05<00:51, 56.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 22% 797M/3.67G [00:05<00:39, 73.3MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 23% 828M/3.67G [00:05<00:30, 93.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 23% 860M/3.67G [00:06<00:25, 110MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 24% 891M/3.67G [00:06<00:22, 125MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 25% 923M/3.67G [00:06<00:19, 141MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 26% 954M/3.67G [00:06<00:17, 158MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 27% 986M/3.67G [00:06<00:15, 173MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 28% 1.02G/3.67G [00:06<00:14, 188MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 29% 1.05G/3.67G [00:06<00:13, 195MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 29% 1.08G/3.67G [00:07<00:12, 201MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 30% 1.11G/3.67G [00:07<00:12, 210MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 31% 1.14G/3.67G [00:07<00:11, 218MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 32% 1.17G/3.67G [00:07<00:11, 221MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 33% 1.21G/3.67G [00:07<00:10, 228MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 34% 1.24G/3.67G [00:07<00:10, 231MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 35% 1.27G/3.67G [00:07<00:10, 224MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 35% 1.30G/3.67G [00:08<00:11, 209MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 36% 1.33G/3.67G [00:08<00:11, 212MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 37% 1.36G/3.67G [00:08<00:13, 176MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 38% 1.39G/3.67G [00:08<00:12, 189MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 39% 1.43G/3.67G [00:08<00:11, 194MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 40% 1.46G/3.67G [00:08<00:10, 202MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 40% 1.48G/3.67G [00:09<00:11, 199MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 41% 1.51G/3.67G [00:09<00:10, 210MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 42% 1.54G/3.67G [00:09<00:10, 208MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 43% 1.57G/3.67G [00:09<00:09, 219MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 44% 1.60G/3.67G [00:09<00:09, 212MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 45% 1.64G/3.67G [00:09<00:09, 217MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 45% 1.67G/3.67G [00:09<00:09, 213MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 46% 1.70G/3.67G [00:10<00:09, 198MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 47% 1.72G/3.67G [00:10<00:14, 133MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 47% 1.74G/3.67G [00:10<00:14, 130MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 48% 1.76G/3.67G [00:10<00:13, 138MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 49% 1.78G/3.67G [00:10<00:14, 128MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 49% 1.80G/3.67G [00:11<00:13, 137MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 50% 1.82G/3.67G [00:11<00:13, 135MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 50% 1.85G/3.67G [00:11<00:12, 149MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 51% 1.87G/3.67G [00:11<00:18, 99.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 51% 1.89G/3.67G [00:11<00:18, 97.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 52% 1.91G/3.67G [00:12<00:16, 107MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 53% 1.93G/3.67G [00:12<00:16, 108MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 53% 1.95G/3.67G [00:12<00:14, 122MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 54% 1.97G/3.67G [00:12<00:12, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 54% 1.99G/3.67G [00:12<00:14, 114MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 55% 2.01G/3.67G [00:12<00:13, 125MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 55% 2.03G/3.67G [00:12<00:11, 140MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 56% 2.06G/3.67G [00:13<00:14, 115MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 57% 2.08G/3.67G [00:13<00:12, 132MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 57% 2.11G/3.67G [00:13<00:10, 156MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 58% 2.14G/3.67G [00:13<00:08, 176MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 59% 2.16G/3.67G [00:13<00:08, 183MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 59% 2.18G/3.67G [00:14<00:13, 114MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 60% 2.21G/3.67G [00:14<00:10, 141MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 61% 2.24G/3.67G [00:14<00:08, 166MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 62% 2.28G/3.67G [00:14<00:07, 185MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 63% 2.31G/3.67G [00:14<00:07, 188MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 63% 2.33G/3.67G [00:14<00:07, 192MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 64% 2.36G/3.67G [00:14<00:06, 199MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 65% 2.39G/3.67G [00:15<00:06, 205MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 66% 2.42G/3.67G [00:15<00:05, 209MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 67% 2.45G/3.67G [00:15<00:05, 207MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 68% 2.49G/3.67G [00:15<00:05, 212MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 69% 2.52G/3.67G [00:15<00:05, 210MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 69% 2.55G/3.67G [00:15<00:05, 213MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 70% 2.58G/3.67G [00:15<00:04, 222MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 71% 2.61G/3.67G [00:16<00:04, 226MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 72% 2.64G/3.67G [00:16<00:04, 225MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 73% 2.67G/3.67G [00:16<00:04, 221MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 74% 2.71G/3.67G [00:16<00:04, 217MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 74% 2.74G/3.67G [00:19<00:33, 28.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 75% 2.77G/3.67G [00:19<00:23, 38.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 76% 2.80G/3.67G [00:20<00:17, 51.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 77% 2.83G/3.67G [00:20<00:12, 67.5MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 78% 2.86G/3.67G [00:20<00:11, 71.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 78% 2.88G/3.67G [00:20<00:09, 82.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 79% 2.90G/3.67G [00:20<00:07, 96.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 80% 2.93G/3.67G [00:20<00:06, 111MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 80% 2.96G/3.67G [00:21<00:05, 135MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 81% 2.99G/3.67G [00:21<00:04, 159MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 82% 3.02G/3.67G [00:21<00:03, 167MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 83% 3.05G/3.67G [00:21<00:03, 184MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 84% 3.08G/3.67G [00:21<00:03, 193MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 85% 3.11G/3.67G [00:21<00:02, 206MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 86% 3.15G/3.67G [00:21<00:02, 215MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 86% 3.18G/3.67G [00:22<00:02, 213MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 87% 3.21G/3.67G [00:22<00:02, 210MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 88% 3.24G/3.67G [00:22<00:02, 209MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 89% 3.27G/3.67G [00:22<00:01, 210MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 90% 3.30G/3.67G [00:22<00:01, 215MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 91% 3.33G/3.67G [00:22<00:01, 206MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 91% 3.36G/3.67G [00:22<00:01, 196MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 92% 3.38G/3.67G [00:23<00:01, 184MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 92% 3.40G/3.67G [00:23<00:01, 181MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 93% 3.42G/3.67G [00:23<00:01, 181MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 94% 3.44G/3.67G [00:26<00:09, 25.4MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 94% 3.47G/3.67G [00:26<00:05, 38.1MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 95% 3.50G/3.67G [00:26<00:03, 54.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 96% 3.53G/3.67G [00:26<00:01, 73.0MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 97% 3.57G/3.67G [00:26<00:01, 90.8MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 98% 3.59G/3.67G [00:26<00:00, 95.2MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 98% 3.61G/3.67G [00:26<00:00, 110MB/s] \u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 99% 3.64G/3.67G [00:26<00:00, 133MB/s]\u001b[A\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "model.safetensors: 100% 3.67G/3.67G [00:27<00:00, 135MB/s]\n",
+ "Fetching 10 files: 100% 10/10 [00:27<00:00, 2.74s/it]\n",
+ "Fetching 7 files: 0% 0/7 [00:00, ?it/s]\n",
+ "model.bin: 0% 0.00/3.09G [00:00, ?B/s]\u001b[A\n",
+ "\n",
+ "preprocessor_config.json: 100% 340/340 [00:00<00:00, 2.62MB/s]\n",
+ "\n",
+ "\n",
+ "README.md: 100% 2.05k/2.05k [00:00<00:00, 12.2MB/s]\n",
+ "\n",
+ "\n",
+ "vocabulary.json: 0% 0.00/1.07M [00:00, ?B/s]\u001b[A\u001b[A\n",
+ "\n",
+ "\n",
+ "config.json: 100% 2.39k/2.39k [00:00<00:00, 17.7MB/s]\n",
+ "\n",
+ "model.bin: 1% 21.0M/3.09G [00:00<00:19, 156MB/s]\u001b[A\n",
+ "\n",
+ "\n",
+ ".gitattributes: 100% 1.52k/1.52k [00:00<00:00, 9.47MB/s]\n",
+ "Fetching 7 files: 14% 1/7 [00:00<00:01, 3.83it/s]\n",
+ "\n",
+ "\n",
+ "tokenizer.json: 0% 0.00/2.48M [00:00, ?B/s]\u001b[A\u001b[A\u001b[A\n",
+ "model.bin: 2% 52.4M/3.09G [00:00<00:15, 200MB/s]\u001b[A\n",
+ "\n",
+ "vocabulary.json: 100% 1.07M/1.07M [00:00<00:00, 4.36MB/s]\n",
+ "\n",
+ "model.bin: 3% 83.9M/3.09G [00:00<00:13, 223MB/s]\u001b[A\n",
+ "model.bin: 4% 115M/3.09G [00:00<00:13, 216MB/s] \u001b[A\n",
+ "\n",
+ "\n",
+ "tokenizer.json: 100% 2.48M/2.48M [00:00<00:00, 6.90MB/s]\n",
+ "\n",
+ "model.bin: 5% 147M/3.09G [00:00<00:14, 208MB/s]\u001b[A\n",
+ "model.bin: 6% 178M/3.09G [00:00<00:13, 217MB/s]\u001b[A\n",
+ "model.bin: 7% 210M/3.09G [00:02<00:45, 63.6MB/s]\u001b[A\n",
+ "model.bin: 8% 241M/3.09G [00:02<00:34, 82.6MB/s]\u001b[A\n",
+ "model.bin: 9% 273M/3.09G [00:02<00:27, 104MB/s] \u001b[A\n",
+ "model.bin: 10% 304M/3.09G [00:02<00:21, 127MB/s]\u001b[A\n",
+ "model.bin: 11% 336M/3.09G [00:02<00:22, 124MB/s]\u001b[A\n",
+ "model.bin: 12% 357M/3.09G [00:02<00:22, 122MB/s]\u001b[A\n",
+ "model.bin: 12% 377M/3.09G [00:03<00:22, 120MB/s]\u001b[A\n",
+ "model.bin: 13% 398M/3.09G [00:03<00:22, 121MB/s]\u001b[A\n",
+ "model.bin: 14% 419M/3.09G [00:03<00:19, 136MB/s]\u001b[A\n",
+ "model.bin: 15% 451M/3.09G [00:03<00:16, 157MB/s]\u001b[A\n",
+ "model.bin: 16% 482M/3.09G [00:03<00:15, 173MB/s]\u001b[A\n",
+ "model.bin: 17% 514M/3.09G [00:03<00:13, 189MB/s]\u001b[A\n",
+ "model.bin: 18% 545M/3.09G [00:03<00:12, 204MB/s]\u001b[A\n",
+ "model.bin: 19% 577M/3.09G [00:04<00:12, 197MB/s]\u001b[A\n",
+ "model.bin: 19% 598M/3.09G [00:04<00:12, 199MB/s]\u001b[A\n",
+ "model.bin: 20% 619M/3.09G [00:04<00:12, 201MB/s]\u001b[A\n",
+ "model.bin: 21% 650M/3.09G [00:04<00:11, 214MB/s]\u001b[A\n",
+ "model.bin: 22% 682M/3.09G [00:06<01:08, 35.2MB/s]\u001b[A\n",
+ "model.bin: 23% 713M/3.09G [00:06<00:49, 48.2MB/s]\u001b[A\n",
+ "model.bin: 24% 734M/3.09G [00:07<00:45, 52.1MB/s]\u001b[A\n",
+ "model.bin: 25% 765M/3.09G [00:07<00:33, 69.8MB/s]\u001b[A\n",
+ "model.bin: 26% 797M/3.09G [00:07<00:26, 87.3MB/s]\u001b[A\n",
+ "model.bin: 26% 818M/3.09G [00:07<00:23, 96.6MB/s]\u001b[A\n",
+ "model.bin: 27% 839M/3.09G [00:07<00:20, 108MB/s] \u001b[A\n",
+ "model.bin: 28% 860M/3.09G [00:07<00:18, 123MB/s]\u001b[A\n",
+ "model.bin: 29% 891M/3.09G [00:08<00:15, 145MB/s]\u001b[A\n",
+ "model.bin: 30% 923M/3.09G [00:08<00:12, 167MB/s]\u001b[A\n",
+ "model.bin: 31% 954M/3.09G [00:08<00:11, 184MB/s]\u001b[A\n",
+ "model.bin: 32% 986M/3.09G [00:08<00:12, 170MB/s]\u001b[A\n",
+ "model.bin: 33% 1.01G/3.09G [00:08<00:12, 169MB/s]\u001b[A\n",
+ "model.bin: 34% 1.04G/3.09G [00:08<00:11, 175MB/s]\u001b[A\n",
+ "model.bin: 34% 1.06G/3.09G [00:09<00:13, 152MB/s]\u001b[A\n",
+ "model.bin: 35% 1.08G/3.09G [00:09<00:27, 73.1MB/s]\u001b[A\n",
+ "model.bin: 36% 1.11G/3.09G [00:09<00:20, 95.3MB/s]\u001b[A\n",
+ "model.bin: 37% 1.14G/3.09G [00:10<00:16, 120MB/s] \u001b[A\n",
+ "model.bin: 38% 1.17G/3.09G [00:10<00:13, 144MB/s]\u001b[A\n",
+ "model.bin: 39% 1.21G/3.09G [00:10<00:11, 165MB/s]\u001b[A\n",
+ "model.bin: 40% 1.24G/3.09G [00:10<00:12, 144MB/s]\u001b[A\n",
+ "model.bin: 41% 1.26G/3.09G [00:10<00:11, 153MB/s]\u001b[A\n",
+ "model.bin: 42% 1.29G/3.09G [00:10<00:10, 172MB/s]\u001b[A\n",
+ "model.bin: 42% 1.31G/3.09G [00:10<00:09, 178MB/s]\u001b[A\n",
+ "model.bin: 43% 1.34G/3.09G [00:11<00:08, 194MB/s]\u001b[A\n",
+ "model.bin: 44% 1.37G/3.09G [00:11<00:08, 208MB/s]\u001b[A\n",
+ "model.bin: 46% 1.41G/3.09G [00:11<00:07, 215MB/s]\u001b[A\n",
+ "model.bin: 47% 1.44G/3.09G [00:11<00:07, 218MB/s]\u001b[A\n",
+ "model.bin: 48% 1.47G/3.09G [00:11<00:07, 222MB/s]\u001b[A\n",
+ "model.bin: 49% 1.50G/3.09G [00:11<00:07, 226MB/s]\u001b[A\n",
+ "model.bin: 50% 1.53G/3.09G [00:11<00:06, 227MB/s]\u001b[A\n",
+ "model.bin: 51% 1.56G/3.09G [00:12<00:06, 227MB/s]\u001b[A\n",
+ "model.bin: 52% 1.59G/3.09G [00:12<00:06, 229MB/s]\u001b[A\n",
+ "model.bin: 53% 1.63G/3.09G [00:12<00:06, 240MB/s]\u001b[A\n",
+ "model.bin: 54% 1.66G/3.09G [00:12<00:05, 242MB/s]\u001b[A\n",
+ "model.bin: 55% 1.69G/3.09G [00:12<00:05, 235MB/s]\u001b[A\n",
+ "model.bin: 56% 1.72G/3.09G [00:12<00:05, 242MB/s]\u001b[A\n",
+ "model.bin: 57% 1.75G/3.09G [00:12<00:05, 228MB/s]\u001b[A\n",
+ "model.bin: 58% 1.78G/3.09G [00:12<00:05, 228MB/s]\u001b[A\n",
+ "model.bin: 59% 1.81G/3.09G [00:13<00:05, 222MB/s]\u001b[A\n",
+ "model.bin: 60% 1.85G/3.09G [00:13<00:08, 152MB/s]\u001b[A\n",
+ "model.bin: 61% 1.88G/3.09G [00:13<00:07, 172MB/s]\u001b[A\n",
+ "model.bin: 62% 1.91G/3.09G [00:13<00:06, 191MB/s]\u001b[A\n",
+ "model.bin: 63% 1.94G/3.09G [00:13<00:05, 205MB/s]\u001b[A\n",
+ "model.bin: 64% 1.97G/3.09G [00:13<00:05, 215MB/s]\u001b[A\n",
+ "model.bin: 65% 2.00G/3.09G [00:14<00:05, 215MB/s]\u001b[A\n",
+ "model.bin: 66% 2.03G/3.09G [00:14<00:04, 220MB/s]\u001b[A\n",
+ "model.bin: 67% 2.07G/3.09G [00:14<00:04, 226MB/s]\u001b[A\n",
+ "model.bin: 68% 2.10G/3.09G [00:19<00:47, 20.8MB/s]\u001b[A\n",
+ "model.bin: 69% 2.13G/3.09G [00:19<00:33, 28.8MB/s]\u001b[A\n",
+ "model.bin: 70% 2.16G/3.09G [00:19<00:23, 39.2MB/s]\u001b[A\n",
+ "model.bin: 71% 2.19G/3.09G [00:19<00:17, 52.1MB/s]\u001b[A\n",
+ "model.bin: 72% 2.22G/3.09G [00:19<00:13, 65.4MB/s]\u001b[A\n",
+ "model.bin: 73% 2.24G/3.09G [00:19<00:11, 75.0MB/s]\u001b[A\n",
+ "model.bin: 73% 2.26G/3.09G [00:19<00:09, 87.3MB/s]\u001b[A\n",
+ "model.bin: 74% 2.29G/3.09G [00:20<00:08, 99.5MB/s]\u001b[A\n",
+ "model.bin: 75% 2.32G/3.09G [00:20<00:06, 123MB/s] \u001b[A\n",
+ "model.bin: 76% 2.35G/3.09G [00:20<00:05, 146MB/s]\u001b[A\n",
+ "model.bin: 77% 2.38G/3.09G [00:20<00:04, 163MB/s]\u001b[A\n",
+ "model.bin: 78% 2.41G/3.09G [00:20<00:03, 173MB/s]\u001b[A\n",
+ "model.bin: 79% 2.43G/3.09G [00:20<00:03, 170MB/s]\u001b[A\n",
+ "model.bin: 79% 2.45G/3.09G [00:20<00:03, 173MB/s]\u001b[A\n",
+ "model.bin: 80% 2.47G/3.09G [00:21<00:03, 158MB/s]\u001b[A\n",
+ "model.bin: 81% 2.50G/3.09G [00:21<00:06, 93.1MB/s]\u001b[A\n",
+ "model.bin: 82% 2.52G/3.09G [00:22<00:10, 56.1MB/s]\u001b[A\n",
+ "model.bin: 82% 2.54G/3.09G [00:22<00:07, 69.8MB/s]\u001b[A\n",
+ "model.bin: 83% 2.57G/3.09G [00:22<00:05, 94.2MB/s]\u001b[A\n",
+ "model.bin: 84% 2.60G/3.09G [00:22<00:04, 118MB/s] \u001b[A\n",
+ "model.bin: 85% 2.63G/3.09G [00:22<00:03, 139MB/s]\u001b[A\n",
+ "model.bin: 86% 2.66G/3.09G [00:22<00:02, 157MB/s]\u001b[A\n",
+ "model.bin: 87% 2.68G/3.09G [00:23<00:02, 166MB/s]\u001b[A\n",
+ "model.bin: 88% 2.72G/3.09G [00:23<00:02, 183MB/s]\u001b[A\n",
+ "model.bin: 89% 2.75G/3.09G [00:23<00:01, 198MB/s]\u001b[A\n",
+ "model.bin: 90% 2.78G/3.09G [00:23<00:01, 209MB/s]\u001b[A\n",
+ "model.bin: 91% 2.81G/3.09G [00:23<00:01, 207MB/s]\u001b[A\n",
+ "model.bin: 92% 2.84G/3.09G [00:23<00:01, 212MB/s]\u001b[A\n",
+ "model.bin: 93% 2.87G/3.09G [00:23<00:01, 212MB/s]\u001b[A\n",
+ "model.bin: 94% 2.90G/3.09G [00:24<00:00, 215MB/s]\u001b[A\n",
+ "model.bin: 95% 2.94G/3.09G [00:24<00:00, 213MB/s]\u001b[A\n",
+ "model.bin: 96% 2.97G/3.09G [00:24<00:00, 218MB/s]\u001b[A\n",
+ "model.bin: 97% 3.00G/3.09G [00:24<00:00, 215MB/s]\u001b[A\n",
+ "model.bin: 98% 3.03G/3.09G [00:24<00:00, 213MB/s]\u001b[A\n",
+ "model.bin: 99% 3.06G/3.09G [00:24<00:00, 222MB/s]\u001b[A\n",
+ "model.bin: 100% 3.09G/3.09G [00:24<00:00, 124MB/s]\n",
+ "Fetching 7 files: 100% 7/7 [00:25<00:00, 3.58s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 下载 wav2vec2 模型并保存到指定路径,如果文件已经存在,则跳过下载\n",
+ "%mkdir -p models/ASR/whisper & wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \\\n",
+ " -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth\n",
+ "\n",
+ "!python scripts/huggingface_download.py"
+ ]
+ },
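+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The fetch output above appears to come from `scripts/huggingface_download.py`. As an optional, rough sketch of what that step amounts to, the cell below fetches model snapshots with `huggingface_hub.snapshot_download` (assuming `huggingface_hub` is available in the environment). The repo IDs and target directories are assumptions inferred from the paths logged later (`models/ASR/whisper/faster-whisper-large-v3`, `models/TTS/XTTS-v2`, `models/LLM/Qwen1.5-4B-Chat`); the project script may fetch a different or larger set of models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sketch: download model snapshots directly with huggingface_hub.\n",
+ "# The repo IDs below are assumptions based on the local paths used by the pipeline;\n",
+ "# adjust them to match whatever scripts/huggingface_download.py actually fetches.\n",
+ "from huggingface_hub import snapshot_download\n",
+ "\n",
+ "models = {\n",
+ "    'Systran/faster-whisper-large-v3': 'models/ASR/whisper/faster-whisper-large-v3',\n",
+ "    'coqui/XTTS-v2': 'models/TTS/XTTS-v2',\n",
+ "    'Qwen/Qwen1.5-4B-Chat': 'models/LLM/Qwen1.5-4B-Chat',\n",
+ "}\n",
+ "for repo_id, local_dir in models.items():\n",
+ "    snapshot_download(repo_id=repo_id, local_dir=local_dir)"
+ ]
+ },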
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SasAZMzcUw6y"
+ },
+ "source": [
+ "# launch WebUI 启动WebUI"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WH5Hh1ZeaQ5X"
+ },
+ "source": [
+ "首先将 `env.example`填入环境变量并 改名为 `.env`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "uciAy69HTS9S"
+ },
+ "outputs": [],
+ "source": [
+ "%cp env.example .env"
+ ]
+ },
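+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you prefer to set the values from inside the notebook rather than editing `.env` by hand, a minimal sketch is shown below. The key name is a placeholder for illustration only; use the keys actually listed in `env.example`, and note that loading `.env` this way assumes `python-dotenv` is available in the environment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch: export configuration as environment variables from the notebook.\n",
+ "# 'HF_TOKEN' is a hypothetical example key; replace it with the keys from env.example.\n",
+ "import os\n",
+ "\n",
+ "os.environ['HF_TOKEN'] = 'your-token-here'\n",
+ "\n",
+ "# Or load everything you wrote into .env (assumes python-dotenv is installed):\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv('.env')"
+ ]
+ },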
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2bAjUCmeaLf-",
+ "outputId": "d34b2f41-0f8d-4cfe-b3c4-da62c4b1364b"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/Linly-Dubbing\n",
+ "The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.\n",
+ "0it [00:00, ?it/s]\n",
+ "2024-08-19 19:05:21.916936: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+ "2024-08-19 19:05:22.191129: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+ "2024-08-19 19:05:22.265133: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+ "2024-08-19 19:05:24.998318: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+ "/usr/local/lib/python3.10/dist-packages/pyannote/audio/core/io.py:43: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call.\n",
+ " torchaudio.set_audio_backend(\"soundfile\")\n",
+ "failed to import ttsfrd, use WeTextProcessing instead\n",
+ "\u001b[32m2024-08-19 19:05:55.796\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mload_model\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mLoading Demucs model: htdemucs_ft\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:05:55.797\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mload_model\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1mLoading TTS model from models/TTS/XTTS-v2\u001b[0m\n",
+ "Loading TTS model from models/TTS/XTTS-v2\n",
+ "\u001b[32m2024-08-19 19:05:55.798\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step021_asr_whisperx\u001b[0m:\u001b[36mload_whisper_model\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mLoading WhisperX model: models/ASR/whisper/faster-whisper-large-v3\u001b[0m\n",
+ "Downloading: \"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th\" to /root/.cache/torch/hub/checkpoints/f7e0c4bc-ba3fe64a.th\n",
+ " 2% 1.50M/80.2M [00:00<00:05, 15.6MB/s] > Using model: xtts\n",
+ "100% 80.2M/80.2M [00:01<00:00, 74.9MB/s]\n",
+ "No language specified, language will be first be detected for each audio file (increases inference time).\n",
+ "Downloading: \"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th\" to /root/.cache/torch/hub/checkpoints/d12395a8-e57c48e6.th\n",
+ " 0% 0.00/80.2M [00:00, ?B/s]\n",
+ " 15% 11.9M/80.2M [00:00<00:01, 65.2MB/s]\n",
+ " 28% 22.8M/80.2M [00:00<00:00, 86.9MB/s]\n",
+ " 44% 35.0M/80.2M [00:00<00:00, 103MB/s] \n",
+ " 60% 47.8M/80.2M [00:00<00:00, 114MB/s]\n",
+ " 90% 71.9M/80.2M [00:00<00:00, 121MB/s]\n",
+ "100% 80.2M/80.2M [00:00<00:00, 109MB/s]\n",
+ "\n",
+ " 7%|██▋ | 1.25M/16.9M [00:00<00:05, 2.74MiB/s]\u001b[A\n",
+ " 15%|█████▌ | 2.55M/16.9M [00:00<00:02, 5.34MiB/s]\u001b[A\n",
+ " 30%|███████████▏ | 5.12M/16.9M [00:01<00:01, 10.1MiB/s]\u001b[A\n",
+ " 41%|███████████████ | 6.86M/16.9M [00:01<00:00, 11.7MiB/s]\u001b[A\n",
+ " 56%|████████████████████▊ | 9.53M/16.9M [00:01<00:00, 14.9MiB/s]\u001b[A\n",
+ " 74%|███████████████████████████▎ | 12.5M/16.9M [00:01<00:00, 17.8MiB/s]\u001b[A\n",
+ "100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 11.5MiB/s]\n",
+ "Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../root/.cache/torch/whisperx-vad-segmentation.bin`\n",
+ "Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
+ "Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.\n",
+ "\u001b[32m2024-08-19 19:06:03.925\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step021_asr_whisperx\u001b[0m:\u001b[36mload_whisper_model\u001b[0m:\u001b[36m40\u001b[0m - \u001b[1mLoaded WhisperX model: models/ASR/whisper/faster-whisper-large-v3 in 8.13s\u001b[0m\n",
+ "Downloading: \"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th\" to /root/.cache/torch/hub/checkpoints/92cfc3b6-ef3bcb9c.th\n",
+ "100% 80.2M/80.2M [00:01<00:00, 51.2MB/s]\n",
+ "\u001b[32m2024-08-19 19:06:12.547\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step021_asr_whisperx\u001b[0m:\u001b[36mload_align_model\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1mLoaded alignment model: en in 8.62s\u001b[0m\n",
+ "Downloading: \"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th\" to /root/.cache/torch/hub/checkpoints/04573f0d-f3cf25b2.th\n",
+ "100% 80.2M/80.2M [00:00<00:00, 115MB/s]\n",
+ "\u001b[32m2024-08-19 19:06:15.009\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mload_model\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1mDemucs model loaded in 19.21 seconds\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:36.627\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mload_model\u001b[0m:\u001b[36m35\u001b[0m - \u001b[1mTTS model loaded in 40.83s\u001b[0m\n",
+ "[BiliBili] Extracting URL: https://www.bilibili.com/video/BV1kr421M7vz/\n",
+ "[BiliBili] 1kr421M7vz: Downloading webpage\n",
+ "[BiliBili] BV1kr421M7vz: Extracting videos in anthology\n",
+ "[BiliBili] Format(s) 4K 超清, 1080P 高码率, 1080P 高清, 720P 高清 are missing; you have to login or become a premium member to download them. Use --cookies-from-browser or --cookies for the authentication. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies\n",
+ "[BiliBili] 1406337061: Extracting chapters\n",
+ "[BiliBili] Extracting URL: https://www.bilibili.com/video/BV1kr421M7vz/\n",
+ "[BiliBili] 1kr421M7vz: Downloading webpage\n",
+ "[BiliBili] BV1kr421M7vz: Extracting videos in anthology\n",
+ "[BiliBili] Format(s) 4K 超清, 1080P 高码率, 1080P 高清, 720P 高清 are missing; you have to login or become a premium member to download them. Use --cookies-from-browser or --cookies for the authentication. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies\n",
+ "[BiliBili] 1406337061: Extracting chapters\n",
+ "[info] BV1kr421M7vz: Downloading 1 format(s): 100110+30280\n",
+ "[info] Downloading video thumbnail 0 ...\n",
+ "[info] Writing video thumbnail 0 to: videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.jpg\n",
+ "[info] Writing video metadata as JSON to: videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.info.json\n",
+ "[download] Destination: videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.f100110.mp4\n",
+ "\u001b[K[download] 100% of 974.42KiB in \u001b[1;37m00:00:00\u001b[0m at \u001b[0;32m1.03MiB/s\u001b[0m\n",
+ "[download] Destination: videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.f30280.m4a\n",
+ "\u001b[K[download] 100% of 435.66KiB in \u001b[1;37m00:00:00\u001b[0m at \u001b[0;32m1.98MiB/s\u001b[0m\n",
+ "[Merger] Merging formats into \"videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.mp4\"\n",
+ "Deleting original file videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.f100110.mp4 (pass -k to keep)\n",
+ "Deleting original file videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.f30280.m4a (pass -k to keep)\n",
+ "\u001b[32m2024-08-19 19:06:43.001\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step000_video_downloader\u001b[0m:\u001b[36mdownload_single_video\u001b[0m:\u001b[36m53\u001b[0m - \u001b[1mVideo downloaded in videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:43.001\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mprocess_video\u001b[0m:\u001b[36m43\u001b[0m - \u001b[1mProcess video in videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:43.002\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mextract_audio_from_video\u001b[0m:\u001b[36m90\u001b[0m - \u001b[1mExtracting audio from videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:44.161\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mextract_audio_from_video\u001b[0m:\u001b[36m98\u001b[0m - \u001b[1mAudio extracted from videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:44.161\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mseparate_audio\u001b[0m:\u001b[36m47\u001b[0m - \u001b[1mSeparating audio from videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:44.161\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mload_model\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mLoading Demucs model: htdemucs_ft\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:06:47.483\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mload_model\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1mDemucs model loaded in 3.32 seconds\u001b[0m\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:03<00:00, 4.65seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.42seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.54seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 27.10seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.45seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.63seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.53seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.50seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.72seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.65seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.42seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.28seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.48seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.13seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.10seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 25.65seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 25.85seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 26.01seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 25.97seconds/s]\n",
+ "100%|██████████████████████████████████████████████| 17.549999999999997/17.549999999999997 [00:00<00:00, 25.99seconds/s]\n",
+ "\u001b[32m2024-08-19 19:07:05.511\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mseparate_audio\u001b[0m:\u001b[36m59\u001b[0m - \u001b[1mAudio separated in 18.03 seconds\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:07:05.521\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mseparate_audio\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mVocals saved to videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_vocals.wav\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:07:05.528\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mseparate_audio\u001b[0m:\u001b[36m79\u001b[0m - \u001b[1mInstruments saved to videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_instruments.wav\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:07:05.528\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step010_demucs_vr\u001b[0m:\u001b[36mseparate_all_audio_under_folder\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mAll audio separated under videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:07:05.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step020_asr\u001b[0m:\u001b[36mtranscribe_audio\u001b[0m:\u001b[36m70\u001b[0m - \u001b[1mTranscribing videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_vocals.wav\u001b[0m\n",
+ "Warning: audio is shorter than 30s, language detection may be inaccurate.\n",
+ "Detected language: en (0.87) in first 30s of audio...\n",
+ "\u001b[32m2024-08-19 19:07:08.543\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step020_asr\u001b[0m:\u001b[36mtranscribe_audio\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mTranscribed videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_vocals.wav successfully, and saved to videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/transcript.json\u001b[0m\n",
+ "Loading checkpoint shards: 100% 2/2 [00:32<00:00, 16.31s/it]\n",
+ "WARNING:accelerate.big_modeling:Some parameters are on the meta device device because they were offloaded to the cpu.\n",
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+ "Finish Load model models/LLM/Qwen1.5-4B-Chat\n",
+ "\u001b[32m2024-08-19 19:08:10.205\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36msummarize\u001b[0m:\u001b[36m156\u001b[0m - \u001b[1m{ \"title\": \"(英文无字幕) 阿里这小子在水城威尼斯发来问候\" , \"summary\": \"\" }Note: The summary is empty as there is no information available in the provided content.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:10.206\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36msummarize\u001b[0m:\u001b[36m172\u001b[0m - \u001b[33m\u001b[1m总结失败\n",
+ "Invalid summary\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:39.376\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36msummarize\u001b[0m:\u001b[36m156\u001b[0m - \u001b[1m{\"title\": \"(英文无字幕) 阿里这小子在水城威尼斯发来问候\", \"summary\": \"作者为'村长台钓加拿大',视频中介绍了作者和搭档在威尼斯的旅行体验。他们在欣赏威尼斯的美景并尝试当地美食后表示喜爱。视频未提供具体的旅行细节或美食推荐。\"}\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:39.377\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36msummarize\u001b[0m:\u001b[36m186\u001b[0m - \u001b[1m{'title': '(英文无字幕) 阿里这小子在水城威尼斯发来问候', 'summary': \"作者为'村长台钓加拿大',视频中介绍了作者和搭档在威尼斯的旅行体验。他们在欣赏威尼斯的美景并尝试当地美食后表示喜爱。视频未提供具体的旅行细节或美食推荐。\"}\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:43.439\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:Hello guys, how are you?\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:43.439\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:你好伙计们,你们好吗?\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:47.922\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:I'm in Venice now with my partner.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:47.922\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“我现在正在威尼斯和他的伴侣在一起。”\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:54.183\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:We're in Venice looking around the amazing streets.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:54.183\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“我们现在在威尼斯环游令人惊叹的街道。”\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:57.447\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:I love it.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:08:57.447\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“我喜欢它。”\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:01.254\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:It's perfect.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:01.255\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“太棒了。”\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:04.805\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:Look at that.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:04.805\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“看那!”\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:08.861\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:So nice.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:08.862\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“真好。”\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:15.255\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m250\u001b[0m - \u001b[1m原文:I can't wait to show you the pizza guys.\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:15.256\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step030_translation\u001b[0m:\u001b[36m_translate\u001b[0m:\u001b[36m251\u001b[0m - \u001b[1m译文:翻译:“我很期待向你们展示披萨男孩。”\u001b[0m\n",
+ "{'title': '(英文无字幕) 阿里这小子在水城威尼斯发来问候', 'author': '村长台钓加拿大', 'summary': \"作者为'村长台钓加拿大',视频中介绍了作者和搭档在威尼斯的旅行体验。他们在欣赏威尼斯的美景并尝试当地美食后表示喜爱。视频未提供具体的旅行细节或美食推荐。\", 'tags': ['小视频'], 'language': '简体中文'} [{'start': 0.089, 'end': 1.129, 'text': 'Hello guys, how are you?', 'speaker': 'SPEAKER_00', 'translation': '你好伙计们,你们好吗?'}, {'start': 1.25, 'end': 3.731, 'text': \"I'm in Venice now with my partner.\", 'speaker': 'SPEAKER_00', 'translation': '我现在正在威尼斯和他的伴侣在一起。'}, {'start': 4.571, 'end': 8.292, 'text': \"We're in Venice looking around the amazing streets.\", 'speaker': 'SPEAKER_00', 'translation': '我们现在在威尼斯环游令人惊叹的街道。'}, {'start': 8.633, 'end': 9.133, 'text': 'I love it.', 'speaker': 'SPEAKER_00', 'translation': '我喜欢它。'}, {'start': 9.193, 'end': 9.753, 'text': \"It's perfect.\", 'speaker': 'SPEAKER_00', 'translation': '太棒了。'}, {'start': 9.793, 'end': 10.273, 'text': 'Look at that.', 'speaker': 'SPEAKER_00', 'translation': '看那!'}, {'start': 11.134, 'end': 11.854, 'text': 'So nice.', 'speaker': 'SPEAKER_00', 'translation': '真好。'}, {'start': 12.434, 'end': 14.175, 'text': \"I can't wait to show you the pizza guys.\", 'speaker': 'SPEAKER_00', 'translation': '我很期待向你们展示披萨男孩。'}]\n",
+ "\u001b[32m2024-08-19 19:09:15.359\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36mgenerate_wavs\u001b[0m:\u001b[36m61\u001b[0m - \u001b[1mFound 1 speakers\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['你好伙计们,你们好吗?']\n",
+ " > Processing time: 1.9015758037567139\n",
+ " > Real-time factor: 0.7245256164092401\n",
+ "\u001b[32m2024-08-19 19:09:17.279\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 你好伙计们,你们好吗?\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:17.280\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.5\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['我现在正在威尼斯和他的伴侣在一起。']\n",
+ " > Processing time: 1.967923879623413\n",
+ " > Real-time factor: 0.4483460235751391\n",
+ "\u001b[32m2024-08-19 19:09:19.275\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 我现在正在威尼斯和他的伴侣在一起。\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:19.276\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.6152256571334105\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['我们现在在威尼斯环游令人惊叹的街道。']\n",
+ " > Processing time: 2.2133612632751465\n",
+ " > Real-time factor: 0.45169383843492683\n",
+ "\u001b[32m2024-08-19 19:09:21.527\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 我们现在在威尼斯环游令人惊叹的街道。\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:21.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.8265215459795647\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['我喜欢它。']\n",
+ " > Processing time: 1.0285415649414062\n",
+ " > Real-time factor: 0.4428175083364185\n",
+ "\u001b[32m2024-08-19 19:09:22.591\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 我喜欢它。\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:22.592\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.5\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['太棒了。']\n",
+ " > Processing time: 0.9402084350585938\n",
+ " > Real-time factor: 0.5660658582634882\n",
+ "\u001b[32m2024-08-19 19:09:23.552\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 太棒了。\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:23.553\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.5\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['看那!']\n",
+ " > Processing time: 0.7887325286865234\n",
+ " > Real-time factor: 0.5803374351821223\n",
+ "\u001b[32m2024-08-19 19:09:24.363\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 看那!\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:24.364\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.5\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['真好。']\n",
+ " > Processing time: 0.791118860244751\n",
+ " > Real-time factor: 0.5820932617591017\n",
+ "\u001b[32m2024-08-19 19:09:25.172\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 真好。\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:25.174\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.5766150560597962\u001b[0m\n",
+ " > Text splitted to sentences.\n",
+ "['我很期待向你们展示披萨男孩。']\n",
+ " > Processing time: 2.5063178539276123\n",
+ " > Real-time factor: 0.5771126637333318\n",
+ "\u001b[32m2024-08-19 19:09:27.703\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step042_tts_xtts\u001b[0m:\u001b[36mtts\u001b[0m:\u001b[36m74\u001b[0m - \u001b[1mTTS 我很期待向你们展示披萨男孩。\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:27.705\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36madjust_audio_length\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mSpeed Factor 0.5\u001b[0m\n",
+ "\u001b[32m2024-08-19 19:09:27.806\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtools.step040_tts\u001b[0m:\u001b[36mgenerate_wavs\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1mGenerated videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_combined.wav\u001b[0m\n",
+ "ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers\n",
+ " built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)\n",
+ " configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n",
+ " libavutil 56. 70.100 / 56. 70.100\n",
+ " libavcodec 58.134.100 / 58.134.100\n",
+ " libavformat 58. 76.100 / 58. 76.100\n",
+ " libavdevice 58. 13.100 / 58. 13.100\n",
+ " libavfilter 7.110.100 / 7.110.100\n",
+ " libswscale 5. 9.100 / 5. 9.100\n",
+ " libswresample 3. 9.100 / 3. 9.100\n",
+ " libpostproc 55. 9.100 / 55. 9.100\n",
+ "\u001b[0;33mTrailing option(s) found in the command: may be ignored.\n",
+ "\u001b[0mInput #0, mov,mp4,m4a,3gp,3g2,mj2, from 'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/download.mp4':\n",
+ " Metadata:\n",
+ " major_brand : isom\n",
+ " minor_version : 512\n",
+ " compatible_brands: isomiso2mp41\n",
+ " encoder : Lavf58.76.100\n",
+ " description : Packed by Bilibili XCoder v2.0.2\n",
+ " Duration: 00:00:14.74, start: 0.000000, bitrate: 787 kb/s\n",
+ " Stream #0:0(und): Video: hevc (Main) (hev1 / 0x31766568), yuv420p(tv, bt709), 852x480 [SAR 640:639 DAR 16:9], 541 kb/s, 30 fps, 30 tbr, 16k tbn, 30 tbc (default)\n",
+ " Metadata:\n",
+ " handler_name : Bento4 Video Handler\n",
+ " vendor_id : [0][0][0][0]\n",
+ " Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 238 kb/s (default)\n",
+ " Metadata:\n",
+ " handler_name : Bento4 Sound Handler\n",
+ " vendor_id : [0][0][0][0]\n",
+ "\u001b[0;33mGuessed Channel Layout for Input Stream #1.0 : mono\n",
+ "\u001b[0mInput #1, wav, from 'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_combined.wav':\n",
+ " Duration: 00:00:14.74, bitrate: 384 kb/s\n",
+ " Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, mono, s16, 384 kb/s\n",
+ "Input #2, png_pipe, from 'docs/linly_watermark.png':\n",
+ " Duration: N/A, bitrate: N/A\n",
+ " Stream #2:0: Video: png, rgba(pc), 1280x502, 25 fps, 25 tbr, 25 tbn, 25 tbc\n",
+ "Stream mapping:\n",
+ " Stream #0:0 (hevc) -> setpts\n",
+ " Stream #1:0 (pcm_s16le) -> atempo\n",
+ " Stream #2:0 (png) -> scale\n",
+ " overlay -> Stream #0:0 (libx264)\n",
+ " atempo -> Stream #0:1 (aac)\n",
+ "Press [q] to stop, [?] for help\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0musing SAR=480/479\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0musing cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mprofile High, level 4.0, 4:2:0, 8-bit\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0m264 - core 163 r3060 5db6aa6 - H.264/MPEG-4 AVC codec - Copyleft 2003-2021 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=3 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\n",
+ "Output #0, mp4, to 'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/video.mp4':\n",
+ " Metadata:\n",
+ " major_brand : isom\n",
+ " minor_version : 512\n",
+ " compatible_brands: isomiso2mp41\n",
+ " description : Packed by Bilibili XCoder v2.0.2\n",
+ " encoder : Lavf58.76.100\n",
+ " Stream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1916x1080 [SAR 480:479 DAR 16:9], q=2-31, 30 fps, 15360 tbn (default)\n",
+ " Metadata:\n",
+ " encoder : Lavc58.134.100 libx264\n",
+ " Side data:\n",
+ " cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A\n",
+ " Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 24000 Hz, mono, fltp, 69 kb/s (default)\n",
+ " Metadata:\n",
+ " encoder : Lavc58.134.100 aac\n",
+ "frame= 439 fps=6.6 q=-1.0 Lsize= 10811kB time=00:00:14.72 bitrate=6016.6kbits/s speed=0.223x \n",
+ "video:10671kB audio:127kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.117657%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mframe I:2 Avg QP:23.49 size: 74643\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mframe P:155 Avg QP:25.31 size: 46014\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mframe B:282 Avg QP:28.11 size: 12925\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mconsecutive B-frames: 8.2% 15.9% 7.5% 68.3%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mmb I I16..4: 13.5% 79.9% 6.6%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mmb P I16..4: 6.1% 27.3% 1.7% P16..4: 34.3% 12.9% 3.8% 0.0% 0.0% skip:13.9%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mmb B I16..4: 1.0% 2.9% 0.1% B16..8: 31.5% 5.0% 0.8% direct: 2.6% skip:56.1% L0:44.0% L1:47.7% BI: 8.4%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0m8x8 transform intra:76.9% inter:84.6%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mcoded y,uvDC,uvAC intra: 51.7% 41.3% 9.0% inter: 14.9% 9.7% 0.2%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mi16 v,h,dc,p: 26% 22% 5% 47%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mi8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 31% 18% 13% 5% 6% 7% 7% 8% 6%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mi4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 30% 20% 10% 5% 9% 9% 8% 6% 4%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mi8c dc,h,v,p: 57% 16% 21% 6%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mWeighted P-Frames: Y:11.6% UV:3.9%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mref P L0: 64.9% 21.4% 10.2% 3.1% 0.4%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mref B L0: 93.9% 5.1% 1.0%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mref B L1: 98.8% 1.2%\n",
+ "\u001b[1;36m[libx264 @ 0x59392f56df40] \u001b[0mkb/s:5973.48\n",
+ "\u001b[1;36m[aac @ 0x59392f56e9c0] \u001b[0mQavg: 522.590\n",
+ "ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers\n",
+ " built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)\n",
+ " configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n",
+ " libavutil 56. 70.100 / 56. 70.100\n",
+ " libavcodec 58.134.100 / 58.134.100\n",
+ " libavformat 58. 76.100 / 58. 76.100\n",
+ " libavdevice 58. 13.100 / 58. 13.100\n",
+ " libavfilter 7.110.100 / 7.110.100\n",
+ " libswscale 5. 9.100 / 5. 9.100\n",
+ " libswresample 3. 9.100 / 3. 9.100\n",
+ " libpostproc 55. 9.100 / 55. 9.100\n",
+ "\u001b[0;33mTrailing option(s) found in the command: may be ignored.\n",
+ "\u001b[0mInput #0, mov,mp4,m4a,3gp,3g2,mj2, from 'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/video.mp4':\n",
+ " Metadata:\n",
+ " major_brand : isom\n",
+ " minor_version : 512\n",
+ " compatible_brands: isomiso2avc1mp41\n",
+ " encoder : Lavf58.76.100\n",
+ " description : Packed by Bilibili XCoder v2.0.2\n",
+ " Duration: 00:00:14.74, start: 0.000000, bitrate: 6008 kb/s\n",
+ " Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 1916x1080 [SAR 480:479 DAR 16:9], 5973 kb/s, 30 fps, 30 tbr, 15360 tbn, 60 tbc (default)\n",
+ " Metadata:\n",
+ " handler_name : VideoHandler\n",
+ " vendor_id : [0][0][0][0]\n",
+ " Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 24000 Hz, mono, fltp, 70 kb/s (default)\n",
+ " Metadata:\n",
+ " handler_name : SoundHandler\n",
+ " vendor_id : [0][0][0][0]\n",
+ "Stream mapping:\n",
+ " Stream #0:0 -> #0:0 (h264 (native) -> h264 (libx264))\n",
+ " Stream #0:1 -> #0:1 (aac (native) -> aac (native))\n",
+ "Press [q] to stop, [?] for help\n",
+ "\u001b[1;32m[Parsed_subtitles_0 @ 0x59186d3020c0] \u001b[0mlibass API version: 0x1502000\n",
+ "\u001b[1;32m[Parsed_subtitles_0 @ 0x59186d3020c0] \u001b[0mlibass source: tarball: 0.15.2\n",
+ "\u001b[1;32m[Parsed_subtitles_0 @ 0x59186d3020c0] \u001b[0mShaper: FriBidi 1.0.8 (SIMPLE) HarfBuzz-ng 2.7.4 (COMPLEX)\n",
+ "\u001b[1;32m[Parsed_subtitles_0 @ 0x59186d3020c0] \u001b[0mLoading font file './font/SimHei.ttf'\n",
+ "\u001b[1;32m[Parsed_subtitles_0 @ 0x59186d3020c0] \u001b[0mUsing font provider fontconfig\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0musing SAR=480/479\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0musing cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mprofile High, level 4.0, 4:2:0, 8-bit\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0m264 - core 163 r3060 5db6aa6 - H.264/MPEG-4 AVC codec - Copyleft 2003-2021 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=3 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\n",
+ "Output #0, mp4, to 'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/video_subtitles.mp4':\n",
+ " Metadata:\n",
+ " major_brand : isom\n",
+ " minor_version : 512\n",
+ " compatible_brands: isomiso2avc1mp41\n",
+ " description : Packed by Bilibili XCoder v2.0.2\n",
+ " encoder : Lavf58.76.100\n",
+ " Stream #0:0(und): Video: h264 (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1916x1080 [SAR 480:479 DAR 16:9], q=2-31, 30 fps, 15360 tbn (default)\n",
+ " Metadata:\n",
+ " handler_name : VideoHandler\n",
+ " vendor_id : [0][0][0][0]\n",
+ " encoder : Lavc58.134.100 libx264\n",
+ " Side data:\n",
+ " cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A\n",
+ " Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 24000 Hz, mono, fltp, 69 kb/s (default)\n",
+ " Metadata:\n",
+ " handler_name : SoundHandler\n",
+ " vendor_id : [0][0][0][0]\n",
+ " encoder : Lavc58.134.100 aac\n",
+ "\u001b[1;32m[Parsed_subtitles_0 @ 0x59186d3020c0] \u001b[0mfontselect: (SimHei, 400, 0) -> SimHei, 0, SimHei\n",
+ "frame= 439 fps=6.3 q=-1.0 Lsize= 10602kB time=00:00:14.72 bitrate=5900.1kbits/s speed=0.213x \n",
+ "video:10464kB audio:125kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.119319%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mframe I:2 Avg QP:23.58 size: 75630\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mframe P:178 Avg QP:25.17 size: 42261\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mframe B:259 Avg QP:28.24 size: 11739\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mconsecutive B-frames: 10.9% 29.2% 6.2% 53.8%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mmb I I16..4: 12.6% 81.9% 5.5%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mmb P I16..4: 5.5% 25.5% 1.4% P16..4: 36.4% 12.3% 3.4% 0.0% 0.0% skip:15.5%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mmb B I16..4: 0.9% 2.8% 0.1% B16..8: 32.6% 4.6% 0.7% direct: 1.8% skip:56.4% L0:43.4% L1:49.7% BI: 7.0%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0m8x8 transform intra:78.1% inter:85.5%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mcoded y,uvDC,uvAC intra: 49.7% 43.8% 8.3% inter: 13.8% 10.2% 0.2%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mi16 v,h,dc,p: 27% 23% 6% 44%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mi8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 31% 19% 13% 5% 6% 6% 6% 8% 5%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mi4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 29% 20% 11% 4% 9% 8% 8% 6% 4%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mi8c dc,h,v,p: 57% 16% 21% 6%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mWeighted P-Frames: Y:9.0% UV:2.2%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mref P L0: 67.6% 20.4% 9.1% 2.6% 0.3%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mref B L0: 93.2% 5.7% 1.1%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mref B L1: 99.0% 1.0%\n",
+ "\u001b[1;36m[libx264 @ 0x59186d2aee40] \u001b[0mkb/s:5857.45\n",
+ "\u001b[1;36m[aac @ 0x59186d2df6c0] \u001b[0mQavg: 341.760\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd /content/Linly-Dubbing/\n",
+ "# 测试one click是否正常\n",
+ "!python -m tools.do_everything"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Yag1lXjiYGjq",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "9c5a40a6-3df8-403e-db6a-97ad87bedeb7"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/Linly-Dubbing\n",
+ "2024-08-19 19:12:06.845830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+ "2024-08-19 19:12:06.886168: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+ "2024-08-19 19:12:06.898852: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+ "2024-08-19 19:12:09.628214: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+ "/usr/local/lib/python3.10/dist-packages/pyannote/audio/core/io.py:43: UserWarning: torchaudio._backend.set_audio_backend has been deprecated. With dispatcher enabled, this function is no-op. You can remove the function call.\n",
+ " torchaudio.set_audio_backend(\"soundfile\")\n",
+ "failed to import ttsfrd, use WeTextProcessing instead\n",
+ "Running on local URL: http://127.0.0.1:6006\n",
+ "Running on public URL: https://48980f220ce294a00f.gradio.live\n",
+ "\n",
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd /content/Linly-Dubbing/\n",
+ "!python webui.py"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "g1yoQoDyhvPp"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QqQF-lXUiAH6"
+ },
+ "source": [
+ ""
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/font/SimHei.ttf b/font/SimHei.ttf
new file mode 100644
index 0000000000000000000000000000000000000000..94794236612d827dfc6800269b43c6e3ad1fc513
--- /dev/null
+++ b/font/SimHei.ttf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:336a838f4a78e150826be608dae69de59d50948c3d2b71760e096ae764154bdc
+size 9751960
diff --git a/gui.py b/gui.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e4d0214a0ce0c9aaeaf1829185d911cf658ded
--- /dev/null
+++ b/gui.py
@@ -0,0 +1,95 @@
+import sys
+from PySide6.QtWidgets import QApplication, QMainWindow, QTabWidget
+from PySide6.QtCore import Qt
+
+# Ensure required modules are importable
+try:
+ # UI components (side-effects may register widgets/styles)
+ from ui_components import (
+ CustomSlider, # noqa: F401
+ FloatSlider, # noqa: F401
+ RadioButtonGroup, # noqa: F401
+ AudioSelector, # noqa: F401
+ VideoPlayer, # noqa: F401
+ )
+
+ # Feature tabs
+ from tabs.full_auto_tab import FullAutoTab
+ from tabs.settings_tab import SettingsTab
+ from tabs.download_tab import DownloadTab
+ from tabs.demucs_tab import DemucsTab
+ from tabs.asr_tab import ASRTab
+ from tabs.translation_tab import TranslationTab
+ from tabs.tts_tab import TTSTab
+ from tabs.video_tab import SynthesizeVideoTab
+ from tabs.linly_talker_tab import LinlyTalkerTab
+
+ # Optional heavy tools (app still runs without them)
+ try:
+ from tools.step000_video_downloader import download_from_url # noqa: F401
+ from tools.step010_demucs_vr import separate_all_audio_under_folder # noqa: F401
+ from tools.step020_asr import transcribe_all_audio_under_folder # noqa: F401
+ from tools.step030_translation import translate_all_transcript_under_folder # noqa: F401
+ from tools.step040_tts import generate_all_wavs_under_folder # noqa: F401
+ from tools.step050_synthesize_video import synthesize_all_video_under_folder # noqa: F401
+ from tools.do_everything import do_everything # noqa: F401
+ from tools.utils import SUPPORT_VOICE # noqa: F401
+ except ImportError as e:
+ print(f"Warning: some tool modules could not be imported: {e}")
+ SUPPORT_VOICE = [
+ "zh-CN-XiaoxiaoNeural",
+ "zh-CN-YunxiNeural",
+ "en-US-JennyNeural",
+ "ja-JP-NanamiNeural",
+ ]
+
+except ImportError as e:
+ print(f"Error: failed to initialize application: {e}")
+ sys.exit(1)
+
+
+class MainWindow(QMainWindow):
+ def __init__(self):
+ super().__init__()
+ self.setWindowTitle("Linly-Dubbing — Smart Multilingual Video Dubbing/Translation")
+ self.resize(1024, 768)
+
+ tabs = QTabWidget()
+
+ # Create tabs
+ self.full_auto_tab = FullAutoTab()
+ self.settings_tab = SettingsTab()
+
+ # Propagate settings changes to the One-Click tab
+ self.settings_tab.config_changed.connect(self.full_auto_tab.update_config)
+
+ # English-only tab labels
+ tabs.addTab(self.full_auto_tab, "One-Click")
+ tabs.addTab(self.settings_tab, "Settings")
+ tabs.addTab(DownloadTab(), "Auto Download")
+ tabs.addTab(DemucsTab(), "Vocal Separation")
+ tabs.addTab(ASRTab(), "ASR Speech Recognition")
+ tabs.addTab(TranslationTab(), "Subtitle Translation")
+ tabs.addTab(TTSTab(), "TTS Synthesis")
+ tabs.addTab(SynthesizeVideoTab(), "Video Composition")
+ tabs.addTab(LinlyTalkerTab(), "Linly-Talker Lip-Sync (WIP)")
+
+ self.setCentralWidget(tabs)
+
+
+def main():
+ # High-DPI: enable crisp UI on modern displays
+ QApplication.setAttribute(Qt.AA_EnableHighDpiScaling, True)
+ QApplication.setAttribute(Qt.AA_UseHighDpiPixmaps, True)
+
+ app = QApplication(sys.argv)
+ app.setStyle("Fusion") # consistent cross-platform look
+
+ window = MainWindow()
+ window.show()
+
+ sys.exit(app.exec())
+
+
+if __name__ == "__main__":
+ main()
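+
+# Usage sketch (assumes PySide6 and the local ui_components/, tabs/ and tools/
+# packages are importable from the project root):
+#   python gui.py
+# If only the optional tools.* imports fail, the GUI still starts: a warning is
+# printed and the preset SUPPORT_VOICE fallback above is used instead of the
+# full tool chain.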
diff --git a/packages.txt b/packages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1d86fa67b4beb4588ed929d45eae87db134d9cba
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1,3 @@
+ffmpeg
+libsndfile1
+espeak-ng
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..99530145a8053487924c5b553bab33547976183b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,136 @@
+# --- Core runtime / serving ---
+gradio>=4.0
+fastapi>=0.115
+uvicorn[standard]>=0.30
+huggingface_hub>=0.24
+python-dotenv>=1.0
+
+# --- PyTorch trio (CPU-friendly) ---
+torch==2.3.1
+torchaudio==2.3.1
+torchvision==0.18.1
+
+# --- Scientific / ML backbone ---
+numpy>=1.26,<3
+scipy>=1.11
+scikit-learn>=1.3
+pandas>=2.2
+matplotlib>=3.8
+numba>=0.59
+llvmlite>=0.43
+tqdm>=4.66
+einops>=0.7
+protobuf>=4.24
+safetensors>=0.4.3
+
+# --- NLP / Transformers / Whisper ---
+transformers==4.55.4
+tokenizers>=0.15
+sentencepiece>=0.1.99
+#faster-whisper==1.2.0
+#whisperx==3.7.4
+#whisper==1.1.10
+nltk>=3.9
+regex>=2024.9.11
+
+# --- Audio / TTS / ASR toolchain ---
+soundfile>=0.12
+audioread>=3.0.1
+soxr>=0.3.7
+ffmpeg-python>=0.2.0
+pydub>=0.25.1
+audiostretchy==1.3.5
+demucs==4.0.1
+openunmix==1.3.0
+asteroid-filterbanks==0.4.0
+pytorch-wpe==0.0.1
+speechbrain==1.0.3
+coqui-tts==0.27.2
+coqpit-config==0.2.1
+ttsfrd==0.1.0
+edge-tts==7.2.3
+librosa==0.11.0
+
+# --- Diffusion / audio codecs (if you actually use them) ---
+diffusers==0.27.2
+encodec==0.1.1
+
+# --- Model management / orchestration ---
+accelerate==1.11.0
+hydra-core==1.3.2
+omegaconf==2.3.0
+lightning==2.5.5
+pytorch-lightning==2.5.5
+torchmetrics==1.3.2
+
+# --- PyAnnote (diarization) ---
+pyannote.audio==3.4.0
+pyannote.core==5.0.0
+pyannote.database==5.1.3
+pyannote.metrics==3.2.1
+pyannote.pipeline==3.0.1
+
+# --- OpenAI / HTTP clients ---
+openai==1.55.3
+httpx>=0.27
+requests>=2.31
+urllib3>=2.2
+
+# --- Data / storage / utils ---
+orjson>=3.10
+ujson>=5.9
+pyarrow>=16.0
+zstandard>=0.22
+cloudpickle>=3.0
+joblib>=1.3
+filelock>=3.12
+rich>=13.7
+tabulate>=0.9
+Unidecode>=1.3
+
+# --- Text processing / multi-language ---
+pypinyin>=0.49
+jieba>=0.42.1
+jaconv>=0.4
+SudachiPy>=0.6.8
+SudachiDict-core>=20240109
+WeTextProcessing>=1.0.3
+pysbd>=0.3.4
+bangla==0.0.5
+bnunicodenormalizer==0.1.7
+bnnumerizer==0.0.2
+
+# --- Misc ML / optimization ---
+optuna>=3.6
+pytorch-metric-learning>=2.3
+umap-learn>=0.5.5
+pynndescent>=0.5.10
+
+# --- Media / video ---
+moviepy>=1.0.3
+imageio>=2.34
+imageio-ffmpeg>=0.4.9
+av>=10.0
+
+# --- Small helpers (stable) ---
+loguru>=0.7
+fire>=0.6
+packaging>=23.2
+typing_extensions>=4.9
+python-dateutil>=2.8.2
+pytz>=2024.1
+tzlocal>=5.2
+PyYAML>=6.0.1
+
+# --- Optional (comment out if not needed) ---
+onnxruntime==1.23.1
+translators==6.0.1
+gdown==5.1.0
+yt-dlp>=2024.5.27
diff --git a/requirements_module.txt b/requirements_module.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9538579ece6695e4e0446a9861035c359c478f67
--- /dev/null
+++ b/requirements_module.txt
@@ -0,0 +1,4 @@
+submodules/demucs
+submodules/whisper
+submodules/whisperX
+submodules/TTS
\ No newline at end of file
diff --git a/runtime.txt b/runtime.txt
new file mode 100644
index 0000000000000000000000000000000000000000..55090899d0334b0210fdd7f30ea9b2e23e6fce59
--- /dev/null
+++ b/runtime.txt
@@ -0,0 +1 @@
+python-3.10
diff --git a/scripts/download_models.sh b/scripts/download_models.sh
new file mode 100644
index 0000000000000000000000000000000000000000..afbb427cc4274e6b8fb04993e89df9ccf529bc8d
--- /dev/null
+++ b/scripts/download_models.sh
@@ -0,0 +1,6 @@
+# Download the wav2vec2 model to the target path; wget -nc skips the download if the file already exists
+mkdir -p models/ASR/whisper && wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \
+    -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth
+
+# Run the ModelScope download script
+python scripts/modelscope_download.py
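+
+# Alternatively, the same models can be fetched from Hugging Face instead of ModelScope:
+# python scripts/huggingface_download.py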
\ No newline at end of file
diff --git a/scripts/huggingface_download.py b/scripts/huggingface_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b0ebb1d894ba5e1da17ac6633f19de7aad368fd
--- /dev/null
+++ b/scripts/huggingface_download.py
@@ -0,0 +1,21 @@
+# pip install huggingface_hub
+from huggingface_hub import snapshot_download
+
+# https://huggingface.co/coqui/XTTS-v2
+snapshot_download('coqui/XTTS-v2', local_dir='models/TTS/XTTS-v2', resume_download=True, local_dir_use_symlinks=False)
+
+# https://huggingface.co/FunAudioLLM/CosyVoice-300M
+# snapshot_download('FunAudioLLM/CosyVoice-300M', local_dir='models/TTS/CosyVoice-300M', resume_download=True, local_dir_use_symlinks=False)
+
+# https://huggingface.co/Qwen/Qwen1.5-4B-Chat
+snapshot_download('Qwen/Qwen1.5-4B-Chat', local_dir='models/LLM/Qwen1.5-4B-Chat', resume_download=True, local_dir_use_symlinks=False)
+
+# https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat
+snapshot_download('Qwen/Qwen1.5-1.8B-Chat', local_dir='models/LLM/Qwen1.5-1.8B-Chat', resume_download=True, local_dir_use_symlinks=False)
+
+# https://huggingface.co/Systran/faster-whisper-large-v3
+snapshot_download('Systran/faster-whisper-large-v3', local_dir='models/ASR/whisper/faster-whisper-large-v3', resume_download=True, local_dir_use_symlinks=False)
+
+# Gated model: access must be requested on Hugging Face before it can be downloaded automatically
+# https://huggingface.co/pyannote/speaker-diarization-3.1
+# snapshot_download('pyannote/speaker-diarization-3.1', local_dir='models/ASR/whisper/speaker-diarization-3.1', resume_download=True, local_dir_use_symlinks=False)
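+
+# Sketch: once access is granted, authenticate before downloading the gated repo
+# (token from https://huggingface.co/settings/tokens):
+# from huggingface_hub import login
+# login(token="hf_...")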
diff --git a/scripts/modelscope_download.py b/scripts/modelscope_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..08481aaee97470dedcefd08bcb5874ac463e420f
--- /dev/null
+++ b/scripts/modelscope_download.py
@@ -0,0 +1,21 @@
+# pip install modelscope
+from modelscope import snapshot_download
+
+# https://modelscope.cn/models/AI-ModelScope/XTTS-v2
+snapshot_download('AI-ModelScope/XTTS-v2', local_dir='models/TTS/XTTS-v2')
+
+# https://modelscope.cn/models/iic/CosyVoice-300M
+# snapshot_download('iic/CosyVoice-300M', local_dir='models/TTS/CosyVoice-300M')
+
+# https://modelscope.cn/models/qwen/qwen1.5-4b-chat
+snapshot_download('qwen/Qwen1.5-4B-Chat', local_dir='models/LLM/Qwen1.5-4B-Chat')
+
+# https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat
+# snapshot_download('qwen/Qwen1.5-1.8B-Chat', local_dir='models/LLM/Qwen1.5-1.8B-Chat')
+
+# https://modelscope.cn/models/keepitsimple/faster-whisper-large-v3
+snapshot_download('keepitsimple/faster-whisper-large-v3', local_dir='models/ASR/whisper/faster-whisper-large-v3')
+
+# Gated model: access must be requested before it can be downloaded automatically
+# https://modelscope.cn/models/mirror013/speaker-diarization-3.1
+# snapshot_download('mirror013/speaker-diarization-3.1', local_dir='models/ASR/whisper/speaker-diarization-3.1')
diff --git a/submodules/TTS/CITATION.cff b/submodules/TTS/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..6b0c8f19af1b37607c3994abe28b8d362cbcb564
--- /dev/null
+++ b/submodules/TTS/CITATION.cff
@@ -0,0 +1,20 @@
+cff-version: 1.2.0
+message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
+title: "Coqui TTS"
+abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
+date-released: 2021-01-01
+authors:
+ - family-names: "Eren"
+ given-names: "Gölge"
+ - name: "The Coqui TTS Team"
+version: 1.4
+doi: 10.5281/zenodo.6334862
+license: "MPL-2.0"
+url: "https://www.coqui.ai"
+repository-code: "https://github.com/coqui-ai/TTS"
+keywords:
+ - machine learning
+ - deep learning
+ - artificial intelligence
+ - text to speech
+ - TTS
\ No newline at end of file
diff --git a/submodules/TTS/CODE_OF_CONDUCT.md b/submodules/TTS/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..b80639d63c29e902c547de347806651bcc9ad3b2
--- /dev/null
+++ b/submodules/TTS/CODE_OF_CONDUCT.md
@@ -0,0 +1,133 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+coc-report@coqui.ai.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
+at [https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
diff --git a/submodules/TTS/CODE_OWNERS.rst b/submodules/TTS/CODE_OWNERS.rst
new file mode 100644
index 0000000000000000000000000000000000000000..768b573911eae8aeb229de6f56039deb9a64ce27
--- /dev/null
+++ b/submodules/TTS/CODE_OWNERS.rst
@@ -0,0 +1,75 @@
+TTS code owners / governance system
+==========================================
+
+TTS is run under a governance system inspired by (and partially copied from) the Mozilla module ownership system. The project is roughly divided into modules, and each module has its owners, who are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
+
+Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners.
+
+Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the WebThings project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely.
+
+The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as set the direction for the project as a whole.
+
+This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
+
+There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person.
+
+Global owners
+----------------
+
+These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
+
+- Eren Gölge (@erogol)
+- Reuben Morais (@reuben)
+
+Training, feeding
+-----------------
+
+- Eren Gölge (@erogol)
+
+Model exporting
+---------------
+
+- Eren Gölge (@erogol)
+
+Multi-Speaker TTS
+-----------------
+
+- Eren Gölge (@erogol)
+- Edresson Casanova (@edresson)
+
+TTS
+---
+
+- Eren Gölge (@erogol)
+
+Vocoders
+--------
+
+- Eren Gölge (@erogol)
+
+Speaker Encoder
+---------------
+
+- Eren Gölge (@erogol)
+
+Testing & CI
+------------
+
+- Eren Gölge (@erogol)
+- Reuben Morais (@reuben)
+
+Python bindings
+---------------
+
+- Eren Gölge (@erogol)
+- Reuben Morais (@reuben)
+
+Documentation
+-------------
+
+- Eren Gölge (@erogol)
+
+Third party bindings
+--------------------
+
+Owned by the author.
diff --git a/submodules/TTS/CONTRIBUTING.md b/submodules/TTS/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ce46048c8f861082d33fc89683669194dcf32
--- /dev/null
+++ b/submodules/TTS/CONTRIBUTING.md
@@ -0,0 +1,162 @@
+# Contribution guidelines
+
+Welcome to the 🐸TTS!
+
+This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md).
+
+## Where to start.
+We welcome everyone who likes to contribute to 🐸TTS.
+
+You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.
+
+If you'd like to contribute code or squash a bug but don't know where to start, here are some pointers.
+
+- [Development Road Map](https://github.com/coqui-ai/TTS/issues/378)
+
+ You can pick something out of our road map. We keep track of the project's progress in this simple issue thread. It lists new model proposals, development updates, etc.
+
+- [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues)
+
+ This is the place to find feature requests and bug reports.
+
+ Issues with the ```good first issue``` tag are a good place for beginners to start.
+
+- ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag.
+
+ We list all the target improvements for the next version. You can pick one of them and start contributing.
+
+- Also feel free to suggest new features, ideas and models. We're always open to new things.
+
+## Call for sharing language models
+If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified.
+
+This model can be shared in two ways:
+1. Share the model files with us and we serve them with the next 🐸 TTS release.
+2. Upload your models on GDrive and share the link.
+
+Models are served under `.models.json` file and any model is available under TTS CLI or Server end points.
+
+Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930).
+
+## Sending a ✨**PR**✨
+
+If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.
+Please use the following steps to send a ✨**PR**✨.
+Let us know if you encounter a problem along the way.
+
+The following steps are tested on an Ubuntu system.
+
+1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
+
+2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
+
+ ```bash
+ $ git clone git@github.com:<your Github name>/TTS.git
+ $ cd TTS
+ $ git remote add upstream https://github.com/coqui-ai/TTS.git
+ ```
+
+3. Install 🐸TTS for development.
+
+ ```bash
+ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
+ $ make install
+ ```
+
+4. Create a new branch with an informative name for your goal.
+
+ ```bash
+ $ git checkout -b an_informative_name_for_my_branch
+ ```
+
+5. Implement your changes on your new branch.
+
+6. Explain your code using [Google Style](https://google.github.io/styleguide/pyguide.html#381-docstrings) docstrings.
+
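+ For instance, a minimal Google-style docstring (illustrative only) might look like:
+
+ ```python
+ def seconds_to_frames(seconds: float, sample_rate: int) -> int:
+     """Convert a duration in seconds to a number of audio frames.
+
+     Args:
+         seconds: Duration in seconds.
+         sample_rate: Sampling rate in Hz.
+
+     Returns:
+         int: The duration rounded to the nearest whole frame.
+     """
+     return round(seconds * sample_rate)
+ ```
+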
+7. Add your tests to our test suite under ```tests``` folder. It is important to show that your code works, edge cases are considered, and inform others about the intended use.
+
+8. Run the tests to see how your updates work with the rest of the project. You can repeat this step multiple times as you implement your changes to make sure you are going in the right direction.
+
+ ```bash
+ $ make test # stop at the first error
+ $ make test_all # run all the tests, report all the errors
+ ```
+
+9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting.
+
+ ```bash
+ $ make style
+ ```
+
+10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.
+
+ ```bash
+ $ make lint
+ ```
+
+11. When things are good, add new files and commit your changes.
+
+ ```bash
+ $ git add my_file1.py my_file2.py ...
+ $ git commit
+ ```
+
+ It's a good practice to regularly sync your local copy of the project with the upstream code to keep up with the recent updates.
+
+ ```bash
+ $ git fetch upstream
+ $ git rebase upstream/master
+ # or for the development version
+ $ git rebase upstream/dev
+ ```
+
+12. Send a PR to ```dev``` branch.
+
+ Push your branch to your fork.
+
+ ```bash
+ $ git push -u origin an_informative_name_for_my_branch
+ ```
+
+ Then go to your fork's Github page and click on 'Pull request' to send your ✨**PR**✨.
+
+ Please set ✨**PR**✨'s target branch to ```dev``` as we use ```dev``` to work on the next version.
+
+13. Let's discuss until it is perfect. 💪
+
+ We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/coqui-ai/TTS/pulls].
+
+14. Once things look perfect, we merge it to the ```dev``` branch and make it ready for the next version.
+
+## Development in Docker container
+
+If you prefer working within a Docker container as your development environment, you can do the following:
+
+1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
+
+2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
+
+ ```bash
+ $ git clone git@github.com:<your Github name>/TTS.git
+ $ cd TTS
+ $ git remote add upstream https://github.com/coqui-ai/TTS.git
+ ```
+
+3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
+
+ ```
+ docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev .
+ ```
+
+4. Run the container with GPU support:
+
+ ```
+ docker run -it --gpus all tts-dev:latest /bin/bash
+ ```
+
+Feel free to ping us at any step you need help using our communication channels.
+
+If you are new to Github or open-source contributions, these are good resources.
+
+- [Github Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests)
+- [First-Contribution](https://github.com/firstcontributions/first-contributions)
diff --git a/submodules/TTS/Dockerfile b/submodules/TTS/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9fb3005ef4f5e71512a4dfdaf83236a9d047cc32
--- /dev/null
+++ b/submodules/TTS/Dockerfile
@@ -0,0 +1,19 @@
+ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
+FROM ${BASE}
+
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip3 install llvmlite --ignore-installed
+
+# Install Dependencies:
+RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN rm -rf /root/.cache/pip
+
+# Copy TTS repository contents:
+WORKDIR /root
+COPY . /root
+
+RUN make install
+
+ENTRYPOINT ["tts"]
+CMD ["--help"]
diff --git a/submodules/TTS/LICENSE.txt b/submodules/TTS/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..14e2f777f6c395e7e04ab4aa306bbcc4b0c1120e
--- /dev/null
+++ b/submodules/TTS/LICENSE.txt
@@ -0,0 +1,373 @@
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+ means each individual or legal entity that creates, contributes to
+ the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+ means the combination of the Contributions of others (if any) used
+ by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+ means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+ means Source Code Form to which the initial Contributor has attached
+ the notice in Exhibit A, the Executable Form of such Source Code
+ Form, and Modifications of such Source Code Form, in each case
+ including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+ means
+
+ (a) that the initial Contributor has attached the notice described
+ in Exhibit B to the Covered Software; or
+
+ (b) that the Covered Software was made available under the terms of
+ version 1.1 or earlier of the License, but not also under the
+ terms of a Secondary License.
+
+1.6. "Executable Form"
+ means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+ means a work that combines Covered Software with other material, in
+ a separate file or files, that is not Covered Software.
+
+1.8. "License"
+ means this document.
+
+1.9. "Licensable"
+ means having the right to grant, to the maximum extent possible,
+ whether at the time of the initial grant or subsequently, any and
+ all of the rights conveyed by this License.
+
+1.10. "Modifications"
+ means any of the following:
+
+ (a) any file in Source Code Form that results from an addition to,
+ deletion from, or modification of the contents of Covered
+ Software; or
+
+ (b) any new file in Source Code Form that contains any Covered
+ Software.
+
+1.11. "Patent Claims" of a Contributor
+ means any patent claim(s), including without limitation, method,
+ process, and apparatus claims, in any patent Licensable by such
+ Contributor that would be infringed, but for the grant of the
+ License, by the making, using, selling, offering for sale, having
+ made, import, or transfer of either its Contributions or its
+ Contributor Version.
+
+1.12. "Secondary License"
+ means either the GNU General Public License, Version 2.0, the GNU
+ Lesser General Public License, Version 2.1, the GNU Affero General
+ Public License, Version 3.0, or any later versions of those
+ licenses.
+
+1.13. "Source Code Form"
+ means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+ means an individual or a legal entity exercising rights under this
+ License. For legal entities, "You" includes any entity that
+ controls, is controlled by, or is under common control with You. For
+ purposes of this definition, "control" means (a) the power, direct
+ or indirect, to cause the direction or management of such entity,
+ whether by contract or otherwise, or (b) ownership of more than
+ fifty percent (50%) of the outstanding shares or beneficial
+ ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+ Licensable by such Contributor to use, reproduce, make available,
+ modify, display, perform, distribute, and otherwise exploit its
+ Contributions, either on an unmodified basis, with Modifications, or
+ as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+ for sale, have made, import, and otherwise transfer either its
+ Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+ or
+
+(b) for infringements caused by: (i) Your and any other third party's
+ modifications of Covered Software, or (ii) the combination of its
+ Contributions with other software (except as part of its Contributor
+ Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+ its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+ Form, as described in Section 3.1, and You must inform recipients of
+ the Executable Form how they can obtain a copy of such Source Code
+ Form by reasonable means in a timely manner, at a charge no more
+ than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+ License, or sublicense it under different terms, provided that the
+ license for the Executable Form does not attempt to limit or alter
+ the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+* *
+* 6. Disclaimer of Warranty *
+* ------------------------- *
+* *
+* Covered Software is provided under this License on an "as is" *
+* basis, without warranty of any kind, either expressed, implied, or *
+* statutory, including, without limitation, warranties that the *
+* Covered Software is free of defects, merchantable, fit for a *
+* particular purpose or non-infringing. The entire risk as to the *
+* quality and performance of the Covered Software is with You. *
+* Should any Covered Software prove defective in any respect, You *
+* (not any Contributor) assume the cost of any necessary servicing, *
+* repair, or correction. This disclaimer of warranty constitutes an *
+* essential part of this License. No use of any Covered Software is *
+* authorized under this License except under this disclaimer. *
+* *
+************************************************************************
+
+************************************************************************
+* *
+* 7. Limitation of Liability *
+* -------------------------- *
+* *
+* Under no circumstances and under no legal theory, whether tort *
+* (including negligence), contract, or otherwise, shall any *
+* Contributor, or anyone who distributes Covered Software as *
+* permitted above, be liable to You for any direct, indirect, *
+* special, incidental, or consequential damages of any character *
+* including, without limitation, damages for lost profits, loss of *
+* goodwill, work stoppage, computer failure or malfunction, or any *
+* and all other commercial damages or losses, even if such party *
+* shall have been informed of the possibility of such damages. This *
+* limitation of liability shall not apply to liability for death or *
+* personal injury resulting from such party's negligence to the *
+* extent applicable law prohibits such limitation. Some *
+* jurisdictions do not allow the exclusion or limitation of *
+* incidental or consequential damages, so this exclusion and *
+* limitation may not apply to You. *
+* *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+ This Source Code Form is subject to the terms of the Mozilla Public
+ License, v. 2.0. If a copy of the MPL was not distributed with this
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+ This Source Code Form is "Incompatible With Secondary Licenses", as
+ defined by the Mozilla Public License, v. 2.0.
diff --git a/submodules/TTS/MANIFEST.in b/submodules/TTS/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..321d3999c185a326a9d300451a3e732e4225f2e6
--- /dev/null
+++ b/submodules/TTS/MANIFEST.in
@@ -0,0 +1,15 @@
+include README.md
+include LICENSE.txt
+include requirements.*.txt
+include *.cff
+include requirements.txt
+include TTS/VERSION
+recursive-include TTS *.json
+recursive-include TTS *.html
+recursive-include TTS *.png
+recursive-include TTS *.md
+recursive-include TTS *.py
+recursive-include TTS *.pyx
+recursive-include images *.png
+recursive-exclude tests *
+prune tests*
diff --git a/submodules/TTS/Makefile b/submodules/TTS/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7446848f469151515260eda2d689fed85e61c28d
--- /dev/null
+++ b/submodules/TTS/Makefile
@@ -0,0 +1,78 @@
+.DEFAULT_GOAL := help
+.PHONY: test system-deps dev-deps deps style lint install help docs
+
+help:
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+target_dirs := tests TTS notebooks recipes
+
+test_all: ## run tests and don't stop on an error.
+ nose2 --with-coverage --coverage TTS tests
+ ./run_bash_tests.sh
+
+test: ## run tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests
+
+test_vocoder: ## run vocoder tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
+
+test_tts: ## run tts tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
+
+test_tts2: ## run tts tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
+
+test_xtts:
+ nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
+
+test_aux: ## run aux tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
+ ./run_bash_tests.sh
+
+test_zoo: ## run zoo tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
+
+inference_tests: ## run inference tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
+
+data_tests: ## run data tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
+
+test_text: ## run text tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
+
+test_failed: ## only run tests failed the last time.
+ nose2 -F -v -B --with-coverage --coverage TTS tests
+
+style: ## update code style.
+ black ${target_dirs}
+ isort ${target_dirs}
+
+lint: ## run pylint linter.
+ pylint ${target_dirs}
+ black ${target_dirs} --check
+ isort ${target_dirs} --check-only
+
+system-deps: ## install linux system deps
+ sudo apt-get install -y libsndfile1-dev
+
+dev-deps: ## install development deps
+ pip install -r requirements.dev.txt
+
+doc-deps: ## install docs dependencies
+ pip install -r docs/requirements.txt
+
+build-docs: ## build the docs
+ cd docs && make clean && make build
+
+hub-deps: ## install deps for torch hub use
+ pip install -r requirements.hub.txt
+
+deps: ## install 🐸 requirements.
+ pip install -r requirements.txt
+
+install: ## install 🐸 TTS for development.
+ pip install -e .[all]
+
+docs: ## build the docs
+ $(MAKE) -C docs clean && $(MAKE) -C docs html
diff --git a/submodules/TTS/README.md b/submodules/TTS/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3205c1bd3a7b5f83162838b3274074123b4de55
--- /dev/null
+++ b/submodules/TTS/README.md
@@ -0,0 +1,407 @@
+
+## 🐸Coqui.ai News
+- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
+- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
+- 📣 ⓍTTS can now stream with <200ms latency.
+- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
+- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
+- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
+- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
+
+
+
+
+
+
+**🐸TTS is a library for advanced Text-to-Speech generation.**
+
+🚀 Pretrained models in +1100 languages.
+
+🛠️ Tools for training new models and fine-tuning existing models in any language.
+
+📚 Utilities for dataset analysis and curation.
+______________________________________________________________________
+
+[Discord](https://discord.gg/5eXr5seRrv)
+[License: MPL 2.0](https://opensource.org/licenses/MPL-2.0)
+[PyPI version](https://badge.fury.io/py/TTS)
+[Code of Conduct](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
+[Downloads](https://pepy.tech/project/tts)
+[DOI](https://zenodo.org/badge/latestdoi/265612440)
+
+[Docs](https://tts.readthedocs.io/en/latest/)
+
+
+
+______________________________________________________________________
+
+## 💬 Where to ask questions
+Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.
+
+| Type | Platforms |
+| ------------------------------- | --------------------------------------- |
+| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
+| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
+| 👩💻 **Usage Questions** | [GitHub Discussions] |
+| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
+
+[github issue tracker]: https://github.com/coqui-ai/tts/issues
+[github discussions]: https://github.com/coqui-ai/TTS/discussions
+[discord]: https://discord.gg/5eXr5seRrv
+[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
+
+
+## 🔗 Links and Resources
+| Type | Links |
+| ------------------------------- | --------------------------------------- |
+| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
+| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
+| 👩💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
+| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
+| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
+| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
+
+
+## 🥇 TTS Performance
+
+
+Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices.
+
+## Features
+- High-performance Deep Learning models for Text2Speech tasks.
+ - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
+ - Speaker Encoder to compute speaker embeddings efficiently.
+ - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN)
+- Fast and efficient model training.
+- Detailed training logs on the terminal and Tensorboard.
+- Support for Multi-speaker TTS.
+- Efficient, flexible, lightweight but feature complete `Trainer API`.
+- Released and ready-to-use models.
+- Tools to curate Text2Speech datasets under `dataset_analysis`.
+- Utilities to use and test your models.
+- Modular (but not too much) code base enabling easy implementation of new ideas.
+
+## Model Implementations
+### Spectrogram models
+- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
+- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
+- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
+- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
+- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
+- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
+- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
+- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
+- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
+- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
+- OverFlow: [paper](https://arxiv.org/abs/2211.06892)
+- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
+- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)
+
+### End-to-End Models
+- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts)
+- VITS: [paper](https://arxiv.org/pdf/2106.06103)
+- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
+- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
+- 🐶 Bark: [orig. repo](https://github.com/suno-ai/bark)
+
+### Attention Methods
+- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
+- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
+- Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
+- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
+- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
+- Alignment Network: [paper](https://arxiv.org/abs/2108.10447)
+
+### Speaker Encoder
+- GE2E: [paper](https://arxiv.org/abs/1710.10467)
+- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)
+
+### Vocoders
+- MelGAN: [paper](https://arxiv.org/abs/1910.06711)
+- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
+- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
+- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
+- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
+- WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
+- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
+- UnivNet: [paper](https://arxiv.org/abs/2106.07889)
+
+### Voice Conversion
+- FreeVC: [paper](https://arxiv.org/abs/2210.15418)
+
+You can also help us implement more models.
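+
+To make the lists above concrete, the minimal sketch below loads one of the released spectrogram models through the Python API; its default vocoder (Multiband-MelGAN for this model, per the released-models registry) is fetched automatically. The output file name is arbitrary.
+
+```python
+from TTS.api import TTS
+
+# Load a released spectrogram model by name; its default vocoder
+# (Multiband-MelGAN for this model) is downloaded automatically.
+tts = TTS(model_name="tts_models/en/ljspeech/glow-tts", progress_bar=False)
+tts.tts_to_file(text="A spectrogram model with its default vocoder.", file_path="glow_tts_output.wav")
+```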
+
+## Installation
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12**.
+
+If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
+
+```bash
+pip install TTS
+```
+
+If you plan to code or train models, clone 🐸TTS and install it locally.
+
+```bash
+git clone https://github.com/coqui-ai/TTS
+pip install -e .[all,dev,notebooks] # Select the relevant extras
+```
+
+If you are on Ubuntu (Debian), you can also run the following commands for installation.
+
+```bash
+$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
+$ make install
+```
+
+If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
+
+
+## Docker Image
+You can also try TTS without installing it by using the Docker image.
+Simply run the following commands to start using TTS:
+
+```bash
+docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
+```
+
+You can then enjoy the TTS server [here](http://[::1]:5002/).
+More details about the Docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html).
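+
+If you prefer to query the server programmatically rather than through the browser, a minimal sketch is shown below; it assumes the server's default `/api/tts` endpoint and that the container above is listening on `localhost:5002`, so adjust both if your setup differs.
+
+```python
+import urllib.parse
+import urllib.request
+
+# Ask the dockerized TTS server (assumed endpoint: /api/tts) to synthesize a sentence
+# and save the returned wav bytes to disk.
+params = urllib.parse.urlencode({"text": "Hello from the TTS server."})
+with urllib.request.urlopen(f"http://localhost:5002/api/tts?{params}") as response:
+    with open("server_output.wav", "wb") as out_file:
+        out_file.write(response.read())
+```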
+
+
+## Synthesizing speech by 🐸TTS
+
+### 🐍 Python API
+
+#### Running a multi-speaker and multi-lingual model
+
+```python
+import torch
+from TTS.api import TTS
+
+# Get device
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# List available 🐸TTS models
+print(TTS().list_models())
+
+# Init TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+
+# Run TTS
+# ❗ Since this is a multi-lingual voice cloning model, we must set the target speaker_wav and language
+# Text to speech: returns a list of amplitude values as output
+wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
+# Text to speech to a file
+tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
+```
+
+#### Running a single speaker model
+
+```python
+# Init TTS with the target model name
+tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device)
+
+# Run TTS
+tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
+
+# Example voice cloning with YourTTS in English, French and Portuguese
+tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
+tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
+tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
+tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+```
+
+#### Example voice conversion
+
+Converting the voice in `source_wav` to the voice of `target_wav`
+
+```python
+tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")
+tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+```
+
+#### Example voice cloning together with the voice conversion model.
+This way, you can clone voices by using any model in 🐸TTS.
+
+```python
+
+tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
+tts.tts_with_vc_to_file(
+ "Wie sage ich auf Italienisch, dass ich dich liebe?",
+ speaker_wav="target/speaker.wav",
+ file_path="output.wav"
+)
+```
+
+#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
+For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
+You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
+and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
+
+```python
+# TTS with on-the-fly voice conversion
+api = TTS("tts_models/deu/fairseq/vits")
+api.tts_with_vc_to_file(
+ "Wie sage ich auf Italienisch, dass ich dich liebe?",
+ speaker_wav="target/speaker.wav",
+ file_path="output.wav"
+)
+```
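+
+If you only need plain synthesis without voice conversion, the same Fairseq model works with `tts_to_file` directly; a minimal sketch, reusing the German (`deu`) model from above:
+
+```python
+from TTS.api import TTS
+
+# Plain text to speech with a Fairseq VITS model; any ISO code from the list linked above works.
+api = TTS("tts_models/deu/fairseq/vits")
+api.tts_to_file("Hallo, dies ist ein Test.", file_path="fairseq_output.wav")
+```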
+
+### Command-line `tts`
+
+
+
+Synthesize speech on the command line.
+
+You can either use your trained model or choose a model from the provided list.
+
+If you don't specify any models, it uses the LJSpeech-based English model.
+
+#### Single Speaker Models
+
+- List provided models:
+
+ ```
+ $ tts --list_models
+ ```
+
+- Get model info (for both tts_models and vocoder_models):
+
+ - Query by type/name:
+ The model_info_by_name option uses the name as it appears in the output of --list_models.
+ ```
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+ ```
+ For example:
+ ```
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
+ ```
+ - Query by type/idx:
+ The model_query_idx is the corresponding index from the --list_models output.
+
+ ```
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
+ ```
+
+ For example:
+
+ ```
+ $ tts --model_info_by_idx tts_models/3
+ ```
+
+ - Query model info by full name:
+ ```
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+ ```
+
+- Run TTS with default models:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
+ ```
+
+- Run TTS and pipe out the generated TTS wav file data:
+
+ ```
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+ ```
+
+- Run a TTS model with its default vocoder model:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+ ```
+
+ For example:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
+ ```
+
+- Run with specific TTS and vocoder models from the list:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+ ```
+
+ For example:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
+ ```
+
+- Run your own TTS model (Using Griffin-Lim Vocoder):
+
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ ```
+
+- Run your own TTS and Vocoder models:
+
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
+ ```
+
+#### Multi-speaker Models
+
+- List the available speakers and choose a <speaker_id> among them:
+
+ ```
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+ ```
+
+- Run the multi-speaker TTS model with the target speaker ID:
+
+ ```
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+ ```
+
+- Run your own multi-speaker TTS model:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+ ```
+
+### Voice Conversion Models
+
+```
+$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
+```
+
+
+
+## Directory Structure
+```
+|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
+|- utils/ (common utilities.)
+|- TTS
+ |- bin/ (folder for all the executables.)
+ |- train*.py (train your target model.)
+ |- ...
+ |- tts/ (text to speech models)
+ |- layers/ (model layer definitions)
+ |- models/ (model definitions)
+ |- utils/ (model specific utilities.)
+ |- speaker_encoder/ (Speaker Encoder models.)
+ |- (same)
+ |- vocoder/ (Vocoder models.)
+ |- (same)
+```
diff --git a/submodules/TTS/TTS/.models.json b/submodules/TTS/TTS/.models.json
new file mode 100644
index 0000000000000000000000000000000000000000..b349e7397b1b9ea7e50f537969d1a0cf087c27e0
--- /dev/null
+++ b/submodules/TTS/TTS/.models.json
@@ -0,0 +1,938 @@
+{
+ "tts_models": {
+ "multilingual": {
+ "multi-dataset": {
+ "xtts_v2": {
+ "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
+ "hf_url": [
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
+ ],
+ "model_hash": "10f92b55c512af7a8d39d650547a15a7",
+ "default_vocoder": null,
+ "commit": "480a6cdf7",
+ "license": "CPML",
+ "contact": "info@coqui.ai",
+ "tos_required": true
+ },
+ "xtts_v1.1": {
+ "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
+ "hf_url": [
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
+ ],
+ "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
+ "default_vocoder": null,
+ "commit": "82910a63",
+ "license": "CPML",
+ "contact": "info@coqui.ai",
+ "tos_required": true
+ },
+ "your_tts": {
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
+ "default_vocoder": null,
+ "commit": "e9a1953e",
+ "license": "CC BY-NC-ND 4.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "bark": {
+ "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
+ "hf_url": [
+ "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
+ "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
+ "https://coqui.gateway.scarf.sh/hf/text_2.pt",
+ "https://coqui.gateway.scarf.sh/hf/bark/config.json",
+ "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
+ "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
+ ],
+ "default_vocoder": null,
+ "commit": "e9a1953e",
+ "license": "MIT",
+ "contact": "https://www.suno.ai/"
+ }
+ }
+ },
+ "bg": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "cs": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "da": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "et": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "ga": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "en": {
+ "ek1": {
+ "tacotron2": {
+ "description": "EK1 en-rp tacotron2 by NMStoker",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
+ "commit": "c802255",
+ "license": "apache 2.0"
+ }
+ },
+ "ljspeech": {
+ "tacotron2-DDC": {
+ "description": "Tacotron2 with Double Decoder Consistency.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "bae2ad0f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "tacotron2-DDC_ph": {
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
+ "commit": "3900448",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "glow-tts": {
+ "description": "",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
+ "stats_file": null,
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
+ "commit": "",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "speedy-speech": {
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
+ "stats_file": null,
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "4581e3d",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "tacotron2-DCA": {
+ "description": "",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
+ "commit": "",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "vits": {
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
+ "default_vocoder": null,
+ "commit": "3900448",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "vits--neon": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
+ "default_vocoder": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause",
+ "contact": null,
+ "commit": null
+ },
+ "fast_pitch": {
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "b27b3ba",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "overflow": {
+ "description": "Overflow model trained on LJSpeech",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "3b1a28f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "neural_hmm": {
+ "description": "Neural HMM model trained on LJSpeech",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "3b1a28f",
+ "author": "Shivam Metha @shivammehta25",
+ "license": "apache 2.0",
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
+ }
+ },
+ "vctk": {
+ "vits": {
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
+ "default_vocoder": null,
+ "commit": "3900448",
+ "author": "Eren @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "fast_pitch": {
+ "description": "FastPitch model trained on VCTK dataseset.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
+ "default_vocoder": null,
+ "commit": "bdab788d",
+ "author": "Eren @erogol",
+ "license": "CC BY-NC-ND 4.0",
+ "contact": "egolge@coqui.ai"
+ }
+ },
+ "sam": {
+ "tacotron-DDC": {
+ "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
+ "commit": "bae2ad0f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ }
+ },
+ "blizzard2013": {
+ "capacitron-t2-c50": {
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
+ "commit": "d6284e7",
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+ "author": "Adam Froghyar @a-froghyar",
+ "license": "apache 2.0",
+ "contact": "adamfroghyar@gmail.com"
+ },
+ "capacitron-t2-c150_v2": {
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
+ "commit": "a67039d",
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+ "author": "Adam Froghyar @a-froghyar",
+ "license": "apache 2.0",
+ "contact": "adamfroghyar@gmail.com"
+ }
+ },
+ "multi-dataset": {
+ "tortoise-v2": {
+ "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
+ "github_rls_url": [
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
+ ],
+ "commit": "c1875f6",
+ "default_vocoder": null,
+ "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
+ "license": "apache 2.0"
+ }
+ },
+ "jenny": {
+ "jenny": {
+ "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
+ "default_vocoder": null,
+ "commit": "ba40a1c",
+ "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
+ "author": "@noml4u"
+ }
+ }
+ },
+ "es": {
+ "mai": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
+ "commit": "",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ }
+ },
+ "css10": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "fr": {
+ "mai": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
+ "commit": null,
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ }
+ },
+ "css10": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "uk": {
+ "mai": {
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
+ "author": "@robinhad",
+ "commit": "bdab788d",
+ "license": "MIT",
+ "contact": "",
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
+ },
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "zh-CN": {
+ "baker": {
+ "tacotron2-DDC-GST": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+ "commit": "unknown",
+ "author": "@kirianguiller",
+ "license": "apache 2.0",
+ "default_vocoder": null
+ }
+ }
+ },
+ "nl": {
+ "mai": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
+ "author": "@r-dh",
+ "license": "apache 2.0",
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
+ "stats_file": null,
+ "commit": "540d811"
+ }
+ },
+ "css10": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "de": {
+ "thorsten": {
+ "tacotron2-DCA": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
+ "default_vocoder": null,
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
+ "description": "Thorsten-Dec2021-22k-DDC",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ }
+ },
+ "css10": {
+ "vits-neon": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
+ "default_vocoder": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause",
+ "commit": null
+ }
+ }
+ },
+ "ja": {
+ "kokoro": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
+ "author": "@kaiidams",
+ "license": "apache 2.0",
+ "commit": "401fbd89"
+ }
+ }
+ },
+ "tr": {
+ "common-voice": {
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
+ "license": "MIT",
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
+ "author": "Fatih Akademi",
+ "commit": null
+ }
+ }
+ },
+ "it": {
+ "mai_female": {
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
+ "default_vocoder": null,
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ },
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
+ "default_vocoder": null,
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ }
+ },
+ "mai_male": {
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
+ "default_vocoder": null,
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ },
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
+ "default_vocoder": null,
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ }
+ }
+ },
+ "ewe": {
+ "openbible": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "hau": {
+ "openbible": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "lin": {
+ "openbible": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "tw_akuapem": {
+ "openbible": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "tw_asante": {
+ "openbible": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "yor": {
+ "openbible": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "hu": {
+ "css10": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "el": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "fi": {
+ "css10": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "hr": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "lt": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "lv": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "mt": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "pl": {
+ "mai_female": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "pt": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "ro": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "sk": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "sl": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "sv": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "ca": {
+ "custom": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
+ "author": "@gullabi",
+ "license": "CC-BY-4.0"
+ }
+ }
+ },
+ "fa": {
+ "custom": {
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
+ "author": "@karim23657",
+ "license": "CC-BY-4.0"
+ }
+ }
+ },
+ "bn": {
+ "custom": {
+ "vits-male": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
+ "author": "@mobassir94",
+ "license": "Apache 2.0"
+ },
+ "vits-female": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
+ "author": "@mobassir94",
+ "license": "Apache 2.0"
+ }
+ }
+ },
+ "be": {
+ "common-voice": {
+ "glow-tts":{
+ "description": "Belarusian GlowTTS model created by @alex73 (Github).",
+ "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
+ "default_vocoder": "vocoder_models/be/common-voice/hifigan",
+ "commit": "c0aabb85",
+ "license": "CC-BY-SA 4.0",
+ "contact": "alex73mail@gmail.com"
+ }
+ }
+ }
+ },
+ "vocoder_models": {
+ "universal": {
+ "libri-tts": {
+ "wavegrad": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
+ "commit": "ea976b0",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "fullband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
+ "commit": "4132240",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ }
+ }
+ },
+ "en": {
+ "ek1": {
+ "wavegrad": {
+ "description": "EK1 en-rp wavegrad by NMStoker",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
+ "commit": "c802255",
+ "license": "apache 2.0"
+ }
+ },
+ "ljspeech": {
+ "multiband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
+ "commit": "ea976b0",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "hifigan_v2": {
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
+ "commit": "bae2ad0f",
+ "author": "@erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "univnet": {
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
+ "commit": "4581e3d",
+ "author": "Eren @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ }
+ },
+ "blizzard2013": {
+ "hifigan_v2": {
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
+ "commit": "d6284e7",
+ "author": "Adam Froghyar @a-froghyar",
+ "license": "apache 2.0",
+ "contact": "adamfroghyar@gmail.com"
+ }
+ },
+ "vctk": {
+ "hifigan_v2": {
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
+ "commit": "2f07160",
+ "author": "Edresson Casanova",
+ "license": "apache 2.0",
+ "contact": ""
+ }
+ },
+ "sam": {
+ "hifigan_v2": {
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
+ "commit": "2f07160",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ }
+ }
+ },
+ "nl": {
+ "mai": {
+ "parallel-wavegan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
+ "author": "@r-dh",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ }
+ }
+ },
+ "de": {
+ "thorsten": {
+ "wavegrad": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "fullband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "hifigan_v1": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ }
+ }
+ },
+ "ja": {
+ "kokoro": {
+ "hifigan_v1": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
+ "author": "@kaiidams",
+ "license": "apache 2.0",
+ "commit": "3900448"
+ }
+ }
+ },
+ "uk": {
+ "mai": {
+ "multiband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
+ "author": "@robinhad",
+ "commit": "bdab788d",
+ "license": "MIT",
+ "contact": ""
+ }
+ }
+ },
+ "tr": {
+ "common-voice": {
+ "hifigan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
+ "author": "Fatih Akademi",
+ "license": "MIT",
+ "commit": null
+ }
+ }
+ },
+ "be": {
+ "common-voice": {
+ "hifigan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
+ "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
+ "author": "@alex73",
+ "license": "CC-BY-SA 4.0",
+ "commit": "c0aabb85"
+ }
+ }
+ }
+ },
+ "voice_conversion_models": {
+ "multilingual": {
+ "vctk": {
+ "freevc24": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
+ "author": "Jing-Yi Li @OlaWod",
+ "license": "MIT",
+ "commit": null
+ }
+ }
+ }
+ }
+}
diff --git a/submodules/TTS/TTS/VERSION b/submodules/TTS/TTS/VERSION
new file mode 100644
index 0000000000000000000000000000000000000000..2157409059873c80aa93884ecb847639add77b7a
--- /dev/null
+++ b/submodules/TTS/TTS/VERSION
@@ -0,0 +1 @@
+0.22.0
diff --git a/submodules/TTS/TTS/__init__.py b/submodules/TTS/TTS/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaf05db1b950d82bfd7e20857e09a0fef45b430a
--- /dev/null
+++ b/submodules/TTS/TTS/__init__.py
@@ -0,0 +1,6 @@
+import os
+
+with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
+ version = f.read().strip()
+
+__version__ = version
diff --git a/submodules/TTS/TTS/api.py b/submodules/TTS/TTS/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..7abc188e74032ae4afc66fd8b639733c83b34f3e
--- /dev/null
+++ b/submodules/TTS/TTS/api.py
@@ -0,0 +1,458 @@
+import tempfile
+import warnings
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+from torch import nn
+
+from TTS.utils.audio.numpy_transforms import save_wav
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+from TTS.config import load_config
+
+
+class TTS(nn.Module):
+ """TODO: Add voice conversion and Capacitron support."""
+
+ def __init__(
+ self,
+ model_name: str = "",
+ model_path: str = None,
+ config_path: str = None,
+ vocoder_path: str = None,
+ vocoder_config_path: str = None,
+ progress_bar: bool = True,
+ gpu=False,
+ ):
+ """🐸TTS python interface that allows to load and use the released models.
+
+ Example with a multi-speaker model:
+ >>> from TTS.api import TTS
+ >>> tts = TTS(TTS.list_models()[0])
+ >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+ >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+
+ Example with a single-speaker model:
+ >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+
+ Example loading a model from a path:
+ >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+
+ Example voice cloning with YourTTS in English, French and Portuguese:
+ >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
+ >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
+ >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
+ >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
+
+ Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
+ >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
+ >>> tts.tts_to_file("This is a test.", file_path="output.wav")
+
+ Args:
+ model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
+ model_path (str, optional): Path to the model checkpoint. Defaults to None.
+ config_path (str, optional): Path to the model config. Defaults to None.
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
+ vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
+ progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+ """
+ super().__init__()
+ self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
+ self.config = load_config(config_path) if config_path else None
+ self.synthesizer = None
+ self.voice_converter = None
+ self.model_name = ""
+ if gpu:
+ warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
+
+ if model_name is not None and len(model_name) > 0:
+ if "tts_models" in model_name:
+ self.load_tts_model_by_name(model_name, gpu)
+ elif "voice_conversion_models" in model_name:
+ self.load_vc_model_by_name(model_name, gpu)
+ else:
+ self.load_model_by_name(model_name, gpu)
+
+ if model_path:
+ self.load_tts_model_by_path(
+ model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
+ )
+
+ @property
+ def models(self):
+ return self.manager.list_tts_models()
+
+ @property
+ def is_multi_speaker(self):
+ if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
+ return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
+ return False
+
+ @property
+ def is_multi_lingual(self):
+ # Not sure what sets this to None, but applied a fix to prevent crashing.
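+ # Precedence note: this evaluates as (isinstance(...) and "xtts" in model_name) or (self.config and ("xtts" in config.model or more than one language)).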
+ if (
+ isinstance(self.model_name, str)
+ and "xtts" in self.model_name
+ or self.config
+ and ("xtts" in self.config.model or len(self.config.languages) > 1)
+ ):
+ return True
+ if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
+ return self.synthesizer.tts_model.language_manager.num_languages > 1
+ return False
+
+ @property
+ def speakers(self):
+ if not self.is_multi_speaker:
+ return None
+ return self.synthesizer.tts_model.speaker_manager.speaker_names
+
+ @property
+ def languages(self):
+ if not self.is_multi_lingual:
+ return None
+ return self.synthesizer.tts_model.language_manager.language_names
+
+ @staticmethod
+ def get_models_file_path():
+ return Path(__file__).parent / ".models.json"
+
+ def list_models(self):
+ return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
+
+ def download_model_by_name(self, model_name: str):
+ model_path, config_path, model_item = self.manager.download_model(model_name)
+ if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
+ # return model directory if there are multiple files
+ # we assume that the model knows how to load itself
+ return None, None, None, None, model_path
+ if model_item.get("default_vocoder") is None:
+ return model_path, config_path, None, None, None
+ vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
+ return model_path, config_path, vocoder_path, vocoder_config_path, None
+
+ def load_model_by_name(self, model_name: str, gpu: bool = False):
+ """Load one of the 🐸TTS models by name.
+
+ Args:
+ model_name (str): Model name to load. You can list models by ```tts.models```.
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+ """
+ self.load_tts_model_by_name(model_name, gpu)
+
+ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
+ """Load one of the voice conversion models by name.
+
+ Args:
+ model_name (str): Model name to load. You can list models by ```tts.models```.
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+ """
+ self.model_name = model_name
+ model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
+ self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
+
+ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
+ """Load one of 🐸TTS models by name.
+
+ Args:
+ model_name (str): Model name to load. You can list models by ```tts.models```.
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+
+ TODO: Add tests
+ """
+ self.synthesizer = None
+ self.model_name = model_name
+
+ model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+ model_name
+ )
+
+ # init synthesizer
+ # None values are fetched from the model
+ self.synthesizer = Synthesizer(
+ tts_checkpoint=model_path,
+ tts_config_path=config_path,
+ tts_speakers_file=None,
+ tts_languages_file=None,
+ vocoder_checkpoint=vocoder_path,
+ vocoder_config=vocoder_config_path,
+ encoder_checkpoint=None,
+ encoder_config=None,
+ model_dir=model_dir,
+ use_cuda=gpu,
+ )
+
+ def load_tts_model_by_path(
+ self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
+ ):
+ """Load a model from a path.
+
+ Args:
+ model_path (str): Path to the model checkpoint.
+ config_path (str): Path to the model config.
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
+ vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+ """
+
+ self.synthesizer = Synthesizer(
+ tts_checkpoint=model_path,
+ tts_config_path=config_path,
+ tts_speakers_file=None,
+ tts_languages_file=None,
+ vocoder_checkpoint=vocoder_path,
+ vocoder_config=vocoder_config,
+ encoder_checkpoint=None,
+ encoder_config=None,
+ use_cuda=gpu,
+ )
+
+ def _check_arguments(
+ self,
+ speaker: str = None,
+ language: str = None,
+ speaker_wav: str = None,
+ emotion: str = None,
+ speed: float = None,
+ **kwargs,
+ ) -> None:
+ """Check if the arguments are valid for the model."""
+ # check for the coqui tts models
+ if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+ raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+ if self.is_multi_lingual and language is None:
+ raise ValueError("Model is multi-lingual but no `language` is provided.")
+ if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
+ raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+ if not self.is_multi_lingual and language is not None:
+ raise ValueError("Model is not multi-lingual but `language` is provided.")
+ if emotion is not None and speed is not None:
+ raise ValueError("Emotion and speed can only be used with Coqui Studio models, which are discontinued.")
+
+ def tts(
+ self,
+ text: str,
+ speaker: str = None,
+ language: str = None,
+ speaker_wav: str = None,
+ emotion: str = None,
+ speed: float = None,
+ split_sentences: bool = True,
+ **kwargs,
+ ):
+ """Convert text to speech.
+
+ Args:
+ text (str):
+ Input text to synthesize.
+ speaker (str, optional):
+ Speaker name for multi-speaker. You can check whether the loaded model is multi-speaker by
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
+ supported by `XTTS` model.
+ speaker_wav (str, optional):
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+ Defaults to None.
+ emotion (str, optional):
+ Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
+ speed (float, optional):
+ Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
+ Defaults to None.
+ split_sentences (bool, optional):
+ Split text into sentences, synthesize them separately and concatenate the resulting audio.
+ Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+ applicable to the 🐸TTS models. Defaults to True.
+ kwargs (dict, optional):
+ Additional arguments for the model.
+ """
+ self._check_arguments(
+ speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
+ )
+ wav = self.synthesizer.tts(
+ text=text,
+ speaker_name=speaker,
+ language_name=language,
+ speaker_wav=speaker_wav,
+ reference_wav=None,
+ style_wav=None,
+ style_text=None,
+ reference_speaker_name=None,
+ split_sentences=split_sentences,
+ **kwargs,
+ )
+ return wav
+
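+ # Usage sketch (not part of the class; the text is illustrative, and it assumes the `TTS`
+ # constructor accepts a `model_name` as used elsewhere in this repo):
+ #
+ #   from TTS.api import TTS
+ #   tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+ #   wav = tts.tts(text="Hello world.", split_sentences=True)
+ #   # `wav` holds the synthesized waveform samples returned by the synthesizer.
+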
+ def tts_to_file(
+ self,
+ text: str,
+ speaker: str = None,
+ language: str = None,
+ speaker_wav: str = None,
+ emotion: str = None,
+ speed: float = 1.0,
+ pipe_out=None,
+ file_path: str = "output.wav",
+ split_sentences: bool = True,
+ **kwargs,
+ ):
+ """Convert text to speech.
+
+ Args:
+ text (str):
+ Input text to synthesize.
+ speaker (str, optional):
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+ language (str, optional):
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+ speaker_wav (str, optional):
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+ Defaults to None.
+ emotion (str, optional):
+ Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
+ speed (float, optional):
+ Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
+ pipe_out (BytesIO, optional):
+ Flag to write the generated TTS wav to stdout for shell piping.
+ file_path (str, optional):
+ Output file path. Defaults to "output.wav".
+ split_sentences (bool, optional):
+ Split the text into sentences, synthesize them separately and concatenate the resulting audio.
+ Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+ applicable to the 🐸TTS models. Defaults to True.
+ kwargs (dict, optional):
+ Additional arguments for the model.
+ """
+ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
+
+ wav = self.tts(
+ text=text,
+ speaker=speaker,
+ language=language,
+ speaker_wav=speaker_wav,
+ split_sentences=split_sentences,
+ **kwargs,
+ )
+ self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
+ return file_path
+
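+ # Usage sketch for `tts_to_file` with voice cloning (the model name, language code and wav
+ # paths are illustrative assumptions, not taken from this file):
+ #
+ #   from TTS.api import TTS
+ #   tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
+ #   tts.tts_to_file(
+ #       text="Hello world.",
+ #       speaker_wav="reference_speaker.wav",
+ #       language="en",
+ #       file_path="cloned.wav",
+ #   )
+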
+ def voice_conversion(
+ self,
+ source_wav: str,
+ target_wav: str,
+ ):
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
+
+ Args:
+ source_wav (str):
+ Path to the source wav file.
+ target_wav (str):
+ Path to the target wav file.
+ """
+ wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+ return wav
+
+ def voice_conversion_to_file(
+ self,
+ source_wav: str,
+ target_wav: str,
+ file_path: str = "output.wav",
+ ):
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
+
+ Args:
+ source_wav (str):
+ Path to the source wav file.
+ target_wav (str):
+ Path to the target wav file.
+ file_path (str, optional):
+ Output file path. Defaults to "output.wav".
+ """
+ wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+ save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
+ return file_path
+
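+ # Usage sketch for voice conversion (wav paths are illustrative; the FreeVC model name is
+ # the one referenced further down in `tts_with_vc`):
+ #
+ #   from TTS.api import TTS
+ #   tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24")
+ #   tts.voice_conversion_to_file(
+ #       source_wav="my_recording.wav",
+ #       target_wav="target_speaker.wav",
+ #       file_path="converted.wav",
+ #   )
+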
+ def tts_with_vc(
+ self,
+ text: str,
+ language: str = None,
+ speaker_wav: str = None,
+ speaker: str = None,
+ split_sentences: bool = True,
+ ):
+ """Convert text to speech with voice conversion.
+
+ It combines TTS with voice conversion to approximate voice cloning.
+
+ - Convert text to speech with tts.
+ - Convert the output wav to target speaker with voice conversion.
+
+ Args:
+ text (str):
+ Input text to synthesize.
+ language (str, optional):
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+ speaker_wav (str, optional):
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+ Defaults to None.
+ speaker (str, optional):
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+ split_sentences (bool, optional):
+ Split the text into sentences, synthesize them separately and concatenate the resulting audio.
+ Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+ applicable to the 🐸TTS models. Defaults to True.
+ """
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+ # Lazy code... save it to a temp file to resample it while reading it for VC
+ self.tts_to_file(
+ text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
+ )
+ if self.voice_converter is None:
+ self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
+ wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
+ return wav
+
+ def tts_with_vc_to_file(
+ self,
+ text: str,
+ language: str = None,
+ speaker_wav: str = None,
+ file_path: str = "output.wav",
+ speaker: str = None,
+ split_sentences: bool = True,
+ ):
+ """Convert text to speech with voice conversion and save to file.
+
+ Check `tts_with_vc` for more details.
+
+ Args:
+ text (str):
+ Input text to synthesize.
+ language (str, optional):
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+ speaker_wav (str, optional):
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+ Defaults to None.
+ file_path (str, optional):
+ Output file path. Defaults to "output.wav".
+ speaker (str, optional):
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+ split_sentences (bool, optional):
+ Split the text into sentences, synthesize them separately and concatenate the resulting audio.
+ Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+ applicable to the 🐸TTS models. Defaults to True.
+ """
+ wav = self.tts_with_vc(
+ text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
+ )
+ save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
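+
+# Usage sketch for `tts_with_vc_to_file` (paths are illustrative assumptions): synthesize with
+# a single-speaker model, then convert the result toward the voice in `speaker_wav` using the
+# FreeVC converter that `tts_with_vc` loads on demand.
+#
+#   from TTS.api import TTS
+#   tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+#   tts.tts_with_vc_to_file(
+#       text="Hello world.",
+#       speaker_wav="target_speaker.wav",
+#       file_path="cloned_via_vc.wav",
+#   )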
diff --git a/submodules/TTS/TTS/bin/__init__.py b/submodules/TTS/TTS/bin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/bin/collect_env_info.py b/submodules/TTS/TTS/bin/collect_env_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..662fcd02ece0fad387b6bfc4bad9316c7e2a0bad
--- /dev/null
+++ b/submodules/TTS/TTS/bin/collect_env_info.py
@@ -0,0 +1,48 @@
+"""Get detailed info about the working environment."""
+import os
+import platform
+import sys
+
+import numpy
+import torch
+
+sys.path += [os.path.abspath(".."), os.path.abspath(".")]
+import json
+
+import TTS
+
+
+def system_info():
+ return {
+ "OS": platform.system(),
+ "architecture": platform.architecture(),
+ "version": platform.version(),
+ "processor": platform.processor(),
+ "python": platform.python_version(),
+ }
+
+
+def cuda_info():
+ return {
+ "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
+ "available": torch.cuda.is_available(),
+ "version": torch.version.cuda,
+ }
+
+
+def package_info():
+ return {
+ "numpy": numpy.__version__,
+ "PyTorch_version": torch.__version__,
+ "PyTorch_debug": torch.version.debug,
+ "TTS": TTS.__version__,
+ }
+
+
+def main():
+ details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
+ print(json.dumps(details, indent=4, sort_keys=True))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/submodules/TTS/TTS/bin/compute_attention_masks.py b/submodules/TTS/TTS/bin/compute_attention_masks.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ab520be7d9f41ecf4f124446400b5e1b597ae8b
--- /dev/null
+++ b/submodules/TTS/TTS/bin/compute_attention_masks.py
@@ -0,0 +1,165 @@
+import argparse
+import importlib
+import os
+from argparse import RawTextHelpFormatter
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets.TTSDataset import TTSDataset
+from TTS.tts.models import setup_model
+from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_checkpoint
+
+if __name__ == "__main__":
+ # pylint: disable=bad-option-value
+ parser = argparse.ArgumentParser(
+ description="""Extract attention masks from trained Tacotron/Tacotron2 models.
+These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
+ """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
+(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
+ """
+Example run:
+ CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
+ --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
+ --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
+ --dataset_metafile metadata.csv
+ --data_path /root/LJSpeech-1.1/
+ --batch_size 32
+ --dataset ljspeech
+ --use_cuda True
+""",
+ formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
+ parser.add_argument(
+ "--config_path",
+ type=str,
+ required=True,
+ help="Path to Tacotron/Tacotron2 config file.",
+ )
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ default="",
+ required=True,
+ help="Target dataset processor name from TTS.tts.dataset.preprocess.",
+ )
+
+ parser.add_argument(
+ "--dataset_metafile",
+ type=str,
+ default="",
+ required=True,
+ help="Dataset metafile including file paths with transcripts.",
+ )
+ parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
+ parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+
+ parser.add_argument(
+ "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
+ )
+ args = parser.parse_args()
+
+ C = load_config(args.config_path)
+ ap = AudioProcessor(**C.audio)
+
+ # if the vocabulary was passed, replace the default
+ if "characters" in C.keys():
+ symbols, phonemes = make_symbols(**C.characters)
+
+ # load the model
+ num_chars = len(phonemes) if C.use_phonemes else len(symbols)
+ # TODO: handle multi-speaker
+ model = setup_model(C)
+ model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
+
+ # data loader
+ preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
+ preprocessor = getattr(preprocessor, args.dataset)
+ meta_data = preprocessor(args.data_path, args.dataset_metafile)
+ dataset = TTSDataset(
+ model.decoder.r,
+ C.text_cleaner,
+ compute_linear_spec=False,
+ ap=ap,
+ meta_data=meta_data,
+ characters=C.characters if "characters" in C.keys() else None,
+ add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
+ use_phonemes=C.use_phonemes,
+ phoneme_cache_path=C.phoneme_cache_path,
+ phoneme_language=C.phoneme_language,
+ enable_eos_bos=C.enable_eos_bos_chars,
+ )
+
+ dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
+ loader = DataLoader(
+ dataset,
+ batch_size=args.batch_size,
+ num_workers=4,
+ collate_fn=dataset.collate_fn,
+ shuffle=False,
+ drop_last=False,
+ )
+
+ # compute attentions
+ file_paths = []
+ with torch.no_grad():
+ for data in tqdm(loader):
+ # setup input data
+ text_input = data[0]
+ text_lengths = data[1]
+ linear_input = data[3]
+ mel_input = data[4]
+ mel_lengths = data[5]
+ stop_targets = data[6]
+ item_idxs = data[7]
+
+ # dispatch data to GPU
+ if args.use_cuda:
+ text_input = text_input.cuda()
+ text_lengths = text_lengths.cuda()
+ mel_input = mel_input.cuda()
+ mel_lengths = mel_lengths.cuda()
+
+ model_outputs = model.forward(text_input, text_lengths, mel_input)
+
+ alignments = model_outputs["alignments"].detach()
+ for idx, alignment in enumerate(alignments):
+ item_idx = item_idxs[idx]
+ # interpolate if r > 1
+ alignment = (
+ torch.nn.functional.interpolate(
+ alignment.transpose(0, 1).unsqueeze(0),
+ size=None,
+ scale_factor=model.decoder.r,
+ mode="nearest",
+ align_corners=None,
+ recompute_scale_factor=None,
+ )
+ .squeeze(0)
+ .transpose(0, 1)
+ )
+ # remove paddings
+ alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
+ # set file paths
+ wav_file_name = os.path.basename(item_idx)
+ align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
+ file_path = item_idx.replace(wav_file_name, align_file_name)
+ # save output
+ wav_file_abs_path = os.path.abspath(item_idx)
+ file_abs_path = os.path.abspath(file_path)
+ file_paths.append([wav_file_abs_path, file_abs_path])
+ np.save(file_path, alignment)
+
+ # output metafile
+ metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
+
+ with open(metafile, "w", encoding="utf-8") as f:
+ for p in file_paths:
+ f.write(f"{p[0]}|{p[1]}\n")
+ print(f" >> Metafile created: {metafile}")
diff --git a/submodules/TTS/TTS/bin/compute_embeddings.py b/submodules/TTS/TTS/bin/compute_embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b5a37df736fd75c8228ceefd818c6ec4a63867f
--- /dev/null
+++ b/submodules/TTS/TTS/bin/compute_embeddings.py
@@ -0,0 +1,197 @@
+import argparse
+import os
+from argparse import RawTextHelpFormatter
+
+import torch
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.managers import save_file
+from TTS.tts.utils.speakers import SpeakerManager
+
+
+def compute_embeddings(
+ model_path,
+ config_path,
+ output_path,
+ old_speakers_file=None,
+ old_append=False,
+ config_dataset_path=None,
+ formatter_name=None,
+ dataset_name=None,
+ dataset_path=None,
+ meta_file_train=None,
+ meta_file_val=None,
+ disable_cuda=False,
+ no_eval=False,
+):
+ use_cuda = torch.cuda.is_available() and not disable_cuda
+
+ if config_dataset_path is not None:
+ c_dataset = load_config(config_dataset_path)
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
+ else:
+ c_dataset = BaseDatasetConfig()
+ c_dataset.formatter = formatter_name
+ c_dataset.dataset_name = dataset_name
+ c_dataset.path = dataset_path
+ if meta_file_train is not None:
+ c_dataset.meta_file_train = meta_file_train
+ if meta_file_val is not None:
+ c_dataset.meta_file_val = meta_file_val
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
+
+ if meta_data_eval is None:
+ samples = meta_data_train
+ else:
+ samples = meta_data_train + meta_data_eval
+
+ encoder_manager = SpeakerManager(
+ encoder_model_path=model_path,
+ encoder_config_path=config_path,
+ d_vectors_file_path=old_speakers_file,
+ use_cuda=use_cuda,
+ )
+
+ class_name_key = encoder_manager.encoder_config.class_name_key
+
+ # compute speaker embeddings
+ if old_speakers_file is not None and old_append:
+ speaker_mapping = encoder_manager.embeddings
+ else:
+ speaker_mapping = {}
+
+ for fields in tqdm(samples):
+ class_name = fields[class_name_key]
+ audio_file = fields["audio_file"]
+ embedding_key = fields["audio_unique_name"]
+
+ # Only update the speaker name when the embedding is already in the old file.
+ if embedding_key in speaker_mapping:
+ speaker_mapping[embedding_key]["name"] = class_name
+ continue
+
+ if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
+ # get the embedding from the old file
+ embedd = encoder_manager.get_embedding_by_clip(embedding_key)
+ else:
+ # extract the embedding
+ embedd = encoder_manager.compute_embedding_from_clip(audio_file)
+
+ # create speaker_mapping if target dataset is defined
+ speaker_mapping[embedding_key] = {}
+ speaker_mapping[embedding_key]["name"] = class_name
+ speaker_mapping[embedding_key]["embedding"] = embedd
+
+ if speaker_mapping:
+ # save speaker_mapping if target dataset is defined
+ if os.path.isdir(output_path):
+ mapping_file_path = os.path.join(output_path, "speakers.pth")
+ else:
+ mapping_file_path = output_path
+
+ if os.path.dirname(mapping_file_path) != "":
+ os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
+
+ save_file(speaker_mapping, mapping_file_path)
+ print("Speaker embeddings saved at:", mapping_file_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
+ """
+ Example runs:
+ python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
+
+ python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
+ """,
+ formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+ default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+ )
+ parser.add_argument(
+ "--config_path",
+ type=str,
+ help="Path to model config file. It defaults to the released speaker encoder config.",
+ default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+ )
+ parser.add_argument(
+ "--config_dataset_path",
+ type=str,
+ help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+ default=None,
+ )
+ parser.add_argument(
+ "--output_path",
+ type=str,
+ help="Path for output `pth` or `json` file.",
+ default="speakers.pth",
+ )
+ parser.add_argument(
+ "--old_file",
+ type=str,
+ help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
+ default=None,
+ )
+ parser.add_argument(
+ "--old_append",
+ help="Append new audio clip embeddings to the old embedding file and generate a new, de-duplicated merged embedding file. Defaults to False.",
+ default=False,
+ action="store_true",
+ )
+ parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+ parser.add_argument("--no_eval", help="Do not compute the eval split. Defaults to False.", default=False, action="store_true")
+ parser.add_argument(
+ "--formatter_name",
+ type=str,
+ help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+ default=None,
+ )
+ parser.add_argument(
+ "--dataset_name",
+ type=str,
+ help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+ default=None,
+ )
+ parser.add_argument(
+ "--dataset_path",
+ type=str,
+ help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+ default=None,
+ )
+ parser.add_argument(
+ "--meta_file_train",
+ type=str,
+ help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+ default=None,
+ )
+ parser.add_argument(
+ "--meta_file_val",
+ type=str,
+ help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+ default=None,
+ )
+ args = parser.parse_args()
+
+ compute_embeddings(
+ args.model_path,
+ args.config_path,
+ args.output_path,
+ old_speakers_file=args.old_file,
+ old_append=args.old_append,
+ config_dataset_path=args.config_dataset_path,
+ formatter_name=args.formatter_name,
+ dataset_name=args.dataset_name,
+ dataset_path=args.dataset_path,
+ meta_file_train=args.meta_file_train,
+ meta_file_val=args.meta_file_val,
+ disable_cuda=args.disable_cuda,
+ no_eval=args.no_eval,
+ )
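+
+# Programmatic usage sketch (checkpoint, config and dataset paths are illustrative assumptions;
+# the keyword arguments mirror the CLI flags parsed above):
+#
+#   from TTS.bin.compute_embeddings import compute_embeddings
+#   compute_embeddings(
+#       "speaker_encoder_model.pth",
+#       "speaker_encoder_config.json",
+#       "speakers.pth",
+#       config_dataset_path="dataset_config.json",
+#   )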
diff --git a/submodules/TTS/TTS/bin/compute_statistics.py b/submodules/TTS/TTS/bin/compute_statistics.py
new file mode 100755
index 0000000000000000000000000000000000000000..3ab7ea7a3b10ec3cc23d8a744c7bdc79de52dbf2
--- /dev/null
+++ b/submodules/TTS/TTS/bin/compute_statistics.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import glob
+import os
+
+import numpy as np
+from tqdm import tqdm
+
+# from TTS.utils.io import load_config
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+
+
+def main():
+ """Run preprocessing process."""
+ parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
+ parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
+ parser.add_argument("out_path", type=str, help="save path (directory and filename).")
+ parser.add_argument(
+ "--data_path",
+ type=str,
+ required=False,
+ help="Folder containing the target set of wavs, overriding the dataset config.",
+ )
+ args, overrides = parser.parse_known_args()
+
+ CONFIG = load_config(args.config_path)
+ CONFIG.parse_known_args(overrides, relaxed_parser=True)
+
+ # load config
+ CONFIG.audio.signal_norm = False # do not apply earlier normalization
+ CONFIG.audio.stats_path = None # discard pre-defined stats
+
+ # load audio processor
+ ap = AudioProcessor(**CONFIG.audio.to_dict())
+
+ # load the meta data of target dataset
+ if args.data_path:
+ dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
+ else:
+ dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
+ print(f" > There are {len(dataset_items)} files.")
+
+ mel_sum = 0
+ mel_square_sum = 0
+ linear_sum = 0
+ linear_square_sum = 0
+ N = 0
+ for item in tqdm(dataset_items):
+ # compute features
+ wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
+ linear = ap.spectrogram(wav)
+ mel = ap.melspectrogram(wav)
+
+ # compute stats
+ N += mel.shape[1]
+ mel_sum += mel.sum(1)
+ linear_sum += linear.sum(1)
+ mel_square_sum += (mel**2).sum(axis=1)
+ linear_square_sum += (linear**2).sum(axis=1)
+
+ mel_mean = mel_sum / N
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
+ linear_mean = linear_sum / N
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
+
+ output_file_path = args.out_path
+ stats = {}
+ stats["mel_mean"] = mel_mean
+ stats["mel_std"] = mel_scale
+ stats["linear_mean"] = linear_mean
+ stats["linear_std"] = linear_scale
+
+ print(f" > Avg mel spec mean: {mel_mean.mean()}")
+ print(f" > Avg mel spec scale: {mel_scale.mean()}")
+ print(f" > Avg linear spec mean: {linear_mean.mean()}")
+ print(f" > Avg linear spec scale: {linear_scale.mean()}")
+
+ # set default config values for mean-var scaling
+ CONFIG.audio.stats_path = output_file_path
+ CONFIG.audio.signal_norm = True
+ # remove redundant values
+ del CONFIG.audio.max_norm
+ del CONFIG.audio.min_level_db
+ del CONFIG.audio.symmetric_norm
+ del CONFIG.audio.clip_norm
+ stats["audio_config"] = CONFIG.audio.to_dict()
+ np.save(output_file_path, stats, allow_pickle=True)
+ print(f" > stats saved to {output_file_path}")
+
+
+if __name__ == "__main__":
+ main()
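+
+# Example invocation (illustrative paths). The resulting .npy stores mel/linear mean and std plus
+# the audio config, and can be referenced via `audio.stats_path` in a training config to enable
+# mean-variance normalization:
+#
+#   python TTS/bin/compute_statistics.py config.json scale_stats.npy --data_path /path/to/wavs/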
diff --git a/submodules/TTS/TTS/bin/eval_encoder.py b/submodules/TTS/TTS/bin/eval_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..60fed1393215cd5e2e349795b585ae12f2e227fa
--- /dev/null
+++ b/submodules/TTS/TTS/bin/eval_encoder.py
@@ -0,0 +1,88 @@
+import argparse
+from argparse import RawTextHelpFormatter
+
+import torch
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.speakers import SpeakerManager
+
+
+def compute_encoder_accuracy(dataset_items, encoder_manager):
+ class_name_key = encoder_manager.encoder_config.class_name_key
+ map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
+
+ class_acc_dict = {}
+
+ # compute embeddings for all wav_files
+ for item in tqdm(dataset_items):
+ class_name = item[class_name_key]
+ wav_file = item["audio_file"]
+
+ # extract the embedding
+ embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+ if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
+ embedding = torch.FloatTensor(embedd).unsqueeze(0)
+ if encoder_manager.use_cuda:
+ embedding = embedding.cuda()
+
+ class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
+ predicted_label = map_classid_to_classname[str(class_id)]
+ else:
+ predicted_label = None
+
+ if class_name is not None and predicted_label is not None:
+ is_equal = int(class_name == predicted_label)
+ if class_name not in class_acc_dict:
+ class_acc_dict[class_name] = [is_equal]
+ else:
+ class_acc_dict[class_name].append(is_equal)
+ else:
+ raise RuntimeError("Error: class_name and/or predicted_label are None")
+
+ acc_avg = 0
+ for key, values in class_acc_dict.items():
+ acc = sum(values) / len(values)
+ print("Class", key, "Accuracy:", acc)
+ acc_avg += acc
+
+ print("Average Accuracy:", acc_avg / len(class_acc_dict))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""Compute the accuracy of the encoder.\n\n"""
+ """
+ Example runs:
+ python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
+ """,
+ formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
+ parser.add_argument(
+ "config_path",
+ type=str,
+ help="Path to model config file.",
+ )
+
+ parser.add_argument(
+ "config_dataset_path",
+ type=str,
+ help="Path to dataset config file.",
+ )
+ parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+
+ args = parser.parse_args()
+
+ c_dataset = load_config(args.config_dataset_path)
+
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
+ items = meta_data_train + meta_data_eval
+
+ enc_manager = SpeakerManager(
+ encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+ )
+
+ compute_encoder_accuracy(items, enc_manager)
diff --git a/submodules/TTS/TTS/bin/extract_tts_spectrograms.py b/submodules/TTS/TTS/bin/extract_tts_spectrograms.py
new file mode 100755
index 0000000000000000000000000000000000000000..c6048626b3cb89daee37b42f757a4ba1e8b7843d
--- /dev/null
+++ b/submodules/TTS/TTS/bin/extract_tts_spectrograms.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""Extract Mel spectrograms with teacher forcing."""
+
+import argparse
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import TTSDataset, load_tts_samples
+from TTS.tts.models import setup_model
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import quantize
+from TTS.utils.generic_utils import count_parameters
+
+use_cuda = torch.cuda.is_available()
+
+
+def setup_loader(ap, r, verbose=False):
+ tokenizer, _ = TTSTokenizer.init_from_config(c)
+ dataset = TTSDataset(
+ outputs_per_step=r,
+ compute_linear_spec=False,
+ samples=meta_data,
+ tokenizer=tokenizer,
+ ap=ap,
+ batch_group_size=0,
+ min_text_len=c.min_text_len,
+ max_text_len=c.max_text_len,
+ min_audio_len=c.min_audio_len,
+ max_audio_len=c.max_audio_len,
+ phoneme_cache_path=c.phoneme_cache_path,
+ precompute_num_workers=0,
+ use_noise_augment=False,
+ verbose=verbose,
+ speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
+ d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
+ )
+
+ if c.use_phonemes and c.compute_input_seq_cache:
+ # precompute phonemes to have a better estimate of sequence lengths.
+ dataset.compute_input_seq(c.num_loader_workers)
+ dataset.preprocess_samples()
+
+ loader = DataLoader(
+ dataset,
+ batch_size=c.batch_size,
+ shuffle=False,
+ collate_fn=dataset.collate_fn,
+ drop_last=False,
+ sampler=None,
+ num_workers=c.num_loader_workers,
+ pin_memory=False,
+ )
+ return loader
+
+
+def set_filename(wav_path, out_path):
+ wav_file = os.path.basename(wav_path)
+ file_name = wav_file.split(".")[0]
+ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
+ os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
+ os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
+ os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
+ wavq_path = os.path.join(out_path, "quant", file_name)
+ mel_path = os.path.join(out_path, "mel", file_name)
+ wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
+ wav_path = os.path.join(out_path, "wav", file_name + ".wav")
+ return file_name, wavq_path, mel_path, wav_gl_path, wav_path
+
+
+def format_data(data):
+ # setup input data
+ text_input = data["token_id"]
+ text_lengths = data["token_id_lengths"]
+ mel_input = data["mel"]
+ mel_lengths = data["mel_lengths"]
+ item_idx = data["item_idxs"]
+ d_vectors = data["d_vectors"]
+ speaker_ids = data["speaker_ids"]
+ attn_mask = data["attns"]
+ avg_text_length = torch.mean(text_lengths.float())
+ avg_spec_length = torch.mean(mel_lengths.float())
+
+ # dispatch data to GPU
+ if use_cuda:
+ text_input = text_input.cuda(non_blocking=True)
+ text_lengths = text_lengths.cuda(non_blocking=True)
+ mel_input = mel_input.cuda(non_blocking=True)
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
+ if speaker_ids is not None:
+ speaker_ids = speaker_ids.cuda(non_blocking=True)
+ if d_vectors is not None:
+ d_vectors = d_vectors.cuda(non_blocking=True)
+ if attn_mask is not None:
+ attn_mask = attn_mask.cuda(non_blocking=True)
+ return (
+ text_input,
+ text_lengths,
+ mel_input,
+ mel_lengths,
+ speaker_ids,
+ d_vectors,
+ avg_text_length,
+ avg_spec_length,
+ attn_mask,
+ item_idx,
+ )
+
+
+@torch.no_grad()
+def inference(
+ model_name,
+ model,
+ ap,
+ text_input,
+ text_lengths,
+ mel_input,
+ mel_lengths,
+ speaker_ids=None,
+ d_vectors=None,
+):
+ if model_name == "glow_tts":
+ speaker_c = None
+ if speaker_ids is not None:
+ speaker_c = speaker_ids
+ elif d_vectors is not None:
+ speaker_c = d_vectors
+ outputs = model.inference_with_MAS(
+ text_input,
+ text_lengths,
+ mel_input,
+ mel_lengths,
+ aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
+ )
+ model_output = outputs["model_outputs"]
+ model_output = model_output.detach().cpu().numpy()
+
+ elif "tacotron" in model_name:
+ aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
+ outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
+ postnet_outputs = outputs["model_outputs"]
+ # normalize tacotron output
+ if model_name == "tacotron":
+ mel_specs = []
+ postnet_outputs = postnet_outputs.data.cpu().numpy()
+ for b in range(postnet_outputs.shape[0]):
+ postnet_output = postnet_outputs[b]
+ mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
+ model_output = torch.stack(mel_specs).cpu().numpy()
+
+ elif model_name == "tacotron2":
+ model_output = postnet_outputs.detach().cpu().numpy()
+ return model_output
+
+
+def extract_spectrograms(
+ data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
+):
+ model.eval()
+ export_metadata = []
+ for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
+ # format data
+ (
+ text_input,
+ text_lengths,
+ mel_input,
+ mel_lengths,
+ speaker_ids,
+ d_vectors,
+ _,
+ _,
+ _,
+ item_idx,
+ ) = format_data(data)
+
+ model_output = inference(
+ c.model.lower(),
+ model,
+ ap,
+ text_input,
+ text_lengths,
+ mel_input,
+ mel_lengths,
+ speaker_ids,
+ d_vectors,
+ )
+
+ for idx in range(text_input.shape[0]):
+ wav_file_path = item_idx[idx]
+ wav = ap.load_wav(wav_file_path)
+ _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
+
+ # quantize and save wav
+ if quantize_bits > 0:
+ wavq = quantize(wav, quantize_bits)
+ np.save(wavq_path, wavq)
+
+ # save TTS mel
+ mel = model_output[idx]
+ mel_length = mel_lengths[idx]
+ mel = mel[:mel_length, :].T
+ np.save(mel_path, mel)
+
+ export_metadata.append([wav_file_path, mel_path])
+ if save_audio:
+ ap.save_wav(wav, wav_path)
+
+ if debug:
+ print("Audio for debug saved at:", wav_gl_path)
+ wav = ap.inv_melspectrogram(mel)
+ ap.save_wav(wav, wav_gl_path)
+
+ with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
+ for data in export_metadata:
+ f.write(f"{data[0]}|{data[1]+'.npy'}\n")
+
+
+def main(args): # pylint: disable=redefined-outer-name
+ # pylint: disable=global-variable-undefined
+ global meta_data, speaker_manager
+
+ # Audio processor
+ ap = AudioProcessor(**c.audio)
+
+ # load data instances
+ meta_data_train, meta_data_eval = load_tts_samples(
+ c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+ )
+
+ # use eval and training partitions
+ meta_data = meta_data_train + meta_data_eval
+
+ # init speaker manager
+ if c.use_speaker_embedding:
+ speaker_manager = SpeakerManager(data_items=meta_data)
+ elif c.use_d_vector_file:
+ speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
+ else:
+ speaker_manager = None
+
+ # setup model
+ model = setup_model(c)
+
+ # restore model
+ model.load_checkpoint(c, args.checkpoint_path, eval=True)
+
+ if use_cuda:
+ model.cuda()
+
+ num_params = count_parameters(model)
+ print("\n > Model has {} parameters".format(num_params), flush=True)
+ # set r
+ r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
+ own_loader = setup_loader(ap, r, verbose=True)
+
+ extract_spectrograms(
+ own_loader,
+ model,
+ ap,
+ args.output_path,
+ quantize_bits=args.quantize_bits,
+ save_audio=args.save_audio,
+ debug=args.debug,
+ metada_name="metada.txt",
+ )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
+ parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
+ parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
+ parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
+ parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
+ parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ args = parser.parse_args()
+
+ c = load_config(args.config_path)
+ c.audio.trim_silence = False
+ main(args)
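+
+# Example invocation (illustrative paths), extracting teacher-forced mel spectrograms and the
+# accompanying "metada.txt" metafile into --output_path:
+#
+#   python TTS/bin/extract_tts_spectrograms.py --config_path config.json \
+#       --checkpoint_path checkpoint.pth --output_path /path/to/specs/ --save_audio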
diff --git a/submodules/TTS/TTS/bin/find_unique_chars.py b/submodules/TTS/TTS/bin/find_unique_chars.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea16974839df6cf9942ef24a5535597940fde5b2
--- /dev/null
+++ b/submodules/TTS/TTS/bin/find_unique_chars.py
@@ -0,0 +1,45 @@
+"""Find all the unique characters in a dataset"""
+import argparse
+from argparse import RawTextHelpFormatter
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+
+
+def main():
+ # pylint: disable=bad-option-value
+ parser = argparse.ArgumentParser(
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+ """
+ Example runs:
+
+ python TTS/bin/find_unique_chars.py --config_path config.json
+ """,
+ formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+ args = parser.parse_args()
+
+ c = load_config(args.config_path)
+
+ # load all datasets
+ train_items, eval_items = load_tts_samples(
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+ )
+
+ items = train_items + eval_items
+
+ texts = "".join(item["text"] for item in items)
+ chars = set(texts)
+ lower_chars = filter(lambda c: c.islower(), chars)
+ chars_force_lower = [c.lower() for c in chars]
+ chars_force_lower = set(chars_force_lower)
+
+ print(f" > Number of unique characters: {len(chars)}")
+ print(f" > Unique characters: {''.join(sorted(chars))}")
+ print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+ print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/submodules/TTS/TTS/bin/find_unique_phonemes.py b/submodules/TTS/TTS/bin/find_unique_phonemes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bd7a78eef2c4850bca9369def55d68336cd53aa
--- /dev/null
+++ b/submodules/TTS/TTS/bin/find_unique_phonemes.py
@@ -0,0 +1,74 @@
+"""Find all the unique phonemes in a dataset"""
+import argparse
+import multiprocessing
+from argparse import RawTextHelpFormatter
+
+from tqdm.contrib.concurrent import process_map
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.text.phonemizers import Gruut
+
+
+def compute_phonemes(item):
+ text = item["text"]
+ ph = phonemizer.phonemize(text).replace("|", "")
+ return set(list(ph))
+
+
+def main():
+ # pylint: disable=W0601
+ global c, phonemizer
+ # pylint: disable=bad-option-value
+ parser = argparse.ArgumentParser(
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+ """
+ Example runs:
+
+ python TTS/bin/find_unique_phonemes.py --config_path config.json
+ """,
+ formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+ args = parser.parse_args()
+
+ c = load_config(args.config_path)
+
+ # load all datasets
+ train_items, eval_items = load_tts_samples(
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+ )
+ items = train_items + eval_items
+ print("Num items:", len(items))
+
+ language_list = [item["language"] for item in items]
+ is_lang_def = all(language_list)
+
+ if not c.phoneme_language or not is_lang_def:
+ raise ValueError("Phoneme language must be defined in config.")
+
+ if not language_list.count(language_list[0]) == len(language_list):
+ raise ValueError(
+ "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
+ )
+
+ phonemizer = Gruut(language=language_list[0], keep_puncs=True)
+
+ phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
+ phones = []
+ for ph in phonemes:
+ phones.extend(ph)
+
+ phones = set(phones)
+ lower_phones = filter(lambda c: c.islower(), phones)
+ phones_force_lower = [c.lower() for c in phones]
+ phones_force_lower = set(phones_force_lower)
+
+ print(f" > Number of unique phonemes: {len(phones)}")
+ print(f" > Unique phonemes: {''.join(sorted(phones))}")
+ print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
+ print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/submodules/TTS/TTS/bin/remove_silence_using_vad.py b/submodules/TTS/TTS/bin/remove_silence_using_vad.py
new file mode 100755
index 0000000000000000000000000000000000000000..a1eaf4c9a713e2e72a9e8434397ac430ff10aef1
--- /dev/null
+++ b/submodules/TTS/TTS/bin/remove_silence_using_vad.py
@@ -0,0 +1,124 @@
+import argparse
+import glob
+import multiprocessing
+import os
+import pathlib
+
+import torch
+from tqdm import tqdm
+
+from TTS.utils.vad import get_vad_model_and_utils, remove_silence
+
+torch.set_num_threads(1)
+
+
+def adjust_path_and_remove_silence(audio_path):
+ output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
+ # ignore if the file exists
+ if os.path.exists(output_path) and not args.force:
+ return output_path, False
+
+ # create all directory structure
+ pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+ # remove the silence and save the audio
+ output_path, is_speech = remove_silence(
+ model_and_utils,
+ audio_path,
+ output_path,
+ trim_just_beginning_and_end=args.trim_just_beginning_and_end,
+ use_cuda=args.use_cuda,
+ )
+ return output_path, is_speech
+
+
+def preprocess_audios():
+ files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
+ print("> Number of files: ", len(files))
+ if not args.force:
+ print("> Ignoring files that already exist in the output directory.")
+
+ if args.trim_just_beginning_and_end:
+ print("> Trimming only the beginning and end nonspeech parts.")
+ else:
+ print("> Trimming all nonspeech parts.")
+
+ filtered_files = []
+ if files:
+ # create threads
+ # num_threads = multiprocessing.cpu_count()
+ # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
+
+ if args.num_processes > 1:
+ with multiprocessing.Pool(processes=args.num_processes) as pool:
+ results = list(
+ tqdm(
+ pool.imap_unordered(adjust_path_and_remove_silence, files),
+ total=len(files),
+ desc="Processing audio files",
+ )
+ )
+ for output_path, is_speech in results:
+ if not is_speech:
+ filtered_files.append(output_path)
+ else:
+ for f in tqdm(files):
+ output_path, is_speech = adjust_path_and_remove_silence(f)
+ if not is_speech:
+ filtered_files.append(output_path)
+
+ # write files that do not have speech
+ with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
+ for file in filtered_files:
+ f.write(str(file) + "\n")
+ else:
+ print("> No files Found !")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+ )
+ parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+ parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
+ parser.add_argument("-f", "--force", default=False, action="store_true", help="Force overwriting existing files")
+ parser.add_argument(
+ "-g",
+ "--glob",
+ type=str,
+ default="**/*.wav",
+ help="Path in glob format for accessing wavs from input_dir, e.g. wav48/*/*.wav",
+ )
+ parser.add_argument(
+ "-t",
+ "--trim_just_beginning_and_end",
+ type=bool,
+ default=True,
+ help="If True, this script will trim just the beginning and end nonspeech parts. If False, all nonspeech parts will be trimmed. Defaults to True.",
+ )
+ parser.add_argument(
+ "-c",
+ "--use_cuda",
+ type=bool,
+ default=False,
+ help="If True use cuda",
+ )
+ parser.add_argument(
+ "--use_onnx",
+ type=bool,
+ default=False,
+ help="If True use onnx",
+ )
+ parser.add_argument(
+ "--num_processes",
+ type=int,
+ default=1,
+ help="Number of processes to use",
+ )
+ args = parser.parse_args()
+
+ if args.output_dir == "":
+ args.output_dir = args.input_dir
+
+ # load the model and utils
+ model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
+ preprocess_audios()
diff --git a/submodules/TTS/TTS/bin/resample.py b/submodules/TTS/TTS/bin/resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3f28485d1fb235ab0d521ee30318c64b48fbd5a
--- /dev/null
+++ b/submodules/TTS/TTS/bin/resample.py
@@ -0,0 +1,90 @@
+import argparse
+import glob
+import os
+from argparse import RawTextHelpFormatter
+from multiprocessing import Pool
+from shutil import copytree
+
+import librosa
+import soundfile as sf
+from tqdm import tqdm
+
+
+def resample_file(func_args):
+ filename, output_sr = func_args
+ y, sr = librosa.load(filename, sr=output_sr)
+ sf.write(filename, y, sr)
+
+
+def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
+ if output_dir:
+ print("Recursively copying the input folder...")
+ copytree(input_dir, output_dir)
+ input_dir = output_dir
+
+ print("Resampling the audio files...")
+ audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
+ print(f"Found {len(audio_files)} files...")
+ audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
+ with Pool(processes=n_jobs) as p:
+ with tqdm(total=len(audio_files)) as pbar:
+ for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
+ pbar.update()
+
+ print("Done !")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""Resample a folder recursively with librosa.
+ It can be used in place or create a copy of the folder as an output.\n\n
+ Example run:
+ python TTS/bin/resample.py
+ --input_dir /root/LJSpeech-1.1/
+ --output_sr 22050
+ --output_dir /root/resampled_LJSpeech-1.1/
+ --file_ext wav
+ --n_jobs 24
+ """,
+ formatter_class=RawTextHelpFormatter,
+ )
+
+ parser.add_argument(
+ "--input_dir",
+ type=str,
+ default=None,
+ required=True,
+ help="Path of the folder containing the audio files to resample",
+ )
+
+ parser.add_argument(
+ "--output_sr",
+ type=int,
+ default=22050,
+ required=False,
+ help="Sample rate to which the audio files should be resampled",
+ )
+
+ parser.add_argument(
+ "--output_dir",
+ type=str,
+ default=None,
+ required=False,
+ help="Path of the destination folder. If not defined, the operation is done in place",
+ )
+
+ parser.add_argument(
+ "--file_ext",
+ type=str,
+ default="wav",
+ required=False,
+ help="Extension of the audio files to resample",
+ )
+
+ parser.add_argument(
+ "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
+ )
+
+ args = parser.parse_args()
+
+ resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
diff --git a/submodules/TTS/TTS/bin/synthesize.py b/submodules/TTS/TTS/bin/synthesize.py
new file mode 100755
index 0000000000000000000000000000000000000000..b86252ab676bcc1acfab1f6616153ea16a4528e6
--- /dev/null
+++ b/submodules/TTS/TTS/bin/synthesize.py
@@ -0,0 +1,494 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import contextlib
+import sys
+from argparse import RawTextHelpFormatter
+
+# pylint: disable=redefined-outer-name, unused-argument
+from pathlib import Path
+
+description = """
+Synthesize speech on command line.
+
+You can either use your trained model or choose a model from the provided list.
+
+If you don't specify any models, then it uses the LJSpeech-based English model.
+
+#### Single Speaker Models
+
+- List provided models:
+
+ ```
+ $ tts --list_models
+ ```
+
+- Get model info (for both tts_models and vocoder_models):
+
+ - Query by type/name:
+ The model_info_by_name uses the name as it appears in the output of --list_models.
+ ```
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+ ```
+ For example:
+ ```
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
+ ```
+ - Query by type/idx:
+ The model_query_idx uses the corresponding idx from --list_models.
+
+ ```
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
+ ```
+
+ For example:
+
+ ```
+ $ tts --model_info_by_idx tts_models/3
+ ```
+
+ - Query info for model info by full name:
+ ```
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+ ```
+
+- Run TTS with default models:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
+ ```
+
+- Run TTS and pipe out the generated TTS wav file data:
+
+ ```
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+ ```
+
+- Run a TTS model with its default vocoder model:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+ ```
+
+ For example:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
+ ```
+
+- Run with specific TTS and vocoder models from the list:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+ ```
+
+ For example:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
+ ```
+
+- Run your own TTS model (Using Griffin-Lim Vocoder):
+
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ ```
+
+- Run your own TTS and Vocoder models:
+
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
+ ```
+
+#### Multi-speaker Models
+
+- List the available speakers and choose a <speaker_id> among them:
+
+ ```
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+ ```
+
+- Run the multi-speaker TTS model with the target speaker ID:
+
+ ```
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+ ```
+
+- Run your own multi-speaker TTS model:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+ ```
+
+### Voice Conversion Models
+
+```
+$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
+```
+"""
+
+
+def str2bool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ if v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description=description.replace(" ```\n", ""),
+ formatter_class=RawTextHelpFormatter,
+ )
+
+ parser.add_argument(
+ "--list_models",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ help="list available pre-trained TTS and vocoder models.",
+ )
+
+ parser.add_argument(
+ "--model_info_by_idx",
+ type=str,
+ default=None,
+ help="model info using query format: <model_type>/<model_query_idx>",
+ )
+
+ parser.add_argument(
+ "--model_info_by_name",
+ type=str,
+ default=None,
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
+ )
+
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
+
+ # Args for running pre-trained TTS models.
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ default="tts_models/en/ljspeech/tacotron2-DDC",
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
+ )
+ parser.add_argument(
+ "--vocoder_name",
+ type=str,
+ default=None,
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
+ )
+
+ # Args for running custom models
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ default=None,
+ help="Path to model file.",
+ )
+ parser.add_argument(
+ "--out_path",
+ type=str,
+ default="tts_output.wav",
+ help="Output wav file path.",
+ )
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+ parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
+ parser.add_argument(
+ "--vocoder_path",
+ type=str,
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+ default=None,
+ )
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+ parser.add_argument(
+ "--encoder_path",
+ type=str,
+ help="Path to speaker encoder model file.",
+ default=None,
+ )
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
+ parser.add_argument(
+ "--pipe_out",
+ help="Write the generated TTS wav to stdout for shell piping.",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ )
+
+ # args for multi-speaker synthesis
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
+ parser.add_argument(
+ "--speaker_idx",
+ type=str,
+ help="Target speaker ID for a multi-speaker TTS model.",
+ default=None,
+ )
+ parser.add_argument(
+ "--language_idx",
+ type=str,
+ help="Target language ID for a multi-lingual TTS model.",
+ default=None,
+ )
+ parser.add_argument(
+ "--speaker_wav",
+ nargs="+",
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vector is computed as their average.",
+ default=None,
+ )
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
+ parser.add_argument(
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+ )
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
+ parser.add_argument(
+ "--list_speaker_idxs",
+ help="List available speaker ids for the defined multi-speaker model.",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ )
+ parser.add_argument(
+ "--list_language_idxs",
+ help="List available language ids for the defined multi-lingual model.",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ )
+ # aux args
+ parser.add_argument(
+ "--save_spectogram",
+ type=bool,
+ help="If true, save the raw spectrogram for further (vocoder) processing in out_path.",
+ default=False,
+ )
+ parser.add_argument(
+ "--reference_wav",
+ type=str,
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
+ default=None,
+ )
+ parser.add_argument(
+ "--reference_speaker_idx",
+ type=str,
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
+ default=None,
+ )
+ parser.add_argument(
+ "--progress_bar",
+ type=str2bool,
+ help="If true shows a progress bar for the model download. Defaults to True",
+ default=True,
+ )
+
+ # voice conversion args
+ parser.add_argument(
+ "--source_wav",
+ type=str,
+ default=None,
+ help="Original audio file to convert in the voice of the target_wav",
+ )
+ parser.add_argument(
+ "--target_wav",
+ type=str,
+ default=None,
+ help="Reference audio file providing the target voice for the conversion of source_wav",
+ )
+
+ parser.add_argument(
+ "--voice_dir",
+ type=str,
+ default=None,
+ help="Voice dir for tortoise model",
+ )
+
+ args = parser.parse_args()
+
+ # print the description if either text or list_models is not set
+ check_args = [
+ args.text,
+ args.list_models,
+ args.list_speaker_idxs,
+ args.list_language_idxs,
+ args.reference_wav,
+ args.model_info_by_idx,
+ args.model_info_by_name,
+ args.source_wav,
+ args.target_wav,
+ ]
+ if not any(check_args):
+ parser.parse_args(["-h"])
+
+ pipe_out = sys.stdout if args.pipe_out else None
+
+ with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
+ # Late-import to make things load faster
+ from TTS.api import TTS
+ from TTS.utils.manage import ModelManager
+ from TTS.utils.synthesizer import Synthesizer
+
+ # load model manager
+ path = Path(__file__).parent / "../.models.json"
+ manager = ModelManager(path, progress_bar=args.progress_bar)
+ api = TTS()
+
+ tts_path = None
+ tts_config_path = None
+ speakers_file_path = None
+ language_ids_file_path = None
+ vocoder_path = None
+ vocoder_config_path = None
+ encoder_path = None
+ encoder_config_path = None
+ vc_path = None
+ vc_config_path = None
+ model_dir = None
+
+ # CASE1 #list : list pre-trained TTS models
+ if args.list_models:
+ manager.list_models()
+ sys.exit()
+
+ # CASE2 #info : model info for pre-trained TTS models
+ if args.model_info_by_idx:
+ model_query = args.model_info_by_idx
+ manager.model_info_by_idx(model_query)
+ sys.exit()
+
+ if args.model_info_by_name:
+ model_query_full_name = args.model_info_by_name
+ manager.model_info_by_full_name(model_query_full_name)
+ sys.exit()
+
+ # CASE3: load pre-trained model paths
+ if args.model_name is not None and not args.model_path:
+ model_path, config_path, model_item = manager.download_model(args.model_name)
+ # tts model
+ if model_item["model_type"] == "tts_models":
+ tts_path = model_path
+ tts_config_path = config_path
+ if "default_vocoder" in model_item:
+ args.vocoder_name = (
+ model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+ )
+
+ # voice conversion model
+ if model_item["model_type"] == "voice_conversion_models":
+ vc_path = model_path
+ vc_config_path = config_path
+
+ # tts model with multiple files to be loaded from the directory path
+ if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
+ model_dir = model_path
+ tts_path = None
+ tts_config_path = None
+ args.vocoder_name = None
+
+ # load vocoder
+ if args.vocoder_name is not None and not args.vocoder_path:
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+
+ # CASE4: set custom model paths
+ if args.model_path is not None:
+ tts_path = args.model_path
+ tts_config_path = args.config_path
+ speakers_file_path = args.speakers_file_path
+ language_ids_file_path = args.language_ids_file_path
+
+ if args.vocoder_path is not None:
+ vocoder_path = args.vocoder_path
+ vocoder_config_path = args.vocoder_config_path
+
+ if args.encoder_path is not None:
+ encoder_path = args.encoder_path
+ encoder_config_path = args.encoder_config_path
+
+ device = args.device
+ if args.use_cuda:
+ device = "cuda"
+
+ # load models
+ synthesizer = Synthesizer(
+ tts_path,
+ tts_config_path,
+ speakers_file_path,
+ language_ids_file_path,
+ vocoder_path,
+ vocoder_config_path,
+ encoder_path,
+ encoder_config_path,
+ vc_path,
+ vc_config_path,
+ model_dir,
+ args.voice_dir,
+ ).to(device)
+
+ # query speaker ids of a multi-speaker model.
+ if args.list_speaker_idxs:
+ print(
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
+ )
+ print(synthesizer.tts_model.speaker_manager.name_to_id)
+ return
+
+        # query language ids of a multi-lingual model.
+ if args.list_language_idxs:
+ print(
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
+ )
+ print(synthesizer.tts_model.language_manager.name_to_id)
+ return
+
+ # check the arguments against a multi-speaker model.
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+ print(
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
+ )
+ return
+
+ # RUN THE SYNTHESIS
+ if args.text:
+ print(" > Text: {}".format(args.text))
+
+ # kick it
+ if tts_path is not None:
+ wav = synthesizer.tts(
+ args.text,
+ speaker_name=args.speaker_idx,
+ language_name=args.language_idx,
+ speaker_wav=args.speaker_wav,
+ reference_wav=args.reference_wav,
+ style_wav=args.capacitron_style_wav,
+ style_text=args.capacitron_style_text,
+ reference_speaker_name=args.reference_speaker_idx,
+ )
+ elif vc_path is not None:
+ wav = synthesizer.voice_conversion(
+ source_wav=args.source_wav,
+ target_wav=args.target_wav,
+ )
+ elif model_dir is not None:
+ wav = synthesizer.tts(
+ args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
+ )
+
+ # save the results
+ print(" > Saving output to {}".format(args.out_path))
+ synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/submodules/TTS/TTS/bin/train_encoder.py b/submodules/TTS/TTS/bin/train_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32ad00f56ee730ff1abe5770c802b01246aed06
--- /dev/null
+++ b/submodules/TTS/TTS/bin/train_encoder.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import time
+import traceback
+
+import torch
+from torch.utils.data import DataLoader
+from trainer.io import copy_model_files, save_best_model, save_checkpoint
+from trainer.torch import NoamLR
+from trainer.trainer_utils import get_optimizer
+
+from TTS.encoder.dataset import EncoderDataset
+from TTS.encoder.utils.generic_utils import setup_encoder_model
+from TTS.encoder.utils.training import init_training
+from TTS.encoder.utils.visual import plot_embeddings
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
+from TTS.utils.samplers import PerfectBatchSampler
+from TTS.utils.training import check_update
+
+torch.backends.cudnn.enabled = True
+torch.backends.cudnn.benchmark = True
+torch.manual_seed(54321)
+use_cuda = torch.cuda.is_available()
+num_gpus = torch.cuda.device_count()
+print(" > Using CUDA: ", use_cuda)
+print(" > Number of GPUs: ", num_gpus)
+
+
+def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
+ num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
+ num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
+
+ dataset = EncoderDataset(
+ c,
+ ap,
+ meta_data_eval if is_val else meta_data_train,
+ voice_len=c.voice_len,
+ num_utter_per_class=num_utter_per_class,
+ num_classes_in_batch=num_classes_in_batch,
+ verbose=verbose,
+ augmentation_config=c.audio_augmentation if not is_val else None,
+ use_torch_spec=c.model_params.get("use_torch_spec", False),
+ )
+ # get classes list
+ classes = dataset.get_class_list()
+
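+    # the sampler ensures that every batch contains exactly `num_classes_in_batch` classes
+    # with `num_utter_per_class` utterances per class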
+ sampler = PerfectBatchSampler(
+ dataset.items,
+ classes,
+ batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
+ num_classes_in_batch=num_classes_in_batch,
+ num_gpus=1,
+ shuffle=not is_val,
+ drop_last=True,
+ )
+
+ if len(classes) < num_classes_in_batch:
+ if is_val:
+ raise RuntimeError(
+ f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
+ )
+ raise RuntimeError(
+ f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
+ )
+
+    # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes differ
+ if is_val:
+ dataset.set_classes(train_classes)
+
+ loader = DataLoader(
+ dataset,
+ num_workers=c.num_loader_workers,
+ batch_sampler=sampler,
+ collate_fn=dataset.collate_fn,
+ )
+
+ return loader, classes, dataset.get_map_classid_to_classname()
+
+
+def evaluation(model, criterion, data_loader, global_step):
+ eval_loss = 0
+ for _, data in enumerate(data_loader):
+ with torch.no_grad():
+ # setup input data
+ inputs, labels = data
+
+            # group the samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1] but we need [3,3,2,2,1,1]
+ labels = torch.transpose(
+ labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
+ ).reshape(labels.shape)
+ inputs = torch.transpose(
+ inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
+ ).reshape(inputs.shape)
+
+ # dispatch data to GPU
+ if use_cuda:
+ inputs = inputs.cuda(non_blocking=True)
+ labels = labels.cuda(non_blocking=True)
+
+ # forward pass model
+ outputs = model(inputs)
+
+ # loss computation
+ loss = criterion(
+ outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
+ )
+
+ eval_loss += loss.item()
+
+ eval_avg_loss = eval_loss / len(data_loader)
+ # save stats
+ dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
+ # plot the last batch in the evaluation
+ figures = {
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+ }
+ dashboard_logger.eval_figures(global_step, figures)
+ return eval_avg_loss
+
+
+def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
+ model.train()
+ best_loss = {"train_loss": None, "eval_loss": float("inf")}
+ avg_loader_time = 0
+ end_time = time.time()
+ for epoch in range(c.epochs):
+ tot_loss = 0
+ epoch_time = 0
+ for _, data in enumerate(data_loader):
+ start_time = time.time()
+
+ # setup input data
+ inputs, labels = data
+            # group the samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1] but we need [3,3,2,2,1,1]
+ labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
+ labels.shape
+ )
+ inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
+ inputs.shape
+ )
+ # ToDo: move it to a unit test
+ # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
+ # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
+ # idx = 0
+ # for j in range(0, c.num_classes_in_batch, 1):
+ # for i in range(j, len(labels), c.num_classes_in_batch):
+ # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
+ # print("Invalid")
+ # print(labels)
+ # exit()
+ # idx += 1
+ # labels = labels_converted
+ # inputs = inputs_converted
+
+ loader_time = time.time() - end_time
+ global_step += 1
+
+ # setup lr
+ if c.lr_decay:
+ scheduler.step()
+ optimizer.zero_grad()
+
+ # dispatch data to GPU
+ if use_cuda:
+ inputs = inputs.cuda(non_blocking=True)
+ labels = labels.cuda(non_blocking=True)
+
+ # forward pass model
+ outputs = model(inputs)
+
+ # loss computation
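+            # the criterion expects embeddings grouped as (num_classes_in_batch, num_utter_per_class, embedding_dim)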
+ loss = criterion(
+ outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
+ )
+ loss.backward()
+ grad_norm, _ = check_update(model, c.grad_clip)
+ optimizer.step()
+
+ step_time = time.time() - start_time
+ epoch_time += step_time
+
+            # accumulate the total epoch loss
+ tot_loss += loss.item()
+
+ # Averaged Loader Time
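+            # (a running average weighted by the number of loader workers; plain assignment on the first step)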
+ num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
+ avg_loader_time = (
+ 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
+ if avg_loader_time != 0
+ else loader_time
+ )
+ current_lr = optimizer.param_groups[0]["lr"]
+
+ if global_step % c.steps_plot_stats == 0:
+ # Plot Training Epoch Stats
+ train_stats = {
+ "loss": loss.item(),
+ "lr": current_lr,
+ "grad_norm": grad_norm,
+ "step_time": step_time,
+ "avg_loader_time": avg_loader_time,
+ }
+ dashboard_logger.train_epoch_stats(global_step, train_stats)
+ figures = {
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+ }
+ dashboard_logger.train_figures(global_step, figures)
+
+ if global_step % c.print_step == 0:
+ print(
+ " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
+ "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
+ global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
+ ),
+ flush=True,
+ )
+
+ if global_step % c.save_step == 0:
+ # save model
+ save_checkpoint(
+ c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
+ )
+
+ end_time = time.time()
+
+ print("")
+ print(
+ ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
+ "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
+ epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
+ ),
+ flush=True,
+ )
+ # evaluation
+ if c.run_eval:
+ model.eval()
+ eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
+ print("\n\n")
+ print("--> EVAL PERFORMANCE")
+ print(
+ " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
+ flush=True,
+ )
+ # save the best checkpoint
+ best_loss = save_best_model(
+ {"train_loss": None, "eval_loss": eval_loss},
+ best_loss,
+ c,
+ model,
+ optimizer,
+ None,
+ global_step,
+ epoch,
+ OUT_PATH,
+ criterion=criterion.state_dict(),
+ )
+ model.train()
+
+ return best_loss, global_step
+
+
+def main(args): # pylint: disable=redefined-outer-name
+ # pylint: disable=global-variable-undefined
+ global meta_data_train
+ global meta_data_eval
+ global train_classes
+
+ ap = AudioProcessor(**c.audio)
+ model = setup_encoder_model(c)
+
+ optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
+
+ # pylint: disable=redefined-outer-name
+ meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
+
+ train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
+ if c.run_eval:
+ eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
+ else:
+ eval_data_loader = None
+
+ num_classes = len(train_classes)
+ criterion = model.get_criterion(c, num_classes)
+
+ if c.loss == "softmaxproto" and c.model != "speaker_encoder":
+ c.map_classid_to_classname = map_classid_to_classname
+ copy_model_files(c, OUT_PATH, new_fields={})
+
+ if args.restore_path:
+ criterion, args.restore_step = model.load_checkpoint(
+ c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
+ )
+ print(" > Model restored from step %d" % args.restore_step, flush=True)
+ else:
+ args.restore_step = 0
+
+ if c.lr_decay:
+ scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
+ else:
+ scheduler = None
+
+ num_params = count_parameters(model)
+ print("\n > Model has {} parameters".format(num_params), flush=True)
+
+ if use_cuda:
+ model = model.cuda()
+ criterion.cuda()
+
+ global_step = args.restore_step
+ _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
+
+
+if __name__ == "__main__":
+ args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
+
+ try:
+ main(args)
+ except KeyboardInterrupt:
+ remove_experiment_folder(OUT_PATH)
+ try:
+ sys.exit(0)
+ except SystemExit:
+ os._exit(0) # pylint: disable=protected-access
+ except Exception: # pylint: disable=broad-except
+ remove_experiment_folder(OUT_PATH)
+ traceback.print_exc()
+ sys.exit(1)
diff --git a/submodules/TTS/TTS/bin/train_tts.py b/submodules/TTS/TTS/bin/train_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdb4f6f69122a4a9aa4e07695f1816ce9727f323
--- /dev/null
+++ b/submodules/TTS/TTS/bin/train_tts.py
@@ -0,0 +1,71 @@
+import os
+from dataclasses import dataclass, field
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config import load_config, register_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models import setup_model
+
+
+@dataclass
+class TrainTTSArgs(TrainerArgs):
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+
+
+def main():
+ """Run `tts` model training directly by a `config.json` file."""
+ # init trainer args
+ train_args = TrainTTSArgs()
+ parser = train_args.init_argparse(arg_prefix="")
+
+    # override trainer args from command-line args
+ args, config_overrides = parser.parse_known_args()
+ train_args.parse_args(args)
+
+ # load config.json and register
+ if args.config_path or args.continue_path:
+ if args.config_path:
+ # init from a file
+ config = load_config(args.config_path)
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ elif args.continue_path:
+            # continue from a previous experiment
+ config = load_config(os.path.join(args.continue_path, "config.json"))
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ else:
+ # init from console args
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
+
+ config_base = BaseTrainingConfig()
+ config_base.parse_known_args(config_overrides)
+ config = register_config(config_base.model)()
+
+ # load training samples
+ train_samples, eval_samples = load_tts_samples(
+ config.datasets,
+ eval_split=True,
+ eval_split_max_size=config.eval_split_max_size,
+ eval_split_size=config.eval_split_size,
+ )
+
+ # init the model from config
+ model = setup_model(config, train_samples + eval_samples)
+
+ # init the trainer and 🚀
+ trainer = Trainer(
+ train_args,
+ model.config,
+ config.output_path,
+ model=model,
+ train_samples=train_samples,
+ eval_samples=eval_samples,
+ parse_command_line_args=False,
+ )
+ trainer.fit()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/submodules/TTS/TTS/bin/train_vocoder.py b/submodules/TTS/TTS/bin/train_vocoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ecd7bdc3652b3683be846bdd9518e937aee904
--- /dev/null
+++ b/submodules/TTS/TTS/bin/train_vocoder.py
@@ -0,0 +1,77 @@
+import os
+from dataclasses import dataclass, field
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config import load_config, register_config
+from TTS.utils.audio import AudioProcessor
+from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
+from TTS.vocoder.models import setup_model
+
+
+@dataclass
+class TrainVocoderArgs(TrainerArgs):
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+
+
+def main():
+ """Run `tts` model training directly by a `config.json` file."""
+ # init trainer args
+ train_args = TrainVocoderArgs()
+ parser = train_args.init_argparse(arg_prefix="")
+
+    # override trainer args from command-line args
+ args, config_overrides = parser.parse_known_args()
+ train_args.parse_args(args)
+
+ # load config.json and register
+ if args.config_path or args.continue_path:
+ if args.config_path:
+ # init from a file
+ config = load_config(args.config_path)
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ elif args.continue_path:
+            # continue from a previous experiment
+ config = load_config(os.path.join(args.continue_path, "config.json"))
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ else:
+ # init from console args
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
+
+ config_base = BaseTrainingConfig()
+ config_base.parse_known_args(config_overrides)
+ config = register_config(config_base.model)()
+
+ # load training samples
+ if "feature_path" in config and config.feature_path:
+ # load pre-computed features
+ print(f" > Loading features from: {config.feature_path}")
+ eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
+ else:
+        # load raw wav files
+ eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
+
+ # setup audio processor
+ ap = AudioProcessor(**config.audio)
+
+ # init the model from config
+ model = setup_model(config)
+
+ # init the trainer and 🚀
+ trainer = Trainer(
+ train_args,
+ config,
+ config.output_path,
+ model=model,
+ train_samples=train_samples,
+ eval_samples=eval_samples,
+ training_assets={"audio_processor": ap},
+ parse_command_line_args=False,
+ )
+ trainer.fit()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/submodules/TTS/TTS/bin/tune_wavegrad.py b/submodules/TTS/TTS/bin/tune_wavegrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..09582cea7c7962b098efcde5754a02573d18264a
--- /dev/null
+++ b/submodules/TTS/TTS/bin/tune_wavegrad.py
@@ -0,0 +1,103 @@
+"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
+import argparse
+from itertools import product as cartesian_product
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.utils.audio import AudioProcessor
+from TTS.vocoder.datasets.preprocess import load_wav_data
+from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
+from TTS.vocoder.models import setup_model
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
+ parser.add_argument("--config_path", type=str, help="Path to model config file.")
+ parser.add_argument("--data_path", type=str, help="Path to data directory.")
+ parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
+ parser.add_argument(
+ "--num_iter",
+ type=int,
+ help="Number of model inference iterations that you like to optimize noise schedule for.",
+ )
+ parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
+ parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
+ parser.add_argument(
+ "--search_depth",
+ type=int,
+ default=3,
+ help="Search granularity. Increasing this increases the run-time exponentially.",
+ )
+
+ # load config
+ args = parser.parse_args()
+ config = load_config(args.config_path)
+
+ # setup audio processor
+ ap = AudioProcessor(**config.audio)
+
+ # load dataset
+ _, train_data = load_wav_data(args.data_path, 0)
+ train_data = train_data[: args.num_samples]
+ dataset = WaveGradDataset(
+ ap=ap,
+ items=train_data,
+ seq_len=-1,
+ hop_len=ap.hop_length,
+ pad_short=config.pad_short,
+ conv_pad=config.conv_pad,
+ is_training=True,
+ return_segments=False,
+ use_noise_augment=False,
+ use_cache=False,
+ verbose=True,
+ )
+ loader = DataLoader(
+ dataset,
+ batch_size=1,
+ shuffle=False,
+ collate_fn=dataset.collate_full_clips,
+ drop_last=False,
+ num_workers=config.num_loader_workers,
+ pin_memory=False,
+ )
+
+ # setup the model
+ model = setup_model(config)
+ if args.use_cuda:
+ model.cuda()
+
+ # setup optimization parameters
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
+ print(f" > base values: {base_values}")
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
+ best_error = float("inf")
+ best_schedule = None # pylint: disable=C0103
+ total_search_iter = len(base_values) ** args.num_iter
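+    # each candidate schedule is the element-wise product of `exponents` with one combination of
+    # base values, so the search space grows as len(base_values) ** num_iter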
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
+ beta = exponents * base
+ model.compute_noise_level(beta)
+ for data in loader:
+ mel, audio = data
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
+
+ if args.use_cuda:
+ y_hat = y_hat.cpu()
+ y_hat = y_hat.numpy()
+
+ mel_hat = []
+ for i in range(y_hat.shape[0]):
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
+ mel_hat.append(torch.from_numpy(m))
+
+ mel_hat = torch.stack(mel_hat)
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
+ if mse.item() < best_error:
+ best_error = mse.item()
+ best_schedule = {"beta": beta}
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
+ np.save(args.output_path, best_schedule)
diff --git a/submodules/TTS/TTS/config/__init__.py b/submodules/TTS/TTS/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5a6dd68e24e7f2c2c67504bee4c19f1eb6660a1
--- /dev/null
+++ b/submodules/TTS/TTS/config/__init__.py
@@ -0,0 +1,135 @@
+import json
+import os
+import re
+from typing import Dict
+
+import fsspec
+import yaml
+from coqpit import Coqpit
+
+from TTS.config.shared_configs import *
+from TTS.utils.generic_utils import find_module
+
+
+def read_json_with_comments(json_path):
+ """for backward compat."""
+ # fallback to json
+ with fsspec.open(json_path, "r", encoding="utf-8") as f:
+ input_str = f.read()
+ # handle comments but not urls with //
+ input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
+ return json.loads(input_str)
+
+def register_config(model_name: str) -> Coqpit:
+ """Find the right config for the given model name.
+
+ Args:
+ model_name (str): Model name.
+
+ Raises:
+ ModuleNotFoundError: No matching config for the model name.
+
+ Returns:
+ Coqpit: config class.
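+
+    Example (illustrative; assumes the `vits` config module ships with the package):
+        >>> config_class = register_config("vits")
+        >>> config = config_class()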
+ """
+ config_class = None
+ config_name = model_name + "_config"
+
+ # TODO: fix this
+ if model_name == "xtts":
+ from TTS.tts.configs.xtts_config import XttsConfig
+
+ config_class = XttsConfig
+ paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
+ for path in paths:
+ try:
+ config_class = find_module(path, config_name)
+ except ModuleNotFoundError:
+ pass
+ if config_class is None:
+ raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
+ return config_class
+
+
+def _process_model_name(config_dict: Dict) -> str:
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
+
+ Args:
+ config_dict (Dict): A dictionary including the config fields.
+
+ Returns:
+        str: Formatted model name.
+ """
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
+ return model_name
+
+
+def load_config(config_path: str) -> Coqpit:
+ """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
+ to find the corresponding Config class. Then initialize the Config.
+
+ Args:
+ config_path (str): path to the config file.
+
+ Raises:
+ TypeError: given config file has an unknown type.
+
+ Returns:
+ Coqpit: TTS config object.
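+
+    Example (illustrative; the path is hypothetical):
+        >>> config = load_config("output/run/config.json")
+        >>> print(config.model)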
+ """
+ config_dict = {}
+ ext = os.path.splitext(config_path)[1]
+ if ext in (".yml", ".yaml"):
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
+ data = yaml.safe_load(f)
+ elif ext == ".json":
+ try:
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
+ data = json.load(f)
+ except json.decoder.JSONDecodeError:
+ # backwards compat.
+ data = read_json_with_comments(config_path)
+ else:
+ raise TypeError(f" [!] Unknown config file type {ext}")
+ config_dict.update(data)
+ model_name = _process_model_name(config_dict)
+ config_class = register_config(model_name.lower())
+ config = config_class()
+ config.from_dict(config_dict)
+ return config
+
+
+def check_config_and_model_args(config, arg_name, value):
+ """Check the give argument in `config.model_args` if exist or in `config` for
+ the given value.
+
+ Return False if the argument does not exist in `config.model_args` or `config`.
+ This is to patch up the compatibility between models with and without `model_args`.
+
+ TODO: Remove this in the future with a unified approach.
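+
+    Example (illustrative; assumes `config` defines `use_speaker_embedding` under `model_args`):
+        >>> check_config_and_model_args(config, "use_speaker_embedding", True)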
+ """
+ if hasattr(config, "model_args"):
+ if arg_name in config.model_args:
+ return config.model_args[arg_name] == value
+ if hasattr(config, arg_name):
+ return config[arg_name] == value
+ return False
+
+
+def get_from_config_or_model_args(config, arg_name):
+ """Get the given argument from `config.model_args` if exist or in `config`."""
+ if hasattr(config, "model_args"):
+ if arg_name in config.model_args:
+ return config.model_args[arg_name]
+ return config[arg_name]
+
+
+def get_from_config_or_model_args_with_default(config, arg_name, def_val):
+ """Get the given argument from `config.model_args` if exist or in `config`."""
+ if hasattr(config, "model_args"):
+ if arg_name in config.model_args:
+ return config.model_args[arg_name]
+ if hasattr(config, arg_name):
+ return config[arg_name]
+ return def_val
diff --git a/submodules/TTS/TTS/config/shared_configs.py b/submodules/TTS/TTS/config/shared_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fae77d61361eff8c8fa521a0f4a90dc46f63c75
--- /dev/null
+++ b/submodules/TTS/TTS/config/shared_configs.py
@@ -0,0 +1,268 @@
+from dataclasses import asdict, dataclass
+from typing import List
+
+from coqpit import Coqpit, check_argument
+from trainer import TrainerConfig
+
+
+@dataclass
+class BaseAudioConfig(Coqpit):
+ """Base config to definge audio processing parameters. It is used to initialize
+ ```TTS.utils.audio.AudioProcessor.```
+
+ Args:
+ fft_size (int):
+            Number of STFT frequency bins, i.e. the size of the linear spectrogram frame. Defaults to 1024.
+
+ win_length (int):
+            Each frame of audio is windowed with a window of length ```win_length``` and then padded with zeros to match
+ ```fft_size```. Defaults to 1024.
+
+ hop_length (int):
+            Number of audio samples between adjacent STFT columns. Defaults to 256.
+
+ frame_shift_ms (int):
+ Set ```hop_length``` based on milliseconds and sampling rate.
+
+ frame_length_ms (int):
+ Set ```win_length``` based on milliseconds and sampling rate.
+
+ stft_pad_mode (str):
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
+
+ sample_rate (int):
+ Audio sampling rate. Defaults to 22050.
+
+ resample (bool):
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
+
+ preemphasis (float):
+ Preemphasis coefficient. Defaults to 0.0.
+
+        ref_level_db (int):
+            Reference dB level to rebase the audio signal and ignore levels below it. 20 dB is assumed to be the sound of air.
+            Defaults to 20.
+
+ do_sound_norm (bool):
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
+
+ log_func (str):
+            Numpy log function used for amplitude to dB conversion. Defaults to 'np.log10'.
+
+ do_trim_silence (bool):
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
+
+ do_amp_to_db_linear (bool, optional):
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
+
+ do_amp_to_db_mel (bool, optional):
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
+
+ pitch_fmax (float, optional):
+ Maximum frequency of the F0 frames. Defaults to ```640```.
+
+ pitch_fmin (float, optional):
+ Minimum frequency of the F0 frames. Defaults to ```1```.
+
+ trim_db (int):
+ Silence threshold used for silence trimming. Defaults to 45.
+
+ do_rms_norm (bool, optional):
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+
+ db_level (int, optional):
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+
+ power (float):
+            Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
+ artifacts in the synthesized voice. Defaults to 1.5.
+
+ griffin_lim_iters (int):
+            Number of Griffin-Lim iterations. Defaults to 60.
+
+ num_mels (int):
+            Number of mel-basis filters that defines the number of frequency channels of each mel-spectrogram frame. Defaults to 80.
+
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
+ It needs to be adjusted for a dataset. Defaults to 0.
+
+ mel_fmax (float):
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
+
+ spec_gain (int):
+            Gain applied when converting amplitude to dB. Defaults to 20.
+
+ signal_norm (bool):
+ enable/disable signal normalization. Defaults to True.
+
+ min_level_db (int):
+            minimum dB threshold for the computed mel-spectrograms. Defaults to -100.
+
+ symmetric_norm (bool):
+            enable/disable symmetric normalization. If set to True, normalization is performed in the range [-k, k], else
+            [0, k]. Defaults to True.
+
+ max_norm (float):
+ ```k``` defining the normalization range. Defaults to 4.0.
+
+ clip_norm (bool):
+            enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.
+
+ stats_path (str):
+ Path to the computed stats file. Defaults to None.
+ """
+
+ # stft parameters
+ fft_size: int = 1024
+ win_length: int = 1024
+ hop_length: int = 256
+ frame_shift_ms: int = None
+ frame_length_ms: int = None
+ stft_pad_mode: str = "reflect"
+ # audio processing parameters
+ sample_rate: int = 22050
+ resample: bool = False
+ preemphasis: float = 0.0
+ ref_level_db: int = 20
+ do_sound_norm: bool = False
+ log_func: str = "np.log10"
+ # silence trimming
+ do_trim_silence: bool = True
+ trim_db: int = 45
+ # rms volume normalization
+ do_rms_norm: bool = False
+ db_level: float = None
+ # griffin-lim params
+ power: float = 1.5
+ griffin_lim_iters: int = 60
+ # mel-spec params
+ num_mels: int = 80
+ mel_fmin: float = 0.0
+ mel_fmax: float = None
+ spec_gain: int = 20
+ do_amp_to_db_linear: bool = True
+ do_amp_to_db_mel: bool = True
+ # f0 params
+ pitch_fmax: float = 640.0
+ pitch_fmin: float = 1.0
+ # normalization params
+ signal_norm: bool = True
+ min_level_db: int = -100
+ symmetric_norm: bool = True
+ max_norm: float = 4.0
+ clip_norm: bool = True
+ stats_path: str = None
+
+ def check_values(
+ self,
+ ):
+ """Check config fields"""
+ c = asdict(self)
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
+ check_argument(
+ "frame_length_ms",
+ c,
+ restricted=True,
+ min_val=10,
+ max_val=1000,
+ alternative="win_length",
+ )
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
+
+ # normalization parameters
+ check_argument("signal_norm", c, restricted=True)
+ check_argument("symmetric_norm", c, restricted=True)
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
+ check_argument("clip_norm", c, restricted=True)
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
+ check_argument("do_trim_silence", c, restricted=True)
+ check_argument("trim_db", c, restricted=True)
+
+
+@dataclass
+class BaseDatasetConfig(Coqpit):
+ """Base config for TTS datasets.
+
+ Args:
+ formatter (str):
+            Formatter name that selects the formatter used in ```TTS.tts.datasets.formatter```. Defaults to `""`.
+
+ dataset_name (str):
+ Unique name for the dataset. Defaults to `""`.
+
+ path (str):
+ Root path to the dataset files. Defaults to `""`.
+
+ meta_file_train (str):
+            Name of the dataset meta file, or a list of speakers to be ignored during training for multi-speaker datasets.
+ Defaults to `""`.
+
+ ignored_speakers (List):
+            List of speaker IDs that are not used during training. Defaults to None.
+
+ language (str):
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
+
+ phonemizer (str):
+ Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
+
+ meta_file_val (str):
+ Name of the dataset meta file that defines the instances used at validation.
+
+ meta_file_attn_mask (str):
+ Path to the file that lists the attention mask files used with models that require attention masks to
+ train the duration predictor.
+ """
+
+ formatter: str = ""
+ dataset_name: str = ""
+ path: str = ""
+ meta_file_train: str = ""
+ ignored_speakers: List[str] = None
+ language: str = ""
+ phonemizer: str = ""
+ meta_file_val: str = ""
+ meta_file_attn_mask: str = ""
+
+ def check_values(
+ self,
+ ):
+ """Check config fields"""
+ c = asdict(self)
+ check_argument("formatter", c, restricted=True)
+ check_argument("path", c, restricted=True)
+ check_argument("meta_file_train", c, restricted=True)
+ check_argument("meta_file_val", c, restricted=False)
+ check_argument("meta_file_attn_mask", c, restricted=False)
+
+
+@dataclass
+class BaseTrainingConfig(TrainerConfig):
+ """Base config to define the basic 🐸TTS training parameters that are shared
+ among all the models. It is based on ```Trainer.TrainingConfig```.
+
+ Args:
+ model (str):
+ Name of the model that is used in the training.
+
+ num_loader_workers (int):
+ Number of workers for training time dataloader.
+
+ num_eval_loader_workers (int):
+ Number of workers for evaluation time dataloader.
+ """
+
+ model: str = None
+ # dataloading
+ num_loader_workers: int = 0
+ num_eval_loader_workers: int = 0
+ use_noise_augment: bool = False
diff --git a/submodules/TTS/TTS/demos/xtts_ft_demo/requirements.txt b/submodules/TTS/TTS/demos/xtts_ft_demo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cb5b16f66e295fe0de66ae16cfa4ad34afa8845f
--- /dev/null
+++ b/submodules/TTS/TTS/demos/xtts_ft_demo/requirements.txt
@@ -0,0 +1,2 @@
+faster_whisper==0.9.0
+gradio==4.7.1
\ No newline at end of file
diff --git a/submodules/TTS/TTS/demos/xtts_ft_demo/utils/formatter.py b/submodules/TTS/TTS/demos/xtts_ft_demo/utils/formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..536faa01086d788c82287e87b660279fbcc2ad7c
--- /dev/null
+++ b/submodules/TTS/TTS/demos/xtts_ft_demo/utils/formatter.py
@@ -0,0 +1,160 @@
+import gc
+import os
+from glob import glob
+
+import pandas
+import torch
+import torchaudio
+from faster_whisper import WhisperModel
+from tqdm import tqdm
+
+from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
+
+# torch.set_num_threads(1)
+torch.set_num_threads(16)
+
+audio_types = (".wav", ".mp3", ".flac")
+
+
+def list_audios(basePath, contains=None):
+ # return the set of files that are valid
+ return list_files(basePath, validExts=audio_types, contains=contains)
+
+def list_files(basePath, validExts=None, contains=None):
+ # loop over the directory structure
+ for (rootDir, dirNames, filenames) in os.walk(basePath):
+ # loop over the filenames in the current directory
+ for filename in filenames:
+ # if the contains string is not none and the filename does not contain
+ # the supplied string, then ignore the file
+ if contains is not None and filename.find(contains) == -1:
+ continue
+
+ # determine the file extension of the current file
+ ext = filename[filename.rfind("."):].lower()
+
+ # check to see if the file is an audio and should be processed
+ if validExts is None or ext.endswith(validExts):
+ # construct the path to the audio and yield it
+ audioPath = os.path.join(rootDir, filename)
+ yield audioPath
+
+def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
+ audio_total_size = 0
+    # make sure that the output folder exists
+ os.makedirs(out_path, exist_ok=True)
+
+ # Loading Whisper
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ print("Loading Whisper Model!")
+ asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
+
+ metadata = {"audio_file": [], "text": [], "speaker_name": []}
+
+ if gradio_progress is not None:
+ tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
+ else:
+ tqdm_object = tqdm(audio_files)
+
+ for audio_path in tqdm_object:
+ wav, sr = torchaudio.load(audio_path)
+ # stereo to mono if needed
+ if wav.size(0) != 1:
+ wav = torch.mean(wav, dim=0, keepdim=True)
+
+ wav = wav.squeeze()
+ audio_total_size += (wav.size(-1) / sr)
+
+ segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
+ segments = list(segments)
+ i = 0
+ sentence = ""
+ sentence_start = None
+ first_word = True
+        # collect the words of all segments in a single list
+ words_list = []
+ for _, segment in enumerate(segments):
+ words = list(segment.words)
+ words_list.extend(words)
+
+ # process each word
+ for word_idx, word in enumerate(words_list):
+ if first_word:
+ sentence_start = word.start
+                # If it is the first sentence, add the buffer or take the beginning of the file
+ if word_idx == 0:
+ sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
+ else:
+ # get previous sentence end
+ previous_word_end = words_list[word_idx - 1].end
+                    # add the buffer or take the middle of the silence between the previous sentence and the current one
+ sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
+
+ sentence = word.word
+ first_word = False
+ else:
+ sentence += word.word
+
+ if word.word[-1] in ["!", ".", "?"]:
+ sentence = sentence[1:]
+ # Expand number and abbreviations plus normalization
+ sentence = multilingual_cleaners(sentence, target_language)
+ audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
+
+ audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
+
+ # Check for the next word's existence
+ if word_idx + 1 < len(words_list):
+ next_word_start = words_list[word_idx + 1].start
+ else:
+                    # If there are no more words, this is the last sentence, so use the audio length as the next word start
+ next_word_start = (wav.shape[0] - 1) / sr
+
+ # Average the current word end and next word start
+ word_end = min((word.end + next_word_start) / 2, word.end + buffer)
+
+ absoulte_path = os.path.join(out_path, audio_file)
+ os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
+ i += 1
+ first_word = True
+
+ audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
+                # if the audio is too short, ignore it (i.e. < 0.33 seconds)
+                if audio.size(-1) < sr / 3:
+                    continue
+                torchaudio.save(absoulte_path, audio, sr)
+
+ metadata["audio_file"].append(audio_file)
+ metadata["text"].append(sentence)
+ metadata["speaker_name"].append(speaker_name)
+
+ df = pandas.DataFrame(metadata)
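+    # shuffle the rows, then split off `eval_percentage` of them for the eval set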
+ df = df.sample(frac=1)
+ num_val_samples = int(len(df)*eval_percentage)
+
+ df_eval = df[:num_val_samples]
+ df_train = df[num_val_samples:]
+
+ df_train = df_train.sort_values('audio_file')
+ train_metadata_path = os.path.join(out_path, "metadata_train.csv")
+ df_train.to_csv(train_metadata_path, sep="|", index=False)
+
+ eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
+ df_eval = df_eval.sort_values('audio_file')
+ df_eval.to_csv(eval_metadata_path, sep="|", index=False)
+
+ # deallocate VRAM and RAM
+ del asr_model, df_train, df_eval, df, metadata
+ gc.collect()
+
+ return train_metadata_path, eval_metadata_path, audio_total_size
\ No newline at end of file
diff --git a/submodules/TTS/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/submodules/TTS/TTS/demos/xtts_ft_demo/utils/gpt_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a98765c3e79b6227797ba528d425ec14cc8dbd45
--- /dev/null
+++ b/submodules/TTS/TTS/demos/xtts_ft_demo/utils/gpt_train.py
@@ -0,0 +1,172 @@
+import os
+import gc
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+from TTS.utils.manage import ModelManager
+
+
+def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
+ # Logging parameters
+ RUN_NAME = "GPT_XTTS_FT"
+ PROJECT_NAME = "XTTS_trainer"
+ DASHBOARD_LOGGER = "tensorboard"
+ LOGGER_URI = None
+
+    # Set here the path where the checkpoints will be saved. Default: ./run/training/
+ OUT_PATH = os.path.join(output_path, "run", "training")
+
+ # Training Parameters
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False
+    START_WITH_EVAL = False  # if True it will start with evaluation
+ BATCH_SIZE = batch_size # set here the batch size
+ GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
+
+
+    # Define here the dataset that you want to use for fine-tuning.
+ config_dataset = BaseDatasetConfig(
+ formatter="coqui",
+ dataset_name="ft_dataset",
+ path=os.path.dirname(train_csv),
+ meta_file_train=train_csv,
+ meta_file_val=eval_csv,
+ language=language,
+ )
+
+ # Add here the configs of the datasets
+ DATASETS_CONFIG_LIST = [config_dataset]
+
+ # Define the path where XTTS v2.0.1 files will be downloaded
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+ # DVAE files
+ DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+ MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+ # Set the path to the downloaded files
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+ # download DVAE files if needed
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+ print(" > Downloading DVAE files!")
+ ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+ # Download XTTS v2.0 checkpoint if needed
+ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+ XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+ XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
+
+    # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
+ XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file
+
+ # download XTTS v2.0 files if needed
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+ print(" > Downloading XTTS v2.0 files!")
+ ModelManager._download_model_files(
+ [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+ )
+
+ # init args and config
+ model_args = GPTArgs(
+ max_conditioning_length=132300, # 6 secs
+ min_conditioning_length=66150, # 3 secs
+ debug_loading_failures=False,
+ max_wav_length=max_audio_length, # ~11.6 seconds
+ max_text_length=200,
+ mel_norm_file=MEL_NORM_FILE,
+ dvae_checkpoint=DVAE_CHECKPOINT,
+ xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
+ tokenizer_file=TOKENIZER_FILE,
+ gpt_num_audio_tokens=1026,
+ gpt_start_audio_token=1024,
+ gpt_stop_audio_token=1025,
+ gpt_use_masking_gt_prompt_approach=True,
+ gpt_use_perceiver_resampler=True,
+ )
+ # define audio config
+ audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+ # training parameters config
+ config = GPTTrainerConfig(
+ epochs=num_epochs,
+ output_path=OUT_PATH,
+ model_args=model_args,
+ run_name=RUN_NAME,
+ project_name=PROJECT_NAME,
+ run_description="""
+ GPT XTTS training
+ """,
+ dashboard_logger=DASHBOARD_LOGGER,
+ logger_uri=LOGGER_URI,
+ audio=audio_config,
+ batch_size=BATCH_SIZE,
+ batch_group_size=48,
+ eval_batch_size=BATCH_SIZE,
+ num_loader_workers=8,
+ eval_split_max_size=256,
+ print_step=50,
+ plot_step=100,
+ log_model_step=100,
+ save_step=1000,
+ save_n_checkpoints=1,
+ save_checkpoints=True,
+ # target_loss="loss",
+ print_eval=False,
+ # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+ optimizer="AdamW",
+ optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+ optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+ lr=5e-06, # learning rate
+ lr_scheduler="MultiStepLR",
+        # adjusted accordingly for the new step scheme
+ lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+ test_sentences=[],
+ )
+
+ # init the model from config
+ model = GPTTrainer.init_from_config(config)
+
+ # load training samples
+ train_samples, eval_samples = load_tts_samples(
+ DATASETS_CONFIG_LIST,
+ eval_split=True,
+ eval_split_max_size=config.eval_split_max_size,
+ eval_split_size=config.eval_split_size,
+ )
+
+ # init the trainer and 🚀
+ trainer = Trainer(
+ TrainerArgs(
+            restore_path=None,  # the xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+ skip_train_epoch=False,
+ start_with_eval=START_WITH_EVAL,
+ grad_accum_steps=GRAD_ACUMM_STEPS,
+ ),
+ config,
+ output_path=OUT_PATH,
+ model=model,
+ train_samples=train_samples,
+ eval_samples=eval_samples,
+ )
+ trainer.fit()
+
+    # get the audio file with the longest text to use as the speaker reference
+ samples_len = [len(item["text"].split(" ")) for item in train_samples]
+ longest_text_idx = samples_len.index(max(samples_len))
+ speaker_ref = train_samples[longest_text_idx]["audio_file"]
+
+ trainer_out_path = trainer.output_path
+
+ # deallocate VRAM and RAM
+ del model, trainer, train_samples, eval_samples
+ gc.collect()
+
+ return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
diff --git a/submodules/TTS/TTS/demos/xtts_ft_demo/xtts_demo.py b/submodules/TTS/TTS/demos/xtts_ft_demo/xtts_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb11f29d16084c9fb6dd46b42fbed017f487374
--- /dev/null
+++ b/submodules/TTS/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -0,0 +1,415 @@
+import argparse
+import os
+import sys
+import tempfile
+import traceback
+
+import gradio as gr
+import librosa.display
+import numpy as np
+import torch
+import torchaudio
+
+from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
+from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+
+
+def clear_gpu_cache():
+ # clear the GPU cache
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+XTTS_MODEL = None
+def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
+ global XTTS_MODEL
+ clear_gpu_cache()
+ if not xtts_checkpoint or not xtts_config or not xtts_vocab:
+ return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
+ config = XttsConfig()
+ config.load_json(xtts_config)
+ XTTS_MODEL = Xtts.init_from_config(config)
+ print("Loading XTTS model! ")
+ XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
+ if torch.cuda.is_available():
+ XTTS_MODEL.cuda()
+
+ print("Model Loaded!")
+ return "Model Loaded!"
+
+def run_tts(lang, tts_text, speaker_audio_file):
+ if XTTS_MODEL is None or not speaker_audio_file:
+ return "You need to run the previous step to load the model !!", None, None
+
+ gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
+ out = XTTS_MODEL.inference(
+ text=tts_text,
+ language=lang,
+ gpt_cond_latent=gpt_cond_latent,
+ speaker_embedding=speaker_embedding,
+ temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
+ length_penalty=XTTS_MODEL.config.length_penalty,
+ repetition_penalty=XTTS_MODEL.config.repetition_penalty,
+ top_k=XTTS_MODEL.config.top_k,
+ top_p=XTTS_MODEL.config.top_p,
+ )
+
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+ out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
+ out_path = fp.name
+ torchaudio.save(out_path, out["wav"], 24000)
+
+ return "Speech generated !", out_path, speaker_audio_file
+
+
+
+
+# define a logger to redirect
+class Logger:
+ def __init__(self, filename="log.out"):
+ self.log_file = filename
+ self.terminal = sys.stdout
+ self.log = open(self.log_file, "w")
+
+ def write(self, message):
+ self.terminal.write(message)
+ self.log.write(message)
+
+ def flush(self):
+ self.terminal.flush()
+ self.log.flush()
+
+ def isatty(self):
+ return False
+
+# redirect stdout and stderr to a file
+sys.stdout = Logger()
+sys.stderr = sys.stdout
+
+
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+import logging
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s [%(levelname)s] %(message)s",
+ handlers=[
+ logging.StreamHandler(sys.stdout)
+ ]
+)
+
+def read_logs():
+ sys.stdout.flush()
+ with open(sys.stdout.log_file, "r") as f:
+ return f.read()
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser(
+ description="""XTTS fine-tuning demo\n\n"""
+ """
+ Example runs:
+        python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port 5003
+ """,
+ formatter_class=argparse.RawTextHelpFormatter,
+ )
+ parser.add_argument(
+ "--port",
+ type=int,
+ help="Port to run the gradio demo. Default: 5003",
+ default=5003,
+ )
+ parser.add_argument(
+ "--out_path",
+ type=str,
+ help="Output path (where data and checkpoints will be saved) Default: /tmp/xtts_ft/",
+ default="/tmp/xtts_ft/",
+ )
+
+ parser.add_argument(
+ "--num_epochs",
+ type=int,
+ help="Number of epochs to train. Default: 10",
+ default=10,
+ )
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ help="Batch size. Default: 4",
+ default=4,
+ )
+ parser.add_argument(
+ "--grad_acumm",
+ type=int,
+ help="Grad accumulation steps. Default: 1",
+ default=1,
+ )
+ parser.add_argument(
+ "--max_audio_length",
+ type=int,
+ help="Max permitted audio size in seconds. Default: 11",
+ default=11,
+ )
+
+ args = parser.parse_args()
+
+ with gr.Blocks() as demo:
+ with gr.Tab("1 - Data processing"):
+ out_path = gr.Textbox(
+ label="Output path (where data and checkpoints will be saved):",
+ value=args.out_path,
+ )
+ # upload_file = gr.Audio(
+ # sources="upload",
+ # label="Select here the audio files that you want to use for XTTS trainining !",
+ # type="filepath",
+ # )
+ upload_file = gr.File(
+ file_count="multiple",
+ label="Select here the audio files that you want to use for XTTS trainining (Supported formats: wav, mp3, and flac)",
+ )
+ lang = gr.Dropdown(
+ label="Dataset Language",
+ value="en",
+ choices=[
+ "en",
+ "es",
+ "fr",
+ "de",
+ "it",
+ "pt",
+ "pl",
+ "tr",
+ "ru",
+ "nl",
+ "cs",
+ "ar",
+ "zh",
+ "hu",
+ "ko",
+ "ja"
+ ],
+ )
+ progress_data = gr.Label(
+ label="Progress:"
+ )
+ logs = gr.Textbox(
+ label="Logs:",
+ interactive=False,
+ )
+ demo.load(read_logs, None, logs, every=1)
+
+ prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
+
+ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
+ clear_gpu_cache()
+ out_path = os.path.join(out_path, "dataset")
+ os.makedirs(out_path, exist_ok=True)
+ if audio_path is None:
+ return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", ""
+ else:
+ try:
+ train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+                except Exception:
+                    traceback.print_exc()
+                    error = traceback.format_exc()
+                    return f"The data processing was interrupted due to an error! Please check the console for the full error message.\nError summary: {error}", "", ""
+
+ clear_gpu_cache()
+
+            # if the total audio length is less than 2 minutes, abort with a message
+            if audio_total_size < 120:
+                message = "The total duration of the audio files you provided should be at least 2 minutes!"
+ print(message)
+ return message, "", ""
+
+ print("Dataset Processed!")
+ return "Dataset Processed!", train_meta, eval_meta
+
+ with gr.Tab("2 - Fine-tuning XTTS Encoder"):
+ train_csv = gr.Textbox(
+ label="Train CSV:",
+ )
+ eval_csv = gr.Textbox(
+ label="Eval CSV:",
+ )
+ num_epochs = gr.Slider(
+ label="Number of epochs:",
+ minimum=1,
+ maximum=100,
+ step=1,
+ value=args.num_epochs,
+ )
+ batch_size = gr.Slider(
+ label="Batch size:",
+ minimum=2,
+ maximum=512,
+ step=1,
+ value=args.batch_size,
+ )
+ grad_acumm = gr.Slider(
+ label="Grad accumulation steps:",
+ minimum=2,
+ maximum=128,
+ step=1,
+ value=args.grad_acumm,
+ )
+ max_audio_length = gr.Slider(
+ label="Max permitted audio size in seconds:",
+ minimum=2,
+ maximum=20,
+ step=1,
+ value=args.max_audio_length,
+ )
+ progress_train = gr.Label(
+ label="Progress:"
+ )
+ logs_tts_train = gr.Textbox(
+ label="Logs:",
+ interactive=False,
+ )
+ demo.load(read_logs, None, logs_tts_train, every=1)
+ train_btn = gr.Button(value="Step 2 - Run the training")
+
+ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
+ clear_gpu_cache()
+ if not train_csv or not eval_csv:
+ return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
+ try:
+ # convert seconds to waveform frames
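+                # (the GPT trainer consumes sample counts at the 22.05 kHz training sample rate)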
+ max_audio_length = int(max_audio_length * 22050)
+ config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
+ except:
+ traceback.print_exc()
+ error = traceback.format_exc()
+ return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""
+
+ # copy the original config and vocab files to avoid issues caused by parameter changes
+ os.system(f"cp {config_path} {exp_path}")
+ os.system(f"cp {vocab_file} {exp_path}")
+
+ ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
+ print("Model training done!")
+ clear_gpu_cache()
+ return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
+
+ with gr.Tab("3 - Inference"):
+ with gr.Row():
+ with gr.Column() as col1:
+ xtts_checkpoint = gr.Textbox(
+ label="XTTS checkpoint path:",
+ value="",
+ )
+ xtts_config = gr.Textbox(
+ label="XTTS config path:",
+ value="",
+ )
+
+ xtts_vocab = gr.Textbox(
+ label="XTTS vocab path:",
+ value="",
+ )
+ progress_load = gr.Label(
+ label="Progress:"
+ )
+ load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
+
+ with gr.Column() as col2:
+ speaker_reference_audio = gr.Textbox(
+ label="Speaker reference audio:",
+ value="",
+ )
+ tts_language = gr.Dropdown(
+ label="Language",
+ value="en",
+ choices=[
+ "en",
+ "es",
+ "fr",
+ "de",
+ "it",
+ "pt",
+ "pl",
+ "tr",
+ "ru",
+ "nl",
+ "cs",
+ "ar",
+ "zh",
+ "hu",
+ "ko",
+ "ja",
+ ]
+ )
+ tts_text = gr.Textbox(
+ label="Input Text.",
+ value="This model sounds really good and above all, it's reasonably fast.",
+ )
+ tts_btn = gr.Button(value="Step 4 - Inference")
+
+ with gr.Column() as col3:
+ progress_gen = gr.Label(
+ label="Progress:"
+ )
+ tts_output_audio = gr.Audio(label="Generated Audio.")
+ reference_audio = gr.Audio(label="Reference audio used.")
+
+ prompt_compute_btn.click(
+ fn=preprocess_dataset,
+ inputs=[
+ upload_file,
+ lang,
+ out_path,
+ ],
+ outputs=[
+ progress_data,
+ train_csv,
+ eval_csv,
+ ],
+ )
+
+
+ train_btn.click(
+ fn=train_model,
+ inputs=[
+ lang,
+ train_csv,
+ eval_csv,
+ num_epochs,
+ batch_size,
+ grad_acumm,
+ out_path,
+ max_audio_length,
+ ],
+ outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
+ )
+
+ load_btn.click(
+ fn=load_model,
+ inputs=[
+ xtts_checkpoint,
+ xtts_config,
+ xtts_vocab
+ ],
+ outputs=[progress_load],
+ )
+
+ tts_btn.click(
+ fn=run_tts,
+ inputs=[
+ tts_language,
+ tts_text,
+ speaker_reference_audio,
+ ],
+ outputs=[progress_gen, tts_output_audio, reference_audio],
+ )
+
+ demo.launch(
+ share=True,
+ debug=False,
+ server_port=args.port,
+ server_name="0.0.0.0"
+ )
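
The two `os.system(f"cp ...")` calls in `train_model` above are Unix-only and break on paths containing spaces. Below is a minimal, portable sketch using the standard library; it is an editorial suggestion, not part of the patch, and the paths shown are hypothetical stand-ins for the values returned by `train_gpt`:

```python
import shutil

# hypothetical paths; in train_model these come from train_gpt()
config_path = "/tmp/xtts/config.json"
vocab_file = "/tmp/xtts/vocab.json"
exp_path = "/tmp/xtts/run-1"

# portable equivalents of `cp config_path exp_path` and `cp vocab_file exp_path`
shutil.copy(config_path, exp_path)
shutil.copy(vocab_file, exp_path)
```
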
diff --git a/submodules/TTS/TTS/encoder/README.md b/submodules/TTS/TTS/encoder/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b38b20052b707b0358068bc0ce58bc300a149def
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/README.md
@@ -0,0 +1,18 @@
+### Speaker Encoder
+
+This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
+
+With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
+
+Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
+
+
+
+Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
+
+To run the code, you need to follow the same flow as in TTS.
+
+- Define 'config.json' for your needs. Note that audio parameters should match your TTS model.
+- Example training call: ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
+- Generate embedding vectors: ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+- Watch training on TensorBoard, as in TTS.
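
The README does not show how the generated d-vectors are typically compared. As an illustrative sketch only (assuming each embedding has been loaded as a 1-D NumPy array; the exact on-disk format is whatever `compute_embeddings.py` produces), cosine similarity is the usual speaker-verification score:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # cosine similarity between two d-vectors; values close to 1 suggest the same speaker
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

# hypothetical embeddings; in practice these are loaded from the output folder
emb_a = np.random.randn(256)
emb_b = np.random.randn(256)
print(cosine_similarity(emb_a, emb_b))
```
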
diff --git a/submodules/TTS/TTS/encoder/__init__.py b/submodules/TTS/TTS/encoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/encoder/configs/base_encoder_config.py b/submodules/TTS/TTS/encoder/configs/base_encoder_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebbaa0457bb55aef70d54dd36fd9b2b7f7c702bb
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/configs/base_encoder_config.py
@@ -0,0 +1,61 @@
+from dataclasses import asdict, dataclass, field
+from typing import Dict, List
+
+from coqpit import MISSING
+
+from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
+
+
+@dataclass
+class BaseEncoderConfig(BaseTrainingConfig):
+ """Defines parameters for a Generic Encoder model."""
+
+ model: str = None
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
+ # model params
+ model_params: Dict = field(
+ default_factory=lambda: {
+ "model_name": "lstm",
+ "input_dim": 80,
+ "proj_dim": 256,
+ "lstm_dim": 768,
+ "num_lstm_layers": 3,
+ "use_lstm_with_projection": True,
+ }
+ )
+
+ audio_augmentation: Dict = field(default_factory=lambda: {})
+
+ # training params
+ epochs: int = 10000
+ loss: str = "angleproto"
+ grad_clip: float = 3.0
+ lr: float = 0.0001
+ optimizer: str = "radam"
+ optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
+ lr_decay: bool = False
+ warmup_steps: int = 4000
+
+ # logging params
+ tb_model_param_stats: bool = False
+ steps_plot_stats: int = 10
+ save_step: int = 1000
+ print_step: int = 20
+ run_eval: bool = False
+
+ # data loader
+ num_classes_in_batch: int = MISSING
+ num_utter_per_class: int = MISSING
+ eval_num_classes_in_batch: int = None
+ eval_num_utter_per_class: int = None
+
+ num_loader_workers: int = MISSING
+ voice_len: float = 1.6
+
+ def check_values(self):
+ super().check_values()
+ c = asdict(self)
+ assert (
+ c["model_params"]["input_dim"] == self.audio.num_mels
+ ), " [!] model input dimendion must be equal to melspectrogram dimension."
diff --git a/submodules/TTS/TTS/encoder/configs/emotion_encoder_config.py b/submodules/TTS/TTS/encoder/configs/emotion_encoder_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eda2671be980abce4a0506a075387b601a1596c
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/configs/emotion_encoder_config.py
@@ -0,0 +1,12 @@
+from dataclasses import asdict, dataclass
+
+from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
+
+
+@dataclass
+class EmotionEncoderConfig(BaseEncoderConfig):
+ """Defines parameters for Emotion Encoder model."""
+
+ model: str = "emotion_encoder"
+ map_classid_to_classname: dict = None
+ class_name_key: str = "emotion_name"
diff --git a/submodules/TTS/TTS/encoder/configs/speaker_encoder_config.py b/submodules/TTS/TTS/encoder/configs/speaker_encoder_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dceb00277ba68efe128936ff7f9456338f9753f
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/configs/speaker_encoder_config.py
@@ -0,0 +1,11 @@
+from dataclasses import asdict, dataclass
+
+from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
+
+
+@dataclass
+class SpeakerEncoderConfig(BaseEncoderConfig):
+ """Defines parameters for Speaker Encoder model."""
+
+ model: str = "speaker_encoder"
+ class_name_key: str = "speaker_name"
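
The `MISSING` fields inherited from `BaseEncoderConfig` must be filled before training starts. A minimal sketch (illustrative values, not recommended hyperparameters; assumes the `TTS` package from this submodule is importable) of building and validating a speaker-encoder config:

```python
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    num_classes_in_batch=32,  # speakers per batch
    num_utter_per_class=4,    # utterances per speaker
    num_loader_workers=4,
)
# check_values() asserts that model_params["input_dim"] matches audio.num_mels (80 by default)
config.check_values()
```
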
diff --git a/submodules/TTS/TTS/encoder/dataset.py b/submodules/TTS/TTS/encoder/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..582b1fe9ca35cb9afbc20b8f72b6173282201272
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/dataset.py
@@ -0,0 +1,147 @@
+import random
+
+import torch
+from torch.utils.data import Dataset
+
+from TTS.encoder.utils.generic_utils import AugmentWAV
+
+
+class EncoderDataset(Dataset):
+ def __init__(
+ self,
+ config,
+ ap,
+ meta_data,
+ voice_len=1.6,
+ num_classes_in_batch=64,
+ num_utter_per_class=10,
+ verbose=False,
+ augmentation_config=None,
+ use_torch_spec=None,
+ ):
+ """
+ Args:
+ ap (TTS.tts.utils.AudioProcessor): audio processor object.
+ meta_data (list): list of dataset instances.
+ seq_len (int): voice segment length in seconds.
+ verbose (bool): print diagnostic information.
+ """
+ super().__init__()
+ self.config = config
+ self.items = meta_data
+ self.sample_rate = ap.sample_rate
+ self.seq_len = int(voice_len * self.sample_rate)
+ self.num_utter_per_class = num_utter_per_class
+ self.ap = ap
+ self.verbose = verbose
+ self.use_torch_spec = use_torch_spec
+ self.classes, self.items = self.__parse_items()
+
+ self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
+
+ # Data Augmentation
+ self.augmentator = None
+ self.gaussian_augmentation_config = None
+ if augmentation_config:
+ self.data_augmentation_p = augmentation_config["p"]
+ if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
+ self.augmentator = AugmentWAV(ap, augmentation_config)
+
+ if "gaussian" in augmentation_config.keys():
+ self.gaussian_augmentation_config = augmentation_config["gaussian"]
+
+ if self.verbose:
+ print("\n > DataLoader initialization")
+ print(f" | > Classes per Batch: {num_classes_in_batch}")
+ print(f" | > Number of instances : {len(self.items)}")
+ print(f" | > Sequence length: {self.seq_len}")
+ print(f" | > Num Classes: {len(self.classes)}")
+ print(f" | > Classes: {self.classes}")
+
+ def load_wav(self, filename):
+ audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
+ return audio
+
+ def __parse_items(self):
+ class_to_utters = {}
+ for item in self.items:
+ path_ = item["audio_file"]
+ class_name = item[self.config.class_name_key]
+ if class_name in class_to_utters.keys():
+ class_to_utters[class_name].append(path_)
+ else:
+ class_to_utters[class_name] = [
+ path_,
+ ]
+
+ # keep only classes that have at least self.num_utter_per_class samples
+ class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
+
+ classes = list(class_to_utters.keys())
+ classes.sort()
+
+ new_items = []
+ for item in self.items:
+ path_ = item["audio_file"]
+ class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
+ # ignore filtered classes
+ if class_name not in classes:
+ continue
+ # ignore audio clips shorter than the sequence length
+ if self.load_wav(path_).shape[0] - self.seq_len <= 0:
+ continue
+
+ new_items.append({"wav_file_path": path_, "class_name": class_name})
+
+ return classes, new_items
+
+ def __len__(self):
+ return len(self.items)
+
+ def get_num_classes(self):
+ return len(self.classes)
+
+ def get_class_list(self):
+ return self.classes
+
+ def set_classes(self, classes):
+ self.classes = classes
+ self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
+
+ def get_map_classid_to_classname(self):
+ return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
+
+ def __getitem__(self, idx):
+ return self.items[idx]
+
+ def collate_fn(self, batch):
+ # get the batch class_ids
+ labels = []
+ feats = []
+ for item in batch:
+ utter_path = item["wav_file_path"]
+ class_name = item["class_name"]
+
+ # get classid
+ class_id = self.classname_to_classid[class_name]
+ # load wav file
+ wav = self.load_wav(utter_path)
+ offset = random.randint(0, wav.shape[0] - self.seq_len)
+ wav = wav[offset : offset + self.seq_len]
+
+ if self.augmentator is not None and self.data_augmentation_p:
+ if random.random() < self.data_augmentation_p:
+ wav = self.augmentator.apply_one(wav)
+
+ if not self.use_torch_spec:
+ mel = self.ap.melspectrogram(wav)
+ feats.append(torch.FloatTensor(mel))
+ else:
+ feats.append(torch.FloatTensor(wav))
+
+ labels.append(class_id)
+
+ feats = torch.stack(feats)
+ labels = torch.LongTensor(labels)
+
+ return feats, labels
diff --git a/submodules/TTS/TTS/encoder/losses.py b/submodules/TTS/TTS/encoder/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b5aa0fc48fe00aeedeff28ba48ed2af498ce582
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/losses.py
@@ -0,0 +1,226 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+# adapted from https://github.com/cvqluu/GE2E-Loss
+class GE2ELoss(nn.Module):
+ def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
+ """
+ Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
+ Accepts an input of size (N, M, D)
+ where N is the number of speakers in the batch,
+ M is the number of utterances per speaker,
+ and D is the dimensionality of the embedding vector (e.g. d-vector)
+ Args:
+ - init_w (float): defines the initial value of w in Equation (5) of [1]
+ - init_b (float): defines the initial value of b in Equation (5) of [1]
+ """
+ super().__init__()
+ # pylint: disable=E1102
+ self.w = nn.Parameter(torch.tensor(init_w))
+ # pylint: disable=E1102
+ self.b = nn.Parameter(torch.tensor(init_b))
+ self.loss_method = loss_method
+
+ print(" > Initialized Generalized End-to-End loss")
+
+ assert self.loss_method in ["softmax", "contrast"]
+
+ if self.loss_method == "softmax":
+ self.embed_loss = self.embed_loss_softmax
+ if self.loss_method == "contrast":
+ self.embed_loss = self.embed_loss_contrast
+
+ # pylint: disable=R0201
+ def calc_new_centroids(self, dvecs, centroids, spkr, utt):
+ """
+ Calculates the new centroids excluding the reference utterance
+ """
+ excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
+ excl = torch.mean(excl, 0)
+ new_centroids = []
+ for i, centroid in enumerate(centroids):
+ if i == spkr:
+ new_centroids.append(excl)
+ else:
+ new_centroids.append(centroid)
+ return torch.stack(new_centroids)
+
+ def calc_cosine_sim(self, dvecs, centroids):
+ """
+ Make the cosine similarity matrix with dims (N,M,N)
+ """
+ cos_sim_matrix = []
+ for spkr_idx, speaker in enumerate(dvecs):
+ cs_row = []
+ for utt_idx, utterance in enumerate(speaker):
+ new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
+ # vector based cosine similarity for speed
+ cs_row.append(
+ torch.clamp(
+ torch.mm(
+ utterance.unsqueeze(1).transpose(0, 1),
+ new_centroids.transpose(0, 1),
+ )
+ / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
+ 1e-6,
+ )
+ )
+ cs_row = torch.cat(cs_row, dim=0)
+ cos_sim_matrix.append(cs_row)
+ return torch.stack(cos_sim_matrix)
+
+ # pylint: disable=R0201
+ def embed_loss_softmax(self, dvecs, cos_sim_matrix):
+ """
+ Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
+ """
+ N, M, _ = dvecs.shape
+ L = []
+ for j in range(N):
+ L_row = []
+ for i in range(M):
+ L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
+ L_row = torch.stack(L_row)
+ L.append(L_row)
+ return torch.stack(L)
+
+ # pylint: disable=R0201
+ def embed_loss_contrast(self, dvecs, cos_sim_matrix):
+ """
+ Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
+ """
+ N, M, _ = dvecs.shape
+ L = []
+ for j in range(N):
+ L_row = []
+ for i in range(M):
+ centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
+ excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
+ L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
+ L_row = torch.stack(L_row)
+ L.append(L_row)
+ return torch.stack(L)
+
+ def forward(self, x, _label=None):
+ """
+ Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+ """
+
+ assert x.size()[1] >= 2
+
+ centroids = torch.mean(x, 1)
+ cos_sim_matrix = self.calc_cosine_sim(x, centroids)
+ torch.clamp(self.w, 1e-6)
+ cos_sim_matrix = self.w * cos_sim_matrix + self.b
+ L = self.embed_loss(x, cos_sim_matrix)
+ return L.mean()
+
+
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+ """
+ Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+ Accepts an input of size (N, M, D)
+ where N is the number of speakers in the batch,
+ M is the number of utterances per speaker,
+ and D is the dimensionality of the embedding vector
+ Args:
+ - init_w (float): defines the initial value of w
+ - init_b (float): defines the initial value of b
+ """
+
+ def __init__(self, init_w=10.0, init_b=-5.0):
+ super().__init__()
+ # pylint: disable=E1102
+ self.w = nn.Parameter(torch.tensor(init_w))
+ # pylint: disable=E1102
+ self.b = nn.Parameter(torch.tensor(init_b))
+ self.criterion = torch.nn.CrossEntropyLoss()
+
+ print(" > Initialized Angular Prototypical loss")
+
+ def forward(self, x, _label=None):
+ """
+ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+ """
+
+ assert x.size()[1] >= 2
+
+ out_anchor = torch.mean(x[:, 1:, :], 1)
+ out_positive = x[:, 0, :]
+ num_speakers = out_anchor.size()[0]
+
+ cos_sim_matrix = F.cosine_similarity(
+ out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
+ out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
+ )
+ torch.clamp(self.w, 1e-6)
+ cos_sim_matrix = cos_sim_matrix * self.w + self.b
+ label = torch.arange(num_speakers).to(cos_sim_matrix.device)
+ L = self.criterion(cos_sim_matrix, label)
+ return L
+
+
+class SoftmaxLoss(nn.Module):
+ """
+ Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
+ Args:
+ - embedding_dim (float): speaker embedding dim
+ - n_speakers (float): number of speakers
+ """
+
+ def __init__(self, embedding_dim, n_speakers):
+ super().__init__()
+
+ self.criterion = torch.nn.CrossEntropyLoss()
+ self.fc = nn.Linear(embedding_dim, n_speakers)
+
+ print("Initialised Softmax Loss")
+
+ def forward(self, x, label=None):
+ # reshape for compatibility
+ x = x.reshape(-1, x.size()[-1])
+ label = label.reshape(-1)
+
+ x = self.fc(x)
+ L = self.criterion(x, label)
+
+ return L
+
+ def inference(self, embedding):
+ x = self.fc(embedding)
+ activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
+ class_id = torch.argmax(activations)
+ return class_id
+
+
+class SoftmaxAngleProtoLoss(nn.Module):
+ """
+ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
+ Args:
+ - embedding_dim (float): speaker embedding dim
+ - n_speakers (float): number of speakers
+ - init_w (float): defines the initial value of w
+ - init_b (float): defines the initial value of b
+ """
+
+ def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
+ super().__init__()
+
+ self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
+ self.angleproto = AngleProtoLoss(init_w, init_b)
+
+ print("Initialised SoftmaxAnglePrototypical Loss")
+
+ def forward(self, x, label=None):
+ """
+ Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+ """
+
+ Lp = self.angleproto(x)
+
+ Ls = self.softmax(x, label)
+
+ return Ls + Lp
diff --git a/submodules/TTS/TTS/encoder/requirements.txt b/submodules/TTS/TTS/encoder/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a486cc45ddb44591bd03c9c0df294fbe98c13884
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/requirements.txt
@@ -0,0 +1,2 @@
+umap-learn
+numpy>=1.17.0
diff --git a/submodules/TTS/TTS/encoder/utils/__init__.py b/submodules/TTS/TTS/encoder/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/encoder/utils/generic_utils.py b/submodules/TTS/TTS/encoder/utils/generic_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..236d6fe937a9637fd86f06bea5fb45fad4ee2502
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/utils/generic_utils.py
@@ -0,0 +1,136 @@
+import glob
+import os
+import random
+
+import numpy as np
+from scipy import signal
+
+from TTS.encoder.models.lstm import LSTMSpeakerEncoder
+from TTS.encoder.models.resnet import ResNetSpeakerEncoder
+
+
+class AugmentWAV(object):
+ def __init__(self, ap, augmentation_config):
+ self.ap = ap
+ self.use_additive_noise = False
+
+ if "additive" in augmentation_config.keys():
+ self.additive_noise_config = augmentation_config["additive"]
+ additive_path = self.additive_noise_config["sounds_path"]
+ if additive_path:
+ self.use_additive_noise = True
+ # get noise types
+ self.additive_noise_types = []
+ for key in self.additive_noise_config.keys():
+ if isinstance(self.additive_noise_config[key], dict):
+ self.additive_noise_types.append(key)
+
+ additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)
+
+ self.noise_list = {}
+
+ for wav_file in additive_files:
+ noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
+ # ignore directories not listed in the augmentation config
+ if noise_dir not in self.additive_noise_types:
+ continue
+ if noise_dir not in self.noise_list:
+ self.noise_list[noise_dir] = []
+ self.noise_list[noise_dir].append(wav_file)
+
+ print(
+ f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
+ )
+
+ self.use_rir = False
+
+ if "rir" in augmentation_config.keys():
+ self.rir_config = augmentation_config["rir"]
+ if self.rir_config["rir_path"]:
+ self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
+ self.use_rir = True
+
+ print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
+
+ self.create_augmentation_global_list()
+
+ def create_augmentation_global_list(self):
+ if self.use_additive_noise:
+ self.global_noise_list = self.additive_noise_types
+ else:
+ self.global_noise_list = []
+ if self.use_rir:
+ self.global_noise_list.append("RIR_AUG")
+
+ def additive_noise(self, noise_type, audio):
+ clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
+
+ noise_list = random.sample(
+ self.noise_list[noise_type],
+ random.randint(
+ self.additive_noise_config[noise_type]["min_num_noises"],
+ self.additive_noise_config[noise_type]["max_num_noises"],
+ ),
+ )
+
+ audio_len = audio.shape[0]
+ noises_wav = None
+ for noise in noise_list:
+ noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
+
+ if noiseaudio.shape[0] < audio_len:
+ continue
+
+ noise_snr = random.uniform(
+ self.additive_noise_config[noise_type]["min_snr_in_db"],
+ self.additive_noise_config[noise_type]["max_snr_in_db"],
+ )
+ noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
+ noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
+
+ if noises_wav is None:
+ noises_wav = noise_wav
+ else:
+ noises_wav += noise_wav
+
+ # if every sampled noise file was shorter than the audio, resample with other files
+ if noises_wav is None:
+ return self.additive_noise(noise_type, audio)
+
+ return audio + noises_wav
+
+ def reverberate(self, audio):
+ audio_len = audio.shape[0]
+
+ rir_file = random.choice(self.rir_files)
+ rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
+ rir = rir / np.sqrt(np.sum(rir**2))
+ return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]
+
+ def apply_one(self, audio):
+ noise_type = random.choice(self.global_noise_list)
+ if noise_type == "RIR_AUG":
+ return self.reverberate(audio)
+
+ return self.additive_noise(noise_type, audio)
+
+
+def setup_encoder_model(config: "Coqpit"):
+ if config.model_params["model_name"].lower() == "lstm":
+ model = LSTMSpeakerEncoder(
+ config.model_params["input_dim"],
+ config.model_params["proj_dim"],
+ config.model_params["lstm_dim"],
+ config.model_params["num_lstm_layers"],
+ use_torch_spec=config.model_params.get("use_torch_spec", False),
+ audio_config=config.audio,
+ )
+ elif config.model_params["model_name"].lower() == "resnet":
+ model = ResNetSpeakerEncoder(
+ input_dim=config.model_params["input_dim"],
+ proj_dim=config.model_params["proj_dim"],
+ log_input=config.model_params.get("log_input", False),
+ use_torch_spec=config.model_params.get("use_torch_spec", False),
+ audio_config=config.audio,
+ )
+ return model
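
The SNR arithmetic inside `additive_noise` is easier to follow in isolation. A standalone sketch of the same dB-based scaling (NumPy only, with synthetic signals):

```python
import numpy as np

rng = np.random.default_rng(0)
clean = rng.standard_normal(16000)  # 1 second of fake speech at 16 kHz
noise = rng.standard_normal(16000)  # 1 second of fake noise
target_snr_db = 10.0

clean_db = 10 * np.log10(np.mean(clean**2) + 1e-4)
noise_db = 10 * np.log10(np.mean(noise**2) + 1e-4)
# scale the noise so that the resulting clean-to-noise power ratio equals the target SNR
scale = np.sqrt(10 ** ((clean_db - noise_db - target_snr_db) / 10))
noisy = clean + scale * noise
```
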
diff --git a/submodules/TTS/TTS/encoder/utils/prepare_voxceleb.py b/submodules/TTS/TTS/encoder/utils/prepare_voxceleb.py
new file mode 100644
index 0000000000000000000000000000000000000000..b93baf9e60f0d5c35a4e86f6746e29f6097174b5
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/utils/prepare_voxceleb.py
@@ -0,0 +1,219 @@
+# coding=utf-8
+# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Only support eager mode and TF>=2.0.0
+# pylint: disable=no-member, invalid-name, relative-beyond-top-level
+# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
+""" voxceleb 1 & 2 """
+
+import hashlib
+import os
+import subprocess
+import sys
+import zipfile
+
+import pandas
+import soundfile as sf
+from absl import logging
+
+SUBSETS = {
+ "vox1_dev_wav": [
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
+ ],
+ "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
+ "vox2_dev_aac": [
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
+ ],
+ "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
+}
+
+MD5SUM = {
+ "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
+ "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
+ "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
+ "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
+}
+
+USER = {"user": "", "password": ""}
+
+speaker_id_dict = {}
+
+
+def download_and_extract(directory, subset, urls):
+ """Download and extract the given split of dataset.
+
+ Args:
+ directory: the directory where to put the downloaded data.
+ subset: subset name of the corpus.
+ urls: the list of urls to download the data file.
+ """
+ os.makedirs(directory, exist_ok=True)
+
+ try:
+ for url in urls:
+ zip_filepath = os.path.join(directory, url.split("/")[-1])
+ if os.path.exists(zip_filepath):
+ continue
+ logging.info("Downloading %s to %s" % (url, zip_filepath))
+ subprocess.call(
+ "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
+ shell=True,
+ )
+
+ statinfo = os.stat(zip_filepath)
+ logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
+
+ # concatenate all parts into zip files
+ if ".zip" not in zip_filepath:
+ zip_filepath = "_".join(zip_filepath.split("_")[:-1])
+ subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True)
+ zip_filepath += ".zip"
+ extract_path = zip_filepath.strip(".zip")
+
+ # check zip file md5sum
+ with open(zip_filepath, "rb") as f_zip:
+ md5 = hashlib.md5(f_zip.read()).hexdigest()
+ if md5 != MD5SUM[subset]:
+ raise ValueError("md5sum of %s mismatch" % zip_filepath)
+
+ with zipfile.ZipFile(zip_filepath, "r") as zfile:
+ zfile.extractall(directory)
+ extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)
+ subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True)
+ finally:
+ # os.remove(zip_filepath)
+ pass
+
+
+def exec_cmd(cmd):
+ """Run a command in a subprocess.
+ Args:
+ cmd: command line to be executed.
+ Return:
+ int, the return code.
+ """
+ try:
+ retcode = subprocess.call(cmd, shell=True)
+ if retcode < 0:
+ logging.info(f"Child was terminated by signal {retcode}")
+ except OSError as e:
+ logging.info(f"Execution failed: {e}")
+ retcode = -999
+ return retcode
+
+
+def decode_aac_with_ffmpeg(aac_file, wav_file):
+ """Decode a given AAC file into WAV using ffmpeg.
+ Args:
+ aac_file: file path to input AAC file.
+ wav_file: file path to output WAV file.
+ Return:
+ bool, True if success.
+ """
+ cmd = f"ffmpeg -i {aac_file} {wav_file}"
+ logging.info(f"Decoding aac file using command line: {cmd}")
+ ret = exec_cmd(cmd)
+ if ret != 0:
+ logging.error(f"Failed to decode aac file with retcode {ret}")
+ logging.error("Please check your ffmpeg installation.")
+ return False
+ return True
+
+
+def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
+ """Optionally convert AAC to WAV and make speaker labels.
+ Args:
+ input_dir: the directory which holds the input dataset.
+ subset: the name of the specified subset. e.g. vox1_dev_wav
+ output_dir: the directory to place the newly generated csv files.
+ output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
+ """
+
+ logging.info("Preprocessing audio and label for subset %s" % subset)
+ source_dir = os.path.join(input_dir, subset)
+
+ files = []
+ # Convert all AAC files into WAV format and generate the csv at the same time.
+ for root, _, filenames in os.walk(source_dir):
+ for filename in filenames:
+ name, ext = os.path.splitext(filename)
+ if ext.lower() == ".wav":
+ _, ext2 = os.path.splitext(name)
+ if ext2:
+ continue
+ wav_file = os.path.join(root, filename)
+ elif ext.lower() == ".m4a":
+ # Convert AAC to WAV.
+ aac_file = os.path.join(root, filename)
+ wav_file = aac_file + ".wav"
+ if not os.path.exists(wav_file):
+ if not decode_aac_with_ffmpeg(aac_file, wav_file):
+ raise RuntimeError("Audio decoding failed.")
+ else:
+ continue
+ speaker_name = root.split(os.path.sep)[-2]
+ if speaker_name not in speaker_id_dict:
+ num = len(speaker_id_dict)
+ speaker_id_dict[speaker_name] = num
+ # wav_filesize = os.path.getsize(wav_file)
+ wav_length = len(sf.read(wav_file)[0])
+ files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name))
+
+ # Write to CSV file which contains four columns:
+ # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
+ csv_file_path = os.path.join(output_dir, output_file)
+ df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
+ df.to_csv(csv_file_path, index=False, sep="\t")
+ logging.info("Successfully generated csv file {}".format(csv_file_path))
+
+
+def processor(directory, subset, force_process):
+ """download and process"""
+ urls = SUBSETS
+ if subset not in urls:
+ raise ValueError(subset, "is not in voxceleb")
+
+ subset_csv = os.path.join(directory, subset + ".csv")
+ if not force_process and os.path.exists(subset_csv):
+ return subset_csv
+
+ logging.info("Downloading and process the voxceleb in %s", directory)
+ logging.info("Preparing subset %s", subset)
+ download_and_extract(directory, subset, urls[subset])
+ convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
+ logging.info("Finished downloading and processing")
+ return subset_csv
+
+
+if __name__ == "__main__":
+ logging.set_verbosity(logging.INFO)
+ if len(sys.argv) != 4:
+ print("Usage: python prepare_data.py save_directory user password")
+ sys.exit()
+
+ DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
+ for SUBSET in SUBSETS:
+ processor(DIR, SUBSET, False)
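
The generated CSV is tab-separated with the four columns written by `convert_audio_and_make_label`. Note that, despite the `wav_length_ms` header, the stored value is the sample count returned by `soundfile.read`. A quick sketch for inspecting a generated file (the file name depends on the subset you processed):

```python
import pandas as pd

df = pd.read_csv("vox1_dev_wav.csv", sep="\t")
print(df.columns.tolist())  # ['wav_filename', 'wav_length_ms', 'speaker_id', 'speaker_name']
print(df.head())
```
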
diff --git a/submodules/TTS/TTS/encoder/utils/training.py b/submodules/TTS/TTS/encoder/utils/training.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff8f271d80c40ff8fa5bbb824615c19d0f99d19d
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/utils/training.py
@@ -0,0 +1,99 @@
+import os
+from dataclasses import dataclass, field
+
+from coqpit import Coqpit
+from trainer import TrainerArgs, get_last_checkpoint
+from trainer.io import copy_model_files
+from trainer.logging import logger_factory
+from trainer.logging.console_logger import ConsoleLogger
+
+from TTS.config import load_config, register_config
+from TTS.tts.utils.text.characters import parse_symbols
+from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
+
+
+@dataclass
+class TrainArgs(TrainerArgs):
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+
+
+def getarguments():
+ train_config = TrainArgs()
+ parser = train_config.init_argparse(arg_prefix="")
+ return parser
+
+
+def process_args(args, config=None):
+ """Process parsed comand line arguments and initialize the config if not provided.
+ Args:
+ args (argparse.Namespace or dict like): Parsed input arguments.
+ config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
+ Returns:
+ c (TTS.utils.io.AttrDict): Config paramaters.
+ out_path (str): Path to save models and logging.
+ audio_path (str): Path to save generated test audios.
+ c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
+ logging to the console.
+ dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
+ TODO:
+ - Interactive config definition.
+ """
+ if isinstance(args, tuple):
+ args, coqpit_overrides = args
+ if args.continue_path:
+ # continue a previous training from its output folder
+ experiment_path = args.continue_path
+ args.config_path = os.path.join(args.continue_path, "config.json")
+ args.restore_path, best_model = get_last_checkpoint(args.continue_path)
+ if not args.best_path:
+ args.best_path = best_model
+ # init config if not already defined
+ if config is None:
+ if args.config_path:
+ # init from a file
+ config = load_config(args.config_path)
+ else:
+ # init from console args
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
+
+ config_base = BaseTrainingConfig()
+ config_base.parse_known_args(coqpit_overrides)
+ config = register_config(config_base.model)()
+ # override values from command-line args
+ config.parse_known_args(coqpit_overrides, relaxed_parser=True)
+ experiment_path = args.continue_path
+ if not experiment_path:
+ experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
+ audio_path = os.path.join(experiment_path, "test_audios")
+ config.output_log_path = experiment_path
+ # setup rank 0 process in distributed training
+ dashboard_logger = None
+ if args.rank == 0:
+ new_fields = {}
+ if args.restore_path:
+ new_fields["restore_path"] = args.restore_path
+ new_fields["github_branch"] = get_git_branch()
+ # if model characters are not set in the config file
+ # save the default set to the config file for future
+ # compatibility.
+ if config.has("characters") and config.characters is None:
+ used_characters = parse_symbols()
+ new_fields["characters"] = used_characters
+ copy_model_files(config, experiment_path, new_fields)
+ dashboard_logger = logger_factory(config, experiment_path)
+ c_logger = ConsoleLogger()
+ return config, experiment_path, audio_path, c_logger, dashboard_logger
+
+
+def init_arguments():
+ train_config = TrainArgs()
+ parser = train_config.init_argparse(arg_prefix="")
+ return parser
+
+
+def init_training(config: Coqpit = None):
+ """Initialization of a training run."""
+ parser = init_arguments()
+ args = parser.parse_known_args()
+ config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
+ return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger
diff --git a/submodules/TTS/TTS/encoder/utils/visual.py b/submodules/TTS/TTS/encoder/utils/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..6575b86ec22818fe1dc0c1e6336a7fd255855330
--- /dev/null
+++ b/submodules/TTS/TTS/encoder/utils/visual.py
@@ -0,0 +1,50 @@
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import umap
+
+matplotlib.use("Agg")
+
+
+colormap = (
+ np.array(
+ [
+ [76, 255, 0],
+ [0, 127, 70],
+ [255, 0, 0],
+ [255, 217, 38],
+ [0, 135, 255],
+ [165, 0, 165],
+ [255, 167, 255],
+ [0, 255, 255],
+ [255, 96, 38],
+ [142, 76, 0],
+ [33, 0, 127],
+ [0, 0, 0],
+ [183, 183, 183],
+ ],
+ dtype=float,
+ )
+ / 255
+)
+
+
+def plot_embeddings(embeddings, num_classes_in_batch):
+ num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
+
+ # if necessary get just the first 10 classes
+ if num_classes_in_batch > 10:
+ num_classes_in_batch = 10
+ embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
+
+ model = umap.UMAP()
+ projection = model.fit_transform(embeddings)
+ ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
+ colors = [colormap[i] for i in ground_truth]
+ fig, ax = plt.subplots(figsize=(16, 10))
+ _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
+ plt.gca().set_aspect("equal", "datalim")
+ plt.title("UMAP projection")
+ plt.tight_layout()
+ plt.savefig("umap")
+ return fig
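
A minimal sketch of calling `plot_embeddings` with synthetic data (requires `umap-learn` from the encoder requirements above; in real usage the embeddings come from the speaker encoder):

```python
import numpy as np

from TTS.encoder.utils.visual import plot_embeddings

# hypothetical batch: 8 classes with 10 utterances each, 256-dim embeddings
embeddings = np.random.randn(80, 256)
fig = plot_embeddings(embeddings, num_classes_in_batch=8)
fig.savefig("umap_example.png")
```
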
diff --git a/submodules/TTS/TTS/model.py b/submodules/TTS/TTS/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae6be7b444695756c00c4faa8f2f6c787dfcf9d8
--- /dev/null
+++ b/submodules/TTS/TTS/model.py
@@ -0,0 +1,59 @@
+from abc import abstractmethod
+from typing import Dict
+
+import torch
+from coqpit import Coqpit
+from trainer import TrainerModel
+
+# pylint: skip-file
+
+
+class BaseTrainerModel(TrainerModel):
+ """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
+
+ Every new 🐸TTS model must inherit it.
+ """
+
+ @staticmethod
+ @abstractmethod
+ def init_from_config(config: Coqpit):
+ """Init the model and all its attributes from the given config.
+
+ Override this depending on your model.
+ """
+ ...
+
+ @abstractmethod
+ def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
+ """Forward pass for inference.
+
+ It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
+ is considered to be the main output and you can add any other auxiliary outputs as you want.
+
+ We don't use `*kwargs` since it is problematic with the TorchScript API.
+
+ Args:
+ input (torch.Tensor): [description]
+ aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
+
+ Returns:
+ Dict: [description]
+ """
+ outputs_dict = {"model_outputs": None}
+ ...
+ return outputs_dict
+
+ @abstractmethod
+ def load_checkpoint(
+ self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
+ ) -> None:
+ """Load a model checkpoint gile and get ready for training or inference.
+
+ Args:
+ config (Coqpit): Model configuration.
+ checkpoint_path (str): Path to the model checkpoint file.
+ eval (bool, optional): If true, init model for inference else for training. Defaults to False.
+ strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+ cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
+ """
+ ...
diff --git a/submodules/TTS/TTS/server/README.md b/submodules/TTS/TTS/server/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..270656c4e39dc11636efbb1ba51eba7c9b4a8f04
--- /dev/null
+++ b/submodules/TTS/TTS/server/README.md
@@ -0,0 +1,18 @@
+# :frog: TTS demo server
+Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
+
+**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` entry point on the terminal.
+
+Example runs:
+
+List officially released models.
+```python TTS/server/server.py --list_models ```
+
+Run the server with the official models.
+```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
+
+Run the server with the official models on a GPU.
+```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
+
+Run the server with custom models.
+```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
diff --git a/submodules/TTS/TTS/server/__init__.py b/submodules/TTS/TTS/server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/server/conf.json b/submodules/TTS/TTS/server/conf.json
new file mode 100644
index 0000000000000000000000000000000000000000..49b6c09c3848a224dfb39a1f653aa1b289a4b6e5
--- /dev/null
+++ b/submodules/TTS/TTS/server/conf.json
@@ -0,0 +1,12 @@
+{
+ "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
+ "tts_file":"best_model.pth", // tts checkpoint file
+ "tts_config":"config.json", // tts config.json file
+ "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
+ "vocoder_config":null,
+ "vocoder_file": null,
+ "is_wavernn_batched":true,
+ "port": 5002,
+ "use_cuda": true,
+ "debug": true
+}
diff --git a/submodules/TTS/TTS/server/server.py b/submodules/TTS/TTS/server/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b2141a9aa419b9095956ccae317621fa3a604da
--- /dev/null
+++ b/submodules/TTS/TTS/server/server.py
@@ -0,0 +1,258 @@
+#!flask/bin/python
+import argparse
+import io
+import json
+import os
+import sys
+from pathlib import Path
+from threading import Lock
+from typing import Union
+from urllib.parse import parse_qs
+
+from flask import Flask, render_template, render_template_string, request, send_file
+
+from TTS.config import load_config
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+def create_argparser():
+ def convert_boolean(x):
+ return x.lower() in ["true", "1", "yes"]
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--list_models",
+ type=convert_boolean,
+ nargs="?",
+ const=True,
+ default=False,
+ help="list available pre-trained tts and vocoder models.",
+ )
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ default="tts_models/en/ljspeech/tacotron2-DDC",
+ help="Name of one of the pre-trained tts models in format //",
+ )
+ parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")
+
+ # Args for running custom models
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ default=None,
+ help="Path to model file.",
+ )
+ parser.add_argument(
+ "--vocoder_path",
+ type=str,
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+ default=None,
+ )
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+ parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
+ parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
+ parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
+ parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
+ return parser
+
+
+# parse the args
+args = create_argparser().parse_args()
+
+path = Path(__file__).parent / "../.models.json"
+manager = ModelManager(path)
+
+if args.list_models:
+ manager.list_models()
+ sys.exit()
+
+# update in-use models to the specified released models.
+model_path = None
+config_path = None
+speakers_file_path = None
+vocoder_path = None
+vocoder_config_path = None
+
+# CASE1: list pre-trained TTS models
+if args.list_models:
+ manager.list_models()
+ sys.exit()
+
+# CASE2: load pre-trained model paths
+if args.model_name is not None and not args.model_path:
+ model_path, config_path, model_item = manager.download_model(args.model_name)
+ args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+
+if args.vocoder_name is not None and not args.vocoder_path:
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+
+# CASE3: set custom model paths
+if args.model_path is not None:
+ model_path = args.model_path
+ config_path = args.config_path
+ speakers_file_path = args.speakers_file_path
+
+if args.vocoder_path is not None:
+ vocoder_path = args.vocoder_path
+ vocoder_config_path = args.vocoder_config_path
+
+# load models
+synthesizer = Synthesizer(
+ tts_checkpoint=model_path,
+ tts_config_path=config_path,
+ tts_speakers_file=speakers_file_path,
+ tts_languages_file=None,
+ vocoder_checkpoint=vocoder_path,
+ vocoder_config=vocoder_config_path,
+ encoder_checkpoint="",
+ encoder_config="",
+ use_cuda=args.use_cuda,
+)
+
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
+ synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
+)
+speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
+
+use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and (
+ synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None
+)
+language_manager = getattr(synthesizer.tts_model, "language_manager", None)
+
+# TODO: set this from SpeakerManager
+use_gst = synthesizer.tts_config.get("use_gst", False)
+app = Flask(__name__)
+
+
+def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
+ """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer)
+ or a dict (gst tokens/values to be use for styling)
+
+ Args:
+ style_wav (str): uri
+
+ Returns:
+ Union[str, dict]: path to file (str) or gst style (dict)
+ """
+ if style_wav:
+ if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
+ return style_wav # style_wav is a .wav file located on the server
+
+ style_wav = json.loads(style_wav)
+ return style_wav # style_wav is a gst dictionary with {token1_id : token1_weigth, ...}
+ return None
+
+
+@app.route("/")
+def index():
+ return render_template(
+ "index.html",
+ show_details=args.show_details,
+ use_multi_speaker=use_multi_speaker,
+ use_multi_language=use_multi_language,
+ speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
+ language_ids=language_manager.name_to_id if language_manager is not None else None,
+ use_gst=use_gst,
+ )
+
+
+@app.route("/details")
+def details():
+ if args.config_path is not None and os.path.isfile(args.config_path):
+ model_config = load_config(args.config_path)
+ else:
+ if args.model_name is not None:
+ model_config = load_config(config_path)
+
+ if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
+ vocoder_config = load_config(args.vocoder_config_path)
+ else:
+ if args.vocoder_name is not None:
+ vocoder_config = load_config(vocoder_config_path)
+ else:
+ vocoder_config = None
+
+ return render_template(
+ "details.html",
+ show_details=args.show_details,
+ model_config=model_config,
+ vocoder_config=vocoder_config,
+ args=args.__dict__,
+ )
+
+
+lock = Lock()
+
+
+@app.route("/api/tts", methods=["GET", "POST"])
+def tts():
+ with lock:
+ text = request.headers.get("text") or request.values.get("text", "")
+ speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "")
+ language_idx = request.headers.get("language-id") or request.values.get("language_id", "")
+ style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
+ style_wav = style_wav_uri_to_dict(style_wav)
+
+ print(f" > Model input: {text}")
+ print(f" > Speaker Idx: {speaker_idx}")
+ print(f" > Language Idx: {language_idx}")
+ wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
+ out = io.BytesIO()
+ synthesizer.save_wav(wavs, out)
+ return send_file(out, mimetype="audio/wav")
+
+
+# Basic MaryTTS compatibility layer
+
+
+@app.route("/locales", methods=["GET"])
+def mary_tts_api_locales():
+ """MaryTTS-compatible /locales endpoint"""
+ # NOTE: We currently assume there is only one model active at the same time
+ if args.model_name is not None:
+ model_details = args.model_name.split("/")
+ else:
+ model_details = ["", "en", "", "default"]
+ return render_template_string("{{ locale }}\n", locale=model_details[1])
+
+
+@app.route("/voices", methods=["GET"])
+def mary_tts_api_voices():
+ """MaryTTS-compatible /voices endpoint"""
+ # NOTE: We currently assume there is only one model active at the same time
+ if args.model_name is not None:
+ model_details = args.model_name.split("/")
+ else:
+ model_details = ["", "en", "", "default"]
+ return render_template_string(
+ "{{ name }} {{ locale }} {{ gender }}\n", name=model_details[3], locale=model_details[1], gender="u"
+ )
+
+
+@app.route("/process", methods=["GET", "POST"])
+def mary_tts_api_process():
+ """MaryTTS-compatible /process endpoint"""
+ with lock:
+ if request.method == "POST":
+ data = parse_qs(request.get_data(as_text=True))
+ # NOTE: we ignore param. LOCALE and VOICE for now since we have only one active model
+ text = data.get("INPUT_TEXT", [""])[0]
+ else:
+ text = request.args.get("INPUT_TEXT", "")
+ print(f" > Model input: {text}")
+ wavs = synthesizer.tts(text)
+ out = io.BytesIO()
+ synthesizer.save_wav(wavs, out)
+ return send_file(out, mimetype="audio/wav")
+
+
+def main():
+ app.run(debug=args.debug, host="::", port=args.port)
+
+
+if __name__ == "__main__":
+ main()
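
A hedged client-side sketch for the `/api/tts` route above (assumes the server is running locally on the default port 5002 with a single-speaker model, so the speaker and language fields can be omitted):

```python
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the demo server."},
)
resp.raise_for_status()
with open("tts_output.wav", "wb") as f:
    f.write(resp.content)
```
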
diff --git a/submodules/TTS/TTS/server/static/coqui-log-green-TTS.png b/submodules/TTS/TTS/server/static/coqui-log-green-TTS.png
new file mode 100644
index 0000000000000000000000000000000000000000..62cb253cc83ee5f0216f5195fef7d87f44fa84fc
--- /dev/null
+++ b/submodules/TTS/TTS/server/static/coqui-log-green-TTS.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19396284c1cf4bd73441283f79fd100b192a08da930b76d615a72003893e3b59
+size 61564
diff --git a/submodules/TTS/TTS/server/templates/details.html b/submodules/TTS/TTS/server/templates/details.html
new file mode 100644
index 0000000000000000000000000000000000000000..51c9ed85a83ac0aab045623ee1e6c430fbe51b9d
--- /dev/null
+++ b/submodules/TTS/TTS/server/templates/details.html
@@ -0,0 +1,131 @@
+
+
+
+
+
+
+
+
+
+
+ TTS engine
+
+
+
+
+
+
+
+
+
+
+
+ {% if show_details == true %}
+
+
+ Model details
+
+
+
+
+ CLI arguments:
+
+
+ CLI key
+ Value
+
+
+ {% for key, value in args.items() %}
+
+
+ {{ key }}
+ {{ value }}
+
+
+ {% endfor %}
+
+
+
+
+
+
+ {% if model_config != None %}
+
+
+ Model config:
+
+
+
+ Key
+ Value
+
+
+
+ {% for key, value in model_config.items() %}
+
+
+ {{ key }}
+ {{ value }}
+
+
+ {% endfor %}
+
+
+
+
+ {% endif %}
+
+
+
+
+
+
+ {% if vocoder_config != None %}
+
+ Vocoder model config:
+
+
+
+ Key
+ Value
+
+
+
+ {% for key, value in vocoder_config.items() %}
+
+
+ {{ key }}
+ {{ value }}
+
+
+ {% endfor %}
+
+
+
+
+ {% endif %}
+
+
+ {% else %}
+
+ Please start server with --show_details=true to see details.
+
+
+ {% endif %}
+
+
+
+
\ No newline at end of file
diff --git a/submodules/TTS/TTS/server/templates/index.html b/submodules/TTS/TTS/server/templates/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..6354d3919d9a1e9c1e22e9866c84c4eb8284bc13
--- /dev/null
+++ b/submodules/TTS/TTS/server/templates/index.html
@@ -0,0 +1,154 @@
+
+
+
+
+
+
+
+
+
+
+ TTS engine
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {%if use_gst%}
+
+ {%endif%}
+
+
+
Speak
+
+ {%if use_multi_speaker%}
+ Choose a speaker:
+
+ {% for speaker_id in speaker_ids %}
+ {{speaker_id}} "
+ {% endfor %}
+
+ {%endif%}
+
+ {%if use_multi_language%}
+ Choose a language:
+
+ {% for language_id in language_ids %}
+ {{language_id}} "
+ {% endfor %}
+
+ {%endif%}
+
+
+ {%if show_details%}
+
Model
+ Details
+ {%endif%}
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/submodules/TTS/TTS/tts/__init__.py b/submodules/TTS/TTS/tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/configs/__init__.py b/submodules/TTS/TTS/tts/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3146ac1c116cb807a81889b7a9ab223b9a051036
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/__init__.py
@@ -0,0 +1,17 @@
+import importlib
+import os
+from inspect import isclass
+
+# import all files under configs/
+# configs_dir = os.path.dirname(__file__)
+# for file in os.listdir(configs_dir):
+# path = os.path.join(configs_dir, file)
+# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
+# config_name = file[: file.find(".py")] if file.endswith(".py") else file
+# module = importlib.import_module("TTS.tts.configs." + config_name)
+# for attribute_name in dir(module):
+# attribute = getattr(module, attribute_name)
+
+# if isclass(attribute):
+# # Add the class to this package's variables
+# globals()[attribute_name] = attribute
diff --git a/submodules/TTS/TTS/tts/configs/align_tts_config.py b/submodules/TTS/TTS/tts/configs/align_tts_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..317a01af53ce26914d83610a913eb44b5836dac2
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/align_tts_config.py
@@ -0,0 +1,107 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.align_tts import AlignTTSArgs
+
+
+@dataclass
+class AlignTTSConfig(BaseTTSConfig):
+ """Defines parameters for AlignTTS model.
+ Example:
+
+ >>> from TTS.tts.configs.align_tts_config import AlignTTSConfig
+ >>> config = AlignTTSConfig()
+
+ Args:
+ model(str):
+ Model name used for selecting the right model at initialization. Defaults to `align_tts`.
+ positional_encoding (bool):
+ enable / disable positional encoding applied to the encoder output. Defaults to True.
+ hidden_channels (int):
+ Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
+ parameters. Defaults to 256.
+ hidden_channels_dp (int):
+ Number of hidden channels of the duration predictor's layers. Defaults to 256.
+ encoder_type (str):
+ Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
+ Defaults to `fftransformer`.
+ encoder_params (dict):
+ Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
+ Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
+ decoder_type (str):
+ Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
+ Defaults to `fftransformer`.
+ decoder_params (dict):
+ Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
+ Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
+ phase_start_steps (List[int]):
+ A list of number of steps required to start the next training phase. AlignTTS has 4 different training
+ phases, so you need to define 4 values to enable phase-based training. If None, it
+ trains the whole model together. Defaults to None.
+ ssim_alpha (float):
+ Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
+ dur_loss_alpha (float):
+ Weight for the duration predictor's loss. Defaults to 1.0.
+ mdn_alpha (float):
+ Weight for the MDN loss. Defaults to 1.0.
+ spec_loss_alpha (float):
+ Weight for the MSE spectrogram loss. If set <= 0, disables the spectrogram loss. Defaults to 1.0.
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+ use_d_vector_file (bool):
+ enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+ noam_schedule (bool):
+ enable / disable the use of Noam LR scheduler. Defaults to False.
+ warmup_steps (int):
+ Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+ lr (float):
+ Initial learning rate. Defaults to `1e-3`.
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-7`.
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""
+
+ model: str = "align_tts"
+ # model specific params
+ model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
+ phase_start_steps: List[int] = None
+
+ ssim_alpha: float = 1.0
+ spec_loss_alpha: float = 1.0
+ dur_loss_alpha: float = 1.0
+ mdn_alpha: float = 1.0
+
+ # multi-speaker settings
+ use_speaker_embedding: bool = False
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = None
+ lr_scheduler_params: dict = None
+ lr: float = 1e-4
+ grad_clip: float = 5.0
+
+ # overrides
+ min_seq_len: int = 13
+ max_seq_len: int = 200
+ r: int = 1
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
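
A minimal sketch of how the phase-based training switch documented above could be set, assuming the `TTS` package from this submodule is importable; the step boundaries below are illustrative values, not recommendations.

    from TTS.tts.configs.align_tts_config import AlignTTSConfig

    # Four phase boundaries enable phase-based training; leaving
    # `phase_start_steps=None` (the default) trains the whole model jointly.
    config = AlignTTSConfig(
        phase_start_steps=[0, 40000, 80000, 160000],  # illustrative values
        ssim_alpha=1.0,
        mdn_alpha=1.0,
    )
    print(config.model)  # "align_tts"
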
diff --git a/submodules/TTS/TTS/tts/configs/bark_config.py b/submodules/TTS/TTS/tts/configs/bark_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d1cd1374afe8d5f0b9e87ed81db25d7e4032af9
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/bark_config.py
@@ -0,0 +1,105 @@
+import os
+from dataclasses import dataclass, field
+from typing import Dict
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.layers.bark.model import GPTConfig
+from TTS.tts.layers.bark.model_fine import FineGPTConfig
+from TTS.tts.models.bark import BarkAudioConfig
+from TTS.utils.generic_utils import get_user_data_dir
+
+
+@dataclass
+class BarkConfig(BaseTTSConfig):
+ """Bark TTS configuration
+
+ Args:
+ model (str): model name that registers the model.
+ audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
+ num_chars (int): number of characters in the alphabet. Defaults to 0.
+ semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig().
+ fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig().
+ coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig().
+ CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
+ SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
+ SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
+ CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
+ N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
+ N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
+ COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
+ SAMPLE_RATE (int): sample rate. Defaults to 24_000.
+ USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
+ TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
+ SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
+ TEXT_PAD_TOKEN ([type]): text pad token. Defaults to 10_048.
+ TEXT_EOS_TOKEN ([type]): text end of sentence token. Defaults to 10_049.
+ TEXT_SOS_TOKEN ([type]): text start of sentence token. Defaults to 10_050.
+ SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051.
+ COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
+ COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
+ REMOTE_BASE_URL ([type]): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree".
+ REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None.
+ LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None.
+ SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None.
+ CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir().
+ DEF_SPEAKER_DIR (str): default speaker directory to store speaker values for voice cloning. Defaults to get_user_data_dir().
+ """
+
+ model: str = "bark"
+ audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
+ num_chars: int = 0
+ semantic_config: GPTConfig = field(default_factory=GPTConfig)
+ fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
+ coarse_config: GPTConfig = field(default_factory=GPTConfig)
+ CONTEXT_WINDOW_SIZE: int = 1024
+ SEMANTIC_RATE_HZ: float = 49.9
+ SEMANTIC_VOCAB_SIZE: int = 10_000
+ CODEBOOK_SIZE: int = 1024
+ N_COARSE_CODEBOOKS: int = 2
+ N_FINE_CODEBOOKS: int = 8
+ COARSE_RATE_HZ: int = 75
+ SAMPLE_RATE: int = 24_000
+ USE_SMALLER_MODELS: bool = False
+
+ TEXT_ENCODING_OFFSET: int = 10_048
+ SEMANTIC_PAD_TOKEN: int = 10_000
+ TEXT_PAD_TOKEN: int = 129_595
+ SEMANTIC_INFER_TOKEN: int = 129_599
+ COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
+ COARSE_INFER_TOKEN: int = 12_050
+
+ REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
+ REMOTE_MODEL_PATHS: Dict = None
+ LOCAL_MODEL_PATHS: Dict = None
+ SMALL_REMOTE_MODEL_PATHS: Dict = None
+ CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
+ DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))
+
+ def __post_init__(self):
+ self.REMOTE_MODEL_PATHS = {
+ "text": {
+ "path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
+ "checksum": "54afa89d65e318d4f5f80e8e8799026a",
+ },
+ "coarse": {
+ "path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
+ "checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
+ },
+ "fine": {
+ "path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
+ "checksum": "59d184ed44e3650774a2f0503a48a97b",
+ },
+ }
+ self.LOCAL_MODEL_PATHS = {
+ "text": os.path.join(self.CACHE_DIR, "text_2.pt"),
+ "coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
+ "fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
+ "hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
+ "hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
+ }
+ self.SMALL_REMOTE_MODEL_PATHS = {
+ "text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
+ "coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
+ "fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
+ }
+ self.sample_rate = self.SAMPLE_RATE # pylint: disable=attribute-defined-outside-init
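
A small sketch of how `BarkConfig.__post_init__` derives the remote and local model paths shown above, assuming the package and its torch-based Bark layers are importable:

    from TTS.tts.configs.bark_config import BarkConfig

    config = BarkConfig()
    # Remote paths are joined onto REMOTE_BASE_URL, local paths onto CACHE_DIR.
    print(config.REMOTE_MODEL_PATHS["text"]["path"])  # ends with "text_2.pt"
    print(config.LOCAL_MODEL_PATHS["coarse"])         # "<CACHE_DIR>/coarse_2.pt"
    print(config.sample_rate)                         # 24000, mirrored from SAMPLE_RATE
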
diff --git a/submodules/TTS/TTS/tts/configs/delightful_tts_config.py b/submodules/TTS/TTS/tts/configs/delightful_tts_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..805d995369e29fce7d6aa87750356b21458cd64a
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/delightful_tts_config.py
@@ -0,0 +1,170 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig
+
+
+@dataclass
+class DelightfulTTSConfig(BaseTTSConfig):
+ """
+ Configuration class for the DelightfulTTS model.
+
+ Attributes:
+ model (str): Name of the model ("delightful_tts").
+ audio (DelightfulTtsAudioConfig): Configuration for audio settings.
+ model_args (DelightfulTtsArgs): Configuration for model arguments.
+ use_attn_priors (bool): Whether to use attention priors.
+ vocoder (VocoderConfig): Configuration for the vocoder.
+ init_discriminator (bool): Whether to initialize the discriminator.
+ steps_to_start_discriminator (int): Number of steps to start the discriminator.
+ grad_clip (List[float]): Gradient clipping values.
+ lr_gen (float): Learning rate for the GAN generator.
+ lr_disc (float): Learning rate for the GAN discriminator.
+ lr_scheduler_gen (str): Name of the learning rate scheduler for the generator.
+ lr_scheduler_gen_params (dict): Parameters for the learning rate scheduler for the generator.
+ lr_scheduler_disc (str): Name of the learning rate scheduler for the discriminator.
+ lr_scheduler_disc_params (dict): Parameters for the learning rate scheduler for the discriminator.
+ scheduler_after_epoch (bool): Whether to schedule after each epoch.
+ optimizer (str): Name of the optimizer.
+ optimizer_params (dict): Parameters for the optimizer.
+ ssim_loss_alpha (float): Alpha value for the SSIM loss.
+ mel_loss_alpha (float): Alpha value for the mel loss.
+ aligner_loss_alpha (float): Alpha value for the aligner loss.
+ pitch_loss_alpha (float): Alpha value for the pitch loss.
+ energy_loss_alpha (float): Alpha value for the energy loss.
+ u_prosody_loss_alpha (float): Alpha value for the utterance prosody loss.
+ p_prosody_loss_alpha (float): Alpha value for the phoneme prosody loss.
+ dur_loss_alpha (float): Alpha value for the duration loss.
+ char_dur_loss_alpha (float): Alpha value for the character duration loss.
+ binary_align_loss_alpha (float): Alpha value for the binary alignment loss.
+ binary_loss_warmup_epochs (int): Number of warm-up epochs for the binary loss.
+ disc_loss_alpha (float): Alpha value for the discriminator loss.
+ gen_loss_alpha (float): Alpha value for the generator loss.
+ feat_loss_alpha (float): Alpha value for the feature loss.
+ vocoder_mel_loss_alpha (float): Alpha value for the vocoder mel loss.
+ multi_scale_stft_loss_alpha (float): Alpha value for the multi-scale STFT loss.
+ multi_scale_stft_loss_params (dict): Parameters for the multi-scale STFT loss.
+ return_wav (bool): Whether to return audio waveforms.
+ use_weighted_sampler (bool): Whether to use a weighted sampler.
+ weighted_sampler_attrs (dict): Attributes for the weighted sampler.
+ weighted_sampler_multipliers (dict): Multipliers for the weighted sampler.
+ r (int): Value for the `r` override.
+ compute_f0 (bool): Whether to compute F0 values.
+ f0_cache_path (str): Path to the F0 cache.
+ attn_prior_cache_path (str): Path to the attention prior cache.
+ num_speakers (int): Number of speakers.
+ use_speaker_embedding (bool): Whether to use speaker embedding.
+ speakers_file (str): Path to the speaker file.
+ speaker_embedding_channels (int): Number of channels for the speaker embedding.
+ language_ids_file (str): Path to the language IDs file.
+ """
+
+ model: str = "delightful_tts"
+
+ # model specific params
+ audio: DelightfulTtsAudioConfig = field(default_factory=DelightfulTtsAudioConfig)
+ model_args: DelightfulTtsArgs = field(default_factory=DelightfulTtsArgs)
+ use_attn_priors: bool = True
+
+ # vocoder
+ vocoder: VocoderConfig = field(default_factory=VocoderConfig)
+ init_discriminator: bool = True
+
+ # optimizer
+ steps_to_start_discriminator: int = 200000
+ grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+ lr_gen: float = 0.0002
+ lr_disc: float = 0.0002
+ lr_scheduler_gen: str = "ExponentialLR"
+ lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+ lr_scheduler_disc: str = "ExponentialLR"
+ lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+ scheduler_after_epoch: bool = True
+ optimizer: str = "AdamW"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+
+ # acoustic model loss params
+ ssim_loss_alpha: float = 1.0
+ mel_loss_alpha: float = 1.0
+ aligner_loss_alpha: float = 1.0
+ pitch_loss_alpha: float = 1.0
+ energy_loss_alpha: float = 1.0
+ u_prosody_loss_alpha: float = 0.5
+ p_prosody_loss_alpha: float = 0.5
+ dur_loss_alpha: float = 1.0
+ char_dur_loss_alpha: float = 0.01
+ binary_align_loss_alpha: float = 0.1
+ binary_loss_warmup_epochs: int = 10
+
+ # vocoder loss params
+ disc_loss_alpha: float = 1.0
+ gen_loss_alpha: float = 1.0
+ feat_loss_alpha: float = 1.0
+ vocoder_mel_loss_alpha: float = 10.0
+ multi_scale_stft_loss_alpha: float = 2.5
+ multi_scale_stft_loss_params: dict = field(
+ default_factory=lambda: {
+ "n_ffts": [1024, 2048, 512],
+ "hop_lengths": [120, 240, 50],
+ "win_lengths": [600, 1200, 240],
+ }
+ )
+
+ # data loader params
+ return_wav: bool = True
+ use_weighted_sampler: bool = False
+ weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+ weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+
+ # overrides
+ r: int = 1
+
+ # dataset configs
+ compute_f0: bool = True
+ f0_cache_path: str = None
+ attn_prior_cache_path: str = None
+
+ # multi-speaker settings
+ # use speaker embedding layer
+ num_speakers: int = 0
+ use_speaker_embedding: bool = False
+ speakers_file: str = None
+ speaker_embedding_channels: int = 256
+ language_ids_file: str = None
+ use_language_embedding: bool = False
+
+ # use d-vectors
+ use_d_vector_file: bool = False
+ d_vector_file: str = None
+ d_vector_dim: int = None
+
+ # testing
+ test_sentences: List[List[str]] = field(
+ default_factory=lambda: [
+ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+ ["Be a voice, not an echo."],
+ ["I'm sorry Dave. I'm afraid I can't do that."],
+ ["This cake is great. It's so delicious and moist."],
+ ["Prior to November 22, 1963."],
+ ]
+ )
+
+ def __post_init__(self):
+ # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+ if self.num_speakers > 0:
+ self.model_args.num_speakers = self.num_speakers
+
+ # speaker embedding settings
+ if self.use_speaker_embedding:
+ self.model_args.use_speaker_embedding = True
+ if self.speakers_file:
+ self.model_args.speakers_file = self.speakers_file
+
+ # d-vector settings
+ if self.use_d_vector_file:
+ self.model_args.use_d_vector_file = True
+ if self.d_vector_dim is not None and self.d_vector_dim > 0:
+ self.model_args.d_vector_dim = self.d_vector_dim
+ if self.d_vector_file:
+ self.model_args.d_vector_file = self.d_vector_file
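
A minimal sketch of the d-vector wiring handled in `__post_init__` above; the embedding file path is hypothetical.

    from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig

    config = DelightfulTTSConfig(
        use_d_vector_file=True,
        d_vector_file="speaker_embeddings.pth",  # hypothetical path
        d_vector_dim=256,
    )
    # __post_init__ forwards these settings to the model args.
    assert config.model_args.use_d_vector_file is True
    assert config.model_args.d_vector_dim == 256
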
diff --git a/submodules/TTS/TTS/tts/configs/fast_pitch_config.py b/submodules/TTS/TTS/tts/configs/fast_pitch_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d086d26564450c60fa04a7f3a068506f4147d3be
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/fast_pitch_config.py
@@ -0,0 +1,183 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class FastPitchConfig(BaseTTSConfig):
+ """Configure `ForwardTTS` as FastPitch model.
+
+ Example:
+
+ >>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
+ >>> config = FastPitchConfig()
+
+ Args:
+ model (str):
+ Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
+
+ base_model (str):
+ Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+ the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+ model_args (Coqpit):
+ Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
+
+ data_dep_init_steps (int):
+ Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+ Activation Normalization that pre-computes normalization stats at the beginning and uses the same values
+ for the rest of the training. Defaults to 10.
+
+ speakers_file (str):
+ Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
+ speaker names. Defaults to `None`.
+
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+
+ use_d_vector_file (bool):
+ enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+ d_vector_dim (int):
+ Dimension of the external speaker embeddings. Defaults to 0.
+
+ optimizer (str):
+ Name of the model optimizer. Defaults to `Adam`.
+
+ optimizer_params (dict):
+ Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+ lr_scheduler (str):
+ Name of the learning rate scheduler. Defaults to `Noam`.
+
+ lr_scheduler_params (dict):
+ Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+ lr (float):
+ Initial learning rate. Defaults to `1e-3`.
+
+ grad_clip (float):
+ Gradient norm clipping value. Defaults to `5.0`.
+
+ spec_loss_type (str):
+ Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+ duration_loss_type (str):
+ Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+ use_ssim_loss (bool):
+ Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-7`.
+
+ ssim_loss_alpha (float):
+ Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+ dur_loss_alpha (float):
+ Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+ spec_loss_alpha (float):
+ Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+ pitch_loss_alpha (float):
+ Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
+
+ binary_align_loss_alpha (float):
+ Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+ binary_loss_warmup_epochs (float):
+ Number of epochs to gradually increase the binary loss impact. Defaults to 150.
+
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+
+ # dataset configs
+ compute_f0 (bool):
+ Compute pitch. Defaults to True.
+
+ f0_cache_path (str):
+ Pitch cache path. Defaults to None.
+ """
+
+ model: str = "fast_pitch"
+ base_model: str = "forward_tts"
+
+ # model specific params
+ model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
+
+ # multi-speaker settings
+ num_speakers: int = 0
+ speakers_file: str = None
+ use_speaker_embedding: bool = False
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+ d_vector_dim: int = 0
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = "NoamLR"
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+ lr: float = 1e-4
+ grad_clip: float = 5.0
+
+ # loss params
+ spec_loss_type: str = "mse"
+ duration_loss_type: str = "mse"
+ use_ssim_loss: bool = True
+ ssim_loss_alpha: float = 1.0
+ spec_loss_alpha: float = 1.0
+ aligner_loss_alpha: float = 1.0
+ pitch_loss_alpha: float = 0.1
+ dur_loss_alpha: float = 0.1
+ binary_align_loss_alpha: float = 0.1
+ binary_loss_warmup_epochs: int = 150
+
+ # overrides
+ min_seq_len: int = 13
+ max_seq_len: int = 200
+ r: int = 1 # DO NOT CHANGE
+
+ # dataset configs
+ compute_f0: bool = True
+ f0_cache_path: str = None
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
+
+ def __post_init__(self):
+ # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+ if self.num_speakers > 0:
+ self.model_args.num_speakers = self.num_speakers
+
+ # speaker embedding settings
+ if self.use_speaker_embedding:
+ self.model_args.use_speaker_embedding = True
+ if self.speakers_file:
+ self.model_args.speakers_file = self.speakers_file
+
+ # d-vector settings
+ if self.use_d_vector_file:
+ self.model_args.use_d_vector_file = True
+ if self.d_vector_dim is not None and self.d_vector_dim > 0:
+ self.model_args.d_vector_dim = self.d_vector_dim
+ if self.d_vector_file:
+ self.model_args.d_vector_file = self.d_vector_file
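
A minimal multi-speaker sketch for `FastPitchConfig`; the speaker file name is hypothetical, and `__post_init__` copies the values into `model_args` as shown above.

    from TTS.tts.configs.fast_pitch_config import FastPitchConfig

    config = FastPitchConfig(
        num_speakers=4,
        use_speaker_embedding=True,
        speakers_file="speakers.json",  # hypothetical path
    )
    assert config.model_args.num_speakers == 4
    assert config.model_args.use_speaker_embedding is True
    assert config.model_args.speakers_file == "speakers.json"
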
diff --git a/submodules/TTS/TTS/tts/configs/fast_speech_config.py b/submodules/TTS/TTS/tts/configs/fast_speech_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..af6c2db6faf55ee2b15047fff86281d42dab1b87
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/fast_speech_config.py
@@ -0,0 +1,177 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class FastSpeechConfig(BaseTTSConfig):
+ """Configure `ForwardTTS` as FastSpeech model.
+
+ Example:
+
+ >>> from TTS.tts.configs.fast_speech_config import FastSpeechConfig
+ >>> config = FastSpeechConfig()
+
+ Args:
+ model (str):
+ Model name used for selecting the right model at initialization. Defaults to `fast_speech`.
+
+ base_model (str):
+ Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+ the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+ model_args (Coqpit):
+ Model class arguments. Check `FastSpeechArgs` for more details. Defaults to `FastSpeechArgs()`.
+
+ data_dep_init_steps (int):
+ Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+ Activation Normalization that pre-computes normalization stats at the beginning and uses the same values
+ for the rest of the training. Defaults to 10.
+
+ speakers_file (str):
+ Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
+ speaker names. Defaults to `None`.
+
+
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+
+ use_d_vector_file (bool):
+ enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+ d_vector_dim (int):
+ Dimension of the external speaker embeddings. Defaults to 0.
+
+ optimizer (str):
+ Name of the model optimizer. Defaults to `Adam`.
+
+ optimizer_params (dict):
+ Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+ lr_scheduler (str):
+ Name of the learning rate scheduler. Defaults to `Noam`.
+
+ lr_scheduler_params (dict):
+ Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+ lr (float):
+ Initial learning rate. Defaults to `1e-3`.
+
+ grad_clip (float):
+ Gradient norm clipping value. Defaults to `5.0`.
+
+ spec_loss_type (str):
+ Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+ duration_loss_type (str):
+ Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+ use_ssim_loss (bool):
+ Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-7`.
+
+ ssim_loss_alpha (float):
+ Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+ dur_loss_alpha (float):
+ Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+ spec_loss_alpha (float):
+ Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+ pitch_loss_alpha (float):
+ Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
+
+ binary_align_loss_alpha (float):
+ Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+ binary_loss_warmup_epochs (float):
+ Number of epochs to gradually increase the binary loss impact. Defaults to 150.
+
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+ """
+
+ model: str = "fast_speech"
+ base_model: str = "forward_tts"
+
+ # model specific params
+ model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
+
+ # multi-speaker settings
+ num_speakers: int = 0
+ speakers_file: str = None
+ use_speaker_embedding: bool = False
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+ d_vector_dim: int = 0
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = "NoamLR"
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+ lr: float = 1e-4
+ grad_clip: float = 5.0
+
+ # loss params
+ spec_loss_type: str = "mse"
+ duration_loss_type: str = "mse"
+ use_ssim_loss: bool = True
+ ssim_loss_alpha: float = 1.0
+ dur_loss_alpha: float = 1.0
+ spec_loss_alpha: float = 1.0
+ pitch_loss_alpha: float = 0.0
+ aligner_loss_alpha: float = 1.0
+ binary_align_loss_alpha: float = 1.0
+ binary_loss_warmup_epochs: int = 150
+
+ # overrides
+ min_seq_len: int = 13
+ max_seq_len: int = 200
+ r: int = 1 # DO NOT CHANGE
+
+ # dataset configs
+ compute_f0: bool = False
+ f0_cache_path: str = None
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
+
+ def __post_init__(self):
+ # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+ if self.num_speakers > 0:
+ self.model_args.num_speakers = self.num_speakers
+
+ # speaker embedding settings
+ if self.use_speaker_embedding:
+ self.model_args.use_speaker_embedding = True
+ if self.speakers_file:
+ self.model_args.speakers_file = self.speakers_file
+
+ # d-vector settings
+ if self.use_d_vector_file:
+ self.model_args.use_d_vector_file = True
+ if self.d_vector_dim is not None and self.d_vector_dim > 0:
+ self.model_args.d_vector_dim = self.d_vector_dim
+ if self.d_vector_file:
+ self.model_args.d_vector_file = self.d_vector_file
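
A short sketch contrasting FastSpeech with FastPitch: the default `model_args` disable the pitch predictor and F0 extraction is off in the data loader. Assumes the package is importable.

    from TTS.tts.configs.fast_speech_config import FastSpeechConfig

    config = FastSpeechConfig()
    print(config.model_args.use_pitch)  # False: no pitch predictor
    print(config.compute_f0)            # False: data loader skips F0 extraction
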
diff --git a/submodules/TTS/TTS/tts/configs/fastspeech2_config.py b/submodules/TTS/TTS/tts/configs/fastspeech2_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d179617fb034fff269355ce7e3d78b67db90aacd
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/fastspeech2_config.py
@@ -0,0 +1,198 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class Fastspeech2Config(BaseTTSConfig):
+ """Configure `ForwardTTS` as FastPitch model.
+
+ Example:
+
+ >>> from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
+ >>> config = Fastspeech2Config()
+
+ Args:
+ model (str):
+ Model name used for selecting the right model at initialization. Defaults to `fastspeech2`.
+
+ base_model (str):
+ Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+ the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+ model_args (Coqpit):
+ Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
+
+ data_dep_init_steps (int):
+ Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+ Activation Normalization that pre-computes normalization stats at the beginning and uses the same values
+ for the rest of the training. Defaults to 10.
+
+ speakers_file (str):
+ Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
+ speaker names. Defaults to `None`.
+
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+
+ use_d_vector_file (bool):
+ enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+ d_vector_dim (int):
+ Dimension of the external speaker embeddings. Defaults to 0.
+
+ optimizer (str):
+ Name of the model optimizer. Defaults to `Adam`.
+
+ optimizer_params (dict):
+ Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+ lr_scheduler (str):
+ Name of the learning rate scheduler. Defaults to `Noam`.
+
+ lr_scheduler_params (dict):
+ Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+ lr (float):
+ Initial learning rate. Defaults to `1e-3`.
+
+ grad_clip (float):
+ Gradient norm clipping value. Defaults to `5.0`.
+
+ spec_loss_type (str):
+ Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+ duration_loss_type (str):
+ Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+ use_ssim_loss (bool):
+ Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-7`.
+
+ ssim_loss_alpha (float):
+ Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+ dur_loss_alpha (float):
+ Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
+
+ spec_loss_alpha (float):
+ Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+ pitch_loss_alpha (float):
+ Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
+
+ energy_loss_alpha (float):
+ Weight for the energy predictor's loss. If set 0, disables the energy predictor. Defaults to 1.0.
+
+ binary_align_loss_alpha (float):
+ Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
+
+ binary_loss_warmup_epochs (float):
+ Number of epochs to gradually increase the binary loss impact. Defaults to 150.
+
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+
+ # dataset configs
+ compute_f0 (bool):
+ Compute pitch. Defaults to True.
+
+ f0_cache_path (str):
+ Pitch cache path. Defaults to None.
+
+ # dataset configs
+ compute_energy (bool):
+ Compute energy. Defaults to True.
+
+ energy_cache_path (str):
+ Energy cache path. Defaults to None.
+ """
+
+ model: str = "fastspeech2"
+ base_model: str = "forward_tts"
+
+ # model specific params
+ model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
+
+ # multi-speaker settings
+ num_speakers: int = 0
+ speakers_file: str = None
+ use_speaker_embedding: bool = False
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+ d_vector_dim: int = 0
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = "NoamLR"
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+ lr: float = 1e-4
+ grad_clip: float = 5.0
+
+ # loss params
+ spec_loss_type: str = "mse"
+ duration_loss_type: str = "mse"
+ use_ssim_loss: bool = True
+ ssim_loss_alpha: float = 1.0
+ spec_loss_alpha: float = 1.0
+ aligner_loss_alpha: float = 1.0
+ pitch_loss_alpha: float = 0.1
+ energy_loss_alpha: float = 0.1
+ dur_loss_alpha: float = 0.1
+ binary_align_loss_alpha: float = 0.1
+ binary_loss_warmup_epochs: int = 150
+
+ # overrides
+ min_seq_len: int = 13
+ max_seq_len: int = 200
+ r: int = 1 # DO NOT CHANGE
+
+ # dataset configs
+ compute_f0: bool = True
+ f0_cache_path: str = None
+
+ # dataset configs
+ compute_energy: bool = True
+ energy_cache_path: str = None
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
+
+ def __post_init__(self):
+ # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+ if self.num_speakers > 0:
+ self.model_args.num_speakers = self.num_speakers
+
+ # speaker embedding settings
+ if self.use_speaker_embedding:
+ self.model_args.use_speaker_embedding = True
+ if self.speakers_file:
+ self.model_args.speakers_file = self.speakers_file
+
+ # d-vector settings
+ if self.use_d_vector_file:
+ self.model_args.use_d_vector_file = True
+ if self.d_vector_dim is not None and self.d_vector_dim > 0:
+ self.model_args.d_vector_dim = self.d_vector_dim
+ if self.d_vector_file:
+ self.model_args.d_vector_file = self.d_vector_file
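
A short sketch of the FastSpeech2 defaults: both the pitch and energy predictors are enabled, so the dataset side computes F0 and energy. Assumes the package is importable.

    from TTS.tts.configs.fastspeech2_config import Fastspeech2Config

    config = Fastspeech2Config()
    print(config.model_args.use_pitch, config.model_args.use_energy)  # True True
    print(config.compute_f0, config.compute_energy)                   # True True
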
diff --git a/submodules/TTS/TTS/tts/configs/glow_tts_config.py b/submodules/TTS/TTS/tts/configs/glow_tts_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..f42f3e5a510bacf1b2312ccea7d46201bbcb774f
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/glow_tts_config.py
@@ -0,0 +1,182 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+
+
+@dataclass
+class GlowTTSConfig(BaseTTSConfig):
+ """Defines parameters for GlowTTS model.
+
+ Example:
+
+ >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
+ >>> config = GlowTTSConfig()
+
+ Args:
+ model(str):
+ Model name used for selecting the right model at initialization. Defaults to `glow_tts`.
+ encoder_type (str):
+ Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
+ Defaults to `rel_pos_transformers`.
+ encoder_params (dict):
+ Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
+ Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
+ use_encoder_prenet (bool):
+ enable / disable the use of a prenet for the encoder. Defaults to True.
+ hidden_channels_enc (int):
+ Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
+ and for some encoder types internal hidden channels sizes too. Defaults to 192.
+ hidden_channels_dec (int):
+ Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
+ hidden_channels_dp (int):
+ Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
+ mean_only (bool):
+ If true predict only the mean values by the decoder flow. Defaults to True.
+ out_channels (int):
+ Number of channels of the model output tensor. Defaults to 80.
+ num_flow_blocks_dec (int):
+ Number of decoder blocks. Defaults to 12.
+ inference_noise_scale (float):
+ Noise scale used at inference. Defaults to 0.33.
+ kernel_size_dec (int):
+ Decoder kernel size. Defaults to 5
+ dilation_rate (int):
+ Rate to increase dilation by each layer in a decoder block. Defaults to 1.
+ num_block_layers (int):
+ Number of decoder layers in each decoder block. Defaults to 4.
+ dropout_p_dec (float):
+ Dropout rate for decoder. Defaults to 0.1.
+ num_speakers (int):
+ Number of speakers used to define the size of the speaker embedding layer. Defaults to 0.
+ c_in_channels (int):
+ Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
+ num_splits (int):
+ Number of split levels in the invertible conv1x1 operation. Defaults to 4.
+ num_squeeze (int):
+ Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps is
+ reduced by the factor `num_squeeze`. Defaults to 2.
+ sigmoid_scale (bool):
+ enable/disable sigmoid scaling in decoder. Defaults to False.
+ mean_only (bool):
+ If True, the encoder only computes the mean value and uses a constant variance for each time step. Defaults to True.
+ encoder_type (str):
+ Encoder module type. Possible values are`["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`
+ Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformers` as in the original paper.
+ encoder_params (dict):
+ Encoder module parameters. Defaults to None.
+ d_vector_dim (int):
+ Channels of external speaker embedding vectors. Defaults to 0.
+ data_dep_init_steps (int):
+ Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+ Activation Normalization that pre-computes normalization stats at the beginning and uses the same values
+ for the rest of the training. Defaults to 10.
+ style_wav_for_test (str):
+ Path to the wav file used for changing the style of the speech. Defaults to None.
+ inference_noise_scale (float):
+ Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0.
+ length_scale (float):
+ Multiply the predicted durations with this value to change the speech speed. Defaults to 1.
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+ use_d_vector_file (bool):
+ enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+ noam_schedule (bool):
+ enable / disable the use of Noam LR scheduler. Defaults to False.
+ warmup_steps (int):
+ Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+ lr (float):
+ Initial learning rate. Defaults to `1e-3`.
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-7`.
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+ """
+
+ model: str = "glow_tts"
+
+ # model params
+ num_chars: int = None
+ encoder_type: str = "rel_pos_transformer"
+ encoder_params: dict = field(
+ default_factory=lambda: {
+ "kernel_size": 3,
+ "dropout_p": 0.1,
+ "num_layers": 6,
+ "num_heads": 2,
+ "hidden_channels_ffn": 768,
+ }
+ )
+ use_encoder_prenet: bool = True
+ hidden_channels_enc: int = 192
+ hidden_channels_dec: int = 192
+ hidden_channels_dp: int = 256
+ dropout_p_dp: float = 0.1
+ dropout_p_dec: float = 0.05
+ mean_only: bool = True
+ out_channels: int = 80
+ num_flow_blocks_dec: int = 12
+ inference_noise_scale: float = 0.33
+ kernel_size_dec: int = 5
+ dilation_rate: int = 1
+ num_block_layers: int = 4
+ num_speakers: int = 0
+ c_in_channels: int = 0
+ num_splits: int = 4
+ num_squeeze: int = 2
+ sigmoid_scale: bool = False
+ encoder_type: str = "rel_pos_transformer"
+ encoder_params: dict = field(
+ default_factory=lambda: {
+ "kernel_size": 3,
+ "dropout_p": 0.1,
+ "num_layers": 6,
+ "num_heads": 2,
+ "hidden_channels_ffn": 768,
+ "input_length": None,
+ }
+ )
+ d_vector_dim: int = 0
+
+ # training params
+ data_dep_init_steps: int = 10
+
+ # inference params
+ style_wav_for_test: str = None
+ inference_noise_scale: float = 0.0
+ length_scale: float = 1.0
+
+ # multi-speaker settings
+ use_speaker_embedding: bool = False
+ speakers_file: str = None
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+
+ # optimizer parameters
+ optimizer: str = "RAdam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = "NoamLR"
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+ grad_clip: float = 5.0
+ lr: float = 1e-3
+
+ # overrides
+ min_seq_len: int = 3
+ max_seq_len: int = 500
+ r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it.
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
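
A minimal sketch overriding the relative-position transformer encoder described above; the smaller layer count is illustrative only.

    from TTS.tts.configs.glow_tts_config import GlowTTSConfig

    config = GlowTTSConfig(
        encoder_type="rel_pos_transformer",
        encoder_params={
            "kernel_size": 3,
            "dropout_p": 0.1,
            "num_layers": 4,  # illustrative: smaller than the default 6 layers
            "num_heads": 2,
            "hidden_channels_ffn": 768,
            "input_length": None,
        },
        inference_noise_scale=0.33,
    )
    print(config.mean_only)  # True
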
diff --git a/submodules/TTS/TTS/tts/configs/neuralhmm_tts_config.py b/submodules/TTS/TTS/tts/configs/neuralhmm_tts_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..50f72847ed3e1c7089915ef8fd77ae5775c5b260
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/neuralhmm_tts_config.py
@@ -0,0 +1,170 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+
+
+@dataclass
+class NeuralhmmTTSConfig(BaseTTSConfig):
+ """
+ Define parameters for Neural HMM TTS model.
+
+ Example:
+
+ >>> from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
+ >>> config = NeuralhmmTTSConfig()
+
+ Args:
+ model (str):
+ Model name used to select the right model class to initialize. Defaults to `NeuralHMM_TTS`.
+ run_eval_steps (int):
+ Run an evaluation epoch after N steps. If None, waits until the training epoch is completed. Defaults to None.
+ save_step (int):
+ Save local checkpoint every save_step steps. Defaults to 500.
+ plot_step (int):
+ Plot training stats on the logger every plot_step steps. Defaults to 1.
+ model_param_stats (bool):
+ Log model parameters stats on the logger dashboard. Defaults to False.
+ force_generate_statistics (bool):
+ Force generate mel normalization statistics. Defaults to False.
+ mel_statistics_parameter_path (str):
+ Path to the mel normalization statistics. If the model does not find a file there, it will generate the statistics.
+ Defaults to None.
+ num_chars (int):
+ Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
+ state_per_phone (int):
+ Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but here it is upsampled by the model's encoder. Defaults to 2.
+ encoder_in_out_features (int):
+ Channels of encoder input and character embedding tensors. Defaults to 512.
+ encoder_n_convolutions (int):
+ Number of convolution layers in the encoder. Defaults to 3.
+ out_channels (int):
+ Channels of the final model output. It must match the spectrogram size. Defaults to 80.
+ ar_order (int):
+ Autoregressive order of the model. Defaults to 1. Ablations of the Neural HMM found that higher autoregressive orders add variation but hurt the naturalness of the synthesised audio.
+ sampling_temp (float):
+ Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
+ deterministic_transition (bool):
+ Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016". Defaults to True.
+ duration_threshold (float):
+ Threshold for the duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis; lower values give a slower speaking rate and higher values a faster one.
+ use_grad_checkpointing (bool):
+ Use gradient checkpointing to save memory. In a multi-GPU setting, PyTorch currently does not support gradient checkpointing inside a loop, so it has to be turned off there. Adjust depending on whichever gives the larger batch size, a single GPU or multiple GPUs. Defaults to True.
+ max_sampling_time (int):
+ Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
+ prenet_type (str):
+ `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
+ Prenet. Defaults to `original`.
+ prenet_dim (int):
+ Dimension of the Prenet. Defaults to 256.
+ prenet_n_layers (int):
+ Number of layers in the Prenet. Defaults to 2.
+ prenet_dropout (float):
+ Dropout rate of the Prenet. Defaults to 0.5.
+ prenet_dropout_at_inference (bool):
+ Use dropout at inference time. Defaults to False.
+ memory_rnn_dim (int):
+ Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
+ outputnet_size (list[int]):
+ Size of the output network inside the neural HMM. Defaults to [1024].
+ flat_start_params (dict):
+ Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
+ It will be recomputed when you pass the dataset.
+ std_floor (float):
+ Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
+ It is called `variance flooring` in standard HMM literature.
+ optimizer (str):
+ Optimizer to use for training. Defaults to `adam`.
+ optimizer_params (dict):
+ Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
+ grad_clip (float):
+ Gradient clipping threshold. Defaults to 40_000.
+ lr (float):
+ Learning rate. Defaults to 1e-3.
+ lr_scheduler (str):
+ Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
+ `TTS.utils.training`. Defaults to `None`.
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+ """
+
+ model: str = "NeuralHMM_TTS"
+
+ # Training and Checkpoint configs
+ run_eval_steps: int = 100
+ save_step: int = 500
+ plot_step: int = 1
+ model_param_stats: bool = False
+
+ # data parameters
+ force_generate_statistics: bool = False
+ mel_statistics_parameter_path: str = None
+
+ # Encoder parameters
+ num_chars: int = None
+ state_per_phone: int = 2
+ encoder_in_out_features: int = 512
+ encoder_n_convolutions: int = 3
+
+ # HMM parameters
+ out_channels: int = 80
+ ar_order: int = 1
+ sampling_temp: float = 0
+ deterministic_transition: bool = True
+ duration_threshold: float = 0.43
+ use_grad_checkpointing: bool = True
+ max_sampling_time: int = 1000
+
+ ## Prenet parameters
+ prenet_type: str = "original"
+ prenet_dim: int = 256
+ prenet_n_layers: int = 2
+ prenet_dropout: float = 0.5
+ prenet_dropout_at_inference: bool = True
+ memory_rnn_dim: int = 1024
+
+ ## Outputnet parameters
+ outputnet_size: List[int] = field(default_factory=lambda: [1024])
+ flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
+ std_floor: float = 0.001
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
+ grad_clip: float = 40000.0
+ lr: float = 1e-3
+ lr_scheduler: str = None
+
+ # overrides
+ min_text_len: int = 10
+ max_text_len: int = 500
+ min_audio_len: int = 512
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "Be a voice, not an echo.",
+ ]
+ )
+
+ # Extra needed config
+ r: int = 1
+ use_d_vector_file: bool = False
+ use_speaker_embedding: bool = False
+
+ def check_values(self):
+ """Validate the hyperparameters.
+
+ Raises:
+ AssertionError: when the parameters network is not defined
+ AssertionError: transition probability is not between 0 and 1
+ """
+ assert self.ar_order > 0, "AR order must be greater than 0; the model is autoregressive."
+ assert (
+ len(self.outputnet_size) >= 1
+ ), f"Parameter network must have at least one layer; check the config file. Provided: {self.outputnet_size}"
+ assert (
+ 0 < self.flat_start_params["transition_p"] < 1
+ ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
diff --git a/submodules/TTS/TTS/tts/configs/overflow_config.py b/submodules/TTS/TTS/tts/configs/overflow_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3e5548b8f62f76c88acca85d19e2cee8687ebd
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/overflow_config.py
@@ -0,0 +1,201 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+
+
+@dataclass
+class OverflowConfig(BaseTTSConfig): # The classname has to be camel case
+ """
+ Define parameters for OverFlow model.
+
+ Example:
+
+ >>> from TTS.tts.configs.overflow_config import OverflowConfig
+ >>> config = OverflowConfig()
+
+ Args:
+ model (str):
+ Model name used to select the right model class to initialize. Defaults to `Overflow`.
+ run_eval_steps (int):
+ Run an evaluation epoch after N steps. If None, waits until the training epoch is completed. Defaults to None.
+ save_step (int):
+ Save local checkpoint every save_step steps. Defaults to 500.
+ plot_step (int):
+ Plot training stats on the logger every plot_step steps. Defaults to 1.
+ model_param_stats (bool):
+ Log model parameters stats on the logger dashboard. Defaults to False.
+ force_generate_statistics (bool):
+ Force generate mel normalization statistics. Defaults to False.
+ mel_statistics_parameter_path (str):
+ Path to the mel normalization statistics. If the model does not find a file there, it will generate the statistics.
+ Defaults to None.
+ num_chars (int):
+ Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
+ state_per_phone (int):
+ Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but in OverFlow it is upsampled by the model's encoder. Defaults to 2.
+ encoder_in_out_features (int):
+ Channels of encoder input and character embedding tensors. Defaults to 512.
+ encoder_n_convolutions (int):
+ Number of convolution layers in the encoder. Defaults to 3.
+ out_channels (int):
+ Channels of the final model output. It must match the spectrogram size. Defaults to 80.
+ ar_order (int):
+ Autoregressive order of the model. Defaults to 1. Ablations of the Neural HMM found that higher autoregressive orders add variation but hurt the naturalness of the synthesised audio.
+ sampling_temp (float):
+ Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
+ deterministic_transition (bool):
+ Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016". Defaults to True.
+ duration_threshold (float):
+ Threshold for the duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis; lower values give a slower speaking rate and higher values a faster one.
+ use_grad_checkpointing (bool):
+ Use gradient checkpointing to save memory. In a multi-GPU setting, PyTorch currently does not support gradient checkpointing inside a loop, so it has to be turned off there. Adjust depending on whichever gives the larger batch size, a single GPU or multiple GPUs. Defaults to True.
+ max_sampling_time (int):
+ Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
+ prenet_type (str):
+ `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
+ Prenet. Defaults to `original`.
+ prenet_dim (int):
+ Dimension of the Prenet. Defaults to 256.
+ prenet_n_layers (int):
+ Number of layers in the Prenet. Defaults to 2.
+ prenet_dropout (float):
+ Dropout rate of the Prenet. Defaults to 0.5.
+ prenet_dropout_at_inference (bool):
+ Use dropout at inference time. Defaults to False.
+ memory_rnn_dim (int):
+ Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
+ outputnet_size (list[int]):
+ Size of the output network inside the neural HMM. Defaults to [1024].
+ flat_start_params (dict):
+ Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
+ It will be recomputed when you pass the dataset.
+ std_floor (float):
+ Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
+ It is called `variance flooring` in standard HMM literature.
+ hidden_channels_dec (int):
+ Number of base hidden channels used by the decoder WaveNet network. Defaults to 150.
+ kernel_size_dec (int):
+ Decoder kernel size. Defaults to 5
+ dilation_rate (int):
+ Rate to increase dilation by each layer in a decoder block. Defaults to 1.
+ num_flow_blocks_dec (int):
+ Number of decoder layers in each decoder block. Defaults to 4.
+ dropout_p_dec (float):
+ Dropout rate of the decoder. Defaults to 0.05.
+ num_splits (int):
+ Number of split levels in the invertible conv1x1 operation. Defaults to 4.
+ num_squeeze (int):
+ Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps is
+ reduced by the factor `num_squeeze`. Defaults to 2.
+ sigmoid_scale (bool):
+ enable/disable sigmoid scaling in decoder. Defaults to False.
+ c_in_channels (int):
+ Unused parameter from GlowTTS's decoder. Defaults to 0.
+ optimizer (str):
+ Optimizer to use for training. Defaults to `adam`.
+ optimizer_params (dict):
+ Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
+ grad_clip (float):
+ Gradient clipping threshold. Defaults to 40_000.
+ lr (float):
+ Learning rate. Defaults to 1e-3.
+ lr_scheduler (str):
+ Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
+ `TTS.utils.training`. Defaults to `None`.
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+ """
+
+ model: str = "Overflow"
+
+ # Training and Checkpoint configs
+ run_eval_steps: int = 100
+ save_step: int = 500
+ plot_step: int = 1
+ model_param_stats: bool = False
+
+ # data parameters
+ force_generate_statistics: bool = False
+ mel_statistics_parameter_path: str = None
+
+ # Encoder parameters
+ num_chars: int = None
+ state_per_phone: int = 2
+ encoder_in_out_features: int = 512
+ encoder_n_convolutions: int = 3
+
+ # HMM parameters
+ out_channels: int = 80
+ ar_order: int = 1
+ sampling_temp: float = 0.334
+ deterministic_transition: bool = True
+ duration_threshold: float = 0.55
+ use_grad_checkpointing: bool = True
+ max_sampling_time: int = 1000
+
+ ## Prenet parameters
+ prenet_type: str = "original"
+ prenet_dim: int = 256
+ prenet_n_layers: int = 2
+ prenet_dropout: float = 0.5
+ prenet_dropout_at_inference: bool = False
+ memory_rnn_dim: int = 1024
+
+ ## Outputnet parameters
+ outputnet_size: List[int] = field(default_factory=lambda: [1024])
+ flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
+ std_floor: float = 0.01
+
+ # Decoder parameters
+ hidden_channels_dec: int = 150
+ kernel_size_dec: int = 5
+ dilation_rate: int = 1
+ num_flow_blocks_dec: int = 12
+ num_block_layers: int = 4
+ dropout_p_dec: float = 0.05
+ num_splits: int = 4
+ num_squeeze: int = 2
+ sigmoid_scale: bool = False
+ c_in_channels: int = 0
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
+ grad_clip: float = 40000.0
+ lr: float = 1e-3
+ lr_scheduler: str = None
+
+ # overrides
+ min_text_len: int = 10
+ max_text_len: int = 500
+ min_audio_len: int = 512
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "Be a voice, not an echo.",
+ ]
+ )
+
+ # Extra needed config
+ r: int = 1
+ use_d_vector_file: bool = False
+ use_speaker_embedding: bool = False
+
+ def check_values(self):
+ """Validate the hyperparameters.
+
+ Raises:
+ AssertionError: when the parameters network is not defined
+ AssertionError: transition probability is not between 0 and 1
+ """
+ assert self.ar_order > 0, "AR order must be greater than 0 since it is an autoregressive model."
+ assert (
+ len(self.outputnet_size) >= 1
+ ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
+ assert (
+ 0 < self.flat_start_params["transition_p"] < 1
+ ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
diff --git a/submodules/TTS/TTS/tts/configs/shared_configs.py b/submodules/TTS/TTS/tts/configs/shared_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf17322c190bb234d4e27c6196e53b276fb5f09d
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/shared_configs.py
@@ -0,0 +1,344 @@
+from dataclasses import asdict, dataclass, field
+from typing import Dict, List
+
+from coqpit import Coqpit, check_argument
+
+from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
+
+
+@dataclass
+class GSTConfig(Coqpit):
+ """Defines the Global Style Token Module
+
+ Args:
+ gst_style_input_wav (str):
+ Path to the wav file used to define the style of the output speech at inference. Defaults to None.
+
+ gst_style_input_weights (dict):
+ Defines the weights for each style token used at inference. Defaults to None.
+
+ gst_embedding_dim (int):
+ Defines the size of the GST embedding vector dimensions. Defaults to 256.
+
+ gst_num_heads (int):
+ Number of attention heads used by the multi-head attention. Defaults to 4.
+
+ gst_num_style_tokens (int):
+ Number of style token vectors. Defaults to 10.
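+
+ Example (a minimal construction sketch; the values below are illustrative):
+
+ >>> from TTS.tts.configs.shared_configs import GSTConfig
+ >>> config = GSTConfig(gst_num_heads=4, gst_num_style_tokens=10)
+ >>> config.check_values()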
+ """
+
+ gst_style_input_wav: str = None
+ gst_style_input_weights: dict = None
+ gst_embedding_dim: int = 256
+ gst_use_speaker_embedding: bool = False
+ gst_num_heads: int = 4
+ gst_num_style_tokens: int = 10
+
+ def check_values(
+ self,
+ ):
+ """Check config fields"""
+ c = asdict(self)
+ super().check_values()
+ check_argument("gst_style_input_weights", c, restricted=False)
+ check_argument("gst_style_input_wav", c, restricted=False)
+ check_argument("gst_embedding_dim", c, restricted=True, min_val=0, max_val=1000)
+ check_argument("gst_use_speaker_embedding", c, restricted=False)
+ check_argument("gst_num_heads", c, restricted=True, min_val=2, max_val=10)
+ check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000)
+
+
+@dataclass
+class CapacitronVAEConfig(Coqpit):
+ """Defines the capacitron VAE Module
+ Args:
+ capacitron_capacity (int):
+ Defines the variational capacity limit of the prosody embeddings. Defaults to 150.
+ capacitron_VAE_embedding_dim (int):
+ Defines the size of the Capacitron embedding vector dimension. Defaults to 128.
+ capacitron_use_text_summary_embeddings (bool):
+ If True, use a text summary embedding in Capacitron. Defaults to True.
+ capacitron_text_summary_embedding_dim (int):
+ Defines the size of the capacitron text embedding vector dimension. Defaults to 128.
+ capacitron_use_speaker_embedding (bool):
+ if True use speaker embeddings in Capacitron. Defaults to False.
+ capacitron_VAE_loss_alpha (float):
+ Weight for the VAE loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ capacitron_grad_clip (float):
+ Gradient clipping value for all gradients except beta. Defaults to 5.0
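+
+ Example (a minimal construction sketch; the values below are illustrative):
+
+ >>> from TTS.tts.configs.shared_configs import CapacitronVAEConfig
+ >>> config = CapacitronVAEConfig(capacitron_capacity=150, capacitron_VAE_embedding_dim=128)
+ >>> config.check_values()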
+ """
+
+ capacitron_loss_alpha: int = 1
+ capacitron_capacity: int = 150
+ capacitron_VAE_embedding_dim: int = 128
+ capacitron_use_text_summary_embeddings: bool = True
+ capacitron_text_summary_embedding_dim: int = 128
+ capacitron_use_speaker_embedding: bool = False
+ capacitron_VAE_loss_alpha: float = 0.25
+ capacitron_grad_clip: float = 5.0
+
+ def check_values(
+ self,
+ ):
+ """Check config fields"""
+ c = asdict(self)
+ super().check_values()
+ check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500)
+ check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024)
+ check_argument("capacitron_use_speaker_embedding", c, restricted=False)
+ check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512)
+ check_argument("capacitron_VAE_loss_alpha", c, restricted=False)
+ check_argument("capacitron_grad_clip", c, restricted=False)
+
+
+@dataclass
+class CharactersConfig(Coqpit):
+ """Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.
+
+ Args:
+ characters_class (str):
+ Defines the class of the characters used. If None, we pick ```Phonemes``` or ```Graphemes``` based on
+ the configuration. Defaults to None.
+
+ vocab_dict (dict):
+ Defines the vocabulary dictionary used to encode the characters. Defaults to None.
+
+ pad (str):
+ Character used in place of empty padding. Defaults to None.
+
+ eos (str):
+ Character marking the end of a sentence. Defaults to None.
+
+ bos (str):
+ Character marking the beginning of a sentence. Defaults to None.
+
+ blank (str):
+ Optional character used between characters by some models for better prosody. Defaults to `_blank`.
+
+ characters (str):
+ character set used by the model. Characters not in this list are ignored when converting input text to
+ a list of sequence IDs. Defaults to None.
+
+ punctuations (str):
+ Characters considered as punctuation when parsing the input sentence. Defaults to None.
+
+ phonemes (str):
+ Characters considered as phonemes. This is only for backwards compatibility. Use `characters` for new
+ models. Defaults to None.
+
+ is_unique (bool):
+ Remove any duplicate characters from the character lists. It is a band-aid for compatibility with old
+ models trained with character lists that contain duplicates. Defaults to True.
+
+ is_sorted (bool):
+ Sort the characters in alphabetical order. Defaults to True.
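+
+ Example (a minimal sketch; the character set below is a placeholder, not a recommended vocabulary):
+
+ >>> from TTS.tts.configs.shared_configs import CharactersConfig
+ >>> config = CharactersConfig(
+ ... pad="<PAD>", eos="<EOS>", bos="<BOS>", blank="<BLNK>", characters="abcdefghijklmnopqrstuvwxyz", punctuations="!,.? "
+ ... )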
+ """
+
+ characters_class: str = None
+
+ # using BaseVocabulary
+ vocab_dict: Dict = None
+
+ # using on BaseCharacters
+ pad: str = None
+ eos: str = None
+ bos: str = None
+ blank: str = None
+ characters: str = None
+ punctuations: str = None
+ phonemes: str = None
+ is_unique: bool = True # for backwards compatibility of models trained with char sets with duplicates
+ is_sorted: bool = True
+
+
+@dataclass
+class BaseTTSConfig(BaseTrainingConfig):
+ """Shared parameters among all the tts models.
+
+ Args:
+
+ audio (BaseAudioConfig):
+ Audio processor config object instance.
+
+ use_phonemes (bool):
+ enable / disable phoneme use.
+
+ phonemizer (str):
+ Name of the phonemizer to use. If set to None, the phonemizer will be selected by `phoneme_language`.
+ Defaults to None.
+
+ phoneme_language (str):
+ Language code for the phonemizer. You can check the list of supported languages by running
+ `python TTS/tts/utils/text/phonemizers/__init__.py`. Defaults to None.
+
+ compute_input_seq_cache (bool):
+ enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
+ the training, it allows faster data loading and precise length limits with `max_seq_len` and
+ `min_seq_len`.
+
+ text_cleaner (str):
+ Name of the text cleaner used for cleaning and formatting transcripts.
+
+ enable_eos_bos_chars (bool):
+ enable / disable the use of eos and bos characters.
+
+ test_sentences_file (str):
+ Path to a txt file that has sentences used at test time. The file must have a sentence per line.
+
+ phoneme_cache_path (str):
+ Path to the output folder caching the computed phonemes for each sample.
+
+ characters (CharactersConfig):
+ Instance of a CharactersConfig class.
+
+ batch_group_size (int):
+ Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
+ length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
+ prevent using the same batches for each epoch.
+
+ loss_masking (bool):
+ enable / disable masking loss values against padded segments of samples in a batch.
+
+ min_text_len (int):
+ Minimum length of input text to be used. All shorter samples will be ignored. Defaults to 1.
+
+ max_text_len (int):
+ Maximum length of input text to be used. All longer samples will be ignored. Defaults to float("inf").
+
+ min_audio_len (int):
+ Minimum length of input audio to be used. All shorter samples will be ignored. Defaults to 1.
+
+ max_audio_len (int):
+ Maximum length of input audio to be used. All longer samples will be ignored. The maximum length in the
+ dataset defines the VRAM used in the training. Hence, pay attention to this value if you encounter an
+ OOM error in training. Defaults to float("inf").
+
+ compute_f0 (bool):
+ (Not in use yet).
+
+ compute_energy (bool):
+ (Not in use yet).
+
+ compute_linear_spec (bool):
+ If True data loader computes and returns linear spectrograms alongside the other data.
+
+ precompute_num_workers (int):
+ Number of workers to precompute features. Defaults to 0.
+
+ use_noise_augment (bool):
+ Augment the input audio with random noise.
+
+ start_by_longest (bool):
+ If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
+ Defaults to False.
+
+ shuffle (bool):
+ If True, the data loader will shuffle the dataset when there is no sampler defined. Defaults to False.
+
+ drop_last (bool):
+ If True, the data loader will drop the last batch if it is not complete. It helps to prevent
+ issues that emerge from the partial batch statistics. Defaults to False.
+
+ add_blank (bool):
+ Add a blank character between every two characters. It improves performance for some models at the expense
+ of a slower run-time due to the longer input sequence.
+
+ datasets (List[BaseDatasetConfig]):
+ List of datasets used for training. If multiple datasets are provided, they are merged and used together
+ for training.
+
+ optimizer (str):
+ Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
+ Defaults to `radam`.
+
+ optimizer_params (dict):
+ Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
+
+ lr_scheduler (str):
+ Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
+ `TTS.utils.training`. Defaults to `None`.
+
+ lr_scheduler_params (dict):
+ Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
+
+ test_sentences (List[str]):
+ List of sentences to be used at testing. Defaults to '[]'
+
+ eval_split_max_size (int):
+ Maximum number of samples to be used for evaluation in the proportional split. Defaults to None (Disabled).
+
+ eval_split_size (float):
+ If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+ If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
+
+ use_speaker_weighted_sampler (bool):
+ Enable / Disable the batch balancer by speaker. Defaults to ```False```.
+
+ speaker_weighted_sampler_alpha (float):
+ Number that controls the influence of the speaker sampler weights. Defaults to ```1.0```.
+
+ use_language_weighted_sampler (bool):
+ Enable / Disable the batch balancer by language. Defaults to ```False```.
+
+ language_weighted_sampler_alpha (float):
+ Number that controls the influence of the language sampler weights. Defaults to ```1.0```.
+
+ use_length_weighted_sampler (bool):
+ Enable / Disable the batch balancer by audio length. If enabled, the dataset is divided
+ into 10 buckets based on the minimum and maximum audio lengths in the dataset. The sampler weights are
+ computed so that each training batch draws the same quantity of data from each bucket. Defaults to ```False```.
+
+ length_weighted_sampler_alpha (float):
+ Number that controls the influence of the length sampler weights. Defaults to ```1.0```.
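+
+ Example (a minimal sketch; the cleaner name and language code below are illustrative assumptions):
+
+ >>> from TTS.tts.configs.shared_configs import BaseTTSConfig
+ >>> config = BaseTTSConfig(
+ ... text_cleaner="phoneme_cleaners", use_phonemes=True, phoneme_language="en-us", add_blank=True
+ ... )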
+ """
+
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+ # phoneme settings
+ use_phonemes: bool = False
+ phonemizer: str = None
+ phoneme_language: str = None
+ compute_input_seq_cache: bool = False
+ text_cleaner: str = None
+ enable_eos_bos_chars: bool = False
+ test_sentences_file: str = ""
+ phoneme_cache_path: str = None
+ # vocabulary parameters
+ characters: CharactersConfig = None
+ add_blank: bool = False
+ # training params
+ batch_group_size: int = 0
+ loss_masking: bool = None
+ # dataloading
+ min_audio_len: int = 1
+ max_audio_len: int = float("inf")
+ min_text_len: int = 1
+ max_text_len: int = float("inf")
+ compute_f0: bool = False
+ compute_energy: bool = False
+ compute_linear_spec: bool = False
+ precompute_num_workers: int = 0
+ use_noise_augment: bool = False
+ start_by_longest: bool = False
+ shuffle: bool = False
+ drop_last: bool = False
+ # dataset
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
+ # optimizer
+ optimizer: str = "radam"
+ optimizer_params: dict = None
+ # scheduler
+ lr_scheduler: str = None
+ lr_scheduler_params: dict = field(default_factory=lambda: {})
+ # testing
+ test_sentences: List[str] = field(default_factory=lambda: [])
+ # evaluation
+ eval_split_max_size: int = None
+ eval_split_size: float = 0.01
+ # weighted samplers
+ use_speaker_weighted_sampler: bool = False
+ speaker_weighted_sampler_alpha: float = 1.0
+ use_language_weighted_sampler: bool = False
+ language_weighted_sampler_alpha: float = 1.0
+ use_length_weighted_sampler: bool = False
+ length_weighted_sampler_alpha: float = 1.0
diff --git a/submodules/TTS/TTS/tts/configs/speedy_speech_config.py b/submodules/TTS/TTS/tts/configs/speedy_speech_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf8517dfc478a135978df19f3126313a616c14c2
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/speedy_speech_config.py
@@ -0,0 +1,194 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class SpeedySpeechConfig(BaseTTSConfig):
+ """Configure `ForwardTTS` as SpeedySpeech model.
+
+ Example:
+
+ >>> from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
+ >>> config = SpeedySpeechConfig()
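+ >>> # a sketch of the multi-speaker path; these fields are forwarded to `model_args` in `__post_init__`
+ >>> config = SpeedySpeechConfig(num_speakers=4, use_speaker_embedding=True)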
+
+ Args:
+ model (str):
+ Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
+
+ base_model (str):
+ Name of the base model being configured as this model so that 🐸 TTS knows it needs to instantiate
+ the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+ model_args (Coqpit):
+ Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.
+
+ data_dep_init_steps (int):
+ Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+ Activation Normalization that pre-computes normalization stats at the beginning and uses the same values
+ for the rest. Defaults to 10.
+
+ speakers_file (str):
+ Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
+ speaker names. Defaults to `None`.
+
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+
+ use_d_vector_file (bool):
+ enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+ d_vector_dim (int):
+ Dimension of the external speaker embeddings. Defaults to 0.
+
+ optimizer (str):
+ Name of the model optimizer. Defaults to `Adam`.
+
+ optimizer_params (dict):
+ Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+ lr_scheduler (str):
+ Name of the learning rate scheduler. Defaults to `NoamLR`.
+
+ lr_scheduler_params (dict):
+ Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+ lr (float):
+ Initial learning rate. Defaults to `1e-4`.
+
+ grad_clip (float):
+ Gradient norm clipping value. Defaults to `5.0`.
+
+ spec_loss_type (str):
+ Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `l1`.
+
+ duration_loss_type (str):
+ Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `huber`.
+
+ use_ssim_loss (bool):
+ Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to False.
+
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-7`.
+
+ ssim_loss_alpha (float):
+ Weight for the SSIM loss. If set to 0, disables the SSIM loss. Defaults to 1.0.
+
+ dur_loss_alpha (float):
+ Weight for the duration predictor's loss. If set to 0, disables the huber loss. Defaults to 1.0.
+
+ spec_loss_alpha (float):
+ Weight for the L1 spectrogram loss. If set to 0, disables the L1 loss. Defaults to 1.0.
+
+ binary_align_loss_alpha (float):
+ Weight for the binary alignment loss. If set to 0, disables the binary alignment loss. Defaults to 0.3.
+
+ binary_loss_warmup_epochs (float):
+ Number of epochs to gradually increase the binary loss impact. Defaults to 150.
+
+ min_seq_len (int):
+ Minimum input sequence length to be used at training.
+
+ max_seq_len (int):
+ Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+ """
+
+ model: str = "speedy_speech"
+ base_model: str = "forward_tts"
+
+ # set model args as SpeedySpeech
+ model_args: ForwardTTSArgs = field(
+ default_factory=lambda: ForwardTTSArgs(
+ use_pitch=False,
+ encoder_type="residual_conv_bn",
+ encoder_params={
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 13,
+ },
+ decoder_type="residual_conv_bn",
+ decoder_params={
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4, 8] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 17,
+ },
+ out_channels=80,
+ hidden_channels=128,
+ positional_encoding=True,
+ detach_duration_predictor=True,
+ )
+ )
+
+ # multi-speaker settings
+ num_speakers: int = 0
+ speakers_file: str = None
+ use_speaker_embedding: bool = False
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+ d_vector_dim: int = 0
+
+ # optimizer parameters
+ optimizer: str = "Adam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = "NoamLR"
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+ lr: float = 1e-4
+ grad_clip: float = 5.0
+
+ # loss params
+ spec_loss_type: str = "l1"
+ duration_loss_type: str = "huber"
+ use_ssim_loss: bool = False
+ ssim_loss_alpha: float = 1.0
+ dur_loss_alpha: float = 1.0
+ spec_loss_alpha: float = 1.0
+ aligner_loss_alpha: float = 1.0
+ binary_align_loss_alpha: float = 0.3
+ binary_loss_warmup_epochs: int = 150
+
+ # overrides
+ min_seq_len: int = 13
+ max_seq_len: int = 200
+ r: int = 1 # DO NOT CHANGE
+
+ # dataset configs
+ compute_f0: bool = False
+ f0_cache_path: str = None
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
+
+ def __post_init__(self):
+ # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+ if self.num_speakers > 0:
+ self.model_args.num_speakers = self.num_speakers
+
+ # speaker embedding settings
+ if self.use_speaker_embedding:
+ self.model_args.use_speaker_embedding = True
+ if self.speakers_file:
+ self.model_args.speakers_file = self.speakers_file
+
+ # d-vector settings
+ if self.use_d_vector_file:
+ self.model_args.use_d_vector_file = True
+ if self.d_vector_dim is not None and self.d_vector_dim > 0:
+ self.model_args.d_vector_dim = self.d_vector_dim
+ if self.d_vector_file:
+ self.model_args.d_vector_file = self.d_vector_file
diff --git a/submodules/TTS/TTS/tts/configs/tacotron2_config.py b/submodules/TTS/TTS/tts/configs/tacotron2_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..95b65202218cf3aa0dd70c8d8cd55a3f913ed308
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/tacotron2_config.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+from TTS.tts.configs.tacotron_config import TacotronConfig
+
+
+@dataclass
+class Tacotron2Config(TacotronConfig):
+ """Defines parameters for Tacotron2 based models.
+
+ Example:
+
+ >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
+ >>> config = Tacotron2Config()
+
+ Check `TacotronConfig` for argument descriptions.
+ """
+
+ model: str = "tacotron2"
+ out_channels: int = 80
+ encoder_in_features: int = 512
+ decoder_in_features: int = 512
diff --git a/submodules/TTS/TTS/tts/configs/tacotron_config.py b/submodules/TTS/TTS/tts/configs/tacotron_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..350b5ea99633569d6977851875d5d8d83175ac36
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/tacotron_config.py
@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig
+
+
+@dataclass
+class TacotronConfig(BaseTTSConfig):
+ """Defines parameters for Tacotron based models.
+
+ Example:
+
+ >>> from TTS.tts.configs.tacotron_config import TacotronConfig
+ >>> config = TacotronConfig()
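+ >>> # a sketch of gradual training; the first scheduled `r` must match the model `r` (see `check_values`), batch sizes are illustrative
+ >>> config = TacotronConfig(r=2, gradual_training=[[0, 2, 64], [10000, 1, 32]])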
+
+ Args:
+ model (str):
+ Model name used to select the right model class to initialize. Defaults to `Tacotron`.
+ use_gst (bool):
+ enable / disable the use of Global Style Token modules. Defaults to False.
+ gst (GSTConfig):
+ Instance of `GSTConfig` class.
+ gst_style_input (str):
+ Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
+ this is not defined, the model uses a zero vector as an input. Defaults to None.
+ use_capacitron_vae (bool):
+ enable / disable the use of Capacitron modules. Defaults to False.
+ capacitron_vae (CapacitronConfig):
+ Instance of `CapacitronConfig` class.
+ num_chars (int):
+ Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
+ num_speakers (int):
+ Number of speakers for multi-speaker models. Defaults to 1.
+ r (int):
+ Initial number of output frames that the decoder computes per iteration. Larger values make training and inference
+ faster but reduce the quality of the output frames. This must be equal to the largest `r` value used in
+ the `gradual_training` schedule. Defaults to 2.
+ gradual_training (List[List]):
+ Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
+ the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
+ If set to None, no gradual training is used. Defaults to None.
+ memory_size (int):
+ Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
+ Defaults to -1.
+ prenet_type (str):
+ `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
+ Prenet. Defaults to `original`.
+ prenet_dropout (bool):
+ enables / disables the use of dropout in the Prenet. Defaults to True.
+ prenet_dropout_at_inference (bool):
+ enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
+ stopnet (bool):
+ enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
+ stopnet_pos_weight (float):
+ Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
+ datasets with longer sentences. Defaults to 0.2.
+ max_decoder_steps (int):
+ Max number of steps allowed for the decoder. Defaults to 10000.
+ encoder_in_features (int):
+ Channels of encoder input and character embedding tensors. Defaults to 256.
+ decoder_in_features (int):
+ Channels of decoder input and encoder output tensors. Defaults to 256.
+ out_channels (int):
+ Channels of the final model output. It must match the spectrogram size. Defaults to 80.
+ separate_stopnet (bool):
+ Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
+ attention_type (str):
+ attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
+ attention_heads (int):
+ Number of attention heads for GMM attention. Defaults to 5.
+ windowing (bool):
+ Enable attention windowing. It is especially useful at inference to keep the attention alignment diagonal. Defaults to False.
+ use_forward_attn (bool):
+ Enable forward attention. It is only valid if ```attention_type``` is ```original```. Defaults to False.
+ forward_attn_mask (bool):
+ enable/disable extra masking over forward attention. It is useful at inference to prevent
+ possible attention failures. Defaults to False.
+ transition_agent (bool):
+ enable/disable transition agent in forward attention. Defaults to False.
+ location_attn (bool):
+ enable/disable location sensitive attention as in the original Tacotron2 paper.
+ It is only valid if ```attention_type``` is ```original```. Defaults to True.
+ bidirectional_decoder (bool):
+ enable/disable bidirectional decoding. Defaults to False.
+ double_decoder_consistency (bool):
+ enable/disable double decoder consistency. Defaults to False.
+ ddc_r (int):
+ reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
+ as a multiple of the `r` value. Defaults to 6.
+ speakers_file (str):
+ Path to the speaker mapping file for the Speaker Manager. Defaults to None.
+ use_speaker_embedding (bool):
+ enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+ in the multi-speaker mode. Defaults to False.
+ use_d_vector_file (bool):
+ enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+ d_vector_file (str):
+ Path to the file including pre-computed speaker embeddings. Defaults to None.
+ optimizer (str):
+ Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
+ Defaults to `RAdam`.
+ optimizer_params (dict):
+ Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
+ lr_scheduler (str):
+ Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
+ `TTS.utils.training`. Defaults to `NoamLR`.
+ lr_scheduler_params (dict):
+ Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
+ lr (float):
+ Initial learning rate. Defaults to `1e-4`.
+ wd (float):
+ Weight decay coefficient. Defaults to `1e-6`.
+ grad_clip (float):
+ Gradient clipping threshold. Defaults to `5`.
+ seq_len_norm (bool):
+ enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
+ is divided by the sequence length. Defaults to False.
+ loss_masking (bool):
+ enable / disable masking the paddings of the samples in loss computation. Defaults to True.
+ decoder_loss_alpha (float):
+ Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ postnet_loss_alpha (float):
+ Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ postnet_diff_spec_alpha (float):
+ Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ decoder_diff_spec_alpha (float):
+ Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ decoder_ssim_alpha (float):
+ Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ postnet_ssim_alpha (float):
+ Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
+ corresponding loss function. Defaults to 0.25
+ ga_alpha (float):
+ Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
+ function. Defaults to 5.
+ """
+
+ model: str = "tacotron"
+ # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
+ use_gst: bool = False
+ gst: GSTConfig = None
+ gst_style_input: str = None
+
+ use_capacitron_vae: bool = False
+ capacitron_vae: CapacitronVAEConfig = None
+
+ # model specific params
+ num_speakers: int = 1
+ num_chars: int = 0
+ r: int = 2
+ gradual_training: List[List[int]] = None
+ memory_size: int = -1
+ prenet_type: str = "original"
+ prenet_dropout: bool = True
+ prenet_dropout_at_inference: bool = False
+ stopnet: bool = True
+ separate_stopnet: bool = True
+ stopnet_pos_weight: float = 0.2
+ max_decoder_steps: int = 10000
+ encoder_in_features: int = 256
+ decoder_in_features: int = 256
+ decoder_output_dim: int = 80
+ out_channels: int = 513
+
+ # attention layers
+ attention_type: str = "original"
+ attention_heads: int = None
+ attention_norm: str = "sigmoid"
+ attention_win: bool = False
+ windowing: bool = False
+ use_forward_attn: bool = False
+ forward_attn_mask: bool = False
+ transition_agent: bool = False
+ location_attn: bool = True
+
+ # advance methods
+ bidirectional_decoder: bool = False
+ double_decoder_consistency: bool = False
+ ddc_r: int = 6
+
+ # multi-speaker settings
+ speakers_file: str = None
+ use_speaker_embedding: bool = False
+ speaker_embedding_dim: int = 512
+ use_d_vector_file: bool = False
+ d_vector_file: str = False
+ d_vector_dim: int = None
+
+ # optimizer parameters
+ optimizer: str = "RAdam"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+ lr_scheduler: str = "NoamLR"
+ lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+ lr: float = 1e-4
+ grad_clip: float = 5.0
+ seq_len_norm: bool = False
+ loss_masking: bool = True
+
+ # loss params
+ decoder_loss_alpha: float = 0.25
+ postnet_loss_alpha: float = 0.25
+ postnet_diff_spec_alpha: float = 0.25
+ decoder_diff_spec_alpha: float = 0.25
+ decoder_ssim_alpha: float = 0.25
+ postnet_ssim_alpha: float = 0.25
+ ga_alpha: float = 5.0
+
+ # testing
+ test_sentences: List[str] = field(
+ default_factory=lambda: [
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+ "Be a voice, not an echo.",
+ "I'm sorry Dave. I'm afraid I can't do that.",
+ "This cake is great. It's so delicious and moist.",
+ "Prior to November 22, 1963.",
+ ]
+ )
+
+ def check_values(self):
+ if self.gradual_training:
+ assert (
+ self.gradual_training[0][1] == self.r
+ ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
+ if self.model == "tacotron" and self.audio is not None:
+ assert self.out_channels == (
+ self.audio.fft_size // 2 + 1
+ ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
+ if self.model == "tacotron2" and self.audio is not None:
+ assert self.out_channels == self.audio.num_mels
diff --git a/submodules/TTS/TTS/tts/configs/tortoise_config.py b/submodules/TTS/TTS/tts/configs/tortoise_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d60e43d71280bfa085988e31a52acfeef015c5f0
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/tortoise_config.py
@@ -0,0 +1,87 @@
+from dataclasses import dataclass, field
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig
+
+
+@dataclass
+class TortoiseConfig(BaseTTSConfig):
+ """Defines parameters for Tortoise TTS model.
+
+ Args:
+ model (str):
+ Model name. Do not change unless you know what you are doing.
+
+ model_args (TortoiseArgs):
+ Model architecture arguments. Defaults to `TortoiseArgs()`.
+
+ audio (TortoiseAudioConfig):
+ Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
+
+ model_dir (str):
+ Path to the folder that has all the Tortoise models. Defaults to None.
+
+ temperature (float):
+ Temperature for the autoregressive model inference. Larger values make predictions more creative at the cost of stability. Defaults to `0.2`.
+
+ length_penalty (float):
+ Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
+ which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
+ length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
+
+ repetition_penalty (float):
+ The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
+
+ top_p (float):
+ If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ Defaults to `0.8`.
+
+ cond_free_k (float):
+ Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
+ As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
+ Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
+
+ diffusion_temperature (float):
+ Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
+ are the "mean" prediction of the diffusion network and will sound bland and smeared.
+ Defaults to `1.0`.
+
+ num_autoregressive_samples (int):
+ Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
+ As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
+ Defaults to `16`.
+
+ diffusion_iterations (int):
+ Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
+ the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
+ however. Defaults to `30`.
+
+ sampler (str):
+ Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
+ Note:
+ Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
+
+ Example:
+
+ >>> from TTS.tts.configs.tortoise_config import TortoiseConfig
+ >>> config = TortoiseConfig()
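+ >>> # a sketch of tuning the sampling knobs documented below; values are illustrative
+ >>> config = TortoiseConfig(num_autoregressive_samples=16, diffusion_iterations=30, sampler="dpm++2m")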
+ """
+
+ model: str = "tortoise"
+ # model specific params
+ model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
+ audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
+ model_dir: str = None
+
+ # settings
+ temperature: float = 0.2
+ length_penalty: float = 1.0
+ repetition_penalty: float = 2.0
+ top_p: float = 0.8
+ cond_free_k: float = 2.0
+ diffusion_temperature: float = 1.0
+
+ # inference params
+ num_autoregressive_samples: int = 16
+ diffusion_iterations: int = 30
+ sampler: str = "ddim"
diff --git a/submodules/TTS/TTS/tts/configs/vits_config.py b/submodules/TTS/TTS/tts/configs/vits_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d0242bf131a25d6b2cef7a297a3c32b283f908a
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/vits_config.py
@@ -0,0 +1,176 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+
+
+@dataclass
+class VitsConfig(BaseTTSConfig):
+ """Defines parameters for VITS End2End TTS model.
+
+ Args:
+ model (str):
+ Model name. Do not change unless you know what you are doing.
+
+ model_args (VitsArgs):
+ Model architecture arguments. Defaults to `VitsArgs()`.
+
+ audio (VitsAudioConfig):
+ Audio processing configuration. Defaults to `VitsAudioConfig()`.
+
+ grad_clip (List):
+ Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
+
+ lr_gen (float):
+ Initial learning rate for the generator. Defaults to 0.0002.
+
+ lr_disc (float):
+ Initial learning rate for the discriminator. Defaults to 0.0002.
+
+ lr_scheduler_gen (str):
+ Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
+ `ExponentialLR`.
+
+ lr_scheduler_gen_params (dict):
+ Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
+
+ lr_scheduler_disc (str):
+ Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
+ `ExponentialLR`.
+
+ lr_scheduler_disc_params (dict):
+ Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
+
+ scheduler_after_epoch (bool):
+ If true, step the schedulers after each epoch, else after each step. Defaults to `True`.
+
+ optimizer (str):
+ Name of the optimizer to use with both the generator and the discriminator networks. One of the
+ `torch.optim.*`. Defaults to `AdamW`.
+
+ kl_loss_alpha (float):
+ Loss weight for KL loss. Defaults to 1.0.
+
+ disc_loss_alpha (float):
+ Loss weight for the discriminator loss. Defaults to 1.0.
+
+ gen_loss_alpha (float):
+ Loss weight for the generator loss. Defaults to 1.0.
+
+ feat_loss_alpha (float):
+ Loss weight for the feature matching loss. Defaults to 1.0.
+
+ mel_loss_alpha (float):
+ Loss weight for the mel loss. Defaults to 45.0.
+
+ return_wav (bool):
+ If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
+
+ compute_linear_spec (bool):
+ If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
+
+ use_weighted_sampler (bool):
+ If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
+
+ weighted_sampler_attrs (dict):
+ Key returned by the formatter to be used for the weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
+ by overweighting `root_path` by 2.0. Defaults to `{}`.
+
+ weighted_sampler_multipliers (dict):
+ Weight each unique value of a key returned by the formatter for weighted sampling.
+ For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
+ It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
+
+ r (int):
+ Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
+
+ add_blank (bool):
+ If true, a blank token is added in between every character. Defaults to `True`.
+
+ test_sentences (List[List]):
+ List of sentences with speaker and language information to be used for testing.
+
+ language_ids_file (str):
+ Path to the language ids file.
+
+ use_language_embedding (bool):
+ If true, language embedding is used. Defaults to `False`.
+
+ Note:
+ Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
+
+ Example:
+
+ >>> from TTS.tts.configs.vits_config import VitsConfig
+ >>> config = VitsConfig()
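+ >>> # a sketch of dataset balancing with the weighted-sampler fields documented below
+ >>> config.use_weighted_sampler = True
+ >>> config.weighted_sampler_attrs = {"speaker_name": 1.0}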
+ """
+
+ model: str = "vits"
+ # model specific params
+ model_args: VitsArgs = field(default_factory=VitsArgs)
+ audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+
+ # optimizer
+ grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+ lr_gen: float = 0.0002
+ lr_disc: float = 0.0002
+ lr_scheduler_gen: str = "ExponentialLR"
+ lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+ lr_scheduler_disc: str = "ExponentialLR"
+ lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+ scheduler_after_epoch: bool = True
+ optimizer: str = "AdamW"
+ optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+
+ # loss params
+ kl_loss_alpha: float = 1.0
+ disc_loss_alpha: float = 1.0
+ gen_loss_alpha: float = 1.0
+ feat_loss_alpha: float = 1.0
+ mel_loss_alpha: float = 45.0
+ dur_loss_alpha: float = 1.0
+ speaker_encoder_loss_alpha: float = 1.0
+
+ # data loader params
+ return_wav: bool = True
+ compute_linear_spec: bool = True
+
+ # sampler params
+ use_weighted_sampler: bool = False # TODO: move it to the base config
+ weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+ weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+
+ # overrides
+ r: int = 1 # DO NOT CHANGE
+ add_blank: bool = True
+
+ # testing
+ test_sentences: List[List] = field(
+ default_factory=lambda: [
+ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+ ["Be a voice, not an echo."],
+ ["I'm sorry Dave. I'm afraid I can't do that."],
+ ["This cake is great. It's so delicious and moist."],
+ ["Prior to November 22, 1963."],
+ ]
+ )
+
+ # multi-speaker settings
+ # use speaker embedding layer
+ num_speakers: int = 0
+ use_speaker_embedding: bool = False
+ speakers_file: str = None
+ speaker_embedding_channels: int = 256
+ language_ids_file: str = None
+ use_language_embedding: bool = False
+
+ # use d-vectors
+ use_d_vector_file: bool = False
+ d_vector_file: List[str] = None
+ d_vector_dim: int = None
+
+ def __post_init__(self):
+ for key, val in self.model_args.items():
+ if hasattr(self, key):
+ self[key] = val
diff --git a/submodules/TTS/TTS/tts/configs/xtts_config.py b/submodules/TTS/TTS/tts/configs/xtts_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbf048e1ab7984e0cc0c7914cf8fd991fd62ef1f
--- /dev/null
+++ b/submodules/TTS/TTS/tts/configs/xtts_config.py
@@ -0,0 +1,107 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
+
+
+@dataclass
+class XttsConfig(BaseTTSConfig):
+ """Defines parameters for XTTS TTS model.
+
+ Args:
+ model (str):
+ Model name. Do not change unless you know what you are doing.
+
+ model_args (XttsArgs):
+ Model architecture arguments. Defaults to `XttsArgs()`.
+
+ audio (XttsAudioConfig):
+ Audio processing configuration. Defaults to `XttsAudioConfig()`.
+
+ model_dir (str):
+ Path to the folder that has all the XTTS models. Defaults to None.
+
+ temperature (float):
+ Temperature for the autoregressive model inference. Larger values make predictions more creative at the cost of stability. Defaults to `0.85`.
+
+ length_penalty (float):
+ Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
+ which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
+ length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
+
+ repetition_penalty (float):
+ The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
+
+ top_p (float):
+ If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ Defaults to `0.8`.
+
+ num_gpt_outputs (int):
+ Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
+ As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
+ Defaults to `1`.
+
+ gpt_cond_len (int):
+ Seconds of audio used as conditioning for the autoregressive model. Defaults to `12`.
+
+ gpt_cond_chunk_len (int):
+ Audio chunk size in secs. Audio is split into chunks and latents are extracted for each chunk. Then the
+ latents are averaged. Chunking improves the stability. It must be <= gpt_cond_len.
+ If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to `4`.
+
+ max_ref_len (int):
+ Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.
+
+ sound_norm_refs (bool):
+ Whether to normalize the conditioning audio. Defaults to `False`.
+
+ Note:
+ Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
+
+ Example:
+
+ >>> from TTS.tts.configs.xtts_config import XttsConfig
+ >>> config = XttsConfig()
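+ >>> # a sketch of adjusting the cloning and sampling parameters documented below; values are illustrative
+ >>> config = XttsConfig(temperature=0.75, gpt_cond_len=12, gpt_cond_chunk_len=4)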
+ """
+
+ model: str = "xtts"
+ # model specific params
+ model_args: XttsArgs = field(default_factory=XttsArgs)
+ audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
+ model_dir: str = None
+ languages: List[str] = field(
+ default_factory=lambda: [
+ "en",
+ "es",
+ "fr",
+ "de",
+ "it",
+ "pt",
+ "pl",
+ "tr",
+ "ru",
+ "nl",
+ "cs",
+ "ar",
+ "zh-cn",
+ "hu",
+ "ko",
+ "ja",
+ "hi",
+ ]
+ )
+
+ # inference params
+ temperature: float = 0.85
+ length_penalty: float = 1.0
+ repetition_penalty: float = 2.0
+ top_k: int = 50
+ top_p: float = 0.85
+ num_gpt_outputs: int = 1
+
+ # cloning
+ gpt_cond_len: int = 12
+ gpt_cond_chunk_len: int = 4
+ max_ref_len: int = 10
+ sound_norm_refs: bool = False
diff --git a/submodules/TTS/TTS/tts/datasets/__init__.py b/submodules/TTS/TTS/tts/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..192138561fdb4e85978fe8beb52eae2edf73888e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/datasets/__init__.py
@@ -0,0 +1,181 @@
+import os
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Callable, Dict, List, Tuple, Union
+
+import numpy as np
+
+from TTS.tts.datasets.dataset import *
+from TTS.tts.datasets.formatters import *
+
+
+def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
+ """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
+
+ Args:
+ items (List[List]):
+ A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
+
+ eval_split_max_size (int):
+ Maximum number of samples to be used for evaluation in the proportional split. Defaults to None (Disabled).
+
+ eval_split_size (float):
+ If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+ If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
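+
+ Example (a minimal sketch with dummy single-speaker samples):
+
+ >>> samples = [{"speaker_name": "spk0", "text": "hello", "audio_file": "0.wav"} for _ in range(200)]
+ >>> eval_items, train_items = split_dataset(samples, eval_split_size=0.01)
+ >>> len(eval_items), len(train_items)
+ (2, 198)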
+ """
+ speakers = [item["speaker_name"] for item in items]
+ is_multi_speaker = len(set(speakers)) > 1
+ if eval_split_size > 1:
+ eval_split_size = int(eval_split_size)
+ else:
+ if eval_split_max_size:
+ eval_split_size = min(eval_split_max_size, int(len(items) * eval_split_size))
+ else:
+ eval_split_size = int(len(items) * eval_split_size)
+
+ assert (
+ eval_split_size > 0
+ ), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format(
+ 1 / len(items)
+ )
+ np.random.seed(0)
+ np.random.shuffle(items)
+ if is_multi_speaker:
+ items_eval = []
+ speakers = [item["speaker_name"] for item in items]
+ speaker_counter = Counter(speakers)
+ while len(items_eval) < eval_split_size:
+ item_idx = np.random.randint(0, len(items))
+ speaker_to_be_removed = items[item_idx]["speaker_name"]
+ if speaker_counter[speaker_to_be_removed] > 1:
+ items_eval.append(items[item_idx])
+ speaker_counter[speaker_to_be_removed] -= 1
+ del items[item_idx]
+ return items_eval, items
+ return items[:eval_split_size], items[eval_split_size:]
+
+
+def add_extra_keys(metadata, language, dataset_name):
+ for item in metadata:
+ # add language name
+ item["language"] = language
+ # add unique audio name
+ relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
+ audio_unique_name = f"{dataset_name}#{relfilepath}"
+ item["audio_unique_name"] = audio_unique_name
+ return metadata
+
+
+def load_tts_samples(
+ datasets: Union[List[Dict], Dict],
+ eval_split=True,
+ formatter: Callable = None,
+ eval_split_max_size=None,
+ eval_split_size=0.01,
+) -> Tuple[List[List], List[List]]:
+ """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided.
+ If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based
+ on the dataset name.
+
+ Args:
+ datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are
+ in the list, they are all merged.
+
+ eval_split (bool, optional): If True, create an evaluation split. If an eval split is not provided explicitly
+ (via `meta_file_val`), one is generated automatically from the training samples. Defaults to True.
+
+ formatter (Callable, optional): The preprocessing function to be applied to create the list of samples. It
+ must take the root_path and the meta_file name and return a list of samples in the format of
+ `[[text, audio_path, speaker_id], ...]`. See the available formatters in `TTS.tts.datasets.formatters` as
+ example. Defaults to None.
+
+ eval_split_max_size (int):
+ Maximum number of samples to be used for evaluation in the proportional split. Defaults to None (Disabled).
+
+ eval_split_size (float):
+ If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+ If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
+
+ Returns:
+ Tuple[List[List], List[List]]: training and evaluation splits of the dataset.
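+
+ Example (a minimal sketch; the dataset path and the `ljspeech` formatter name are illustrative placeholders):
+
+ >>> from TTS.config import BaseDatasetConfig
+ >>> dataset_config = BaseDatasetConfig(
+ ... formatter="ljspeech", meta_file_train="metadata.csv", path="/data/LJSpeech-1.1", language="en"
+ ... )
+ >>> train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.01)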
+ """
+ meta_data_train_all = []
+ meta_data_eval_all = [] if eval_split else None
+ if not isinstance(datasets, list):
+ datasets = [datasets]
+ for dataset in datasets:
+ formatter_name = dataset["formatter"]
+ dataset_name = dataset["dataset_name"]
+ root_path = dataset["path"]
+ meta_file_train = dataset["meta_file_train"]
+ meta_file_val = dataset["meta_file_val"]
+ ignored_speakers = dataset["ignored_speakers"]
+ language = dataset["language"]
+
+ # setup the right data processor
+ if formatter is None:
+ formatter = _get_formatter_by_name(formatter_name)
+ # load train set
+ meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
+ assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
+
+ meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
+
+ print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
+ # load evaluation split if set
+ if eval_split:
+ if meta_file_val:
+ meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
+ meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
+ else:
+ eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
+ meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
+ meta_data_eval_all += meta_data_eval
+ meta_data_train_all += meta_data_train
+ # load attention masks for the duration predictor training
+ if dataset.meta_file_attn_mask:
+ meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
+ for idx, ins in enumerate(meta_data_train_all):
+ attn_file = meta_data[ins["audio_file"]].strip()
+ meta_data_train_all[idx].update({"alignment_file": attn_file})
+ if meta_data_eval_all:
+ for idx, ins in enumerate(meta_data_eval_all):
+ attn_file = meta_data[ins["audio_file"]].strip()
+ meta_data_eval_all[idx].update({"alignment_file": attn_file})
+ # set none for the next iter
+ formatter = None
+ return meta_data_train_all, meta_data_eval_all
+
+
+def load_attention_mask_meta_data(metafile_path):
+ """Load meta data file created by compute_attention_masks.py"""
+ with open(metafile_path, "r", encoding="utf-8") as f:
+ lines = f.readlines()
+
+ meta_data = []
+ for line in lines:
+ wav_file, attn_file = line.split("|")
+ meta_data.append([wav_file, attn_file])
+ return meta_data
+
+
+def _get_formatter_by_name(name):
+ """Returns the respective preprocessing function."""
+ thismodule = sys.modules[__name__]
+ return getattr(thismodule, name.lower())
+
+
+def find_unique_chars(data_samples, verbose=True):
+ texts = "".join(item[0] for item in data_samples)
+ chars = set(texts)
+ lower_chars = filter(lambda c: c.islower(), chars)
+ chars_force_lower = [c.lower() for c in chars]
+ chars_force_lower = set(chars_force_lower)
+
+ if verbose:
+ print(f" > Number of unique characters: {len(chars)}")
+ print(f" > Unique characters: {''.join(sorted(chars))}")
+ print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+ print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+ return chars_force_lower
diff --git a/submodules/TTS/TTS/tts/datasets/dataset.py b/submodules/TTS/TTS/tts/datasets/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..19fb25bef88375c99b5aeb7f608843b15393263e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/datasets/dataset.py
@@ -0,0 +1,973 @@
+import base64
+import collections
+import os
+import random
+from typing import Dict, List, Union
+
+import numpy as np
+import torch
+import tqdm
+from torch.utils.data import Dataset
+
+from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
+
+import mutagen
+
+# to prevent too many open files error as suggested here
+# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def _parse_sample(item):
+ language_name = None
+ attn_file = None
+ if len(item) == 5:
+ text, wav_file, speaker_name, language_name, attn_file = item
+ elif len(item) == 4:
+ text, wav_file, speaker_name, language_name = item
+ elif len(item) == 3:
+ text, wav_file, speaker_name = item
+ else:
+ raise ValueError(" [!] Dataset cannot parse the sample.")
+ return text, wav_file, speaker_name, language_name, attn_file
+
+
+def noise_augment_audio(wav):
+ return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
+
+
+def string2filename(string):
+ # generate a safe and reversible filename based on a string
+ filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
+ return filename
+
+
+def get_audio_size(audiopath):
+ extension = audiopath.rpartition(".")[-1].lower()
+ if extension not in {"mp3", "wav", "flac"}:
+ raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
+
+ audio_info = mutagen.File(audiopath).info
+ return int(audio_info.length * audio_info.sample_rate)
+
+
+class TTSDataset(Dataset):
+ def __init__(
+ self,
+ outputs_per_step: int = 1,
+ compute_linear_spec: bool = False,
+ ap: AudioProcessor = None,
+ samples: List[Dict] = None,
+ tokenizer: "TTSTokenizer" = None,
+ compute_f0: bool = False,
+ compute_energy: bool = False,
+ f0_cache_path: str = None,
+ energy_cache_path: str = None,
+ return_wav: bool = False,
+ batch_group_size: int = 0,
+ min_text_len: int = 0,
+ max_text_len: int = float("inf"),
+ min_audio_len: int = 0,
+ max_audio_len: int = float("inf"),
+ phoneme_cache_path: str = None,
+ precompute_num_workers: int = 0,
+ speaker_id_mapping: Dict = None,
+ d_vector_mapping: Dict = None,
+ language_id_mapping: Dict = None,
+ use_noise_augment: bool = False,
+ start_by_longest: bool = False,
+ verbose: bool = False,
+ ):
+ """Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.
+
+ If you need something different, you can subclass and override.
+
+ Args:
+ outputs_per_step (int): Number of time frames predicted per step.
+
+ compute_linear_spec (bool): compute linear spectrogram if True.
+
+ ap (TTS.tts.utils.AudioProcessor): Audio processor object.
+
+ samples (list): List of dataset samples.
+
+ tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
+ use the given. Defaults to None.
+
+ compute_f0 (bool): compute f0 if True. Defaults to False.
+
+ compute_energy (bool): compute energy if True. Defaults to False.
+
+ f0_cache_path (str): Path to store f0 cache. Defaults to None.
+
+ energy_cache_path (str): Path to store energy cache. Defaults to None.
+
+ return_wav (bool): Return the waveform of the sample. Defaults to False.
+
+ batch_group_size (int): Range of batch randomization after sorting
+ sequences by length. It shuffles each batch with bucketing to gather similar length sequences in a
+ batch. Set 0 to disable. Defaults to 0.
+
+ min_text_len (int): Minimum length of input text to be used. All shorter samples will be ignored.
+ Defaults to 0.
+
+ max_text_len (int): Maximum length of input text to be used. All longer samples will be ignored.
+ Defaults to float("inf").
+
+ min_audio_len (int): Minimum length of input audio to be used. All shorter samples will be ignored.
+ Defaults to 0.
+
+ max_audio_len (int): Maximum length of input audio to be used. All longer samples will be ignored.
+ The maximum length in the dataset defines the VRAM used in the training. Hence, pay attention to
+ this value if you encounter an OOM error in training. Defaults to float("inf").
+
+ phoneme_cache_path (str): Path to cache computed phonemes. It writes phonemes of each sample to a
+ separate file. Defaults to None.
+
+ precompute_num_workers (int): Number of workers to precompute features. Defaults to 0.
+
+ speaker_id_mapping (dict): Mapping of speaker names to IDs used to compute embedding vectors by the
+ embedding layer. Defaults to None.
+
+ d_vector_mapping (dict): Mapping of wav files to computed d-vectors. Defaults to None.
+
+ use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False.
+
+ start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.
+
+ verbose (bool): Print diagnostic information. Defaults to False.
+ """
+ super().__init__()
+ self.batch_group_size = batch_group_size
+ self._samples = samples
+ self.outputs_per_step = outputs_per_step
+ self.compute_linear_spec = compute_linear_spec
+ self.return_wav = return_wav
+ self.compute_f0 = compute_f0
+ self.compute_energy = compute_energy
+ self.f0_cache_path = f0_cache_path
+ self.energy_cache_path = energy_cache_path
+ self.min_audio_len = min_audio_len
+ self.max_audio_len = max_audio_len
+ self.min_text_len = min_text_len
+ self.max_text_len = max_text_len
+ self.ap = ap
+ self.phoneme_cache_path = phoneme_cache_path
+ self.speaker_id_mapping = speaker_id_mapping
+ self.d_vector_mapping = d_vector_mapping
+ self.language_id_mapping = language_id_mapping
+ self.use_noise_augment = use_noise_augment
+ self.start_by_longest = start_by_longest
+
+ self.verbose = verbose
+ self.rescue_item_idx = 1
+ self.pitch_computed = False
+ self.tokenizer = tokenizer
+
+ if self.tokenizer.use_phonemes:
+ self.phoneme_dataset = PhonemeDataset(
+ self.samples, self.tokenizer, phoneme_cache_path, precompute_num_workers=precompute_num_workers
+ )
+
+ if compute_f0:
+ self.f0_dataset = F0Dataset(
+ self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
+ )
+ if compute_energy:
+ self.energy_dataset = EnergyDataset(
+ self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
+ )
+ if self.verbose:
+ self.print_logs()
+
+ @property
+ def lengths(self):
+ lens = []
+ for item in self.samples:
+ _, wav_file, *_ = _parse_sample(item)
+ audio_len = get_audio_size(wav_file)
+ lens.append(audio_len)
+ return lens
+
+ @property
+ def samples(self):
+ return self._samples
+
+ @samples.setter
+ def samples(self, new_samples):
+ self._samples = new_samples
+ if hasattr(self, "f0_dataset"):
+ self.f0_dataset.samples = new_samples
+ if hasattr(self, "energy_dataset"):
+ self.energy_dataset.samples = new_samples
+ if hasattr(self, "phoneme_dataset"):
+ self.phoneme_dataset.samples = new_samples
+
+ def __len__(self):
+ return len(self.samples)
+
+ def __getitem__(self, idx):
+ return self.load_data(idx)
+
+ def print_logs(self, level: int = 0) -> None:
+ indent = "\t" * level
+ print("\n")
+ print(f"{indent}> DataLoader initialization")
+ print(f"{indent}| > Tokenizer:")
+ self.tokenizer.print_logs(level + 1)
+ print(f"{indent}| > Number of instances : {len(self.samples)}")
+
+ def load_wav(self, filename):
+ waveform = self.ap.load_wav(filename)
+ assert waveform.size > 0
+ return waveform
+
+ def get_phonemes(self, idx, text):
+ out_dict = self.phoneme_dataset[idx]
+ assert text == out_dict["text"], f"{text} != {out_dict['text']}"
+ assert len(out_dict["token_ids"]) > 0
+ return out_dict
+
+ def get_f0(self, idx):
+ out_dict = self.f0_dataset[idx]
+ item = self.samples[idx]
+ assert item["audio_unique_name"] == out_dict["audio_unique_name"]
+ return out_dict
+
+ def get_energy(self, idx):
+ out_dict = self.energy_dataset[idx]
+ item = self.samples[idx]
+ assert item["audio_unique_name"] == out_dict["audio_unique_name"]
+ return out_dict
+
+ @staticmethod
+ def get_attn_mask(attn_file):
+ return np.load(attn_file)
+
+ def get_token_ids(self, idx, text):
+ if self.tokenizer.use_phonemes:
+ token_ids = self.get_phonemes(idx, text)["token_ids"]
+ else:
+ token_ids = self.tokenizer.text_to_ids(text)
+ return np.array(token_ids, dtype=np.int32)
+
+ def load_data(self, idx):
+ item = self.samples[idx]
+
+ raw_text = item["text"]
+
+ wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32)
+
+ # apply noise for augmentation
+ if self.use_noise_augment:
+ wav = noise_augment_audio(wav)
+
+ # get token ids
+ token_ids = self.get_token_ids(idx, item["text"])
+
+ # get pre-computed attention maps
+ attn = None
+ if "alignment_file" in item:
+ attn = self.get_attn_mask(item["alignment_file"])
+
+ # after phonemization the text length may change
+ # this is a shameful 🤭 hack to skip samples that exceed the length limits
+ # TODO: find a better fix
+ if len(token_ids) > self.max_text_len or len(wav) < self.min_audio_len:
+ self.rescue_item_idx += 1
+ return self.load_data(self.rescue_item_idx)
+
+ # get f0 values
+ f0 = None
+ if self.compute_f0:
+ f0 = self.get_f0(idx)["f0"]
+ energy = None
+ if self.compute_energy:
+ energy = self.get_energy(idx)["energy"]
+
+ sample = {
+ "raw_text": raw_text,
+ "token_ids": token_ids,
+ "wav": wav,
+ "pitch": f0,
+ "energy": energy,
+ "attn": attn,
+ "item_idx": item["audio_file"],
+ "speaker_name": item["speaker_name"],
+ "language_name": item["language"],
+ "wav_file_name": os.path.basename(item["audio_file"]),
+ "audio_unique_name": item["audio_unique_name"],
+ }
+ return sample
+
+ @staticmethod
+ def _compute_lengths(samples):
+ new_samples = []
+ for item in samples:
+ audio_length = get_audio_size(item["audio_file"])
+ text_length = len(item["text"])
+ item["audio_length"] = audio_length
+ item["text_length"] = text_length
+ new_samples += [item]
+ return new_samples
+
+ @staticmethod
+ def filter_by_length(lengths: List[int], min_len: int, max_len: int):
+ idxs = np.argsort(lengths) # ascending order
+ ignore_idx = []
+ keep_idx = []
+ for idx in idxs:
+ length = lengths[idx]
+ if length < min_len or length > max_len:
+ ignore_idx.append(idx)
+ else:
+ keep_idx.append(idx)
+ return ignore_idx, keep_idx
+
+ @staticmethod
+ def sort_by_length(samples: List[List]):
+ audio_lengths = [s["audio_length"] for s in samples]
+ idxs = np.argsort(audio_lengths) # ascending order
+ return idxs
+
+ @staticmethod
+ def create_buckets(samples, batch_group_size: int):
+ assert batch_group_size > 0
+ for i in range(len(samples) // batch_group_size):
+ offset = i * batch_group_size
+ end_offset = offset + batch_group_size
+ temp_items = samples[offset:end_offset]
+ random.shuffle(temp_items)
+ samples[offset:end_offset] = temp_items
+ return samples
+
+ @staticmethod
+ def _select_samples_by_idx(idxs, samples):
+ samples_new = []
+ for idx in idxs:
+ samples_new.append(samples[idx])
+ return samples_new
+
+ def preprocess_samples(self):
+ r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length
+ range.
+ """
+ samples = self._compute_lengths(self.samples)
+
+ # sort items based on the sequence length in ascending order
+ text_lengths = [i["text_length"] for i in samples]
+ audio_lengths = [i["audio_length"] for i in samples]
+ text_ignore_idx, text_keep_idx = self.filter_by_length(text_lengths, self.min_text_len, self.max_text_len)
+ audio_ignore_idx, audio_keep_idx = self.filter_by_length(audio_lengths, self.min_audio_len, self.max_audio_len)
+ keep_idx = list(set(audio_keep_idx) & set(text_keep_idx))
+ ignore_idx = list(set(audio_ignore_idx) | set(text_ignore_idx))
+
+ samples = self._select_samples_by_idx(keep_idx, samples)
+
+ sorted_idxs = self.sort_by_length(samples)
+
+ if self.start_by_longest:
+ longest_idxs = sorted_idxs[-1]
+ sorted_idxs[-1] = sorted_idxs[0]
+ sorted_idxs[0] = longest_idxs
+
+ samples = self._select_samples_by_idx(sorted_idxs, samples)
+
+ if len(samples) == 0:
+ raise RuntimeError(" [!] No samples left")
+
+ # shuffle batch groups
+ # create batches with similar length items
+ # the larger the `batch_group_size`, the higher the length variety in a batch.
+ if self.batch_group_size > 0:
+ samples = self.create_buckets(samples, self.batch_group_size)
+
+ # update items to the new sorted items
+ audio_lengths = [s["audio_length"] for s in samples]
+ text_lengths = [s["text_length"] for s in samples]
+ self.samples = samples
+
+ if self.verbose:
+ print(" | > Preprocessing samples")
+ print(" | > Max text length: {}".format(np.max(text_lengths)))
+ print(" | > Min text length: {}".format(np.min(text_lengths)))
+ print(" | > Avg text length: {}".format(np.mean(text_lengths)))
+ print(" | ")
+ print(" | > Max audio length: {}".format(np.max(audio_lengths)))
+ print(" | > Min audio length: {}".format(np.min(audio_lengths)))
+ print(" | > Avg audio length: {}".format(np.mean(audio_lengths)))
+ print(f" | > Num. instances discarded samples: {len(ignore_idx)}")
+ print(" | > Batch group size: {}.".format(self.batch_group_size))
+
+ @staticmethod
+ def _sort_batch(batch, text_lengths):
+ """Sort the batch by the input text length for RNN efficiency.
+
+ Args:
+ batch (Dict): Batch returned by `__getitem__`.
+ text_lengths (List[int]): Lengths of the input character sequences.
+ """
+ text_lengths, ids_sorted_decreasing = torch.sort(torch.LongTensor(text_lengths), dim=0, descending=True)
+ batch = [batch[idx] for idx in ids_sorted_decreasing]
+ return batch, text_lengths, ids_sorted_decreasing
+
+ def collate_fn(self, batch):
+ r"""
+ Perform preprocessing and create a final data batch:
+ 1. Sort batch instances by text length.
+ 2. Convert audio signals to features.
+ 3. Pad sequences with respect to the reduction factor (r).
+ 4. Convert everything to Torch tensors.
+ """
+
+ # Puts each data field into a tensor with outer dimension batch size
+ if isinstance(batch[0], collections.abc.Mapping):
+ token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
+
+ # sort items with text input length for RNN efficiency
+ batch, token_ids_lengths, ids_sorted_decreasing = self._sort_batch(batch, token_ids_lengths)
+
+ # convert list of dicts to dict of lists
+ batch = {k: [dic[k] for dic in batch] for k in batch[0]}
+
+ # get language ids from language names
+ if self.language_id_mapping is not None:
+ language_ids = [self.language_id_mapping[ln] for ln in batch["language_name"]]
+ else:
+ language_ids = None
+ # get pre-computed d-vectors
+ if self.d_vector_mapping is not None:
+ embedding_keys = list(batch["audio_unique_name"])
+ d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
+ else:
+ d_vectors = None
+
+ # get numerical speaker ids from speaker names
+ if self.speaker_id_mapping:
+ speaker_ids = [self.speaker_id_mapping[sn] for sn in batch["speaker_name"]]
+ else:
+ speaker_ids = None
+ # compute features
+ mel = [self.ap.melspectrogram(w).astype("float32") for w in batch["wav"]]
+
+ mel_lengths = [m.shape[1] for m in mel]
+
+ # lengths adjusted by the reduction factor
+ mel_lengths_adjusted = [
+ m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
+ if m.shape[1] % self.outputs_per_step
+ else m.shape[1]
+ for m in mel
+ ]
+
+ # compute 'stop token' targets
+ stop_targets = [np.array([0.0] * (mel_len - 1) + [1.0]) for mel_len in mel_lengths]
+
+ # PAD stop targets
+ stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
+
+ # PAD sequences with longest instance in the batch
+ token_ids = prepare_data(batch["token_ids"]).astype(np.int32)
+
+ # PAD features with longest instance
+ mel = prepare_tensor(mel, self.outputs_per_step)
+
+ # B x D x T --> B x T x D
+ mel = mel.transpose(0, 2, 1)
+
+ # convert things to pytorch
+ token_ids_lengths = torch.LongTensor(token_ids_lengths)
+ token_ids = torch.LongTensor(token_ids)
+ mel = torch.FloatTensor(mel).contiguous()
+ mel_lengths = torch.LongTensor(mel_lengths)
+ stop_targets = torch.FloatTensor(stop_targets)
+
+ # speaker vectors
+ if d_vectors is not None:
+ d_vectors = torch.FloatTensor(d_vectors)
+
+ if speaker_ids is not None:
+ speaker_ids = torch.LongTensor(speaker_ids)
+
+ if language_ids is not None:
+ language_ids = torch.LongTensor(language_ids)
+
+ # compute linear spectrogram
+ linear = None
+ if self.compute_linear_spec:
+ linear = [self.ap.spectrogram(w).astype("float32") for w in batch["wav"]]
+ linear = prepare_tensor(linear, self.outputs_per_step)
+ linear = linear.transpose(0, 2, 1)
+ assert mel.shape[1] == linear.shape[1]
+ linear = torch.FloatTensor(linear).contiguous()
+
+ # format waveforms
+ wav_padded = None
+ if self.return_wav:
+ wav_lengths = [w.shape[0] for w in batch["wav"]]
+ max_wav_len = max(mel_lengths_adjusted) * self.ap.hop_length
+ wav_lengths = torch.LongTensor(wav_lengths)
+ wav_padded = torch.zeros(len(batch["wav"]), 1, max_wav_len)
+ for i, w in enumerate(batch["wav"]):
+ mel_length = mel_lengths_adjusted[i]
+ w = np.pad(w, (0, self.ap.hop_length * self.outputs_per_step), mode="edge")
+ w = w[: mel_length * self.ap.hop_length]
+ wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
+ wav_padded.transpose_(1, 2)
+
+ # format F0
+ if self.compute_f0:
+ pitch = prepare_data(batch["pitch"])
+ assert mel.shape[1] == pitch.shape[1], f"[!] {mel.shape} vs {pitch.shape}"
+ pitch = torch.FloatTensor(pitch)[:, None, :].contiguous() # B x 1 x T
+ else:
+ pitch = None
+ # format energy
+ if self.compute_energy:
+ energy = prepare_data(batch["energy"])
+ assert mel.shape[1] == energy.shape[1], f"[!] {mel.shape} vs {energy.shape}"
+ energy = torch.FloatTensor(energy)[:, None, :].contiguous() # B x 1 x T
+ else:
+ energy = None
+ # format attention masks
+ attns = None
+ if batch["attn"][0] is not None:
+ attns = [batch["attn"][idx].T for idx in ids_sorted_decreasing]
+ for idx, attn in enumerate(attns):
+ pad2 = mel.shape[1] - attn.shape[1]
+ pad1 = token_ids.shape[1] - attn.shape[0]
+ assert pad1 >= 0 and pad2 >= 0, f"[!] Negative padding - {pad1} and {pad2}"
+ attn = np.pad(attn, [[0, pad1], [0, pad2]])
+ attns[idx] = attn
+ attns = prepare_tensor(attns, self.outputs_per_step)
+ attns = torch.FloatTensor(attns).unsqueeze(1)
+
+ return {
+ "token_id": token_ids,
+ "token_id_lengths": token_ids_lengths,
+ "speaker_names": batch["speaker_name"],
+ "linear": linear,
+ "mel": mel,
+ "mel_lengths": mel_lengths,
+ "stop_targets": stop_targets,
+ "item_idxs": batch["item_idx"],
+ "d_vectors": d_vectors,
+ "speaker_ids": speaker_ids,
+ "attns": attns,
+ "waveform": wav_padded,
+ "raw_text": batch["raw_text"],
+ "pitch": pitch,
+ "energy": energy,
+ "language_ids": language_ids,
+ "audio_unique_names": batch["audio_unique_name"],
+ }
+
+ raise TypeError(
+ "batch must contain tensors, numbers, dicts or lists; found {}".format(type(batch[0]))
+ )
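+
+# Illustrative usage of `TTSDataset` (a minimal sketch; `ap`, `tokenizer` and
+# `train_samples` are assumed to be prepared elsewhere, each sample providing
+# the keys read in `load_data` above):
+#
+#   dataset = TTSDataset(
+#       outputs_per_step=1,
+#       ap=ap,
+#       samples=train_samples,
+#       tokenizer=tokenizer,
+#       batch_group_size=4 * 32,            # shuffle inside buckets of 4 batches (batch_size=32)
+#       min_audio_len=int(0.5 * ap.sample_rate),
+#       max_audio_len=int(10 * ap.sample_rate),
+#   )
+#   dataset.preprocess_samples()            # length filtering + length-based sorting
+#   loader = torch.utils.data.DataLoader(
+#       dataset, batch_size=32, shuffle=False, collate_fn=dataset.collate_fn
+#   )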
+
+
+class PhonemeDataset(Dataset):
+ """Phoneme Dataset for converting input text to phonemes and then token IDs
+
+ At initialization, it pre-computes the phonemes under `cache_path` and loads them in training to reduce data
+ loading latency. If `cache_path` is already present, it skips the pre-computation.
+
+ Args:
+ samples (Union[List[List], List[Dict]]):
+ List of samples. Each sample is a list or a dict.
+
+ tokenizer (TTSTokenizer):
+ Tokenizer to convert input text to phonemes.
+
+ cache_path (str):
+ Path to cache phonemes. If `cache_path` is already present or None, it skips the pre-computation.
+
+ precompute_num_workers (int):
+ Number of workers used for pre-computing the phonemes. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ samples: Union[List[Dict], List[List]],
+ tokenizer: "TTSTokenizer",
+ cache_path: str,
+ precompute_num_workers=0,
+ ):
+ self.samples = samples
+ self.tokenizer = tokenizer
+ self.cache_path = cache_path
+ if cache_path is not None and not os.path.exists(cache_path):
+ os.makedirs(cache_path)
+ self.precompute(precompute_num_workers)
+
+ def __getitem__(self, index):
+ item = self.samples[index]
+ ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
+ ph_hat = self.tokenizer.ids_to_text(ids)
+ return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
+
+ def __len__(self):
+ return len(self.samples)
+
+ def compute_or_load(self, file_name, text, language):
+ """Compute phonemes for the given text.
+
+ If the phonemes are already cached, load them from cache.
+ """
+ file_ext = "_phoneme.npy"
+ cache_path = os.path.join(self.cache_path, file_name + file_ext)
+ try:
+ ids = np.load(cache_path)
+ except FileNotFoundError:
+ ids = self.tokenizer.text_to_ids(text, language=language)
+ np.save(cache_path, ids)
+ return ids
+
+ def get_pad_id(self):
+ """Get pad token ID for sequence padding"""
+ return self.tokenizer.pad_id
+
+ def precompute(self, num_workers=1):
+ """Precompute phonemes for all samples.
+
+ We use a PyTorch DataLoader to parallelize the computation across workers.
+ """
+ print("[*] Pre-computing phonemes...")
+ with tqdm.tqdm(total=len(self)) as pbar:
+ batch_size = num_workers if num_workers > 0 else 1
+ dataloader = torch.utils.data.DataLoader(
+ batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
+ )
+ for _ in dataloader:
+ pbar.update(batch_size)
+
+ def collate_fn(self, batch):
+ ids = [item["token_ids"] for item in batch]
+ ids_lens = [item["token_ids_len"] for item in batch]
+ texts = [item["text"] for item in batch]
+ texts_hat = [item["ph_hat"] for item in batch]
+ ids_lens_max = max(ids_lens)
+ ids_torch = torch.LongTensor(len(ids), ids_lens_max).fill_(self.get_pad_id())
+ for i, ids_len in enumerate(ids_lens):
+ ids_torch[i, :ids_len] = torch.LongTensor(ids[i])
+ return {"text": texts, "ph_hat": texts_hat, "token_ids": ids_torch}
+
+ def print_logs(self, level: int = 0) -> None:
+ indent = "\t" * level
+ print("\n")
+ print(f"{indent}> PhonemeDataset ")
+ print(f"{indent}| > Tokenizer:")
+ self.tokenizer.print_logs(level + 1)
+ print(f"{indent}| > Number of instances : {len(self.samples)}")
+
+
+class F0Dataset:
+ """F0 Dataset for computing F0 from wav files in CPU
+
+ Pre-computes F0 values for all the samples at initialization if `cache_path` is set and not already present. It
+ also computes the mean and std of the F0 values if `normalize_f0` is True.
+
+ Args:
+ samples (Union[List[List], List[Dict]]):
+ List of samples. Each sample is a list or a dict.
+
+ ap (AudioProcessor):
+ AudioProcessor to compute F0 from wav files.
+
+ cache_path (str):
+ Path to cache F0 values. If `cache_path` is already present or None, it skips the pre-computation.
+ Defaults to None.
+
+ precompute_num_workers (int):
+ Number of workers used for pre-computing the F0 values. Defaults to 0.
+
+ normalize_f0 (bool):
+ Whether to normalize F0 values by mean and std. Defaults to True.
+ """
+
+ def __init__(
+ self,
+ samples: Union[List[List], List[Dict]],
+ ap: "AudioProcessor",
+ audio_config=None, # pylint: disable=unused-argument
+ verbose=False,
+ cache_path: str = None,
+ precompute_num_workers=0,
+ normalize_f0=True,
+ ):
+ self.samples = samples
+ self.ap = ap
+ self.verbose = verbose
+ self.cache_path = cache_path
+ self.normalize_f0 = normalize_f0
+ self.pad_id = 0.0
+ self.mean = None
+ self.std = None
+ if cache_path is not None and not os.path.exists(cache_path):
+ os.makedirs(cache_path)
+ self.precompute(precompute_num_workers)
+ if normalize_f0:
+ self.load_stats(cache_path)
+
+ def __getitem__(self, idx):
+ item = self.samples[idx]
+ f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
+ if self.normalize_f0:
+ assert self.mean is not None and self.std is not None, " [!] Mean and std are not available"
+ f0 = self.normalize(f0)
+ return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
+
+ def __len__(self):
+ return len(self.samples)
+
+ def precompute(self, num_workers=0):
+ print("[*] Pre-computing F0s...")
+ with tqdm.tqdm(total=len(self)) as pbar:
+ batch_size = num_workers if num_workers > 0 else 1
+ # we do not normalize during pre-computation
+ normalize_f0 = self.normalize_f0
+ self.normalize_f0 = False
+ dataloader = torch.utils.data.DataLoader(
+ batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
+ )
+ computed_data = []
+ for batch in dataloader:
+ f0 = batch["f0"]
+ computed_data.append(f for f in f0)
+ pbar.update(batch_size)
+ self.normalize_f0 = normalize_f0
+
+ if self.normalize_f0:
+ computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
+ pitch_mean, pitch_std = self.compute_pitch_stats(computed_data)
+ pitch_stats = {"mean": pitch_mean, "std": pitch_std}
+ np.save(os.path.join(self.cache_path, "pitch_stats"), pitch_stats, allow_pickle=True)
+
+ def get_pad_id(self):
+ return self.pad_id
+
+ @staticmethod
+ def create_pitch_file_path(file_name, cache_path):
+ pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
+ return pitch_file
+
+ @staticmethod
+ def _compute_and_save_pitch(ap, wav_file, pitch_file=None):
+ wav = ap.load_wav(wav_file)
+ pitch = ap.compute_f0(wav)
+ if pitch_file:
+ np.save(pitch_file, pitch)
+ return pitch
+
+ @staticmethod
+ def compute_pitch_stats(pitch_vecs):
+ nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs])
+ mean, std = np.mean(nonzeros), np.std(nonzeros)
+ return mean, std
+
+ def load_stats(self, cache_path):
+ stats_path = os.path.join(cache_path, "pitch_stats.npy")
+ stats = np.load(stats_path, allow_pickle=True).item()
+ self.mean = stats["mean"].astype(np.float32)
+ self.std = stats["std"].astype(np.float32)
+
+ def normalize(self, pitch):
+ zero_idxs = np.where(pitch == 0.0)[0]
+ pitch = pitch - self.mean
+ pitch = pitch / self.std
+ pitch[zero_idxs] = 0.0
+ return pitch
+
+ def denormalize(self, pitch):
+ zero_idxs = np.where(pitch == 0.0)[0]
+ pitch *= self.std
+ pitch += self.mean
+ pitch[zero_idxs] = 0.0
+ return pitch
+
+ def compute_or_load(self, wav_file, audio_unique_name):
+ """
+ compute pitch and return a numpy array of pitch values
+ """
+ pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
+ if not os.path.exists(pitch_file):
+ pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
+ else:
+ pitch = np.load(pitch_file)
+ return pitch.astype(np.float32)
+
+ def collate_fn(self, batch):
+ audio_unique_name = [item["audio_unique_name"] for item in batch]
+ f0s = [item["f0"] for item in batch]
+ f0_lens = [len(item["f0"]) for item in batch]
+ f0_lens_max = max(f0_lens)
+ f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
+ for i, f0_len in enumerate(f0_lens):
+ f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
+ return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
+
+ def print_logs(self, level: int = 0) -> None:
+ indent = "\t" * level
+ print("\n")
+ print(f"{indent}> F0Dataset ")
+ print(f"{indent}| > Number of instances : {len(self.samples)}")
+
+
+class EnergyDataset:
+ """Energy Dataset for computing Energy from wav files in CPU
+
+ Pre-computes energy values for all the samples at initialization if `cache_path` is set and not already present. It
+ also computes the mean and std of the energy values if `normalize_energy` is True.
+
+ Args:
+ samples (Union[List[List], List[Dict]]):
+ List of samples. Each sample is a list or a dict.
+
+ ap (AudioProcessor):
+ AudioProcessor to compute Energy from wav files.
+
+ cache_path (str):
+ Path to cache Energy values. If `cache_path` is already present or None, it skips the pre-computation.
+ Defaults to None.
+
+ precompute_num_workers (int):
+ Number of workers used for pre-computing the Energy values. Defaults to 0.
+
+ normalize_energy (bool):
+ Whether to normalize Energy values by mean and std. Defaults to True.
+ """
+
+ def __init__(
+ self,
+ samples: Union[List[List], List[Dict]],
+ ap: "AudioProcessor",
+ verbose=False,
+ cache_path: str = None,
+ precompute_num_workers=0,
+ normalize_energy=True,
+ ):
+ self.samples = samples
+ self.ap = ap
+ self.verbose = verbose
+ self.cache_path = cache_path
+ self.normalize_energy = normalize_energy
+ self.pad_id = 0.0
+ self.mean = None
+ self.std = None
+ if cache_path is not None and not os.path.exists(cache_path):
+ os.makedirs(cache_path)
+ self.precompute(precompute_num_workers)
+ if normalize_energy:
+ self.load_stats(cache_path)
+
+ def __getitem__(self, idx):
+ item = self.samples[idx]
+ energy = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
+ if self.normalize_energy:
+ assert self.mean is not None and self.std is not None, " [!] Mean and std are not available"
+ energy = self.normalize(energy)
+ return {"audio_unique_name": item["audio_unique_name"], "energy": energy}
+
+ def __len__(self):
+ return len(self.samples)
+
+ def precompute(self, num_workers=0):
+ print("[*] Pre-computing energys...")
+ with tqdm.tqdm(total=len(self)) as pbar:
+ batch_size = num_workers if num_workers > 0 else 1
+ # we do not normalize during pre-computation
+ normalize_energy = self.normalize_energy
+ self.normalize_energy = False
+ dataloader = torch.utils.data.DataLoader(
+ batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
+ )
+ computed_data = []
+ for batch in dataloader:
+ energy = batch["energy"]
+ computed_data.append(e for e in energy)
+ pbar.update(batch_size)
+ self.normalize_energy = normalize_energy
+
+ if self.normalize_energy:
+ computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
+ energy_mean, energy_std = self.compute_energy_stats(computed_data)
+ energy_stats = {"mean": energy_mean, "std": energy_std}
+ np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True)
+
+ def get_pad_id(self):
+ return self.pad_id
+
+ @staticmethod
+ def create_energy_file_path(wav_file, cache_path):
+ file_name = os.path.splitext(os.path.basename(wav_file))[0]
+ energy_file = os.path.join(cache_path, file_name + "_energy.npy")
+ return energy_file
+
+ @staticmethod
+ def _compute_and_save_energy(ap, wav_file, energy_file=None):
+ wav = ap.load_wav(wav_file)
+ energy = calculate_energy(wav, fft_size=ap.fft_size, hop_length=ap.hop_length, win_length=ap.win_length)
+ if energy_file:
+ np.save(energy_file, energy)
+ return energy
+
+ @staticmethod
+ def compute_energy_stats(energy_vecs):
+ nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs])
+ mean, std = np.mean(nonzeros), np.std(nonzeros)
+ return mean, std
+
+ def load_stats(self, cache_path):
+ stats_path = os.path.join(cache_path, "energy_stats.npy")
+ stats = np.load(stats_path, allow_pickle=True).item()
+ self.mean = stats["mean"].astype(np.float32)
+ self.std = stats["std"].astype(np.float32)
+
+ def normalize(self, energy):
+ zero_idxs = np.where(energy == 0.0)[0]
+ energy = energy - self.mean
+ energy = energy / self.std
+ energy[zero_idxs] = 0.0
+ return energy
+
+ def denormalize(self, energy):
+ zero_idxs = np.where(energy == 0.0)[0]
+ energy *= self.std
+ energy += self.mean
+ energy[zero_idxs] = 0.0
+ return energy
+
+ def compute_or_load(self, wav_file, audio_unique_name):
+ """
+ compute energy and return a numpy array of energy values
+ """
+ energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path)
+ if not os.path.exists(energy_file):
+ energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
+ else:
+ energy = np.load(energy_file)
+ return energy.astype(np.float32)
+
+ def collate_fn(self, batch):
+ audio_unique_name = [item["audio_unique_name"] for item in batch]
+ energies = [item["energy"] for item in batch]
+ energy_lens = [len(item["energy"]) for item in batch]
+ energy_lens_max = max(energy_lens)
+ energies_torch = torch.LongTensor(len(energies), energy_lens_max).fill_(self.get_pad_id())
+ for i, energy_len in enumerate(energy_lens):
+ energies_torch[i, :energy_len] = torch.LongTensor(energies[i])
+ return {"audio_unique_name": audio_unique_name, "energy": energies_torch, "energy_lens": energy_lens}
+
+ def print_logs(self, level: int = 0) -> None:
+ indent = "\t" * level
+ print("\n")
+ print(f"{indent}> energyDataset ")
+ print(f"{indent}| > Number of instances : {len(self.samples)}")
diff --git a/submodules/TTS/TTS/tts/datasets/formatters.py b/submodules/TTS/TTS/tts/datasets/formatters.py
new file mode 100644
index 0000000000000000000000000000000000000000..053444b0c1010d5a93970cc26b2c225e305279a9
--- /dev/null
+++ b/submodules/TTS/TTS/tts/datasets/formatters.py
@@ -0,0 +1,655 @@
+import os
+import re
+import xml.etree.ElementTree as ET
+from glob import glob
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+from tqdm import tqdm
+
+########################
+# DATASETS
+########################
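+#
+# Each formatter below takes (root_path, meta_file, ...) and returns a list of
+# sample dicts whose keys include "audio_file", "speaker_name" and "root_path",
+# plus "text" and/or "emotion_name" where available. The VoxCeleb helpers at the
+# bottom return "text|path|speaker" lists instead.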
+
+
+def cml_tts(root_path, meta_file, ignored_speakers=None):
+ """Normalizes the CML-TTS meta data file to TTS format
+ https://github.com/freds0/CML-TTS-Dataset/"""
+ filepath = os.path.join(root_path, meta_file)
+ # check that every line has the same number of columns as the header row
+ with open(filepath, "r", encoding="utf8") as f:
+ lines = f.readlines()
+ num_cols = len(lines[0].split("|")) # take the first row as reference
+ for idx, line in enumerate(lines[1:]):
+ if len(line.split("|")) != num_cols:
+ print(f" > Missing column in line {idx + 1} -> {line.strip()}")
+ # load metadata
+ metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
+ assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
+ client_id = None if "client_id" in metadata.columns else "default"
+ emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+ items = []
+ not_found_counter = 0
+ for row in metadata.itertuples():
+ if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
+ continue
+ audio_path = os.path.join(root_path, row.wav_filename)
+ if not os.path.exists(audio_path):
+ not_found_counter += 1
+ continue
+ items.append(
+ {
+ "text": row.transcript,
+ "audio_file": audio_path,
+ "speaker_name": client_id if client_id is not None else row.client_id,
+ "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+ "root_path": root_path,
+ }
+ )
+ if not_found_counter > 0:
+ print(f" | > [!] {not_found_counter} files not found")
+ return items
+
+
+def coqui(root_path, meta_file, ignored_speakers=None):
+ """Interal dataset formatter."""
+ filepath = os.path.join(root_path, meta_file)
+ # check that every line has the same number of columns as the header row
+ with open(filepath, "r", encoding="utf8") as f:
+ lines = f.readlines()
+ num_cols = len(lines[0].split("|")) # take the first row as reference
+ for idx, line in enumerate(lines[1:]):
+ if len(line.split("|")) != num_cols:
+ print(f" > Missing column in line {idx + 1} -> {line.strip()}")
+ # load metadata
+ metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
+ assert all(x in metadata.columns for x in ["audio_file", "text"])
+ speaker_name = None if "speaker_name" in metadata.columns else "coqui"
+ emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+ items = []
+ not_found_counter = 0
+ for row in metadata.itertuples():
+ if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
+ continue
+ audio_path = os.path.join(root_path, row.audio_file)
+ if not os.path.exists(audio_path):
+ not_found_counter += 1
+ continue
+ items.append(
+ {
+ "text": row.text,
+ "audio_file": audio_path,
+ "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
+ "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+ "root_path": root_path,
+ }
+ )
+ if not_found_counter > 0:
+ print(f" | > [!] {not_found_counter} files not found")
+ return items
+
+
+def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalize TWEB dataset.
+ https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
+ """
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "tweb"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("\t")
+ wav_file = os.path.join(root_path, cols[0] + ".wav")
+ text = cols[1]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes Mozilla meta data files to TTS format"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "mozilla"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = cols[1].strip()
+ text = cols[0].strip()
+ wav_file = os.path.join(root_path, "wavs", wav_file)
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes Mozilla meta data files to TTS format"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "mozilla"
+ with open(txt_file, "r", encoding="ISO 8859-1") as ttf:
+ for line in ttf:
+ cols = line.strip().split("|")
+ wav_file = cols[0].strip()
+ text = cols[1].strip()
+ folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL"
+ wav_file = os.path.join(root_path, folder_name, wav_file)
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def mailabs(root_path, meta_files=None, ignored_speakers=None):
+ """Normalizes M-AI-Labs meta data files to TTS format
+
+ Args:
+ root_path (str): root folder of the MAILAB language folder.
+ meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
+ recursively. Defaults to None
+ """
+ speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
+ if not meta_files:
+ csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
+ else:
+ csv_files = meta_files
+
+ # meta_files = [f.strip() for f in meta_files.split(",")]
+ items = []
+ for csv_file in csv_files:
+ if os.path.isfile(csv_file):
+ txt_file = csv_file
+ else:
+ txt_file = os.path.join(root_path, csv_file)
+
+ folder = os.path.dirname(txt_file)
+ # determine speaker based on folder structure...
+ speaker_name_match = speaker_regex.search(txt_file)
+ if speaker_name_match is None:
+ continue
+ speaker_name = speaker_name_match.group("speaker_name")
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_name in ignored_speakers:
+ continue
+ print(" | > {}".format(csv_file))
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ if not meta_files:
+ wav_file = os.path.join(folder, "wavs", cols[0] + ".wav")
+ else:
+ wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), "wavs", cols[0] + ".wav")
+ if os.path.isfile(wav_file):
+ text = cols[1].strip()
+ items.append(
+ {"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}
+ )
+ else:
+ # M-AI-Labs have some missing samples, so just print the warning
+ print("> File %s does not exist!" % (wav_file))
+ return items
+
+
+def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the LJSpeech meta data file to TTS format
+ https://keithito.com/LJ-Speech-Dataset/"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "ljspeech"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+ text = cols[2]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
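+# Expected LJSpeech metadata.csv layout (illustrative):
+#   LJ001-0001|<raw transcription>|<normalized transcription>
+# The formatter keeps the normalized text (cols[2]) and resolves the audio to
+# <root_path>/wavs/LJ001-0001.wav.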
+
+
+def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the LJSpeech meta data file for TTS testing
+ https://keithito.com/LJ-Speech-Dataset/"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ speaker_id = 0
+ for idx, line in enumerate(ttf):
+ # 2 samples per speaker to avoid eval split issues
+ if idx % 2 == 0:
+ speaker_id += 1
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+ text = cols[2]
+ items.append(
+ {"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{speaker_id}", "root_path": root_path}
+ )
+ return items
+
+
+def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the thorsten meta data file to TTS format
+ https://github.com/thorstenMueller/deep-learning-german-tts/"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "thorsten"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+ text = cols[1]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the sam-accenture meta data file to TTS format
+ https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
+ xml_file = os.path.join(root_path, "voice_over_recordings", meta_file)
+ xml_root = ET.parse(xml_file).getroot()
+ items = []
+ speaker_name = "sam_accenture"
+ for item in xml_root.findall("./fileid"):
+ text = item.text
+ wav_file = os.path.join(root_path, "vo_voice_quality_transformation", item.get("id") + ".wav")
+ if not os.path.exists(wav_file):
+ print(f" [!] {wav_file} in metafile does not exist. Skipping...")
+ continue
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the RUSLAN meta data file to TTS format
+ https://ruslan-corpus.github.io/"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "ruslan"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav")
+ text = cols[1]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the CSS10 dataset file to TTS format"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "css10"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, cols[0])
+ text = cols[1]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Normalizes the Nancy meta data file to TTS format"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "nancy"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ utt_id = line.split()[1]
+ text = line[line.find('"') + 1 : line.rfind('"') - 1]
+ wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def common_voice(root_path, meta_file, ignored_speakers=None):
+ """Normalize the common voice meta data file to TTS format."""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ if line.startswith("client_id"):
+ continue
+ cols = line.split("\t")
+ text = cols[2]
+ speaker_name = cols[0]
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_name in ignored_speakers:
+ continue
+ wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav"))
+ items.append(
+ {"text": text, "audio_file": wav_file, "speaker_name": "MCV_" + speaker_name, "root_path": root_path}
+ )
+ return items
+
+
+def libri_tts(root_path, meta_files=None, ignored_speakers=None):
+ """https://ai.google/tools/datasets/libri-tts/"""
+ items = []
+ if not meta_files:
+ meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
+ else:
+ if isinstance(meta_files, str):
+ meta_files = [os.path.join(root_path, meta_files)]
+
+ for meta_file in meta_files:
+ _meta_file = os.path.basename(meta_file).split(".")[0]
+ with open(meta_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("\t")
+ file_name = cols[0]
+ speaker_name, chapter_id, *_ = cols[0].split("_")
+ _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
+ wav_file = os.path.join(_root_path, file_name + ".wav")
+ text = cols[2]
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_name in ignored_speakers:
+ continue
+ items.append(
+ {
+ "text": text,
+ "audio_file": wav_file,
+ "speaker_name": f"LTTS_{speaker_name}",
+ "root_path": root_path,
+ }
+ )
+ for item in items:
+ assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}"
+ return items
+
+
+def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "turkish-female"
+ skipped_files = []
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav")
+ if not os.path.exists(wav_file):
+ skipped_files.append(wav_file)
+ continue
+ text = cols[1].strip()
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
+ return items
+
+
+# ToDo: add the dataset link when the dataset is released publicly
+def brspeech(root_path, meta_file, ignored_speakers=None):
+ """BRSpeech 3.0 beta"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ if line.startswith("wav_filename"):
+ continue
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, cols[0])
+ text = cols[2]
+ speaker_id = cols[3]
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_id in ignored_speakers:
+ continue
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_id, "root_path": root_path})
+ return items
+
+
+def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic1", ignored_speakers=None):
+ """VCTK dataset v0.92.
+
+ URL:
+ https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
+
+ This dataset has 2 recordings per speaker that are annotated with ```mic1``` and ```mic2```.
+ It is believed that (😄 ) ```mic1``` files are the same as the previous version of the dataset.
+
+ mic1:
+ Audio recorded using an omni-directional microphone (DPA 4035).
+ Contains very low frequency noises.
+ This is the same audio released in previous versions of VCTK:
+ https://doi.org/10.7488/ds/1994
+
+ mic2:
+ Audio recorded using a small diaphragm condenser microphone with
+ very wide bandwidth (Sennheiser MKH 800).
+ Two speakers, p280 and p315, had technical issues with the audio
+ recordings made using the MKH 800.
+ """
+ file_ext = "flac"
+ items = []
+ meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
+ for meta_file in meta_files:
+ _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
+ file_id = txt_file.split(".")[0]
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_id in ignored_speakers:
+ continue
+ with open(meta_file, "r", encoding="utf-8") as file_text:
+ text = file_text.readlines()[0]
+ # p280 has no mic2 recordings
+ if speaker_id == "p280":
+ wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_mic1.{file_ext}")
+ else:
+ wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}")
+ if os.path.exists(wav_file):
+ items.append(
+ {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path}
+ )
+ else:
+ print(f" [!] wav files don't exist - {wav_file}")
+ return items
+
+
+def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None):
+ """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
+ items = []
+ meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
+ for meta_file in meta_files:
+ _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
+ file_id = txt_file.split(".")[0]
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_id in ignored_speakers:
+ continue
+ with open(meta_file, "r", encoding="utf-8") as file_text:
+ text = file_text.readlines()[0]
+ wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav")
+ items.append(
+ {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id, "root_path": root_path}
+ )
+ return items
+
+
+def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-argument
+ items = []
+ speaker_name = "synpaflex"
+ root_path = os.path.join(root_path, "")
+ wav_files = glob(f"{root_path}**/*.wav", recursive=True)
+ for wav_file in wav_files:
+ if os.sep + "wav" + os.sep in wav_file:
+ txt_file = wav_file.replace("wav", "txt")
+ else:
+ txt_file = os.path.join(
+ os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt")
+ )
+ if os.path.exists(txt_file) and os.path.exists(wav_file):
+ with open(txt_file, "r", encoding="utf-8") as file_text:
+ text = file_text.readlines()[0]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None):
+ """ToDo: Refer the paper when available"""
+ items = []
+ split_dir = meta_files
+ meta_files = glob(f"{os.path.join(root_path, split_dir)}/**/*.txt", recursive=True)
+ for meta_file in meta_files:
+ _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
+ file_id = txt_file.split(".")[0]
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_id in ignored_speakers:
+ continue
+ with open(meta_file, "r", encoding="utf-8") as file_text:
+ text = file_text.readline().replace("\n", "")
+ # ignore sentences that contains digits
+ if ignore_digits_sentences and any(map(str.isdigit, text)):
+ continue
+ wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac")
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id, "root_path": root_path})
+ return items
+
+
+def mls(root_path, meta_files=None, ignored_speakers=None):
+ """http://www.openslr.org/94/"""
+ items = []
+ with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta:
+ for line in meta:
+ file, text = line.split("\t")
+ text = text[:-1]
+ speaker, book, *_ = file.split("_")
+ wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker in ignored_speakers:
+ continue
+ items.append(
+ {"text": text, "audio_file": wav_file, "speaker_name": "MLS_" + speaker, "root_path": root_path}
+ )
+ return items
+
+
+# ======================================== VOX CELEB ===========================================
+def voxceleb2(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument
+ """
+ :param meta_file Used only for consistency with load_tts_samples api
+ """
+ return _voxcel_x(root_path, meta_file, voxcel_idx="2")
+
+
+def voxceleb1(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument
+ """
+ :param meta_file Used only for consistency with load_tts_samples api
+ """
+ return _voxcel_x(root_path, meta_file, voxcel_idx="1")
+
+
+def _voxcel_x(root_path, meta_file, voxcel_idx):
+ assert voxcel_idx in ["1", "2"]
+ expected_count = 148_000 if voxcel_idx == "1" else 1_000_000
+ voxceleb_path = Path(root_path)
+ cache_to = voxceleb_path / f"metafile_voxceleb{voxcel_idx}.csv"
+ cache_to.parent.mkdir(exist_ok=True)
+
+ # if the meta file does not exist, crawl recursively for 'wav' files
+ if meta_file is not None:
+ with open(str(meta_file), "r", encoding="utf-8") as f:
+ return [x.strip().split("|") for x in f.readlines()]
+
+ elif not cache_to.exists():
+ cnt = 0
+ meta_data = []
+ wav_files = voxceleb_path.rglob("**/*.wav")
+ for path in tqdm(
+ wav_files,
+ desc=f"Building VoxCeleb {voxcel_idx} Meta file ... this needs to be done only once.",
+ total=expected_count,
+ ):
+ speaker_id = str(Path(path).parent.parent.stem)
+ assert speaker_id.startswith("id")
+ text = None # VoxCeleb does not provide transcriptions, and they are not needed for training the speaker encoder (SE)
+ meta_data.append(f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n")
+ cnt += 1
+ with open(str(cache_to), "w", encoding="utf-8") as f:
+ f.write("".join(meta_data))
+ if cnt < expected_count:
+ raise ValueError(f"Found too few instances for Voxceleb. Should be around {expected_count}, is: {cnt}")
+
+ with open(str(cache_to), "r", encoding="utf-8") as f:
+ return [x.strip().split("|") for x in f.readlines()]
+
+
+def emotion(root_path, meta_file, ignored_speakers=None):
+ """Generic emotion dataset"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ if line.startswith("file_path"):
+ continue
+ cols = line.split(",")
+ wav_file = os.path.join(root_path, cols[0])
+ speaker_id = cols[1]
+ emotion_id = cols[2].replace("\n", "")
+ # ignore speakers
+ if isinstance(ignored_speakers, list):
+ if speaker_id in ignored_speakers:
+ continue
+ items.append(
+ {"audio_file": wav_file, "speaker_name": speaker_id, "emotion_name": emotion_id, "root_path": root_path}
+ )
+ return items
+
+
+def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument
+ """Normalizes the Baker meta data file to TTS format
+
+ Args:
+ root_path (str): path to the baker dataset
+ meta_file (str): name of the meta file listing the wav files to use and the transcript of each sentence
+ Returns:
+ List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentence
+ """
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "baker"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ wav_name, text = line.rstrip("\n").split("|")
+ wav_path = os.path.join(root_path, "clips_22", wav_name)
+ items.append({"text": text, "audio_file": wav_path, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "kokoro"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+ text = cols[2].replace(" ", "")
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ """Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "kss"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, cols[0])
+ text = cols[2] # cols[1] => 6월, cols[2] => 유월
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
+
+
+def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "bel_tts"
+ with open(txt_file, "r", encoding="utf-8") as ttf:
+ for line in ttf:
+ cols = line.split("|")
+ wav_file = os.path.join(root_path, cols[0])
+ text = cols[1]
+ items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
+ return items
diff --git a/submodules/TTS/TTS/tts/layers/__init__.py b/submodules/TTS/TTS/tts/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f93efdb7fc41109ec3497d8e5e37ba05b0a4315e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/__init__.py
@@ -0,0 +1 @@
+from TTS.tts.layers.losses import *
diff --git a/submodules/TTS/TTS/tts/layers/align_tts/__init__.py b/submodules/TTS/TTS/tts/layers/align_tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/align_tts/duration_predictor.py b/submodules/TTS/TTS/tts/layers/align_tts/duration_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2b83894cc3f87575a89ea8fd7bf4a584ca22c28
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/align_tts/duration_predictor.py
@@ -0,0 +1,21 @@
+from torch import nn
+
+from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
+from TTS.tts.layers.generic.transformer import FFTransformerBlock
+
+
+class DurationPredictor(nn.Module):
+ def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
+ super().__init__()
+ self.embed = nn.Embedding(num_chars, hidden_channels)
+ self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
+ self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
+ self.out_layer = nn.Conv1d(hidden_channels, 1, 1)
+
+ def forward(self, text, text_lengths):
+ # B, L -> B, L
+ emb = self.embed(text)
+ emb = self.pos_enc(emb.transpose(1, 2))
+ x = self.FFT(emb, text_lengths)
+ x = self.out_layer(x).squeeze(-1)
+ return x
diff --git a/submodules/TTS/TTS/tts/layers/align_tts/mdn.py b/submodules/TTS/TTS/tts/layers/align_tts/mdn.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdb332524bf7a5fec6a23da9e7977de6325a0324
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/align_tts/mdn.py
@@ -0,0 +1,30 @@
+from torch import nn
+
+
+class MDNBlock(nn.Module):
+ """Mixture of Density Network implementation
+ https://arxiv.org/pdf/2003.01950.pdf
+ """
+
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.out_channels = out_channels
+ self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
+ self.norm = nn.LayerNorm(in_channels)
+ self.relu = nn.ReLU()
+ self.dropout = nn.Dropout(0.1)
+ self.conv2 = nn.Conv1d(in_channels, out_channels, 1)
+
+ def forward(self, x):
+ o = self.conv1(x)
+ o = o.transpose(1, 2)
+ o = self.norm(o)
+ o = o.transpose(1, 2)
+ o = self.relu(o)
+ o = self.dropout(o)
+ mu_sigma = self.conv2(o)
+ # TODO: check this sigmoid
+ # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
+ mu = mu_sigma[:, : self.out_channels // 2, :]
+ log_sigma = mu_sigma[:, self.out_channels // 2 :, :]
+ return mu, log_sigma
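+
+# Shape sketch (illustrative): with in_channels=C and out_channels=2 * D, a
+# (B, C, T) input yields `mu` and `log_sigma` of shape (B, D, T) each, split
+# from the 2*D output channels of the final 1x1 convolution.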
diff --git a/submodules/TTS/TTS/tts/layers/bark/__init__.py b/submodules/TTS/TTS/tts/layers/bark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/bark/hubert/__init__.py b/submodules/TTS/TTS/tts/layers/bark/hubert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/bark/hubert/hubert_manager.py b/submodules/TTS/TTS/tts/layers/bark/hubert/hubert_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bc199294164da0e8c480e292dd5a478e72f4daf
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/hubert/hubert_manager.py
@@ -0,0 +1,35 @@
+# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
+
+import os.path
+import shutil
+import urllib.request
+
+import huggingface_hub
+
+
+class HubertManager:
+ @staticmethod
+ def make_sure_hubert_installed(
+ download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
+ ):
+ if not os.path.isfile(model_path):
+ print("Downloading HuBERT base model")
+ urllib.request.urlretrieve(download_url, model_path)
+ print("Downloaded HuBERT")
+ return model_path
+ return None
+
+ @staticmethod
+ def make_sure_tokenizer_installed(
+ model: str = "quantifier_hubert_base_ls960_14.pth",
+ repo: str = "GitMylo/bark-voice-cloning",
+ model_path: str = "",
+ ):
+ model_dir = os.path.dirname(model_path)
+ if not os.path.isfile(model_path):
+ print("Downloading HuBERT custom tokenizer")
+ huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
+ shutil.move(os.path.join(model_dir, model), model_path)
+ print("Downloaded tokenizer")
+ return model_path
+ return None
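+
+
+# A hedged usage sketch (the local paths below are hypothetical, not shipped defaults); each call
+# downloads only when the target file is missing and returns None otherwise:
+#
+#     HubertManager.make_sure_hubert_installed(model_path="models/hubert_base_ls960.pt")
+#     HubertManager.make_sure_tokenizer_installed(model_path="models/hubert_tokenizer.pth")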
diff --git a/submodules/TTS/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/submodules/TTS/TTS/tts/layers/bark/hubert/kmeans_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6a3b9aeb1111ca0abeccb6142007ecc5b39d78d
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/hubert/kmeans_hubert.py
@@ -0,0 +1,82 @@
+"""
+Modified HuBERT model without kmeans.
+Original author: https://github.com/lucidrains/
+Modified by: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+import logging
+from pathlib import Path
+
+import torch
+from einops import pack, unpack
+from torch import nn
+from torchaudio.functional import resample
+from transformers import HubertModel
+
+
+def round_down_nearest_multiple(num, divisor):
+ return num // divisor * divisor
+
+
+def curtail_to_multiple(t, mult, from_left=False):
+ data_len = t.shape[-1]
+ rounded_seq_len = round_down_nearest_multiple(data_len, mult)
+ seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
+ return t[..., seq_slice]
+
+
+def exists(val):
+ return val is not None
+
+
+def default(val, d):
+ return val if exists(val) else d
+
+
+class CustomHubert(nn.Module):
+ """
+ checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+ or you can train your own
+ """
+
+ def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
+ super().__init__()
+ self.target_sample_hz = target_sample_hz
+ self.seq_len_multiple_of = seq_len_multiple_of
+ self.output_layer = output_layer
+ if device is not None:
+ self.to(device)
+ self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
+ if device is not None:
+ self.model.to(device)
+ self.model.eval()
+
+ @property
+ def groups(self):
+ return 1
+
+ @torch.no_grad()
+ def forward(self, wav_input, flatten=True, input_sample_hz=None):
+ device = wav_input.device
+
+ if exists(input_sample_hz):
+ wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+ if exists(self.seq_len_multiple_of):
+ wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+ outputs = self.model.forward(
+ wav_input,
+ output_hidden_states=True,
+ )
+ embed = outputs["hidden_states"][self.output_layer]
+ embed, packed_shape = pack([embed], "* d")
+ codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
+ if flatten:
+ return codebook_indices
+
+ (codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
+ return codebook_indices
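+
+
+# A hedged usage sketch (paths are hypothetical; assumes `import torchaudio` for loading the clip).
+# Note that this modified CustomHubert loads its weights from the "facebook/hubert-base-ls960"
+# Hugging Face checkpoint; `checkpoint_path` is kept for API compatibility:
+#
+#     hubert = CustomHubert(checkpoint_path="models/hubert_base_ls960.pt", device="cpu")
+#     wav, sr = torchaudio.load("speaker.wav")           # wav: [1, T] mono waveform
+#     features = hubert(wav, input_sample_hz=sr)         # flattened per-frame HuBERT features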
diff --git a/submodules/TTS/TTS/tts/layers/bark/hubert/tokenizer.py b/submodules/TTS/TTS/tts/layers/bark/hubert/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3070241f1cc1ac95867f2d4173495b9a7047a15e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/hubert/tokenizer.py
@@ -0,0 +1,195 @@
+"""
+Custom tokenizer model.
+Author: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+import json
+import os.path
+from zipfile import ZipFile
+
+import numpy
+import torch
+from torch import nn, optim
+
+
+class HubertTokenizer(nn.Module):
+ def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
+ super().__init__()
+ next_size = input_size
+ if version == 0:
+ self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+ next_size = hidden_size
+ if version == 1:
+ self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+ self.intermediate = nn.Linear(hidden_size, 4096)
+ next_size = 4096
+
+ self.fc = nn.Linear(next_size, output_size)
+ self.softmax = nn.LogSoftmax(dim=1)
+ self.optimizer: optim.Optimizer = None
+ self.lossfunc = nn.CrossEntropyLoss()
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.output_size = output_size
+ self.version = version
+
+ def forward(self, x):
+ x, _ = self.lstm(x)
+ if self.version == 1:
+ x = self.intermediate(x)
+ x = self.fc(x)
+ x = self.softmax(x)
+ return x
+
+ @torch.no_grad()
+ def get_token(self, x):
+ """
+        Used to get the predicted token index for each input feature vector.
+ :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
+ :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
+ """
+ return torch.argmax(self(x), dim=1)
+
+ def prepare_training(self):
+ self.optimizer = optim.Adam(self.parameters(), 0.001)
+
+ def train_step(self, x_train, y_train, log_loss=False):
+ # y_train = y_train[:-1]
+ # y_train = y_train[1:]
+
+ optimizer = self.optimizer
+ lossfunc = self.lossfunc
+ # Zero the gradients
+ self.zero_grad()
+
+ # Forward pass
+ y_pred = self(x_train)
+
+ y_train_len = len(y_train)
+ y_pred_len = y_pred.shape[0]
+
+ if y_train_len > y_pred_len:
+ diff = y_train_len - y_pred_len
+ y_train = y_train[diff:]
+ elif y_train_len < y_pred_len:
+ diff = y_pred_len - y_train_len
+ y_pred = y_pred[:-diff, :]
+
+ y_train_hot = torch.zeros(len(y_train), self.output_size)
+ y_train_hot[range(len(y_train)), y_train] = 1
+ y_train_hot = y_train_hot.to("cuda")
+
+ # Calculate the loss
+ loss = lossfunc(y_pred, y_train_hot)
+
+ # Print loss
+ if log_loss:
+ print("Loss", loss.item())
+
+ # Backward pass
+ loss.backward()
+
+ # Update the weights
+ optimizer.step()
+
+ def save(self, path):
+ info_path = ".".join(os.path.basename(path).split(".")[:-1]) + "/.info"
+ torch.save(self.state_dict(), path)
+ data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
+ with ZipFile(path, "a") as model_zip:
+ model_zip.writestr(info_path, data_from_model.save())
+ model_zip.close()
+
+ @staticmethod
+ def load_from_checkpoint(path, map_location=None):
+ old = True
+ with ZipFile(path) as model_zip:
+ filesMatch = [file for file in model_zip.namelist() if file.endswith("/.info")]
+ file = filesMatch[0] if filesMatch else None
+ if file:
+ old = False
+ data_from_model = Data.load(model_zip.read(file).decode("utf-8"))
+ model_zip.close()
+ if old:
+ model = HubertTokenizer()
+ else:
+ model = HubertTokenizer(
+ data_from_model.hidden_size,
+ data_from_model.input_size,
+ data_from_model.output_size,
+ data_from_model.version,
+ )
+ model.load_state_dict(torch.load(path, map_location=map_location))
+ if map_location:
+ model = model.to(map_location)
+ return model
+
+
+class Data:
+ input_size: int
+ hidden_size: int
+ output_size: int
+ version: int
+
+ def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.output_size = output_size
+ self.version = version
+
+ @staticmethod
+ def load(string):
+ data = json.loads(string)
+ return Data(data["input_size"], data["hidden_size"], data["output_size"], data["version"])
+
+ def save(self):
+ data = {
+ "input_size": self.input_size,
+ "hidden_size": self.hidden_size,
+ "output_size": self.output_size,
+ "version": self.version,
+ }
+ return json.dumps(data)
+
+
+def auto_train(data_path, save_path="model.pth", load_model: str = None, save_epochs=1):
+ data_x, data_y = [], []
+
+ if load_model and os.path.isfile(load_model):
+ print("Loading model from", load_model)
+ model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
+ else:
+ print("Creating new model.")
+        model_training = HubertTokenizer(version=1).to("cuda")  # version=1 adds an intermediate linear layer after the LSTM
+ save_path = os.path.join(data_path, save_path)
+ base_save_path = ".".join(save_path.split(".")[:-1])
+
+ sem_string = "_semantic.npy"
+ feat_string = "_semantic_features.npy"
+
+ ready = os.path.join(data_path, "ready")
+ for input_file in os.listdir(ready):
+ full_path = os.path.join(ready, input_file)
+ if input_file.endswith(sem_string):
+ data_y.append(numpy.load(full_path))
+ elif input_file.endswith(feat_string):
+ data_x.append(numpy.load(full_path))
+ model_training.prepare_training()
+
+ epoch = 1
+
+    while True:
+ for _ in range(save_epochs):
+ j = 0
+ for x, y in zip(data_x, data_y):
+ model_training.train_step(
+ torch.tensor(x).to("cuda"), torch.tensor(y).to("cuda"), j % 50 == 0
+ ) # Print loss every 50 steps
+ j += 1
+ save_p = save_path
+ save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
+ model_training.save(save_p)
+ model_training.save(save_p_2)
+ print(f"Epoch {epoch} completed")
+ epoch += 1
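+
+
+# A hedged usage sketch of the trained tokenizer on its own (path and feature shape are
+# illustrative assumptions):
+#
+#     tokenizer = HubertTokenizer.load_from_checkpoint("models/hubert_tokenizer.pth", map_location="cpu")
+#     semantic_tokens = tokenizer.get_token(features)    # features: [N, input_size] HuBERT vectors -> [N] token ids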
diff --git a/submodules/TTS/TTS/tts/layers/bark/inference_funcs.py b/submodules/TTS/TTS/tts/layers/bark/inference_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3d3fee9371fae0cd06187c967a5b0028940138e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/inference_funcs.py
@@ -0,0 +1,606 @@
+import logging
+import os
+import re
+from glob import glob
+from typing import Dict, List
+
+import librosa
+import numpy as np
+import torch
+import torchaudio
+import tqdm
+from encodec.utils import convert_audio
+from scipy.special import softmax
+from torch.nn import functional as F
+
+from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
+from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
+from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
+from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
+
+logger = logging.getLogger(__name__)
+
+
+def _tokenize(tokenizer, text):
+ return tokenizer.encode(text, add_special_tokens=False)
+
+
+def _detokenize(tokenizer, enc_text):
+ return tokenizer.decode(enc_text)
+
+
+def _normalize_whitespace(text):
+ return re.sub(r"\s+", " ", text).strip()
+
+
+def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
+ dirs = extra_voice_dirs
+ voices: Dict[str, List[str]] = {}
+ for d in dirs:
+ subs = os.listdir(d)
+ for sub in subs:
+ subj = os.path.join(d, sub)
+ if os.path.isdir(subj):
+ voices[sub] = list(glob(f"{subj}/*.npz"))
+ # fetch audio files if no npz files are found
+ if len(voices[sub]) == 0:
+ voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
+ return voices
+
+
+def load_npz(npz_file):
+ x_history = np.load(npz_file)
+ semantic = x_history["semantic_prompt"]
+ coarse = x_history["coarse_prompt"]
+ fine = x_history["fine_prompt"]
+ return semantic, coarse, fine
+
+
+def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
+ if voice == "random":
+ return None, None, None
+
+    voices = get_voices(extra_voice_dirs)
+    try:
+        paths = voices[voice]
+    except KeyError as e:
+        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
+
+    # bark only uses a single sample for cloning
+    if len(paths) > 1:
+        raise ValueError(f"Voice {voice} has multiple paths: {paths}")
+
+    if len(paths) == 1 and paths[0].endswith(".npz"):
+        return load_npz(paths[0])
+
+ audio_path = paths[0]
+ # replace the file extension with .npz
+ output_path = os.path.splitext(audio_path)[0] + ".npz"
+ generate_voice(audio=audio_path, model=model, output_path=output_path)
+ return load_voice(model, voice, extra_voice_dirs)
+
+
+def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
+ zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
+ total_frames = 1 + int((len(audio) - frame_length) / hop_length)
+ return zero_crossings / total_frames
+
+
+def compute_spectral_contrast(audio_data, sample_rate, n_bands=6, fmin=200.0):
+ spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sample_rate, n_bands=n_bands, fmin=fmin)
+ return np.mean(spectral_contrast)
+
+
+def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
+ stft = librosa.stft(audio_data)
+ power_spectrogram = np.abs(stft) ** 2
+ frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=stft.shape[0])
+ bass_mask = frequencies <= max_bass_freq
+ bass_energy = power_spectrogram[np.ix_(bass_mask, np.arange(power_spectrogram.shape[1]))].mean()
+ return bass_energy
+
+
+def generate_voice(
+ audio,
+ model,
+ output_path,
+):
+ """Generate a new voice from a given audio and text prompt.
+
+ Args:
+ audio (np.ndarray): The audio to use as a base for the new voice.
+ text (str): Transcription of the audio you are clonning.
+ model (BarkModel): The BarkModel to use for generating the new voice.
+ output_path (str): The path to save the generated voice to.
+ """
+ if isinstance(audio, str):
+ audio, sr = torchaudio.load(audio)
+ audio = convert_audio(audio, sr, model.config.sample_rate, model.encodec.channels)
+ audio = audio.unsqueeze(0).to(model.device)
+
+ with torch.no_grad():
+ encoded_frames = model.encodec.encode(audio)
+ codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
+
+ # move codes to cpu
+ codes = codes.cpu().numpy()
+
+ # generate semantic tokens
+ # Load the HuBERT model
+ hubert_manager = HubertManager()
+ # hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
+ hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
+
+ hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
+
+ # Load the CustomTokenizer model
+ tokenizer = HubertTokenizer.load_from_checkpoint(
+ model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"], map_location=model.device
+ )
+ # semantic_tokens = model.text_to_semantic(
+ # text, max_gen_duration_s=seconds, top_k=50, top_p=0.95, temp=0.7
+ # ) # not 100%
+ semantic_vectors = hubert_model.forward(audio[0], input_sample_hz=model.config.sample_rate)
+ semantic_tokens = tokenizer.get_token(semantic_vectors)
+ semantic_tokens = semantic_tokens.cpu().numpy()
+
+ np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
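+    # the resulting .npz stores "semantic_prompt", "coarse_prompt" and "fine_prompt", matching the
+    # keys read back by load_npz()/load_voice() above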
+
+
+def generate_text_semantic(
+ text,
+ model,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ base=None,
+ use_kv_caching=True,
+ **kwargs, # pylint: disable=unused-argument
+):
+ """Generate semantic tokens from text.
+
+ Args:
+ text (str): The text to generate semantic tokens from.
+ model (BarkModel): The BarkModel to use for generating the semantic tokens.
+ history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
+ temp (float): The temperature to use for the generation.
+ top_k (int): The number of top tokens to consider for the generation.
+ top_p (float): The cumulative probability to consider for the generation.
+ silent (bool): Whether to silence the tqdm progress bar.
+ min_eos_p (float): The minimum probability to consider for the end of sentence token.
+ max_gen_duration_s (float): The maximum duration in seconds to generate for.
+ allow_early_stop (bool): Whether to allow the generation to stop early.
+ base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
+ use_kv_caching (bool): Whether to use key-value caching for the generation.
+ **kwargs: Additional keyword arguments. They are ignored.
+
+ Returns:
+ np.ndarray: The generated semantic tokens.
+ """
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ assert len(text.strip()) > 0
+ if all(v is not None for v in history_prompt) or base is not None:
+ if history_prompt is not None:
+ semantic_history = history_prompt[0]
+ if base is not None:
+ semantic_history = base[0]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+ encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET
+ if len(encoded_text) > 256:
+        p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+        logger.warning(f"text too long, lopping off the last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=model.config.TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=model.config.SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ else:
+ semantic_history = np.array([model.config.SEMANTIC_PAD_TOKEN] * 256)
+ x = torch.from_numpy(
+ np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
+ )[None]
+ assert x.shape[1] == 256 + 256 + 1
+ with inference_mode():
+ x = x.to(model.device)
+ n_tot_steps = 768
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=100)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+ for n in range(n_tot_steps):
+ if use_kv_caching and kv_cache is not None:
+ x_input = x[:, [-1]]
+ else:
+ x_input = x
+ logits, kv_cache = model.semantic_model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, : model.config.SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [model.config.SEMANTIC_PAD_TOKEN]])
+ ) # eos
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = torch.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=1)
+ if allow_early_stop and (
+ item_next == model.config.SEMANTIC_VOCAB_SIZE or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ # eos found, so break
+ pbar.update(100 - pbar_state)
+ break
+ x = torch.cat((x, item_next[None]), dim=1)
+ tot_generated_duration_s += 1 / model.config.SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.update(100 - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.update(100 - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
+ if req_pbar_state > pbar_state:
+ pbar.update(req_pbar_state - pbar_state)
+ pbar_state = req_pbar_state
+ pbar.close()
+ out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
+ clear_cuda_cache()
+ return out
+
+
+def _flatten_codebooks(arr, offset_size):
+ assert len(arr.shape) == 2
+ arr = arr.copy()
+ if offset_size is not None:
+ for n in range(1, arr.shape[0]):
+ arr[n, :] += offset_size * n
+ flat_arr = arr.ravel("F")
+ return flat_arr
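+
+
+# e.g. with offset_size=1024 a [2, T] coarse array becomes a length-2T vector laid out as
+# [cb0_t0, cb1_t0 + 1024, cb0_t1, cb1_t1 + 1024, ...] (frames interleaved by the Fortran-order ravel)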
+
+
+def generate_coarse(
+ x_semantic,
+ model,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ base=None,
+ use_kv_caching=True,
+):
+ """Generate coarse audio codes from semantic tokens.
+
+ Args:
+ x_semantic (np.ndarray): The semantic tokens to generate coarse audio codes from.
+ model (BarkModel): The BarkModel to use for generating the coarse audio codes.
+ history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
+ temp (float): The temperature to use for the generation.
+ top_k (int): The number of top tokens to consider for the generation.
+ top_p (float): The cumulative probability to consider for the generation.
+ silent (bool): Whether to silence the tqdm progress bar.
+ max_coarse_history (int): The maximum number of coarse audio codes to use as history.
+ sliding_window_len (int): The length of the sliding window to use for the generation.
+ base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
+ use_kv_caching (bool): Whether to use key-value caching for the generation.
+
+ Returns:
+ np.ndarray: The generated coarse audio codes.
+ """
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = (
+ model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
+ )
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if all(v is not None for v in history_prompt) or base is not None:
+ if history_prompt is not None:
+ x_history = history_prompt
+ x_semantic_history = x_history[0]
+ x_coarse_history = x_history[1]
+ if base is not None:
+ x_semantic_history = base[0]
+ x_coarse_history = base[1]
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == model.config.N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= model.config.CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS, 1)
+ )
+ )
+ x_coarse_history = (
+ _flatten_codebooks(x_coarse_history, model.config.CODEBOOK_SIZE) + model.config.SEMANTIC_VOCAB_SIZE
+ )
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ x_coarse_history = x_coarse_history[:-2]
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS)
+ * model.config.N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % model.config.N_COARSE_CODEBOOKS == 0
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ with inference_mode():
+ x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
+ x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
+ # pad from right side
+ x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in = x_in[:, :256]
+ x_in = F.pad(
+ x_in,
+ (0, 256 - x_in.shape[-1]),
+ "constant",
+ model.config.COARSE_SEMANTIC_PAD_TOKEN,
+ )
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([model.config.COARSE_INFER_TOKEN])[None].to(model.device),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % model.config.N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in[:, [-1]]
+ else:
+ x_input = x_in
+
+ logits, kv_cache = model.coarse_model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
+ logit_start_idx = (
+ model.config.SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * model.config.CODEBOOK_SIZE
+ )
+ logit_end_idx = model.config.SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * model.config.CODEBOOK_SIZE
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(torch.nn.functional.softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = torch.nn.functional.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=1)
+ item_next += logit_start_idx
+ x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
+ x_in = torch.cat((x_in, item_next[None]), dim=1)
+ del logits, relevant_logits, probs, item_next
+ n_step += 1
+ del x_in
+ del x_semantic_in
+ gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
+ del x_coarse_in
+ assert len(gen_coarse_arr) == n_steps
+ gen_coarse_audio_arr = (
+ gen_coarse_arr.reshape(-1, model.config.N_COARSE_CODEBOOKS).T - model.config.SEMANTIC_VOCAB_SIZE
+ )
+ for n in range(1, model.config.N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
+ clear_cuda_cache()
+ return gen_coarse_audio_arr
+
+
+def generate_fine(
+ x_coarse_gen,
+ model,
+ history_prompt=None,
+ temp=0.5,
+ silent=True,
+ base=None,
+):
+ """Generate full audio codes from coarse audio codes.
+
+ Args:
+ x_coarse_gen (np.ndarray): The coarse audio codes to generate full audio codes from.
+ model (BarkModel): The BarkModel to use for generating the full audio codes.
+ history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
+ temp (float): The temperature to use for the generation.
+ silent (bool): Whether to silence the tqdm progress bar.
+ base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
+
+ Returns:
+ np.ndarray: The generated full audio codes.
+ """
+ assert (
+ isinstance(x_coarse_gen, np.ndarray)
+ and len(x_coarse_gen.shape) == 2
+ and 1 <= x_coarse_gen.shape[0] <= model.config.N_FINE_CODEBOOKS - 1
+ and x_coarse_gen.shape[1] > 0
+ and x_coarse_gen.min() >= 0
+ and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
+ )
+ if all(v is not None for v in history_prompt) or base is not None:
+ if history_prompt is not None:
+ x_fine_history = history_prompt[2]
+ if base is not None:
+ x_fine_history = base[2]
+ assert (
+ isinstance(x_fine_history, np.ndarray)
+ and len(x_fine_history.shape) == 2
+ and x_fine_history.shape[0] == model.config.N_FINE_CODEBOOKS
+ and x_fine_history.shape[1] >= 0
+ and x_fine_history.min() >= 0
+ and x_fine_history.max() <= model.config.CODEBOOK_SIZE - 1
+ )
+ else:
+ x_fine_history = None
+ n_coarse = x_coarse_gen.shape[0]
+ # make input arr
+ in_arr = np.vstack(
+ [
+ x_coarse_gen,
+ np.zeros((model.config.N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ + model.config.CODEBOOK_SIZE, # padding
+ ]
+ ).astype(np.int32)
+ # prepend history if available (max 512)
+ if x_fine_history is not None:
+ x_fine_history = x_fine_history.astype(np.int32)
+ in_arr = np.hstack(
+ [
+ x_fine_history[:, -512:].astype(np.int32),
+ in_arr,
+ ]
+ )
+ n_history = x_fine_history[:, -512:].shape[1]
+ else:
+ n_history = 0
+ n_remove_from_end = 0
+ # need to pad if too short (since non-causal model)
+ if in_arr.shape[1] < 1024:
+ n_remove_from_end = 1024 - in_arr.shape[1]
+ in_arr = np.hstack(
+ [
+ in_arr,
+ np.zeros((model.config.N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32)
+ + model.config.CODEBOOK_SIZE,
+ ]
+ )
+ # we can be lazy about fractional loop and just keep overwriting codebooks
+ n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
+ with inference_mode():
+ in_arr = torch.tensor(in_arr.T).to(model.device)
+ for n in tqdm.tqdm(range(n_loops), disable=silent):
+ start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
+ start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
+ rel_start_fill_idx = start_fill_idx - start_idx
+ in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
+ for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
+ logits = model.fine_model(nn, in_buffer)
+ if temp is None:
+ relevant_logits = logits[0, rel_start_fill_idx:, : model.config.CODEBOOK_SIZE]
+ codebook_preds = torch.argmax(relevant_logits, -1)
+ else:
+ relevant_logits = logits[0, :, : model.config.CODEBOOK_SIZE] / temp
+ probs = F.softmax(relevant_logits, dim=-1)
+ codebook_preds = torch.hstack(
+ [torch.multinomial(probs[n], num_samples=1) for n in range(rel_start_fill_idx, 1024)]
+ )
+ in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
+ del logits, codebook_preds
+ # transfer over info into model_in and convert to numpy
+ for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
+ in_arr[start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn] = in_buffer[
+ 0, rel_start_fill_idx:, nn
+ ]
+ del in_buffer
+ gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
+ del in_arr
+ gen_fine_arr = gen_fine_arr[:, n_history:]
+ if n_remove_from_end > 0:
+ gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
+ assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
+ clear_cuda_cache()
+ return gen_fine_arr
+
+
+def codec_decode(fine_tokens, model):
+ """Turn quantized audio codes into audio array using encodec."""
+ arr = torch.from_numpy(fine_tokens)[None]
+ arr = arr.to(model.device)
+ arr = arr.transpose(0, 1)
+ emb = model.encodec.quantizer.decode(arr)
+ out = model.encodec.decoder(emb)
+ audio_arr = out.detach().cpu().numpy().squeeze()
+ return audio_arr
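+
+
+# A hedged end-to-end sketch of how these functions chain together ("speaker" and "voices/" are
+# hypothetical; `model` is the Bark model wrapper these helpers expect):
+#
+#     history = load_voice(model, "speaker", extra_voice_dirs=["voices/"])
+#     semantic = generate_text_semantic("Hello there.", model, history_prompt=history, temp=0.7)
+#     coarse = generate_coarse(semantic, model, history_prompt=history, temp=0.7)
+#     fine = generate_fine(coarse, model, history_prompt=history, temp=0.5)
+#     audio = codec_decode(fine, model)                  # decoded waveform as a numpy array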
diff --git a/submodules/TTS/TTS/tts/layers/bark/load_model.py b/submodules/TTS/TTS/tts/layers/bark/load_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce6b757f054ce98b91601b494854ef8e7b56b131
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/load_model.py
@@ -0,0 +1,160 @@
+import contextlib
+import functools
+import hashlib
+import logging
+import os
+
+import requests
+import torch
+import tqdm
+
+from TTS.tts.layers.bark.model import GPT, GPTConfig
+from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
+
+if (
+ torch.cuda.is_available()
+ and hasattr(torch.cuda, "amp")
+ and hasattr(torch.cuda.amp, "autocast")
+ and torch.cuda.is_bf16_supported()
+):
+ autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
+else:
+
+ @contextlib.contextmanager
+ def autocast():
+ yield
+
+
+# hold models in global scope to lazy load
+
+logger = logging.getLogger(__name__)
+
+
+if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
+    logger.warning(
+        "torch version does not support flash attention. You will get significantly faster"
+        + " inference speed by upgrading torch to the newest version / nightly."
+    )
+
+
+def _md5(fname):
+ hash_md5 = hashlib.md5()
+ with open(fname, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+
+def _download(from_s3_path, to_local_path, CACHE_DIR):
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ response = requests.get(from_s3_path, stream=True)
+ total_size_in_bytes = int(response.headers.get("content-length", 0))
+ block_size = 1024 # 1 Kibibyte
+ progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+ with open(to_local_path, "wb") as file:
+ for data in response.iter_content(block_size):
+ progress_bar.update(len(data))
+ file.write(data)
+ progress_bar.close()
+    if total_size_in_bytes not in [0, progress_bar.n]:
+        raise ValueError(f"Download failed: expected {total_size_in_bytes} bytes, received {progress_bar.n}")
+
+
+class InferenceContext:
+ def __init__(self, benchmark=False):
+ # we can't expect inputs to be the same length, so disable benchmarking by default
+ self._chosen_cudnn_benchmark = benchmark
+ self._cudnn_benchmark = None
+
+ def __enter__(self):
+ self._cudnn_benchmark = torch.backends.cudnn.benchmark
+ torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
+
+ def __exit__(self, exc_type, exc_value, exc_traceback):
+ torch.backends.cudnn.benchmark = self._cudnn_benchmark
+
+
+if torch.cuda.is_available():
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+
+@contextlib.contextmanager
+def inference_mode():
+ with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
+ yield
+
+
+def clear_cuda_cache():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+
+
+def load_model(ckpt_path, device, config, model_type="text"):
+ logger.info(f"loading {model_type} model from {ckpt_path}...")
+
+ if device == "cpu":
+ logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
+ if model_type == "text":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "coarse":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "fine":
+ ConfigClass = FineGPTConfig
+ ModelClass = FineGPT
+ else:
+ raise NotImplementedError()
+ if (
+ not config.USE_SMALLER_MODELS
+ and os.path.exists(ckpt_path)
+ and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"]
+ ):
+ logger.warning(f"found outdated {model_type} model, removing...")
+ os.remove(ckpt_path)
+ if not os.path.exists(ckpt_path):
+ logger.info(f"{model_type} model not found, downloading...")
+ _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)
+
+ checkpoint = torch.load(ckpt_path, map_location=device)
+ # this is a hack
+ model_args = checkpoint["model_args"]
+ if "input_vocab_size" not in model_args:
+ model_args["input_vocab_size"] = model_args["vocab_size"]
+ model_args["output_vocab_size"] = model_args["vocab_size"]
+ del model_args["vocab_size"]
+
+ gptconf = ConfigClass(**checkpoint["model_args"])
+ if model_type == "text":
+ config.semantic_config = gptconf
+ elif model_type == "coarse":
+ config.coarse_config = gptconf
+ elif model_type == "fine":
+ config.fine_config = gptconf
+
+ model = ModelClass(gptconf)
+ state_dict = checkpoint["model"]
+ # fixup checkpoint
+ unwanted_prefix = "_orig_mod."
+ for k, _ in list(state_dict.items()):
+ if k.startswith(unwanted_prefix):
+ state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+ extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
+ extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+ missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
+ if len(extra_keys) != 0:
+ raise ValueError(f"extra keys found: {extra_keys}")
+ if len(missing_keys) != 0:
+ raise ValueError(f"missing keys: {missing_keys}")
+ model.load_state_dict(state_dict, strict=False)
+ n_params = model.get_num_params()
+ val_loss = checkpoint["best_val_loss"].item()
+ logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
+ model.eval()
+ model.to(device)
+ del checkpoint, state_dict
+ clear_cuda_cache()
+ return model, config
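+
+
+# A hedged call sketch (the checkpoint path and `bark_config` are assumptions; the config object is
+# expected to expose USE_SMALLER_MODELS, REMOTE_MODEL_PATHS and CACHE_DIR as used above):
+#
+#     text_model, bark_config = load_model("cache/text_2.pt", "cuda", bark_config, model_type="text")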
diff --git a/submodules/TTS/TTS/tts/layers/bark/model.py b/submodules/TTS/TTS/tts/layers/bark/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c84022bd08bcdd2f3f9f3caadfc15a7bf80ddaf3
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/model.py
@@ -0,0 +1,233 @@
+"""
+Much of this code is adapted from Andrej Karpathy's NanoGPT
+(https://github.com/karpathy/nanoGPT)
+"""
+import math
+from dataclasses import dataclass
+
+import torch
+from coqpit import Coqpit
+from torch import nn
+from torch.nn import functional as F
+
+
+class LayerNorm(nn.Module):
+ """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
+
+ def __init__(self, ndim, bias):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(ndim))
+ self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+ def forward(self, x):
+ return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
+
+
+class CausalSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ assert config.n_embd % config.n_head == 0
+ # key, query, value projections for all heads, but in a batch
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+ # output projection
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+ # regularization
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.resid_dropout = nn.Dropout(config.dropout)
+ self.n_head = config.n_head
+ self.n_embd = config.n_embd
+ self.dropout = config.dropout
+        # flash attention makes the GPU go brrrrr, but support is only in PyTorch nightly and still a bit scary
+ self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+ if not self.flash:
+ # print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
+ # causal mask to ensure that attention is only applied to the left in the input sequence
+ self.register_buffer(
+ "bias",
+ torch.tril(torch.ones(config.block_size, config.block_size)).view(
+ 1, 1, config.block_size, config.block_size
+ ),
+ )
+
+ def forward(self, x, past_kv=None, use_cache=False):
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+ if past_kv is not None:
+ past_key = past_kv[0]
+ past_value = past_kv[1]
+ k = torch.cat((past_key, k), dim=-2)
+ v = torch.cat((past_value, v), dim=-2)
+
+ FULL_T = k.shape[-2]
+
+ if use_cache is True:
+ present = (k, v)
+ else:
+ present = None
+
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+ if self.flash:
+ # efficient attention using Flash Attention CUDA kernels
+ if past_kv is not None:
+ # When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
+ # the query for the last token. scaled_dot_product_attention interprets this as the first token in the
+ # sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
+ # to work around this we set is_causal=False.
+ is_causal = False
+ else:
+ is_causal = True
+
+ # efficient attention using Flash Attention CUDA kernels
+ y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
+ else:
+ # manual implementation of attention
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+ att = att.masked_fill(self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf"))
+ att = F.softmax(att, dim=-1)
+ att = self.attn_dropout(att)
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+ # output projection
+ y = self.resid_dropout(self.c_proj(y))
+ return (y, present)
+
+
+class MLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+ self.dropout = nn.Dropout(config.dropout)
+ self.gelu = nn.GELU()
+
+ def forward(self, x):
+ x = self.c_fc(x)
+ x = self.gelu(x)
+ x = self.c_proj(x)
+ x = self.dropout(x)
+ return x
+
+
+class Block(nn.Module):
+ def __init__(self, config, layer_idx):
+ super().__init__()
+ self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+ self.attn = CausalSelfAttention(config)
+ self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+ self.mlp = MLP(config)
+ self.layer_idx = layer_idx
+
+ def forward(self, x, past_kv=None, use_cache=False):
+ attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
+ x = x + attn_output
+ x = x + self.mlp(self.ln_2(x))
+ return (x, prev_kvs)
+
+
+@dataclass
+class GPTConfig(Coqpit):
+ block_size: int = 1024
+ input_vocab_size: int = 10_048
+ output_vocab_size: int = 10_048
+ n_layer: int = 12
+ n_head: int = 12
+ n_embd: int = 768
+ dropout: float = 0.0
+ bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+
+
+class GPT(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ assert config.input_vocab_size is not None
+ assert config.output_vocab_size is not None
+ assert config.block_size is not None
+ self.config = config
+
+ self.transformer = nn.ModuleDict(
+ dict(
+ wte=nn.Embedding(config.input_vocab_size, config.n_embd),
+ wpe=nn.Embedding(config.block_size, config.n_embd),
+ drop=nn.Dropout(config.dropout),
+ h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
+ ln_f=LayerNorm(config.n_embd, bias=config.bias),
+ )
+ )
+ self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
+
+ def get_num_params(self, non_embedding=True):
+ """
+ Return the number of parameters in the model.
+ For non-embedding count (default), the position embeddings get subtracted.
+ The token embeddings would too, except due to the parameter sharing these
+ params are actually used as weights in the final layer, so we include them.
+ """
+ n_params = sum(p.numel() for p in self.parameters())
+ if non_embedding:
+ n_params -= self.transformer.wte.weight.numel()
+ n_params -= self.transformer.wpe.weight.numel()
+ return n_params
+
+ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
+ device = idx.device
+ _, t = idx.size()
+ if past_kv is not None:
+ assert t == 1
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+ else:
+ if merge_context:
+ assert idx.shape[1] >= 256 + 256 + 1
+ t = idx.shape[1] - 256
+ else:
+ assert (
+ t <= self.config.block_size
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+
+ # forward the GPT model itself
+ if merge_context:
+ tok_emb = torch.cat(
+ [
+ self.transformer.wte(idx[:, :256]) + self.transformer.wte(idx[:, 256 : 256 + 256]),
+ self.transformer.wte(idx[:, 256 + 256 :]),
+ ],
+ dim=1,
+ )
+ else:
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+
+ if past_kv is None:
+ past_length = 0
+ past_kv = tuple([None] * len(self.transformer.h))
+ else:
+ past_length = past_kv[0][0].size(-2)
+
+ if position_ids is None:
+ position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
+ position_ids = position_ids.unsqueeze(0) # shape (1, t)
+ assert position_ids.shape == (1, t)
+
+ pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
+
+ x = self.transformer.drop(tok_emb + pos_emb)
+
+ new_kv = () if use_cache else None
+
+ for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
+ x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
+
+ if use_cache:
+ new_kv = new_kv + (kv,)
+
+ x = self.transformer.ln_f(x)
+
+ # inference-time mini-optimization: only forward the lm_head on the very last position
+ logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
+
+ return (logits, new_kv)
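+
+
+# A hedged sketch of the incremental decoding pattern the cache supports (`prompt` is a
+# hypothetical LongTensor of token ids; GPTConfig defaults are used purely for illustration):
+#
+#     model = GPT(GPTConfig())
+#     logits, kv = model(prompt, use_cache=True)                  # prime the cache with the full prompt
+#     next_token = logits[:, -1].argmax(-1, keepdim=True)
+#     logits, kv = model(next_token, past_kv=kv, use_cache=True)  # then feed one token at a time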
diff --git a/submodules/TTS/TTS/tts/layers/bark/model_fine.py b/submodules/TTS/TTS/tts/layers/bark/model_fine.py
new file mode 100644
index 0000000000000000000000000000000000000000..09e5f4765dce8743db2a3ed879e7811d2b9d23d6
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/bark/model_fine.py
@@ -0,0 +1,142 @@
+"""
+Much of this code is adapted from Andrej Karpathy's NanoGPT
+(https://github.com/karpathy/nanoGPT)
+"""
+import math
+from dataclasses import dataclass
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .model import GPT, MLP, GPTConfig
+
+
+class NonCausalSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ assert config.n_embd % config.n_head == 0
+ # key, query, value projections for all heads, but in a batch
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+ # output projection
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+ # regularization
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.resid_dropout = nn.Dropout(config.dropout)
+ self.n_head = config.n_head
+ self.n_embd = config.n_embd
+ self.dropout = config.dropout
+        # flash attention makes the GPU go brrrrr, but support is only in PyTorch nightly and still a bit scary
+ self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
+
+ def forward(self, x):
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+ if self.flash:
+ # efficient attention using Flash Attention CUDA kernels
+ y = torch.nn.functional.scaled_dot_product_attention(
+ q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
+ )
+ else:
+ # manual implementation of attention
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+ att = F.softmax(att, dim=-1)
+ att = self.attn_dropout(att)
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+ # output projection
+ y = self.resid_dropout(self.c_proj(y))
+ return y
+
+
+class FineBlock(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.ln_1 = nn.LayerNorm(config.n_embd)
+ self.attn = NonCausalSelfAttention(config)
+ self.ln_2 = nn.LayerNorm(config.n_embd)
+ self.mlp = MLP(config)
+
+ def forward(self, x):
+ x = x + self.attn(self.ln_1(x))
+ x = x + self.mlp(self.ln_2(x))
+ return x
+
+
+class FineGPT(GPT):
+ def __init__(self, config):
+ super().__init__(config)
+ del self.lm_head
+ self.config = config
+ self.n_codes_total = config.n_codes_total
+ self.transformer = nn.ModuleDict(
+ dict(
+ wtes=nn.ModuleList(
+ [nn.Embedding(config.input_vocab_size, config.n_embd) for _ in range(config.n_codes_total)]
+ ),
+ wpe=nn.Embedding(config.block_size, config.n_embd),
+ drop=nn.Dropout(config.dropout),
+ h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
+ ln_f=nn.LayerNorm(config.n_embd),
+ )
+ )
+ self.lm_heads = nn.ModuleList(
+ [
+ nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
+ for _ in range(config.n_codes_given, self.n_codes_total)
+ ]
+ )
+ for i in range(self.n_codes_total - config.n_codes_given):
+ self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
+
+ def forward(self, pred_idx, idx):
+ device = idx.device
+ b, t, codes = idx.size()
+ assert (
+ t <= self.config.block_size
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+ assert pred_idx > 0, "cannot predict 0th codebook"
+ assert codes == self.n_codes_total, (b, t, codes)
+ pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
+
+ # forward the GPT model itself
+ tok_embs = [
+ wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
+ ] # token embeddings of shape (b, t, n_embd)
+ tok_emb = torch.cat(tok_embs, dim=-1)
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
+ x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
+ x = self.transformer.drop(x + pos_emb)
+ for block in self.transformer.h:
+ x = block(x)
+ x = self.transformer.ln_f(x)
+ logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
+ return logits
+
+ def get_num_params(self, non_embedding=True):
+ """
+ Return the number of parameters in the model.
+ For non-embedding count (default), the position embeddings get subtracted.
+ The token embeddings would too, except due to the parameter sharing these
+ params are actually used as weights in the final layer, so we include them.
+ """
+ n_params = sum(p.numel() for p in self.parameters())
+ if non_embedding:
+ for wte in self.transformer.wtes:
+ n_params -= wte.weight.numel()
+ n_params -= self.transformer.wpe.weight.numel()
+ return n_params
+
+
+@dataclass
+class FineGPTConfig(GPTConfig):
+ n_codes_total: int = 8
+ n_codes_given: int = 1
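+
+
+# A hedged shape sketch (illustrative only): FineGPT predicts one codebook at a time, conditioned
+# on codebooks 0..pred_idx of the input:
+#
+#     config = FineGPTConfig(n_codes_total=8, n_codes_given=1)
+#     model = FineGPT(config)
+#     codes = torch.zeros(1, 1024, 8, dtype=torch.long)  # [B, T, n_codes_total]
+#     logits = model(pred_idx=2, idx=codes)              # [B, T, output_vocab_size] for codebook 2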
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/__init__.py b/submodules/TTS/TTS/tts/layers/delightful_tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/acoustic_model.py b/submodules/TTS/TTS/tts/layers/delightful_tts/acoustic_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c906b882e567fade64139a8b932c71d554117547
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/acoustic_model.py
@@ -0,0 +1,563 @@
+### credit: https://github.com/dunky11/voicesmith
+from typing import Callable, Dict, Tuple
+
+import torch
+import torch.nn.functional as F
+from coqpit import Coqpit
+from torch import nn
+
+from TTS.tts.layers.delightful_tts.conformer import Conformer
+from TTS.tts.layers.delightful_tts.encoders import (
+ PhonemeLevelProsodyEncoder,
+ UtteranceLevelProsodyEncoder,
+ get_mask_from_lengths,
+)
+from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor
+from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding
+from TTS.tts.layers.delightful_tts.phoneme_prosody_predictor import PhonemeProsodyPredictor
+from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
+from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
+from TTS.tts.layers.generic.aligner import AlignmentNetwork
+from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
+
+
+class AcousticModel(torch.nn.Module):
+ def __init__(
+ self,
+ args: "ModelArgs",
+ tokenizer: "TTSTokenizer" = None,
+ speaker_manager: "SpeakerManager" = None,
+ ):
+ super().__init__()
+ self.args = args
+ self.tokenizer = tokenizer
+ self.speaker_manager = speaker_manager
+
+ self.init_multispeaker(args)
+ # self.set_embedding_dims()
+
+ self.length_scale = (
+ float(self.args.length_scale) if isinstance(self.args.length_scale, int) else self.args.length_scale
+ )
+
+ self.emb_dim = args.n_hidden_conformer_encoder
+ self.encoder = Conformer(
+ dim=self.args.n_hidden_conformer_encoder,
+ n_layers=self.args.n_layers_conformer_encoder,
+ n_heads=self.args.n_heads_conformer_encoder,
+ speaker_embedding_dim=self.embedded_speaker_dim,
+ p_dropout=self.args.dropout_conformer_encoder,
+ kernel_size_conv_mod=self.args.kernel_size_conv_mod_conformer_encoder,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+ self.pitch_adaptor = PitchAdaptor(
+ n_input=self.args.n_hidden_conformer_encoder,
+ n_hidden=self.args.n_hidden_variance_adaptor,
+ n_out=1,
+ kernel_size=self.args.kernel_size_variance_adaptor,
+ emb_kernel_size=self.args.emb_kernel_size_variance_adaptor,
+ p_dropout=self.args.dropout_variance_adaptor,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+ self.energy_adaptor = EnergyAdaptor(
+ channels_in=self.args.n_hidden_conformer_encoder,
+ channels_hidden=self.args.n_hidden_variance_adaptor,
+ channels_out=1,
+ kernel_size=self.args.kernel_size_variance_adaptor,
+ emb_kernel_size=self.args.emb_kernel_size_variance_adaptor,
+ dropout=self.args.dropout_variance_adaptor,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+
+ self.aligner = AlignmentNetwork(
+ in_query_channels=self.args.out_channels,
+ in_key_channels=self.args.n_hidden_conformer_encoder,
+ )
+
+ self.duration_predictor = VariancePredictor(
+ channels_in=self.args.n_hidden_conformer_encoder,
+ channels=self.args.n_hidden_variance_adaptor,
+ channels_out=1,
+ kernel_size=self.args.kernel_size_variance_adaptor,
+ p_dropout=self.args.dropout_variance_adaptor,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+
+ self.utterance_prosody_encoder = UtteranceLevelProsodyEncoder(
+ num_mels=self.args.num_mels,
+ ref_enc_filters=self.args.ref_enc_filters_reference_encoder,
+ ref_enc_size=self.args.ref_enc_size_reference_encoder,
+ ref_enc_gru_size=self.args.ref_enc_gru_size_reference_encoder,
+ ref_enc_strides=self.args.ref_enc_strides_reference_encoder,
+ n_hidden=self.args.n_hidden_conformer_encoder,
+ dropout=self.args.dropout_conformer_encoder,
+ bottleneck_size_u=self.args.bottleneck_size_u_reference_encoder,
+ token_num=self.args.token_num_reference_encoder,
+ )
+
+ self.utterance_prosody_predictor = PhonemeProsodyPredictor(
+ hidden_size=self.args.n_hidden_conformer_encoder,
+ kernel_size=self.args.predictor_kernel_size_reference_encoder,
+ dropout=self.args.dropout_conformer_encoder,
+ bottleneck_size=self.args.bottleneck_size_u_reference_encoder,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+
+ self.phoneme_prosody_encoder = PhonemeLevelProsodyEncoder(
+ num_mels=self.args.num_mels,
+ ref_enc_filters=self.args.ref_enc_filters_reference_encoder,
+ ref_enc_size=self.args.ref_enc_size_reference_encoder,
+ ref_enc_gru_size=self.args.ref_enc_gru_size_reference_encoder,
+ ref_enc_strides=self.args.ref_enc_strides_reference_encoder,
+ n_hidden=self.args.n_hidden_conformer_encoder,
+ dropout=self.args.dropout_conformer_encoder,
+ bottleneck_size_p=self.args.bottleneck_size_p_reference_encoder,
+ n_heads=self.args.n_heads_conformer_encoder,
+ )
+
+ self.phoneme_prosody_predictor = PhonemeProsodyPredictor(
+ hidden_size=self.args.n_hidden_conformer_encoder,
+ kernel_size=self.args.predictor_kernel_size_reference_encoder,
+ dropout=self.args.dropout_conformer_encoder,
+ bottleneck_size=self.args.bottleneck_size_p_reference_encoder,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+
+ self.u_bottle_out = nn.Linear(
+ self.args.bottleneck_size_u_reference_encoder,
+ self.args.n_hidden_conformer_encoder,
+ )
+
+ self.u_norm = nn.InstanceNorm1d(self.args.bottleneck_size_u_reference_encoder)
+ self.p_bottle_out = nn.Linear(
+ self.args.bottleneck_size_p_reference_encoder,
+ self.args.n_hidden_conformer_encoder,
+ )
+ self.p_norm = nn.InstanceNorm1d(
+ self.args.bottleneck_size_p_reference_encoder,
+ )
+ self.decoder = Conformer(
+ dim=self.args.n_hidden_conformer_decoder,
+ n_layers=self.args.n_layers_conformer_decoder,
+ n_heads=self.args.n_heads_conformer_decoder,
+ speaker_embedding_dim=self.embedded_speaker_dim,
+ p_dropout=self.args.dropout_conformer_decoder,
+ kernel_size_conv_mod=self.args.kernel_size_conv_mod_conformer_decoder,
+ lrelu_slope=self.args.lrelu_slope,
+ )
+
+ padding_idx = self.tokenizer.characters.pad_id
+ self.src_word_emb = EmbeddingPadded(
+ self.args.num_chars, self.args.n_hidden_conformer_encoder, padding_idx=padding_idx
+ )
+ self.to_mel = nn.Linear(
+ self.args.n_hidden_conformer_decoder,
+ self.args.num_mels,
+ )
+
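+ # Non-trainable BatchNorm used only to track running energy statistics for normalization
+ # (momentum=None accumulates a cumulative moving average).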
+ self.energy_scaler = torch.nn.BatchNorm1d(1, affine=False, track_running_stats=True, momentum=None)
+ self.energy_scaler.requires_grad_(False)
+
+ def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument
+ """Init for multi-speaker training."""
+ self.embedded_speaker_dim = 0
+ self.num_speakers = self.args.num_speakers
+ self.audio_transform = None
+
+ if self.speaker_manager:
+ self.num_speakers = self.speaker_manager.num_speakers
+
+ if self.args.use_speaker_embedding:
+ self._init_speaker_embedding()
+
+ if self.args.use_d_vector_file:
+ self._init_d_vector()
+
+ @staticmethod
+ def _set_cond_input(aux_input: Dict):
+ """Set the speaker conditioning input based on the multi-speaker mode."""
+ sid, g, lid, durations = None, None, None, None
+ if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None:
+ sid = aux_input["speaker_ids"]
+ if sid.ndim == 0:
+ sid = sid.unsqueeze_(0)
+ if "d_vectors" in aux_input and aux_input["d_vectors"] is not None:
+ g = F.normalize(aux_input["d_vectors"]) # .unsqueeze_(-1)
+ if g.ndim == 2:
+ g = g # .unsqueeze_(0) # pylint: disable=self-assigning-variable
+
+ if "durations" in aux_input and aux_input["durations"] is not None:
+ durations = aux_input["durations"]
+
+ return sid, g, lid, durations
+
+ def get_aux_input(self, aux_input: Dict):
+ sid, g, lid, _ = self._set_cond_input(aux_input)
+ return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid}
+
+ def _set_speaker_input(self, aux_input: Dict):
+ d_vectors = aux_input.get("d_vectors", None)
+ speaker_ids = aux_input.get("speaker_ids", None)
+
+ if d_vectors is not None and speaker_ids is not None:
+ raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
+
+ if speaker_ids is not None and not hasattr(self, "emb_g"):
+ raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
+
+ g = speaker_ids if speaker_ids is not None else d_vectors
+ return g
+
+ # def set_embedding_dims(self):
+ # if self.embedded_speaker_dim > 0:
+ # self.embedding_dims = self.embedded_speaker_dim
+ # else:
+ # self.embedding_dims = 0
+
+ def _init_speaker_embedding(self):
+ # pylint: disable=attribute-defined-outside-init
+ if self.num_speakers > 0:
+ print(" > initialization of speaker-embedding layers.")
+ self.embedded_speaker_dim = self.args.speaker_embedding_channels
+ self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
+
+ def _init_d_vector(self):
+ # pylint: disable=attribute-defined-outside-init
+ if hasattr(self, "emb_g"):
+ raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
+ self.embedded_speaker_dim = self.args.d_vector_dim
+
+ @staticmethod
+ def generate_attn(dr, x_mask, y_mask=None):
+ """Generate an attention mask from the linear scale durations.
+
+ Args:
+ dr (Tensor): Linear scale durations.
+ x_mask (Tensor): Mask for the input (character) sequence.
+ y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations
+ if None. Defaults to None.
+
+ Shapes
+ - dr: :math:`(B, T_{en})`
+ - x_mask: :math:`(B, T_{en})`
+ - y_mask: :math:`(B, T_{de})`
+ """
+ # compute decode mask from the durations
+ if y_mask is None:
+ y_lengths = dr.sum(1).long()
+ y_lengths[y_lengths < 1] = 1
+ y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
+ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
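+ # generate_path expands the integer durations into a hard, monotonic alignment map
+ # between input tokens and output frames.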
+ attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
+ return attn
+
+ def _expand_encoder_with_durations(
+ self,
+ o_en: torch.FloatTensor,
+ dr: torch.IntTensor,
+ x_mask: torch.IntTensor,
+ y_lengths: torch.IntTensor,
+ ):
+ y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype)
+ attn = self.generate_attn(dr, x_mask, y_mask)
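+ # Expand encoder states over time with the durations: attn is [B, T_en, T_de] and
+ # o_en is [B, C, T_en], so the einsum gives [B, C, T_de].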
+ o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en])
+ return y_mask, o_en_ex, attn.transpose(1, 2)
+
+ def _forward_aligner(
+ self,
+ x: torch.FloatTensor,
+ y: torch.FloatTensor,
+ x_mask: torch.IntTensor,
+ y_mask: torch.IntTensor,
+ attn_priors: torch.FloatTensor,
+ ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+ """Aligner forward pass.
+
+ 1. Compute a mask to apply to the attention map.
+ 2. Run the alignment network.
+ 3. Apply MAS to compute the hard alignment map.
+ 4. Compute the durations from the hard alignment map.
+
+ Args:
+ x (torch.FloatTensor): Input sequence.
+ y (torch.FloatTensor): Output sequence.
+ x_mask (torch.IntTensor): Input sequence mask.
+ y_mask (torch.IntTensor): Output sequence mask.
+ attn_priors (torch.FloatTensor): Prior for the aligner network map.
+
+ Returns:
+ Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+ Durations from the hard alignment map, soft alignment potentials, log scale alignment potentials,
+ hard alignment map.
+
+ Shapes:
+ - x: :math:`[B, T_en, C_en]`
+ - y: :math:`[B, T_de, C_de]`
+ - x_mask: :math:`[B, 1, T_en]`
+ - y_mask: :math:`[B, 1, T_de]`
+ - attn_priors: :math:`[B, T_de, T_en]`
+
+ - aligner_durations: :math:`[B, T_en]`
+ - aligner_soft: :math:`[B, T_de, T_en]`
+ - aligner_logprob: :math:`[B, 1, T_de, T_en]`
+ - aligner_mas: :math:`[B, T_de, T_en]`
+ """
+ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # [B, 1, T_en, T_de]
+ aligner_soft, aligner_logprob = self.aligner(y.transpose(1, 2), x.transpose(1, 2), x_mask, attn_priors)
+ aligner_mas = maximum_path(
+ aligner_soft.squeeze(1).transpose(1, 2).contiguous(), attn_mask.squeeze(1).contiguous()
+ )
+ aligner_durations = torch.sum(aligner_mas, -1).int()
+ aligner_soft = aligner_soft.squeeze(1) # [B, T_max2, T_max]
+ aligner_mas = aligner_mas.transpose(1, 2) # [B, T_max, T_max2] -> [B, T_max2, T_max]
+ return aligner_durations, aligner_soft, aligner_logprob, aligner_mas
+
+ def average_utterance_prosody( # pylint: disable=no-self-use
+ self, u_prosody_pred: torch.Tensor, src_mask: torch.Tensor
+ ) -> torch.Tensor:
+ lengths = ((~src_mask) * 1.0).sum(1)
+ u_prosody_pred = u_prosody_pred.sum(1, keepdim=True) / lengths.view(-1, 1, 1)
+ return u_prosody_pred
+
+ def forward(
+ self,
+ tokens: torch.Tensor,
+ src_lens: torch.Tensor,
+ mels: torch.Tensor,
+ mel_lens: torch.Tensor,
+ pitches: torch.Tensor,
+ energies: torch.Tensor,
+ attn_priors: torch.Tensor,
+ use_ground_truth: bool = True,
+ d_vectors: torch.Tensor = None,
+ speaker_idx: torch.Tensor = None,
+ ) -> Dict[str, torch.Tensor]:
+ sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
+ {"d_vectors": d_vectors, "speaker_ids": speaker_idx}
+ ) # pylint: disable=unused-variable
+
+ src_mask = get_mask_from_lengths(src_lens) # [B, T_src]
+ mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel]
+
+ # Token embeddings
+ token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden]
+ token_embeddings = token_embeddings.masked_fill(src_mask.unsqueeze(-1), 0.0)
+
+ # Alignment network and durations
+ aligner_durations, aligner_soft, aligner_logprob, aligner_mas = self._forward_aligner(
+ x=token_embeddings,
+ y=mels.transpose(1, 2),
+ x_mask=~src_mask[:, None],
+ y_mask=~mel_mask[:, None],
+ attn_priors=attn_priors,
+ )
+ dr = aligner_durations # [B, T_en]
+
+ # Embeddings
+ speaker_embedding = None
+ if d_vectors is not None:
+ speaker_embedding = g
+ elif speaker_idx is not None:
+ speaker_embedding = F.normalize(self.emb_g(sid))
+
+ pos_encoding = positional_encoding(
+ self.emb_dim,
+ max(token_embeddings.shape[1], max(mel_lens)),
+ device=token_embeddings.device,
+ )
+ encoder_outputs = self.encoder(
+ token_embeddings,
+ src_mask,
+ speaker_embedding=speaker_embedding,
+ encoding=pos_encoding,
+ )
+
+ u_prosody_ref = self.u_norm(self.utterance_prosody_encoder(mels=mels, mel_lens=mel_lens))
+ u_prosody_pred = self.u_norm(
+ self.average_utterance_prosody(
+ u_prosody_pred=self.utterance_prosody_predictor(x=encoder_outputs, mask=src_mask),
+ src_mask=src_mask,
+ )
+ )
+
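+ # In training, condition on the reference (ground-truth mel) prosody embeddings when
+ # `use_ground_truth` is set; otherwise use the predicted embeddings, as done at inference.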
+ if use_ground_truth:
+ encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_ref)
+ else:
+ encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_pred)
+
+ p_prosody_ref = self.p_norm(
+ self.phoneme_prosody_encoder(
+ x=encoder_outputs, src_mask=src_mask, mels=mels, mel_lens=mel_lens, encoding=pos_encoding
+ )
+ )
+ p_prosody_pred = self.p_norm(self.phoneme_prosody_predictor(x=encoder_outputs, mask=src_mask))
+
+ if use_ground_truth:
+ encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_ref)
+ else:
+ encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_pred)
+
+ encoder_outputs_res = encoder_outputs
+
+ pitch_pred, avg_pitch_target, pitch_emb = self.pitch_adaptor.get_pitch_embedding_train(
+ x=encoder_outputs,
+ target=pitches,
+ dr=dr,
+ mask=src_mask,
+ )
+
+ energy_pred, avg_energy_target, energy_emb = self.energy_adaptor.get_energy_embedding_train(
+ x=encoder_outputs,
+ target=energies,
+ dr=dr,
+ mask=src_mask,
+ )
+
+ encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb
+ log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask)
+
+ mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
+ o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None]
+ )
+
+ x = self.decoder(
+ encoder_outputs_ex.transpose(1, 2),
+ mel_mask,
+ speaker_embedding=speaker_embedding,
+ encoding=pos_encoding,
+ )
+ x = self.to_mel(x)
+
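+ # Duration loss is computed in the log domain: compare log(dr + 1) against the predictor output.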
+ dr = torch.log(dr + 1)
+
+ dr_pred = torch.exp(log_duration_prediction) - 1
+ alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2']
+
+ return {
+ "model_outputs": x,
+ "pitch_pred": pitch_pred,
+ "pitch_target": avg_pitch_target,
+ "energy_pred": energy_pred,
+ "energy_target": avg_energy_target,
+ "u_prosody_pred": u_prosody_pred,
+ "u_prosody_ref": u_prosody_ref,
+ "p_prosody_pred": p_prosody_pred,
+ "p_prosody_ref": p_prosody_ref,
+ "alignments_dp": alignments_dp,
+ "alignments": alignments, # [B, T_de, T_en]
+ "aligner_soft": aligner_soft,
+ "aligner_mas": aligner_mas,
+ "aligner_durations": aligner_durations,
+ "aligner_logprob": aligner_logprob,
+ "dr_log_pred": log_duration_prediction.squeeze(1), # [B, T]
+ "dr_log_target": dr.squeeze(1), # [B, T]
+ "spk_emb": speaker_embedding,
+ }
+
+ @torch.no_grad()
+ def inference(
+ self,
+ tokens: torch.Tensor,
+ speaker_idx: torch.Tensor,
+ p_control: float = None, # TODO # pylint: disable=unused-argument
+ d_control: float = None, # TODO # pylint: disable=unused-argument
+ d_vectors: torch.Tensor = None,
+ pitch_transform: Callable = None,
+ energy_transform: Callable = None,
+ ) -> torch.Tensor:
+ src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device))
+ src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable
+ sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
+ {"d_vectors": d_vectors, "speaker_ids": speaker_idx}
+ ) # pylint: disable=unused-variable
+
+ token_embeddings = self.src_word_emb(tokens)
+ token_embeddings = token_embeddings.masked_fill(src_mask.unsqueeze(-1), 0.0)
+
+ # Embeddings
+ speaker_embedding = None
+ if d_vectors is not None:
+ speaker_embedding = g
+ elif speaker_idx is not None:
+ speaker_embedding = F.normalize(self.emb_g(sid))
+
+ pos_encoding = positional_encoding(
+ self.emb_dim,
+ token_embeddings.shape[1],
+ device=token_embeddings.device,
+ )
+ encoder_outputs = self.encoder(
+ token_embeddings,
+ src_mask,
+ speaker_embedding=speaker_embedding,
+ encoding=pos_encoding,
+ )
+
+ u_prosody_pred = self.u_norm(
+ self.average_utterance_prosody(
+ u_prosody_pred=self.utterance_prosody_predictor(x=encoder_outputs, mask=src_mask),
+ src_mask=src_mask,
+ )
+ )
+ encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_pred).expand_as(encoder_outputs)
+
+ p_prosody_pred = self.p_norm(
+ self.phoneme_prosody_predictor(
+ x=encoder_outputs,
+ mask=src_mask,
+ )
+ )
+ encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_pred).expand_as(encoder_outputs)
+
+ encoder_outputs_res = encoder_outputs
+
+ pitch_emb_pred, pitch_pred = self.pitch_adaptor.get_pitch_embedding(
+ x=encoder_outputs,
+ mask=src_mask,
+ pitch_transform=pitch_transform,
+ pitch_mean=self.pitch_mean if hasattr(self, "pitch_mean") else None,
+ pitch_std=self.pitch_std if hasattr(self, "pitch_std") else None,
+ )
+
+ energy_emb_pred, energy_pred = self.energy_adaptor.get_energy_embedding(
+ x=encoder_outputs, mask=src_mask, energy_transform=energy_transform
+ )
+ encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb_pred + energy_emb_pred
+
+ log_duration_pred = self.duration_predictor(
+ x=encoder_outputs_res.detach(), mask=src_mask
+ ) # [B, C_hidden, T_src] -> [B, T_src]
+ duration_pred = (torch.exp(log_duration_pred) - 1) * (~src_mask) * self.length_scale # -> [B, T_src]
+ duration_pred[duration_pred < 1] = 1.0 # -> [B, T_src]
+ duration_pred = torch.round(duration_pred) # -> [B, T_src]
+ mel_lens = duration_pred.sum(1) # -> [B,]
+
+ _, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
+ o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, None]
+ )
+
+ mel_mask = get_mask_from_lengths(
+ torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device)
+ )
+
+ # Recompute the positional encoding if the duration-expanded sequence is longer than the
+ # precomputed one; otherwise `encoding` would be undefined below.
+ encoding = pos_encoding
+ if encoder_outputs_ex.shape[2] > pos_encoding.shape[1]:
+ encoding = positional_encoding(self.emb_dim, encoder_outputs_ex.shape[2], device=tokens.device)
+
+ # [B, C_hidden, T_src], [B, 1, T_src], [B, C_emb], [B, T_src, C_hidden] -> [B, C_hidden, T_src]
+ x = self.decoder(
+ encoder_outputs_ex.transpose(1, 2),
+ mel_mask,
+ speaker_embedding=speaker_embedding,
+ encoding=encoding,
+ )
+ x = self.to_mel(x)
+ outputs = {
+ "model_outputs": x,
+ "alignments": alignments,
+ # "pitch": pitch_emb_pred,
+ "durations": duration_pred,
+ "pitch": pitch_pred,
+ "energy": energy_pred,
+ "spk_emb": speaker_embedding,
+ }
+ return outputs
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/conformer.py b/submodules/TTS/TTS/tts/layers/delightful_tts/conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2175b3b965c6b100846e87d405a753dc272c9e7
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/conformer.py
@@ -0,0 +1,450 @@
+### credit: https://github.com/dunky11/voicesmith
+import math
+from typing import Tuple
+
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+import torch.nn.functional as F
+
+from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d
+from TTS.tts.layers.delightful_tts.networks import GLUActivation
+
+
+def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
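+ # Padding (left, right) so that a stride-1 conv with this kernel size keeps the sequence length.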
+ pad = kernel_size // 2
+ return (pad, pad - (kernel_size + 1) % 2)
+
+
+class Conformer(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ n_layers: int,
+ n_heads: int,
+ speaker_embedding_dim: int,
+ p_dropout: float,
+ kernel_size_conv_mod: int,
+ lrelu_slope: float,
+ ):
+ """
+ A Transformer variant that integrates both CNN and Transformer components.
+ Conformer proposes a novel combination of self-attention and convolution, in which self-attention
+ learns the global interactions while the convolutions efficiently capture the local correlations.
+
+ Args:
+ dim (int): Number of the dimensions for the model.
+ n_layers (int): Number of model layers.
+ n_heads (int): The number of attention heads.
+ speaker_embedding_dim (int): Number of speaker embedding dimensions.
+ p_dropout (float): Probability of dropout.
+ kernel_size_conv_mod (int): Size of kernels for convolution modules.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, time, dim): Tensor containing input vector
+ - **encoding** (batch, time, dim): Positional embedding tensor
+ - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
+ Returns:
+ - **outputs** (batch, time, dim): Tensor produced by Conformer Encoder.
+ """
+ super().__init__()
+ d_k = d_v = dim // n_heads
+ self.layer_stack = nn.ModuleList(
+ [
+ ConformerBlock(
+ dim,
+ n_heads,
+ d_k,
+ d_v,
+ kernel_size_conv_mod=kernel_size_conv_mod,
+ dropout=p_dropout,
+ speaker_embedding_dim=speaker_embedding_dim,
+ lrelu_slope=lrelu_slope,
+ )
+ for _ in range(n_layers)
+ ]
+ )
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ mask: torch.Tensor,
+ speaker_embedding: torch.Tensor,
+ encoding: torch.Tensor,
+ ) -> torch.Tensor:
+ """
+ Shapes:
+ - x: :math:`[B, T_src, C]`
+ - mask: :math: `[B]`
+ - speaker_embedding: :math: `[B, C]`
+ - encoding: :math: `[B, T_max2, C]`
+ """
+
+ attn_mask = mask.view((mask.shape[0], 1, 1, mask.shape[1]))
+ for enc_layer in self.layer_stack:
+ x = enc_layer(
+ x,
+ mask=mask,
+ slf_attn_mask=attn_mask,
+ speaker_embedding=speaker_embedding,
+ encoding=encoding,
+ )
+ return x
+
+
+class ConformerBlock(torch.nn.Module):
+ def __init__(
+ self,
+ d_model: int,
+ n_head: int,
+ d_k: int, # pylint: disable=unused-argument
+ d_v: int, # pylint: disable=unused-argument
+ kernel_size_conv_mod: int,
+ speaker_embedding_dim: int,
+ dropout: float,
+ lrelu_slope: float = 0.3,
+ ):
+ """
+ A Conformer block is composed of four modules stacked together,
+ A feed-forward module, a self-attention module, a convolution module,
+ and a second feed-forward module in the end. The block starts with two Feed forward
+ modules sandwiching the Multi-Headed Self-Attention module and the Conv module.
+
+ Args:
+ d_model (int): The dimension of model
+ n_head (int): The number of attention heads.
+ kernel_size_conv_mod (int): Size of kernels for convolution modules.
+ speaker_embedding_dim (int): Number of speaker embedding dimensions.
+ emotion_embedding_dim (int): Number of emotion embedding dimensions.
+ dropout (float): Probabilty of dropout.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, time, dim): Tensor containing input vector
+ - **encoding** (batch, time, dim): Positional embedding tensor
+ - **slf_attn_mask** (batch, 1, 1, time1): Tensor containing indices to be masked in self attention module
+ - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
+ Returns:
+ - **outputs** (batch, time, dim): Tensor produced by the Conformer Block.
+ """
+ super().__init__()
+ if isinstance(speaker_embedding_dim, int):
+ self.conditioning = Conv1dGLU(
+ d_model=d_model,
+ kernel_size=kernel_size_conv_mod,
+ padding=kernel_size_conv_mod // 2,
+ embedding_dim=speaker_embedding_dim,
+ )
+
+ self.ff = FeedForward(d_model=d_model, dropout=dropout, kernel_size=3, lrelu_slope=lrelu_slope)
+ self.conformer_conv_1 = ConformerConvModule(
+ d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope
+ )
+ self.ln = nn.LayerNorm(d_model)
+ self.slf_attn = ConformerMultiHeadedSelfAttention(d_model=d_model, num_heads=n_head, dropout_p=dropout)
+ self.conformer_conv_2 = ConformerConvModule(
+ d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope
+ )
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ speaker_embedding: torch.Tensor,
+ mask: torch.Tensor,
+ slf_attn_mask: torch.Tensor,
+ encoding: torch.Tensor,
+ ) -> torch.Tensor:
+ """
+ Shapes:
+ - x: :math:`[B, T_src, C]`
+ - mask: :math:`[B, T_src]`
+ - slf_attn_mask: :math:`[B, 1, 1, T_src]`
+ - speaker_embedding: :math:`[B, C]`
+ - encoding: :math:`[B, T_max2, C]`
+ """
+ if speaker_embedding is not None:
+ x = self.conditioning(x, embeddings=speaker_embedding)
+ x = self.ff(x) + x
+ x = self.conformer_conv_1(x) + x
+ res = x
+ x = self.ln(x)
+ x, _ = self.slf_attn(query=x, key=x, value=x, mask=slf_attn_mask, encoding=encoding)
+ x = x + res
+ x = x.masked_fill(mask.unsqueeze(-1), 0)
+
+ x = self.conformer_conv_2(x) + x
+ return x
+
+
+class FeedForward(nn.Module):
+ def __init__(
+ self,
+ d_model: int,
+ kernel_size: int,
+ dropout: float,
+ lrelu_slope: float,
+ expansion_factor: int = 4,
+ ):
+ """
+ Feed Forward module for conformer block.
+
+ Args:
+ d_model (int): The dimension of model.
+ kernel_size (int): Size of the kernels for conv layers.
+ dropout (float): probability of dropout.
+ expansion_factor (int): The factor by which to project the number of channels.
+ lrelu_slope (float): The negative slope factor for the leaky ReLU activation.
+
+ Inputs: inputs
+ - **inputs** (batch, time, dim): Tensor containing input vector
+ Returns:
+ - **outputs** (batch, time, dim): Tensor produced by the feed forward module.
+ """
+ super().__init__()
+ self.dropout = nn.Dropout(dropout)
+ self.ln = nn.LayerNorm(d_model)
+ self.conv_1 = nn.Conv1d(
+ d_model,
+ d_model * expansion_factor,
+ kernel_size=kernel_size,
+ padding=kernel_size // 2,
+ )
+ self.act = nn.LeakyReLU(lrelu_slope)
+ self.conv_2 = nn.Conv1d(d_model * expansion_factor, d_model, kernel_size=1)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Shapes:
+ x: :math: `[B, T, C]`
+ """
+ x = self.ln(x)
+ x = x.permute((0, 2, 1))
+ x = self.conv_1(x)
+ x = x.permute((0, 2, 1))
+ x = self.act(x)
+ x = self.dropout(x)
+ x = x.permute((0, 2, 1))
+ x = self.conv_2(x)
+ x = x.permute((0, 2, 1))
+ x = self.dropout(x)
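+ # Macaron-style half-step feed-forward: the module's output is scaled by 0.5 before the residual add.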
+ x = 0.5 * x
+ return x
+
+
+class ConformerConvModule(nn.Module):
+ def __init__(
+ self,
+ d_model: int,
+ expansion_factor: int = 2,
+ kernel_size: int = 7,
+ dropout: float = 0.1,
+ lrelu_slope: float = 0.3,
+ ):
+ """
+ Convolution module for the conformer block. It starts with a gating mechanism:
+ a pointwise convolution followed by a gated linear unit (GLU). This is followed
+ by a single 1-D depthwise convolution layer, with a normalization layer deployed just after
+ the convolution to help with training. It also contains an expansion factor to project the number of channels.
+
+ Args:
+ d_model (int): The dimension of model.
+ expansion_factor (int): The factor by which to project the number of channels.
+ kernel_size (int): Size of kernels for convolution modules.
+ dropout (float): Probability of dropout.
+ lrelu_slope (float): The slope coefficient for leaky relu activation.
+
+ Inputs: inputs
+ - **inputs** (batch, time, dim): Tensor containing input vector
+ Returns:
+ - **outputs** (batch, time, dim): Tensor produced by the conv module.
+
+ """
+ super().__init__()
+ inner_dim = d_model * expansion_factor
+ self.ln_1 = nn.LayerNorm(d_model)
+ self.conv_1 = PointwiseConv1d(d_model, inner_dim * 2)
+ self.conv_act = GLUActivation(slope=lrelu_slope)
+ self.depthwise = DepthWiseConv1d(
+ inner_dim,
+ inner_dim,
+ kernel_size=kernel_size,
+ padding=calc_same_padding(kernel_size)[0],
+ )
+ self.ln_2 = nn.GroupNorm(1, inner_dim)
+ self.activation = nn.LeakyReLU(lrelu_slope)
+ self.conv_2 = PointwiseConv1d(inner_dim, d_model)
+ self.dropout = nn.Dropout(dropout)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Shapes:
+ x: :math: `[B, T, C]`
+ """
+ x = self.ln_1(x)
+ x = x.permute(0, 2, 1)
+ x = self.conv_1(x)
+ x = self.conv_act(x)
+ x = self.depthwise(x)
+ x = self.ln_2(x)
+ x = self.activation(x)
+ x = self.conv_2(x)
+ x = x.permute(0, 2, 1)
+ x = self.dropout(x)
+ return x
+
+
+class ConformerMultiHeadedSelfAttention(nn.Module):
+ """
+ Conformer employs multi-headed self-attention (MHSA) while integrating an important technique from Transformer-XL,
+ the relative sinusoidal positional encoding scheme. The relative positional encoding allows the self-attention
+ module to generalize better to different input lengths, and the resulting encoder is more robust to the variance of
+ the utterance length. Conformer uses pre-norm residual units with dropout, which helps training
+ and regularizing deeper models.
+ Args:
+ d_model (int): The dimension of model
+ num_heads (int): The number of attention heads.
+ dropout_p (float): probability of dropout
+ Inputs: inputs, mask
+ - **inputs** (batch, time, dim): Tensor containing input vector
+ - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
+ Returns:
+ - **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
+ """
+
+ def __init__(self, d_model: int, num_heads: int, dropout_p: float):
+ super().__init__()
+ self.attention = RelativeMultiHeadAttention(d_model=d_model, num_heads=num_heads)
+ self.dropout = nn.Dropout(p=dropout_p)
+
+ def forward(
+ self,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ mask: torch.Tensor,
+ encoding: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable
+ encoding = encoding[:, : key.shape[1]]
+ encoding = encoding.repeat(batch_size, 1, 1)
+ outputs, attn = self.attention(query, key, value, pos_embedding=encoding, mask=mask)
+ outputs = self.dropout(outputs)
+ return outputs, attn
+
+
+class RelativeMultiHeadAttention(nn.Module):
+ """
+ Multi-head attention with relative positional encoding.
+ This concept was proposed in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context".
+ Args:
+ d_model (int): The dimension of model
+ num_heads (int): The number of attention heads.
+ Inputs: query, key, value, pos_embedding, mask
+ - **query** (batch, time, dim): Tensor containing query vector
+ - **key** (batch, time, dim): Tensor containing key vector
+ - **value** (batch, time, dim): Tensor containing value vector
+ - **pos_embedding** (batch, time, dim): Positional embedding tensor
+ - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
+ Returns:
+ - **outputs**: Tensor produced by the relative multi-head attention module.
+ """
+
+ def __init__(
+ self,
+ d_model: int = 512,
+ num_heads: int = 16,
+ ):
+ super().__init__()
+ assert d_model % num_heads == 0, "d_model % num_heads should be zero."
+ self.d_model = d_model
+ self.d_head = int(d_model / num_heads)
+ self.num_heads = num_heads
+ self.sqrt_dim = math.sqrt(d_model)
+
+ self.query_proj = nn.Linear(d_model, d_model)
+ self.key_proj = nn.Linear(d_model, d_model, bias=False)
+ self.value_proj = nn.Linear(d_model, d_model, bias=False)
+ self.pos_proj = nn.Linear(d_model, d_model, bias=False)
+
+ self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
+ self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
+ torch.nn.init.xavier_uniform_(self.u_bias)
+ torch.nn.init.xavier_uniform_(self.v_bias)
+ self.out_proj = nn.Linear(d_model, d_model)
+
+ def forward(
+ self,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ pos_embedding: torch.Tensor,
+ mask: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ batch_size = query.shape[0]
+ query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
+ key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
+ value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
+ pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)
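+ # Transformer-XL attention: u_bias and v_bias are the global content and position biases;
+ # content_score = (Q + u) K^T and pos_score = (Q + v) R^T, where R is the relative positional encoding.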
+ u_bias = self.u_bias.expand_as(query)
+ v_bias = self.v_bias.expand_as(query)
+ a = (query + u_bias).transpose(1, 2)
+ content_score = a @ key.transpose(2, 3)
+ b = (query + v_bias).transpose(1, 2)
+ pos_score = b @ pos_embedding.permute(0, 2, 3, 1)
+ pos_score = self._relative_shift(pos_score)
+
+ score = content_score + pos_score
+ score = score * (1.0 / self.sqrt_dim)
+
+ score.masked_fill_(mask, -1e9)
+
+ attn = F.softmax(score, -1)
+
+ context = (attn @ value).transpose(1, 2)
+ context = context.contiguous().view(batch_size, -1, self.d_model)
+
+ return self.out_proj(context), attn
+
+ def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor: # pylint: disable=no-self-use
+ batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
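+ # Transformer-XL relative-shift trick: pad a zero column, reshape so rows are offset
+ # by one position, then slice to align scores with relative distances.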
+ zeros = torch.zeros((batch_size, num_heads, seq_length1, 1), device=pos_score.device)
+ padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
+ padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
+ pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
+ return pos_score
+
+
+class MultiHeadAttention(nn.Module):
+ """
+ input:
+ query --- [N, T_q, query_dim]
+ key --- [N, T_k, key_dim]
+ output:
+ out --- [N, T_q, num_units]
+ """
+
+ def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
+ super().__init__()
+ self.num_units = num_units
+ self.num_heads = num_heads
+ self.key_dim = key_dim
+
+ self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
+ self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+ self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+
+ def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
+ querys = self.W_query(query) # [N, T_q, num_units]
+ keys = self.W_key(key) # [N, T_k, num_units]
+ values = self.W_value(key)
+ split_size = self.num_units // self.num_heads
+ querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
+ keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
+ values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
+ # score = softmax(QK^T / (d_k ** 0.5))
+ scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
+ scores = scores / (self.key_dim**0.5)
+ scores = F.softmax(scores, dim=3)
+ # out = score * V
+ out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
+ out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
+ return out
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/conv_layers.py b/submodules/TTS/TTS/tts/layers/delightful_tts/conv_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb9aa4495fd9b25fb88ce0dd1493cfa1f2c47d5c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/conv_layers.py
@@ -0,0 +1,671 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+import torch.nn.functional as F
+from torch.nn.utils import parametrize
+
+from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor
+
+
+def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
+ pad = kernel_size // 2
+ return (pad, pad - (kernel_size + 1) % 2)
+
+
+class ConvNorm(nn.Module):
+ """A 1-dimensional convolutional layer with optional weight normalization.
+
+ This layer wraps a 1D convolutional layer from PyTorch and applies
+ optional weight normalization. The layer can be used in a similar way to
+ the convolutional layers in PyTorch's `torch.nn` module.
+
+ Args:
+ in_channels (int): The number of channels in the input signal.
+ out_channels (int): The number of channels in the output signal.
+ kernel_size (int, optional): The size of the convolving kernel.
+ Defaults to 1.
+ stride (int, optional): The stride of the convolution. Defaults to 1.
+ padding (int, optional): Zero-padding added to both sides of the input.
+ If `None`, the padding will be calculated so that the output has
+ the same length as the input. Defaults to `None`.
+ dilation (int, optional): Spacing between kernel elements. Defaults to 1.
+ bias (bool, optional): If `True`, add bias after convolution. Defaults to `True`.
+ w_init_gain (str, optional): The weight initialization function to use.
+ Can be either 'linear' or 'relu'. Defaults to 'linear'.
+ use_weight_norm (bool, optional): If `True`, apply weight normalization
+ to the convolutional weights. Defaults to `False`.
+
+ Shapes:
+ - Input: :math:`[N, D, T]`
+
+ - Output: :math:`[N, out_dim, T]` where `out_dim` is the number of output dimensions.
+
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=None,
+ dilation=1,
+ bias=True,
+ w_init_gain="linear",
+ use_weight_norm=False,
+ ):
+ super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments
+ if padding is None:
+ assert kernel_size % 2 == 1
+ padding = int(dilation * (kernel_size - 1) / 2)
+ self.kernel_size = kernel_size
+ self.dilation = dilation
+ self.use_weight_norm = use_weight_norm
+ conv_fn = nn.Conv1d
+ self.conv = conv_fn(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias,
+ )
+ nn.init.xavier_uniform_(self.conv.weight, gain=nn.init.calculate_gain(w_init_gain))
+ if self.use_weight_norm:
+ self.conv = nn.utils.parametrizations.weight_norm(self.conv)
+
+ def forward(self, signal, mask=None):
+ conv_signal = self.conv(signal)
+ if mask is not None:
+ # always re-zero output if mask is
+ # available to match zero-padding
+ conv_signal = conv_signal * mask
+ return conv_signal
+
+
+class ConvLSTMLinear(nn.Module):
+ def __init__(
+ self,
+ in_dim,
+ out_dim,
+ n_layers=2,
+ n_channels=256,
+ kernel_size=3,
+ p_dropout=0.1,
+ lstm_type="bilstm",
+ use_linear=True,
+ ):
+ super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments
+ self.out_dim = out_dim
+ self.lstm_type = lstm_type
+ self.use_linear = use_linear
+ self.dropout = nn.Dropout(p=p_dropout)
+
+ convolutions = []
+ for i in range(n_layers):
+ conv_layer = ConvNorm(
+ in_dim if i == 0 else n_channels,
+ n_channels,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=int((kernel_size - 1) / 2),
+ dilation=1,
+ w_init_gain="relu",
+ )
+ conv_layer = nn.utils.parametrizations.weight_norm(conv_layer.conv, name="weight")
+ convolutions.append(conv_layer)
+
+ self.convolutions = nn.ModuleList(convolutions)
+
+ if not self.use_linear:
+ n_channels = out_dim
+
+ if self.lstm_type != "":
+ use_bilstm = False
+ lstm_channels = n_channels
+ if self.lstm_type == "bilstm":
+ use_bilstm = True
+ lstm_channels = int(n_channels // 2)
+
+ self.bilstm = nn.LSTM(n_channels, lstm_channels, 1, batch_first=True, bidirectional=use_bilstm)
+ lstm_norm_fn_pntr = nn.utils.spectral_norm
+ self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0")
+ if self.lstm_type == "bilstm":
+ self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0_reverse")
+
+ if self.use_linear:
+ self.dense = nn.Linear(n_channels, out_dim)
+
+ def run_padded_sequence(self, context, lens):
+ context_embedded = []
+ for b_ind in range(context.size()[0]): # TODO: speed up
+ curr_context = context[b_ind : b_ind + 1, :, : lens[b_ind]].clone()
+ for conv in self.convolutions:
+ curr_context = self.dropout(F.relu(conv(curr_context)))
+ context_embedded.append(curr_context[0].transpose(0, 1))
+ context = nn.utils.rnn.pad_sequence(context_embedded, batch_first=True)
+ return context
+
+ def run_unsorted_inputs(self, fn, context, lens): # pylint: disable=no-self-use
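+ # pack_padded_sequence expects length-sorted batches here: sort by length, run the RNN,
+ # then restore the original batch order.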
+ lens_sorted, ids_sorted = torch.sort(lens, descending=True)
+ unsort_ids = [0] * lens.size(0)
+ for i in range(len(ids_sorted)): # pylint: disable=consider-using-enumerate
+ unsort_ids[ids_sorted[i]] = i
+ lens_sorted = lens_sorted.long().cpu()
+
+ context = context[ids_sorted]
+ context = nn.utils.rnn.pack_padded_sequence(context, lens_sorted, batch_first=True)
+ context = fn(context)[0]
+ context = nn.utils.rnn.pad_packed_sequence(context, batch_first=True)[0]
+
+ # map back to original indices
+ context = context[unsort_ids]
+ return context
+
+ def forward(self, context, lens):
+ if context.size()[0] > 1:
+ context = self.run_padded_sequence(context, lens)
+ # to B, D, T
+ context = context.transpose(1, 2)
+ else:
+ for conv in self.convolutions:
+ context = self.dropout(F.relu(conv(context)))
+
+ if self.lstm_type != "":
+ context = context.transpose(1, 2)
+ self.bilstm.flatten_parameters()
+ if lens is not None:
+ context = self.run_unsorted_inputs(self.bilstm, context, lens)
+ else:
+ context = self.bilstm(context)[0]
+ context = context.transpose(1, 2)
+
+ x_hat = context
+ if self.use_linear:
+ x_hat = self.dense(context.transpose(1, 2)).transpose(1, 2)
+
+ return x_hat
+
+
+class DepthWiseConv1d(nn.Module):
+ def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int):
+ super().__init__()
+ self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, groups=in_channels)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.conv(x)
+
+
+class PointwiseConv1d(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ stride: int = 1,
+ padding: int = 0,
+ bias: bool = True,
+ ):
+ super().__init__()
+ self.conv = nn.Conv1d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=stride,
+ padding=padding,
+ bias=bias,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.conv(x)
+
+
+class BSConv1d(nn.Module):
+ """https://arxiv.org/pdf/2003.13549.pdf"""
+
+ def __init__(self, channels_in: int, channels_out: int, kernel_size: int, padding: int):
+ super().__init__()
+ self.pointwise = nn.Conv1d(channels_in, channels_out, kernel_size=1)
+ self.depthwise = nn.Conv1d(
+ channels_out,
+ channels_out,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=channels_out,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x1 = self.pointwise(x)
+ x2 = self.depthwise(x1)
+ return x2
+
+
+class BSConv2d(nn.Module):
+ """https://arxiv.org/pdf/2003.13549.pdf"""
+
+ def __init__(self, channels_in: int, channels_out: int, kernel_size: int, padding: int):
+ super().__init__()
+ self.pointwise = nn.Conv2d(channels_in, channels_out, kernel_size=1)
+ self.depthwise = nn.Conv2d(
+ channels_out,
+ channels_out,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=channels_out,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x1 = self.pointwise(x)
+ x2 = self.depthwise(x1)
+ return x2
+
+
+class Conv1dGLU(nn.Module):
+ """From DeepVoice 3"""
+
+ def __init__(self, d_model: int, kernel_size: int, padding: int, embedding_dim: int):
+ super().__init__()
+ self.conv = BSConv1d(d_model, 2 * d_model, kernel_size=kernel_size, padding=padding)
+ self.embedding_proj = nn.Linear(embedding_dim, d_model)
+ self.register_buffer("sqrt", torch.sqrt(torch.FloatTensor([0.5])).squeeze(0))
+ self.softsign = torch.nn.Softsign()
+
+ def forward(self, x: torch.Tensor, embeddings: torch.Tensor) -> torch.Tensor:
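+ # Gated conv from DeepVoice 3: split channels into content (a) and gate (b), add a
+ # softsign-projected speaker embedding to the content half, gate with sigmoid(b),
+ # then apply the residual and the sqrt(0.5) variance-preserving scale.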
+ x = x.permute((0, 2, 1))
+ residual = x
+ x = self.conv(x)
+ splitdim = 1
+ a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
+ embeddings = self.embedding_proj(embeddings).unsqueeze(2)
+ softsign = self.softsign(embeddings)
+ softsign = softsign.expand_as(a)
+ a = a + softsign
+ x = a * torch.sigmoid(b)
+ x = x + residual
+ x = x * self.sqrt
+ x = x.permute((0, 2, 1))
+ return x
+
+
+class ConvTransposed(nn.Module):
+ """
+ A 1D convolution layer for channel-last tensors.
+ This layer transposes its input from `[B, T, C]` to channel-first, applies a depthwise-separable
+ convolution (BSConv1d), and transposes the result back; the number of input and output channels may differ.
+
+ Attributes:
+ in_channels (int): The number of channels in the input tensor.
+ out_channels (int): The number of channels in the output tensor.
+ kernel_size (int): The size of the convolutional kernel. Default: 1.
+ padding (int): The number of padding elements to add to the input tensor. Default: 0.
+ conv (BSConv1d): The depthwise-separable convolution layer.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int = 1,
+ padding: int = 0,
+ ):
+ super().__init__()
+ self.conv = BSConv1d(
+ in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = x.contiguous().transpose(1, 2)
+ x = self.conv(x)
+ x = x.contiguous().transpose(1, 2)
+ return x
+
+
+class DepthwiseConvModule(nn.Module):
+ def __init__(self, dim: int, kernel_size: int = 7, expansion: int = 4, lrelu_slope: float = 0.3):
+ super().__init__()
+ padding = calc_same_padding(kernel_size)
+ self.depthwise = nn.Conv1d(
+ dim,
+ dim * expansion,
+ kernel_size=kernel_size,
+ padding=padding[0],
+ groups=dim,
+ )
+ self.act = nn.LeakyReLU(lrelu_slope)
+ self.out = nn.Conv1d(dim * expansion, dim, 1, 1, 0)
+ self.ln = nn.LayerNorm(dim)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.ln(x)
+ x = x.permute((0, 2, 1))
+ x = self.depthwise(x)
+ x = self.act(x)
+ x = self.out(x)
+ x = x.permute((0, 2, 1))
+ return x
+
+
+class AddCoords(nn.Module):
+ def __init__(self, rank: int, with_r: bool = False):
+ super().__init__()
+ self.rank = rank
+ self.with_r = with_r
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
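+ # CoordConv: concatenate normalized coordinate channels (scaled to [-1, 1]) so the following
+ # convolution can condition on absolute position; optionally add a radial distance channel.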
+ if self.rank == 1:
+ batch_size_shape, channel_in_shape, dim_x = x.shape # pylint: disable=unused-variable
+ xx_range = torch.arange(dim_x, dtype=torch.int32)
+ xx_channel = xx_range[None, None, :]
+
+ xx_channel = xx_channel.float() / (dim_x - 1)
+ xx_channel = xx_channel * 2 - 1
+ xx_channel = xx_channel.repeat(batch_size_shape, 1, 1)
+
+ xx_channel = xx_channel.to(x.device)
+ out = torch.cat([x, xx_channel], dim=1)
+
+ if self.with_r:
+ rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2))
+ out = torch.cat([out, rr], dim=1)
+
+ elif self.rank == 2:
+ batch_size_shape, channel_in_shape, dim_y, dim_x = x.shape
+ xx_ones = torch.ones([1, 1, 1, dim_x], dtype=torch.int32)
+ yy_ones = torch.ones([1, 1, 1, dim_y], dtype=torch.int32)
+
+ xx_range = torch.arange(dim_y, dtype=torch.int32)
+ yy_range = torch.arange(dim_x, dtype=torch.int32)
+ xx_range = xx_range[None, None, :, None]
+ yy_range = yy_range[None, None, :, None]
+
+ xx_channel = torch.matmul(xx_range, xx_ones)
+ yy_channel = torch.matmul(yy_range, yy_ones)
+
+ # transpose y
+ yy_channel = yy_channel.permute(0, 1, 3, 2)
+
+ xx_channel = xx_channel.float() / (dim_y - 1)
+ yy_channel = yy_channel.float() / (dim_x - 1)
+
+ xx_channel = xx_channel * 2 - 1
+ yy_channel = yy_channel * 2 - 1
+
+ xx_channel = xx_channel.repeat(batch_size_shape, 1, 1, 1)
+ yy_channel = yy_channel.repeat(batch_size_shape, 1, 1, 1)
+
+ xx_channel = xx_channel.to(x.device)
+ yy_channel = yy_channel.to(x.device)
+
+ out = torch.cat([x, xx_channel, yy_channel], dim=1)
+
+ if self.with_r:
+ rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2))
+ out = torch.cat([out, rr], dim=1)
+
+ elif self.rank == 3:
+ batch_size_shape, channel_in_shape, dim_z, dim_y, dim_x = x.shape
+ xx_ones = torch.ones([1, 1, 1, 1, dim_x], dtype=torch.int32)
+ yy_ones = torch.ones([1, 1, 1, 1, dim_y], dtype=torch.int32)
+ zz_ones = torch.ones([1, 1, 1, 1, dim_z], dtype=torch.int32)
+
+ xy_range = torch.arange(dim_y, dtype=torch.int32)
+ xy_range = xy_range[None, None, None, :, None]
+
+ yz_range = torch.arange(dim_z, dtype=torch.int32)
+ yz_range = yz_range[None, None, None, :, None]
+
+ zx_range = torch.arange(dim_x, dtype=torch.int32)
+ zx_range = zx_range[None, None, None, :, None]
+
+ xy_channel = torch.matmul(xy_range, xx_ones)
+ xx_channel = torch.cat([xy_channel + i for i in range(dim_z)], dim=2)
+
+ yz_channel = torch.matmul(yz_range, yy_ones)
+ yz_channel = yz_channel.permute(0, 1, 3, 4, 2)
+ yy_channel = torch.cat([yz_channel + i for i in range(dim_x)], dim=4)
+
+ zx_channel = torch.matmul(zx_range, zz_ones)
+ zx_channel = zx_channel.permute(0, 1, 4, 2, 3)
+ zz_channel = torch.cat([zx_channel + i for i in range(dim_y)], dim=3)
+
+ xx_channel = xx_channel.to(x.device)
+ yy_channel = yy_channel.to(x.device)
+ zz_channel = zz_channel.to(x.device)
+ out = torch.cat([x, xx_channel, yy_channel, zz_channel], dim=1)
+
+ if self.with_r:
+ rr = torch.sqrt(
+ torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2) + torch.pow(zz_channel - 0.5, 2)
+ )
+ out = torch.cat([out, rr], dim=1)
+ else:
+ raise NotImplementedError
+
+ return out
+
+
+class CoordConv1d(nn.modules.conv.Conv1d):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int,
+ stride: int = 1,
+ padding: int = 0,
+ dilation: int = 1,
+ groups: int = 1,
+ bias: bool = True,
+ with_r: bool = False,
+ ):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ groups,
+ bias,
+ )
+ self.rank = 1
+ self.addcoords = AddCoords(self.rank, with_r)
+ self.conv = nn.Conv1d(
+ in_channels + self.rank + int(with_r),
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ groups,
+ bias,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.addcoords(x)
+ x = self.conv(x)
+ return x
+
+
+class CoordConv2d(nn.modules.conv.Conv2d):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int,
+ stride: int = 1,
+ padding: int = 0,
+ dilation: int = 1,
+ groups: int = 1,
+ bias: bool = True,
+ with_r: bool = False,
+ ):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ groups,
+ bias,
+ )
+ self.rank = 2
+ self.addcoords = AddCoords(self.rank, with_r)
+ self.conv = nn.Conv2d(
+ in_channels + self.rank + int(with_r),
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ groups,
+ bias,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.addcoords(x)
+ x = self.conv(x)
+ return x
+
+
+class LVCBlock(torch.nn.Module):
+ """the location-variable convolutions"""
+
+ def __init__( # pylint: disable=dangerous-default-value
+ self,
+ in_channels,
+ cond_channels,
+ stride,
+ dilations=[1, 3, 9, 27],
+ lReLU_slope=0.2,
+ conv_kernel_size=3,
+ cond_hop_length=256,
+ kpnet_hidden_channels=64,
+ kpnet_conv_size=3,
+ kpnet_dropout=0.0,
+ ):
+ super().__init__()
+
+ self.cond_hop_length = cond_hop_length
+ self.conv_layers = len(dilations)
+ self.conv_kernel_size = conv_kernel_size
+
+ self.kernel_predictor = KernelPredictor(
+ cond_channels=cond_channels,
+ conv_in_channels=in_channels,
+ conv_out_channels=2 * in_channels,
+ conv_layers=len(dilations),
+ conv_kernel_size=conv_kernel_size,
+ kpnet_hidden_channels=kpnet_hidden_channels,
+ kpnet_conv_size=kpnet_conv_size,
+ kpnet_dropout=kpnet_dropout,
+ kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
+ )
+
+ self.convt_pre = nn.Sequential(
+ nn.LeakyReLU(lReLU_slope),
+ nn.utils.parametrizations.weight_norm(
+ nn.ConvTranspose1d(
+ in_channels,
+ in_channels,
+ 2 * stride,
+ stride=stride,
+ padding=stride // 2 + stride % 2,
+ output_padding=stride % 2,
+ )
+ ),
+ )
+
+ self.conv_blocks = nn.ModuleList()
+ for dilation in dilations:
+ self.conv_blocks.append(
+ nn.Sequential(
+ nn.LeakyReLU(lReLU_slope),
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ in_channels,
+ in_channels,
+ conv_kernel_size,
+ padding=dilation * (conv_kernel_size - 1) // 2,
+ dilation=dilation,
+ )
+ ),
+ nn.LeakyReLU(lReLU_slope),
+ )
+ )
+
+ def forward(self, x, c):
+ """forward propagation of the location-variable convolutions.
+ Args:
+ x (Tensor): the input sequence (batch, in_channels, in_length)
+ c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
+
+ Returns:
+ Tensor: the output sequence (batch, in_channels, in_length)
+ """
+ _, in_channels, _ = x.shape # (B, c_g, L')
+
+ x = self.convt_pre(x) # (B, c_g, stride * L')
+ kernels, bias = self.kernel_predictor(c)
+
+ for i, conv in enumerate(self.conv_blocks):
+ output = conv(x) # (B, c_g, stride * L')
+
+ k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length)
+ b = bias[:, i, :, :] # (B, 2 * c_g, cond_length)
+
+ output = self.location_variable_convolution(
+ output, k, b, hop_size=self.cond_hop_length
+ ) # (B, 2 * c_g, stride * L'): LVC
+ x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
+ output[:, in_channels:, :]
+ ) # (B, c_g, stride * L'): GAU
+
+ return x
+
+ def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use
+ """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
+ Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
+ Args:
+ x (Tensor): the input sequence (batch, in_channels, in_length).
+ kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
+ bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
+ dilation (int): the dilation of convolution.
+ hop_size (int): the hop_size of the conditioning sequence.
+ Returns:
+ (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
+ """
+ batch, _, in_length = x.shape
+ batch, _, out_channels, kernel_size, kernel_length = kernel.shape
+ assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
+
+ padding = dilation * int((kernel_size - 1) / 2)
+ x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding)
+ x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
+
+ if hop_size < dilation:
+ x = F.pad(x, (0, dilation), "constant", 0)
+ x = x.unfold(
+ 3, dilation, dilation
+ ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
+ x = x[:, :, :, :, :hop_size]
+ x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
+ x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
+
+ o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
+ o = o.to(memory_format=torch.channels_last_3d)
+ bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
+ o = o + bias
+ o = o.contiguous().view(batch, out_channels, -1)
+
+ return o
+
+ def remove_weight_norm(self):
+ self.kernel_predictor.remove_weight_norm()
+ parametrize.remove_parametrizations(self.convt_pre[1], "weight")
+ for block in self.conv_blocks:
+ parametrize.remove_parametrizations(block[1], "weight")
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/encoders.py b/submodules/TTS/TTS/tts/layers/delightful_tts/encoders.py
new file mode 100644
index 0000000000000000000000000000000000000000..0878f0677a29d092597a46e8a3b11e4a521769b8
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/encoders.py
@@ -0,0 +1,261 @@
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+import torch.nn.functional as F
+
+from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention
+from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d
+from TTS.tts.layers.delightful_tts.networks import STL
+
+
+def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor:
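+ # Boolean padding mask where True marks padded positions, e.g. lengths=[2, 3] gives
+ # [[False, False, True], [False, False, False]].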
+ batch_size = lengths.shape[0]
+ max_len = torch.max(lengths).item()
+ ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1)
+ mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
+ return mask
+
+
+def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor:
+ return torch.ceil(lens / stride).int()
+
+
+class ReferenceEncoder(nn.Module):
+ """
+ Reference encoder for the utterance and phoneme prosody encoders, made up of
+ convolution and RNN layers.
+
+ Args:
+ num_mels (int): Number of mel channels in the input spectrogram.
+ ref_enc_filters (list[int]): List of channel sizes for encoder layers.
+ ref_enc_size (int): Size of the kernel for the conv layers.
+ ref_enc_strides (List[int]): List of strides to use for conv layers.
+ ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, dim, time): Tensor containing mel vector
+ - **lengths** (batch): Tensor containing the mel lengths.
+ Returns:
+ - **outputs** (batch, time, dim): Tensor produced by Reference Encoder.
+ """
+
+ def __init__(
+ self,
+ num_mels: int,
+ ref_enc_filters: List[int],
+ ref_enc_size: int,
+ ref_enc_strides: List[int],
+ ref_enc_gru_size: int,
+ ):
+ super().__init__()
+
+ n_mel_channels = num_mels
+ self.n_mel_channels = n_mel_channels
+ K = len(ref_enc_filters)
+ filters = [self.n_mel_channels] + ref_enc_filters
+ strides = [1] + ref_enc_strides
+ # Use CoordConv at the first layer to better preserve positional information: https://arxiv.org/pdf/1811.02122.pdf
+ convs = [
+ CoordConv1d(
+ in_channels=filters[0],
+ out_channels=filters[0 + 1],
+ kernel_size=ref_enc_size,
+ stride=strides[0],
+ padding=ref_enc_size // 2,
+ with_r=True,
+ )
+ ]
+ convs2 = [
+ nn.Conv1d(
+ in_channels=filters[i],
+ out_channels=filters[i + 1],
+ kernel_size=ref_enc_size,
+ stride=strides[i],
+ padding=ref_enc_size // 2,
+ )
+ for i in range(1, K)
+ ]
+ convs.extend(convs2)
+ self.convs = nn.ModuleList(convs)
+
+ self.norms = nn.ModuleList([nn.InstanceNorm1d(num_features=ref_enc_filters[i], affine=True) for i in range(K)])
+
+ self.gru = nn.GRU(
+ input_size=ref_enc_filters[-1],
+ hidden_size=ref_enc_gru_size,
+ batch_first=True,
+ )
+
+ def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ inputs --- [N, n_mels, timesteps]
+ outputs --- [N, E//2]
+ """
+
+ mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1)
+ x = x.masked_fill(mel_masks, 0)
+ for conv, norm in zip(self.convs, self.norms):
+ x = conv(x)
+ x = F.leaky_relu(x, 0.3) # [N, 128, Ty//2^K, n_mels//2^K]
+ x = norm(x)
+
+ for _ in range(2):
+ mel_lens = stride_lens(mel_lens)
+
+ mel_masks = get_mask_from_lengths(mel_lens)
+
+ x = x.masked_fill(mel_masks.unsqueeze(1), 0)
+ x = x.permute((0, 2, 1))
+ x = torch.nn.utils.rnn.pack_padded_sequence(x, mel_lens.cpu().int(), batch_first=True, enforce_sorted=False)
+
+ self.gru.flatten_parameters()
+ x, memory = self.gru(x) # memory --- [N, Ty, E//2], out --- [1, N, E//2]
+ x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
+
+ return x, memory, mel_masks
+
+ def calculate_channels( # pylint: disable=no-self-use
+ self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int
+ ) -> int:
+ for _ in range(n_convs):
+ L = (L - kernel_size + 2 * pad) // stride + 1
+ return L
+
+
+class UtteranceLevelProsodyEncoder(nn.Module):
+ def __init__(
+ self,
+ num_mels: int,
+ ref_enc_filters: List[int],
+ ref_enc_size: int,
+ ref_enc_strides: List[int],
+ ref_enc_gru_size: int,
+ dropout: float,
+ n_hidden: int,
+ bottleneck_size_u: int,
+ token_num: int,
+ ):
+ """
+ Encoder to extract prosody from an utterance. It is made up of a reference encoder
+ with a couple of linear layers and a style token layer with dropout.
+
+ Args:
+ num_mels (int): Number of mel channels in the input spectrogram.
+ ref_enc_filters (list[int]): List of channel sizes for ref encoder layers.
+ ref_enc_size (int): Size of the kernel for the ref encoder conv layers.
+ ref_enc_strides (List[int]): List of strides to use for the ref encoder conv layers.
+ ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
+ dropout (float): Probability of dropout.
+ n_hidden (int): Size of hidden layers.
+ bottleneck_size_u (int): Size of the bottleneck layer.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, dim, time): Tensor containing mel vector
+ - **lengths** (batch): Tensor containing the mel lengths.
+ Returns:
+ - **outputs** (batch, 1, dim): Tensor produced by Utterance Level Prosody Encoder.
+ """
+ super().__init__()
+
+ self.E = n_hidden
+ self.d_q = self.d_k = n_hidden
+ bottleneck_size = bottleneck_size_u
+
+ self.encoder = ReferenceEncoder(
+ ref_enc_filters=ref_enc_filters,
+ ref_enc_gru_size=ref_enc_gru_size,
+ ref_enc_size=ref_enc_size,
+ ref_enc_strides=ref_enc_strides,
+ num_mels=num_mels,
+ )
+ self.encoder_prj = nn.Linear(ref_enc_gru_size, self.E // 2)
+ self.stl = STL(n_hidden=n_hidden, token_num=token_num)
+ self.encoder_bottleneck = nn.Linear(self.E, bottleneck_size)
+ self.dropout = nn.Dropout(dropout)
+
+ def forward(self, mels: torch.Tensor, mel_lens: torch.Tensor) -> torch.Tensor:
+ """
+ Shapes:
+ mels: :math: `[B, C, T]`
+ mel_lens: :math: `[B]`
+
+ out --- [N, seq_len, E]
+ """
+ _, embedded_prosody, _ = self.encoder(mels, mel_lens)
+
+ # Bottleneck
+ embedded_prosody = self.encoder_prj(embedded_prosody)
+
+ # Style Token
+ out = self.encoder_bottleneck(self.stl(embedded_prosody))
+ out = self.dropout(out)
+
+ out = out.view((-1, 1, out.shape[3]))
+ return out
+
+
+class PhonemeLevelProsodyEncoder(nn.Module):
+ def __init__(
+ self,
+ num_mels: int,
+ ref_enc_filters: List[Union[int, int, int, int, int, int]],
+ ref_enc_size: int,
+ ref_enc_strides: List[Union[int, int, int, int, int]],
+ ref_enc_gru_size: int,
+ dropout: float,
+ n_hidden: int,
+ n_heads: int,
+ bottleneck_size_p: int,
+ ):
+ super().__init__()
+
+ self.E = n_hidden
+ self.d_q = self.d_k = n_hidden
+ bottleneck_size = bottleneck_size_p
+
+ self.encoder = ReferenceEncoder(
+ ref_enc_filters=ref_enc_filters,
+ ref_enc_gru_size=ref_enc_gru_size,
+ ref_enc_size=ref_enc_size,
+ ref_enc_strides=ref_enc_strides,
+ num_mels=num_mels,
+ )
+ self.encoder_prj = nn.Linear(ref_enc_gru_size, n_hidden)
+ self.attention = ConformerMultiHeadedSelfAttention(
+ d_model=n_hidden,
+ num_heads=n_heads,
+ dropout_p=dropout,
+ )
+ self.encoder_bottleneck = nn.Linear(n_hidden, bottleneck_size)
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ src_mask: torch.Tensor,
+ mels: torch.Tensor,
+ mel_lens: torch.Tensor,
+ encoding: torch.Tensor,
+ ) -> torch.Tensor:
+ """
+ x --- [N, seq_len, encoder_embedding_dim]
+ mels --- [N, Ty/r, n_mels*r], r=1
+ out --- [N, seq_len, bottleneck_size]
+ attn --- [N, seq_len, ref_len], Ty/r = ref_len
+ """
+ embedded_prosody, _, mel_masks = self.encoder(mels, mel_lens)
+
+ # Bottleneck
+ embedded_prosody = self.encoder_prj(embedded_prosody)
+
+ attn_mask = mel_masks.view((mel_masks.shape[0], 1, 1, -1))
+ x, _ = self.attention(
+ query=x,
+ key=embedded_prosody,
+ value=embedded_prosody,
+ mask=attn_mask,
+ encoding=encoding,
+ )
+ x = self.encoder_bottleneck(x)
+ x = x.masked_fill(src_mask.unsqueeze(-1), 0.0)
+ return x
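+
+
+if __name__ == "__main__":
+    # Minimal shape-check sketch for the utterance-level prosody encoder. The
+    # hyper-parameter values below are illustrative assumptions, not authoritative
+    # defaults; shapes follow the docstrings above.
+    utterance_encoder = UtteranceLevelProsodyEncoder(
+        num_mels=80,
+        ref_enc_filters=[32, 32, 64, 64, 128, 128],
+        ref_enc_size=3,
+        ref_enc_strides=[1, 2, 1, 2, 1],  # exactly two stride-2 convs, as ReferenceEncoder.forward expects
+        ref_enc_gru_size=32,
+        dropout=0.1,
+        n_hidden=384,
+        bottleneck_size_u=256,
+        token_num=32,
+    )
+    mels = torch.randn(2, 80, 64)  # [B, n_mels, T_mel]
+    mel_lens = torch.tensor([64, 50])
+    out = utterance_encoder(mels, mel_lens)
+    print(out.shape)  # [B, 1, bottleneck_size_u] -> torch.Size([2, 1, 256])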
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/energy_adaptor.py b/submodules/TTS/TTS/tts/layers/delightful_tts/energy_adaptor.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea0d1e47214d81a42b934bbaaa4b3ebb9f63bcc6
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/energy_adaptor.py
@@ -0,0 +1,82 @@
+from typing import Callable, Tuple
+
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+
+from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
+from TTS.tts.utils.helpers import average_over_durations
+
+
+class EnergyAdaptor(nn.Module): # pylint: disable=abstract-method
+ """Variance Adaptor with an added 1D conv layer. Used to
+ get energy embeddings.
+
+ Args:
+ channels_in (int): Number of in channels for conv layers.
+ channels_hidden (int): Number of hidden channels.
+ channels_out (int): Number of out channels.
+ kernel_size (int): Size of the kernel for the conv layers.
+ dropout (float): Probability of dropout.
+ lrelu_slope (float): Slope for the leaky relu.
+ emb_kernel_size (int): Size of the kernel for the energy embedding.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, time1, dim): Tensor containing input vector
+ - **target** (batch, 1, time2): Tensor containing the energy target
+ - **dr** (batch, time1): Tensor containing aligner durations vector
+ - **mask** (batch, time1): Tensor containing indices to be masked
+ Returns:
+ - **energy prediction** (batch, 1, time1): Tensor produced by the energy predictor
+ - **energy embedding** (batch, channels, time1): Tensor produced by the energy adaptor
+ - **average energy target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
+
+ """
+
+ def __init__(
+ self,
+ channels_in: int,
+ channels_hidden: int,
+ channels_out: int,
+ kernel_size: int,
+ dropout: float,
+ lrelu_slope: float,
+ emb_kernel_size: int,
+ ):
+ super().__init__()
+ self.energy_predictor = VariancePredictor(
+ channels_in=channels_in,
+ channels=channels_hidden,
+ channels_out=channels_out,
+ kernel_size=kernel_size,
+ p_dropout=dropout,
+ lrelu_slope=lrelu_slope,
+ )
+ self.energy_emb = nn.Conv1d(
+ 1,
+ channels_hidden,
+ kernel_size=emb_kernel_size,
+ padding=int((emb_kernel_size - 1) / 2),
+ )
+
+ def get_energy_embedding_train(
+ self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Shapes:
+ x: :math: `[B, T_src, C]`
+ target: :math: `[B, 1, T_max2]`
+ dr: :math: `[B, T_src]`
+ mask: :math: `[B, T_src]`
+ """
+ energy_pred = self.energy_predictor(x, mask)
+ energy_pred.unsqueeze_(1)
+ avg_energy_target = average_over_durations(target, dr)
+ energy_emb = self.energy_emb(avg_energy_target)
+ return energy_pred, avg_energy_target, energy_emb
+
+ def get_energy_embedding(self, x: torch.Tensor, mask: torch.Tensor, energy_transform: Callable) -> Tuple[torch.Tensor, torch.Tensor]:
+ energy_pred = self.energy_predictor(x, mask)
+ energy_pred.unsqueeze_(1)
+ if energy_transform is not None:
+ energy_pred = energy_transform(energy_pred, (~mask).sum(dim=(1, 2)), self.pitch_mean, self.pitch_std)
+ energy_emb_pred = self.energy_emb(energy_pred)
+ return energy_emb_pred, energy_pred
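+
+
+if __name__ == "__main__":
+    # Minimal training-path sketch with assumed, illustrative sizes: the per-frame
+    # energy target is averaged over the aligner durations so it can be compared
+    # against the per-phoneme energy prediction.
+    adaptor = EnergyAdaptor(
+        channels_in=128,
+        channels_hidden=256,
+        channels_out=1,
+        kernel_size=5,
+        dropout=0.1,
+        lrelu_slope=0.3,
+        emb_kernel_size=3,
+    )
+    x = torch.randn(2, 6, 128)  # [B, T_src, C]
+    target = torch.randn(2, 1, 24)  # [B, 1, T_mel]
+    dr = torch.tensor([[4, 4, 4, 4, 4, 4], [6, 2, 6, 2, 6, 2]])  # durations sum to T_mel per item
+    mask = torch.zeros(2, 6, dtype=torch.bool)  # no padded positions in this toy batch
+    pred, avg_target, emb = adaptor.get_energy_embedding_train(x, target, dr, mask)
+    print(pred.shape, avg_target.shape, emb.shape)  # [2, 1, 6] [2, 1, 6] [2, 256, 6]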
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/kernel_predictor.py b/submodules/TTS/TTS/tts/layers/delightful_tts/kernel_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..96c550b6c2609c52762bb3eaca373e31f0599bf8
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/kernel_predictor.py
@@ -0,0 +1,128 @@
+import torch.nn as nn # pylint: disable=consider-using-from-import
+from torch.nn.utils import parametrize
+
+
+class KernelPredictor(nn.Module):
+ """Kernel predictor for the location-variable convolutions
+
+ Args:
+ cond_channels (int): number of channels for the conditioning sequence,
+ conv_in_channels (int): number of channels for the input sequence,
+ conv_out_channels (int): number of channels for the output sequence,
+ conv_layers (int): number of layers
+
+ """
+
+ def __init__( # pylint: disable=dangerous-default-value
+ self,
+ cond_channels,
+ conv_in_channels,
+ conv_out_channels,
+ conv_layers,
+ conv_kernel_size=3,
+ kpnet_hidden_channels=64,
+ kpnet_conv_size=3,
+ kpnet_dropout=0.0,
+ kpnet_nonlinear_activation="LeakyReLU",
+ kpnet_nonlinear_activation_params={"negative_slope": 0.1},
+ ):
+ super().__init__()
+
+ self.conv_in_channels = conv_in_channels
+ self.conv_out_channels = conv_out_channels
+ self.conv_kernel_size = conv_kernel_size
+ self.conv_layers = conv_layers
+
+ kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w
+ kpnet_bias_channels = conv_out_channels * conv_layers # l_b
+
+ self.input_conv = nn.Sequential(
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
+ ),
+ getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
+ )
+
+ self.residual_convs = nn.ModuleList()
+ padding = (kpnet_conv_size - 1) // 2
+ for _ in range(3):
+ self.residual_convs.append(
+ nn.Sequential(
+ nn.Dropout(kpnet_dropout),
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_hidden_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ ),
+ getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_hidden_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ ),
+ getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
+ )
+ )
+ self.kernel_conv = nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_kernel_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ )
+ self.bias_conv = nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_bias_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ )
+
+ def forward(self, c):
+ """
+ Args:
+ c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
+ """
+ batch, _, cond_length = c.shape
+ c = self.input_conv(c)
+ for residual_conv in self.residual_convs:
+ residual_conv.to(c.device)
+ c = c + residual_conv(c)
+ k = self.kernel_conv(c)
+ b = self.bias_conv(c)
+ kernels = k.contiguous().view(
+ batch,
+ self.conv_layers,
+ self.conv_in_channels,
+ self.conv_out_channels,
+ self.conv_kernel_size,
+ cond_length,
+ )
+ bias = b.contiguous().view(
+ batch,
+ self.conv_layers,
+ self.conv_out_channels,
+ cond_length,
+ )
+
+ return kernels, bias
+
+ def remove_weight_norm(self):
+ parametrize.remove_parametrizations(self.input_conv[0], "weight")
+ parametrize.remove_parametrizations(self.kernel_conv, "weight")
+ parametrize.remove_parametrizations(self.bias_conv, "weight")
+ for block in self.residual_convs:
+ parametrize.remove_parametrizations(block[1], "weight")
+ parametrize.remove_parametrizations(block[3], "weight")
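+
+
+if __name__ == "__main__":
+    # Shape sketch for the location-variable convolution kernels; the sizes below are
+    # illustrative assumptions. One kernel/bias set is predicted per conditioning frame.
+    import torch
+
+    kp = KernelPredictor(cond_channels=80, conv_in_channels=32, conv_out_channels=64, conv_layers=4)
+    c = torch.randn(3, 80, 100)  # conditioning mel: [B, cond_channels, cond_length]
+    kernels, bias = kp(c)
+    print(kernels.shape)  # [B, conv_layers, conv_in, conv_out, kernel_size, cond_length] -> [3, 4, 32, 64, 3, 100]
+    print(bias.shape)  # [B, conv_layers, conv_out, cond_length] -> [3, 4, 64, 100]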
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/networks.py b/submodules/TTS/TTS/tts/layers/delightful_tts/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..4305022f18cf95565b2da2553740276818fb486c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/networks.py
@@ -0,0 +1,219 @@
+import math
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+import torch.nn.functional as F
+
+from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm
+
+
+def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor:
+ assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..."
+ # Kaiming initialization
+ return torch.randn(shape) * np.sqrt(2 / shape[1])
+
+
+def positional_encoding(d_model: int, length: int, device: torch.device) -> torch.Tensor:
+ pe = torch.zeros(length, d_model, device=device)
+ position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * -(math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0)
+ return pe
+
+
+class BottleneckLayer(nn.Module):
+ """
+ Bottleneck layer for reducing the dimensionality of a tensor.
+
+ Args:
+ in_dim: The number of input dimensions.
+ reduction_factor: The factor by which to reduce the number of dimensions.
+ norm: The normalization method to use. Can be "weightnorm" or "instancenorm".
+ non_linearity: The non-linearity to use. Can be "relu" or "leakyrelu".
+ kernel_size: The size of the convolutional kernel.
+ use_partial_padding: Whether to use partial padding with the convolutional kernel.
+
+ Shape:
+ - Input: :math:`[N, in_dim, T]` where `N` is the batch size and `in_dim` is the number of input channels.
+
+ - Output: :math:`[N, out_dim, T]` where `out_dim` is the number of output channels.
+ """
+
+ def __init__(
+ self,
+ in_dim,
+ reduction_factor,
+ norm="weightnorm",
+ non_linearity="relu",
+ kernel_size=3,
+ use_partial_padding=False, # pylint: disable=unused-argument
+ ):
+ super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments
+
+ self.reduction_factor = reduction_factor
+ reduced_dim = int(in_dim / reduction_factor)
+ self.out_dim = reduced_dim
+ if self.reduction_factor > 1:
+ fn = ConvNorm(in_dim, reduced_dim, kernel_size=kernel_size, use_weight_norm=(norm == "weightnorm"))
+ if norm == "instancenorm":
+ fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))
+
+ self.projection_fn = fn
+ self.non_linearity = nn.ReLU()
+ if non_linearity == "leakyrelu":
+ self.non_linearity = nn.LeakyReLU()
+
+ def forward(self, x):
+ if self.reduction_factor > 1:
+ x = self.projection_fn(x)
+ x = self.non_linearity(x)
+ return x
+
+
+class GLUActivation(nn.Module):
+ """Class that implements the Gated Linear Unit (GLU) activation function.
+
+ The GLU activation function is a variant of the Leaky ReLU activation function,
+ where the output of the activation function is gated by an input tensor.
+
+ """
+
+ def __init__(self, slope: float):
+ super().__init__()
+ self.lrelu = nn.LeakyReLU(slope)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ out, gate = x.chunk(2, dim=1)
+ x = out * self.lrelu(gate)
+ return x
+
+
+class StyleEmbedAttention(nn.Module):
+ def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
+ super().__init__()
+ self.num_units = num_units
+ self.num_heads = num_heads
+ self.key_dim = key_dim
+
+ self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
+ self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+ self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+
+ def forward(self, query: torch.Tensor, key_soft: torch.Tensor) -> torch.Tensor:
+ values = self.W_value(key_soft)
+ split_size = self.num_units // self.num_heads
+ values = torch.stack(torch.split(values, split_size, dim=2), dim=0)
+
+ out_soft = scores_soft = None
+ querys = self.W_query(query) # [N, T_q, num_units]
+ keys = self.W_key(key_soft) # [N, T_k, num_units]
+
+ # [h, N, T_q, num_units/h]
+ querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
+ # [h, N, T_k, num_units/h]
+ keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)
+ # [h, N, T_k, num_units/h]
+
+ # score = softmax(QK^T / (d_k ** 0.5))
+ scores_soft = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
+ scores_soft = scores_soft / (self.key_dim**0.5)
+ scores_soft = F.softmax(scores_soft, dim=3)
+
+ # out = score * V
+ # [h, N, T_q, num_units/h]
+ out_soft = torch.matmul(scores_soft, values)
+ out_soft = torch.cat(torch.split(out_soft, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
+
+ return out_soft # , scores_soft
+
+
+class EmbeddingPadded(nn.Module):
+ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
+ super().__init__()
+ padding_mult = torch.ones((num_embeddings, 1), dtype=torch.int64)
+ padding_mult[padding_idx] = 0
+ self.register_buffer("padding_mult", padding_mult)
+ self.embeddings = nn.parameter.Parameter(initialize_embeddings((num_embeddings, embedding_dim)))
+
+ def forward(self, idx: torch.Tensor) -> torch.Tensor:
+ embeddings_zeroed = self.embeddings * self.padding_mult
+ x = F.embedding(idx, embeddings_zeroed)
+ return x
+
+
+class EmbeddingProjBlock(nn.Module):
+ def __init__(self, embedding_dim: int):
+ super().__init__()
+ self.layers = nn.ModuleList(
+ [
+ nn.Linear(embedding_dim, embedding_dim),
+ nn.LeakyReLU(0.3),
+ nn.Linear(embedding_dim, embedding_dim),
+ nn.LeakyReLU(0.3),
+ ]
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ res = x
+ for layer in self.layers:
+ x = layer(x)
+ x = x + res
+ return x
+
+
+class LinearNorm(nn.Module):
+ def __init__(self, in_features: int, out_features: int, bias: bool = False):
+ super().__init__()
+ self.linear = nn.Linear(in_features, out_features, bias)
+
+ nn.init.xavier_uniform_(self.linear.weight)
+ if bias:
+ nn.init.constant_(self.linear.bias, 0.0)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.linear(x)
+ return x
+
+
+class STL(nn.Module):
+ """
+ A PyTorch module for the Style Token Layer (STL) as described in
+ "Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis"
+ (https://arxiv.org/abs/1803.09017)
+
+ The STL applies a multi-headed attention mechanism over the learned style tokens,
+ using the reference embedding as the query and the style tokens as the keys and values.
+ The output of the attention mechanism is used as the utterance's style embedding.
+
+ Args:
+ token_num (int): The number of style tokens.
+ n_hidden (int): Number of hidden dimensions.
+ """
+
+ def __init__(self, n_hidden: int, token_num: int):
+ super(STL, self).__init__() # pylint: disable=super-with-arguments
+
+ num_heads = 1
+ E = n_hidden
+ self.token_num = token_num
+ self.embed = nn.Parameter(torch.FloatTensor(self.token_num, E // num_heads))
+ d_q = E // 2
+ d_k = E // num_heads
+ self.attention = StyleEmbedAttention(query_dim=d_q, key_dim=d_k, num_units=E, num_heads=num_heads)
+
+ torch.nn.init.normal_(self.embed, mean=0, std=0.5)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ N = x.size(0)
+ query = x.unsqueeze(1) # [N, 1, E//2]
+
+ keys_soft = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1) # [N, token_num, E // num_heads]
+
+ # Weighted sum
+ emotion_embed_soft = self.attention(query, keys_soft)
+
+ return emotion_embed_soft
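+
+
+if __name__ == "__main__":
+    # Minimal sketch of the style token layer: a [N, E//2] reference embedding attends
+    # over `token_num` learned tokens. The sizes below are illustrative assumptions.
+    stl = STL(n_hidden=384, token_num=32)
+    ref = torch.randn(4, 192)  # [N, E//2] reference embedding
+    style = stl(ref)
+    print(style.shape)  # [N, 1, E] -> torch.Size([4, 1, 384])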
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/phoneme_prosody_predictor.py b/submodules/TTS/TTS/tts/layers/delightful_tts/phoneme_prosody_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..28418f7163361120914f277446f76ac9f0363254
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/phoneme_prosody_predictor.py
@@ -0,0 +1,65 @@
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+
+from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
+
+
+class PhonemeProsodyPredictor(nn.Module):
+ """Non-parallel Prosody Predictor inspired by: https://arxiv.org/pdf/2102.00851.pdf
+ It consists of two 1D convolution layers, each followed by a leaky ReLU activation, layer norm,
+ and dropout, and finally a linear bottleneck layer.
+
+ Args:
+ hidden_size (int): Size of hidden channels.
+ kernel_size (int): Kernel size for the conv layers.
+ dropout: (float): Probability of dropout.
+ bottleneck_size (int): bottleneck size for last linear layer.
+ lrelu_slope (float): Slope of the leaky relu.
+ """
+
+ def __init__(
+ self,
+ hidden_size: int,
+ kernel_size: int,
+ dropout: float,
+ bottleneck_size: int,
+ lrelu_slope: float,
+ ):
+ super().__init__()
+ self.d_model = hidden_size
+ self.layers = nn.ModuleList(
+ [
+ ConvTransposed(
+ self.d_model,
+ self.d_model,
+ kernel_size=kernel_size,
+ padding=(kernel_size - 1) // 2,
+ ),
+ nn.LeakyReLU(lrelu_slope),
+ nn.LayerNorm(self.d_model),
+ nn.Dropout(dropout),
+ ConvTransposed(
+ self.d_model,
+ self.d_model,
+ kernel_size=kernel_size,
+ padding=(kernel_size - 1) // 2,
+ ),
+ nn.LeakyReLU(lrelu_slope),
+ nn.LayerNorm(self.d_model),
+ nn.Dropout(dropout),
+ ]
+ )
+ self.predictor_bottleneck = nn.Linear(self.d_model, bottleneck_size)
+
+ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+ """
+ Shapes:
+ x: :math: `[B, T, D]`
+ mask: :math: `[B, T]`
+ """
+ mask = mask.unsqueeze(2)
+ for layer in self.layers:
+ x = layer(x)
+ x = x.masked_fill(mask, 0.0)
+ x = self.predictor_bottleneck(x)
+ return x
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/pitch_adaptor.py b/submodules/TTS/TTS/tts/layers/delightful_tts/pitch_adaptor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9031369e0f019cf115d0d43b288bb97d9db48467
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/pitch_adaptor.py
@@ -0,0 +1,88 @@
+from typing import Callable, Tuple
+
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+
+from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
+from TTS.tts.utils.helpers import average_over_durations
+
+
+class PitchAdaptor(nn.Module): # pylint: disable=abstract-method
+ """Module to get pitch embeddings via pitch predictor
+
+ Args:
+ n_input (int): Number of pitch predictor input channels.
+ n_hidden (int): Number of pitch predictor hidden channels.
+ n_out (int): Number of pitch predictor out channels.
+ kernel_size (int): Size of the kernel for the conv layers.
+ emb_kernel_size (int): Size of the kernel for the pitch embedding.
+ p_dropout (float): Probability of dropout.
+ lrelu_slope (float): Slope for the leaky relu.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, time1, dim): Tensor containing input vector
+ - **target** (batch, 1, time2): Tensor containing the pitch target
+ - **dr** (batch, time1): Tensor containing aligner durations vector
+ - **mask** (batch, time1): Tensor containing indices to be masked
+ Returns:
+ - **pitch prediction** (batch, 1, time1): Tensor produced by pitch predictor
+ - **pitch embedding** (batch, channels, time1): Tensor produced by the pitch adaptor
+ - **average pitch target(train only)** (batch, 1, time1): Tensor produced after averaging over durations
+ """
+
+ def __init__(
+ self,
+ n_input: int,
+ n_hidden: int,
+ n_out: int,
+ kernel_size: int,
+ emb_kernel_size: int,
+ p_dropout: float,
+ lrelu_slope: float,
+ ):
+ super().__init__()
+ self.pitch_predictor = VariancePredictor(
+ channels_in=n_input,
+ channels=n_hidden,
+ channels_out=n_out,
+ kernel_size=kernel_size,
+ p_dropout=p_dropout,
+ lrelu_slope=lrelu_slope,
+ )
+ self.pitch_emb = nn.Conv1d(
+ 1,
+ n_input,
+ kernel_size=emb_kernel_size,
+ padding=int((emb_kernel_size - 1) / 2),
+ )
+
+ def get_pitch_embedding_train(
+ self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Shapes:
+ x: :math: `[B, T_src, C]`
+ target: :math: `[B, 1, T_max2]`
+ dr: :math: `[B, T_src]`
+ mask: :math: `[B, T_src]`
+ """
+ pitch_pred = self.pitch_predictor(x, mask) # [B, T_src, C_hidden], [B, T_src] --> [B, T_src]
+ pitch_pred.unsqueeze_(1) # --> [B, 1, T_src]
+ avg_pitch_target = average_over_durations(target, dr) # [B, 1, T_mel], [B, T_src] --> [B, 1, T_src]
+ pitch_emb = self.pitch_emb(avg_pitch_target) # [B, 1, T_src] --> [B, C_hidden, T_src]
+ return pitch_pred, avg_pitch_target, pitch_emb
+
+ def get_pitch_embedding(
+ self,
+ x: torch.Tensor,
+ mask: torch.Tensor,
+ pitch_transform: Callable,
+ pitch_mean: torch.Tensor,
+ pitch_std: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ pitch_pred = self.pitch_predictor(x, mask)
+ if pitch_transform is not None:
+ pitch_pred = pitch_transform(pitch_pred, (~mask).sum(), pitch_mean, pitch_std)
+ pitch_pred.unsqueeze_(1)
+ pitch_emb_pred = self.pitch_emb(pitch_pred)
+ return pitch_emb_pred, pitch_pred
diff --git a/submodules/TTS/TTS/tts/layers/delightful_tts/variance_predictor.py b/submodules/TTS/TTS/tts/layers/delightful_tts/variance_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..68303a1bd1148089eab7ee8be12d4f37ddf420e1
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/delightful_tts/variance_predictor.py
@@ -0,0 +1,68 @@
+import torch
+import torch.nn as nn # pylint: disable=consider-using-from-import
+
+from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
+
+
+class VariancePredictor(nn.Module):
+ """
+ The network consists of two 1D convolution layers with leaky ReLU activations, each
+ followed by layer normalization and dropout, and a final linear layer that projects
+ the hidden states onto the output sequence.
+
+ Args:
+ channels_in (int): Number of in channels for the conv layers.
+ channels (int): Number of hidden channels.
+ channels_out (int): Number of out channels for the last linear layer.
+ kernel_size (int): Size of the kernel for the conv layers.
+ p_dropout (float): Probability of dropout.
+ lrelu_slope (float): Slope for the leaky relu.
+
+ Inputs: inputs, mask
+ - **inputs** (batch, time, dim): Tensor containing input vector
+ - **mask** (batch, time): Tensor containing indices to be masked
+ Returns:
+ - **outputs** (batch, time): Tensor produced by last linear layer.
+ """
+
+ def __init__(
+ self, channels_in: int, channels: int, channels_out: int, kernel_size: int, p_dropout: float, lrelu_slope: float
+ ):
+ super().__init__()
+
+ self.layers = nn.ModuleList(
+ [
+ ConvTransposed(
+ channels_in,
+ channels,
+ kernel_size=kernel_size,
+ padding=(kernel_size - 1) // 2,
+ ),
+ nn.LeakyReLU(lrelu_slope),
+ nn.LayerNorm(channels),
+ nn.Dropout(p_dropout),
+ ConvTransposed(
+ channels,
+ channels,
+ kernel_size=kernel_size,
+ padding=(kernel_size - 1) // 2,
+ ),
+ nn.LeakyReLU(lrelu_slope),
+ nn.LayerNorm(channels),
+ nn.Dropout(p_dropout),
+ ]
+ )
+
+ self.linear_layer = nn.Linear(channels, channels_out)
+
+ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+ """
+ Shapes:
+ x: :math: `[B, T_src, C]`
+ mask: :math: `[B, T_src]`
+ """
+ for layer in self.layers:
+ x = layer(x)
+ x = self.linear_layer(x)
+ x = x.squeeze(-1)
+ x = x.masked_fill(mask, 0.0)
+ return x
diff --git a/submodules/TTS/TTS/tts/layers/feed_forward/__init__.py b/submodules/TTS/TTS/tts/layers/feed_forward/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/feed_forward/decoder.py b/submodules/TTS/TTS/tts/layers/feed_forward/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0376e2e3926e65254c3a81d085d48c97df033958
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/feed_forward/decoder.py
@@ -0,0 +1,228 @@
+import torch
+from torch import nn
+
+from TTS.tts.layers.generic.res_conv_bn import Conv1dBN, Conv1dBNBlock, ResidualConv1dBNBlock
+from TTS.tts.layers.generic.transformer import FFTransformerBlock
+from TTS.tts.layers.generic.wavenet import WNBlocks
+from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
+
+
+class WaveNetDecoder(nn.Module):
+ """WaveNet based decoder with a prenet and a postnet.
+
+ prenet: conv1d_1x1
+ postnet: 3 x [conv1d_1x1 -> relu] -> conv1d_1x1
+
+ TODO: Integrate speaker conditioning vector.
+
+ Note:
+ default wavenet parameters;
+ params = {
+ "num_blocks": 12,
+ "hidden_channels":192,
+ "kernel_size": 5,
+ "dilation_rate": 1,
+ "num_layers": 4,
+ "dropout_p": 0.05
+ }
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of hidden channels for prenet and postnet.
+ params (dict): dictionary for residual convolutional blocks.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, c_in_channels, params):
+ super().__init__()
+ # prenet
+ self.prenet = torch.nn.Conv1d(in_channels, params["hidden_channels"], 1)
+ # wavenet layers
+ self.wn = WNBlocks(params["hidden_channels"], c_in_channels=c_in_channels, **params)
+ # postnet
+ self.postnet = [
+ torch.nn.Conv1d(params["hidden_channels"], hidden_channels, 1),
+ torch.nn.ReLU(),
+ torch.nn.Conv1d(hidden_channels, hidden_channels, 1),
+ torch.nn.ReLU(),
+ torch.nn.Conv1d(hidden_channels, hidden_channels, 1),
+ torch.nn.ReLU(),
+ torch.nn.Conv1d(hidden_channels, out_channels, 1),
+ ]
+ self.postnet = nn.Sequential(*self.postnet)
+
+ def forward(self, x, x_mask=None, g=None):
+ x = self.prenet(x) * x_mask
+ x = self.wn(x, x_mask, g)
+ o = self.postnet(x) * x_mask
+ return o
+
+
+class RelativePositionTransformerDecoder(nn.Module):
+ """Decoder with Relative Positional Transformer.
+
+ Note:
+ Default params
+ params={
+ 'hidden_channels_ffn': 128,
+ 'num_heads': 2,
+ "kernel_size": 3,
+ "dropout_p": 0.1,
+ "num_layers": 8,
+ "rel_attn_window_size": 4,
+ "input_length": None
+ }
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of hidden channels including Transformer layers.
+ params (dict): dictionary for residual convolutional blocks.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, params):
+ super().__init__()
+ self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
+ self.rel_pos_transformer = RelativePositionTransformer(in_channels, out_channels, hidden_channels, **params)
+
+ def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
+ o = self.prenet(x) * x_mask
+ o = self.rel_pos_transformer(o, x_mask)
+ return o
+
+
+class FFTransformerDecoder(nn.Module):
+ """Decoder with FeedForwardTransformer.
+
+ Default params
+ params={
+ 'hidden_channels_ffn': 1024,
+ 'num_heads': 2,
+ "dropout_p": 0.1,
+ "num_layers": 6,
+ }
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of hidden channels including Transformer layers.
+ params (dict): dictionary for residual convolutional blocks.
+ """
+
+ def __init__(self, in_channels, out_channels, params):
+ super().__init__()
+ self.transformer_block = FFTransformerBlock(in_channels, **params)
+ self.postnet = nn.Conv1d(in_channels, out_channels, 1)
+
+ def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
+ # TODO: handle multi-speaker
+ x_mask = 1 if x_mask is None else x_mask
+ o = self.transformer_block(x) * x_mask
+ o = self.postnet(o) * x_mask
+ return o
+
+
+class ResidualConv1dBNDecoder(nn.Module):
+ """Residual Convolutional Decoder as in the original Speedy Speech paper
+
+ TODO: Integrate speaker conditioning vector.
+
+ Note:
+ Default params
+ params = {
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4, 8] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 17
+ }
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of hidden channels including ResidualConv1dBNBlock layers.
+ params (dict): dictionary for residual convolutional blocks.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, params):
+ super().__init__()
+ self.res_conv_block = ResidualConv1dBNBlock(in_channels, hidden_channels, hidden_channels, **params)
+ self.post_conv = nn.Conv1d(hidden_channels, hidden_channels, 1)
+ self.postnet = nn.Sequential(
+ Conv1dBNBlock(
+ hidden_channels, hidden_channels, hidden_channels, params["kernel_size"], 1, num_conv_blocks=2
+ ),
+ nn.Conv1d(hidden_channels, out_channels, 1),
+ )
+
+ def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
+ o = self.res_conv_block(x, x_mask)
+ o = self.post_conv(o) + x
+ return self.postnet(o) * x_mask
+
+
+class Decoder(nn.Module):
+ """Decodes the expanded phoneme encoding into spectrograms
+ Args:
+ out_channels (int): number of output channels.
+ in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
+ decoder_type (str): decoder layer type. One of 'relative_position_transformer', 'residual_conv_bn', 'wavenet' or 'fftransformer'. Default 'residual_conv_bn'.
+ decoder_params (dict): model parameters for specified decoder type.
+ c_in_channels (int): number of channels for conditional input.
+
+ Shapes:
+ - input: (B, C, T)
+ """
+
+ # pylint: disable=dangerous-default-value
+ def __init__(
+ self,
+ out_channels,
+ in_hidden_channels,
+ decoder_type="residual_conv_bn",
+ decoder_params={
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4, 8] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 17,
+ },
+ c_in_channels=0,
+ ):
+ super().__init__()
+
+ if decoder_type.lower() == "relative_position_transformer":
+ self.decoder = RelativePositionTransformerDecoder(
+ in_channels=in_hidden_channels,
+ out_channels=out_channels,
+ hidden_channels=in_hidden_channels,
+ params=decoder_params,
+ )
+ elif decoder_type.lower() == "residual_conv_bn":
+ self.decoder = ResidualConv1dBNDecoder(
+ in_channels=in_hidden_channels,
+ out_channels=out_channels,
+ hidden_channels=in_hidden_channels,
+ params=decoder_params,
+ )
+ elif decoder_type.lower() == "wavenet":
+ self.decoder = WaveNetDecoder(
+ in_channels=in_hidden_channels,
+ out_channels=out_channels,
+ hidden_channels=in_hidden_channels,
+ c_in_channels=c_in_channels,
+ params=decoder_params,
+ )
+ elif decoder_type.lower() == "fftransformer":
+ self.decoder = FFTransformerDecoder(in_hidden_channels, out_channels, decoder_params)
+ else:
+ raise ValueError(f"[!] Unknown decoder type - {decoder_type}")
+
+ def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
+ """
+ Args:
+ x: [B, C, T]
+ x_mask: [B, 1, T]
+ g: [B, C_g, 1]
+ """
+ # TODO: implement multi-speaker
+ o = self.decoder(x, x_mask, g)
+ return o
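+
+
+if __name__ == "__main__":
+    # Minimal sketch of the decoder factory with the default 'residual_conv_bn' decoder.
+    # The channel sizes and sequence length below are illustrative assumptions.
+    decoder = Decoder(out_channels=80, in_hidden_channels=128)
+    x = torch.randn(2, 128, 37)  # [B, C, T] expanded phoneme encoding
+    x_mask = torch.ones(2, 1, 37)
+    o = decoder(x, x_mask)
+    print(o.shape)  # [B, out_channels, T] -> torch.Size([2, 80, 37])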
diff --git a/submodules/TTS/TTS/tts/layers/feed_forward/duration_predictor.py b/submodules/TTS/TTS/tts/layers/feed_forward/duration_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..4422648f4337e48aab39671836fcfb5e12ff4be7
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/feed_forward/duration_predictor.py
@@ -0,0 +1,41 @@
+from torch import nn
+
+from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
+
+
+class DurationPredictor(nn.Module):
+ """Speedy Speech duration predictor model.
+ Predicts phoneme durations from encoder outputs.
+
+ Note:
+ Outputs are interpreted as log(durations).
+ Apply an exp transformation to get the actual durations.
+
+ conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
+
+ Args:
+ hidden_channels (int): number of channels in the inner layers.
+ """
+
+ def __init__(self, hidden_channels):
+ super().__init__()
+
+ self.layers = nn.ModuleList(
+ [
+ Conv1dBN(hidden_channels, hidden_channels, 4, 1),
+ Conv1dBN(hidden_channels, hidden_channels, 3, 1),
+ Conv1dBN(hidden_channels, hidden_channels, 1, 1),
+ nn.Conv1d(hidden_channels, 1, 1),
+ ]
+ )
+
+ def forward(self, x, x_mask):
+ """
+ Shapes:
+ x: [B, C, T]
+ x_mask: [B, 1, T]
+ """
+ o = x
+ for layer in self.layers:
+ o = layer(o) * x_mask
+ return o
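+
+
+if __name__ == "__main__":
+    # Minimal sketch with assumed sizes: the predictor outputs log-durations, so the
+    # actual frame counts are recovered with an exp (and typically rounding) transform.
+    import torch
+
+    dp = DurationPredictor(hidden_channels=128)
+    x = torch.randn(2, 128, 37)  # [B, C, T] encoder outputs
+    x_mask = torch.ones(2, 1, 37)
+    o_log_dr = dp(x, x_mask)  # [B, 1, T] log-durations
+    durations = torch.round(torch.exp(o_log_dr)) * x_mask
+    print(durations.shape)  # torch.Size([2, 1, 37])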
diff --git a/submodules/TTS/TTS/tts/layers/feed_forward/encoder.py b/submodules/TTS/TTS/tts/layers/feed_forward/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..caf939ffc73fedac299228e090b2df3bb4cc553c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/feed_forward/encoder.py
@@ -0,0 +1,162 @@
+from torch import nn
+
+from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
+from TTS.tts.layers.generic.transformer import FFTransformerBlock
+from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
+
+
+class RelativePositionTransformerEncoder(nn.Module):
+ """Speedy speech encoder built on Transformer with Relative Position encoding.
+
+ TODO: Integrate speaker conditioning vector.
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of hidden channels
+ params (dict): dictionary for residual convolutional blocks.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, params):
+ super().__init__()
+ self.prenet = ResidualConv1dBNBlock(
+ in_channels,
+ hidden_channels,
+ hidden_channels,
+ kernel_size=5,
+ num_res_blocks=3,
+ num_conv_blocks=1,
+ dilations=[1, 1, 1],
+ )
+ self.rel_pos_transformer = RelativePositionTransformer(hidden_channels, out_channels, hidden_channels, **params)
+
+ def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
+ if x_mask is None:
+ x_mask = 1
+ o = self.prenet(x) * x_mask
+ o = self.rel_pos_transformer(o, x_mask)
+ return o
+
+
+class ResidualConv1dBNEncoder(nn.Module):
+ """Residual Convolutional Encoder as in the original Speedy Speech paper
+
+ TODO: Integrate speaker conditioning vector.
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of hidden channels
+ params (dict): dictionary for residual convolutional blocks.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, params):
+ super().__init__()
+ self.prenet = nn.Sequential(nn.Conv1d(in_channels, hidden_channels, 1), nn.ReLU())
+ self.res_conv_block = ResidualConv1dBNBlock(hidden_channels, hidden_channels, hidden_channels, **params)
+
+ self.postnet = nn.Sequential(
+ *[
+ nn.Conv1d(hidden_channels, hidden_channels, 1),
+ nn.ReLU(),
+ nn.BatchNorm1d(hidden_channels),
+ nn.Conv1d(hidden_channels, out_channels, 1),
+ ]
+ )
+
+ def forward(self, x, x_mask=None, g=None): # pylint: disable=unused-argument
+ if x_mask is None:
+ x_mask = 1
+ o = self.prenet(x) * x_mask
+ o = self.res_conv_block(o, x_mask)
+ o = self.postnet(o + x) * x_mask
+ return o * x_mask
+
+
+class Encoder(nn.Module):
+ # pylint: disable=dangerous-default-value
+ """Factory class for Speedy Speech encoder enables different encoder types internally.
+
+ Args:
+ num_chars (int): number of characters.
+ out_channels (int): number of output channels.
+ in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers.
+ encoder_type (str): encoder layer type. One of 'relative_position_transformer', 'residual_conv_bn' or 'fftransformer'. Default 'residual_conv_bn'.
+ encoder_params (dict): model parameters for specified encoder type.
+ c_in_channels (int): number of channels for conditional input.
+
+ Note:
+ Default encoder_params to be set in config.json...
+
+ ```python
+ # for 'relative_position_transformer'
+ encoder_params={
+ 'hidden_channels_ffn': 128,
+ 'num_heads': 2,
+ "kernel_size": 3,
+ "dropout_p": 0.1,
+ "num_layers": 6,
+ "rel_attn_window_size": 4,
+ "input_length": None
+ },
+
+ # for 'residual_conv_bn'
+ encoder_params = {
+ "kernel_size": 4,
+ "dilations": 4 * [1, 2, 4] + [1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 13
+ }
+
+ # for 'fftransformer'
+ encoder_params = {
+ "hidden_channels_ffn": 1024 ,
+ "num_heads": 2,
+ "num_layers": 6,
+ "dropout_p": 0.1
+ }
+ ```
+ """
+
+ def __init__(
+ self,
+ in_hidden_channels,
+ out_channels,
+ encoder_type="residual_conv_bn",
+ encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13},
+ c_in_channels=0,
+ ):
+ super().__init__()
+ self.out_channels = out_channels
+ self.in_channels = in_hidden_channels
+ self.hidden_channels = in_hidden_channels
+ self.encoder_type = encoder_type
+ self.c_in_channels = c_in_channels
+
+ # init encoder
+ if encoder_type.lower() == "relative_position_transformer":
+ # text encoder
+ # pylint: disable=unexpected-keyword-arg
+ self.encoder = RelativePositionTransformerEncoder(
+ in_hidden_channels, out_channels, in_hidden_channels, encoder_params
+ )
+ elif encoder_type.lower() == "residual_conv_bn":
+ self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params)
+ elif encoder_type.lower() == "fftransformer":
+ assert (
+ in_hidden_channels == out_channels
+ ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'"
+ # pylint: disable=unexpected-keyword-arg
+ self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params)
+ else:
+ raise NotImplementedError(" [!] unknown encoder type.")
+
+ def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
+ """
+ Shapes:
+ x: [B, C, T]
+ x_mask: [B, 1, T]
+ g: [B, C, 1]
+ """
+ o = self.encoder(x, x_mask)
+ return o * x_mask
diff --git a/submodules/TTS/TTS/tts/layers/generic/__init__.py b/submodules/TTS/TTS/tts/layers/generic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/generic/aligner.py b/submodules/TTS/TTS/tts/layers/generic/aligner.py
new file mode 100644
index 0000000000000000000000000000000000000000..baa6f0e9c4879207695b2de1193c9147b5a3fa4b
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/aligner.py
@@ -0,0 +1,92 @@
+from typing import Tuple
+
+import torch
+from torch import nn
+
+
+class AlignmentNetwork(torch.nn.Module):
+ """Aligner Network for learning alignment between the input text and the model output with Gaussian Attention.
+
+ ::
+
+ query -> conv1d -> relu -> conv1d -> relu -> conv1d -> L2_dist -> softmax -> alignment
+ key -> conv1d -> relu -> conv1d -----------------------^
+
+ Args:
+ in_query_channels (int): Number of channels in the query network. Defaults to 80.
+ in_key_channels (int): Number of channels in the key network. Defaults to 512.
+ attn_channels (int): Number of inner channels in the attention layers. Defaults to 80.
+ temperature (float): Temperature for the softmax. Defaults to 0.0005.
+ """
+
+ def __init__(
+ self,
+ in_query_channels=80,
+ in_key_channels=512,
+ attn_channels=80,
+ temperature=0.0005,
+ ):
+ super().__init__()
+ self.temperature = temperature
+ self.softmax = torch.nn.Softmax(dim=3)
+ self.log_softmax = torch.nn.LogSoftmax(dim=3)
+
+ self.key_layer = nn.Sequential(
+ nn.Conv1d(
+ in_key_channels,
+ in_key_channels * 2,
+ kernel_size=3,
+ padding=1,
+ bias=True,
+ ),
+ torch.nn.ReLU(),
+ nn.Conv1d(in_key_channels * 2, attn_channels, kernel_size=1, padding=0, bias=True),
+ )
+
+ self.query_layer = nn.Sequential(
+ nn.Conv1d(
+ in_query_channels,
+ in_query_channels * 2,
+ kernel_size=3,
+ padding=1,
+ bias=True,
+ ),
+ torch.nn.ReLU(),
+ nn.Conv1d(in_query_channels * 2, in_query_channels, kernel_size=1, padding=0, bias=True),
+ torch.nn.ReLU(),
+ nn.Conv1d(in_query_channels, attn_channels, kernel_size=1, padding=0, bias=True),
+ )
+
+ self.init_layers()
+
+ def init_layers(self):
+ torch.nn.init.xavier_uniform_(self.key_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
+ torch.nn.init.xavier_uniform_(self.key_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
+ torch.nn.init.xavier_uniform_(self.query_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
+ torch.nn.init.xavier_uniform_(self.query_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
+ torch.nn.init.xavier_uniform_(self.query_layer[4].weight, gain=torch.nn.init.calculate_gain("linear"))
+
+ def forward(
+ self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None
+ ) -> Tuple[torch.tensor, torch.tensor]:
+ """Forward pass of the aligner encoder.
+ Shapes:
+ - queries: :math:`[B, C, T_de]`
+ - keys: :math:`[B, C_emb, T_en]`
+ - mask: :math:`[B, T_de]`
+ Output:
+ attn (torch.tensor): :math:`[B, 1, T_en, T_de]` soft attention mask.
+ attn_logp (torch.tensor): :math:`[B, 1, T_en, T_de]` log probabilities.
+ """
+ key_out = self.key_layer(keys)
+ query_out = self.query_layer(queries)
+ attn_factor = (query_out[:, :, :, None] - key_out[:, :, None]) ** 2
+ attn_logp = -self.temperature * attn_factor.sum(1, keepdim=True)
+ if attn_prior is not None:
+ attn_logp = self.log_softmax(attn_logp) + torch.log(attn_prior[:, None] + 1e-8)
+
+ if mask is not None:
+ attn_logp.data.masked_fill_(~mask.bool().unsqueeze(2), -float("inf"))
+
+ attn = self.softmax(attn_logp)
+ return attn, attn_logp
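+
+
+if __name__ == "__main__":
+    # Minimal sketch: the aligner scores every (decoder frame, text position) pair with a
+    # temperature-scaled negative L2 distance and normalizes over the text axis. Channel
+    # sizes are the constructor defaults; the sequence lengths are illustrative assumptions.
+    aligner = AlignmentNetwork()
+    queries = torch.randn(2, 80, 120)  # decoder-side features, e.g. mels: [B, C, T_de]
+    keys = torch.randn(2, 512, 30)  # text embeddings: [B, C_emb, T_en]
+    attn, attn_logp = aligner(queries, keys)
+    print(attn.shape, attn_logp.shape)  # both: torch.Size([2, 1, 120, 30])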
diff --git a/submodules/TTS/TTS/tts/layers/generic/gated_conv.py b/submodules/TTS/TTS/tts/layers/generic/gated_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a29c4499f970db538a4b99c3c05cba22576195f
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/gated_conv.py
@@ -0,0 +1,37 @@
+from torch import nn
+
+from .normalization import LayerNorm
+
+
+class GatedConvBlock(nn.Module):
+ """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf
+ Args:
+ in_out_channels (int): number of input/output channels.
+ kernel_size (int): convolution kernel size.
+ dropout_p (float): dropout rate.
+ """
+
+ def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers):
+ super().__init__()
+ # class arguments
+ self.dropout_p = dropout_p
+ self.num_layers = num_layers
+ # define layers
+ self.conv_layers = nn.ModuleList()
+ self.norm_layers = nn.ModuleList()
+ self.layers = nn.ModuleList()
+ for _ in range(num_layers):
+ self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)]
+ self.norm_layers += [LayerNorm(2 * in_out_channels)]
+
+ def forward(self, x, x_mask):
+ o = x
+ res = x
+ for idx in range(self.num_layers):
+ o = nn.functional.dropout(o, p=self.dropout_p, training=self.training)
+ o = self.conv_layers[idx](o * x_mask)
+ o = self.norm_layers[idx](o)
+ o = nn.functional.glu(o, dim=1)
+ o = res + o
+ res = o
+ return o
diff --git a/submodules/TTS/TTS/tts/layers/generic/normalization.py b/submodules/TTS/TTS/tts/layers/generic/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0270e405e4246e47b7bc0787e4cd4b069533f92
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/normalization.py
@@ -0,0 +1,123 @@
+import torch
+from torch import nn
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-4):
+ """Layer norm for the 2nd dimension of the input.
+ Args:
+ channels (int): number of channels (2nd dimension) of the input.
+ eps (float): to prevent 0 division
+
+ Shapes:
+ - input: (B, C, T)
+ - output: (B, C, T)
+ """
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(1, channels, 1) * 0.1)
+ self.beta = nn.Parameter(torch.zeros(1, channels, 1))
+
+ def forward(self, x):
+ mean = torch.mean(x, 1, keepdim=True)
+ variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+ x = (x - mean) * torch.rsqrt(variance + self.eps)
+ x = x * self.gamma + self.beta
+ return x
+
+
+class LayerNorm2(nn.Module):
+ """Layer norm for the 2nd dimension of the input using torch primitive.
+ Args:
+ channels (int): number of channels (2nd dimension) of the input.
+ eps (float): to prevent 0 division
+
+ Shapes:
+ - input: (B, C, T)
+ - output: (B, C, T)
+ """
+
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = torch.nn.functional.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+class TemporalBatchNorm1d(nn.BatchNorm1d):
+ """Normalize each channel separately over time and batch."""
+
+ def __init__(self, channels, affine=True, track_running_stats=True, momentum=0.1):
+ super().__init__(channels, affine=affine, track_running_stats=track_running_stats, momentum=momentum)
+
+ def forward(self, x):
+ return super().forward(x.transpose(2, 1)).transpose(2, 1)
+
+
+class ActNorm(nn.Module):
+ """Activation Normalization bijector as an alternative to Batch Norm. It computes
+ mean and std from a sample data in advance and it uses these values
+ for normalization at training.
+
+ Args:
+ channels (int): input channels.
+ ddi (bool): data-dependent initialization flag. Defaults to False.
+
+ Shapes:
+ - inputs: (B, C, T)
+ - outputs: (B, C, T)
+ """
+
+ def __init__(self, channels, ddi=False, **kwargs): # pylint: disable=unused-argument
+ super().__init__()
+ self.channels = channels
+ self.initialized = not ddi
+
+ self.logs = nn.Parameter(torch.zeros(1, channels, 1))
+ self.bias = nn.Parameter(torch.zeros(1, channels, 1))
+
+ def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument
+ if x_mask is None:
+ x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype)
+ x_len = torch.sum(x_mask, [1, 2])
+ if not self.initialized:
+ self.initialize(x, x_mask)
+ self.initialized = True
+
+ if reverse:
+ z = (x - self.bias) * torch.exp(-self.logs) * x_mask
+ logdet = None
+ else:
+ z = (self.bias + torch.exp(self.logs) * x) * x_mask
+ logdet = torch.sum(self.logs) * x_len # [b]
+
+ return z, logdet
+
+ def store_inverse(self):
+ pass
+
+ def set_ddi(self, ddi):
+ self.initialized = not ddi
+
+ def initialize(self, x, x_mask):
+ with torch.no_grad():
+ denom = torch.sum(x_mask, [0, 2])
+ m = torch.sum(x * x_mask, [0, 2]) / denom
+ m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom
+ v = m_sq - (m**2)
+ logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6))
+
+ bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype)
+ logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype)
+
+ self.bias.data.copy_(bias_init)
+ self.logs.data.copy_(logs_init)
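+
+
+if __name__ == "__main__":
+    # Minimal sketch with assumed sizes: with data-dependent initialization, ActNorm fits
+    # a per-channel affine transform to the first batch, and the forward pass followed by
+    # the reverse pass recovers the input (the transform is invertible).
+    actnorm = ActNorm(channels=16, ddi=True)
+    x = torch.randn(4, 16, 50) * 3.0 + 1.5
+    z, logdet = actnorm(x)  # first call initializes bias/logs from this batch
+    x_rec, _ = actnorm(z, reverse=True)
+    print(torch.allclose(x, x_rec, atol=1e-4), logdet.shape)  # True torch.Size([4])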
diff --git a/submodules/TTS/TTS/tts/layers/generic/pos_encoding.py b/submodules/TTS/TTS/tts/layers/generic/pos_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..913add0d14332bf70c3ecd2a95869d0071310bd4
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/pos_encoding.py
@@ -0,0 +1,69 @@
+import math
+
+import torch
+from torch import nn
+
+
+class PositionalEncoding(nn.Module):
+ """Sinusoidal positional encoding for non-recurrent neural networks.
+ Implementation based on "Attention Is All You Need"
+
+ Args:
+ channels (int): embedding size
+ dropout_p (float): dropout rate applied to the output.
+ max_len (int): maximum sequence length.
+ use_scale (bool): whether to use a learnable scaling coefficient.
+ """
+
+ def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False):
+ super().__init__()
+ if channels % 2 != 0:
+ raise ValueError(
+ "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels)
+ )
+ self.use_scale = use_scale
+ if use_scale:
+ self.scale = torch.nn.Parameter(torch.ones(1))
+ pe = torch.zeros(max_len, channels)
+ position = torch.arange(0, max_len).unsqueeze(1)
+ div_term = torch.pow(10000, torch.arange(0, channels, 2).float() / channels)
+ pe[:, 0::2] = torch.sin(position.float() * div_term)
+ pe[:, 1::2] = torch.cos(position.float() * div_term)
+ pe = pe.unsqueeze(0).transpose(1, 2)
+ self.register_buffer("pe", pe)
+ if dropout_p > 0:
+ self.dropout = nn.Dropout(p=dropout_p)
+ self.channels = channels
+
+ def forward(self, x, mask=None, first_idx=None, last_idx=None):
+ """
+ Shapes:
+ x: [B, C, T]
+ mask: [B, 1, T]
+ first_idx: int
+ last_idx: int
+ """
+
+ x = x * math.sqrt(self.channels)
+ if first_idx is None:
+ if self.pe.size(2) < x.size(2):
+ raise RuntimeError(
+ f"Sequence is {x.size(2)} but PositionalEncoding is"
+ f" limited to {self.pe.size(2)}. See max_len argument."
+ )
+ if mask is not None:
+ pos_enc = self.pe[:, :, : x.size(2)] * mask
+ else:
+ pos_enc = self.pe[:, :, : x.size(2)]
+ if self.use_scale:
+ x = x + self.scale * pos_enc
+ else:
+ x = x + pos_enc
+ else:
+ if self.use_scale:
+ x = x + self.scale * self.pe[:, :, first_idx:last_idx]
+ else:
+ x = x + self.pe[:, :, first_idx:last_idx]
+ if hasattr(self, "dropout"):
+ x = self.dropout(x)
+ return x
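+
+
+if __name__ == "__main__":
+    # Minimal sketch with assumed sizes: the encoding table is precomputed up to `max_len`
+    # and added to a [B, C, T] input, optionally masked and scaled.
+    pe = PositionalEncoding(channels=128, dropout_p=0.0, max_len=1000)
+    x = torch.zeros(2, 128, 60)
+    mask = torch.ones(2, 1, 60)
+    y = pe(x, mask=mask)
+    print(y.shape)  # torch.Size([2, 128, 60])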
diff --git a/submodules/TTS/TTS/tts/layers/generic/res_conv_bn.py b/submodules/TTS/TTS/tts/layers/generic/res_conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4beda291aa15398024b5b16cd6bf12b88898a0a9
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/res_conv_bn.py
@@ -0,0 +1,127 @@
+from torch import nn
+
+
+class ZeroTemporalPad(nn.Module):
+ """Pad sequences to equal lentgh in the temporal dimension"""
+
+ def __init__(self, kernel_size, dilation):
+ super().__init__()
+ total_pad = dilation * (kernel_size - 1)
+ begin = total_pad // 2
+ end = total_pad - begin
+ self.pad_layer = nn.ZeroPad2d((0, 0, begin, end))
+
+ def forward(self, x):
+ return self.pad_layer(x)
+
+
+class Conv1dBN(nn.Module):
+ """1d convolutional with batch norm.
+ conv1d -> relu -> BN blocks.
+
+ Note:
+ Batch normalization is applied after ReLU, following the original implementation.
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ kernel_size (int): kernel size for convolutional filters.
+ dilation (int): dilation for convolution layers.
+ """
+
+ def __init__(self, in_channels, out_channels, kernel_size, dilation):
+ super().__init__()
+ padding = dilation * (kernel_size - 1)
+ pad_s = padding // 2
+ pad_e = padding - pad_s
+ self.conv1d = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation)
+ self.pad = nn.ZeroPad2d((pad_s, pad_e, 0, 0)) # uneven left and right padding
+ self.norm = nn.BatchNorm1d(out_channels)
+
+ def forward(self, x):
+ o = self.conv1d(x)
+ o = self.pad(o)
+ o = nn.functional.relu(o)
+ o = self.norm(o)
+ return o
+
+
+class Conv1dBNBlock(nn.Module):
+ """1d convolutional block with batch norm. It is a set of conv1d -> relu -> BN blocks.
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of inner convolution channels.
+ kernel_size (int): kernel size for convolutional filters.
+ dilation (int): dilation for convolution layers.
+ num_conv_blocks (int, optional): number of convolutional blocks. Defaults to 2.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation, num_conv_blocks=2):
+ super().__init__()
+ self.conv_bn_blocks = []
+ for idx in range(num_conv_blocks):
+ layer = Conv1dBN(
+ in_channels if idx == 0 else hidden_channels,
+ out_channels if idx == (num_conv_blocks - 1) else hidden_channels,
+ kernel_size,
+ dilation,
+ )
+ self.conv_bn_blocks.append(layer)
+ self.conv_bn_blocks = nn.Sequential(*self.conv_bn_blocks)
+
+ def forward(self, x):
+ """
+ Shapes:
+ x: (B, D, T)
+ """
+ return self.conv_bn_blocks(x)
+
+
+class ResidualConv1dBNBlock(nn.Module):
+ """Residual Convolutional Blocks with BN
+ Each block has 'num_conv_blocks' conv layers and 'num_res_blocks' such blocks are connected
+ with residual connections.
+
+ conv_block = (conv1d -> relu -> bn) x 'num_conv_blocks'
+ residual_conv_block = (x -> conv_block -> + ->) x 'num_res_blocks'
+ ' - - - - - - - - - ^
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ hidden_channels (int): number of inner convolution channels.
+ kernel_size (int): kernel size for convolutional filters.
+ dilations (list): dilations for each convolution layer.
+ num_res_blocks (int, optional): number of residual blocks. Defaults to 13.
+ num_conv_blocks (int, optional): number of convolutional blocks in each residual block. Defaults to 2.
+ """
+
+ def __init__(
+ self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2
+ ):
+ super().__init__()
+ assert len(dilations) == num_res_blocks
+ self.res_blocks = nn.ModuleList()
+ for idx, dilation in enumerate(dilations):
+ block = Conv1dBNBlock(
+ in_channels if idx == 0 else hidden_channels,
+ out_channels if (idx + 1) == len(dilations) else hidden_channels,
+ hidden_channels,
+ kernel_size,
+ dilation,
+ num_conv_blocks,
+ )
+ self.res_blocks.append(block)
+
+ def forward(self, x, x_mask=None):
+ if x_mask is None:
+ x_mask = 1.0
+ o = x * x_mask
+ for block in self.res_blocks:
+ res = o
+ o = block(o)
+ o = o + res
+ if x_mask is not None:
+ o = o * x_mask
+ return o
diff --git a/submodules/TTS/TTS/tts/layers/generic/time_depth_sep_conv.py b/submodules/TTS/TTS/tts/layers/generic/time_depth_sep_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..186cea02e75e156c40923de91086c369a9ea02ee
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/time_depth_sep_conv.py
@@ -0,0 +1,84 @@
+import torch
+from torch import nn
+
+
+class TimeDepthSeparableConv(nn.Module):
+ """Time depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf
+ It shows competitive results with less computation and a smaller memory footprint."""
+
+ def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hid_channels = hid_channels
+ self.kernel_size = kernel_size
+
+ self.time_conv = nn.Conv1d(
+ in_channels,
+ 2 * hid_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=bias,
+ )
+ self.norm1 = nn.BatchNorm1d(2 * hid_channels)
+ self.depth_conv = nn.Conv1d(
+ hid_channels,
+ hid_channels,
+ kernel_size,
+ stride=1,
+ padding=(kernel_size - 1) // 2,
+ groups=hid_channels,
+ bias=bias,
+ )
+ self.norm2 = nn.BatchNorm1d(hid_channels)
+ self.time_conv2 = nn.Conv1d(
+ hid_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=bias,
+ )
+ self.norm3 = nn.BatchNorm1d(out_channels)
+
+ def forward(self, x):
+ x_res = x
+ x = self.time_conv(x)
+ x = self.norm1(x)
+ x = nn.functional.glu(x, dim=1)
+ x = self.depth_conv(x)
+ x = self.norm2(x)
+ x = x * torch.sigmoid(x)
+ x = self.time_conv2(x)
+ x = self.norm3(x)
+ x = x_res + x
+ return x
+
+
+class TimeDepthSeparableConvBlock(nn.Module):
+ def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
+ super().__init__()
+ assert (kernel_size - 1) % 2 == 0
+ assert num_layers > 1
+
+ self.layers = nn.ModuleList()
+ layer = TimeDepthSeparableConv(
+ in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias
+ )
+ self.layers.append(layer)
+ for idx in range(num_layers - 1):
+ layer = TimeDepthSeparableConv(
+ hid_channels,
+ hid_channels,
+ out_channels if (idx + 1) == (num_layers - 1) else hid_channels,
+ kernel_size,
+ bias,
+ )
+ self.layers.append(layer)
+
+ def forward(self, x, mask):
+ for layer in self.layers:
+ x = layer(x * mask)
+ return x
diff --git a/submodules/TTS/TTS/tts/layers/generic/transformer.py b/submodules/TTS/TTS/tts/layers/generic/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b7ecee2bacb68cd330e18630531c97bc6f2e6a3
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/transformer.py
@@ -0,0 +1,89 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class FFTransformer(nn.Module):
+ def __init__(self, in_out_channels, num_heads, hidden_channels_ffn=1024, kernel_size_fft=3, dropout_p=0.1):
+ super().__init__()
+ self.self_attn = nn.MultiheadAttention(in_out_channels, num_heads, dropout=dropout_p)
+
+ padding = (kernel_size_fft - 1) // 2
+ self.conv1 = nn.Conv1d(in_out_channels, hidden_channels_ffn, kernel_size=kernel_size_fft, padding=padding)
+ self.conv2 = nn.Conv1d(hidden_channels_ffn, in_out_channels, kernel_size=kernel_size_fft, padding=padding)
+
+ self.norm1 = nn.LayerNorm(in_out_channels)
+ self.norm2 = nn.LayerNorm(in_out_channels)
+
+ self.dropout1 = nn.Dropout(dropout_p)
+ self.dropout2 = nn.Dropout(dropout_p)
+
+ def forward(self, src, src_mask=None, src_key_padding_mask=None):
+ """😦 ugly looking with all the transposing"""
+ src = src.permute(2, 0, 1)
+ src2, enc_align = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
+ src = src + self.dropout1(src2)
+ src = self.norm1(src + src2)
+ # T x B x D -> B x D x T
+ src = src.permute(1, 2, 0)
+ src2 = self.conv2(F.relu(self.conv1(src)))
+ src2 = self.dropout2(src2)
+ src = src + src2
+ src = src.transpose(1, 2)
+ src = self.norm2(src)
+ src = src.transpose(1, 2)
+ return src, enc_align
+
+
+class FFTransformerBlock(nn.Module):
+ def __init__(self, in_out_channels, num_heads, hidden_channels_ffn, num_layers, dropout_p):
+ super().__init__()
+ self.fft_layers = nn.ModuleList(
+ [
+ FFTransformer(
+ in_out_channels=in_out_channels,
+ num_heads=num_heads,
+ hidden_channels_ffn=hidden_channels_ffn,
+ dropout_p=dropout_p,
+ )
+ for _ in range(num_layers)
+ ]
+ )
+
+ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
+ """
+ TODO: handle multi-speaker
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - mask: :math:`[B, 1, T] or [B, T]`
+ """
+ if mask is not None and mask.ndim == 3:
+ mask = mask.squeeze(1)
+ # invert the mask: nn.MultiheadAttention expects True at padded positions.
+ mask = ~mask.bool()
+ alignments = []
+ for layer in self.fft_layers:
+ x, align = layer(x, src_key_padding_mask=mask)
+ alignments.append(align.unsqueeze(1))
+ alignments = torch.cat(alignments, 1)
+ return x
+
+
+class FFTDurationPredictor(nn.Module):
+ def __init__(
+ self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None
+ ): # pylint: disable=unused-argument
+ super().__init__()
+ self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p)
+ self.proj = nn.Linear(in_channels, 1)
+
+ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - mask: :math:`[B, 1, T]`
+
+ TODO: Handle the cond input
+ """
+ x = self.fft(x, mask=mask)
+ x = self.proj(x)
+ return x
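A hedged usage sketch for FFTransformerBlock (dimensions are illustrative; `in_out_channels` must be divisible by `num_heads` for nn.MultiheadAttention). The mask uses 1 for valid frames; the block inverts it internally to build the key-padding mask.

import torch
from TTS.tts.layers.generic.transformer import FFTransformerBlock

block = FFTransformerBlock(
    in_out_channels=128, num_heads=2, hidden_channels_ffn=256, num_layers=2, dropout_p=0.1
)
x = torch.randn(4, 128, 60)       # [B, C, T]
mask = torch.ones(4, 1, 60)       # [B, 1, T], 1 = valid frame
y = block(x, mask)                # -> [4, 128, 60]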
diff --git a/submodules/TTS/TTS/tts/layers/generic/wavenet.py b/submodules/TTS/TTS/tts/layers/generic/wavenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8de63b49fea8a7140ddd0493446f0541abe6a0a
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/generic/wavenet.py
@@ -0,0 +1,176 @@
+import torch
+from torch import nn
+from torch.nn.utils import parametrize
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+class WN(torch.nn.Module):
+ """Wavenet layers with weight norm and no input conditioning.
+
+ |-----------------------------------------------------------------------------|
+ | |-> tanh -| |
+ res -|- conv1d(dilation) -> dropout -> + -| * -> conv1d1x1 -> split -|- + -> res
+ g -------------------------------------| |-> sigmoid -| |
+ o --------------------------------------------------------------------------- + --------- o
+
+ Args:
+ in_channels (int): number of input channels.
+ hidden_channels (int): number of hidden channels.
+ kernel_size (int): filter kernel size for the first conv layer.
+ dilation_rate (int): dilation rate to increase dilation per layer.
+ If it is 2, dilations are 1, 2, 4, 8 for the next 4 layers.
+ num_layers (int): number of wavenet layers.
+ c_in_channels (int): number of channels of conditioning input.
+ dropout_p (float): dropout rate.
+ weight_norm (bool): enable/disable weight norm for convolution layers.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_layers,
+ c_in_channels=0,
+ dropout_p=0,
+ weight_norm=True,
+ ):
+ super().__init__()
+ assert kernel_size % 2 == 1
+ assert hidden_channels % 2 == 0
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.num_layers = num_layers
+ self.c_in_channels = c_in_channels
+ self.dropout_p = dropout_p
+
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.dropout = nn.Dropout(dropout_p)
+
+ # init conditioning layer
+ if c_in_channels > 0:
+ cond_layer = torch.nn.Conv1d(c_in_channels, 2 * hidden_channels * num_layers, 1)
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight")
+ # intermediate layers
+ for i in range(num_layers):
+ dilation = dilation_rate**i
+ padding = int((kernel_size * dilation - dilation) / 2)
+ if i == 0:
+ in_layer = torch.nn.Conv1d(
+ in_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding
+ )
+ else:
+ in_layer = torch.nn.Conv1d(
+ hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding
+ )
+ in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
+ self.in_layers.append(in_layer)
+
+ if i < num_layers - 1:
+ res_skip_channels = 2 * hidden_channels
+ else:
+ res_skip_channels = hidden_channels
+
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
+ self.res_skip_layers.append(res_skip_layer)
+ # setup weight norm
+ if not weight_norm:
+ self.remove_weight_norm()
+
+ def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-argument
+ output = torch.zeros_like(x)
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
+ x_mask = 1.0 if x_mask is None else x_mask
+ if g is not None:
+ g = self.cond_layer(g)
+ for i in range(self.num_layers):
+ x_in = self.in_layers[i](x)
+ x_in = self.dropout(x_in)
+ if g is not None:
+ cond_offset = i * 2 * self.hidden_channels
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+ else:
+ g_l = torch.zeros_like(x_in)
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.num_layers - 1:
+ x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
+ else:
+ output = output + res_skip_acts
+ return output * x_mask
+
+ def remove_weight_norm(self):
+ if self.c_in_channels != 0:
+ parametrize.remove_parametrizations(self.cond_layer, "weight")
+ for l in self.in_layers:
+ parametrize.remove_parametrizations(l, "weight")
+ for l in self.res_skip_layers:
+ parametrize.remove_parametrizations(l, "weight")
+
+
+class WNBlocks(nn.Module):
+ """Wavenet blocks.
+
+ Note: the dilation resets to 1 at the start of each block and then increases
+ within the block according to the dilation rate.
+
+ Args:
+ in_channels (int): number of input channels.
+ hidden_channels (int): number of hidden channels.
+ kernel_size (int): filter kernel size for the first conv layer.
+ dilation_rate (int): dilation rate to increase dilation per layer.
+ If it is 2, dilations are 1, 2, 4, 8 for the next 4 layers.
+ num_blocks (int): number of wavenet blocks.
+ num_layers (int): number of wavenet layers.
+ c_in_channels (int): number of channels of conditioning input.
+ dropout_p (float): dropout rate.
+ weight_norm (bool): enable/disable weight norm for convolution layers.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_blocks,
+ num_layers,
+ c_in_channels=0,
+ dropout_p=0,
+ weight_norm=True,
+ ):
+ super().__init__()
+ self.wn_blocks = nn.ModuleList()
+ for idx in range(num_blocks):
+ layer = WN(
+ in_channels=in_channels if idx == 0 else hidden_channels,
+ hidden_channels=hidden_channels,
+ kernel_size=kernel_size,
+ dilation_rate=dilation_rate,
+ num_layers=num_layers,
+ c_in_channels=c_in_channels,
+ dropout_p=dropout_p,
+ weight_norm=weight_norm,
+ )
+ self.wn_blocks.append(layer)
+
+ def forward(self, x, x_mask=None, g=None):
+ o = x
+ for layer in self.wn_blocks:
+ o = layer(o, x_mask, g)
+ return o
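A usage sketch for the WN module with a global conditioning vector, as it is used by the Glow-TTS coupling block later in this diff (channel counts are illustrative). The in/hidden channel counts are kept equal because the residual additions inside `forward` require it, and `g` broadcasts over time when given as `[B, C, 1]`.

import torch
from TTS.tts.layers.generic.wavenet import WN

wn = WN(in_channels=192, hidden_channels=192, kernel_size=5, dilation_rate=2,
        num_layers=4, c_in_channels=256, dropout_p=0.05)
x = torch.randn(2, 192, 80)       # [B, C, T]
x_mask = torch.ones(2, 1, 80)     # [B, 1, T]
g = torch.randn(2, 256, 1)        # global conditioning, e.g. a speaker embedding
o = wn(x, x_mask, g=g)            # -> [2, 192, 80]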
diff --git a/submodules/TTS/TTS/tts/layers/glow_tts/__init__.py b/submodules/TTS/TTS/tts/layers/glow_tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/glow_tts/decoder.py b/submodules/TTS/TTS/tts/layers/glow_tts/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..61c5174ac5e67885288043885290c2906656c99c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/glow_tts/decoder.py
@@ -0,0 +1,141 @@
+import torch
+from torch import nn
+
+from TTS.tts.layers.generic.normalization import ActNorm
+from TTS.tts.layers.glow_tts.glow import CouplingBlock, InvConvNear
+
+
+def squeeze(x, x_mask=None, num_sqz=2):
+ """GlowTTS squeeze operation
+ Increase number of channels and reduce number of time steps
+ by the same factor.
+
+ Note:
+ each 's' is an n-dimensional vector.
+ ``[s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]``
+ """
+ b, c, t = x.size()
+
+ t = (t // num_sqz) * num_sqz
+ x = x[:, :, :t]
+ x_sqz = x.view(b, c, t // num_sqz, num_sqz)
+ x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * num_sqz, t // num_sqz)
+
+ if x_mask is not None:
+ x_mask = x_mask[:, :, num_sqz - 1 :: num_sqz]
+ else:
+ x_mask = torch.ones(b, 1, t // num_sqz).to(device=x.device, dtype=x.dtype)
+ return x_sqz * x_mask, x_mask
+
+
+def unsqueeze(x, x_mask=None, num_sqz=2):
+ """GlowTTS unsqueeze operation (revert the squeeze)
+
+ Note:
+ each 's' is an n-dimensional vector.
+ ``[[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5, s2, s4, s6]]``
+ """
+ b, c, t = x.size()
+
+ x_unsqz = x.view(b, num_sqz, c // num_sqz, t)
+ x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // num_sqz, t * num_sqz)
+
+ if x_mask is not None:
+ x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, num_sqz).view(b, 1, t * num_sqz)
+ else:
+ x_mask = torch.ones(b, 1, t * num_sqz).to(device=x.device, dtype=x.dtype)
+ return x_unsqz * x_mask, x_mask
+
+
+class Decoder(nn.Module):
+ """Stack of Glow Decoder Modules.
+
+ ::
+
+ Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
+
+ Args:
+ in_channels (int): channels of input tensor.
+ hidden_channels (int): hidden decoder channels.
+ kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
+ dilation_rate (int): rate to increase dilation by each layer in a decoder block.
+ num_flow_blocks (int): number of decoder blocks.
+ num_coupling_layers (int): number of coupling layers. (number of wavenet layers.)
+ dropout_p (float): wavenet dropout rate.
+ sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_flow_blocks,
+ num_coupling_layers,
+ dropout_p=0.0,
+ num_splits=4,
+ num_squeeze=2,
+ sigmoid_scale=False,
+ c_in_channels=0,
+ ):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.num_flow_blocks = num_flow_blocks
+ self.num_coupling_layers = num_coupling_layers
+ self.dropout_p = dropout_p
+ self.num_splits = num_splits
+ self.num_squeeze = num_squeeze
+ self.sigmoid_scale = sigmoid_scale
+ self.c_in_channels = c_in_channels
+
+ self.flows = nn.ModuleList()
+ for _ in range(num_flow_blocks):
+ self.flows.append(ActNorm(channels=in_channels * num_squeeze))
+ self.flows.append(InvConvNear(channels=in_channels * num_squeeze, num_splits=num_splits))
+ self.flows.append(
+ CouplingBlock(
+ in_channels * num_squeeze,
+ hidden_channels,
+ kernel_size=kernel_size,
+ dilation_rate=dilation_rate,
+ num_layers=num_coupling_layers,
+ c_in_channels=c_in_channels,
+ dropout_p=dropout_p,
+ sigmoid_scale=sigmoid_scale,
+ )
+ )
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1 ,T]`
+ - g: :math:`[B, C]`
+ """
+ if not reverse:
+ flows = self.flows
+ logdet_tot = 0
+ else:
+ flows = reversed(self.flows)
+ logdet_tot = None
+
+ if self.num_squeeze > 1:
+ x, x_mask = squeeze(x, x_mask, self.num_squeeze)
+ for f in flows:
+ if not reverse:
+ x, logdet = f(x, x_mask, g=g, reverse=reverse)
+ logdet_tot += logdet
+ else:
+ x, logdet = f(x, x_mask, g=g, reverse=reverse)
+ if self.num_squeeze > 1:
+ x, x_mask = unsqueeze(x, x_mask, self.num_squeeze)
+ return x, logdet_tot
+
+ def store_inverse(self):
+ for f in self.flows:
+ f.store_inverse()
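A quick sanity-check sketch for the squeeze/unsqueeze pair above: for a time length divisible by `num_sqz`, `unsqueeze` reverts `squeeze` (shapes and values here are arbitrary).

import torch
from TTS.tts.layers.glow_tts.decoder import squeeze, unsqueeze

x = torch.randn(1, 80, 6)                  # [B, C, T], T divisible by num_sqz
x_sqz, mask_sqz = squeeze(x, num_sqz=2)    # -> [1, 160, 3]
x_rec, _ = unsqueeze(x_sqz, num_sqz=2)     # -> [1, 80, 6]
assert torch.allclose(x, x_rec)            # the two operations invert each other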
diff --git a/submodules/TTS/TTS/tts/layers/glow_tts/duration_predictor.py b/submodules/TTS/TTS/tts/layers/glow_tts/duration_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e766ed6ab5a0348eaca8d1482be124003d8b8c68
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/glow_tts/duration_predictor.py
@@ -0,0 +1,69 @@
+import torch
+from torch import nn
+
+from ..generic.normalization import LayerNorm
+
+
+class DurationPredictor(nn.Module):
+ """Glow-TTS duration prediction model.
+
+ ::
+
+ [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
+
+ Args:
+ in_channels (int): Number of channels of the input tensor.
+ hidden_channels (int): Number of hidden channels of the network.
+ kernel_size (int): Kernel size for the conv layers.
+ dropout_p (float): Dropout rate used after each conv layer.
+ """
+
+ def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None):
+ super().__init__()
+
+ # add language embedding dim in the input
+ if language_emb_dim:
+ in_channels += language_emb_dim
+
+ # class arguments
+ self.in_channels = in_channels
+ self.filter_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dropout_p = dropout_p
+ # layers
+ self.drop = nn.Dropout(dropout_p)
+ self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
+ self.norm_1 = LayerNorm(hidden_channels)
+ self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
+ self.norm_2 = LayerNorm(hidden_channels)
+ # output layer
+ self.proj = nn.Conv1d(hidden_channels, 1, 1)
+ if cond_channels is not None and cond_channels != 0:
+ self.cond = nn.Conv1d(cond_channels, in_channels, 1)
+
+ if language_emb_dim != 0 and language_emb_dim is not None:
+ self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1)
+
+ def forward(self, x, x_mask, g=None, lang_emb=None):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ - g: :math:`[B, C, 1]`
+ """
+ if g is not None:
+ x = x + self.cond(g)
+
+ if lang_emb is not None:
+ x = x + self.cond_lang(lang_emb)
+
+ x = self.conv_1(x * x_mask)
+ x = torch.relu(x)
+ x = self.norm_1(x)
+ x = self.drop(x)
+ x = self.conv_2(x * x_mask)
+ x = torch.relu(x)
+ x = self.norm_2(x)
+ x = self.drop(x)
+ x = self.proj(x * x_mask)
+ return x * x_mask
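A minimal sketch of how the duration predictor is driven (sizes are illustrative): it maps encoder outputs to one log-duration value per input token.

import torch
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor

dp = DurationPredictor(in_channels=192, hidden_channels=256, kernel_size=3, dropout_p=0.1)
x = torch.randn(2, 192, 40)       # encoder outputs, [B, C, T]
x_mask = torch.ones(2, 1, 40)     # [B, 1, T]
logw = dp(x, x_mask)              # -> [2, 1, 40], one log-duration per token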
diff --git a/submodules/TTS/TTS/tts/layers/glow_tts/encoder.py b/submodules/TTS/TTS/tts/layers/glow_tts/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b43e527f5e9ca2bd0880bf204e04a1526bc8dfb
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/glow_tts/encoder.py
@@ -0,0 +1,179 @@
+import math
+
+import torch
+from torch import nn
+
+from TTS.tts.layers.generic.gated_conv import GatedConvBlock
+from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
+from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
+from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
+from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
+from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
+from TTS.tts.utils.helpers import sequence_mask
+
+
+class Encoder(nn.Module):
+ """Glow-TTS encoder module.
+
+ ::
+
+ embedding -> -> encoder_module -> --> proj_mean
+ |
+ |-> proj_var
+ |
+ |-> concat -> duration_predictor
+ ↑
+ speaker_embed
+
+ Args:
+ num_chars (int): number of characters.
+ out_channels (int): number of output channels.
+ hidden_channels (int): encoder's embedding size.
+ hidden_channels_ffn (int): transformer's feed-forward channels.
+ kernel_size (int): kernel size for conv layers and duration predictor.
+ dropout_p (float): dropout rate for any dropout layer.
+ mean_only (bool): if True, output only mean values and use constant std.
+ use_prenet (bool): if True, use pre-convolutional layers before transformer layers.
+ c_in_channels (int): number of channels in conditional input.
+
+ Shapes:
+ - input: (B, T, C)
+
+ ::
+
+ suggested encoder params...
+
+ for encoder_type == 'rel_pos_transformer'
+ encoder_params={
+ 'kernel_size':3,
+ 'dropout_p': 0.1,
+ 'num_layers': 6,
+ 'num_heads': 2,
+ 'hidden_channels_ffn': 768, # 4 times the hidden_channels
+ 'input_length': None
+ }
+
+ for encoder_type == 'gated_conv'
+ encoder_params={
+ 'kernel_size':5,
+ 'dropout_p': 0.1,
+ 'num_layers': 9,
+ }
+
+ for encoder_type == 'residual_conv_bn'
+ encoder_params={
+ "kernel_size": 4,
+ "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
+ "num_conv_blocks": 2,
+ "num_res_blocks": 13
+ }
+
+ for encoder_type == 'time_depth_separable'
+ encoder_params={
+ "kernel_size": 5,
+ 'num_layers': 9,
+ }
+ """
+
+ def __init__(
+ self,
+ num_chars,
+ out_channels,
+ hidden_channels,
+ hidden_channels_dp,
+ encoder_type,
+ encoder_params,
+ dropout_p_dp=0.1,
+ mean_only=False,
+ use_prenet=True,
+ c_in_channels=0,
+ ):
+ super().__init__()
+ # class arguments
+ self.num_chars = num_chars
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.hidden_channels_dp = hidden_channels_dp
+ self.dropout_p_dp = dropout_p_dp
+ self.mean_only = mean_only
+ self.use_prenet = use_prenet
+ self.c_in_channels = c_in_channels
+ self.encoder_type = encoder_type
+ # embedding layer
+ self.emb = nn.Embedding(num_chars, hidden_channels)
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+ # init encoder module
+ if encoder_type.lower() == "rel_pos_transformer":
+ if use_prenet:
+ self.prenet = ResidualConv1dLayerNormBlock(
+ hidden_channels, hidden_channels, hidden_channels, kernel_size=5, num_layers=3, dropout_p=0.5
+ )
+ self.encoder = RelativePositionTransformer(
+ hidden_channels, hidden_channels, hidden_channels, **encoder_params
+ )
+ elif encoder_type.lower() == "gated_conv":
+ self.encoder = GatedConvBlock(hidden_channels, **encoder_params)
+ elif encoder_type.lower() == "residual_conv_bn":
+ if use_prenet:
+ self.prenet = nn.Sequential(nn.Conv1d(hidden_channels, hidden_channels, 1), nn.ReLU())
+ self.encoder = ResidualConv1dBNBlock(hidden_channels, hidden_channels, hidden_channels, **encoder_params)
+ self.postnet = nn.Sequential(
+ nn.Conv1d(self.hidden_channels, self.hidden_channels, 1), nn.BatchNorm1d(self.hidden_channels)
+ )
+ elif encoder_type.lower() == "time_depth_separable":
+ if use_prenet:
+ self.prenet = ResidualConv1dLayerNormBlock(
+ hidden_channels, hidden_channels, hidden_channels, kernel_size=5, num_layers=3, dropout_p=0.5
+ )
+ self.encoder = TimeDepthSeparableConvBlock(
+ hidden_channels, hidden_channels, hidden_channels, **encoder_params
+ )
+ else:
+ raise ValueError(" [!] Unknown encoder type.")
+
+ # final projection layers
+ self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
+ if not mean_only:
+ self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1)
+ # duration predictor
+ self.duration_predictor = DurationPredictor(
+ hidden_channels + c_in_channels, hidden_channels_dp, 3, dropout_p_dp
+ )
+
+ def forward(self, x, x_lengths, g=None):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_lengths: :math:`[B]`
+ - g (optional): :math:`[B, 1, T]`
+ """
+ # embedding layer
+ # [B ,T, D]
+ x = self.emb(x) * math.sqrt(self.hidden_channels)
+ # [B, D, T]
+ x = torch.transpose(x, 1, -1)
+ # compute input sequence mask
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+ # prenet
+ if hasattr(self, "prenet") and self.use_prenet:
+ x = self.prenet(x, x_mask)
+ # encoder
+ x = self.encoder(x, x_mask)
+ # postnet
+ if hasattr(self, "postnet"):
+ x = self.postnet(x) * x_mask
+ # set duration predictor input
+ if g is not None:
+ g_exp = g.expand(-1, -1, x.size(-1))
+ x_dp = torch.cat([x.detach(), g_exp], 1)
+ else:
+ x_dp = x.detach()
+ # final projection layer
+ x_m = self.proj_m(x) * x_mask
+ if not self.mean_only:
+ x_logs = self.proj_s(x) * x_mask
+ else:
+ x_logs = torch.zeros_like(x_m)
+ # duration predictor
+ logw = self.duration_predictor(x_dp, x_mask)
+ return x_m, x_logs, logw, x_mask
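A usage sketch of the encoder with the `rel_pos_transformer` parameters suggested in the docstring above (vocabulary size, output channels, and lengths are illustrative).

import torch
from TTS.tts.layers.glow_tts.encoder import Encoder

encoder = Encoder(
    num_chars=100,
    out_channels=80,
    hidden_channels=192,
    hidden_channels_dp=256,
    encoder_type="rel_pos_transformer",
    encoder_params={
        "kernel_size": 3, "dropout_p": 0.1, "num_layers": 6,
        "num_heads": 2, "hidden_channels_ffn": 768, "input_length": None,
    },
)
tokens = torch.randint(0, 100, (2, 30))    # [B, T] character ids
lengths = torch.tensor([30, 25])
x_m, x_logs, logw, x_mask = encoder(tokens, lengths)
# x_m, x_logs: [2, 80, 30]; logw, x_mask: [2, 1, 30]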
diff --git a/submodules/TTS/TTS/tts/layers/glow_tts/glow.py b/submodules/TTS/TTS/tts/layers/glow_tts/glow.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02c3118085fbd3305796d4ce7f0d149fa1bf72e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/glow_tts/glow.py
@@ -0,0 +1,233 @@
+import torch
+from packaging.version import Version
+from torch import nn
+from torch.nn import functional as F
+
+from TTS.tts.layers.generic.wavenet import WN
+
+from ..generic.normalization import LayerNorm
+
+
+class ResidualConv1dLayerNormBlock(nn.Module):
+ """Conv1d with Layer Normalization and residual connection as in GlowTTS paper.
+ https://arxiv.org/pdf/1811.00002.pdf
+
+ ::
+
+ x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o
+ |---------------> conv1d_1x1 ------------------|
+
+ Args:
+ in_channels (int): number of input tensor channels.
+ hidden_channels (int): number of inner layer channels.
+ out_channels (int): number of output tensor channels.
+ kernel_size (int): kernel size of conv1d filter.
+ num_layers (int): number of blocks.
+ dropout_p (float): dropout rate for each block.
+ """
+
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, num_layers, dropout_p):
+ super().__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.num_layers = num_layers
+ self.dropout_p = dropout_p
+ assert num_layers > 1, " [!] number of layers should be > 1."
+ assert kernel_size % 2 == 1, " [!] kernel size should be odd number."
+
+ self.conv_layers = nn.ModuleList()
+ self.norm_layers = nn.ModuleList()
+
+ for idx in range(num_layers):
+ self.conv_layers.append(
+ nn.Conv1d(
+ in_channels if idx == 0 else hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+ )
+ )
+ self.norm_layers.append(LayerNorm(hidden_channels))
+
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ """
+ x_res = x
+ for i in range(self.num_layers):
+ x = self.conv_layers[i](x * x_mask)
+ x = self.norm_layers[i](x * x_mask)
+ x = F.dropout(F.relu(x), self.dropout_p, training=self.training)
+ x = x_res + self.proj(x)
+ return x * x_mask
+
+
+class InvConvNear(nn.Module):
+ """Invertible Convolution with input splitting as in GlowTTS paper.
+ https://arxiv.org/pdf/1811.00002.pdf
+
+ Args:
+ channels (int): input and output channels.
+ num_splits (int): number of splits, also H and W of conv layer.
+ no_jacobian (bool): enable/disable jacobian computations.
+
+ Note:
+ Split the input into groups of size self.num_splits and
+ perform 1x1 convolution separately. Cast 1x1 conv operation
+ to 2d by reshaping the input for efficiency.
+ """
+
+ def __init__(self, channels, num_splits=4, no_jacobian=False, **kwargs): # pylint: disable=unused-argument
+ super().__init__()
+ assert num_splits % 2 == 0
+ self.channels = channels
+ self.num_splits = num_splits
+ self.no_jacobian = no_jacobian
+ self.weight_inv = None
+
+ if Version(torch.__version__) < Version("1.9"):
+ w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0]
+ else:
+ w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
+
+ if torch.det(w_init) < 0:
+ w_init[:, 0] = -1 * w_init[:, 0]
+ self.weight = nn.Parameter(w_init)
+
+ def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ """
+ b, c, t = x.size()
+ assert c % self.num_splits == 0
+ if x_mask is None:
+ x_mask = 1
+ x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t
+ else:
+ x_len = torch.sum(x_mask, [1, 2])
+
+ x = x.view(b, 2, c // self.num_splits, self.num_splits // 2, t)
+ x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.num_splits, c // self.num_splits, t)
+
+ if reverse:
+ if self.weight_inv is not None:
+ weight = self.weight_inv
+ else:
+ weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype)
+ logdet = None
+ else:
+ weight = self.weight
+ if self.no_jacobian:
+ logdet = 0
+ else:
+ logdet = torch.logdet(self.weight) * (c / self.num_splits) * x_len # [b]
+
+ weight = weight.view(self.num_splits, self.num_splits, 1, 1)
+ z = F.conv2d(x, weight)
+
+ z = z.view(b, 2, self.num_splits // 2, c // self.num_splits, t)
+ z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask
+ return z, logdet
+
+ def store_inverse(self):
+ weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype)
+ self.weight_inv = nn.Parameter(weight_inv, requires_grad=False)
+
+
+class CouplingBlock(nn.Module):
+ """Glow Affine Coupling block as in GlowTTS paper.
+ https://arxiv.org/pdf/1811.00002.pdf
+
+ ::
+
+ x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o
+ '-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^
+
+ Args:
+ in_channels (int): number of input tensor channels.
+ hidden_channels (int): number of hidden channels.
+ kernel_size (int): WaveNet filter kernel size.
+ dilation_rate (int): rate to increase dilation by each layer in a decoder block.
+ num_layers (int): number of WaveNet layers.
+ c_in_channels (int): number of conditioning input channels.
+ dropout_p (float): wavenet dropout rate.
+ sigmoid_scale (bool): enable/disable sigmoid scaling for output scale.
+
+ Note:
+ It does not use the conditional inputs differently from WaveGlow.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_layers,
+ c_in_channels=0,
+ dropout_p=0,
+ sigmoid_scale=False,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.num_layers = num_layers
+ self.c_in_channels = c_in_channels
+ self.dropout_p = dropout_p
+ self.sigmoid_scale = sigmoid_scale
+ # input layer
+ start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
+ start = torch.nn.utils.parametrizations.weight_norm(start)
+ self.start = start
+ # output layer
+ # Initializing last layer to 0 makes the affine coupling layers
+ # do nothing at first. This helps with training stability
+ end = torch.nn.Conv1d(hidden_channels, in_channels, 1)
+ end.weight.data.zero_()
+ end.bias.data.zero_()
+ self.end = end
+ # coupling layers
+ self.wn = WN(hidden_channels, hidden_channels, kernel_size, dilation_rate, num_layers, c_in_channels, dropout_p)
+
+ def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): # pylint: disable=unused-argument
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ - g: :math:`[B, C, 1]`
+ """
+ if x_mask is None:
+ x_mask = 1
+ x_0, x_1 = x[:, : self.in_channels // 2], x[:, self.in_channels // 2 :]
+
+ x = self.start(x_0) * x_mask
+ x = self.wn(x, x_mask, g)
+ out = self.end(x)
+
+ z_0 = x_0
+ t = out[:, : self.in_channels // 2, :]
+ s = out[:, self.in_channels // 2 :, :]
+ if self.sigmoid_scale:
+ s = torch.log(1e-6 + torch.sigmoid(s + 2))
+
+ if reverse:
+ z_1 = (x_1 - t) * torch.exp(-s) * x_mask
+ logdet = None
+ else:
+ z_1 = (t + torch.exp(s) * x_1) * x_mask
+ logdet = torch.sum(s * x_mask, [1, 2])
+
+ z = torch.cat([z_0, z_1], 1)
+ return z, logdet
+
+ def store_inverse(self):
+ self.wn.remove_weight_norm()
diff --git a/submodules/TTS/TTS/tts/layers/glow_tts/transformer.py b/submodules/TTS/TTS/tts/layers/glow_tts/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..02688d611fe41394e8e1fedbc5742845eae85cfd
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/glow_tts/transformer.py
@@ -0,0 +1,432 @@
+import math
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2
+
+
+class RelativePositionMultiHeadAttention(nn.Module):
+ """Multi-head attention with Relative Positional embedding.
+ https://arxiv.org/pdf/1809.04281.pdf
+
+ It learns positional embeddings for a window of neighbours. For keys and values,
+ it learns a different set of embeddings. Key embeddings are aggregated with the attention
+ scores and value embeddings are aggregated with the output.
+
+ Note:
+ Example with relative attention window size 2
+
+ - input = [a, b, c, d, e]
+ - rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)]
+
+ So it learns 4 embedding vectors (in total 8) separately for key and value vectors.
+
+ Considering the input c
+
+ - e(t-2) corresponds to c -> a
+ - e(t-1) corresponds to c -> b
+ - e(t+1) corresponds to c -> d
+ - e(t+2) corresponds to c -> e
+
+ These embeddings are shared among different time steps. So inputs a, b, d and e also use
+ the same embeddings.
+
+ Embeddings are ignored when the relative window extends beyond the sequence
+ boundaries for the first and the last n items.
+
+ Args:
+ channels (int): input and inner layer channels.
+ out_channels (int): output channels.
+ num_heads (int): number of attention heads.
+ rel_attn_window_size (int, optional): relative attention window size.
+ If 4, each time step attends to the 4 previous and 4 following time steps.
+ If None (default), relative encoding is disabled and it is a regular transformer.
+ Defaults to None.
+ heads_share (bool, optional): share the relative embeddings across attention heads. Defaults to True.
+ dropout_p (float, optional): dropout rate. Defaults to 0.
+ input_length (int, optional): input length for positional encoding. Defaults to None.
+ proximal_bias (bool, optional): enable/disable proximal bias as in the paper. Defaults to False.
+ proximal_init (bool, optional): enable/disable proximal init as in the paper.
+ Init key and query layer weights the same. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ channels,
+ out_channels,
+ num_heads,
+ rel_attn_window_size=None,
+ heads_share=True,
+ dropout_p=0.0,
+ input_length=None,
+ proximal_bias=False,
+ proximal_init=False,
+ ):
+ super().__init__()
+ assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
+ # class attributes
+ self.channels = channels
+ self.out_channels = out_channels
+ self.num_heads = num_heads
+ self.rel_attn_window_size = rel_attn_window_size
+ self.heads_share = heads_share
+ self.input_length = input_length
+ self.proximal_bias = proximal_bias
+ self.dropout_p = dropout_p
+ self.attn = None
+ # query, key, value layers
+ self.k_channels = channels // num_heads
+ self.conv_q = nn.Conv1d(channels, channels, 1)
+ self.conv_k = nn.Conv1d(channels, channels, 1)
+ self.conv_v = nn.Conv1d(channels, channels, 1)
+ # output layers
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
+ self.dropout = nn.Dropout(dropout_p)
+ # relative positional encoding layers
+ if rel_attn_window_size is not None:
+ n_heads_rel = 1 if heads_share else num_heads
+ rel_stddev = self.k_channels**-0.5
+ emb_rel_k = nn.Parameter(
+ torch.randn(n_heads_rel, rel_attn_window_size * 2 + 1, self.k_channels) * rel_stddev
+ )
+ emb_rel_v = nn.Parameter(
+ torch.randn(n_heads_rel, rel_attn_window_size * 2 + 1, self.k_channels) * rel_stddev
+ )
+ self.register_parameter("emb_rel_k", emb_rel_k)
+ self.register_parameter("emb_rel_v", emb_rel_v)
+
+ # init layers
+ nn.init.xavier_uniform_(self.conv_q.weight)
+ nn.init.xavier_uniform_(self.conv_k.weight)
+ # proximal bias
+ if proximal_init:
+ self.conv_k.weight.data.copy_(self.conv_q.weight.data)
+ self.conv_k.bias.data.copy_(self.conv_q.bias.data)
+ nn.init.xavier_uniform_(self.conv_v.weight)
+
+ def forward(self, x, c, attn_mask=None):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - c: :math:`[B, C, T]`
+ - attn_mask: :math:`[B, 1, T, T]`
+ """
+ q = self.conv_q(x)
+ k = self.conv_k(c)
+ v = self.conv_v(c)
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+ x = self.conv_o(x)
+ return x
+
+ def attention(self, query, key, value, mask=None):
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
+ b, d, t_s, t_t = (*key.size(), query.size(2))
+ query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
+ key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
+ value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
+ # compute raw attention scores
+ scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
+ # relative positional encoding for scores
+ if self.rel_attn_window_size is not None:
+ assert t_s == t_t, "Relative attention is only available for self-attention."
+ # get relative key embeddings
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+ rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
+ rel_logits = self._relative_position_to_absolute_position(rel_logits)
+ scores_local = rel_logits / math.sqrt(self.k_channels)
+ scores = scores + scores_local
+ # proximal bias
+ if self.proximal_bias:
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
+ scores = scores + self._attn_proximity_bias(t_s).to(device=scores.device, dtype=scores.dtype)
+ # attention score masking
+ if mask is not None:
+ # mask with a large negative value instead of -inf to avoid numerical (out-of-range) errors.
+ scores = scores.masked_fill(mask == 0, -1e4)
+ if self.input_length is not None:
+ block_mask = torch.ones_like(scores).triu(-1 * self.input_length).tril(self.input_length)
+ scores = scores * block_mask + -1e4 * (1 - block_mask)
+ # attention score normalization
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
+ # apply dropout to attention weights
+ p_attn = self.dropout(p_attn)
+ # compute output
+ output = torch.matmul(p_attn, value)
+ # relative positional encoding for values
+ if self.rel_attn_window_size is not None:
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
+ return output, p_attn
+
+ @staticmethod
+ def _matmul_with_relative_values(p_attn, re):
+ """
+ Args:
+ p_attn (Tensor): attention weights.
+ re (Tensor): relative value embedding vector. (a_(i,j)^V)
+
+ Shapes:
+ -p_attn: :math:`[B, H, T, V]`
+ -re: :math:`[H or 1, V, D]`
+ -logits: :math:`[B, H, T, D]`
+ """
+ logits = torch.matmul(p_attn, re.unsqueeze(0))
+ return logits
+
+ @staticmethod
+ def _matmul_with_relative_keys(query, re):
+ """
+ Args:
+ query (Tensor): batch of query vectors. (x*W^Q)
+ re (Tensor): relative key embedding vector. (a_(i,j)^K)
+
+ Shapes:
+ - query: :math:`[B, H, T, D]`
+ - re: :math:`[H or 1, V, D]`
+ - logits: :math:`[B, H, T, V]`
+ """
+ # logits = torch.einsum('bhld, kmd -> bhlm', [query, re.to(query.dtype)])
+ logits = torch.matmul(query, re.unsqueeze(0).transpose(-2, -1))
+ return logits
+
+ def _get_relative_embeddings(self, relative_embeddings, length):
+ """Convert embedding vestors to a tensor of embeddings"""
+ # Pad first before slice to avoid using cond ops.
+ pad_length = max(length - (self.rel_attn_window_size + 1), 0)
+ slice_start_position = max((self.rel_attn_window_size + 1) - length, 0)
+ slice_end_position = slice_start_position + 2 * length - 1
+ if pad_length > 0:
+ padded_relative_embeddings = F.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])
+ else:
+ padded_relative_embeddings = relative_embeddings
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+ return used_relative_embeddings
+
+ @staticmethod
+ def _relative_position_to_absolute_position(x):
+ """Converts tensor from relative to absolute indexing for local attention.
+ Shapes:
+ x: :math:`[B, C, T, 2 * T - 1]`
+ Returns:
+ A Tensor of shape :math:`[B, C, T, T]`
+ """
+ batch, heads, length, _ = x.size()
+ # Pad to shift from relative to absolute indexing.
+ x = F.pad(x, [0, 1, 0, 0, 0, 0, 0, 0])
+ # Pad extra elements so that the flattened tensor reshapes to (len+1, 2*len-1).
+ x_flat = x.view([batch, heads, length * 2 * length])
+ x_flat = F.pad(x_flat, [0, length - 1, 0, 0, 0, 0])
+ # Reshape and slice out the padded elements.
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
+ return x_final
+
+ @staticmethod
+ def _absolute_position_to_relative_position(x):
+ """
+ Shapes:
+ - x: :math:`[B, C, T, T]`
+ - ret: :math:`[B, C, T, 2*T-1]`
+ """
+ batch, heads, length, _ = x.size()
+ # pad along the column dimension
+ x = F.pad(x, [0, length - 1, 0, 0, 0, 0, 0, 0])
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+ # add 0's in the beginning that will skew the elements after reshape
+ x_flat = F.pad(x_flat, [length, 0, 0, 0, 0, 0])
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+ return x_final
+
+ @staticmethod
+ def _attn_proximity_bias(length):
+ """Produce an attention mask that discourages distant
+ attention values.
+ Args:
+ length (int): an integer scalar.
+ Returns:
+ a Tensor with shape :math:`[1, 1, T, T]`
+ """
+ # L
+ r = torch.arange(length, dtype=torch.float32)
+ # L x L
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+ # scale mask values
+ diff = -torch.log1p(torch.abs(diff))
+ # 1 x 1 x L x L
+ return diff.unsqueeze(0).unsqueeze(0)
+
+
+class FeedForwardNetwork(nn.Module):
+ """Feed Forward Inner layers for Transformer.
+
+ Args:
+ in_channels (int): input tensor channels.
+ out_channels (int): output tensor channels.
+ hidden_channels (int): inner layers hidden channels.
+ kernel_size (int): conv1d filter kernel size.
+ dropout_p (float, optional): dropout rate. Defaults to 0.
+ """
+
+ def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dropout_p = dropout_p
+
+ if causal:
+ self.padding = self._causal_padding
+ else:
+ self.padding = self._same_padding
+
+ self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size)
+ self.conv_2 = nn.Conv1d(hidden_channels, out_channels, kernel_size)
+ self.dropout = nn.Dropout(dropout_p)
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(self.padding(x * x_mask))
+ x = torch.relu(x)
+ x = self.dropout(x)
+ x = self.conv_2(self.padding(x * x_mask))
+ return x * x_mask
+
+ def _causal_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = self.kernel_size - 1
+ pad_r = 0
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, self._pad_shape(padding))
+ return x
+
+ def _same_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = (self.kernel_size - 1) // 2
+ pad_r = self.kernel_size // 2
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, self._pad_shape(padding))
+ return x
+
+ @staticmethod
+ def _pad_shape(padding):
+ l = padding[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+class RelativePositionTransformer(nn.Module):
+ """Transformer with Relative Potional Encoding.
+ https://arxiv.org/abs/1803.02155
+
+ Args:
+ in_channels (int): number of channels of the input tensor.
+ out_channels (int): number of channels of the output tensor.
+ hidden_channels (int): model hidden channels.
+ hidden_channels_ffn (int): hidden channels of FeedForwardNetwork.
+ num_heads (int): number of attention heads.
+ num_layers (int): number of transformer layers.
+ kernel_size (int, optional): kernel size of feed-forward inner layers. Defaults to 1.
+ dropout_p (float, optional): dropout rate for self-attention and feed-forward inner layers. Defaults to 0.
+ rel_attn_window_size (int, optional): relative attention window size.
+ If 4, each time step attends to the 4 previous and 4 following time steps.
+ If None (default), relative encoding is disabled and it is a regular transformer.
+ Defaults to None.
+ input_length (int, optional): input length to limit position encoding. Defaults to None.
+ layer_norm_type (str, optional): type "1" uses torch tensor operations and type "2" uses the torch layer_norm
+ primitive. Use type "2"; type "1" is for backward compatibility. Defaults to "1".
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ hidden_channels: int,
+ hidden_channels_ffn: int,
+ num_heads: int,
+ num_layers: int,
+ kernel_size=1,
+ dropout_p=0.0,
+ rel_attn_window_size: int = None,
+ input_length: int = None,
+ layer_norm_type: str = "1",
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.hidden_channels_ffn = hidden_channels_ffn
+ self.num_heads = num_heads
+ self.num_layers = num_layers
+ self.kernel_size = kernel_size
+ self.dropout_p = dropout_p
+ self.rel_attn_window_size = rel_attn_window_size
+
+ self.dropout = nn.Dropout(dropout_p)
+ self.attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+
+ for idx in range(self.num_layers):
+ self.attn_layers.append(
+ RelativePositionMultiHeadAttention(
+ hidden_channels if idx != 0 else in_channels,
+ hidden_channels,
+ num_heads,
+ rel_attn_window_size=rel_attn_window_size,
+ dropout_p=dropout_p,
+ input_length=input_length,
+ )
+ )
+ if layer_norm_type == "1":
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ elif layer_norm_type == "2":
+ self.norm_layers_1.append(LayerNorm2(hidden_channels))
+ else:
+ raise ValueError(" [!] Unknown layer norm type")
+
+ if hidden_channels != out_channels and (idx + 1) == self.num_layers:
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+
+ self.ffn_layers.append(
+ FeedForwardNetwork(
+ hidden_channels,
+ hidden_channels if (idx + 1) != self.num_layers else out_channels,
+ hidden_channels_ffn,
+ kernel_size,
+ dropout_p=dropout_p,
+ )
+ )
+
+ if layer_norm_type == "1":
+ self.norm_layers_2.append(LayerNorm(hidden_channels if (idx + 1) != self.num_layers else out_channels))
+ elif layer_norm_type == "2":
+ self.norm_layers_2.append(LayerNorm2(hidden_channels if (idx + 1) != self.num_layers else out_channels))
+ else:
+ raise ValueError(" [!] Unknown layer norm type")
+
+ def forward(self, x, x_mask):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ """
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ for i in range(self.num_layers):
+ x = x * x_mask
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.dropout(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.dropout(y)
+
+ if (i + 1) == self.num_layers and hasattr(self, "proj"):
+ x = self.proj(x)
+
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
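A hedged usage sketch for the relative-position transformer above, roughly matching the encoder settings suggested earlier in this diff (channel counts and window size are illustrative).

import torch
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer

model = RelativePositionTransformer(
    in_channels=192, out_channels=192, hidden_channels=192,
    hidden_channels_ffn=768, num_heads=2, num_layers=6,
    kernel_size=3, dropout_p=0.1, rel_attn_window_size=4,
)
x = torch.randn(2, 192, 50)       # [B, C, T]
x_mask = torch.ones(2, 1, 50)     # [B, 1, T]
y = model(x, x_mask)              # -> [2, 192, 50]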
diff --git a/submodules/TTS/TTS/tts/layers/losses.py b/submodules/TTS/TTS/tts/layers/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..de5f408c48cf9183dfb14c30a6248a2b300bde4d
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/losses.py
@@ -0,0 +1,889 @@
+import math
+
+import numpy as np
+import torch
+from coqpit import Coqpit
+from torch import nn
+from torch.nn import functional
+
+from TTS.tts.utils.helpers import sequence_mask
+from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss
+from TTS.utils.audio.torch_transforms import TorchSTFT
+
+
+# pylint: disable=abstract-method
+# relates https://github.com/pytorch/pytorch/issues/42305
+class L1LossMasked(nn.Module):
+ def __init__(self, seq_len_norm):
+ super().__init__()
+ self.seq_len_norm = seq_len_norm
+
+ def forward(self, x, target, length):
+ """
+ Args:
+ x: A FloatTensor of size
+ (batch, max_len, dim) which contains the
+ predicted values for each step.
+ target: A FloatTensor of size
+ (batch, max_len, dim) which contains the
+ target values for each step.
+ length: A LongTensor of size (batch,)
+ which contains the valid length of each sample in the batch.
+ Shapes:
+ x: B x T X D
+ target: B x T x D
+ length: B
+ Returns:
+ loss: An average loss value in range [0, 1] masked by the length.
+ """
+ # mask: (batch, max_len, 1)
+ target.requires_grad = False
+ mask = sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
+ if self.seq_len_norm:
+ norm_w = mask / mask.sum(dim=1, keepdim=True)
+ out_weights = norm_w.div(target.shape[0] * target.shape[2])
+ mask = mask.expand_as(x)
+ loss = functional.l1_loss(x * mask, target * mask, reduction="none")
+ loss = loss.mul(out_weights.to(loss.device)).sum()
+ else:
+ mask = mask.expand_as(x)
+ loss = functional.l1_loss(x * mask, target * mask, reduction="sum")
+ loss = loss / mask.sum()
+ return loss
+
+
+class MSELossMasked(nn.Module):
+ def __init__(self, seq_len_norm):
+ super().__init__()
+ self.seq_len_norm = seq_len_norm
+
+ def forward(self, x, target, length):
+ """
+ Args:
+ x: A FloatTensor of size
+ (batch, max_len, dim) which contains the
+ predicted values for each step.
+ target: A FloatTensor of size
+ (batch, max_len, dim) which contains the
+ target values for each step.
+ length: A LongTensor of size (batch,)
+ which contains the valid length of each sample in the batch.
+ Shapes:
+ - x: :math:`[B, T, D]`
+ - target: :math:`[B, T, D]`
+ - length: :math:`B`
+ Returns:
+ loss: An average loss value in range [0, 1] masked by the length.
+ """
+ # mask: (batch, max_len, 1)
+ target.requires_grad = False
+ mask = sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
+ if self.seq_len_norm:
+ norm_w = mask / mask.sum(dim=1, keepdim=True)
+ out_weights = norm_w.div(target.shape[0] * target.shape[2])
+ mask = mask.expand_as(x)
+ loss = functional.mse_loss(x * mask, target * mask, reduction="none")
+ loss = loss.mul(out_weights.to(loss.device)).sum()
+ else:
+ mask = mask.expand_as(x)
+ loss = functional.mse_loss(x * mask, target * mask, reduction="sum")
+ loss = loss / mask.sum()
+ return loss
+
+
+def sample_wise_min_max(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+ """Min-Max normalize tensor through first dimension
+ Shapes:
+ - x: :math:`[B, D1, D2]`
+ - m: :math:`[B, D1, 1]`
+ """
+ maximum = torch.amax(x.masked_fill(~mask, 0), dim=(1, 2), keepdim=True)
+ minimum = torch.amin(x.masked_fill(~mask, np.inf), dim=(1, 2), keepdim=True)
+ return (x - minimum) / (maximum - minimum + 1e-8)
+
+
+class SSIMLoss(torch.nn.Module):
+ """SSIM loss as (1 - SSIM)
+ SSIM is explained here https://en.wikipedia.org/wiki/Structural_similarity
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.loss_func = _SSIMLoss()
+
+ def forward(self, y_hat, y, length):
+ """
+ Args:
+ y_hat (tensor): model prediction values.
+ y (tensor): target values.
+ length (tensor): length of each sample in a batch for masking.
+
+ Shapes:
+ y_hat: B x T X D
+ y: B x T x D
+ length: B
+
+ Returns:
+ loss: An average loss value in range [0, 1] masked by the length.
+ """
+ mask = sequence_mask(sequence_length=length, max_len=y.size(1)).unsqueeze(2)
+ y_norm = sample_wise_min_max(y, mask)
+ y_hat_norm = sample_wise_min_max(y_hat, mask)
+ ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1))
+
+ if ssim_loss.item() > 1.0:
+ print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 1.0")
+ ssim_loss = torch.tensor(1.0, device=ssim_loss.device)
+
+ if ssim_loss.item() < 0.0:
+ print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 0.0")
+ ssim_loss = torch.tensor(0.0, device=ssim_loss.device)
+
+ return ssim_loss
+
+
+class AttentionEntropyLoss(nn.Module):
+ # pylint: disable=R0201
+ def forward(self, align):
+ """
+ Forces attention to be more decisive by penalizing
+ soft attention weights
+ """
+ entropy = torch.distributions.Categorical(probs=align).entropy()
+ loss = (entropy / np.log(align.shape[1])).mean()
+ return loss
+
+
+class BCELossMasked(nn.Module):
+ """BCE loss with masking.
+
+ Used mainly for stopnet in autoregressive models.
+
+ Args:
+ pos_weight (float): weight for positive samples. If set < 1, penalize early stopping. Defaults to None.
+ """
+
+ def __init__(self, pos_weight: float = None):
+ super().__init__()
+ self.register_buffer("pos_weight", torch.tensor([pos_weight]))
+
+ def forward(self, x, target, length):
+ """
+ Args:
+ x: A FloatTensor of size
+ (batch, max_len) which contains the
+ predicted stop logits for each step.
+ target: A FloatTensor of size
+ (batch, max_len) which contains the
+ binary stop targets (0 or 1) for each step.
+ length: A LongTensor of size (batch,)
+ which contains the valid length of each sample in the batch.
+ Shapes:
+ x: B x T
+ target: B x T
+ length: B
+ Returns:
+ loss: An average loss value in range [0, 1] masked by the length.
+ """
+ target.requires_grad = False
+ if length is not None:
+ # mask: (batch, max_len, 1)
+ mask = sequence_mask(sequence_length=length, max_len=target.size(1))
+ num_items = mask.sum()
+ loss = functional.binary_cross_entropy_with_logits(
+ x.masked_select(mask),
+ target.masked_select(mask),
+ pos_weight=self.pos_weight.to(x.device),
+ reduction="sum",
+ )
+ else:
+ loss = functional.binary_cross_entropy_with_logits(
+ x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
+ )
+ num_items = torch.numel(x)
+ loss = loss / num_items
+ return loss
+
+
+class DifferentialSpectralLoss(nn.Module):
+ """Differential Spectral Loss
+ https://arxiv.org/ftp/arxiv/papers/1909/1909.10302.pdf"""
+
+ def __init__(self, loss_func):
+ super().__init__()
+ self.loss_func = loss_func
+
+ def forward(self, x, target, length=None):
+ """
+ Shapes:
+ x: B x T
+ target: B x T
+ length: B
+ Returns:
+ loss: An average loss value in range [0, 1] masked by the length.
+ """
+ x_diff = x[:, 1:] - x[:, :-1]
+ target_diff = target[:, 1:] - target[:, :-1]
+ if length is None:
+ return self.loss_func(x_diff, target_diff)
+ return self.loss_func(x_diff, target_diff, length - 1)
+
+
+class GuidedAttentionLoss(torch.nn.Module):
+ def __init__(self, sigma=0.4):
+ super().__init__()
+ self.sigma = sigma
+
+ def _make_ga_masks(self, ilens, olens):
+ B = len(ilens)
+ max_ilen = max(ilens)
+ max_olen = max(olens)
+ ga_masks = torch.zeros((B, max_olen, max_ilen))
+ for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
+ ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma)
+ return ga_masks
+
+ def forward(self, att_ws, ilens, olens):
+ ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device)
+ seq_masks = self._make_masks(ilens, olens).to(att_ws.device)
+ losses = ga_masks * att_ws
+ loss = torch.mean(losses.masked_select(seq_masks))
+ return loss
+
+ @staticmethod
+ def _make_ga_mask(ilen, olen, sigma):
+ grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen))
+ grid_x, grid_y = grid_x.float(), grid_y.float()
+ return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2)))
+
+ @staticmethod
+ def _make_masks(ilens, olens):
+ in_masks = sequence_mask(ilens)
+ out_masks = sequence_mask(olens)
+ return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
+
+
+class Huber(nn.Module):
+ # pylint: disable=R0201
+ def forward(self, x, y, length=None):
+ """
+ Shapes:
+ x: B x T
+ y: B x T
+ length: B
+ """
+ mask = sequence_mask(sequence_length=length, max_len=y.size(1)).unsqueeze(2).float()
+ return torch.nn.functional.smooth_l1_loss(x * mask, y * mask, reduction="sum") / mask.sum()
+
+
+class ForwardSumLoss(nn.Module):
+ def __init__(self, blank_logprob=-1):
+ super().__init__()
+ self.log_softmax = torch.nn.LogSoftmax(dim=3)
+ self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
+ self.blank_logprob = blank_logprob
+
+ def forward(self, attn_logprob, in_lens, out_lens):
+ key_lens = in_lens
+ query_lens = out_lens
+ attn_logprob_padded = torch.nn.functional.pad(input=attn_logprob, pad=(1, 0), value=self.blank_logprob)
+
+ total_loss = 0.0
+ for bid in range(attn_logprob.shape[0]):
+ target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
+ curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[: query_lens[bid], :, : key_lens[bid] + 1]
+
+ curr_logprob = self.log_softmax(curr_logprob[None])[0]
+ loss = self.ctc_loss(
+ curr_logprob,
+ target_seq,
+ input_lengths=query_lens[bid : bid + 1],
+ target_lengths=key_lens[bid : bid + 1],
+ )
+ total_loss = total_loss + loss
+
+ total_loss = total_loss / attn_logprob.shape[0]
+ return total_loss
+
+
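A short sketch of how the alignment loss above is typically fed (shapes and lengths are illustrative): `attn_logprob` holds unnormalized attention scores with one query step per output frame and one key step per input token.

import torch
from TTS.tts.layers.losses import ForwardSumLoss

loss_fn = ForwardSumLoss()
attn_logprob = torch.randn(2, 1, 100, 30)   # [B, 1, T_out, T_in] unnormalized scores
in_lens = torch.tensor([30, 25])            # text lengths
out_lens = torch.tensor([100, 90])          # spectrogram lengths
loss = loss_fn(attn_logprob, in_lens, out_lens)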
+########################
+# MODEL LOSS LAYERS
+########################
+
+
+class TacotronLoss(torch.nn.Module):
+ """Collection of Tacotron set-up based on provided config."""
+
+ def __init__(self, c, ga_sigma=0.4):
+ super().__init__()
+ self.stopnet_pos_weight = c.stopnet_pos_weight
+ self.use_capacitron_vae = c.use_capacitron_vae
+ if self.use_capacitron_vae:
+ self.capacitron_capacity = c.capacitron_vae.capacitron_capacity
+ self.capacitron_vae_loss_alpha = c.capacitron_vae.capacitron_VAE_loss_alpha
+ self.ga_alpha = c.ga_alpha
+ self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
+ self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
+ self.decoder_alpha = c.decoder_loss_alpha
+ self.postnet_alpha = c.postnet_loss_alpha
+ self.decoder_ssim_alpha = c.decoder_ssim_alpha
+ self.postnet_ssim_alpha = c.postnet_ssim_alpha
+ self.config = c
+
+ # postnet and decoder loss
+ if c.loss_masking:
+ self.criterion = L1LossMasked(c.seq_len_norm) if c.model in ["Tacotron"] else MSELossMasked(c.seq_len_norm)
+ else:
+ self.criterion = nn.L1Loss() if c.model in ["Tacotron"] else nn.MSELoss()
+ # guided attention loss
+ if c.ga_alpha > 0:
+ self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
+ # differential spectral loss
+ if c.postnet_diff_spec_alpha > 0 or c.decoder_diff_spec_alpha > 0:
+ self.criterion_diff_spec = DifferentialSpectralLoss(loss_func=self.criterion)
+ # ssim loss
+ if c.postnet_ssim_alpha > 0 or c.decoder_ssim_alpha > 0:
+ self.criterion_ssim = SSIMLoss()
+ # stopnet loss
+ # pylint: disable=not-callable
+ self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None
+
+ # For dev purposes only
+ self.criterion_capacitron_reconstruction_loss = nn.L1Loss(reduction="sum")
+
+ def forward(
+ self,
+ postnet_output,
+ decoder_output,
+ mel_input,
+ linear_input,
+ stopnet_output,
+ stopnet_target,
+ stop_target_length,
+ capacitron_vae_outputs,
+ output_lens,
+ decoder_b_output,
+ alignments,
+ alignment_lens,
+ alignments_backwards,
+ input_lens,
+ ):
+ # decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
+ # the target should be set accordingly
+ postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
+
+ return_dict = {}
+ # remove lengths if no masking is applied
+ if not self.config.loss_masking:
+ output_lens = None
+ # decoder and postnet losses
+ if self.config.loss_masking:
+ if self.decoder_alpha > 0:
+ decoder_loss = self.criterion(decoder_output, mel_input, output_lens)
+ if self.postnet_alpha > 0:
+ postnet_loss = self.criterion(postnet_output, postnet_target, output_lens)
+ else:
+ if self.decoder_alpha > 0:
+ decoder_loss = self.criterion(decoder_output, mel_input)
+ if self.postnet_alpha > 0:
+ postnet_loss = self.criterion(postnet_output, postnet_target)
+ loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
+ return_dict["decoder_loss"] = decoder_loss
+ return_dict["postnet_loss"] = postnet_loss
+
+ if self.use_capacitron_vae:
+ # extract capacitron vae infos
+ posterior_distribution, prior_distribution, beta = capacitron_vae_outputs
+
+ # KL divergence term between the posterior and the prior
+ kl_term = torch.mean(torch.distributions.kl_divergence(posterior_distribution, prior_distribution))
+
+ # Limit the mutual information between the data and latent space by the variational capacity limit
+ kl_capacity = kl_term - self.capacitron_capacity
+
+ # pass beta through softplus to keep it positive
+ beta = torch.nn.functional.softplus(beta)[0]
+
+ # This is the term going to the main ADAM optimiser, we detach beta because
+ # beta is optimised by a separate, SGD optimiser below
+ capacitron_vae_loss = beta.detach() * kl_capacity
+
+            # normalize the capacitron_vae_loss as in L1Loss or MSELoss.
+            # After this, both the standard loss and capacitron_vae_loss will be on the same scale,
+            # so we don't need to use L1Loss and MSELoss in "sum" reduction mode.
+            # Note: the batch dimension is already accounted for because the L1Loss was calculated
+            # in "sum" mode and divided by the batch size, so not dividing capacitron_vae_loss by B is legitimate.
+
+ # get B T D dimension from input
+ B, T, D = mel_input.size()
+ # normalize
+ if self.config.loss_masking:
+                # if masking is used, compute the effective T from the mask
+ T = output_lens.sum() / B
+
+ # Only for dev purposes to be able to compare the reconstruction loss with the values in the
+ # original Capacitron paper
+ return_dict["capaciton_reconstruction_loss"] = (
+ self.criterion_capacitron_reconstruction_loss(decoder_output, mel_input) / decoder_output.size(0)
+ ) + kl_capacity
+
+ capacitron_vae_loss = capacitron_vae_loss / (T * D)
+ capacitron_vae_loss = capacitron_vae_loss * self.capacitron_vae_loss_alpha
+
+ # This is the term to purely optimise beta and to pass into the SGD optimizer
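+            # Minimising -beta * (KL - capacity) with SGD raises beta while the KL term
+            # exceeds the capacity and lowers it otherwise, so beta acts as a Lagrange
+            # multiplier for the capacity constraint.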
+ beta_loss = torch.negative(beta) * kl_capacity.detach()
+
+ loss += capacitron_vae_loss
+
+ return_dict["capacitron_vae_loss"] = capacitron_vae_loss
+ return_dict["capacitron_vae_beta_loss"] = beta_loss
+ return_dict["capacitron_vae_kl_term"] = kl_term
+ return_dict["capacitron_beta"] = beta
+
+ stop_loss = (
+ self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
+ if self.config.stopnet
+ else torch.zeros(1)
+ )
+ loss += stop_loss
+ return_dict["stopnet_loss"] = stop_loss
+
+ # backward decoder loss (if enabled)
+ if self.config.bidirectional_decoder:
+ if self.config.loss_masking:
+ decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1,)), mel_input, output_lens)
+ else:
+ decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1,)), mel_input)
+ decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1,)), decoder_output)
+ loss += self.decoder_alpha * (decoder_b_loss + decoder_c_loss)
+ return_dict["decoder_b_loss"] = decoder_b_loss
+ return_dict["decoder_c_loss"] = decoder_c_loss
+
+ # double decoder consistency loss (if enabled)
+ if self.config.double_decoder_consistency:
+ if self.config.loss_masking:
+ decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens)
+ else:
+ decoder_b_loss = self.criterion(decoder_b_output, mel_input)
+ # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output)
+ attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards)
+ loss += self.decoder_alpha * (decoder_b_loss + attention_c_loss)
+ return_dict["decoder_coarse_loss"] = decoder_b_loss
+ return_dict["decoder_ddc_loss"] = attention_c_loss
+
+ # guided attention loss (if enabled)
+ if self.config.ga_alpha > 0:
+ ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens)
+ loss += ga_loss * self.ga_alpha
+ return_dict["ga_loss"] = ga_loss
+
+ # decoder differential spectral loss
+ if self.config.decoder_diff_spec_alpha > 0:
+ decoder_diff_spec_loss = self.criterion_diff_spec(decoder_output, mel_input, output_lens)
+ loss += decoder_diff_spec_loss * self.decoder_diff_spec_alpha
+ return_dict["decoder_diff_spec_loss"] = decoder_diff_spec_loss
+
+ # postnet differential spectral loss
+ if self.config.postnet_diff_spec_alpha > 0:
+ postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
+ loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
+ return_dict["postnet_diff_spec_loss"] = postnet_diff_spec_loss
+
+ # decoder ssim loss
+ if self.config.decoder_ssim_alpha > 0:
+ decoder_ssim_loss = self.criterion_ssim(decoder_output, mel_input, output_lens)
+            loss += decoder_ssim_loss * self.decoder_ssim_alpha
+ return_dict["decoder_ssim_loss"] = decoder_ssim_loss
+
+ # postnet ssim loss
+ if self.config.postnet_ssim_alpha > 0:
+ postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
+ loss += postnet_ssim_loss * self.postnet_ssim_alpha
+ return_dict["postnet_ssim_loss"] = postnet_ssim_loss
+
+ return_dict["loss"] = loss
+ return return_dict
+
+
+class GlowTTSLoss(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.constant_factor = 0.5 * math.log(2 * math.pi)
+
+ def forward(self, z, means, scales, log_det, y_lengths, o_dur_log, o_attn_dur, x_lengths):
+ return_dict = {}
+ # flow loss - neg log likelihood
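+        # `scales` are log standard deviations: the two sums below form the Gaussian
+        # negative log-likelihood of z (up to the 0.5 * log(2 * pi) constant) and
+        # `log_det` is the change-of-variables correction from the flow.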
+ pz = torch.sum(scales) + 0.5 * torch.sum(torch.exp(-2 * scales) * (z - means) ** 2)
+ log_mle = self.constant_factor + (pz - torch.sum(log_det)) / (torch.sum(y_lengths) * z.shape[2])
+ # duration loss - MSE
+ loss_dur = torch.sum((o_dur_log - o_attn_dur) ** 2) / torch.sum(x_lengths)
+ # duration loss - huber loss
+ # loss_dur = torch.nn.functional.smooth_l1_loss(o_dur_log, o_attn_dur, reduction="sum") / torch.sum(x_lengths)
+ return_dict["loss"] = log_mle + loss_dur
+ return_dict["log_mle"] = log_mle
+ return_dict["loss_dur"] = loss_dur
+
+ # check if any loss is NaN
+ for key, loss in return_dict.items():
+ if torch.isnan(loss):
+ raise RuntimeError(f" [!] NaN loss with {key}.")
+ return return_dict
+
+
+def mse_loss_custom(x, y):
+ """MSE loss using the torch back-end without reduction.
+ It uses less VRAM than the raw code"""
+ expanded_x, expanded_y = torch.broadcast_tensors(x, y)
+ return torch._C._nn.mse_loss(expanded_x, expanded_y, 0) # pylint: disable=protected-access, c-extension-no-member
+
+
+class MDNLoss(nn.Module):
+ """Mixture of Density Network Loss as described in https://arxiv.org/pdf/2003.01950.pdf."""
+
+ def forward(self, logp, text_lengths, mel_lengths): # pylint: disable=no-self-use
+ """
+ Shapes:
+ mu: [B, D, T]
+ log_sigma: [B, D, T]
+ mel_spec: [B, D, T]
+ """
+ B, T_seq, T_mel = logp.shape
+ log_alpha = logp.new_ones(B, T_seq, T_mel) * (-1e4)
+ log_alpha[:, 0, 0] = logp[:, 0, 0]
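+        # Forward (alpha) recursion over a monotonic alignment lattice: at mel frame t,
+        # each text position can be reached either from itself or from the previous text
+        # position at frame t - 1.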
+ for t in range(1, T_mel):
+ prev_step = torch.cat(
+ [log_alpha[:, :, t - 1 : t], functional.pad(log_alpha[:, :, t - 1 : t], (0, 0, 1, -1), value=-1e4)],
+ dim=-1,
+ )
+ log_alpha[:, :, t] = torch.logsumexp(prev_step + 1e-4, dim=-1) + logp[:, :, t]
+ alpha_last = log_alpha[torch.arange(B), text_lengths - 1, mel_lengths - 1]
+ mdn_loss = -alpha_last.mean() / T_seq
+ return mdn_loss # , log_prob_matrix
+
+
+class AlignTTSLoss(nn.Module):
+ """Modified AlignTTS Loss.
+ Computes
+ - L1 and SSIM losses from output spectrograms.
+    - MSE loss for the duration predictor.
+    - MDNLoss for the Mixture of Density Network.
+
+    All loss values are aggregated by a weighted sum with the corresponding alpha values.
+
+ Args:
+ c (dict): TTS model configuration.
+ """
+
+ def __init__(self, c):
+ super().__init__()
+ self.mdn_loss = MDNLoss()
+ self.spec_loss = MSELossMasked(False)
+ self.ssim = SSIMLoss()
+ self.dur_loss = MSELossMasked(False)
+
+ self.ssim_alpha = c.ssim_alpha
+ self.dur_loss_alpha = c.dur_loss_alpha
+ self.spec_loss_alpha = c.spec_loss_alpha
+ self.mdn_alpha = c.mdn_alpha
+
+ def forward(
+ self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, phase
+ ):
+ # ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step)
+ spec_loss, ssim_loss, dur_loss, mdn_loss = 0, 0, 0, 0
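+        # Phase schedule, as implied by the branches below: 0 trains only the MDN aligner,
+        # 1 only the spectrogram losses, 2 the aligner plus spectrogram losses, 3 only the
+        # duration predictor, and any later phase trains everything jointly.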
+ if phase == 0:
+ mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
+ elif phase == 1:
+ spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
+ ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
+ elif phase == 2:
+ mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
+            spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
+ ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
+ elif phase == 3:
+ dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens)
+ else:
+ mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens)
+ spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
+ ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
+ dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens)
+ loss = (
+ self.spec_loss_alpha * spec_loss
+ + self.ssim_alpha * ssim_loss
+ + self.dur_loss_alpha * dur_loss
+ + self.mdn_alpha * mdn_loss
+ )
+ return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss}
+
+
+class VitsGeneratorLoss(nn.Module):
+ def __init__(self, c: Coqpit):
+ super().__init__()
+ self.kl_loss_alpha = c.kl_loss_alpha
+ self.gen_loss_alpha = c.gen_loss_alpha
+ self.feat_loss_alpha = c.feat_loss_alpha
+ self.dur_loss_alpha = c.dur_loss_alpha
+ self.mel_loss_alpha = c.mel_loss_alpha
+ self.spk_encoder_loss_alpha = c.speaker_encoder_loss_alpha
+ self.stft = TorchSTFT(
+ c.audio.fft_size,
+ c.audio.hop_length,
+ c.audio.win_length,
+ sample_rate=c.audio.sample_rate,
+ mel_fmin=c.audio.mel_fmin,
+ mel_fmax=c.audio.mel_fmax,
+ n_mels=c.audio.num_mels,
+ use_mel=True,
+ do_amp_to_db=True,
+ )
+
+ @staticmethod
+ def feature_loss(feats_real, feats_generated):
+ loss = 0
+ for dr, dg in zip(feats_real, feats_generated):
+ for rl, gl in zip(dr, dg):
+ rl = rl.float().detach()
+ gl = gl.float()
+ loss += torch.mean(torch.abs(rl - gl))
+ return loss * 2
+
+ @staticmethod
+ def generator_loss(scores_fake):
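+        # Least-squares GAN generator objective: push every discriminator score towards 1.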
+ loss = 0
+ gen_losses = []
+ for dg in scores_fake:
+ dg = dg.float()
+ l = torch.mean((1 - dg) ** 2)
+ gen_losses.append(l)
+ loss += l
+
+ return loss, gen_losses
+
+ @staticmethod
+ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
+ """
+ z_p, logs_q: [b, h, t_t]
+ m_p, logs_p: [b, h, t_t]
+ """
+ z_p = z_p.float()
+ logs_q = logs_q.float()
+ m_p = m_p.float()
+ logs_p = logs_p.float()
+ z_mask = z_mask.float()
+
+ kl = logs_p - logs_q - 0.5
+ kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
+ kl = torch.sum(kl * z_mask)
+ l = kl / torch.sum(z_mask)
+ return l
+
+ @staticmethod
+ def cosine_similarity_loss(gt_spk_emb, syn_spk_emb):
+ return -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean()
+
+ def forward(
+ self,
+ mel_slice,
+ mel_slice_hat,
+ z_p,
+ logs_q,
+ m_p,
+ logs_p,
+ z_len,
+ scores_disc_fake,
+ feats_disc_fake,
+ feats_disc_real,
+ loss_duration,
+ use_speaker_encoder_as_loss=False,
+ gt_spk_emb=None,
+ syn_spk_emb=None,
+ ):
+ """
+ Shapes:
+ - mel_slice : :math:`[B, 1, T]`
+ - mel_slice_hat: :math:`[B, 1, T]`
+ - z_p: :math:`[B, C, T]`
+ - logs_q: :math:`[B, C, T]`
+ - m_p: :math:`[B, C, T]`
+ - logs_p: :math:`[B, C, T]`
+ - z_len: :math:`[B]`
+ - scores_disc_fake[i]: :math:`[B, C]`
+ - feats_disc_fake[i][j]: :math:`[B, C, T', P]`
+ - feats_disc_real[i][j]: :math:`[B, C, T', P]`
+ """
+ loss = 0.0
+ return_dict = {}
+ z_mask = sequence_mask(z_len).float()
+ # compute losses
+ loss_kl = (
+ self.kl_loss(z_p=z_p, logs_q=logs_q, m_p=m_p, logs_p=logs_p, z_mask=z_mask.unsqueeze(1))
+ * self.kl_loss_alpha
+ )
+ loss_feat = (
+ self.feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha
+ )
+ loss_gen = self.generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha
+ loss_mel = torch.nn.functional.l1_loss(mel_slice, mel_slice_hat) * self.mel_loss_alpha
+ loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha
+ loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration
+
+ if use_speaker_encoder_as_loss:
+ loss_se = self.cosine_similarity_loss(gt_spk_emb, syn_spk_emb) * self.spk_encoder_loss_alpha
+ loss = loss + loss_se
+ return_dict["loss_spk_encoder"] = loss_se
+ # pass losses to the dict
+ return_dict["loss_gen"] = loss_gen
+ return_dict["loss_kl"] = loss_kl
+ return_dict["loss_feat"] = loss_feat
+ return_dict["loss_mel"] = loss_mel
+ return_dict["loss_duration"] = loss_duration
+ return_dict["loss"] = loss
+ return return_dict
+
+
+class VitsDiscriminatorLoss(nn.Module):
+ def __init__(self, c: Coqpit):
+ super().__init__()
+ self.disc_loss_alpha = c.disc_loss_alpha
+
+ @staticmethod
+ def discriminator_loss(scores_real, scores_fake):
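+        # Least-squares GAN discriminator objective: real scores towards 1, fake scores towards 0.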
+ loss = 0
+ real_losses = []
+ fake_losses = []
+ for dr, dg in zip(scores_real, scores_fake):
+ dr = dr.float()
+ dg = dg.float()
+ real_loss = torch.mean((1 - dr) ** 2)
+ fake_loss = torch.mean(dg**2)
+ loss += real_loss + fake_loss
+ real_losses.append(real_loss.item())
+ fake_losses.append(fake_loss.item())
+ return loss, real_losses, fake_losses
+
+ def forward(self, scores_disc_real, scores_disc_fake):
+ loss = 0.0
+ return_dict = {}
+ loss_disc, loss_disc_real, _ = self.discriminator_loss(
+ scores_real=scores_disc_real, scores_fake=scores_disc_fake
+ )
+ return_dict["loss_disc"] = loss_disc * self.disc_loss_alpha
+ loss = loss + return_dict["loss_disc"]
+ return_dict["loss"] = loss
+
+ for i, ldr in enumerate(loss_disc_real):
+ return_dict[f"loss_disc_real_{i}"] = ldr
+ return return_dict
+
+
+class ForwardTTSLoss(nn.Module):
+ """Generic configurable ForwardTTS loss."""
+
+ def __init__(self, c):
+ super().__init__()
+ if c.spec_loss_type == "mse":
+ self.spec_loss = MSELossMasked(False)
+ elif c.spec_loss_type == "l1":
+ self.spec_loss = L1LossMasked(False)
+ else:
+ raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type))
+
+ if c.duration_loss_type == "mse":
+ self.dur_loss = MSELossMasked(False)
+ elif c.duration_loss_type == "l1":
+ self.dur_loss = L1LossMasked(False)
+ elif c.duration_loss_type == "huber":
+ self.dur_loss = Huber()
+ else:
+ raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type))
+
+ if c.model_args.use_aligner:
+ self.aligner_loss = ForwardSumLoss()
+ self.aligner_loss_alpha = c.aligner_loss_alpha
+
+ if c.model_args.use_pitch:
+ self.pitch_loss = MSELossMasked(False)
+ self.pitch_loss_alpha = c.pitch_loss_alpha
+
+ if c.model_args.use_energy:
+ self.energy_loss = MSELossMasked(False)
+ self.energy_loss_alpha = c.energy_loss_alpha
+
+        if c.use_ssim_loss:
+            self.ssim = SSIMLoss()
+ self.ssim_loss_alpha = c.ssim_loss_alpha
+
+ self.spec_loss_alpha = c.spec_loss_alpha
+ self.dur_loss_alpha = c.dur_loss_alpha
+ self.binary_alignment_loss_alpha = c.binary_align_loss_alpha
+
+ @staticmethod
+ def _binary_alignment_loss(alignment_hard, alignment_soft):
+ """Binary loss that forces soft alignments to match the hard alignments as
+ explained in `https://arxiv.org/pdf/2108.10447.pdf`.
+ """
+ log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum()
+ return -log_sum / alignment_hard.sum()
+
+ def forward(
+ self,
+ decoder_output,
+ decoder_target,
+ decoder_output_lens,
+ dur_output,
+ dur_target,
+ pitch_output,
+ pitch_target,
+ energy_output,
+ energy_target,
+ input_lens,
+ alignment_logprob=None,
+ alignment_hard=None,
+ alignment_soft=None,
+ binary_loss_weight=None,
+ ):
+ loss = 0
+ return_dict = {}
+ if hasattr(self, "ssim_loss") and self.ssim_loss_alpha > 0:
+ ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens)
+ loss = loss + self.ssim_loss_alpha * ssim_loss
+ return_dict["loss_ssim"] = self.ssim_loss_alpha * ssim_loss
+
+ if self.spec_loss_alpha > 0:
+ spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens)
+ loss = loss + self.spec_loss_alpha * spec_loss
+ return_dict["loss_spec"] = self.spec_loss_alpha * spec_loss
+
+ if self.dur_loss_alpha > 0:
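+            # Durations are compared in the log domain; the +1 offset keeps log() finite
+            # for zero-length targets.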
+ log_dur_tgt = torch.log(dur_target.float() + 1)
+ dur_loss = self.dur_loss(dur_output[:, :, None], log_dur_tgt[:, :, None], input_lens)
+ loss = loss + self.dur_loss_alpha * dur_loss
+ return_dict["loss_dur"] = self.dur_loss_alpha * dur_loss
+
+ if hasattr(self, "pitch_loss") and self.pitch_loss_alpha > 0:
+ pitch_loss = self.pitch_loss(pitch_output.transpose(1, 2), pitch_target.transpose(1, 2), input_lens)
+ loss = loss + self.pitch_loss_alpha * pitch_loss
+ return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss
+
+ if hasattr(self, "energy_loss") and self.energy_loss_alpha > 0:
+ energy_loss = self.energy_loss(energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens)
+ loss = loss + self.energy_loss_alpha * energy_loss
+ return_dict["loss_energy"] = self.energy_loss_alpha * energy_loss
+
+ if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
+ aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
+ loss = loss + self.aligner_loss_alpha * aligner_loss
+ return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss
+
+ if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None:
+ binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft)
+ loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss
+ if binary_loss_weight:
+ return_dict["loss_binary_alignment"] = (
+ self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight
+ )
+ else:
+ return_dict["loss_binary_alignment"] = self.binary_alignment_loss_alpha * binary_alignment_loss
+
+ return_dict["loss"] = loss
+ return return_dict
diff --git a/submodules/TTS/TTS/tts/layers/overflow/__init__.py b/submodules/TTS/TTS/tts/layers/overflow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/overflow/common_layers.py b/submodules/TTS/TTS/tts/layers/overflow/common_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b036dd1bda92fb709f0cce796cf5a668a1c081df
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/overflow/common_layers.py
@@ -0,0 +1,323 @@
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from tqdm.auto import tqdm
+
+from TTS.tts.layers.tacotron.common_layers import Linear
+from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
+
+
+class Encoder(nn.Module):
+ r"""Neural HMM Encoder
+
+    Same as the Tacotron 2 encoder, but expands the output length by the number of states per phone
+
+ Args:
+ num_chars (int): Number of characters in the input.
+ state_per_phone (int): Number of states per phone.
+ in_out_channels (int): number of input and output channels.
+ n_convolutions (int): number of convolutional layers.
+ """
+
+ def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutions=3):
+ super().__init__()
+
+ self.state_per_phone = state_per_phone
+ self.in_out_channels = in_out_channels
+
+ self.emb = nn.Embedding(num_chars, in_out_channels)
+ self.convolutions = nn.ModuleList()
+ for _ in range(n_convolutions):
+ self.convolutions.append(ConvBNBlock(in_out_channels, in_out_channels, 5, "relu"))
+ self.lstm = nn.LSTM(
+ in_out_channels,
+ int(in_out_channels / 2) * state_per_phone,
+ num_layers=1,
+ batch_first=True,
+ bias=True,
+ bidirectional=True,
+ )
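+        # The bidirectional LSTM emits 2 * (in_out_channels / 2) * state_per_phone features
+        # per input token, which forward() reshapes into `state_per_phone` states per phone,
+        # each of size in_out_channels.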
+ self.rnn_state = None
+
+ def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]:
+ """Forward pass to the encoder.
+
+ Args:
+ x (torch.FloatTensor): input text indices.
+ - shape: :math:`(b, T_{in})`
+ x_len (torch.LongTensor): input text lengths.
+ - shape: :math:`(b,)`
+
+ Returns:
+ Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
+                - shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
+ """
+ b, T = x.shape
+ o = self.emb(x).transpose(1, 2)
+ for layer in self.convolutions:
+ o = layer(o)
+ o = o.transpose(1, 2)
+ o = nn.utils.rnn.pack_padded_sequence(o, x_len.cpu(), batch_first=True)
+ self.lstm.flatten_parameters()
+ o, _ = self.lstm(o)
+ o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
+ o = o.reshape(b, T * self.state_per_phone, self.in_out_channels)
+ x_len = x_len * self.state_per_phone
+ return o, x_len
+
+ def inference(self, x, x_len):
+ """Inference to the encoder.
+
+ Args:
+ x (torch.FloatTensor): input text indices.
+ - shape: :math:`(b, T_{in})`
+ x_len (torch.LongTensor): input text lengths.
+ - shape: :math:`(b,)`
+
+ Returns:
+ Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
+                - shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
+ """
+ b, T = x.shape
+ o = self.emb(x).transpose(1, 2)
+ for layer in self.convolutions:
+ o = layer(o)
+ o = o.transpose(1, 2)
+ # self.lstm.flatten_parameters()
+ o, _ = self.lstm(o)
+ o = o.reshape(b, T * self.state_per_phone, self.in_out_channels)
+ x_len = x_len * self.state_per_phone
+ return o, x_len
+
+
+class ParameterModel(nn.Module):
+ r"""Main neural network of the outputnet
+
+ Note: Do not put dropout layers here, the model will not converge.
+
+ Args:
+ outputnet_size (List[int]): the architecture of the parameter model
+ input_size (int): size of input for the first layer
+        output_size (int): size of the output layer, i.e. the number of predicted parameters per state
+ frame_channels (int): feature dim to set the flat start bias
+ flat_start_params (dict): flat start parameters to set the bias
+ """
+
+ def __init__(
+ self,
+ outputnet_size: List[int],
+ input_size: int,
+ output_size: int,
+ frame_channels: int,
+ flat_start_params: dict,
+ ):
+ super().__init__()
+ self.frame_channels = frame_channels
+
+ self.layers = nn.ModuleList(
+ [Linear(inp, out) for inp, out in zip([input_size] + outputnet_size[:-1], outputnet_size)]
+ )
+ self.last_layer = nn.Linear(outputnet_size[-1], output_size)
+ self.flat_start_output_layer(
+ flat_start_params["mean"], flat_start_params["std"], flat_start_params["transition_p"]
+ )
+
+ def flat_start_output_layer(self, mean, std, transition_p):
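+        # Zero the weights and set the bias so that, at initialisation, the output net
+        # predicts exactly the flat-start mean, std (through softplus) and transition
+        # probability (through sigmoid) for every state.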
+ self.last_layer.weight.data.zero_()
+ self.last_layer.bias.data[0 : self.frame_channels] = mean
+ self.last_layer.bias.data[self.frame_channels : 2 * self.frame_channels] = OverflowUtils.inverse_softplus(std)
+ self.last_layer.bias.data[2 * self.frame_channels :] = OverflowUtils.inverse_sigmod(transition_p)
+
+ def forward(self, x):
+ for layer in self.layers:
+ x = F.relu(layer(x))
+ x = self.last_layer(x)
+ return x
+
+
+class Outputnet(nn.Module):
+ r"""
+ This network takes current state and previous observed values as input
+ and returns its parameters, mean, standard deviation and probability
+ of transition to the next state
+ """
+
+ def __init__(
+ self,
+ encoder_dim: int,
+ memory_rnn_dim: int,
+ frame_channels: int,
+ outputnet_size: List[int],
+ flat_start_params: dict,
+ std_floor: float = 1e-2,
+ ):
+ super().__init__()
+
+ self.frame_channels = frame_channels
+ self.flat_start_params = flat_start_params
+ self.std_floor = std_floor
+
+ input_size = memory_rnn_dim + encoder_dim
+ output_size = 2 * frame_channels + 1
+
+ self.parametermodel = ParameterModel(
+ outputnet_size=outputnet_size,
+ input_size=input_size,
+ output_size=output_size,
+ flat_start_params=flat_start_params,
+ frame_channels=frame_channels,
+ )
+
+ def forward(self, ar_mels, inputs):
+ r"""Inputs observation and returns the means, stds and transition probability for the current state
+
+ Args:
+ ar_mel_inputs (torch.FloatTensor): shape (batch, prenet_dim)
+ states (torch.FloatTensor): (batch, hidden_states, hidden_state_dim)
+
+ Returns:
+ means: means for the emission observation for each feature
+ - shape: (B, hidden_states, feature_size)
+ stds: standard deviations for the emission observation for each feature
+ - shape: (batch, hidden_states, feature_size)
+ transition_vectors: transition vector for the current hidden state
+ - shape: (batch, hidden_states)
+ """
+ batch_size, prenet_dim = ar_mels.shape[0], ar_mels.shape[1]
+ N = inputs.shape[1]
+
+ ar_mels = ar_mels.unsqueeze(1).expand(batch_size, N, prenet_dim)
+ ar_mels = torch.cat((ar_mels, inputs), dim=2)
+ ar_mels = self.parametermodel(ar_mels)
+
+ mean, std, transition_vector = (
+ ar_mels[:, :, 0 : self.frame_channels],
+ ar_mels[:, :, self.frame_channels : 2 * self.frame_channels],
+ ar_mels[:, :, 2 * self.frame_channels :].squeeze(2),
+ )
+ std = F.softplus(std)
+ std = self._floor_std(std)
+ return mean, std, transition_vector
+
+ def _floor_std(self, std):
+ r"""
+        Clamps the standard deviation so that it does not fall below a floor value.
+        This prevents the model from cheating for higher likelihoods by collapsing
+        one of the Gaussians into a point mass.
+
+        Args:
+            std (float Tensor): tensor containing the standard deviations to be floored
+ """
+ original_tensor = std.clone().detach()
+ std = torch.clamp(std, min=self.std_floor)
+ if torch.any(original_tensor != std):
+ print(
+ "[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
+ )
+ return std
+
+
+class OverflowUtils:
+ @staticmethod
+ def get_data_parameters_for_flat_start(
+ data_loader: torch.utils.data.DataLoader, out_channels: int, states_per_phone: int
+ ):
+ """Generates data parameters for flat starting the HMM.
+
+ Args:
+            data_loader (torch.utils.data.DataLoader): data loader over the training set
+            out_channels (int): mel spectrogram channels
+            states_per_phone (int): HMM states per phone
+ """
+
+ # State related information for transition_p
+ total_state_len = 0
+ total_mel_len = 0
+
+        # Useful for data mean and std
+ total_mel_sum = 0
+ total_mel_sq_sum = 0
+
+ for batch in tqdm(data_loader, leave=False):
+ text_lengths = batch["token_id_lengths"]
+ mels = batch["mel"]
+ mel_lengths = batch["mel_lengths"]
+
+ total_state_len += torch.sum(text_lengths)
+ total_mel_len += torch.sum(mel_lengths)
+ total_mel_sum += torch.sum(mels)
+ total_mel_sq_sum += torch.sum(torch.pow(mels, 2))
+
+ data_mean = total_mel_sum / (total_mel_len * out_channels)
+ data_std = torch.sqrt((total_mel_sq_sum / (total_mel_len * out_channels)) - torch.pow(data_mean, 2))
+ average_num_states = total_state_len / len(data_loader.dataset)
+ average_mel_len = total_mel_len / len(data_loader.dataset)
+ average_duration_each_state = average_mel_len / average_num_states
+ init_transition_prob = 1 / average_duration_each_state
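+        # `total_state_len` counts input tokens, so `init_transition_prob` is 1 / (frames per
+        # token); multiplying by `states_per_phone` in the return value converts this to
+        # 1 / (frames per state).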
+
+ return data_mean, data_std, (init_transition_prob * states_per_phone)
+
+ @staticmethod
+ @torch.no_grad()
+ def update_flat_start_transition(model, transition_p):
+ model.neural_hmm.output_net.parametermodel.flat_start_output_layer(0.0, 1.0, transition_p)
+
+ @staticmethod
+ def log_clamped(x, eps=1e-04):
+ """
+ Avoids the log(0) problem
+
+ Args:
+ x (torch.tensor): input tensor
+ eps (float, optional): lower bound. Defaults to 1e-04.
+
+ Returns:
+ torch.tensor: :math:`log(x)`
+ """
+ clamped_x = torch.clamp(x, min=eps)
+ return torch.log(clamped_x)
+
+ @staticmethod
+ def inverse_sigmod(x):
+ r"""
+ Inverse of the sigmoid function
+ """
+ if not torch.is_tensor(x):
+ x = torch.tensor(x)
+ return OverflowUtils.log_clamped(x / (1.0 - x))
+
+ @staticmethod
+ def inverse_softplus(x):
+ r"""
+ Inverse of the softplus function
+ """
+ if not torch.is_tensor(x):
+ x = torch.tensor(x)
+ return OverflowUtils.log_clamped(torch.exp(x) - 1.0)
+
+ @staticmethod
+ def logsumexp(x, dim):
+ r"""
+        Differentiable LogSumExp: does not create NaN gradients
+        when all the inputs are -inf; it yields 0 gradients in that case.
+ Args:
+ x : torch.Tensor - The input tensor
+ dim: int - The dimension on which the log sum exp has to be applied
+ """
+
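+        # Where every input along `dim` is -inf, (x - m) would be nan; masking the max to 0
+        # before the subtraction and restoring -inf afterwards keeps the value correct and
+        # the gradients zero instead of nan.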
+ m, _ = x.max(dim=dim)
+ mask = m == -float("inf")
+ s = (x - m.masked_fill_(mask, 0).unsqueeze(dim=dim)).exp().sum(dim=dim)
+ return s.masked_fill_(mask, 1).log() + m.masked_fill_(mask, -float("inf"))
+
+ @staticmethod
+ def double_pad(list_of_different_shape_tensors):
+ r"""
+ Pads the list of tensors in 2 dimensions
+ """
+ second_dim_lens = [len(a) for a in [i[0] for i in list_of_different_shape_tensors]]
+ second_dim_max = max(second_dim_lens)
+ padded_x = [F.pad(x, (0, second_dim_max - len(x[0]))) for x in list_of_different_shape_tensors]
+ return nn.utils.rnn.pad_sequence(padded_x, batch_first=True)
diff --git a/submodules/TTS/TTS/tts/layers/overflow/decoder.py b/submodules/TTS/TTS/tts/layers/overflow/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cd7ae88068cfaffe179f2e61354cc7eb760268c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/overflow/decoder.py
@@ -0,0 +1,81 @@
+import torch
+from torch import nn
+
+from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
+from TTS.tts.utils.helpers import sequence_mask
+
+
+class Decoder(nn.Module):
+ """Uses glow decoder with some modifications.
+ ::
+
+ Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
+
+ Args:
+ in_channels (int): channels of input tensor.
+ hidden_channels (int): hidden decoder channels.
+ kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
+ dilation_rate (int): rate to increase dilation by each layer in a decoder block.
+ num_flow_blocks (int): number of decoder blocks.
+ num_coupling_layers (int): number coupling layers. (number of wavenet layers.)
+ dropout_p (float): wavenet dropout rate.
+ sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_flow_blocks,
+ num_coupling_layers,
+ dropout_p=0.0,
+ num_splits=4,
+ num_squeeze=2,
+ sigmoid_scale=False,
+ c_in_channels=0,
+ ):
+ super().__init__()
+
+ self.glow_decoder = GlowDecoder(
+ in_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_flow_blocks,
+ num_coupling_layers,
+ dropout_p,
+ num_splits,
+ num_squeeze,
+ sigmoid_scale,
+ c_in_channels,
+ )
+ self.n_sqz = num_squeeze
+
+ def forward(self, x, x_len, g=None, reverse=False):
+ """
+ Input shapes:
+ - x: :math:`[B, C, T]`
+ - x_len :math:`[B]`
+ - g: :math:`[B, C]`
+
+ Output shapes:
+ - x: :math:`[B, C, T]`
+ - x_len :math:`[B]`
+            - logdet_tot :math:`[B]`
+ """
+ x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
+ x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
+ x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse)
+ return x, x_len, logdet_tot
+
+ def preprocess(self, y, y_lengths, y_max_length):
+ if y_max_length is not None:
+ y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
+ y = y[:, :, :y_max_length]
+ y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
+ return y, y_lengths, y_max_length
+
+ def store_inverse(self):
+ self.glow_decoder.store_inverse()
diff --git a/submodules/TTS/TTS/tts/layers/overflow/neural_hmm.py b/submodules/TTS/TTS/tts/layers/overflow/neural_hmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..0631ba98c00029e9871c965e4c7f465aa32bc406
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/overflow/neural_hmm.py
@@ -0,0 +1,553 @@
+from typing import List
+
+import torch
+import torch.distributions as tdist
+import torch.nn.functional as F
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+from TTS.tts.layers.overflow.common_layers import Outputnet, OverflowUtils
+from TTS.tts.layers.tacotron.common_layers import Prenet
+from TTS.tts.utils.helpers import sequence_mask
+
+
+class NeuralHMM(nn.Module):
+ """Autoregressive left to right HMM model primarily used in "Neural HMMs are all you need (for high-quality attention-free TTS)"
+
+ Paper::
+ https://arxiv.org/abs/2108.13320
+
+ Paper abstract::
+ Neural sequence-to-sequence TTS has achieved significantly better output quality than statistical speech synthesis using
+ HMMs. However, neural TTS is generally not probabilistic and uses non-monotonic attention. Attention failures increase
+ training time and can make synthesis babble incoherently. This paper describes how the old and new paradigms can be
+ combined to obtain the advantages of both worlds, by replacing attention in neural TTS with an autoregressive left-right
+ no-skip hidden Markov model defined by a neural network. Based on this proposal, we modify Tacotron 2 to obtain an
+ HMM-based neural TTS model with monotonic alignment, trained to maximise the full sequence likelihood without
+ approximation. We also describe how to combine ideas from classical and contemporary TTS for best results. The resulting
+ example system is smaller and simpler than Tacotron 2, and learns to speak with fewer iterations and less data, whilst
+ achieving comparable naturalness prior to the post-net. Our approach also allows easy control over speaking rate.
+
+ Args:
+ frame_channels (int): Output dimension to generate.
+ ar_order (int): Autoregressive order of the model. In ablations of Neural HMM it was found that more autoregression while giving more variation hurts naturalness of the synthesised audio.
+        deterministic_transition (bool): deterministic duration generation based on duration quantiles as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
+ encoder_dim (int): Channels of encoder input and character embedding tensors. Defaults to 512.
+ prenet_type (str): `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the Prenet.
+ prenet_dim (int): Dimension of the Prenet.
+ prenet_n_layers (int): Number of layers in the Prenet.
+ prenet_dropout (float): Dropout probability of the Prenet.
+ prenet_dropout_at_inference (bool): If True, dropout is applied at inference time.
+ memory_rnn_dim (int): Size of the memory RNN to process output of prenet.
+ outputnet_size (List[int]): Size of the output network inside the neural HMM.
+ flat_start_params (dict): Parameters for the flat start initialization of the neural HMM.
+ std_floor (float): Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint.
+ use_grad_checkpointing (bool, optional): Use gradient checkpointing to save memory. Defaults to True.
+ """
+
+ def __init__(
+ self,
+ frame_channels: int,
+ ar_order: int,
+ deterministic_transition: bool,
+ encoder_dim: int,
+ prenet_type: str,
+ prenet_dim: int,
+ prenet_n_layers: int,
+ prenet_dropout: float,
+ prenet_dropout_at_inference: bool,
+ memory_rnn_dim: int,
+ outputnet_size: List[int],
+ flat_start_params: dict,
+ std_floor: float,
+ use_grad_checkpointing: bool = True,
+ ):
+ super().__init__()
+
+ self.frame_channels = frame_channels
+ self.ar_order = ar_order
+ self.deterministic_transition = deterministic_transition
+ self.prenet_dim = prenet_dim
+ self.memory_rnn_dim = memory_rnn_dim
+ self.use_grad_checkpointing = use_grad_checkpointing
+
+ self.transition_model = TransitionModel()
+ self.emission_model = EmissionModel()
+
+        assert ar_order > 0, f"AR order must be greater than 0, provided {ar_order}"
+
+ self.ar_order = ar_order
+ self.prenet = Prenet(
+ in_features=frame_channels * ar_order,
+ prenet_type=prenet_type,
+ prenet_dropout=prenet_dropout,
+ dropout_at_inference=prenet_dropout_at_inference,
+ out_features=[self.prenet_dim for _ in range(prenet_n_layers)],
+ bias=False,
+ )
+ self.memory_rnn = nn.LSTMCell(input_size=prenet_dim, hidden_size=memory_rnn_dim)
+ self.output_net = Outputnet(
+ encoder_dim, memory_rnn_dim, frame_channels, outputnet_size, flat_start_params, std_floor
+ )
+ self.register_buffer("go_tokens", torch.zeros(ar_order, 1))
+
+ def forward(self, inputs, inputs_len, mels, mel_lens):
+ r"""HMM forward algorithm for training uses logarithmic version of Rabiner (1989) forward algorithm.
+
+ Args:
+ inputs (torch.FloatTensor): Encoder outputs
+ inputs_len (torch.LongTensor): Encoder output lengths
+ mels (torch.FloatTensor): Mel inputs
+ mel_lens (torch.LongTensor): Length of mel inputs
+
+ Shapes:
+ - inputs: (B, T, D_out_enc)
+ - inputs_len: (B)
+ - mels: (B, D_mel, T_mel)
+ - mel_lens: (B)
+
+ Returns:
+ log_prob (torch.FloatTensor): Log probability of the sequence
+ """
+ # Get dimensions of inputs
+ batch_size, N, _ = inputs.shape
+ T_max = torch.max(mel_lens)
+ mels = mels.permute(0, 2, 1)
+
+        # Initialize forward algorithm
+ log_state_priors = self._initialize_log_state_priors(inputs)
+ log_c, log_alpha_scaled, transition_matrix, means = self._initialize_forward_algorithm_variables(mels, N)
+
+ # Initialize autoregression elements
+ ar_inputs = self._add_go_token(mels)
+ h_memory, c_memory = self._init_lstm_states(batch_size, self.memory_rnn_dim, mels)
+
+ for t in range(T_max):
+ # Process Autoregression
+ h_memory, c_memory = self._process_ar_timestep(t, ar_inputs, h_memory, c_memory)
+ # Get mean, std and transition vector from decoder for this timestep
+            # Note: gradient checkpointing currently doesn't work with multiple GPUs inside a loop
+ if self.use_grad_checkpointing and self.training:
+ mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs)
+ else:
+ mean, std, transition_vector = self.output_net(h_memory, inputs)
+
+ if t == 0:
+ log_alpha_temp = log_state_priors + self.emission_model(mels[:, 0], mean, std, inputs_len)
+ else:
+ log_alpha_temp = self.emission_model(mels[:, t], mean, std, inputs_len) + self.transition_model(
+ log_alpha_scaled[:, t - 1, :], transition_vector, inputs_len
+ )
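+            # Rabiner-style scaling in the log domain: log_c[:, t] is the per-frame
+            # normaliser and the scaled alphas keep the recursion numerically stable.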
+ log_c[:, t] = torch.logsumexp(log_alpha_temp, dim=1)
+ log_alpha_scaled[:, t, :] = log_alpha_temp - log_c[:, t].unsqueeze(1)
+ transition_matrix[:, t] = transition_vector # needed for absorption state calculation
+
+ # Save for plotting
+ means.append(mean.detach())
+
+ log_c, log_alpha_scaled = self._mask_lengths(mel_lens, log_c, log_alpha_scaled)
+
+ sum_final_log_c = self.get_absorption_state_scaling_factor(
+ mel_lens, log_alpha_scaled, inputs_len, transition_matrix
+ )
+
+ log_probs = torch.sum(log_c, dim=1) + sum_final_log_c
+
+ return log_probs, log_alpha_scaled, transition_matrix, means
+
+ @staticmethod
+ def _mask_lengths(mel_lens, log_c, log_alpha_scaled):
+ """
+        Mask the forward variables so that padded positions
+        do not contribute to the loss calculation
+        Args:
+            mel_lens (torch.IntTensor): (batch)
+            log_c (torch.FloatTensor): (batch, T)
+            log_alpha_scaled (torch.FloatTensor): (batch, T, N)
+        Returns:
+            log_c (torch.FloatTensor): scaled probabilities (batch, T)
+            log_alpha_scaled (torch.FloatTensor): forward probabilities (batch, T, N)
+ """
+ mask_log_c = sequence_mask(mel_lens)
+ log_c = log_c * mask_log_c
+ mask_log_alpha_scaled = mask_log_c.unsqueeze(2)
+ log_alpha_scaled = log_alpha_scaled * mask_log_alpha_scaled
+ return log_c, log_alpha_scaled
+
+ def _process_ar_timestep(
+ self,
+ t,
+ ar_inputs,
+ h_memory,
+ c_memory,
+ ):
+ """
+ Process autoregression in timestep
+ 1. At a specific t timestep
+ 2. Perform data dropout if applied (we did not use it)
+ 3. Run the autoregressive frame through the prenet (has dropout)
+ 4. Run the prenet output through the post prenet rnn
+
+ Args:
+ t (int): mel-spec timestep
+ ar_inputs (torch.FloatTensor): go-token appended mel-spectrograms
+ - shape: (b, D_out, T_out)
+            h_memory (torch.FloatTensor): previous timestep rnn hidden state
+                - shape: (b, memory_rnn_dim)
+            c_memory (torch.FloatTensor): previous timestep rnn cell state
+                - shape: (b, memory_rnn_dim)
+
+        Returns:
+            h_memory (torch.FloatTensor): rnn hidden state of the current timestep
+            c_memory (torch.FloatTensor): rnn cell state of the current timestep
+ """
+ prenet_input = ar_inputs[:, t : t + self.ar_order].flatten(1)
+ memory_inputs = self.prenet(prenet_input)
+ h_memory, c_memory = self.memory_rnn(memory_inputs, (h_memory, c_memory))
+ return h_memory, c_memory
+
+ def _add_go_token(self, mel_inputs):
+ """Append the go token to create the autoregressive input
+ Args:
+ mel_inputs (torch.FloatTensor): (batch_size, T, n_mel_channel)
+ Returns:
+ ar_inputs (torch.FloatTensor): (batch_size, T, n_mel_channel)
+ """
+ batch_size, T, _ = mel_inputs.shape
+ go_tokens = self.go_tokens.unsqueeze(0).expand(batch_size, self.ar_order, self.frame_channels)
+ ar_inputs = torch.cat((go_tokens, mel_inputs), dim=1)[:, :T]
+ return ar_inputs
+
+ @staticmethod
+ def _initialize_forward_algorithm_variables(mel_inputs, N):
+ r"""Initialize placeholders for forward algorithm variables, to use a stable
+ version we will use log_alpha_scaled and the scaling constant
+
+ Args:
+ mel_inputs (torch.FloatTensor): (b, T_max, frame_channels)
+ N (int): number of states
+        Returns:
+            log_c (torch.FloatTensor): scaling constants (b, T_max)
+            log_alpha_scaled (torch.FloatTensor): scaled forward variables (b, T_max, N)
+            transition_matrix (torch.FloatTensor): transition vectors per timestep (b, T_max, N)
+            means (list): empty list used to collect the predicted means for plotting
+ """
+ b, T_max, _ = mel_inputs.shape
+ log_alpha_scaled = mel_inputs.new_zeros((b, T_max, N))
+ log_c = mel_inputs.new_zeros(b, T_max)
+ transition_matrix = mel_inputs.new_zeros((b, T_max, N))
+
+ # Saving for plotting later, will not have gradient tapes
+ means = []
+ return log_c, log_alpha_scaled, transition_matrix, means
+
+ @staticmethod
+ def _init_lstm_states(batch_size, hidden_state_dim, device_tensor):
+ r"""
+ Initialize Hidden and Cell states for LSTM Cell
+
+ Args:
+ batch_size (Int): batch size
+ hidden_state_dim (Int): dimensions of the h and c
+ device_tensor (torch.FloatTensor): useful for the device and type
+
+ Returns:
+ (torch.FloatTensor): shape (batch_size, hidden_state_dim)
+ can be hidden state for LSTM
+ (torch.FloatTensor): shape (batch_size, hidden_state_dim)
+ can be the cell state for LSTM
+ """
+ return (
+ device_tensor.new_zeros(batch_size, hidden_state_dim),
+ device_tensor.new_zeros(batch_size, hidden_state_dim),
+ )
+
+ def get_absorption_state_scaling_factor(self, mels_len, log_alpha_scaled, inputs_len, transition_vector):
+ """Returns the final scaling factor of absorption state
+
+ Args:
+ mels_len (torch.IntTensor): Input size of mels to
+ get the last timestep of log_alpha_scaled
+            log_alpha_scaled (torch.FloatTensor): scaled forward state probabilities
+            inputs_len (torch.IntTensor): lengths of the state sequences, used to
+                mask the values beyond each sequence's length
+ (
+ Useful when the batch has very different lengths,
+ when the length of an observation is less than
+ the number of max states, then the log alpha after
+ the state value is filled with -infs. So we mask
+ those values so that it only consider the states
+ which are needed for that length
+ )
+            transition_vector (torch.FloatTensor): transition vector for each state per timestep
+
+        Shapes:
+            - mels_len: (batch_size)
+            - log_alpha_scaled: (batch_size, T, N)
+            - inputs_len: (batch_size)
+            - transition_vector: (batch_size, T, N)
+
+ Returns:
+ sum_final_log_c (torch.FloatTensor): (batch_size)
+
+ """
+ N = torch.max(inputs_len)
+ max_inputs_len = log_alpha_scaled.shape[2]
+ state_lengths_mask = sequence_mask(inputs_len, max_len=max_inputs_len)
+
+ last_log_alpha_scaled_index = (
+ (mels_len - 1).unsqueeze(-1).expand(-1, N).unsqueeze(1)
+ ) # Batch X Hidden State Size
+ last_log_alpha_scaled = torch.gather(log_alpha_scaled, 1, last_log_alpha_scaled_index).squeeze(1)
+ last_log_alpha_scaled = last_log_alpha_scaled.masked_fill(~state_lengths_mask, -float("inf"))
+
+ last_transition_vector = torch.gather(transition_vector, 1, last_log_alpha_scaled_index).squeeze(1)
+ last_transition_probability = torch.sigmoid(last_transition_vector)
+ log_probability_of_transitioning = OverflowUtils.log_clamped(last_transition_probability)
+
+ last_transition_probability_index = self.get_mask_for_last_item(inputs_len, inputs_len.device)
+ log_probability_of_transitioning = log_probability_of_transitioning.masked_fill(
+ ~last_transition_probability_index, -float("inf")
+ )
+ final_log_c = last_log_alpha_scaled + log_probability_of_transitioning
+
+        # If the mel is shorter than the number of states, the gather above selects -inf values,
+        # which leads to NaN gradients. Ideally the dataset should be cleaned; the clamp below is
+        # a small workaround for such cases.
+ final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
+
+ sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
+ return sum_final_log_c
+
+ @staticmethod
+ def get_mask_for_last_item(lengths, device, out_tensor=None):
+ """Returns n-1 mask for the last item in the sequence.
+
+ Args:
+ lengths (torch.IntTensor): lengths in a batch
+ device (str, optional): Defaults to "cpu".
+ out_tensor (torch.Tensor, optional): uses the memory of a specific tensor.
+ Defaults to None.
+
+ Returns:
+ - Shape: :math:`(b, max_len)`
+ """
+ max_len = torch.max(lengths).item()
+ ids = (
+ torch.arange(0, max_len, device=device) if out_tensor is None else torch.arange(0, max_len, out=out_tensor)
+ )
+ mask = ids == lengths.unsqueeze(1) - 1
+ return mask
+
+ @torch.inference_mode()
+ def inference(
+ self,
+ inputs: torch.FloatTensor,
+ input_lens: torch.LongTensor,
+ sampling_temp: float,
+ max_sampling_time: int,
+ duration_threshold: float,
+ ):
+ """Inference from autoregressive neural HMM
+
+ Args:
+ inputs (torch.FloatTensor): input states
+ - shape: :math:`(b, T, d)`
+ input_lens (torch.LongTensor): input state lengths
+ - shape: :math:`(b)`
+ sampling_temp (float): sampling temperature
+            max_sampling_time (int): maximum number of sampling steps
+            duration_threshold (float): duration threshold to switch to the next state
+                - Use this to change the speaking rate of the synthesised audio
+ """
+
+ b = inputs.shape[0]
+ outputs = {
+ "hmm_outputs": [],
+ "hmm_outputs_len": [],
+ "alignments": [],
+ "input_parameters": [],
+ "output_parameters": [],
+ }
+ for i in range(b):
+ neural_hmm_outputs, states_travelled, input_parameters, output_parameters = self.sample(
+ inputs[i : i + 1], input_lens[i], sampling_temp, max_sampling_time, duration_threshold
+ )
+
+ outputs["hmm_outputs"].append(neural_hmm_outputs)
+ outputs["hmm_outputs_len"].append(neural_hmm_outputs.shape[0])
+ outputs["alignments"].append(states_travelled)
+ outputs["input_parameters"].append(input_parameters)
+ outputs["output_parameters"].append(output_parameters)
+
+ outputs["hmm_outputs"] = nn.utils.rnn.pad_sequence(outputs["hmm_outputs"], batch_first=True)
+ outputs["hmm_outputs_len"] = torch.tensor(
+ outputs["hmm_outputs_len"], dtype=input_lens.dtype, device=input_lens.device
+ )
+ return outputs
+
+ @torch.inference_mode()
+ def sample(self, inputs, input_lens, sampling_temp, max_sampling_time, duration_threshold):
+ """Samples an output from the parameter models
+
+ Args:
+ inputs (torch.FloatTensor): input states
+ - shape: :math:`(1, T, d)`
+ input_lens (torch.LongTensor): input state lengths
+ - shape: :math:`(1)`
+ sampling_temp (float): sampling temperature
+ max_sampling_time (int): max sampling time
+ duration_threshold (float): duration threshold to switch to next state
+
+ Returns:
+ outputs (torch.FloatTensor): Output Observations
+ - Shape: :math:`(T, output_dim)`
+ states_travelled (list[int]): Hidden states travelled
+ - Shape: :math:`(T)`
+ input_parameters (list[torch.FloatTensor]): Input parameters
+ output_parameters (list[torch.FloatTensor]): Output parameters
+ """
+ states_travelled, outputs, t = [], [], 0
+
+ # Sample initial state
+ current_state = 0
+ states_travelled.append(current_state)
+
+ # Prepare autoregression
+ prenet_input = self.go_tokens.unsqueeze(0).expand(1, self.ar_order, self.frame_channels)
+ h_memory, c_memory = self._init_lstm_states(1, self.memory_rnn_dim, prenet_input)
+
+ input_parameter_values = []
+ output_parameter_values = []
+ quantile = 1
+ while True:
+ memory_input = self.prenet(prenet_input.flatten(1).unsqueeze(0))
+            # the batch dimension is 1 while sampling
+ h_memory, c_memory = self.memory_rnn(memory_input.squeeze(0), (h_memory, c_memory))
+
+ z_t = inputs[:, current_state].unsqueeze(0) # Add fake time dimension
+ mean, std, transition_vector = self.output_net(h_memory, z_t)
+
+ transition_probability = torch.sigmoid(transition_vector.flatten())
+ staying_probability = torch.sigmoid(-transition_vector.flatten())
+
+ # Save for plotting
+ input_parameter_values.append([prenet_input, current_state])
+ output_parameter_values.append([mean, std, transition_probability])
+
+ x_t = self.emission_model.sample(mean, std, sampling_temp=sampling_temp)
+
+ # Prepare autoregressive input for next iteration
+ prenet_input = torch.cat((prenet_input, x_t), dim=1)[:, 1:]
+
+ outputs.append(x_t.flatten())
+
+ transition_matrix = torch.cat((staying_probability, transition_probability))
+ quantile *= staying_probability
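+            # Deterministic transitions track the cumulative staying probability and switch
+            # to the next state once it drops below `duration_threshold` (quantile-based
+            # duration generation); otherwise the switch is sampled from the transition matrix.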
+ if not self.deterministic_transition:
+ switch = transition_matrix.multinomial(1)[0].item()
+ else:
+ switch = quantile < duration_threshold
+
+ if switch:
+ current_state += 1
+ quantile = 1
+
+ states_travelled.append(current_state)
+
+ if (current_state == input_lens) or (max_sampling_time and t == max_sampling_time - 1):
+ break
+
+ t += 1
+
+ return (
+ torch.stack(outputs, dim=0),
+ F.one_hot(input_lens.new_tensor(states_travelled)),
+ input_parameter_values,
+ output_parameter_values,
+ )
+
+ @staticmethod
+ def _initialize_log_state_priors(text_embeddings):
+ """Creates the log pi in forward algorithm.
+
+ Args:
+ text_embeddings (torch.FloatTensor): used to create the log pi
+ on current device
+
+ Shapes:
+ - text_embeddings: (B, T, D_out_enc)
+ """
+ N = text_embeddings.shape[1]
+ log_state_priors = text_embeddings.new_full([N], -float("inf"))
+ log_state_priors[0] = 0.0
+ return log_state_priors
+
+
+class TransitionModel(nn.Module):
+ """Transition Model of the HMM, it represents the probability of transitioning
+ form current state to all other states"""
+
+ def forward(self, log_alpha_scaled, transition_vector, inputs_len): # pylint: disable=no-self-use
+ r"""
+        Combines the previous forward variables with the transition probabilities in log space.
+
+ Args:
+            log_alpha_scaled (torch.Tensor): previous timestep's scaled forward variables
+                (in log domain)
+                - shape: (batch size, N)
+            transition_vector (torch.tensor): transition vector for each state
+                - shape: (batch size, N)
+            inputs_len (int tensor): Lengths of states in a batch
+ - shape: (batch)
+
+ Returns:
+ out (torch.FloatTensor): log probability of transitioning to each state
+ """
+ transition_p = torch.sigmoid(transition_vector)
+ staying_p = torch.sigmoid(-transition_vector)
+
+ log_staying_probability = OverflowUtils.log_clamped(staying_p)
+ log_transition_probability = OverflowUtils.log_clamped(transition_p)
+
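+        # Left-to-right, no-skip topology: probability mass either stays in state i or moves
+        # to state i + 1, implemented by rolling the "leaving" term one state to the right
+        # and forbidding transitions into state 0 from the wrap-around.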
+ staying = log_alpha_scaled + log_staying_probability
+ leaving = log_alpha_scaled + log_transition_probability
+ leaving = leaving.roll(1, dims=1)
+ leaving[:, 0] = -float("inf")
+ inputs_len_mask = sequence_mask(inputs_len)
+ out = OverflowUtils.logsumexp(torch.stack((staying, leaving), dim=2), dim=2)
+ out = out.masked_fill(~inputs_len_mask, -float("inf")) # There are no states to contribute to the loss
+ return out
+
+
+class EmissionModel(nn.Module):
+ """Emission Model of the HMM, it represents the probability of
+ emitting an observation based on the current state"""
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.distribution_function: tdist.Distribution = tdist.normal.Normal
+
+ def sample(self, means, stds, sampling_temp):
+ return self.distribution_function(means, stds * sampling_temp).sample() if sampling_temp > 0 else means
+
+ def forward(self, x_t, means, stds, state_lengths):
+ r"""Calculates the log probability of the the given data (x_t)
+ being observed from states with given means and stds
+ Args:
+ x_t (float tensor) : observation at current time step
+ - shape: (batch, feature_dim)
+ means (float tensor): means of the distributions of hidden states
+ - shape: (batch, hidden_state, feature_dim)
+ stds (float tensor): standard deviations of the distributions of the hidden states
+ - shape: (batch, hidden_state, feature_dim)
+ state_lengths (int tensor): Lengths of states in a batch
+ - shape: (batch)
+
+ Returns:
+ out (float tensor): observation log likelihoods,
+ expressing the probability of an observation
+ being generated from a state i
+ shape: (batch, hidden_state)
+ """
+ emission_dists = self.distribution_function(means, stds)
+ out = emission_dists.log_prob(x_t.unsqueeze(1))
+ state_lengths_mask = sequence_mask(state_lengths).unsqueeze(2)
+ out = torch.sum(out * state_lengths_mask, dim=2)
+ return out
diff --git a/submodules/TTS/TTS/tts/layers/overflow/plotting_utils.py b/submodules/TTS/TTS/tts/layers/overflow/plotting_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a63aeb370a38a29660dc93267f4be138381c7df6
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/overflow/plotting_utils.py
@@ -0,0 +1,79 @@
+from typing import Any
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+
+def validate_numpy_array(value: Any):
+ r"""
+    Validates the input and makes sure it returns a numpy array (i.e. on CPU)
+
+ Args:
+ value (Any): the input value
+
+ Raises:
+ TypeError: if the value is not a numpy array or torch tensor
+
+ Returns:
+ np.ndarray: numpy array of the value
+ """
+ if isinstance(value, np.ndarray):
+ pass
+ elif isinstance(value, list):
+ value = np.array(value)
+ elif torch.is_tensor(value):
+ value = value.cpu().numpy()
+ else:
+ raise TypeError("Value must be a numpy array, a torch tensor or a list")
+
+ return value
+
+
+def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
+ """Get the most probable state means from the log_alpha_scaled.
+
+ Args:
+ log_alpha_scaled (torch.Tensor): Log alpha scaled values.
+ - Shape: :math:`(T, N)`
+ means (torch.Tensor): Means of the states.
+            - Shape: :math:`(T, N, D_out)`
+ decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
+ """
+ max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1]
+ max_len = means.shape[0]
+ n_mel_channels = means.shape[2]
+ max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels)
+ means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype)
+ if decoder is not None:
+ mel = (
+ decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0]
+ .squeeze(0)
+ .T
+ )
+ else:
+ mel = means
+ return mel
+
+
+def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False):
+ """Generates trainsition probabilities plot for the states and the probability of transition.
+
+ Args:
+ states (torch.IntTensor): the states
+ transition_probabilities (torch.FloatTensor): the transition probabilities
+ """
+ states = validate_numpy_array(states)
+ transition_probabilities = validate_numpy_array(transition_probabilities)
+
+ fig, ax = plt.subplots(figsize=(30, 3))
+ ax.plot(transition_probabilities, "o")
+ ax.set_title("Transition probability of state")
+ ax.set_xlabel("hidden state")
+ ax.set_ylabel("probability")
+ ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
+ ax.set_xticklabels([int(x) for x in states], rotation=90)
+ plt.tight_layout()
+ if not output_fig:
+ plt.close()
+ return fig
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/__init__.py b/submodules/TTS/TTS/tts/layers/tacotron/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/attentions.py b/submodules/TTS/TTS/tts/layers/tacotron/attentions.py
new file mode 100644
index 0000000000000000000000000000000000000000..25c3798e6b8f5fbc66224af66c9955e245b94097
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tacotron/attentions.py
@@ -0,0 +1,486 @@
+import torch
+from scipy.stats import betabinom
+from torch import nn
+from torch.nn import functional as F
+
+from TTS.tts.layers.tacotron.common_layers import Linear
+
+
+class LocationLayer(nn.Module):
+ """Layers for Location Sensitive Attention
+
+ Args:
+ attention_dim (int): number of channels in the input tensor.
+ attention_n_filters (int, optional): number of filters in convolution. Defaults to 32.
+ attention_kernel_size (int, optional): kernel size of convolution filter. Defaults to 31.
+ """
+
+ def __init__(self, attention_dim, attention_n_filters=32, attention_kernel_size=31):
+ super().__init__()
+ self.location_conv1d = nn.Conv1d(
+ in_channels=2,
+ out_channels=attention_n_filters,
+ kernel_size=attention_kernel_size,
+ stride=1,
+ padding=(attention_kernel_size - 1) // 2,
+ bias=False,
+ )
+ self.location_dense = Linear(attention_n_filters, attention_dim, bias=False, init_gain="tanh")
+
+ def forward(self, attention_cat):
+ """
+ Shapes:
+ attention_cat: [B, 2, C]
+ """
+ processed_attention = self.location_conv1d(attention_cat)
+ processed_attention = self.location_dense(processed_attention.transpose(1, 2))
+ return processed_attention
+
+
+class GravesAttention(nn.Module):
+ """Graves Attention as is ref1 with updates from ref2.
+ ref1: https://arxiv.org/abs/1910.10288
+ ref2: https://arxiv.org/pdf/1906.01083.pdf
+
+ Args:
+ query_dim (int): number of channels in query tensor.
+ K (int): number of Gaussian heads to be used for computing attention.
+ """
+
+ COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
+
+ def __init__(self, query_dim, K):
+ super().__init__()
+ self._mask_value = 1e-8
+ self.K = K
+ # self.attention_alignment = 0.05
+ self.eps = 1e-5
+ self.J = None
+ self.N_a = nn.Sequential(
+ nn.Linear(query_dim, query_dim, bias=True), nn.ReLU(), nn.Linear(query_dim, 3 * K, bias=True)
+ )
+ self.attention_weights = None
+ self.mu_prev = None
+ self.init_layers()
+
+ def init_layers(self):
+ torch.nn.init.constant_(self.N_a[2].bias[(2 * self.K) : (3 * self.K)], 1.0) # bias mean
+ torch.nn.init.constant_(self.N_a[2].bias[self.K : (2 * self.K)], 10) # bias std
+
+ def init_states(self, inputs):
+ if self.J is None or inputs.shape[1] + 1 > self.J.shape[-1]:
+ self.J = torch.arange(0, inputs.shape[1] + 2.0).to(inputs.device) + 0.5
+ self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
+ self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
+
+ # pylint: disable=R0201
+ # pylint: disable=unused-argument
+ def preprocess_inputs(self, inputs):
+ return None
+
+ def forward(self, query, inputs, processed_inputs, mask):
+ """
+ Shapes:
+ query: [B, C_attention_rnn]
+ inputs: [B, T_in, C_encoder]
+ processed_inputs: place_holder
+ mask: [B, T_in]
+ """
+ gbk_t = self.N_a(query)
+ gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
+
+ # attention model parameters
+ # each B x K
+ g_t = gbk_t[:, 0, :]
+ b_t = gbk_t[:, 1, :]
+ k_t = gbk_t[:, 2, :]
+
+ # dropout to decorrelate attention heads
+ g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
+
+ # attention GMM parameters
+ sig_t = torch.nn.functional.softplus(b_t) + self.eps
+
+ mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
+ g_t = torch.softmax(g_t, dim=-1) + self.eps
+
+ j = self.J[: inputs.size(1) + 1]
+
+ # attention weights
+ phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
+
+ # discretize attention weights
+ alpha_t = torch.sum(phi_t, 1)
+ alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
+ alpha_t[alpha_t == 0] = 1e-8
+
+ # apply masking
+ if mask is not None:
+ alpha_t.data.masked_fill_(~mask, self._mask_value)
+
+ context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
+ self.attention_weights = alpha_t
+ self.mu_prev = mu_t
+ return context
+
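+# Usage sketch (illustrative only, not part of the upstream module); the shapes
+# below are assumptions chosen for the example and follow the forward() docstring:
+#
+#     attn = GravesAttention(query_dim=256, K=5)
+#     inputs = torch.randn(2, 50, 256)           # [B, T_in, C_encoder]
+#     attn.init_states(inputs)
+#     query = torch.randn(2, 256)                # [B, C_attention_rnn]
+#     context = attn(query, inputs, None, None)  # [B, C_encoder]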
+
+class OriginalAttention(nn.Module):
+ """Bahdanau Attention with various optional modifications.
+ - Location sensitive attention: https://arxiv.org/abs/1712.05884
+ - Forward Attention: https://arxiv.org/abs/1807.06736 + state masking at inference
+ - Using sigmoid instead of softmax normalization
+ - Attention windowing at inference time
+
+ Note:
+ Location Sensitive Attention extends the additive attention mechanism
+ to use cumulative attention weights from previous decoder time steps with the current time step features.
+
+ Forward attention computes the most probable monotonic alignment. The modified attention probabilities at each
+ timestep are computed recursively by the forward algorithm.
+
+ The transition agent in forward attention explicitly gates whether the attention moves forward or
+ stays at the same position at each decoder timestep.
+
+ Attention windowing is an inductive prior that prevents the model from attending to previous and future timesteps
+ beyond a certain window.
+
+ Args:
+ query_dim (int): number of channels in the query tensor.
+ embedding_dim (int): number of channels in the value tensor. In general, the value tensor is the output of the encoder layer.
+ attention_dim (int): number of channels of the inner attention layers.
+ location_attention (bool): enable/disable location sensitive attention.
+ attention_location_n_filters (int): number of location attention filters.
+ attention_location_kernel_size (int): filter size of location attention convolution layer.
+ windowing (bool): enable/disable attention windowing at inference time. When enabled, the attention only considers time steps within a fixed window around the previously attended position.
+ norm (str): normalization method applied to the attention weights. 'softmax' or 'sigmoid'
+ forward_attn (bool): enable/disable forward attention.
+ trans_agent (bool): enable/disable transition agent in the forward attention.
+ forward_attn_mask (bool): enable/disable explicit masking in forward attention. It is especially useful at inference time.
+ """
+
+ # Pylint gets confused by PyTorch conventions here
+ # pylint: disable=attribute-defined-outside-init
+ def __init__(
+ self,
+ query_dim,
+ embedding_dim,
+ attention_dim,
+ location_attention,
+ attention_location_n_filters,
+ attention_location_kernel_size,
+ windowing,
+ norm,
+ forward_attn,
+ trans_agent,
+ forward_attn_mask,
+ ):
+ super().__init__()
+ self.query_layer = Linear(query_dim, attention_dim, bias=False, init_gain="tanh")
+ self.inputs_layer = Linear(embedding_dim, attention_dim, bias=False, init_gain="tanh")
+ self.v = Linear(attention_dim, 1, bias=True)
+ if trans_agent:
+ self.ta = nn.Linear(query_dim + embedding_dim, 1, bias=True)
+ if location_attention:
+ self.location_layer = LocationLayer(
+ attention_dim,
+ attention_location_n_filters,
+ attention_location_kernel_size,
+ )
+ self._mask_value = -float("inf")
+ self.windowing = windowing
+ self.win_idx = None
+ self.norm = norm
+ self.forward_attn = forward_attn
+ self.trans_agent = trans_agent
+ self.forward_attn_mask = forward_attn_mask
+ self.location_attention = location_attention
+
+ def init_win_idx(self):
+ self.win_idx = -1
+ self.win_back = 2
+ self.win_front = 6
+
+ def init_forward_attn(self, inputs):
+ B = inputs.shape[0]
+ T = inputs.shape[1]
+ self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
+ self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
+
+ def init_location_attention(self, inputs):
+ B = inputs.size(0)
+ T = inputs.size(1)
+ self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)
+
+ def init_states(self, inputs):
+ B = inputs.size(0)
+ T = inputs.size(1)
+ self.attention_weights = torch.zeros([B, T], device=inputs.device)
+ if self.location_attention:
+ self.init_location_attention(inputs)
+ if self.forward_attn:
+ self.init_forward_attn(inputs)
+ if self.windowing:
+ self.init_win_idx()
+
+ def preprocess_inputs(self, inputs):
+ return self.inputs_layer(inputs)
+
+ def update_location_attention(self, alignments):
+ self.attention_weights_cum += alignments
+
+ def get_location_attention(self, query, processed_inputs):
+ attention_cat = torch.cat((self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1)
+ processed_query = self.query_layer(query.unsqueeze(1))
+ processed_attention_weights = self.location_layer(attention_cat)
+ energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_inputs))
+ energies = energies.squeeze(-1)
+ return energies, processed_query
+
+ def get_attention(self, query, processed_inputs):
+ processed_query = self.query_layer(query.unsqueeze(1))
+ energies = self.v(torch.tanh(processed_query + processed_inputs))
+ energies = energies.squeeze(-1)
+ return energies, processed_query
+
+ def apply_windowing(self, attention, inputs):
+ back_win = self.win_idx - self.win_back
+ front_win = self.win_idx + self.win_front
+ if back_win > 0:
+ attention[:, :back_win] = -float("inf")
+ if front_win < inputs.shape[1]:
+ attention[:, front_win:] = -float("inf")
+ # at the first step (win_idx == -1) force the maximum onto the first input position
+ # so that the window starts at the beginning; it does not hurt otherwise.
+ if self.win_idx == -1:
+ attention[:, 0] = attention.max()
+ # Update the window
+ self.win_idx = torch.argmax(attention, 1).long()[0].item()
+ return attention
+
+ def apply_forward_attention(self, alignment):
+ # forward attention
+ fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
+ # compute transition potentials
+ alpha = ((1 - self.u) * self.alpha + self.u * fwd_shifted_alpha + 1e-8) * alignment
+ # force incremental alignment
+ if not self.training and self.forward_attn_mask:
+ _, n = fwd_shifted_alpha.max(1)
+ val, _ = alpha.max(1)
+ for b in range(alignment.shape[0]):
+ alpha[b, n[b] + 3 :] = 0
+ alpha[b, : (n[b] - 1)] = 0 # ignore all previous states to prevent repetition.
+ alpha[b, (n[b] - 2)] = 0.01 * val[b] # smoothing factor for the prev step
+ # renormalize attention weights
+ alpha = alpha / alpha.sum(dim=1, keepdim=True)
+ return alpha
+
+ def forward(self, query, inputs, processed_inputs, mask):
+ """
+ shapes:
+ query: [B, C_attn_rnn]
+ inputs: [B, T_en, D_en]
+ processed_inputs: [B, T_en, D_attn]
+ mask: [B, T_en]
+ """
+ if self.location_attention:
+ attention, _ = self.get_location_attention(query, processed_inputs)
+ else:
+ attention, _ = self.get_attention(query, processed_inputs)
+ # apply masking
+ if mask is not None:
+ attention.data.masked_fill_(~mask, self._mask_value)
+ # apply windowing - only in eval mode
+ if not self.training and self.windowing:
+ attention = self.apply_windowing(attention, inputs)
+
+ # normalize attention values
+ if self.norm == "softmax":
+ alignment = torch.softmax(attention, dim=-1)
+ elif self.norm == "sigmoid":
+ alignment = torch.sigmoid(attention) / torch.sigmoid(attention).sum(dim=1, keepdim=True)
+ else:
+ raise ValueError("Unknown value for attention norm type")
+
+ if self.location_attention:
+ self.update_location_attention(alignment)
+
+ # apply forward attention if enabled
+ if self.forward_attn:
+ alignment = self.apply_forward_attention(alignment)
+ self.alpha = alignment
+
+ context = torch.bmm(alignment.unsqueeze(1), inputs)
+ context = context.squeeze(1)
+ self.attention_weights = alignment
+
+ # compute transition agent
+ if self.forward_attn and self.trans_agent:
+ ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
+ self.u = torch.sigmoid(self.ta(ta_input))
+ return context
+
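+# Usage sketch (illustrative only, not part of the upstream module); the dimensions
+# are assumptions chosen for the example:
+#
+#     attn = OriginalAttention(
+#         query_dim=1024, embedding_dim=512, attention_dim=128,
+#         location_attention=True, attention_location_n_filters=32,
+#         attention_location_kernel_size=31, windowing=False, norm="softmax",
+#         forward_attn=False, trans_agent=False, forward_attn_mask=False,
+#     )
+#     inputs = torch.randn(2, 50, 512)                # [B, T_en, D_en]
+#     processed = attn.preprocess_inputs(inputs)      # [B, T_en, 128]
+#     attn.init_states(inputs)
+#     query = torch.randn(2, 1024)                    # [B, C_attn_rnn]
+#     context = attn(query, inputs, processed, None)  # [B, D_en]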
+
+class MonotonicDynamicConvolutionAttention(nn.Module):
+ """Dynamic convolution attention from
+ https://arxiv.org/pdf/1910.10288.pdf
+
+
+ query -> linear -> tanh -> linear ->|
+ | mask values
+ v | |
+ atten_w(t-1) -|-> conv1d_dynamic -> linear -|-> tanh -> + -> softmax -> * -> * -> context
+ |-> conv1d_static -> linear -| |
+ |-> conv1d_prior -> log ----------------|
+
+ query: attention rnn output.
+
+ Note:
+ Dynamic convolution attention is a variation of location sensitive attention that uses
+ convolution filters dynamically computed from the previous attention scores, together with a
+ set of constraints that keep the attention alignment diagonal.
+ DCA is sensitive to mixed precision training and might cause unstable training.
+
+ Args:
+ query_dim (int): number of channels in the query tensor.
+ embedding_dim (int): number of channels in the value tensor.
+ static_filter_dim (int): number of channels in the convolution layer computing the static filters.
+ static_kernel_size (int): kernel size for the convolution layer computing the static filters.
+ dynamic_filter_dim (int): number of channels in the convolution layer computing the dynamic filters.
+ dynamic_kernel_size (int): kernel size for the convolution layer computing the dynamic filters.
+ prior_filter_len (int, optional): length of the beta-binomial prior filter. Defaults to 11 from the paper.
+ alpha (float, optional): alpha parameter of the beta-binomial prior. Defaults to 0.1 from the paper.
+ beta (float, optional): beta parameter of the beta-binomial prior. Defaults to 0.9 from the paper.
+ """
+
+ def __init__(
+ self,
+ query_dim,
+ embedding_dim, # pylint: disable=unused-argument
+ attention_dim,
+ static_filter_dim,
+ static_kernel_size,
+ dynamic_filter_dim,
+ dynamic_kernel_size,
+ prior_filter_len=11,
+ alpha=0.1,
+ beta=0.9,
+ ):
+ super().__init__()
+ self._mask_value = 1e-8
+ self.dynamic_filter_dim = dynamic_filter_dim
+ self.dynamic_kernel_size = dynamic_kernel_size
+ self.prior_filter_len = prior_filter_len
+ self.attention_weights = None
+ # setup key and query layers
+ self.query_layer = nn.Linear(query_dim, attention_dim)
+ self.key_layer = nn.Linear(attention_dim, dynamic_filter_dim * dynamic_kernel_size, bias=False)
+ self.static_filter_conv = nn.Conv1d(
+ 1,
+ static_filter_dim,
+ static_kernel_size,
+ padding=(static_kernel_size - 1) // 2,
+ bias=False,
+ )
+ self.static_filter_layer = nn.Linear(static_filter_dim, attention_dim, bias=False)
+ self.dynamic_filter_layer = nn.Linear(dynamic_filter_dim, attention_dim)
+ self.v = nn.Linear(attention_dim, 1, bias=False)
+
+ prior = betabinom.pmf(range(prior_filter_len), prior_filter_len - 1, alpha, beta)
+ self.register_buffer("prior", torch.FloatTensor(prior).flip(0))
+
+ # pylint: disable=unused-argument
+ def forward(self, query, inputs, processed_inputs, mask):
+ """
+ query: [B, C_attn_rnn]
+ inputs: [B, T_en, D_en]
+ processed_inputs: place holder.
+ mask: [B, T_en]
+ """
+ # compute prior filters
+ prior_filter = F.conv1d(
+ F.pad(self.attention_weights.unsqueeze(1), (self.prior_filter_len - 1, 0)), self.prior.view(1, 1, -1)
+ )
+ prior_filter = torch.log(prior_filter.clamp_min_(1e-6)).squeeze(1)
+ G = self.key_layer(torch.tanh(self.query_layer(query)))
+ # compute dynamic filters
+ dynamic_filter = F.conv1d(
+ self.attention_weights.unsqueeze(0),
+ G.view(-1, 1, self.dynamic_kernel_size),
+ padding=(self.dynamic_kernel_size - 1) // 2,
+ groups=query.size(0),
+ )
+ dynamic_filter = dynamic_filter.view(query.size(0), self.dynamic_filter_dim, -1).transpose(1, 2)
+ # compute static filters
+ static_filter = self.static_filter_conv(self.attention_weights.unsqueeze(1)).transpose(1, 2)
+ alignment = (
+ self.v(
+ torch.tanh(self.static_filter_layer(static_filter) + self.dynamic_filter_layer(dynamic_filter))
+ ).squeeze(-1)
+ + prior_filter
+ )
+ # compute attention weights
+ attention_weights = F.softmax(alignment, dim=-1)
+ # apply masking
+ if mask is not None:
+ attention_weights.data.masked_fill_(~mask, self._mask_value)
+ self.attention_weights = attention_weights
+ # compute context
+ context = torch.bmm(attention_weights.unsqueeze(1), inputs).squeeze(1)
+ return context
+
+ def preprocess_inputs(self, inputs): # pylint: disable=no-self-use
+ return None
+
+ def init_states(self, inputs):
+ B = inputs.size(0)
+ T = inputs.size(1)
+ self.attention_weights = torch.zeros([B, T], device=inputs.device)
+ self.attention_weights[:, 0] = 1.0
+
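+# Usage sketch (illustrative only, not part of the upstream module); the dimensions
+# are assumptions chosen for the example:
+#
+#     dca = MonotonicDynamicConvolutionAttention(
+#         query_dim=1024, embedding_dim=512, attention_dim=128,
+#         static_filter_dim=8, static_kernel_size=21,
+#         dynamic_filter_dim=8, dynamic_kernel_size=21,
+#     )
+#     inputs = torch.randn(2, 50, 512)          # [B, T_en, D_en]
+#     dca.init_states(inputs)
+#     query = torch.randn(2, 1024)              # [B, C_attn_rnn]
+#     context = dca(query, inputs, None, None)  # [B, D_en]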
+
+def init_attn(
+ attn_type,
+ query_dim,
+ embedding_dim,
+ attention_dim,
+ location_attention,
+ attention_location_n_filters,
+ attention_location_kernel_size,
+ windowing,
+ norm,
+ forward_attn,
+ trans_agent,
+ forward_attn_mask,
+ attn_K,
+):
+ if attn_type == "original":
+ return OriginalAttention(
+ query_dim,
+ embedding_dim,
+ attention_dim,
+ location_attention,
+ attention_location_n_filters,
+ attention_location_kernel_size,
+ windowing,
+ norm,
+ forward_attn,
+ trans_agent,
+ forward_attn_mask,
+ )
+ if attn_type == "graves":
+ return GravesAttention(query_dim, attn_K)
+ if attn_type == "dynamic_convolution":
+ return MonotonicDynamicConvolutionAttention(
+ query_dim,
+ embedding_dim,
+ attention_dim,
+ static_filter_dim=8,
+ static_kernel_size=21,
+ dynamic_filter_dim=8,
+ dynamic_kernel_size=21,
+ prior_filter_len=11,
+ alpha=0.1,
+ beta=0.9,
+ )
+
+ raise RuntimeError(f" [!] Given Attention Type '{attn_type}' does not exist.")
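+
+
+# Usage sketch (illustrative only, not part of the upstream module): building a
+# dynamic convolution attention through the factory function; the dimensions are
+# assumptions chosen for the example (unused arguments are still required positionally):
+#
+#     attention = init_attn(
+#         attn_type="dynamic_convolution", query_dim=1024, embedding_dim=512,
+#         attention_dim=128, location_attention=False, attention_location_n_filters=32,
+#         attention_location_kernel_size=31, windowing=False, norm="softmax",
+#         forward_attn=False, trans_agent=False, forward_attn_mask=False, attn_K=5,
+#     )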
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/capacitron_layers.py b/submodules/TTS/TTS/tts/layers/tacotron/capacitron_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..2181ffa7ec4e1f54d86cc5865a8fa7f6b6e362af
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tacotron/capacitron_layers.py
@@ -0,0 +1,205 @@
+import torch
+from torch import nn
+from torch.distributions.multivariate_normal import MultivariateNormal as MVN
+from torch.nn import functional as F
+
+
+class CapacitronVAE(nn.Module):
+ """Effective Use of Variational Embedding Capacity for prosody transfer.
+
+ See https://arxiv.org/abs/1906.03402"""
+
+ def __init__(
+ self,
+ num_mel,
+ capacitron_VAE_embedding_dim,
+ encoder_output_dim=256,
+ reference_encoder_out_dim=128,
+ speaker_embedding_dim=None,
+ text_summary_embedding_dim=None,
+ ):
+ super().__init__()
+ # Init distributions
+ self.prior_distribution = MVN(
+ torch.zeros(capacitron_VAE_embedding_dim), torch.eye(capacitron_VAE_embedding_dim)
+ )
+ self.approximate_posterior_distribution = None
+ # reference encoder producing a fixed-size prosody embedding of dim reference_encoder_out_dim
+ self.encoder = ReferenceEncoder(num_mel, out_dim=reference_encoder_out_dim)
+
+ # Init beta, the lagrange-like term for the KL distribution
+ self.beta = torch.nn.Parameter(torch.log(torch.exp(torch.Tensor([1.0])) - 1), requires_grad=True)
+ mlp_input_dimension = reference_encoder_out_dim
+
+ if text_summary_embedding_dim is not None:
+ self.text_summary_net = TextSummary(text_summary_embedding_dim, encoder_output_dim=encoder_output_dim)
+ mlp_input_dimension += text_summary_embedding_dim
+ if speaker_embedding_dim is not None:
+ # TODO: Test a multispeaker model!
+ mlp_input_dimension += speaker_embedding_dim
+ self.post_encoder_mlp = PostEncoderMLP(mlp_input_dimension, capacitron_VAE_embedding_dim)
+
+ def forward(self, reference_mel_info=None, text_info=None, speaker_embedding=None):
+ # Use reference
+ if reference_mel_info is not None:
+ reference_mels = reference_mel_info[0] # [batch_size, num_frames, num_mels]
+ mel_lengths = reference_mel_info[1] # [batch_size]
+ enc_out = self.encoder(reference_mels, mel_lengths)
+
+ # concat speaker_embedding and/or text summary embedding
+ if text_info is not None:
+ text_inputs = text_info[0] # [batch_size, num_characters, num_embedding]
+ input_lengths = text_info[1]
+ text_summary_out = self.text_summary_net(text_inputs, input_lengths).to(reference_mels.device)
+ enc_out = torch.cat([enc_out, text_summary_out], dim=-1)
+ if speaker_embedding is not None:
+ speaker_embedding = torch.squeeze(speaker_embedding)
+ enc_out = torch.cat([enc_out, speaker_embedding], dim=-1)
+
+ # Feed the output of the ref encoder and information about text/speaker into
+ # an MLP to produce the parameters of the approximate posterior distribution
+ mu, sigma = self.post_encoder_mlp(enc_out)
+ # convert to cpu because prior_distribution was created on cpu
+ mu = mu.cpu()
+ sigma = sigma.cpu()
+
+ # Sample from the posterior: z ~ q(z|x)
+ self.approximate_posterior_distribution = MVN(mu, torch.diag_embed(sigma))
+ VAE_embedding = self.approximate_posterior_distribution.rsample()
+ # Infer from the model, bypasses encoding
+ else:
+ # Sample from the prior: z ~ p(z)
+ VAE_embedding = self.prior_distribution.sample().unsqueeze(0)
+
+ # reshape to [batch_size, 1, capacitron_VAE_embedding_dim]
+ return VAE_embedding.unsqueeze(1), self.approximate_posterior_distribution, self.prior_distribution, self.beta
+
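+# Usage sketch (illustrative only, not part of the upstream module); the mel and
+# batch dimensions are assumptions chosen for the example:
+#
+#     vae = CapacitronVAE(num_mel=80, capacitron_VAE_embedding_dim=128)
+#     mels = torch.randn(2, 120, 80)          # [batch_size, num_frames, num_mels]
+#     mel_lengths = torch.tensor([120, 90])
+#     emb, posterior, prior, beta = vae(reference_mel_info=[mels, mel_lengths])
+#     # emb: [2, 1, 128], sampled from the approximate posterior q(z|x)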
+
+class ReferenceEncoder(nn.Module):
+ """NN module creating a fixed size prosody embedding from a spectrogram.
+
+ inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
+ outputs: [batch_size, embedding_dim]
+ """
+
+ def __init__(self, num_mel, out_dim):
+ super().__init__()
+ self.num_mel = num_mel
+ filters = [1] + [32, 32, 64, 64, 128, 128]
+ num_layers = len(filters) - 1
+ convs = [
+ nn.Conv2d(
+ in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(2, 2)
+ )
+ for i in range(num_layers)
+ ]
+ self.convs = nn.ModuleList(convs)
+ self.training = False
+ self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
+
+ post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
+ self.recurrence = nn.LSTM(
+ input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
+ )
+
+ def forward(self, inputs, input_lengths):
+ batch_size = inputs.size(0)
+ x = inputs.view(batch_size, 1, -1, self.num_mel) # [batch_size, num_channels==1, num_frames, num_mel]
+ valid_lengths = input_lengths.float() # [batch_size]
+ for conv, bn in zip(self.convs, self.bns):
+ x = conv(x)
+ x = bn(x)
+ x = F.relu(x)
+
+ # Create the post conv width mask based on the valid lengths of the output of the convolution.
+ # The valid lengths for the output of a convolution on varying length inputs is
+ # ceil(input_length/stride) + 1 for stride=3 and padding=2
+ # For example (kernel_size=3, stride=2, padding=2):
+ # 0 0 x x x x x 0 0 -> Input = 5, 0 is zero padding, x is valid values coming from padding=2 in conv2d
+ # _____
+ # x _____
+ # x _____
+ # x ____
+ # x
+ # x x x x -> Output valid length = 4
+ # Since every example in the batch is zero padded and therefore has its own valid length,
+ # we need to mask off all the values AFTER the valid length for each example in the batch.
+ # Otherwise, the positions beyond the valid length carry convolution noise instead of real information.
+ valid_lengths = (valid_lengths / 2).float()
+ valid_lengths = torch.ceil(valid_lengths).to(dtype=torch.int64) + 1 # 2 is stride -- size: [batch_size]
+ post_conv_max_width = x.size(2)
+
+ mask = torch.arange(post_conv_max_width).to(inputs.device).expand(
+ len(valid_lengths), post_conv_max_width
+ ) < valid_lengths.unsqueeze(1)
+ mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1]
+ x = x * mask
+
+ x = x.transpose(1, 2)
+ # x: 4D tensor [batch_size, post_conv_width,
+ # num_channels==128, post_conv_height]
+
+ post_conv_width = x.size(1)
+ x = x.contiguous().view(batch_size, post_conv_width, -1)
+ # x: 3D tensor [batch_size, post_conv_width,
+ # num_channels*post_conv_height]
+
+ # Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
+ post_conv_input_lengths = valid_lengths
+ packed_seqs = nn.utils.rnn.pack_padded_sequence(
+ x, post_conv_input_lengths.tolist(), batch_first=True, enforce_sorted=False
+ ) # dynamic rnn sequence padding
+ self.recurrence.flatten_parameters()
+ _, (ht, _) = self.recurrence(packed_seqs)
+ last_output = ht[-1]
+
+ return last_output.to(inputs.device) # [B, 128]
+
+ @staticmethod
+ def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
+ """Height of spec after n convolutions with fixed kernel/stride/pad."""
+ for _ in range(n_convs):
+ height = (height - kernel_size + 2 * pad) // stride + 1
+ return height
+
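+# Worked example (illustrative): with num_mel=80 and the six conv layers above
+# (kernel_size=3, stride=2, pad=2), the mel-axis height shrinks as
+# 80 -> 41 -> 22 -> 12 -> 7 -> 5 -> 4, so
+# ReferenceEncoder.calculate_post_conv_height(80, 3, 2, 2, 6) == 4 and the LSTM
+# input size is 128 * 4 = 512.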
+
+class TextSummary(nn.Module):
+ def __init__(self, embedding_dim, encoder_output_dim):
+ super().__init__()
+ self.lstm = nn.LSTM(
+ encoder_output_dim, # text embedding dimension from the text encoder
+ embedding_dim, # fixed length output summary the lstm creates from the input
+ batch_first=True,
+ bidirectional=False,
+ )
+
+ def forward(self, inputs, input_lengths):
+ # Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
+ packed_seqs = nn.utils.rnn.pack_padded_sequence(
+ inputs, input_lengths.tolist(), batch_first=True, enforce_sorted=False
+ ) # dynamic rnn sequence padding
+ self.lstm.flatten_parameters()
+ _, (ht, _) = self.lstm(packed_seqs)
+ last_output = ht[-1]
+ return last_output
+
+
+class PostEncoderMLP(nn.Module):
+ def __init__(self, input_size, hidden_size):
+ super().__init__()
+ self.hidden_size = hidden_size
+ modules = [
+ nn.Linear(input_size, hidden_size), # Hidden Layer
+ nn.Tanh(),
+ nn.Linear(hidden_size, hidden_size * 2),
+ ] # Output layer twice the size for mean and variance
+ self.net = nn.Sequential(*modules)
+ self.softplus = nn.Softplus()
+
+ def forward(self, _input):
+ mlp_output = self.net(_input)
+ # The mean parameter is unconstrained
+ mu = mlp_output[:, : self.hidden_size]
+ # The standard deviation must be positive. Parameterise with a softplus
+ sigma = self.softplus(mlp_output[:, self.hidden_size :])
+ return mu, sigma
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/common_layers.py b/submodules/TTS/TTS/tts/layers/tacotron/common_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f78ff1e75f6c23eb1a0fe827247a1127bc8f9958
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tacotron/common_layers.py
@@ -0,0 +1,119 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+class Linear(nn.Module):
+ """Linear layer with a specific initialization.
+
+ Args:
+ in_features (int): number of channels in the input tensor.
+ out_features (int): number of channels in the output tensor.
+ bias (bool, optional): enable/disable bias in the layer. Defaults to True.
+ init_gain (str, optional): method to compute the gain in the weight initialization based on the nonlinear activation used afterwards. Defaults to 'linear'.
+ """
+
+ def __init__(self, in_features, out_features, bias=True, init_gain="linear"):
+ super().__init__()
+ self.linear_layer = torch.nn.Linear(in_features, out_features, bias=bias)
+ self._init_w(init_gain)
+
+ def _init_w(self, init_gain):
+ torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(init_gain))
+
+ def forward(self, x):
+ return self.linear_layer(x)
+
+
+class LinearBN(nn.Module):
+ """Linear layer with Batch Normalization.
+
+ x -> linear -> BN -> o
+
+ Args:
+ in_features (int): number of channels in the input tensor.
+ out_features (int ): number of channels in the output tensor.
+ bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
+ init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
+ """
+
+ def __init__(self, in_features, out_features, bias=True, init_gain="linear"):
+ super().__init__()
+ self.linear_layer = torch.nn.Linear(in_features, out_features, bias=bias)
+ self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5)
+ self._init_w(init_gain)
+
+ def _init_w(self, init_gain):
+ torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(init_gain))
+
+ def forward(self, x):
+ """
+ Shapes:
+ x: [T, B, C] or [B, C]
+ """
+ out = self.linear_layer(x)
+ if len(out.shape) == 3:
+ out = out.permute(1, 2, 0)
+ out = self.batch_normalization(out)
+ if len(out.shape) == 3:
+ out = out.permute(2, 0, 1)
+ return out
+
+
+class Prenet(nn.Module):
+ """Tacotron specific Prenet with an optional Batch Normalization.
+
+ Note:
+ Prenet with BN improves the model performance significantly especially
+ if it is enabled after learning a diagonal attention alignment with the original
+ prenet. However, if the target dataset is high quality then it also works from
+ the start. It is also suggested to disable dropout if BN is in use.
+
+ prenet_type == "original"
+ x -> [linear -> ReLU -> Dropout]xN -> o
+
+ prenet_type == "bn"
+ x -> [linear -> BN -> ReLU -> Dropout]xN -> o
+
+ Args:
+ in_features (int): number of channels in the input tensor and the inner layers.
+ prenet_type (str, optional): prenet type "original" or "bn". Defaults to "original".
+ prenet_dropout (bool, optional): enable/disable dropout in the prenet layers. Defaults to True.
+ dropout_at_inference (bool, optional): apply dropout at inference time as well. It can improve quality for some models.
+ out_features (list, optional): List of output channels for each prenet block.
+ It also defines number of the prenet blocks based on the length of argument list.
+ Defaults to [256, 256].
+ bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
+ """
+
+ # pylint: disable=dangerous-default-value
+ def __init__(
+ self,
+ in_features,
+ prenet_type="original",
+ prenet_dropout=True,
+ dropout_at_inference=False,
+ out_features=[256, 256],
+ bias=True,
+ ):
+ super().__init__()
+ self.prenet_type = prenet_type
+ self.prenet_dropout = prenet_dropout
+ self.dropout_at_inference = dropout_at_inference
+ in_features = [in_features] + out_features[:-1]
+ if prenet_type == "bn":
+ self.linear_layers = nn.ModuleList(
+ [LinearBN(in_size, out_size, bias=bias) for (in_size, out_size) in zip(in_features, out_features)]
+ )
+ elif prenet_type == "original":
+ self.linear_layers = nn.ModuleList(
+ [Linear(in_size, out_size, bias=bias) for (in_size, out_size) in zip(in_features, out_features)]
+ )
+
+ def forward(self, x):
+ for linear in self.linear_layers:
+ if self.prenet_dropout:
+ x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training or self.dropout_at_inference)
+ else:
+ x = F.relu(linear(x))
+ return x
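+
+
+# Usage sketch (illustrative only, not part of the upstream module): a two-block
+# prenet mapping 80-dim frames to 256-dim features; the sizes are assumptions
+# chosen for the example:
+#
+#     prenet = Prenet(80, prenet_type="original", prenet_dropout=True, out_features=[256, 256])
+#     x = torch.randn(4, 30, 80)   # [B, T, in_features]
+#     y = prenet(x)                # [B, T, 256]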
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/gst_layers.py b/submodules/TTS/TTS/tts/layers/tacotron/gst_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..05dba7084ff5533b68779d46238530f4988db934
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tacotron/gst_layers.py
@@ -0,0 +1,149 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class GST(nn.Module):
+ """Global Style Token Module for factorizing prosody in speech.
+
+ See https://arxiv.org/pdf/1803.09017"""
+
+ def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, embedded_speaker_dim=None):
+ super().__init__()
+ self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim)
+ self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, embedded_speaker_dim)
+
+ def forward(self, inputs, speaker_embedding=None):
+ enc_out = self.encoder(inputs)
+ # concat speaker_embedding
+ if speaker_embedding is not None:
+ enc_out = torch.cat([enc_out, speaker_embedding], dim=-1)
+ style_embed = self.style_token_layer(enc_out)
+
+ return style_embed
+
+
+class ReferenceEncoder(nn.Module):
+ """NN module creating a fixed size prosody embedding from a spectrogram.
+
+ inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
+ outputs: [batch_size, embedding_dim]
+ """
+
+ def __init__(self, num_mel, embedding_dim):
+ super().__init__()
+ self.num_mel = num_mel
+ filters = [1] + [32, 32, 64, 64, 128, 128]
+ num_layers = len(filters) - 1
+ convs = [
+ nn.Conv2d(
+ in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)
+ )
+ for i in range(num_layers)
+ ]
+ self.convs = nn.ModuleList(convs)
+ self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
+
+ post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 1, num_layers)
+ self.recurrence = nn.GRU(
+ input_size=filters[-1] * post_conv_height, hidden_size=embedding_dim // 2, batch_first=True
+ )
+
+ def forward(self, inputs):
+ batch_size = inputs.size(0)
+ x = inputs.view(batch_size, 1, -1, self.num_mel)
+ # x: 4D tensor [batch_size, num_channels==1, num_frames, num_mel]
+ for conv, bn in zip(self.convs, self.bns):
+ x = conv(x)
+ x = bn(x)
+ x = F.relu(x)
+
+ x = x.transpose(1, 2)
+ # x: 4D tensor [batch_size, post_conv_width,
+ # num_channels==128, post_conv_height]
+ post_conv_width = x.size(1)
+ x = x.contiguous().view(batch_size, post_conv_width, -1)
+ # x: 3D tensor [batch_size, post_conv_width,
+ # num_channels*post_conv_height]
+ self.recurrence.flatten_parameters()
+ _, out = self.recurrence(x)
+ # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
+
+ return out.squeeze(0)
+
+ @staticmethod
+ def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
+ """Height of spec after n convolutions with fixed kernel/stride/pad."""
+ for _ in range(n_convs):
+ height = (height - kernel_size + 2 * pad) // stride + 1
+ return height
+
+
+class StyleTokenLayer(nn.Module):
+ """NN Module attending to style tokens based on prosody encodings."""
+
+ def __init__(self, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None):
+ super().__init__()
+
+ self.query_dim = gst_embedding_dim // 2
+
+ if d_vector_dim:
+ self.query_dim += d_vector_dim
+
+ self.key_dim = gst_embedding_dim // num_heads
+ self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim))
+ nn.init.normal_(self.style_tokens, mean=0, std=0.5)
+ self.attention = MultiHeadAttention(
+ query_dim=self.query_dim, key_dim=self.key_dim, num_units=gst_embedding_dim, num_heads=num_heads
+ )
+
+ def forward(self, inputs):
+ batch_size = inputs.size(0)
+ prosody_encoding = inputs.unsqueeze(1)
+ # prosody_encoding: 3D tensor [batch_size, 1, encoding_size==128]
+ tokens = torch.tanh(self.style_tokens).unsqueeze(0).expand(batch_size, -1, -1)
+ # tokens: 3D tensor [batch_size, num tokens, token embedding size]
+ style_embed = self.attention(prosody_encoding, tokens)
+
+ return style_embed
+
+
+class MultiHeadAttention(nn.Module):
+ """
+ input:
+ query --- [N, T_q, query_dim]
+ key --- [N, T_k, key_dim]
+ output:
+ out --- [N, T_q, num_units]
+ """
+
+ def __init__(self, query_dim, key_dim, num_units, num_heads):
+ super().__init__()
+ self.num_units = num_units
+ self.num_heads = num_heads
+ self.key_dim = key_dim
+
+ self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
+ self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+ self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+
+ def forward(self, query, key):
+ queries = self.W_query(query) # [N, T_q, num_units]
+ keys = self.W_key(key) # [N, T_k, num_units]
+ values = self.W_value(key)
+
+ split_size = self.num_units // self.num_heads
+ queries = torch.stack(torch.split(queries, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
+ keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
+ values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
+
+ # score = softmax(QK^T / (d_k**0.5))
+ scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k]
+ scores = scores / (self.key_dim**0.5)
+ scores = F.softmax(scores, dim=3)
+
+ # out = score * V
+ out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
+ out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
+
+ return out
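+
+
+# Usage sketch (illustrative only, not part of the upstream module): extracting a
+# global style embedding from a batch of mel spectrograms; the sizes are
+# assumptions chosen for the example:
+#
+#     gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, gst_embedding_dim=256)
+#     mels = torch.randn(2, 120, 80)   # [batch_size, num_spec_frames, num_mel]
+#     style = gst(mels)                # [batch_size, 1, gst_embedding_dim]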
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/tacotron.py b/submodules/TTS/TTS/tts/layers/tacotron/tacotron.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a47c35ef67852456d7211f32502ffb84509d61f
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tacotron/tacotron.py
@@ -0,0 +1,503 @@
+# coding: utf-8
+# adapted from https://github.com/r9y9/tacotron_pytorch
+
+import torch
+from torch import nn
+
+from .attentions import init_attn
+from .common_layers import Prenet
+
+
+class BatchNormConv1d(nn.Module):
+ r"""A wrapper for Conv1d with BatchNorm. It sets the activation
+ function between Conv and BatchNorm layers. BatchNorm layer
+ is initialized with the TF default values for momentum and eps.
+
+ Args:
+ in_channels: size of each input sample
+ out_channels: size of each output samples
+ kernel_size: kernel size of conv filters
+ stride: stride of conv filters
+ padding: padding of conv filters
+ activation: activation function applied after BatchNorm
+
+ Shapes:
+ - input: (B, C_in, T)
+ - output: (B, C_out, T)
+ """
+
+ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, activation=None):
+ super().__init__()
+ self.padding = padding
+ self.padder = nn.ConstantPad1d(padding, 0)
+ self.conv1d = nn.Conv1d(
+ in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=0, bias=False
+ )
+ # Following tensorflow's default parameters
+ self.bn = nn.BatchNorm1d(out_channels, momentum=0.99, eps=1e-3)
+ self.activation = activation
+ # self.init_layers()
+
+ def init_layers(self):
+ if isinstance(self.activation, torch.nn.ReLU):
+ w_gain = "relu"
+ elif isinstance(self.activation, torch.nn.Tanh):
+ w_gain = "tanh"
+ elif self.activation is None:
+ w_gain = "linear"
+ else:
+ raise RuntimeError("Unknown activation function")
+ torch.nn.init.xavier_uniform_(self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_gain))
+
+ def forward(self, x):
+ x = self.padder(x)
+ x = self.conv1d(x)
+ x = self.bn(x)
+ if self.activation is not None:
+ x = self.activation(x)
+ return x
+
+
+class Highway(nn.Module):
+ r"""Highway layers as explained in https://arxiv.org/abs/1505.00387
+
+ Args:
+ in_features (int): size of each input sample
+ out_feature (int): size of each output sample
+
+ Shapes:
+ - input: (B, *, H_in)
+ - output: (B, *, H_out)
+ """
+
+ # TODO: Try GLU layer
+ def __init__(self, in_features, out_feature):
+ super().__init__()
+ self.H = nn.Linear(in_features, out_feature)
+ self.H.bias.data.zero_()
+ self.T = nn.Linear(in_features, out_feature)
+ self.T.bias.data.fill_(-1)
+ self.relu = nn.ReLU()
+ self.sigmoid = nn.Sigmoid()
+ # self.init_layers()
+
+ def init_layers(self):
+ torch.nn.init.xavier_uniform_(self.H.weight, gain=torch.nn.init.calculate_gain("relu"))
+ torch.nn.init.xavier_uniform_(self.T.weight, gain=torch.nn.init.calculate_gain("sigmoid"))
+
+ def forward(self, inputs):
+ H = self.relu(self.H(inputs))
+ T = self.sigmoid(self.T(inputs))
+ return H * T + inputs * (1.0 - T)
+
+
+class CBHG(nn.Module):
+ """CBHG module: a recurrent neural network composed of:
+ - 1-d convolution banks
+ - Highway networks + residual connections
+ - Bidirectional gated recurrent units
+
+ Args:
+ in_features (int): sample size
+ K (int): max filter size in conv bank
+ projections (list): conv channel sizes for conv projections
+ num_highways (int): number of highways layers
+
+ Shapes:
+ - input: (B, C, T_in)
+ - output: (B, T_in, C*2)
+ """
+
+ # pylint: disable=dangerous-default-value
+ def __init__(
+ self,
+ in_features,
+ K=16,
+ conv_bank_features=128,
+ conv_projections=[128, 128],
+ highway_features=128,
+ gru_features=128,
+ num_highways=4,
+ ):
+ super().__init__()
+ self.in_features = in_features
+ self.conv_bank_features = conv_bank_features
+ self.highway_features = highway_features
+ self.gru_features = gru_features
+ self.conv_projections = conv_projections
+ self.relu = nn.ReLU()
+ # list of conv1d bank with filter size k=1...K
+ # TODO: try dilational layers instead
+ self.conv1d_banks = nn.ModuleList(
+ [
+ BatchNormConv1d(
+ in_features,
+ conv_bank_features,
+ kernel_size=k,
+ stride=1,
+ padding=[(k - 1) // 2, k // 2],
+ activation=self.relu,
+ )
+ for k in range(1, K + 1)
+ ]
+ )
+ # max pooling of conv bank, with padding
+ # TODO: try average pooling OR larger kernel size
+ out_features = [K * conv_bank_features] + conv_projections[:-1]
+ activations = [self.relu] * (len(conv_projections) - 1)
+ activations += [None]
+ # setup conv1d projection layers
+ layer_set = []
+ for in_size, out_size, ac in zip(out_features, conv_projections, activations):
+ layer = BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1, padding=[1, 1], activation=ac)
+ layer_set.append(layer)
+ self.conv1d_projections = nn.ModuleList(layer_set)
+ # setup Highway layers
+ if self.highway_features != conv_projections[-1]:
+ self.pre_highway = nn.Linear(conv_projections[-1], highway_features, bias=False)
+ self.highways = nn.ModuleList([Highway(highway_features, highway_features) for _ in range(num_highways)])
+ # bi-directional GRU layer
+ self.gru = nn.GRU(gru_features, gru_features, 1, batch_first=True, bidirectional=True)
+
+ def forward(self, inputs):
+ # (B, in_features, T_in)
+ x = inputs
+ # (B, hid_features*K, T_in)
+ # Concat conv1d bank outputs
+ outs = []
+ for conv1d in self.conv1d_banks:
+ out = conv1d(x)
+ outs.append(out)
+ x = torch.cat(outs, dim=1)
+ assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks)
+ for conv1d in self.conv1d_projections:
+ x = conv1d(x)
+ x += inputs
+ x = x.transpose(1, 2)
+ if self.highway_features != self.conv_projections[-1]:
+ x = self.pre_highway(x)
+ # Residual connection
+ # TODO: try residual scaling as in Deep Voice 3
+ # TODO: try plain residual layers
+ for highway in self.highways:
+ x = highway(x)
+ # (B, T_in, hid_features*2)
+ # TODO: replace GRU with convolution as in Deep Voice 3
+ self.gru.flatten_parameters()
+ outputs, _ = self.gru(x)
+ return outputs
+
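+# Usage sketch (illustrative only, not part of the upstream module); the batch and
+# time dimensions are assumptions chosen for the example:
+#
+#     cbhg = CBHG(128, K=16, conv_bank_features=128, conv_projections=[128, 128],
+#                 highway_features=128, gru_features=128, num_highways=4)
+#     x = torch.randn(2, 128, 50)   # (B, C, T_in)
+#     out = cbhg(x)                 # (B, T_in, 2 * gru_features) == (2, 50, 256)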
+
+class EncoderCBHG(nn.Module):
+ r"""CBHG module with Encoder specific arguments"""
+
+ def __init__(self):
+ super().__init__()
+ self.cbhg = CBHG(
+ 128,
+ K=16,
+ conv_bank_features=128,
+ conv_projections=[128, 128],
+ highway_features=128,
+ gru_features=128,
+ num_highways=4,
+ )
+
+ def forward(self, x):
+ return self.cbhg(x)
+
+
+class Encoder(nn.Module):
+ r"""Stack Prenet and CBHG module for encoder
+ Args:
+ inputs (FloatTensor): embedding features
+
+ Shapes:
+ - inputs: (B, T, D_in)
+ - outputs: (B, T, 128 * 2)
+ """
+
+ def __init__(self, in_features):
+ super().__init__()
+ self.prenet = Prenet(in_features, out_features=[256, 128])
+ self.cbhg = EncoderCBHG()
+
+ def forward(self, inputs):
+ # B x T x prenet_dim
+ outputs = self.prenet(inputs)
+ outputs = self.cbhg(outputs.transpose(1, 2))
+ return outputs
+
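+# Usage sketch (illustrative only, not part of the upstream module); the embedding
+# size and lengths are assumptions chosen for the example:
+#
+#     encoder = Encoder(in_features=256)
+#     char_embeddings = torch.randn(2, 50, 256)   # (B, T, D_in)
+#     enc_out = encoder(char_embeddings)          # (B, T, 256)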
+
+class PostCBHG(nn.Module):
+ def __init__(self, mel_dim):
+ super().__init__()
+ self.cbhg = CBHG(
+ mel_dim,
+ K=8,
+ conv_bank_features=128,
+ conv_projections=[256, mel_dim],
+ highway_features=128,
+ gru_features=128,
+ num_highways=4,
+ )
+
+ def forward(self, x):
+ return self.cbhg(x)
+
+
+class Decoder(nn.Module):
+ """Tacotron decoder.
+
+ Args:
+ in_channels (int): number of input channels.
+ frame_channels (int): number of feature frame channels.
+ r (int): number of outputs per time step (reduction rate).
+ memory_size (int): size of the past window. if <= 0 memory_size = r
+ attn_type (string): type of attention used in decoder.
+ attn_windowing (bool): if true, define an attention window centered on the maximum
+ attention response. It provides more robust attention alignment, especially
+ at inference time.
+ attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
+ prenet_type (string): 'original' or 'bn'.
+ prenet_dropout (float): prenet dropout rate.
+ forward_attn (bool): if true, use forward attention method. https://arxiv.org/abs/1807.06736
+ trans_agent (bool): if true, use transition agent. https://arxiv.org/abs/1807.06736
+ forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
+ location_attn (bool): if true, use location sensitive attention.
+ attn_K (int): number of attention heads for GravesAttention.
+ separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
+ d_vector_dim (int): size of speaker embedding vector, for multi-speaker training.
+ max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 500.
+ """
+
+ # Pylint gets confused by PyTorch conventions here
+ # pylint: disable=attribute-defined-outside-init
+
+ def __init__(
+ self,
+ in_channels,
+ frame_channels,
+ r,
+ memory_size,
+ attn_type,
+ attn_windowing,
+ attn_norm,
+ prenet_type,
+ prenet_dropout,
+ forward_attn,
+ trans_agent,
+ forward_attn_mask,
+ location_attn,
+ attn_K,
+ separate_stopnet,
+ max_decoder_steps,
+ ):
+ super().__init__()
+ self.r_init = r
+ self.r = r
+ self.in_channels = in_channels
+ self.max_decoder_steps = max_decoder_steps
+ self.use_memory_queue = memory_size > 0
+ self.memory_size = memory_size if memory_size > 0 else r
+ self.frame_channels = frame_channels
+ self.separate_stopnet = separate_stopnet
+ self.query_dim = 256
+ # memory -> |Prenet| -> processed_memory
+ prenet_dim = frame_channels * self.memory_size if self.use_memory_queue else frame_channels
+ self.prenet = Prenet(prenet_dim, prenet_type, prenet_dropout, out_features=[256, 128])
+ # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
+ # attention_rnn generates queries for the attention mechanism
+ self.attention_rnn = nn.GRUCell(in_channels + 128, self.query_dim)
+ self.attention = init_attn(
+ attn_type=attn_type,
+ query_dim=self.query_dim,
+ embedding_dim=in_channels,
+ attention_dim=128,
+ location_attention=location_attn,
+ attention_location_n_filters=32,
+ attention_location_kernel_size=31,
+ windowing=attn_windowing,
+ norm=attn_norm,
+ forward_attn=forward_attn,
+ trans_agent=trans_agent,
+ forward_attn_mask=forward_attn_mask,
+ attn_K=attn_K,
+ )
+ # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
+ self.project_to_decoder_in = nn.Linear(256 + in_channels, 256)
+ # decoder_RNN_input -> |RNN| -> RNN_state
+ self.decoder_rnns = nn.ModuleList([nn.GRUCell(256, 256) for _ in range(2)])
+ # RNN_state -> |Linear| -> mel_spec
+ self.proj_to_mel = nn.Linear(256, frame_channels * self.r_init)
+ # learn init values instead of zero init.
+ self.stopnet = StopNet(256 + frame_channels * self.r_init)
+
+ def set_r(self, new_r):
+ self.r = new_r
+
+ def _reshape_memory(self, memory):
+ """
+ Reshape the spectrograms for given 'r'
+ """
+ # Grouping multiple frames if necessary
+ if memory.size(-1) == self.frame_channels:
+ memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
+ # Time first (T_decoder, B, frame_channels)
+ memory = memory.transpose(0, 1)
+ return memory
+
+ def _init_states(self, inputs):
+ """
+ Initialization of decoder states
+ """
+ B = inputs.size(0)
+ # go frame as zeros matrix
+ if self.use_memory_queue:
+ self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
+ else:
+ self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels)
+ # decoder states
+ self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
+ self.decoder_rnn_hiddens = [
+ torch.zeros(1, device=inputs.device).repeat(B, 256) for idx in range(len(self.decoder_rnns))
+ ]
+ self.context_vec = inputs.data.new(B, self.in_channels).zero_()
+ # cache attention inputs
+ self.processed_inputs = self.attention.preprocess_inputs(inputs)
+
+ def _parse_outputs(self, outputs, attentions, stop_tokens):
+ # Back to batch first
+ attentions = torch.stack(attentions).transpose(0, 1)
+ stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
+ outputs = torch.stack(outputs).transpose(0, 1).contiguous()
+ outputs = outputs.view(outputs.size(0), -1, self.frame_channels)
+ outputs = outputs.transpose(1, 2)
+ return outputs, attentions, stop_tokens
+
+ def decode(self, inputs, mask=None):
+ # Prenet
+ processed_memory = self.prenet(self.memory_input)
+ # Attention RNN
+ self.attention_rnn_hidden = self.attention_rnn(
+ torch.cat((processed_memory, self.context_vec), -1), self.attention_rnn_hidden
+ )
+ self.context_vec = self.attention(self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
+ # Concat RNN output and attention context vector
+ decoder_input = self.project_to_decoder_in(torch.cat((self.attention_rnn_hidden, self.context_vec), -1))
+
+ # Pass through the decoder RNNs
+ for idx, decoder_rnn in enumerate(self.decoder_rnns):
+ self.decoder_rnn_hiddens[idx] = decoder_rnn(decoder_input, self.decoder_rnn_hiddens[idx])
+ # Residual connection
+ decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input
+ decoder_output = decoder_input
+
+ # predict mel vectors from decoder vectors
+ output = self.proj_to_mel(decoder_output)
+ # output = torch.sigmoid(output)
+ # predict stop token
+ stopnet_input = torch.cat([decoder_output, output], -1)
+ if self.separate_stopnet:
+ stop_token = self.stopnet(stopnet_input.detach())
+ else:
+ stop_token = self.stopnet(stopnet_input)
+ output = output[:, : self.r * self.frame_channels]
+ return output, stop_token, self.attention.attention_weights
+
+ def _update_memory_input(self, new_memory):
+ if self.use_memory_queue:
+ if self.memory_size > self.r:
+ # memory queue size is larger than number of frames per decoder iter
+ self.memory_input = torch.cat(
+ [new_memory, self.memory_input[:, : (self.memory_size - self.r) * self.frame_channels].clone()],
+ dim=-1,
+ )
+ else:
+ # memory queue size smaller than number of frames per decoder iter
+ self.memory_input = new_memory[:, : self.memory_size * self.frame_channels]
+ else:
+ # use only the last frame prediction
+ # assert new_memory.shape[-1] == self.r * self.frame_channels
+ self.memory_input = new_memory[:, self.frame_channels * (self.r - 1) :]
+
+ def forward(self, inputs, memory, mask):
+ """
+ Args:
+ inputs: Encoder outputs.
+ memory: Decoder memory frames used for autoregression (teacher forcing).
+ At inference time, when no memory is given, the decoder uses its own
+ last output as the next input (see `inference`).
+ mask: Attention mask for sequence padding.
+
+ Shapes:
+ - inputs: (B, T, D_out_enc)
+ - memory: (B, T_mel, D_mel)
+ """
+ # Teacher-forced decoding over the given memory (ground-truth) frames
+ memory = self._reshape_memory(memory)
+ outputs = []
+ attentions = []
+ stop_tokens = []
+ t = 0
+ self._init_states(inputs)
+ self.attention.init_states(inputs)
+ while len(outputs) < memory.size(0):
+ if t > 0:
+ new_memory = memory[t - 1]
+ self._update_memory_input(new_memory)
+
+ output, stop_token, attention = self.decode(inputs, mask)
+ outputs += [output]
+ attentions += [attention]
+ stop_tokens += [stop_token.squeeze(1)]
+ t += 1
+ return self._parse_outputs(outputs, attentions, stop_tokens)
+
+ def inference(self, inputs):
+ """
+ Args:
+ inputs: encoder outputs.
+ Shapes:
+ - inputs: batch x time x encoder_out_dim
+ """
+ outputs = []
+ attentions = []
+ stop_tokens = []
+ t = 0
+ self._init_states(inputs)
+ self.attention.init_states(inputs)
+ while True:
+ if t > 0:
+ new_memory = outputs[-1]
+ self._update_memory_input(new_memory)
+ output, stop_token, attention = self.decode(inputs, None)
+ stop_token = torch.sigmoid(stop_token.data)
+ outputs += [output]
+ attentions += [attention]
+ stop_tokens += [stop_token]
+ t += 1
+ if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6):
+ break
+ if t > self.max_decoder_steps:
+ print(" | > Decoder stopped with 'max_decoder_steps")
+ break
+ return self._parse_outputs(outputs, attentions, stop_tokens)
+
+
+class StopNet(nn.Module):
+ r"""Stopnet signalling decoder to stop inference.
+ Args:
+ in_features (int): feature dimension of input.
+ """
+
+ def __init__(self, in_features):
+ super().__init__()
+ self.dropout = nn.Dropout(0.1)
+ self.linear = nn.Linear(in_features, 1)
+ torch.nn.init.xavier_uniform_(self.linear.weight, gain=torch.nn.init.calculate_gain("linear"))
+
+ def forward(self, inputs):
+ outputs = self.dropout(inputs)
+ outputs = self.linear(outputs)
+ return outputs
diff --git a/submodules/TTS/TTS/tts/layers/tacotron/tacotron2.py b/submodules/TTS/TTS/tts/layers/tacotron/tacotron2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c79b70997249efc94cbac630bcc7d6c571f5743e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tacotron/tacotron2.py
@@ -0,0 +1,414 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .attentions import init_attn
+from .common_layers import Linear, Prenet
+
+
+# pylint: disable=no-value-for-parameter
+# pylint: disable=unexpected-keyword-arg
+class ConvBNBlock(nn.Module):
+ r"""Convolutions with Batch Normalization and non-linear activation.
+
+ Args:
+ in_channels (int): number of input channels.
+ out_channels (int): number of output channels.
+ kernel_size (int): convolution kernel size.
+ activation (str): 'relu', 'tanh', None (linear).
+
+ Shapes:
+ - input: (B, C_in, T)
+ - output: (B, C_out, T)
+ """
+
+ def __init__(self, in_channels, out_channels, kernel_size, activation=None):
+ super().__init__()
+ assert (kernel_size - 1) % 2 == 0
+ padding = (kernel_size - 1) // 2
+ self.convolution1d = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding)
+ self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5)
+ self.dropout = nn.Dropout(p=0.5)
+ if activation == "relu":
+ self.activation = nn.ReLU()
+ elif activation == "tanh":
+ self.activation = nn.Tanh()
+ else:
+ self.activation = nn.Identity()
+
+ def forward(self, x):
+ o = self.convolution1d(x)
+ o = self.batch_normalization(o)
+ o = self.activation(o)
+ o = self.dropout(o)
+ return o
+
+
+class Postnet(nn.Module):
+ r"""Tacotron2 Postnet
+
+ Args:
+ in_out_channels (int): number of output channels.
+
+ Shapes:
+ - input: (B, C_in, T)
+ - output: (B, C_in, T)
+ """
+
+ def __init__(self, in_out_channels, num_convs=5):
+ super().__init__()
+ self.convolutions = nn.ModuleList()
+ self.convolutions.append(ConvBNBlock(in_out_channels, 512, kernel_size=5, activation="tanh"))
+ for _ in range(1, num_convs - 1):
+ self.convolutions.append(ConvBNBlock(512, 512, kernel_size=5, activation="tanh"))
+ self.convolutions.append(ConvBNBlock(512, in_out_channels, kernel_size=5, activation=None))
+
+ def forward(self, x):
+ o = x
+ for layer in self.convolutions:
+ o = layer(o)
+ return o
+
+
+class Encoder(nn.Module):
+ r"""Tacotron2 Encoder
+
+ Args:
+ in_out_channels (int): number of input and output channels.
+
+ Shapes:
+ - input: (B, C_in, T)
+ - output: (B, C_in, T)
+ """
+
+ def __init__(self, in_out_channels=512):
+ super().__init__()
+ self.convolutions = nn.ModuleList()
+ for _ in range(3):
+ self.convolutions.append(ConvBNBlock(in_out_channels, in_out_channels, 5, "relu"))
+ self.lstm = nn.LSTM(
+ in_out_channels, int(in_out_channels / 2), num_layers=1, batch_first=True, bias=True, bidirectional=True
+ )
+ self.rnn_state = None
+
+ def forward(self, x, input_lengths):
+ o = x
+ for layer in self.convolutions:
+ o = layer(o)
+ o = o.transpose(1, 2)
+ o = nn.utils.rnn.pack_padded_sequence(o, input_lengths.cpu(), batch_first=True)
+ self.lstm.flatten_parameters()
+ o, _ = self.lstm(o)
+ o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
+ return o
+
+ def inference(self, x):
+ o = x
+ for layer in self.convolutions:
+ o = layer(o)
+ o = o.transpose(1, 2)
+ # self.lstm.flatten_parameters()
+ o, _ = self.lstm(o)
+ return o
+
+
+# adapted from https://github.com/NVIDIA/tacotron2/
+class Decoder(nn.Module):
+ """Tacotron2 decoder. We don't use Zoneout but Dropout between RNN layers.
+
+ Args:
+ in_channels (int): number of input channels.
+ frame_channels (int): number of feature frame channels.
+ r (int): number of outputs per time step (reduction rate).
+ memory_size (int): size of the past window. if <= 0 memory_size = r
+ attn_type (string): type of attention used in decoder.
+ attn_win (bool): if true, define an attention window centered on the maximum
+ attention response. It provides more robust attention alignment, especially
+ at inference time.
+ attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
+ prenet_type (string): 'original' or 'bn'.
+ prenet_dropout (float): prenet dropout rate.
+ forward_attn (bool): if true, use forward attention method. https://arxiv.org/abs/1807.06736
+ trans_agent (bool): if true, use transition agent. https://arxiv.org/abs/1807.06736
+ forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
+ location_attn (bool): if true, use location sensitive attention.
+ attn_K (int): number of attention heads for GravesAttention.
+ separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
+ max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000.
+ """
+
+ # Pylint gets confused by PyTorch conventions here
+ # pylint: disable=attribute-defined-outside-init
+ def __init__(
+ self,
+ in_channels,
+ frame_channels,
+ r,
+ attn_type,
+ attn_win,
+ attn_norm,
+ prenet_type,
+ prenet_dropout,
+ forward_attn,
+ trans_agent,
+ forward_attn_mask,
+ location_attn,
+ attn_K,
+ separate_stopnet,
+ max_decoder_steps,
+ ):
+ super().__init__()
+ self.frame_channels = frame_channels
+ self.r_init = r
+ self.r = r
+ self.encoder_embedding_dim = in_channels
+ self.separate_stopnet = separate_stopnet
+ self.max_decoder_steps = max_decoder_steps
+ self.stop_threshold = 0.5
+
+ # model dimensions
+ self.query_dim = 1024
+ self.decoder_rnn_dim = 1024
+ self.prenet_dim = 256
+ self.attn_dim = 128
+ self.p_attention_dropout = 0.1
+ self.p_decoder_dropout = 0.1
+
+ # memory -> |Prenet| -> processed_memory
+ prenet_dim = self.frame_channels
+ self.prenet = Prenet(
+ prenet_dim, prenet_type, prenet_dropout, out_features=[self.prenet_dim, self.prenet_dim], bias=False
+ )
+
+ self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_channels, self.query_dim, bias=True)
+
+ self.attention = init_attn(
+ attn_type=attn_type,
+ query_dim=self.query_dim,
+ embedding_dim=in_channels,
+ attention_dim=128,
+ location_attention=location_attn,
+ attention_location_n_filters=32,
+ attention_location_kernel_size=31,
+ windowing=attn_win,
+ norm=attn_norm,
+ forward_attn=forward_attn,
+ trans_agent=trans_agent,
+ forward_attn_mask=forward_attn_mask,
+ attn_K=attn_K,
+ )
+
+ self.decoder_rnn = nn.LSTMCell(self.query_dim + in_channels, self.decoder_rnn_dim, bias=True)
+
+ self.linear_projection = Linear(self.decoder_rnn_dim + in_channels, self.frame_channels * self.r_init)
+
+ self.stopnet = nn.Sequential(
+ nn.Dropout(0.1),
+ Linear(self.decoder_rnn_dim + self.frame_channels * self.r_init, 1, bias=True, init_gain="sigmoid"),
+ )
+ self.memory_truncated = None
+
+ def set_r(self, new_r):
+ self.r = new_r
+
+ def get_go_frame(self, inputs):
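+        # All-zeros <GO> frame of shape (B, r * frame_channels) used to prime the decoder.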
+ B = inputs.size(0)
+ memory = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.r)
+ return memory
+
+ def _init_states(self, inputs, mask, keep_states=False):
+ B = inputs.size(0)
+ # T = inputs.size(1)
+ if not keep_states:
+ self.query = torch.zeros(1, device=inputs.device).repeat(B, self.query_dim)
+ self.attention_rnn_cell_state = torch.zeros(1, device=inputs.device).repeat(B, self.query_dim)
+ self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(B, self.decoder_rnn_dim)
+ self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(B, self.decoder_rnn_dim)
+ self.context = torch.zeros(1, device=inputs.device).repeat(B, self.encoder_embedding_dim)
+ self.inputs = inputs
+ self.processed_inputs = self.attention.preprocess_inputs(inputs)
+ self.mask = mask
+
+ def _reshape_memory(self, memory):
+ """
+ Reshape the spectrograms for given 'r'
+ """
+ # Grouping multiple frames if necessary
+ if memory.size(-1) == self.frame_channels:
+ memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
+ # Time first (T_decoder, B, frame_channels)
+ memory = memory.transpose(0, 1)
+ return memory
+
+ def _parse_outputs(self, outputs, stop_tokens, alignments):
+ alignments = torch.stack(alignments).transpose(0, 1)
+ stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
+ outputs = torch.stack(outputs).transpose(0, 1).contiguous()
+ outputs = outputs.view(outputs.size(0), -1, self.frame_channels)
+ outputs = outputs.transpose(1, 2)
+ return outputs, stop_tokens, alignments
+
+ def _update_memory(self, memory):
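+        # Keep only the last frame of each r-frame group as the next prenet input.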
+ if len(memory.shape) == 2:
+ return memory[:, self.frame_channels * (self.r - 1) :]
+ return memory[:, :, self.frame_channels * (self.r - 1) :]
+
+ def decode(self, memory):
+ """
+ shapes:
+ - memory: B x r * self.frame_channels
+ """
+ # self.context: B x D_en
+ # query_input: B x D_en + (r * self.frame_channels)
+ query_input = torch.cat((memory, self.context), -1)
+ # self.query and self.attention_rnn_cell_state : B x D_attn_rnn
+ self.query, self.attention_rnn_cell_state = self.attention_rnn(
+ query_input, (self.query, self.attention_rnn_cell_state)
+ )
+ self.query = F.dropout(self.query, self.p_attention_dropout, self.training)
+ self.attention_rnn_cell_state = F.dropout(
+ self.attention_rnn_cell_state, self.p_attention_dropout, self.training
+ )
+ # B x D_en
+ self.context = self.attention(self.query, self.inputs, self.processed_inputs, self.mask)
+ # B x (D_en + D_attn_rnn)
+ decoder_rnn_input = torch.cat((self.query, self.context), -1)
+ # self.decoder_hidden and self.decoder_cell: B x D_decoder_rnn
+ self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
+ decoder_rnn_input, (self.decoder_hidden, self.decoder_cell)
+ )
+ self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)
+ # B x (D_decoder_rnn + D_en)
+ decoder_hidden_context = torch.cat((self.decoder_hidden, self.context), dim=1)
+ # B x (self.r * self.frame_channels)
+ decoder_output = self.linear_projection(decoder_hidden_context)
+ # B x (D_decoder_rnn + (self.r * self.frame_channels))
+ stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
+ if self.separate_stopnet:
+ stop_token = self.stopnet(stopnet_input.detach())
+ else:
+ stop_token = self.stopnet(stopnet_input)
+ # select outputs for the reduction rate self.r
+ decoder_output = decoder_output[:, : self.r * self.frame_channels]
+ return decoder_output, self.attention.attention_weights, stop_token
+
+ def forward(self, inputs, memories, mask):
+ r"""Train Decoder with teacher forcing.
+ Args:
+ inputs: Encoder outputs.
+ memories: Feature frames for teacher-forcing.
+ mask: Attention mask for sequence padding.
+
+ Shapes:
+ - inputs: (B, T, D_out_enc)
+            - memories: (B, T_mel, D_mel)
+ - outputs: (B, T_mel, D_mel)
+ - alignments: (B, T_in, T_out)
+ - stop_tokens: (B, T_out)
+ """
+ memory = self.get_go_frame(inputs).unsqueeze(0)
+ memories = self._reshape_memory(memories)
+ memories = torch.cat((memory, memories), dim=0)
+ memories = self._update_memory(memories)
+ memories = self.prenet(memories)
+
+ self._init_states(inputs, mask=mask)
+ self.attention.init_states(inputs)
+
+ outputs, stop_tokens, alignments = [], [], []
+ while len(outputs) < memories.size(0) - 1:
+ memory = memories[len(outputs)]
+ decoder_output, attention_weights, stop_token = self.decode(memory)
+ outputs += [decoder_output.squeeze(1)]
+ stop_tokens += [stop_token.squeeze(1)]
+ alignments += [attention_weights]
+
+ outputs, stop_tokens, alignments = self._parse_outputs(outputs, stop_tokens, alignments)
+ return outputs, alignments, stop_tokens
+
+ def inference(self, inputs):
+ r"""Decoder inference without teacher forcing and use
+ Stopnet to stop decoder.
+ Args:
+ inputs: Encoder outputs.
+
+ Shapes:
+ - inputs: (B, T, D_out_enc)
+ - outputs: (B, T_mel, D_mel)
+ - alignments: (B, T_in, T_out)
+ - stop_tokens: (B, T_out)
+ """
+ memory = self.get_go_frame(inputs)
+ memory = self._update_memory(memory)
+
+ self._init_states(inputs, mask=None)
+ self.attention.init_states(inputs)
+
+ outputs, stop_tokens, alignments, t = [], [], [], 0
+ while True:
+ memory = self.prenet(memory)
+ decoder_output, alignment, stop_token = self.decode(memory)
+ stop_token = torch.sigmoid(stop_token.data)
+ outputs += [decoder_output.squeeze(1)]
+ stop_tokens += [stop_token]
+ alignments += [alignment]
+
+ if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
+ break
+ if len(outputs) == self.max_decoder_steps:
+ print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}")
+ break
+
+ memory = self._update_memory(decoder_output)
+ t += 1
+
+ outputs, stop_tokens, alignments = self._parse_outputs(outputs, stop_tokens, alignments)
+
+ return outputs, alignments, stop_tokens
+
+ def inference_truncated(self, inputs):
+ """
+ Preserve decoder states for continuous inference
+ """
+ if self.memory_truncated is None:
+ self.memory_truncated = self.get_go_frame(inputs)
+ self._init_states(inputs, mask=None, keep_states=False)
+ else:
+ self._init_states(inputs, mask=None, keep_states=True)
+
+ self.attention.init_states(inputs)
+ outputs, stop_tokens, alignments, t = [], [], [], 0
+ while True:
+ memory = self.prenet(self.memory_truncated)
+ decoder_output, alignment, stop_token = self.decode(memory)
+ stop_token = torch.sigmoid(stop_token.data)
+ outputs += [decoder_output.squeeze(1)]
+ stop_tokens += [stop_token]
+ alignments += [alignment]
+
+ if stop_token > 0.7:
+ break
+ if len(outputs) == self.max_decoder_steps:
+                print(" | > Decoder stopped with 'max_decoder_steps'")
+ break
+
+ self.memory_truncated = decoder_output
+ t += 1
+
+ outputs, stop_tokens, alignments = self._parse_outputs(outputs, stop_tokens, alignments)
+
+ return outputs, alignments, stop_tokens
+
+ def inference_step(self, inputs, t, memory=None):
+ """
+ For debug purposes
+ """
+ if t == 0:
+ memory = self.get_go_frame(inputs)
+ self._init_states(inputs, mask=None)
+
+ memory = self.prenet(memory)
+        decoder_output, alignment, stop_token = self.decode(memory)
+ stop_token = torch.sigmoid(stop_token.data)
+ memory = decoder_output
+ return decoder_output, stop_token, alignment
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/arch_utils.py b/submodules/TTS/TTS/tts/layers/tortoise/arch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dad1814369599f0bc637a92624a73dfab99dc1a1
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/arch_utils.py
@@ -0,0 +1,433 @@
+import functools
+import math
+import os
+
+import fsspec
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+from transformers import LogitsWarper
+
+from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
+
+
+def zero_module(module):
+ """
+ Zero out the parameters of a module and return it.
+ """
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
+
+
+class GroupNorm32(nn.GroupNorm):
+ def forward(self, x):
+ return super().forward(x.float()).type(x.dtype)
+
+
+def normalization(channels):
+ """
+ Make a standard normalization layer.
+
+ :param channels: number of input channels.
+ :return: an nn.Module for normalization.
+ """
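+    # Start from a channel-dependent group count (8, 16, or 32) and halve it until it divides
+    # `channels` evenly, e.g. channels=80 -> groups=16, channels=20 -> groups=4.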
+ groups = 32
+ if channels <= 16:
+ groups = 8
+ elif channels <= 64:
+ groups = 16
+ while channels % groups != 0:
+ groups = int(groups / 2)
+ assert groups > 2
+ return GroupNorm32(groups, channels)
+
+
+class QKVAttentionLegacy(nn.Module):
+ """
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
+ """
+
+ def __init__(self, n_heads):
+ super().__init__()
+ self.n_heads = n_heads
+
+ def forward(self, qkv, mask=None, rel_pos=None):
+ """
+ Apply QKV attention.
+
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+ :return: an [N x (H * C) x T] tensor after attention.
+ """
+ bs, width, length = qkv.shape
+ assert width % (3 * self.n_heads) == 0
+ ch = width // (3 * self.n_heads)
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
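+        # q, k, v each have shape (bs * n_heads, ch, length)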
+ scale = 1 / math.sqrt(math.sqrt(ch))
+ weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards
+ if rel_pos is not None:
+ weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(
+ bs * self.n_heads, weight.shape[-2], weight.shape[-1]
+ )
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+ if mask is not None:
+ # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs.
+ mask = mask.repeat(self.n_heads, 1).unsqueeze(1)
+ weight = weight * mask
+ a = torch.einsum("bts,bcs->bct", weight, v)
+
+ return a.reshape(bs, -1, length)
+
+
+class AttentionBlock(nn.Module):
+ """
+ An attention block that allows spatial positions to attend to each other.
+
+ Originally ported from here, but adapted to the N-d case.
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+ """
+
+ def __init__(
+ self,
+ channels,
+ num_heads=1,
+ num_head_channels=-1,
+ do_checkpoint=True,
+ relative_pos_embeddings=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.do_checkpoint = do_checkpoint
+ if num_head_channels == -1:
+ self.num_heads = num_heads
+ else:
+ assert (
+ channels % num_head_channels == 0
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+ self.num_heads = channels // num_head_channels
+ self.norm = normalization(channels)
+ self.qkv = nn.Conv1d(channels, channels * 3, 1)
+ # split heads before split qkv
+ self.attention = QKVAttentionLegacy(self.num_heads)
+
+ self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
+ if relative_pos_embeddings:
+ self.relative_pos_embeddings = RelativePositionBias(
+ scale=(channels // self.num_heads) ** 0.5,
+ causal=False,
+ heads=num_heads,
+ num_buckets=32,
+ max_distance=64,
+ )
+ else:
+ self.relative_pos_embeddings = None
+
+ def forward(self, x, mask=None):
+ b, c, *spatial = x.shape
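+        # Flatten any trailing spatial dims so attention runs over a single length axis: (b, c, *spatial) -> (b, c, T).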
+ x = x.reshape(b, c, -1)
+ qkv = self.qkv(self.norm(x))
+ h = self.attention(qkv, mask, self.relative_pos_embeddings)
+ h = self.proj_out(h)
+ return (x + h).reshape(b, c, *spatial)
+
+
+class Upsample(nn.Module):
+ """
+ An upsampling layer with an optional convolution.
+
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ """
+
+ def __init__(self, channels, use_conv, out_channels=None, factor=4):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.factor = factor
+ if use_conv:
+ ksize = 5
+ pad = 2
+ self.conv = nn.Conv1d(self.channels, self.out_channels, ksize, padding=pad)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ x = F.interpolate(x, scale_factor=self.factor, mode="nearest")
+ if self.use_conv:
+ x = self.conv(x)
+ return x
+
+
+class Downsample(nn.Module):
+ """
+ A downsampling layer with an optional convolution.
+
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ """
+
+ def __init__(self, channels, use_conv, out_channels=None, factor=4, ksize=5, pad=2):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+
+ stride = factor
+ if use_conv:
+ self.op = nn.Conv1d(self.channels, self.out_channels, ksize, stride=stride, padding=pad)
+ else:
+ assert self.channels == self.out_channels
+ self.op = nn.AvgPool1d(kernel_size=stride, stride=stride)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ return self.op(x)
+
+
+class ResBlock(nn.Module):
+ def __init__(
+ self,
+ channels,
+ dropout,
+ out_channels=None,
+ use_conv=False,
+ use_scale_shift_norm=False,
+ up=False,
+ down=False,
+ kernel_size=3,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.dropout = dropout
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.use_scale_shift_norm = use_scale_shift_norm
+ padding = 1 if kernel_size == 3 else 2
+
+ self.in_layers = nn.Sequential(
+ normalization(channels),
+ nn.SiLU(),
+ nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding),
+ )
+
+ self.updown = up or down
+
+ if up:
+ self.h_upd = Upsample(channels, False)
+ self.x_upd = Upsample(channels, False)
+ elif down:
+ self.h_upd = Downsample(channels, False)
+ self.x_upd = Downsample(channels, False)
+ else:
+ self.h_upd = self.x_upd = nn.Identity()
+
+ self.out_layers = nn.Sequential(
+ normalization(self.out_channels),
+ nn.SiLU(),
+ nn.Dropout(p=dropout),
+ zero_module(nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)),
+ )
+
+ if self.out_channels == channels:
+ self.skip_connection = nn.Identity()
+ elif use_conv:
+ self.skip_connection = nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding)
+ else:
+ self.skip_connection = nn.Conv1d(channels, self.out_channels, 1)
+
+ def forward(self, x):
+ if self.updown:
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+ h = in_rest(x)
+ h = self.h_upd(h)
+ x = self.x_upd(x)
+ h = in_conv(h)
+ else:
+ h = self.in_layers(x)
+ h = self.out_layers(h)
+ return self.skip_connection(x) + h
+
+
+class AudioMiniEncoder(nn.Module):
+ def __init__(
+ self,
+ spec_dim,
+ embedding_dim,
+ base_channels=128,
+ depth=2,
+ resnet_blocks=2,
+ attn_blocks=4,
+ num_attn_heads=4,
+ dropout=0,
+ downsample_factor=2,
+ kernel_size=3,
+ ):
+ super().__init__()
+ self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1))
+ ch = base_channels
+ res = []
+ for l in range(depth):
+ for r in range(resnet_blocks):
+ res.append(ResBlock(ch, dropout, kernel_size=kernel_size))
+ res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor))
+ ch *= 2
+ self.res = nn.Sequential(*res)
+ self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1))
+ attn = []
+ for a in range(attn_blocks):
+ attn.append(
+ AttentionBlock(
+ embedding_dim,
+ num_attn_heads,
+ )
+ )
+ self.attn = nn.Sequential(*attn)
+ self.dim = embedding_dim
+
+ def forward(self, x):
+ h = self.init(x)
+ h = self.res(h)
+ h = self.final(h)
+ h = self.attn(h)
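+        # The attention blocks mix information across time, so the first position serves as the utterance embedding.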
+ return h[:, :, 0]
+
+
+DEFAULT_MEL_NORM_FILE = "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth"
+
+
+class TorchMelSpectrogram(nn.Module):
+ def __init__(
+ self,
+ filter_length=1024,
+ hop_length=256,
+ win_length=1024,
+ n_mel_channels=80,
+ mel_fmin=0,
+ mel_fmax=8000,
+ sampling_rate=22050,
+ normalize=False,
+ mel_norm_file=DEFAULT_MEL_NORM_FILE,
+ ):
+ super().__init__()
+ # These are the default tacotron values for the MEL spectrogram.
+ self.filter_length = filter_length
+ self.hop_length = hop_length
+ self.win_length = win_length
+ self.n_mel_channels = n_mel_channels
+ self.mel_fmin = mel_fmin
+ self.mel_fmax = mel_fmax
+ self.sampling_rate = sampling_rate
+ self.mel_stft = torchaudio.transforms.MelSpectrogram(
+ n_fft=self.filter_length,
+ hop_length=self.hop_length,
+ win_length=self.win_length,
+ power=2,
+ normalized=normalize,
+ sample_rate=self.sampling_rate,
+ f_min=self.mel_fmin,
+ f_max=self.mel_fmax,
+ n_mels=self.n_mel_channels,
+ norm="slaney",
+ )
+ self.mel_norm_file = mel_norm_file
+ if self.mel_norm_file is not None:
+ with fsspec.open(self.mel_norm_file) as f:
+ self.mel_norms = torch.load(f)
+ else:
+ self.mel_norms = None
+
+ def forward(self, inp):
+ if (
+ len(inp.shape) == 3
+ ): # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
+ inp = inp.squeeze(1)
+ assert len(inp.shape) == 2
+ self.mel_stft = self.mel_stft.to(inp.device)
+ mel = self.mel_stft(inp)
+ # Perform dynamic range compression
+ mel = torch.log(torch.clamp(mel, min=1e-5))
+ if self.mel_norms is not None:
+ self.mel_norms = self.mel_norms.to(mel.device)
+ mel = mel / self.mel_norms.unsqueeze(0).unsqueeze(-1)
+ return mel
+
+
+class CheckpointedLayer(nn.Module):
+ """
+    Wraps a module. forward() asserts that no tensor kwargs require grad (which would break
+    checkpointing) and then calls the wrapped module with those kwargs bound via functools.partial.
+ """
+
+ def __init__(self, wrap):
+ super().__init__()
+ self.wrap = wrap
+
+ def forward(self, x, *args, **kwargs):
+ for k, v in kwargs.items():
+ assert not (isinstance(v, torch.Tensor) and v.requires_grad) # This would screw up checkpointing.
+ partial = functools.partial(self.wrap, **kwargs)
+ return partial(x, *args)
+
+
+class CheckpointedXTransformerEncoder(nn.Module):
+ """
+    Wraps a ContinuousTransformerWrapper, applies CheckpointedLayer to each attention layer, and
+    permutes between the channels-first layout and the channels-last layout that XTransformer expects.
+ """
+
+ def __init__(self, needs_permute=True, exit_permute=True, checkpoint=True, **xtransformer_kwargs):
+ super().__init__()
+ self.transformer = ContinuousTransformerWrapper(**xtransformer_kwargs)
+ self.needs_permute = needs_permute
+ self.exit_permute = exit_permute
+
+ if not checkpoint:
+ return
+ for i in range(len(self.transformer.attn_layers.layers)):
+ n, b, r = self.transformer.attn_layers.layers[i]
+ self.transformer.attn_layers.layers[i] = nn.ModuleList([n, CheckpointedLayer(b), r])
+
+ def forward(self, x, **kwargs):
+ if self.needs_permute:
+ x = x.permute(0, 2, 1)
+ h = self.transformer(x, **kwargs)
+ if self.exit_permute:
+ h = h.permute(0, 2, 1)
+ return h
+
+
+class TypicalLogitsWarper(LogitsWarper):
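+    """Locally typical sampling (Meister et al., 2022): keeps the tokens whose information content
+    is closest to the entropy of the predicted distribution, up to cumulative probability `mass`."""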
+ def __init__(
+ self,
+ mass: float = 0.9,
+ filter_value: float = -float("Inf"),
+ min_tokens_to_keep: int = 1,
+ ):
+ self.filter_value = filter_value
+ self.mass = mass
+ self.min_tokens_to_keep = min_tokens_to_keep
+
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+ # calculate entropy
+ normalized = torch.nn.functional.log_softmax(scores, dim=-1)
+ p = torch.exp(normalized)
+ ent = -(normalized * p).nansum(-1, keepdim=True)
+
+ # shift and sort
+ shifted_scores = torch.abs((-normalized) - ent)
+ sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
+ sorted_logits = scores.gather(-1, sorted_indices)
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+
+ # Remove tokens with cumulative mass above the threshold
+ last_ind = (cumulative_probs < self.mass).sum(dim=1)
+ last_ind[last_ind < 0] = 0
+ sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
+ if self.min_tokens_to_keep > 1:
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+ sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+
+ scores = scores.masked_fill(indices_to_remove, self.filter_value)
+ return scores
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/audio_utils.py b/submodules/TTS/TTS/tts/layers/tortoise/audio_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..70711ed7a485ecd4a8c8eb8ab6c338aa79871de7
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/audio_utils.py
@@ -0,0 +1,177 @@
+import os
+from glob import glob
+from typing import Dict, List
+
+import librosa
+import numpy as np
+import torch
+import torchaudio
+from scipy.io.wavfile import read
+
+from TTS.utils.audio.torch_transforms import TorchSTFT
+
+
+def load_wav_to_torch(full_path):
+ sampling_rate, data = read(full_path)
+ if data.dtype == np.int32:
+ norm_fix = 2**31
+ elif data.dtype == np.int16:
+ norm_fix = 2**15
+ elif data.dtype == np.float16 or data.dtype == np.float32:
+ norm_fix = 1.0
+ else:
+ raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
+ return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
+
+
+def check_audio(audio, audiopath: str):
+ # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
+ # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
+ if torch.any(audio > 2) or not torch.any(audio < 0):
+ print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
+ audio.clip_(-1, 1)
+
+
+def read_audio_file(audiopath: str):
+ if audiopath[-4:] == ".wav":
+ audio, lsr = load_wav_to_torch(audiopath)
+ elif audiopath[-4:] == ".mp3":
+ audio, lsr = librosa.load(audiopath, sr=None)
+ audio = torch.FloatTensor(audio)
+ else:
+ assert False, f"Unsupported audio format provided: {audiopath[-4:]}"
+
+ # Remove any channel data.
+ if len(audio.shape) > 1:
+ if audio.shape[0] < 5:
+ audio = audio[0]
+ else:
+ assert audio.shape[1] < 5
+ audio = audio[:, 0]
+
+ return audio, lsr
+
+
+def load_required_audio(audiopath: str):
+ audio, lsr = read_audio_file(audiopath)
+
+ audios = [torchaudio.functional.resample(audio, lsr, sampling_rate) for sampling_rate in (22050, 24000)]
+ for audio in audios:
+ check_audio(audio, audiopath)
+
+ return [audio.unsqueeze(0) for audio in audios]
+
+
+def load_audio(audiopath, sampling_rate):
+ audio, lsr = read_audio_file(audiopath)
+
+ if lsr != sampling_rate:
+ audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
+ check_audio(audio, audiopath)
+
+ return audio.unsqueeze(0)
+
+
+TACOTRON_MEL_MAX = 2.3143386840820312
+TACOTRON_MEL_MIN = -11.512925148010254
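+# TACOTRON_MEL_MIN is the float32 value of ln(1e-5), the clamp floor used in dynamic_range_compression
+# below; normalize_tacotron_mel maps log-mels from [MIN, MAX] into [-1, 1] and denormalize_tacotron_mel inverts it.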
+
+
+def denormalize_tacotron_mel(norm_mel):
+ return ((norm_mel + 1) / 2) * (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN) + TACOTRON_MEL_MIN
+
+
+def normalize_tacotron_mel(mel):
+ return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
+
+
+def get_voices(extra_voice_dirs: List[str] = []):
+ dirs = extra_voice_dirs
+ voices: Dict[str, List[str]] = {}
+ for d in dirs:
+ subs = os.listdir(d)
+ for sub in subs:
+ subj = os.path.join(d, sub)
+ if os.path.isdir(subj):
+ voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3")) + list(glob(f"{subj}/*.pth"))
+ return voices
+
+
+def load_voice(voice: str, extra_voice_dirs: List[str] = []):
+ if voice == "random":
+ return None, None
+
+ voices = get_voices(extra_voice_dirs)
+ paths = voices[voice]
+ if len(paths) == 1 and paths[0].endswith(".pth"):
+ return None, torch.load(paths[0])
+ else:
+ conds = []
+ for cond_path in paths:
+ c = load_required_audio(cond_path)
+ conds.append(c)
+ return conds, None
+
+
+def load_voices(voices: List[str], extra_voice_dirs: List[str] = []):
+ latents = []
+ clips = []
+ for voice in voices:
+ if voice == "random":
+ if len(voices) > 1:
+ print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
+ return None, None
+ clip, latent = load_voice(voice, extra_voice_dirs)
+ if latent is None:
+ assert (
+ len(latents) == 0
+ ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+ clips.extend(clip)
+ elif clip is None:
+ assert (
+ len(clips) == 0
+ ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+ latents.append(latent)
+ if len(latents) == 0:
+ return clips, None
+ else:
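+        # Average the per-voice latent tuples element-wise to combine multiple latent voices.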
+ latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
+ latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
+ latents = (latents_0, latents_1)
+ return None, latents
+
+
+def wav_to_univnet_mel(wav, do_normalization=False, device="cuda"):
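+    # Produces a 100-bin log-compressed mel at 24 kHz (the layout expected by the univnet-style
+    # vocoder path), optionally normalized to [-1, 1] with the tacotron bounds above.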
+ stft = TorchSTFT(
+ n_fft=1024,
+ hop_length=256,
+ win_length=1024,
+ use_mel=True,
+ n_mels=100,
+ sample_rate=24000,
+ mel_fmin=0,
+ mel_fmax=12000,
+ )
+ stft = stft.to(device)
+ mel = stft(wav)
+ mel = dynamic_range_compression(mel)
+ if do_normalization:
+ mel = normalize_tacotron_mel(mel)
+ return mel
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/autoregressive.py b/submodules/TTS/TTS/tts/layers/tortoise/autoregressive.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d881bc1029ef577f24ae28f9414e431661142a
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/autoregressive.py
@@ -0,0 +1,631 @@
+# AGPL: a notification must be added stating that changes have been made to that file.
+import functools
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, TypicalLogitsWarper
+
+
+def null_position_embeddings(range, dim):
+ return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)
+
+
+def _p(t):
+ return t and (len(t), len(t[0]), t[0][0].shape) # kv_cache debug
+
+
+class ResBlock(nn.Module):
+ """
+ Basic residual convolutional block that uses GroupNorm.
+ """
+
+ def __init__(self, chan):
+ super().__init__()
+ self.net = nn.Sequential(
+ nn.Conv1d(chan, chan, kernel_size=3, padding=1),
+ nn.GroupNorm(chan // 8, chan),
+ nn.ReLU(),
+ nn.Conv1d(chan, chan, kernel_size=3, padding=1),
+ nn.GroupNorm(chan // 8, chan),
+ )
+
+ def forward(self, x):
+ return F.relu(self.net(x) + x)
+
+
+class GPT2InferenceModel(GPT2PreTrainedModel):
+ def __init__(self, config, gpt, text_pos_emb, embeddings, norm, linear, kv_cache):
+ super().__init__(config)
+ self.transformer = gpt
+ self.text_pos_embedding = text_pos_emb
+ self.embeddings = embeddings
+ self.lm_head = nn.Sequential(norm, linear)
+        self.kv_cache = kv_cache
+        self.cached_mel_emb = None
+
+ def store_mel_emb(self, mel_emb):
+ self.cached_mel_emb = mel_emb
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+ token_type_ids = kwargs.get("token_type_ids", None) # usually None
+ if not self.kv_cache:
+ past_key_values = None
+ # only last token for inputs_ids if past is defined in kwargs
+ if past_key_values:
+ input_ids = input_ids[:, -1].unsqueeze(-1)
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
+
+ attention_mask = kwargs.get("attention_mask", None)
+ position_ids = kwargs.get("position_ids", None)
+
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+ else:
+ position_ids = None
+ return {
+ "input_ids": input_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "position_ids": position_ids,
+ "attention_mask": attention_mask,
+ "token_type_ids": token_type_ids,
+ }
+
+ def forward(
+ self,
+ input_ids=None,
+ past_key_values=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ assert self.cached_mel_emb is not None
+ assert inputs_embeds is None # Not supported by this inference model.
+ assert labels is None # Training not supported by this inference model.
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # Create embedding
+ mel_len = self.cached_mel_emb.shape[1]
+ if input_ids.shape[1] != 1:
+ text_inputs = input_ids[:, mel_len:]
+ text_emb = self.embeddings(text_inputs)
+ text_emb = text_emb + self.text_pos_embedding(text_emb)
+ if self.cached_mel_emb.shape[0] != text_emb.shape[0]:
+ mel_emb = self.cached_mel_emb.repeat_interleave(text_emb.shape[0] // self.cached_mel_emb.shape[0], 0)
+ else: # this outcome only occurs once per loop in most cases
+ mel_emb = self.cached_mel_emb
+ emb = torch.cat([mel_emb, text_emb], dim=1)
+ else:
+ emb = self.embeddings(input_ids)
+ emb = emb + self.text_pos_embedding.get_fixed_embedding(
+ attention_mask.shape[1] - mel_len, attention_mask.device
+ )
+
+ transformer_outputs = self.transformer(
+ inputs_embeds=emb,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ lm_logits = self.lm_head(hidden_states)
+
+ if not return_dict:
+ return (lm_logits,) + transformer_outputs[1:]
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=None,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ cross_attentions=transformer_outputs.cross_attentions,
+ )
+
+ @staticmethod
+ def _reorder_cache(past, beam_idx):
+ """
+ This function is used to re-order the :obj:`past_key_values` cache if
+ :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
+ called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ """
+ return tuple(
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+ for layer_past in past
+ )
+
+
+class ConditioningEncoder(nn.Module):
+ def __init__(
+ self,
+ spec_dim,
+ embedding_dim,
+ attn_blocks=6,
+ num_attn_heads=4,
+ do_checkpointing=False,
+ mean=False,
+ ):
+ super().__init__()
+ attn = []
+ self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1)
+ for a in range(attn_blocks):
+ attn.append(AttentionBlock(embedding_dim, num_attn_heads))
+ self.attn = nn.Sequential(*attn)
+ self.dim = embedding_dim
+ self.do_checkpointing = do_checkpointing
+ self.mean = mean
+
+ def forward(self, x):
+ h = self.init(x)
+ h = self.attn(h)
+ if self.mean:
+ return h.mean(dim=2)
+ else:
+ return h[:, :, 0]
+
+
+class LearnedPositionEmbeddings(nn.Module):
+ def __init__(self, seq_len, model_dim, init=0.02):
+ super().__init__()
+ self.emb = nn.Embedding(seq_len, model_dim)
+ # Initializing this way is standard for GPT-2
+ self.emb.weight.data.normal_(mean=0.0, std=init)
+
+ def forward(self, x):
+ sl = x.shape[1]
+ return self.emb(torch.arange(0, sl, device=x.device))
+
+ def get_fixed_embedding(self, ind, dev):
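+        # Embedding of the single absolute position `ind - 1`, shaped (1, model_dim).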
+ return self.emb(torch.arange(0, ind, device=dev))[ind - 1 : ind]
+
+
+def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
+ """
+ GPT-2 implemented by the HuggingFace library.
+ """
+ from transformers import GPT2Config, GPT2Model
+
+ gpt_config = GPT2Config(
+ vocab_size=256, # Unused.
+ n_positions=max_mel_seq_len + max_text_seq_len,
+ n_ctx=max_mel_seq_len + max_text_seq_len,
+ n_embd=model_dim,
+ n_layer=layers,
+ n_head=heads,
+ gradient_checkpointing=checkpointing,
+ use_cache=not checkpointing,
+ )
+ gpt = GPT2Model(gpt_config)
+ # Override the built in positional embeddings
+ del gpt.wpe # TODO: figure out relevance in fixing exported model definition: Embedding(1012, 1024)
+ gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim)
+ # Built-in token embeddings are unused.
+ del gpt.wte
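+    # The trailing None values stand in for per-layer positional embeddings, which this build does not use.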
+ return (
+ gpt,
+ LearnedPositionEmbeddings(max_mel_seq_len, model_dim),
+ LearnedPositionEmbeddings(max_text_seq_len, model_dim),
+ None,
+ None,
+ )
+
+
+class MelEncoder(nn.Module):
+ def __init__(self, channels, mel_channels=80, resblocks_per_reduction=2):
+ super().__init__()
+ self.channels = channels
+ self.encoder = nn.Sequential(
+ nn.Conv1d(mel_channels, channels // 4, kernel_size=3, padding=1),
+ nn.Sequential(*[ResBlock(channels // 4) for _ in range(resblocks_per_reduction)]),
+ nn.Conv1d(channels // 4, channels // 2, kernel_size=3, stride=2, padding=1),
+ nn.GroupNorm(channels // 16, channels // 2),
+ nn.ReLU(),
+ nn.Sequential(*[ResBlock(channels // 2) for _ in range(resblocks_per_reduction)]),
+ nn.Conv1d(channels // 2, channels, kernel_size=3, stride=2, padding=1),
+ nn.GroupNorm(channels // 8, channels),
+ nn.ReLU(),
+ nn.Sequential(*[ResBlock(channels) for _ in range(resblocks_per_reduction)]),
+ )
+ self.reduction = 4
+
+ def forward(self, x):
+ for e in self.encoder:
+ x = e(x)
+ return x.permute(0, 2, 1)
+
+
+class UnifiedVoice(nn.Module):
+ def __init__(
+ self,
+ layers=8,
+ model_dim=512,
+ heads=8,
+ max_text_tokens=120,
+ max_mel_tokens=250,
+ max_conditioning_inputs=1,
+ mel_length_compression=1024,
+ number_text_tokens=256,
+ start_text_token=None,
+ number_mel_codes=8194,
+ start_mel_token=8192,
+ stop_mel_token=8193,
+ train_solo_embeddings=False,
+ use_mel_codes_as_input=True,
+ checkpointing=True,
+ types=1,
+ ):
+ """
+ Args:
+ layers: Number of layers in transformer stack.
+ model_dim: Operating dimensions of the transformer
+            heads: Number of attention heads. model_dim must be divisible by heads; model_dim // 64 is a sensible default.
+ max_text_tokens: Maximum number of text tokens that will be encountered by model.
+ max_mel_tokens: Maximum number of MEL tokens that will be encountered by model.
+ max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s).
+            mel_length_compression: The compression ratio between raw audio samples and MEL codes. Used to compute MEL code padding given wav input length.
+ number_text_tokens:
+ start_text_token:
+ stop_text_token:
+ number_mel_codes:
+ start_mel_token:
+ stop_mel_token:
+ train_solo_embeddings:
+ use_mel_codes_as_input:
+ checkpointing:
+ """
+ super().__init__()
+
+ self.number_text_tokens = number_text_tokens
+ self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token
+ self.stop_text_token = 0
+ self.number_mel_codes = number_mel_codes
+ self.start_mel_token = start_mel_token
+ self.stop_mel_token = stop_mel_token
+ self.layers = layers
+ self.heads = heads
+ self.max_mel_tokens = max_mel_tokens
+ self.max_text_tokens = max_text_tokens
+ self.model_dim = model_dim
+ self.max_conditioning_inputs = max_conditioning_inputs
+ self.mel_length_compression = mel_length_compression
+ self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
+ self.text_embedding = nn.Embedding(self.number_text_tokens * types + 1, model_dim)
+ if use_mel_codes_as_input:
+ self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim)
+ else:
+ self.mel_embedding = MelEncoder(model_dim, resblocks_per_reduction=1)
+ (
+ self.gpt,
+ self.mel_pos_embedding,
+ self.text_pos_embedding,
+ self.mel_layer_pos_embedding,
+ self.text_layer_pos_embedding,
+ ) = build_hf_gpt_transformer(
+ layers,
+ model_dim,
+ heads,
+ self.max_mel_tokens + 2 + self.max_conditioning_inputs,
+ self.max_text_tokens + 2,
+ checkpointing,
+ )
+ if train_solo_embeddings:
+ self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True)
+ self.text_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True)
+ else:
+ self.mel_solo_embedding = 0
+ self.text_solo_embedding = 0
+
+ self.final_norm = nn.LayerNorm(model_dim)
+ self.text_head = nn.Linear(model_dim, self.number_text_tokens * types + 1)
+ self.mel_head = nn.Linear(model_dim, self.number_mel_codes)
+
+ # Initialize the embeddings per the GPT-2 scheme
+ embeddings = [self.text_embedding]
+ if use_mel_codes_as_input:
+ embeddings.append(self.mel_embedding)
+ for module in embeddings:
+ module.weight.data.normal_(mean=0.0, std=0.02)
+
+ def post_init_gpt2_config(self, kv_cache=True):
+ seq_length = self.max_mel_tokens + self.max_text_tokens + 2
+ gpt_config = GPT2Config(
+ vocab_size=self.max_mel_tokens,
+ n_positions=seq_length,
+ n_ctx=seq_length,
+ n_embd=self.model_dim,
+ n_layer=self.layers,
+ n_head=self.heads,
+ gradient_checkpointing=False,
+ use_cache=True,
+ )
+ self.inference_model = GPT2InferenceModel(
+ gpt_config,
+ self.gpt,
+ self.mel_pos_embedding,
+ self.mel_embedding,
+ self.final_norm,
+ self.mel_head,
+ kv_cache=kv_cache,
+ )
+ # self.inference_model = PrunedGPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head)
+ self.gpt.wte = self.mel_embedding
+ # self.inference_model.save_pretrained("")
+
+ def build_aligned_inputs_and_targets(self, input, start_token, stop_token):
+ inp = F.pad(input, (1, 0), value=start_token)
+ tar = F.pad(input, (0, 1), value=stop_token)
+ return inp, tar
+
+ def set_mel_padding(self, mel_input_tokens, wav_lengths):
+ """
+ Given mel tokens that are derived from a padded audio clip and the actual lengths of each batch element in
+ that audio clip, reformats the tokens with STOP_MEL_TOKEN in place of the zero padding. This is required
+ preformatting to create a working TTS model.
+ """
+        # Set padding areas within the MEL codes (padding is currently encoded with the MEL code for zero).
+ mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode="trunc")
+ for b in range(len(mel_lengths)):
+ actual_end = (
+ mel_lengths[b] + 1
+ ) # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token.
+ if actual_end < mel_input_tokens.shape[-1]:
+ mel_input_tokens[b, actual_end:] = self.stop_mel_token
+ return mel_input_tokens
+
+ def get_logits(
+ self,
+ speech_conditioning_inputs,
+ first_inputs,
+ first_head,
+ second_inputs=None,
+ second_head=None,
+ get_attns=False,
+ return_latent=False,
+ ):
+ if second_inputs is not None:
+ emb = torch.cat([speech_conditioning_inputs, first_inputs, second_inputs], dim=1)
+ else:
+ emb = torch.cat([speech_conditioning_inputs, first_inputs], dim=1)
+
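+        # emb layout along dim=1: [conditioning | first_inputs | second_inputs (optional)].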
+ gpt_out = self.gpt(inputs_embeds=emb, return_dict=True, output_attentions=get_attns)
+ if get_attns:
+ return gpt_out.attentions
+
+ enc = gpt_out.last_hidden_state[:, 1:] # The first logit is tied to the speech_conditioning_input
+ enc = self.final_norm(enc)
+
+ if return_latent:
+ return (
+ enc[
+ :,
+ speech_conditioning_inputs.shape[1] : speech_conditioning_inputs.shape[1] + first_inputs.shape[1],
+ ],
+ enc[:, -second_inputs.shape[1] :],
+ )
+
+ first_logits = enc[:, : first_inputs.shape[1]]
+ first_logits = first_head(first_logits)
+ first_logits = first_logits.permute(0, 2, 1)
+ if second_inputs is not None:
+ second_logits = enc[:, -second_inputs.shape[1] :]
+ second_logits = second_head(second_logits)
+ second_logits = second_logits.permute(0, 2, 1)
+ return first_logits, second_logits
+ else:
+ return first_logits
+
+ def get_conditioning(self, speech_conditioning_input):
+ speech_conditioning_input = (
+ speech_conditioning_input.unsqueeze(1)
+ if len(speech_conditioning_input.shape) == 3
+ else speech_conditioning_input
+ )
+ conds = []
+ for j in range(speech_conditioning_input.shape[1]):
+ conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
+ conds = torch.stack(conds, dim=1)
+ conds = conds.mean(dim=1)
+ return conds
+
+ def forward(
+ self,
+ speech_conditioning_latent,
+ text_inputs,
+ text_lengths,
+ mel_codes,
+ wav_lengths,
+ types=None,
+ text_first=True,
+ raw_mels=None,
+ return_attentions=False,
+ return_latent=False,
+ clip_inputs=True,
+ ):
+ """
+ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
+ (actuated by `text_first`).
+
+        speech_conditioning_latent: conditioning latent from get_conditioning(), (b, model_dim)
+ text_inputs: long tensor, (b,t)
+ text_lengths: long tensor, (b,)
+        mel_codes: long tensor, (b,m)
+ wav_lengths: long tensor, (b,)
+ raw_mels: MEL float tensor (b,80,s)
+
+ If return_attentions is specified, only logits are returned.
+ If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned.
+ If clip_inputs is True, the inputs will be clipped to the smallest input size across each input modality.
+ """
+ # Types are expressed by expanding the text embedding space.
+ if types is not None:
+ text_inputs = text_inputs * (1 + types).unsqueeze(-1)
+
+ if clip_inputs:
+ # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
+ # chopping the inputs by the maximum actual length.
+ max_text_len = text_lengths.max()
+ text_inputs = text_inputs[:, :max_text_len]
+ max_mel_len = wav_lengths.max() // self.mel_length_compression
+ mel_codes = mel_codes[:, :max_mel_len]
+ if raw_mels is not None:
+ raw_mels = raw_mels[:, :, : max_mel_len * 4]
+ mel_codes = self.set_mel_padding(mel_codes, wav_lengths)
+ text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
+ mel_codes = F.pad(mel_codes, (0, 1), value=self.stop_mel_token)
+
+ conds = speech_conditioning_latent.unsqueeze(1)
+ text_inputs, text_targets = self.build_aligned_inputs_and_targets(
+ text_inputs, self.start_text_token, self.stop_text_token
+ )
+ text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
+ mel_codes, mel_targets = self.build_aligned_inputs_and_targets(
+ mel_codes, self.start_mel_token, self.stop_mel_token
+ )
+ if raw_mels is not None:
+ mel_inp = F.pad(raw_mels, (0, 8))
+ else:
+ mel_inp = mel_codes
+ mel_emb = self.mel_embedding(mel_inp)
+ mel_emb = mel_emb + self.mel_pos_embedding(mel_codes)
+
+ if text_first:
+ text_logits, mel_logits = self.get_logits(
+ conds,
+ text_emb,
+ self.text_head,
+ mel_emb,
+ self.mel_head,
+ get_attns=return_attentions,
+ return_latent=return_latent,
+ )
+ if return_latent:
+ return mel_logits[
+ :, :-2
+ ] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass.
+ else:
+ mel_logits, text_logits = self.get_logits(
+ conds,
+ mel_emb,
+ self.mel_head,
+ text_emb,
+ self.text_head,
+ get_attns=return_attentions,
+ return_latent=return_latent,
+ )
+ if return_latent:
+ return text_logits[
+ :, :-2
+ ] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass.
+
+ if return_attentions:
+ return mel_logits
+ loss_text = F.cross_entropy(text_logits, text_targets.long())
+ loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
+ return loss_text.mean(), loss_mel.mean(), mel_logits
+
+ def inference_speech(
+ self,
+ speech_conditioning_latent,
+ text_inputs,
+ input_tokens=None,
+ num_return_sequences=1,
+ max_generate_length=None,
+ typical_sampling=False,
+ typical_mass=0.9,
+ **hf_generate_kwargs,
+ ):
+ text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
+ text_inputs, text_targets = self.build_aligned_inputs_and_targets(
+ text_inputs, self.start_text_token, self.stop_text_token
+ )
+ text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
+
+ conds = speech_conditioning_latent.unsqueeze(1)
+ emb = torch.cat([conds, text_emb], dim=1)
+ self.inference_model.store_mel_emb(emb)
+
+ fake_inputs = torch.full(
+ (
+ emb.shape[0],
+ conds.shape[1] + emb.shape[1],
+ ),
+ fill_value=1,
+ dtype=torch.long,
+ device=text_inputs.device,
+ )
+ fake_inputs[:, -1] = self.start_mel_token
+ trunc_index = fake_inputs.shape[1]
+ if input_tokens is None:
+ inputs = fake_inputs
+ else:
+ assert (
+ num_return_sequences % input_tokens.shape[0] == 0
+ ), "The number of return sequences must be divisible by the number of input sequences"
+ fake_inputs = fake_inputs.repeat(num_return_sequences, 1)
+ input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1)
+ inputs = torch.cat([fake_inputs, input_tokens], dim=1)
+
+ logits_processor = (
+ LogitsProcessorList([TypicalLogitsWarper(mass=typical_mass)]) if typical_sampling else LogitsProcessorList()
+ ) # TODO disable this
+ max_length = (
+ trunc_index + self.max_mel_tokens - 1 if max_generate_length is None else trunc_index + max_generate_length
+ )
+ gen = self.inference_model.generate(
+ inputs,
+ bos_token_id=self.start_mel_token,
+ pad_token_id=self.stop_mel_token,
+ eos_token_id=self.stop_mel_token,
+ max_length=max_length,
+ logits_processor=logits_processor,
+ num_return_sequences=num_return_sequences,
+ **hf_generate_kwargs,
+ )
+ return gen[:, trunc_index:]
+
+
+if __name__ == "__main__":
+ gpt = UnifiedVoice(
+ model_dim=256,
+ heads=4,
+ train_solo_embeddings=True,
+ use_mel_codes_as_input=True,
+ max_conditioning_inputs=4,
+ )
+    # forward() expects a conditioning latent rather than raw conditioning MELs, so build one first.
+    cond_latent = gpt.get_conditioning(torch.randn(2, 3, 80, 800))
+    l = gpt(
+        cond_latent,
+        torch.randint(high=120, size=(2, 120)),
+        torch.tensor([32, 120]),
+        torch.randint(high=8192, size=(2, 250)),
+        torch.tensor([250 * 256, 195 * 256]),
+    )
+    # text_forward() is not implemented by this model, so the original text-only smoke test is omitted.
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/classifier.py b/submodules/TTS/TTS/tts/layers/tortoise/classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..8764bb070b5ad8267ee2992ccc33f5bb65bad005
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/classifier.py
@@ -0,0 +1,144 @@
+import torch
+import torch.nn as nn
+
+from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, Downsample, Upsample, normalization, zero_module
+
+
+class ResBlock(nn.Module):
+ def __init__(
+ self,
+ channels,
+ dropout,
+ out_channels=None,
+ use_conv=False,
+ use_scale_shift_norm=False,
+ dims=2,
+ up=False,
+ down=False,
+ kernel_size=3,
+ do_checkpoint=True,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.dropout = dropout
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.use_scale_shift_norm = use_scale_shift_norm
+ self.do_checkpoint = do_checkpoint
+ padding = 1 if kernel_size == 3 else 2
+
+ self.in_layers = nn.Sequential(
+ normalization(channels),
+ nn.SiLU(),
+ nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding),
+ )
+
+ self.updown = up or down
+
+ if up:
+            self.h_upd = Upsample(channels, False)
+            self.x_upd = Upsample(channels, False)
+        elif down:
+            self.h_upd = Downsample(channels, False)
+            self.x_upd = Downsample(channels, False)
+ else:
+ self.h_upd = self.x_upd = nn.Identity()
+
+ self.out_layers = nn.Sequential(
+ normalization(self.out_channels),
+ nn.SiLU(),
+ nn.Dropout(p=dropout),
+ zero_module(nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)),
+ )
+
+ if self.out_channels == channels:
+ self.skip_connection = nn.Identity()
+ elif use_conv:
+            self.skip_connection = nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding)
+        else:
+            self.skip_connection = nn.Conv1d(channels, self.out_channels, 1)
+
+ def forward(self, x):
+ if self.updown:
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+ h = in_rest(x)
+ h = self.h_upd(h)
+ x = self.x_upd(x)
+ h = in_conv(h)
+ else:
+ h = self.in_layers(x)
+ h = self.out_layers(h)
+ return self.skip_connection(x) + h
+
+
+class AudioMiniEncoder(nn.Module):
+ def __init__(
+ self,
+ spec_dim,
+ embedding_dim,
+ base_channels=128,
+ depth=2,
+ resnet_blocks=2,
+ attn_blocks=4,
+ num_attn_heads=4,
+ dropout=0,
+ downsample_factor=2,
+ kernel_size=3,
+ ):
+ super().__init__()
+ self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1))
+ ch = base_channels
+ res = []
+ self.layers = depth
+ for l in range(depth):
+ for r in range(resnet_blocks):
+ res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size))
+ res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor))
+ ch *= 2
+ self.res = nn.Sequential(*res)
+ self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1))
+ attn = []
+ for a in range(attn_blocks):
+ attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False))
+ self.attn = nn.Sequential(*attn)
+ self.dim = embedding_dim
+
+ def forward(self, x):
+ h = self.init(x)
+ h = self.res(h)
+ h = self.final(h)
+ for blk in self.attn:
+ h = blk(h)
+ return h[:, :, 0]
+
+
+class AudioMiniEncoderWithClassifierHead(nn.Module):
+ def __init__(self, classes, distribute_zero_label=True, **kwargs):
+ super().__init__()
+ self.enc = AudioMiniEncoder(**kwargs)
+ self.head = nn.Linear(self.enc.dim, classes)
+ self.num_classes = classes
+ self.distribute_zero_label = distribute_zero_label
+
+ def forward(self, x, labels=None):
+ h = self.enc(x)
+ logits = self.head(h)
+ if labels is None:
+ return logits
+ else:
+ if self.distribute_zero_label:
+ oh_labels = nn.functional.one_hot(labels, num_classes=self.num_classes)
+ zeros_indices = (labels == 0).unsqueeze(-1)
+ # Distribute 20% of the probability mass on all classes when zero is specified, to compensate for dataset noise.
+ zero_extra_mass = torch.full_like(
+ oh_labels,
+ dtype=torch.float,
+ fill_value=0.2 / (self.num_classes - 1),
+ )
+ zero_extra_mass[:, 0] = -0.2
+ zero_extra_mass = zero_extra_mass * zeros_indices
+ oh_labels = oh_labels + zero_extra_mass
+ else:
+ oh_labels = labels
+ loss = nn.functional.cross_entropy(logits, oh_labels)
+ return loss
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/clvp.py b/submodules/TTS/TTS/tts/layers/tortoise/clvp.py
new file mode 100644
index 0000000000000000000000000000000000000000..69b8c17c3fe71f55be12b728fa3c8f0e85cefb89
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/clvp.py
@@ -0,0 +1,159 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import einsum
+
+from TTS.tts.layers.tortoise.arch_utils import CheckpointedXTransformerEncoder
+from TTS.tts.layers.tortoise.transformer import Transformer
+from TTS.tts.layers.tortoise.xtransformers import Encoder
+
+
+def exists(val):
+ return val is not None
+
+
+def masked_mean(t, mask, dim=1):
+ t = t.masked_fill(~mask[:, :, None], 0.0)
+ return t.sum(dim=1) / mask.sum(dim=1)[..., None]
+
+
+class CLVP(nn.Module):
+ """
+ CLIP model retrofitted for performing contrastive evaluation between tokenized audio data and the corresponding
+ transcribed text.
+
+ Originally from https://github.com/lucidrains/DALLE-pytorch/blob/main/dalle_pytorch/dalle_pytorch.py
+ """
+
+ def __init__(
+ self,
+ *,
+ dim_text=512,
+ dim_speech=512,
+ dim_latent=512,
+ num_text_tokens=256,
+ text_enc_depth=6,
+ text_seq_len=120,
+ text_heads=8,
+ num_speech_tokens=8192,
+ speech_enc_depth=6,
+ speech_heads=8,
+ speech_seq_len=250,
+ text_mask_percentage=0,
+ voice_mask_percentage=0,
+ wav_token_compression=1024,
+ use_xformers=False,
+ ):
+ super().__init__()
+ self.text_emb = nn.Embedding(num_text_tokens, dim_text)
+ self.to_text_latent = nn.Linear(dim_text, dim_latent, bias=False)
+
+ self.speech_emb = nn.Embedding(num_speech_tokens, dim_speech)
+ self.to_speech_latent = nn.Linear(dim_speech, dim_latent, bias=False)
+
+ if use_xformers:
+ self.text_transformer = CheckpointedXTransformerEncoder(
+ needs_permute=False,
+ exit_permute=False,
+ max_seq_len=-1,
+ attn_layers=Encoder(
+ dim=dim_text,
+ depth=text_enc_depth,
+ heads=text_heads,
+ ff_dropout=0.1,
+ ff_mult=2,
+ attn_dropout=0.1,
+ use_rmsnorm=True,
+ ff_glu=True,
+ rotary_pos_emb=True,
+ ),
+ )
+ self.speech_transformer = CheckpointedXTransformerEncoder(
+ needs_permute=False,
+ exit_permute=False,
+ max_seq_len=-1,
+ attn_layers=Encoder(
+ dim=dim_speech,
+ depth=speech_enc_depth,
+ heads=speech_heads,
+ ff_dropout=0.1,
+ ff_mult=2,
+ attn_dropout=0.1,
+ use_rmsnorm=True,
+ ff_glu=True,
+ rotary_pos_emb=True,
+ ),
+ )
+ else:
+ self.text_transformer = Transformer(
+ causal=False, seq_len=text_seq_len, dim=dim_text, depth=text_enc_depth, heads=text_heads
+ )
+ self.speech_transformer = Transformer(
+ causal=False, seq_len=speech_seq_len, dim=dim_speech, depth=speech_enc_depth, heads=speech_heads
+ )
+
+ self.temperature = nn.Parameter(torch.tensor(1.0))
+ self.text_mask_percentage = text_mask_percentage
+ self.voice_mask_percentage = voice_mask_percentage
+ self.wav_token_compression = wav_token_compression
+ self.xformers = use_xformers
+ if not use_xformers:
+ self.text_pos_emb = nn.Embedding(text_seq_len, dim_text)
+ self.speech_pos_emb = nn.Embedding(num_speech_tokens, dim_speech)
+
+ def forward(self, text, speech_tokens, return_loss=False):
+ b, device = text.shape[0], text.device
+ if self.training:
+ text_mask = torch.rand_like(text.float()) > self.text_mask_percentage
+ voice_mask = torch.rand_like(speech_tokens.float()) > self.voice_mask_percentage
+ else:
+ text_mask = torch.ones_like(text.float()).bool()
+ voice_mask = torch.ones_like(speech_tokens.float()).bool()
+
+ text_emb = self.text_emb(text)
+ speech_emb = self.speech_emb(speech_tokens)
+
+ if not self.xformers:
+ text_emb += self.text_pos_emb(torch.arange(text.shape[1], device=device))
+ speech_emb += self.speech_pos_emb(torch.arange(speech_emb.shape[1], device=device))
+
+ enc_text = self.text_transformer(text_emb, mask=text_mask)
+ enc_speech = self.speech_transformer(speech_emb, mask=voice_mask)
+
+ text_latents = masked_mean(enc_text, text_mask, dim=1)
+ speech_latents = masked_mean(enc_speech, voice_mask, dim=1)
+
+ text_latents = self.to_text_latent(text_latents)
+ speech_latents = self.to_speech_latent(speech_latents)
+
+ text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents))
+
+ temp = self.temperature.exp()
+
+ if not return_loss:
+ sim = einsum("n d, n d -> n", text_latents, speech_latents) * temp
+ return sim
+
+ sim = einsum("i d, j d -> i j", text_latents, speech_latents) * temp
+ labels = torch.arange(b, device=device)
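+        # Symmetric cross-entropy over the text/speech similarity matrix (CLIP-style contrastive loss).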
+ loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
+ return loss
+
+
+if __name__ == "__main__":
+ clip = CLVP(text_mask_percentage=0.2, voice_mask_percentage=0.2)
+    # forward() takes (text, speech_tokens, return_loss); sequence lengths are not part of its signature.
+    clip(
+        torch.randint(0, 256, (2, 120)),
+        torch.randint(0, 8192, (2, 250)),
+        return_loss=True,
+    )
+    nonloss = clip(
+        torch.randint(0, 256, (2, 120)),
+        torch.randint(0, 8192, (2, 250)),
+        return_loss=False,
+    )
+ print(nonloss.shape)
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/diffusion.py b/submodules/TTS/TTS/tts/layers/tortoise/diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bea02ca08a46cb474406014c690b7973e33d55d
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/diffusion.py
@@ -0,0 +1,1242 @@
+"""
+This is an almost carbon copy of gaussian_diffusion.py from OpenAI's ImprovedDiffusion repo, which itself:
+
+This code started out as a PyTorch port of Ho et al's diffusion models:
+https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py
+
+Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
+"""
+
+import enum
+import math
+
+import numpy as np
+import torch
+import torch as th
+from tqdm import tqdm
+
+from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
+
+try:
+ from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+ K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+ K_DIFFUSION_SAMPLERS = None
+
+
+SAMPLERS = ["dpm++2m", "p", "ddim"]
+
+
+def normal_kl(mean1, logvar1, mean2, logvar2):
+ """
+ Compute the KL divergence between two gaussians.
+
+ Shapes are automatically broadcasted, so batches can be compared to
+ scalars, among other use cases.
+ """
+ tensor = None
+ for obj in (mean1, logvar1, mean2, logvar2):
+ if isinstance(obj, th.Tensor):
+ tensor = obj
+ break
+ assert tensor is not None, "at least one argument must be a Tensor"
+
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
+ # Tensors, but it does not work for th.exp().
+ logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)]
+
+ return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2))
+
+
+def approx_standard_normal_cdf(x):
+ """
+ A fast approximation of the cumulative distribution function of the
+ standard normal.
+ """
+ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
+
+
+def discretized_gaussian_log_likelihood(x, *, means, log_scales):
+ """
+ Compute the log-likelihood of a Gaussian distribution discretizing to a
+ given image.
+
+ :param x: the target images. It is assumed that this was uint8 values,
+ rescaled to the range [-1, 1].
+ :param means: the Gaussian mean Tensor.
+ :param log_scales: the Gaussian log stddev Tensor.
+ :return: a tensor like x of log probabilities (in nats).
+ """
+ assert x.shape == means.shape == log_scales.shape
+ centered_x = x - means
+ inv_stdv = th.exp(-log_scales)
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
+ cdf_plus = approx_standard_normal_cdf(plus_in)
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
+ cdf_min = approx_standard_normal_cdf(min_in)
+ log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
+ log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
+ cdf_delta = cdf_plus - cdf_min
+ log_probs = th.where(
+ x < -0.999,
+ log_cdf_plus,
+ th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
+ )
+ assert log_probs.shape == x.shape
+ return log_probs
+
+
+def mean_flat(tensor):
+ """
+ Take the mean over all non-batch dimensions.
+ """
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
+ """
+ Get a pre-defined beta schedule for the given name.
+
+ The beta schedule library consists of beta schedules which remain similar
+ in the limit of num_diffusion_timesteps.
+ Beta schedules may be added, but should not be removed or changed once
+ they are committed to maintain backwards compatibility.
+ """
+ if schedule_name == "linear":
+ # Linear schedule from Ho et al, extended to work for any number of
+ # diffusion steps.
+ scale = 1000 / num_diffusion_timesteps
+ beta_start = scale * 0.0001
+ beta_end = scale * 0.02
+ return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+ elif schedule_name == "cosine":
+ return betas_for_alpha_bar(
+ num_diffusion_timesteps,
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+ )
+ else:
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function,
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
+
+ :param num_diffusion_timesteps: the number of betas to produce.
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+ produces the cumulative product of (1-beta) up to that
+ part of the diffusion process.
+ :param max_beta: the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+ """
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return np.array(betas)
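+# Illustrative sketch: a 1000-step cosine schedule and its cumulative alphas.
+#   betas = get_named_beta_schedule("cosine", 1000)
+#   alphas_cumprod = np.cumprod(1.0 - betas)  # decays monotonically from ~1 toward 0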
+
+
+class ModelMeanType(enum.Enum):
+ """
+ Which type of output the model predicts.
+ """
+
+ PREVIOUS_X = "previous_x" # the model predicts x_{t-1}
+ START_X = "start_x" # the model predicts x_0
+ EPSILON = "epsilon" # the model predicts epsilon
+
+
+class ModelVarType(enum.Enum):
+ """
+ What is used as the model's output variance.
+
+ The LEARNED_RANGE option has been added to allow the model to predict
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
+ """
+
+ LEARNED = "learned"
+ FIXED_SMALL = "fixed_small"
+ FIXED_LARGE = "fixed_large"
+ LEARNED_RANGE = "learned_range"
+
+
+class LossType(enum.Enum):
+ MSE = "mse" # use raw MSE loss (and KL when learning variances)
+ RESCALED_MSE = "rescaled_mse" # use raw MSE loss (with RESCALED_KL when learning variances)
+ KL = "kl" # use the variational lower-bound
+ RESCALED_KL = "rescaled_kl" # like KL, but rescale to estimate the full VLB
+
+ def is_vb(self):
+ return self == LossType.KL or self == LossType.RESCALED_KL
+
+
+class GaussianDiffusion:
+ """
+ Utilities for training and sampling diffusion models.
+
+    Ported directly from here, and then adapted over time for further experimentation.
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
+ starting at T and going to 1.
+ :param model_mean_type: a ModelMeanType determining what the model outputs.
+ :param model_var_type: a ModelVarType determining how variance is output.
+ :param loss_type: a LossType determining the loss function to use.
+ :param rescale_timesteps: if True, pass floating point timesteps into the
+ model so that they are always scaled like in the
+ original paper (0 to 1000).
+ """
+
+ def __init__(
+ self,
+ *,
+ betas,
+ model_mean_type,
+ model_var_type,
+ loss_type,
+ rescale_timesteps=False,
+ conditioning_free=False,
+ conditioning_free_k=1,
+ ramp_conditioning_free=True,
+ sampler="p",
+ ):
+ self.sampler = sampler
+ self.model_mean_type = ModelMeanType(model_mean_type)
+ self.model_var_type = ModelVarType(model_var_type)
+ self.loss_type = LossType(loss_type)
+ self.rescale_timesteps = rescale_timesteps
+ self.conditioning_free = conditioning_free
+ self.conditioning_free_k = conditioning_free_k
+ self.ramp_conditioning_free = ramp_conditioning_free
+
+ # Use float64 for accuracy.
+ betas = np.array(betas, dtype=np.float64)
+ self.betas = betas
+ assert len(betas.shape) == 1, "betas must be 1-D"
+ assert (betas > 0).all() and (betas <= 1).all()
+
+ self.num_timesteps = int(betas.shape[0])
+
+ alphas = 1.0 - betas
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
+
+ # calculations for diffusion q(x_t | x_{t-1}) and others
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
+ self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+ # log calculation clipped because the posterior variance is 0 at the
+ # beginning of the diffusion chain.
+ self.posterior_log_variance_clipped = np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:]))
+ self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+ self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
+
+ def q_mean_variance(self, x_start, t):
+ """
+ Get the distribution q(x_t | x_0).
+
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+ """
+ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+ log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+ return mean, variance, log_variance
+
+ def q_sample(self, x_start, t, noise=None):
+ """
+ Diffuse the data for a given number of diffusion steps.
+
+ In other words, sample from q(x_t | x_0).
+
+ :param x_start: the initial data batch.
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+ :param noise: if specified, the split-out normal noise.
+ :return: A noisy version of x_start.
+ """
+ if noise is None:
+ noise = th.randn_like(x_start)
+ assert noise.shape == x_start.shape
+ return (
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
+ )
+
+ def q_posterior_mean_variance(self, x_start, x_t, t):
+ """
+ Compute the mean and variance of the diffusion posterior:
+
+ q(x_{t-1} | x_t, x_0)
+
+ """
+ assert x_start.shape == x_t.shape
+ posterior_mean = (
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+ )
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+ posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
+ assert (
+ posterior_mean.shape[0]
+ == posterior_variance.shape[0]
+ == posterior_log_variance_clipped.shape[0]
+ == x_start.shape[0]
+ )
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
+
+ def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
+ """
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
+ the initial x, x_0.
+
+ :param model: the model, which takes a signal and a batch of timesteps
+ as input.
+ :param x: the [N x C x ...] tensor at time t.
+ :param t: a 1-D Tensor of timesteps.
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
+ :param denoised_fn: if not None, a function which applies to the
+ x_start prediction before it is used to sample. Applies before
+ clip_denoised.
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
+ pass to the model. This can be used for conditioning.
+ :return: a dict with the following keys:
+ - 'mean': the model mean output.
+ - 'variance': the model variance output.
+ - 'log_variance': the log of 'variance'.
+ - 'pred_xstart': the prediction for x_0.
+ """
+ if model_kwargs is None:
+ model_kwargs = {}
+
+ B, C = x.shape[:2]
+ assert t.shape == (B,)
+ model_output = model(x, self._scale_timesteps(t), **model_kwargs)
+ if self.conditioning_free:
+ model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs)
+
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+ assert model_output.shape == (B, C * 2, *x.shape[2:])
+ model_output, model_var_values = th.split(model_output, C, dim=1)
+ if self.conditioning_free:
+ model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1)
+ if self.model_var_type == ModelVarType.LEARNED:
+ model_log_variance = model_var_values
+ model_variance = th.exp(model_log_variance)
+ else:
+ min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+ # The model_var_values is [-1, 1] for [min_var, max_var].
+ frac = (model_var_values + 1) / 2
+ model_log_variance = frac * max_log + (1 - frac) * min_log
+ model_variance = th.exp(model_log_variance)
+ else:
+ model_variance, model_log_variance = {
+ # for fixedlarge, we set the initial (log-)variance like so
+ # to get a better decoder log likelihood.
+ ModelVarType.FIXED_LARGE: (
+ np.append(self.posterior_variance[1], self.betas[1:]),
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+ ),
+ ModelVarType.FIXED_SMALL: (
+ self.posterior_variance,
+ self.posterior_log_variance_clipped,
+ ),
+ }[self.model_var_type]
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
+
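+        # Conditioning-free guidance: blend the conditioned and unconditioned predictions
+        # as (1 + k) * cond - k * uncond. With ramping enabled, k shrinks at early
+        # (high-noise) timesteps and approaches conditioning_free_k as t nears 0.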
+ if self.conditioning_free:
+ if self.ramp_conditioning_free:
+ assert t.shape[0] == 1 # This should only be used in inference.
+ cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps)
+ else:
+ cfk = self.conditioning_free_k
+ model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning
+
+ def process_xstart(x):
+ if denoised_fn is not None:
+ x = denoised_fn(x)
+ if clip_denoised:
+ return x.clamp(-1, 1)
+ return x
+
+ if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+ pred_xstart = process_xstart(self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output))
+ model_mean = model_output
+ elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
+ if self.model_mean_type == ModelMeanType.START_X:
+ pred_xstart = process_xstart(model_output)
+ else:
+ pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output))
+ model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
+ else:
+ raise NotImplementedError(self.model_mean_type)
+
+ assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
+ return {
+ "mean": model_mean,
+ "variance": model_variance,
+ "log_variance": model_log_variance,
+ "pred_xstart": pred_xstart,
+ }
+
+ def _predict_xstart_from_eps(self, x_t, t, eps):
+ assert x_t.shape == eps.shape
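+        # Invert the forward process: x_0 = sqrt(1 / alpha_bar_t) * x_t - sqrt(1 / alpha_bar_t - 1) * eps.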
+ return (
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+ )
+
+ def _predict_xstart_from_xprev(self, x_t, t, xprev):
+ assert x_t.shape == xprev.shape
+ return ( # (xprev - coef2*x_t) / coef1
+ _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
+ - _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t
+ )
+
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+ return (
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+
+ def _scale_timesteps(self, t):
+ if self.rescale_timesteps:
+ return t.float() * (1000.0 / self.num_timesteps)
+ return t
+
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+ """
+ Compute the mean for the previous step, given a function cond_fn that
+ computes the gradient of a conditional log probability with respect to
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
+ condition on y.
+
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
+ """
+ gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
+ new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
+ return new_mean
+
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+ """
+ Compute what the p_mean_variance output would have been, should the
+ model's score function be conditioned by cond_fn.
+
+ See condition_mean() for details on cond_fn.
+
+ Unlike condition_mean(), this instead uses the conditioning strategy
+ from Song et al (2020).
+ """
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, self._scale_timesteps(t), **model_kwargs)
+
+ out = p_mean_var.copy()
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
+ out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
+ return out
+
+ def k_diffusion_sample_loop(
+ self,
+ k_sampler,
+ pbar,
+ model,
+ shape,
+ noise=None, # all given
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ device=None, # ALL UNUSED
+ model_kwargs=None, # {'precomputed_aligned_embeddings': precomputed_embeddings},
+ progress=False, # unused as well
+ ):
+ assert isinstance(model_kwargs, dict)
+ if device is None:
+ device = next(model.parameters()).device
+ s_in = noise.new_ones([noise.shape[0]])
+
+ def model_split(*args, **kwargs):
+ model_output = model(*args, **kwargs)
+ model_epsilon, model_var = th.split(model_output, model_output.shape[1] // 2, dim=1)
+ return model_epsilon, model_var
+
+        # Continuous-time linear VP noise schedule consumed by DPM-Solver below.
+        # (A discrete schedule built from self.betas, i.e. NoiseScheduleVP("discrete", betas=th.tensor(self.betas)),
+        # could be used instead.)
+        noise_schedule = NoiseScheduleVP(schedule="linear", continuous_beta_0=0.1 / 4, continuous_beta_1=20.0 / 4)
+
+        def model_fn_prewrap(x, t, *args, **kwargs):
+            # model_wrapper's classifier-free guidance path calls this with a doubled batch
+            # (unconditional half first, conditional half second) and continuous timesteps;
+            # undo the doubling and rescale the continuous timesteps to the 0..1000 range
+            # used by the underlying epsilon-prediction model.
+            x, _ = x.chunk(2)
+            t, _ = (t * 1000).chunk(2)
+ res = torch.cat(
+ [
+ model_split(x, t, conditioning_free=True, **model_kwargs)[0],
+ model_split(x, t, **model_kwargs)[0],
+ ]
+ )
+ pbar.update(1)
+ return res
+
+ model_fn = model_wrapper(
+ model_fn_prewrap,
+ noise_schedule,
+ model_type="noise", # "noise" or "x_start" or "v" or "score"
+ model_kwargs=model_kwargs,
+ guidance_type="classifier-free",
+ condition=th.Tensor(1),
+ unconditional_condition=th.Tensor(1),
+ guidance_scale=self.conditioning_free_k,
+ )
+ dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+ x_sample = dpm_solver.sample(
+ noise,
+ steps=self.num_timesteps,
+ order=2,
+ skip_type="time_uniform",
+ method="multistep",
+ )
+ return x_sample
+
+ def sample_loop(self, *args, **kwargs):
+ s = self.sampler
+ if s == "p":
+ return self.p_sample_loop(*args, **kwargs)
+ elif s == "ddim":
+ return self.ddim_sample_loop(*args, **kwargs)
+ elif s == "dpm++2m":
+ if self.conditioning_free is not True:
+                raise RuntimeError("The dpm++2m sampler requires conditioning_free=True")
+ with tqdm(total=self.num_timesteps) as pbar:
+ if K_DIFFUSION_SAMPLERS is None:
+                    raise ModuleNotFoundError("Install k_diffusion to use the k_diffusion samplers")
+ return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
+ else:
+            raise RuntimeError(f"sampler not implemented: {s}")
+
+ def p_sample(
+ self,
+ model,
+ x,
+ t,
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ model_kwargs=None,
+ ):
+ """
+ Sample x_{t-1} from the model at the given timestep.
+
+ :param model: the model to sample from.
+ :param x: the current tensor at x_{t-1}.
+ :param t: the value of t, starting at 0 for the first diffusion step.
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+ :param denoised_fn: if not None, a function which applies to the
+ x_start prediction before it is used to sample.
+ :param cond_fn: if not None, this is a gradient function that acts
+ similarly to the model.
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
+ pass to the model. This can be used for conditioning.
+ :return: a dict containing the following keys:
+ - 'sample': a random sample from the model.
+ - 'pred_xstart': a prediction of x_0.
+ """
+ out = self.p_mean_variance(
+ model,
+ x,
+ t,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ model_kwargs=model_kwargs,
+ )
+ noise = th.randn_like(x)
+ nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0
+ if cond_fn is not None:
+ out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+
+ def p_sample_loop(
+ self,
+ model,
+ shape,
+ noise=None,
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ model_kwargs=None,
+ device=None,
+ progress=False,
+ ):
+ """
+ Generate samples from the model.
+
+ :param model: the model module.
+ :param shape: the shape of the samples, (N, C, H, W).
+ :param noise: if specified, the noise from the encoder to sample.
+ Should be of the same shape as `shape`.
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+ :param denoised_fn: if not None, a function which applies to the
+ x_start prediction before it is used to sample.
+ :param cond_fn: if not None, this is a gradient function that acts
+ similarly to the model.
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
+ pass to the model. This can be used for conditioning.
+ :param device: if specified, the device to create the samples on.
+ If not specified, use a model parameter's device.
+ :param progress: if True, show a tqdm progress bar.
+ :return: a non-differentiable batch of samples.
+ """
+ final = None
+ for sample in self.p_sample_loop_progressive(
+ model,
+ shape,
+ noise=noise,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ cond_fn=cond_fn,
+ model_kwargs=model_kwargs,
+ device=device,
+ progress=progress,
+ ):
+ final = sample
+ return final["sample"]
+
+ def p_sample_loop_progressive(
+ self,
+ model,
+ shape,
+ noise=None,
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ model_kwargs=None,
+ device=None,
+ progress=False,
+ ):
+ """
+ Generate samples from the model and yield intermediate samples from
+ each timestep of diffusion.
+
+ Arguments are the same as p_sample_loop().
+ Returns a generator over dicts, where each dict is the return value of
+ p_sample().
+ """
+ if device is None:
+ device = next(model.parameters()).device
+ assert isinstance(shape, (tuple, list))
+ if noise is not None:
+ img = noise
+ else:
+ img = th.randn(*shape, device=device)
+ indices = list(range(self.num_timesteps))[::-1]
+
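+        # Walk the chain backwards, from t = num_timesteps - 1 down to 0, denoising one step per iteration.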
+ for i in tqdm(indices, disable=not progress):
+ t = th.tensor([i] * shape[0], device=device)
+ with th.no_grad():
+ out = self.p_sample(
+ model,
+ img,
+ t,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ cond_fn=cond_fn,
+ model_kwargs=model_kwargs,
+ )
+ yield out
+ img = out["sample"]
+
+ def ddim_sample(
+ self,
+ model,
+ x,
+ t,
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ model_kwargs=None,
+ eta=0.0,
+ ):
+ """
+ Sample x_{t-1} from the model using DDIM.
+
+ Same usage as p_sample().
+ """
+ out = self.p_mean_variance(
+ model,
+ x,
+ t,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ model_kwargs=model_kwargs,
+ )
+ if cond_fn is not None:
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+
+ # Usually our model outputs epsilon, but we re-derive it
+ # in case we used x_start or x_prev prediction.
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
+
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+ sigma = eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev)
+        # Equation 12 of the DDIM paper (Song et al., 2020):
+        #   x_{t-1} = sqrt(alpha_bar_prev) * x0_pred + sqrt(1 - alpha_bar_prev - sigma^2) * eps + sigma * z
+ noise = th.randn_like(x)
+ mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps
+ nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0
+ sample = mean_pred + nonzero_mask * sigma * noise
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+
+ def ddim_reverse_sample(
+ self,
+ model,
+ x,
+ t,
+ clip_denoised=True,
+ denoised_fn=None,
+ model_kwargs=None,
+ eta=0.0,
+ ):
+ """
+ Sample x_{t+1} from the model using DDIM reverse ODE.
+ """
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
+ out = self.p_mean_variance(
+ model,
+ x,
+ t,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ model_kwargs=model_kwargs,
+ )
+ # Usually our model outputs epsilon, but we re-derive it
+ # in case we used x_start or x_prev prediction.
+ eps = (
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"]
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
+
+        # Equation 12 of the DDIM paper, run in reverse to step from x_t to x_{t+1} (deterministic, eta = 0).
+ mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
+
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
+
+ def ddim_sample_loop(
+ self,
+ model,
+ shape,
+ noise=None,
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ model_kwargs=None,
+ device=None,
+ progress=False,
+ eta=0.0,
+ ):
+ """
+ Generate samples from the model using DDIM.
+
+ Same usage as p_sample_loop().
+ """
+ final = None
+ for sample in self.ddim_sample_loop_progressive(
+ model,
+ shape,
+ noise=noise,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ cond_fn=cond_fn,
+ model_kwargs=model_kwargs,
+ device=device,
+ progress=progress,
+ eta=eta,
+ ):
+ final = sample
+ return final["sample"]
+
+ def ddim_sample_loop_progressive(
+ self,
+ model,
+ shape,
+ noise=None,
+ clip_denoised=True,
+ denoised_fn=None,
+ cond_fn=None,
+ model_kwargs=None,
+ device=None,
+ progress=False,
+ eta=0.0,
+ ):
+ """
+ Use DDIM to sample from the model and yield intermediate samples from
+ each timestep of DDIM.
+
+ Same usage as p_sample_loop_progressive().
+ """
+ if device is None:
+ device = next(model.parameters()).device
+ assert isinstance(shape, (tuple, list))
+ if noise is not None:
+ img = noise
+ else:
+ img = th.randn(*shape, device=device)
+ indices = list(range(self.num_timesteps))[::-1]
+
+ if progress:
+            # Use the auto-selecting tqdm frontend when a progress bar is requested.
+ from tqdm.auto import tqdm
+
+ indices = tqdm(indices, disable=not progress)
+
+ for i in indices:
+ t = th.tensor([i] * shape[0], device=device)
+ with th.no_grad():
+ out = self.ddim_sample(
+ model,
+ img,
+ t,
+ clip_denoised=clip_denoised,
+ denoised_fn=denoised_fn,
+ cond_fn=cond_fn,
+ model_kwargs=model_kwargs,
+ eta=eta,
+ )
+ yield out
+ img = out["sample"]
+
+ def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None):
+ """
+ Get a term for the variational lower-bound.
+
+ The resulting units are bits (rather than nats, as one might expect).
+ This allows for comparison to other papers.
+
+ :return: a dict with the following keys:
+ - 'output': a shape [N] tensor of NLLs or KLs.
+ - 'pred_xstart': the x_0 predictions.
+ """
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)
+ out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
+ kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"])
+ kl = mean_flat(kl) / np.log(2.0)
+
+ decoder_nll = -discretized_gaussian_log_likelihood(
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
+ )
+ assert decoder_nll.shape == x_start.shape
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
+
+ # At the first timestep return the decoder NLL,
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
+ output = th.where((t == 0), decoder_nll, kl)
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
+
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
+ """
+ Compute training losses for a single timestep.
+
+ :param model: the model to evaluate loss on.
+ :param x_start: the [N x C x ...] tensor of inputs.
+ :param t: a batch of timestep indices.
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
+ pass to the model. This can be used for conditioning.
+ :param noise: if specified, the specific Gaussian noise to try to remove.
+ :return: a dict with the key "loss" containing a tensor of shape [N].
+ Some mean or variance settings may also have other keys.
+ """
+ if model_kwargs is None:
+ model_kwargs = {}
+ if noise is None:
+ noise = th.randn_like(x_start)
+ x_t = self.q_sample(x_start, t, noise=noise)
+
+ terms = {}
+
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+ # TODO: support multiple model outputs for this mode.
+ terms["loss"] = self._vb_terms_bpd(
+ model=model,
+ x_start=x_start,
+ x_t=x_t,
+ t=t,
+ clip_denoised=False,
+ model_kwargs=model_kwargs,
+ )["output"]
+ if self.loss_type == LossType.RESCALED_KL:
+ terms["loss"] *= self.num_timesteps
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+ model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs)
+ if isinstance(model_outputs, tuple):
+ model_output = model_outputs[0]
+ terms["extra_outputs"] = model_outputs[1:]
+ else:
+ model_output = model_outputs
+
+ if self.model_var_type in [
+ ModelVarType.LEARNED,
+ ModelVarType.LEARNED_RANGE,
+ ]:
+ B, C = x_t.shape[:2]
+ assert model_output.shape == (B, C * 2, *x_t.shape[2:])
+ model_output, model_var_values = th.split(model_output, C, dim=1)
+ # Learn the variance using the variational bound, but don't let
+ # it affect our mean prediction.
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+ terms["vb"] = self._vb_terms_bpd(
+ model=lambda *args, r=frozen_out: r,
+ x_start=x_start,
+ x_t=x_t,
+ t=t,
+ clip_denoised=False,
+ )["output"]
+ if self.loss_type == LossType.RESCALED_MSE:
+ # Divide by 1000 for equivalence with initial implementation.
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
+ terms["vb"] *= self.num_timesteps / 1000.0
+
+ if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+ target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0]
+                x_start_pred = torch.zeros_like(x_start)  # Not supported.
+ elif self.model_mean_type == ModelMeanType.START_X:
+ target = x_start
+ x_start_pred = model_output
+ elif self.model_mean_type == ModelMeanType.EPSILON:
+ target = noise
+ x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
+ else:
+ raise NotImplementedError(self.model_mean_type)
+ assert model_output.shape == target.shape == x_start.shape
+ terms["mse"] = mean_flat((target - model_output) ** 2)
+ terms["x_start_predicted"] = x_start_pred
+ if "vb" in terms:
+ terms["loss"] = terms["mse"] + terms["vb"]
+ else:
+ terms["loss"] = terms["mse"]
+ else:
+ raise NotImplementedError(self.loss_type)
+
+ return terms
+
+ def autoregressive_training_losses(
+ self, model, x_start, t, model_output_keys, gd_out_key, model_kwargs=None, noise=None
+ ):
+ """
+ Compute training losses for a single timestep.
+
+ :param model: the model to evaluate loss on.
+ :param x_start: the [N x C x ...] tensor of inputs.
+ :param t: a batch of timestep indices.
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
+ pass to the model. This can be used for conditioning.
+ :param noise: if specified, the specific Gaussian noise to try to remove.
+ :return: a dict with the key "loss" containing a tensor of shape [N].
+ Some mean or variance settings may also have other keys.
+ """
+ if model_kwargs is None:
+ model_kwargs = {}
+ if noise is None:
+ noise = th.randn_like(x_start)
+ x_t = self.q_sample(x_start, t, noise=noise)
+ terms = {}
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            raise NotImplementedError("KL losses are not currently supported for this type of diffusion.")
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+ model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs)
+ terms.update({k: o for k, o in zip(model_output_keys, model_outputs)})
+ model_output = terms[gd_out_key]
+ if self.model_var_type in [
+ ModelVarType.LEARNED,
+ ModelVarType.LEARNED_RANGE,
+ ]:
+ B, C = x_t.shape[:2]
+ assert model_output.shape == (B, C, 2, *x_t.shape[2:])
+ model_output, model_var_values = model_output[:, :, 0], model_output[:, :, 1]
+ # Learn the variance using the variational bound, but don't let
+ # it affect our mean prediction.
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+ terms["vb"] = self._vb_terms_bpd(
+ model=lambda *args, r=frozen_out: r,
+ x_start=x_start,
+ x_t=x_t,
+ t=t,
+ clip_denoised=False,
+ )["output"]
+ if self.loss_type == LossType.RESCALED_MSE:
+ # Divide by 1000 for equivalence with initial implementation.
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
+ terms["vb"] *= self.num_timesteps / 1000.0
+
+ if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+ target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0]
+                x_start_pred = torch.zeros_like(x_start)  # Not supported.
+ elif self.model_mean_type == ModelMeanType.START_X:
+ target = x_start
+ x_start_pred = model_output
+ elif self.model_mean_type == ModelMeanType.EPSILON:
+ target = noise
+ x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
+ else:
+ raise NotImplementedError(self.model_mean_type)
+ assert model_output.shape == target.shape == x_start.shape
+ terms["mse"] = mean_flat((target - model_output) ** 2)
+ terms["x_start_predicted"] = x_start_pred
+ if "vb" in terms:
+ terms["loss"] = terms["mse"] + terms["vb"]
+ else:
+ terms["loss"] = terms["mse"]
+ else:
+ raise NotImplementedError(self.loss_type)
+
+ return terms
+
+ def _prior_bpd(self, x_start):
+ """
+ Get the prior KL term for the variational lower-bound, measured in
+ bits-per-dim.
+
+ This term can't be optimized, as it only depends on the encoder.
+
+ :param x_start: the [N x C x ...] tensor of inputs.
+ :return: a batch of [N] KL values (in bits), one per batch element.
+ """
+ batch_size = x_start.shape[0]
+ t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+ kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
+ return mean_flat(kl_prior) / np.log(2.0)
+
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
+ """
+ Compute the entire variational lower-bound, measured in bits-per-dim,
+ as well as other related quantities.
+
+ :param model: the model to evaluate loss on.
+ :param x_start: the [N x C x ...] tensor of inputs.
+ :param clip_denoised: if True, clip denoised samples.
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
+ pass to the model. This can be used for conditioning.
+
+ :return: a dict containing the following keys:
+ - total_bpd: the total variational lower-bound, per batch element.
+ - prior_bpd: the prior term in the lower-bound.
+ - vb: an [N x T] tensor of terms in the lower-bound.
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+ - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+ """
+ device = x_start.device
+ batch_size = x_start.shape[0]
+
+ vb = []
+ xstart_mse = []
+ mse = []
+ for t in list(range(self.num_timesteps))[::-1]:
+ t_batch = th.tensor([t] * batch_size, device=device)
+ noise = th.randn_like(x_start)
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+ # Calculate VLB term at the current timestep
+ with th.no_grad():
+ out = self._vb_terms_bpd(
+ model,
+ x_start=x_start,
+ x_t=x_t,
+ t=t_batch,
+ clip_denoised=clip_denoised,
+ model_kwargs=model_kwargs,
+ )
+ vb.append(out["output"])
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+ mse.append(mean_flat((eps - noise) ** 2))
+
+ vb = th.stack(vb, dim=1)
+ xstart_mse = th.stack(xstart_mse, dim=1)
+ mse = th.stack(mse, dim=1)
+
+ prior_bpd = self._prior_bpd(x_start)
+ total_bpd = vb.sum(dim=1) + prior_bpd
+ return {
+ "total_bpd": total_bpd,
+ "prior_bpd": prior_bpd,
+ "vb": vb,
+ "xstart_mse": xstart_mse,
+ "mse": mse,
+ }
+
+
+class SpacedDiffusion(GaussianDiffusion):
+ """
+ A diffusion process which can skip steps in a base diffusion process.
+
+ :param use_timesteps: a collection (sequence or set) of timesteps from the
+ original diffusion process to retain.
+ :param kwargs: the kwargs to create the base diffusion process.
+ """
+
+ def __init__(self, use_timesteps, **kwargs):
+ self.use_timesteps = set(use_timesteps)
+ self.timestep_map = []
+ self.original_num_steps = len(kwargs["betas"])
+ base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
+ last_alpha_cumprod = 1.0
+ new_betas = []
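+        # Re-derive betas over only the retained timesteps so that the cumulative product
+        # of (1 - beta) at each kept step still matches alphas_cumprod of the base process.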
+ for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+ if i in self.use_timesteps:
+ new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+ last_alpha_cumprod = alpha_cumprod
+ self.timestep_map.append(i)
+ kwargs["betas"] = np.array(new_betas)
+ super().__init__(**kwargs)
+
+ def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs
+ return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
+
+ def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs
+ return super().training_losses(self._wrap_model(model), *args, **kwargs)
+
+ def autoregressive_training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs
+ return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs)
+
+ def condition_mean(self, cond_fn, *args, **kwargs):
+ return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
+
+ def condition_score(self, cond_fn, *args, **kwargs):
+ return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
+
+ def _wrap_model(self, model, autoregressive=False):
+ if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel):
+ return model
+ mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel
+ return mod(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps)
+
+ def _scale_timesteps(self, t):
+ # Scaling is done by the wrapped model.
+ return t
+
+
+def space_timesteps(num_timesteps, section_counts):
+ """
+ Create a list of timesteps to use from an original diffusion process,
+ given the number of timesteps we want to take from equally-sized portions
+ of the original process.
+
+    For example, if there are 300 timesteps and the section counts are [10,15,20]
+ then the first 100 timesteps are strided to be 10 timesteps, the second 100
+ are strided to be 15 timesteps, and the final 100 are strided to be 20.
+
+ If the stride is a string starting with "ddim", then the fixed striding
+ from the DDIM paper is used, and only one section is allowed.
+
+ :param num_timesteps: the number of diffusion steps in the original
+ process to divide up.
+ :param section_counts: either a list of numbers, or a string containing
+ comma-separated numbers, indicating the step count
+ per section. As a special case, use "ddimN" where N
+ is a number of steps to use the striding from the
+ DDIM paper.
+ :return: a set of diffusion steps from the original process to use.
+ """
+ if isinstance(section_counts, str):
+ if section_counts.startswith("ddim"):
+ desired_count = int(section_counts[len("ddim") :])
+ for i in range(1, num_timesteps):
+ if len(range(0, num_timesteps, i)) == desired_count:
+ return set(range(0, num_timesteps, i))
+            raise ValueError(f"cannot create exactly {desired_count} steps with an integer stride")
+ section_counts = [int(x) for x in section_counts.split(",")]
+ size_per = num_timesteps // len(section_counts)
+ extra = num_timesteps % len(section_counts)
+ start_idx = 0
+ all_steps = []
+ for i, section_count in enumerate(section_counts):
+ size = size_per + (1 if i < extra else 0)
+ if size < section_count:
+ raise ValueError(f"cannot divide section of {size} steps into {section_count}")
+ if section_count <= 1:
+ frac_stride = 1
+ else:
+ frac_stride = (size - 1) / (section_count - 1)
+ cur_idx = 0.0
+ taken_steps = []
+ for _ in range(section_count):
+ taken_steps.append(start_idx + round(cur_idx))
+ cur_idx += frac_stride
+ all_steps += taken_steps
+ start_idx += size
+ return set(all_steps)
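+# Example: space_timesteps(100, [10]) keeps 10 evenly spaced steps out of a 100-step
+# process, while space_timesteps(100, "ddim10") uses the fixed DDIM striding (every 10th step).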
+
+
+class _WrappedModel:
+ def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
+ self.model = model
+ self.timestep_map = timestep_map
+ self.rescale_timesteps = rescale_timesteps
+ self.original_num_steps = original_num_steps
+
+ def __call__(self, x, ts, **kwargs):
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+ new_ts = map_tensor[ts]
+ if self.rescale_timesteps:
+ new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+ model_output = self.model(x, new_ts, **kwargs)
+ return model_output
+
+
+class _WrappedAutoregressiveModel:
+ def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
+ self.model = model
+ self.timestep_map = timestep_map
+ self.rescale_timesteps = rescale_timesteps
+ self.original_num_steps = original_num_steps
+
+ def __call__(self, x, x0, ts, **kwargs):
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+ new_ts = map_tensor[ts]
+ if self.rescale_timesteps:
+ new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+ return self.model(x, x0, new_ts, **kwargs)
+
+
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+ """
+ Extract values from a 1-D numpy array for a batch of indices.
+
+ :param arr: the 1-D numpy array.
+ :param timesteps: a tensor of indices into the array to extract.
+ :param broadcast_shape: a larger shape of K dimensions with the batch
+ dimension equal to the length of timesteps.
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+ """
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+ while len(res.shape) < len(broadcast_shape):
+ res = res[..., None]
+ return res.expand(broadcast_shape)
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/diffusion_decoder.py b/submodules/TTS/TTS/tts/layers/tortoise/diffusion_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d3cf7698a7334b4cfc8d9bdd0f5f6ee3059189d
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/diffusion_decoder.py
@@ -0,0 +1,415 @@
+import math
+import random
+from abc import abstractmethod
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import autocast
+
+from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, normalization
+
+
+def is_latent(t):
+ return t.dtype == torch.float
+
+
+def is_sequence(t):
+ return t.dtype == torch.long
+
+
+def timestep_embedding(timesteps, dim, max_period=10000):
+ """
+ Create sinusoidal timestep embeddings.
+
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
+ These may be fractional.
+ :param dim: the dimension of the output.
+ :param max_period: controls the minimum frequency of the embeddings.
+ :return: an [N x dim] Tensor of positional embeddings.
+ """
+ half = dim // 2
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
+ device=timesteps.device
+ )
+ args = timesteps[:, None].float() * freqs[None]
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+ if dim % 2:
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+ return embedding
+
+
+class TimestepBlock(nn.Module):
+ @abstractmethod
+ def forward(self, x, emb):
+ """
+ Apply the module to `x` given `emb` timestep embeddings.
+ """
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+ def forward(self, x, emb):
+ for layer in self:
+ if isinstance(layer, TimestepBlock):
+ x = layer(x, emb)
+ else:
+ x = layer(x)
+ return x
+
+
+class ResBlock(TimestepBlock):
+ def __init__(
+ self,
+ channels,
+ emb_channels,
+ dropout,
+ out_channels=None,
+ dims=2,
+ kernel_size=3,
+ efficient_config=True,
+ use_scale_shift_norm=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ self.emb_channels = emb_channels
+ self.dropout = dropout
+ self.out_channels = out_channels or channels
+ self.use_scale_shift_norm = use_scale_shift_norm
+ padding = {1: 0, 3: 1, 5: 2}[kernel_size]
+ eff_kernel = 1 if efficient_config else 3
+ eff_padding = 0 if efficient_config else 1
+
+ self.in_layers = nn.Sequential(
+ normalization(channels),
+ nn.SiLU(),
+ nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding),
+ )
+
+ self.emb_layers = nn.Sequential(
+ nn.SiLU(),
+ nn.Linear(
+ emb_channels,
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+ ),
+ )
+ self.out_layers = nn.Sequential(
+ normalization(self.out_channels),
+ nn.SiLU(),
+ nn.Dropout(p=dropout),
+ nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding),
+ )
+
+ if self.out_channels == channels:
+ self.skip_connection = nn.Identity()
+ else:
+ self.skip_connection = nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding)
+
+ def forward(self, x, emb):
+ h = self.in_layers(x)
+ emb_out = self.emb_layers(emb).type(h.dtype)
+ while len(emb_out.shape) < len(h.shape):
+ emb_out = emb_out[..., None]
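+        # Inject the timestep embedding either as a FiLM-style scale/shift on the
+        # normalized activations or as a plain additive bias.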
+ if self.use_scale_shift_norm:
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+ scale, shift = torch.chunk(emb_out, 2, dim=1)
+ h = out_norm(h) * (1 + scale) + shift
+ h = out_rest(h)
+ else:
+ h = h + emb_out
+ h = self.out_layers(h)
+ return self.skip_connection(x) + h
+
+
+class DiffusionLayer(TimestepBlock):
+ def __init__(self, model_channels, dropout, num_heads):
+ super().__init__()
+ self.resblk = ResBlock(
+ model_channels,
+ model_channels,
+ dropout,
+ model_channels,
+ dims=1,
+ use_scale_shift_norm=True,
+ )
+ self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True)
+
+ def forward(self, x, time_emb):
+ y = self.resblk(x, time_emb)
+ return self.attn(y)
+
+
+class DiffusionTts(nn.Module):
+ def __init__(
+ self,
+ model_channels=512,
+ num_layers=8,
+ in_channels=100,
+ in_latent_channels=512,
+ in_tokens=8193,
+ out_channels=200, # mean and variance
+ dropout=0,
+ use_fp16=False,
+ num_heads=16,
+ # Parameters for regularization.
+ layer_drop=0.1,
+ unconditioned_percentage=0.1, # This implements a mechanism similar to what is used in classifier-free training.
+ ):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.model_channels = model_channels
+ self.out_channels = out_channels
+ self.dropout = dropout
+ self.num_heads = num_heads
+ self.unconditioned_percentage = unconditioned_percentage
+ self.enable_fp16 = use_fp16
+ self.layer_drop = layer_drop
+
+ self.inp_block = nn.Conv1d(in_channels, model_channels, 3, 1, 1)
+ self.time_embed = nn.Sequential(
+ nn.Linear(model_channels, model_channels),
+ nn.SiLU(),
+ nn.Linear(model_channels, model_channels),
+ )
+
+ # Either code_converter or latent_converter is used, depending on what type of conditioning data is fed.
+ # This model is meant to be able to be trained on both for efficiency purposes - it is far less computationally
+ # complex to generate tokens, while generating latents will normally mean propagating through a deep autoregressive
+ # transformer network.
+ self.code_embedding = nn.Embedding(in_tokens, model_channels)
+ self.code_converter = nn.Sequential(
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ )
+ self.code_norm = normalization(model_channels)
+ self.latent_conditioner = nn.Sequential(
+ nn.Conv1d(in_latent_channels, model_channels, 3, padding=1),
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+ )
+ self.contextual_embedder = nn.Sequential(
+ nn.Conv1d(in_channels, model_channels, 3, padding=1, stride=2),
+ nn.Conv1d(model_channels, model_channels * 2, 3, padding=1, stride=2),
+ AttentionBlock(
+ model_channels * 2,
+ num_heads,
+ relative_pos_embeddings=True,
+ do_checkpoint=False,
+ ),
+ AttentionBlock(
+ model_channels * 2,
+ num_heads,
+ relative_pos_embeddings=True,
+ do_checkpoint=False,
+ ),
+ AttentionBlock(
+ model_channels * 2,
+ num_heads,
+ relative_pos_embeddings=True,
+ do_checkpoint=False,
+ ),
+ AttentionBlock(
+ model_channels * 2,
+ num_heads,
+ relative_pos_embeddings=True,
+ do_checkpoint=False,
+ ),
+ AttentionBlock(
+ model_channels * 2,
+ num_heads,
+ relative_pos_embeddings=True,
+ do_checkpoint=False,
+ ),
+ )
+ self.unconditioned_embedding = nn.Parameter(torch.randn(1, model_channels, 1))
+ self.conditioning_timestep_integrator = TimestepEmbedSequential(
+ DiffusionLayer(model_channels, dropout, num_heads),
+ DiffusionLayer(model_channels, dropout, num_heads),
+ DiffusionLayer(model_channels, dropout, num_heads),
+ )
+
+ self.integrating_conv = nn.Conv1d(model_channels * 2, model_channels, kernel_size=1)
+ self.mel_head = nn.Conv1d(model_channels, in_channels, kernel_size=3, padding=1)
+
+ self.layers = nn.ModuleList(
+ [DiffusionLayer(model_channels, dropout, num_heads) for _ in range(num_layers)]
+ + [
+ ResBlock(
+ model_channels,
+ model_channels,
+ dropout,
+ dims=1,
+ use_scale_shift_norm=True,
+ )
+ for _ in range(3)
+ ]
+ )
+
+ self.out = nn.Sequential(
+ normalization(model_channels),
+ nn.SiLU(),
+ nn.Conv1d(model_channels, out_channels, 3, padding=1),
+ )
+
+ def get_grad_norm_parameter_groups(self):
+ groups = {
+ "minicoder": list(self.contextual_embedder.parameters()),
+ "layers": list(self.layers.parameters()),
+ "code_converters": list(self.code_embedding.parameters())
+ + list(self.code_converter.parameters())
+ + list(self.latent_conditioner.parameters())
+ + list(self.latent_conditioner.parameters()),
+ "timestep_integrator": list(self.conditioning_timestep_integrator.parameters())
+ + list(self.integrating_conv.parameters()),
+ "time_embed": list(self.time_embed.parameters()),
+ }
+ return groups
+
+ def get_conditioning(self, conditioning_input):
+ speech_conditioning_input = (
+ conditioning_input.unsqueeze(1) if len(conditioning_input.shape) == 3 else conditioning_input
+ )
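+        # Embed each reference clip independently, then average over time and clips to get
+        # a fixed-size conditioning latent of width 2 * model_channels.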
+ conds = []
+ for j in range(speech_conditioning_input.shape[1]):
+ conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
+ conds = torch.cat(conds, dim=-1)
+ conds = conds.mean(dim=-1)
+ return conds
+
+ def timestep_independent(
+ self,
+ aligned_conditioning,
+ conditioning_latent,
+ expected_seq_len,
+ return_code_pred,
+ ):
+ # Shuffle aligned_latent to BxCxS format
+ if is_latent(aligned_conditioning):
+ aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
+
+ cond_scale, cond_shift = torch.chunk(conditioning_latent, 2, dim=1)
+ if is_latent(aligned_conditioning):
+ code_emb = self.latent_conditioner(aligned_conditioning)
+ else:
+ code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1)
+ code_emb = self.code_converter(code_emb)
+ code_emb = self.code_norm(code_emb) * (1 + cond_scale.unsqueeze(-1)) + cond_shift.unsqueeze(-1)
+
+ unconditioned_batches = torch.zeros((code_emb.shape[0], 1, 1), device=code_emb.device)
+ # Mask out the conditioning branch for whole batch elements, implementing something similar to classifier-free guidance.
+ if self.training and self.unconditioned_percentage > 0:
+ unconditioned_batches = (
+ torch.rand((code_emb.shape[0], 1, 1), device=code_emb.device) < self.unconditioned_percentage
+ )
+ code_emb = torch.where(
+ unconditioned_batches,
+ self.unconditioned_embedding.repeat(aligned_conditioning.shape[0], 1, 1),
+ code_emb,
+ )
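+        # Stretch the conditioning embedding along time to the expected output length so it
+        # can later be concatenated channel-wise with the noisy input in forward().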
+ expanded_code_emb = F.interpolate(code_emb, size=expected_seq_len, mode="nearest")
+
+ if not return_code_pred:
+ return expanded_code_emb
+ else:
+ mel_pred = self.mel_head(expanded_code_emb)
+            # Multiply mel_pred by !unconditioned_branches, which drops the gradient on
+            # unconditioned branches. This is because we don't want that gradient being
+            # used to train parameters through the codes_embedder, as it unbalances
+            # contributions to that network from the MSE loss.
+ mel_pred = mel_pred * unconditioned_batches.logical_not()
+ return expanded_code_emb, mel_pred
+
+ def forward(
+ self,
+ x,
+ timesteps,
+ aligned_conditioning=None,
+ conditioning_latent=None,
+ precomputed_aligned_embeddings=None,
+ conditioning_free=False,
+ return_code_pred=False,
+ ):
+ """
+ Apply the model to an input batch.
+
+ :param x: an [N x C x ...] Tensor of inputs.
+ :param timesteps: a 1-D batch of timesteps.
+ :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
+ :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning().
+ :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
+ :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
+ :return: an [N x C x ...] Tensor of outputs.
+ """
+ assert precomputed_aligned_embeddings is not None or (
+ aligned_conditioning is not None and conditioning_latent is not None
+ )
+ assert not (
+ return_code_pred and precomputed_aligned_embeddings is not None
+ ) # These two are mutually exclusive.
+
+ unused_params = []
+ if conditioning_free:
+ code_emb = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1])
+ unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
+ unused_params.extend(list(self.latent_conditioner.parameters()))
+ else:
+ if precomputed_aligned_embeddings is not None:
+ code_emb = precomputed_aligned_embeddings
+ else:
+ code_emb, mel_pred = self.timestep_independent(
+ aligned_conditioning, conditioning_latent, x.shape[-1], True
+ )
+ if is_latent(aligned_conditioning):
+ unused_params.extend(
+ list(self.code_converter.parameters()) + list(self.code_embedding.parameters())
+ )
+ else:
+ unused_params.extend(list(self.latent_conditioner.parameters()))
+
+ unused_params.append(self.unconditioned_embedding)
+
+ time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+ code_emb = self.conditioning_timestep_integrator(code_emb, time_emb)
+ x = self.inp_block(x)
+ x = torch.cat([x, code_emb], dim=1)
+ x = self.integrating_conv(x)
+ for i, lyr in enumerate(self.layers):
+ # Do layer drop where applicable. Do not drop first and last layers.
+ if (
+ self.training
+ and self.layer_drop > 0
+ and i != 0
+ and i != (len(self.layers) - 1)
+ and random.random() < self.layer_drop
+ ):
+ unused_params.extend(list(lyr.parameters()))
+ else:
+ # First and last blocks will have autocast disabled for improved precision.
+ with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
+ x = lyr(x, time_emb)
+
+ x = x.float()
+ out = self.out(x)
+
+ # Involve probabilistic or possibly unused parameters in loss so we don't get DDP errors.
+ extraneous_addition = 0
+ for p in unused_params:
+ extraneous_addition = extraneous_addition + p.mean()
+ out = out + extraneous_addition * 0
+
+ if return_code_pred:
+ return out, mel_pred
+ return out
+
+
+if __name__ == "__main__":
+ clip = torch.randn(2, 100, 400)
+ aligned_latent = torch.randn(2, 388, 512)
+ aligned_sequence = torch.randint(0, 8192, (2, 100))
+ cond = torch.randn(2, 100, 400)
+ ts = torch.LongTensor([600, 600])
+ model = DiffusionTts(512, layer_drop=0.3, unconditioned_percentage=0.5)
+    # forward() expects a precomputed conditioning latent (see get_conditioning()).
+    cond_latent = model.get_conditioning(cond)
+    # Test with latent aligned conditioning
+    # o = model(clip, ts, aligned_latent, cond_latent)
+    # Test with sequence aligned conditioning
+    o = model(clip, ts, aligned_sequence, cond_latent)
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/dpm_solver.py b/submodules/TTS/TTS/tts/layers/tortoise/dpm_solver.py
new file mode 100644
index 0000000000000000000000000000000000000000..c70888df42063e65dabf50eadb9a78813effa4e9
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/dpm_solver.py
@@ -0,0 +1,1562 @@
+import math
+
+import torch
+
+
+class NoiseScheduleVP:
+ def __init__(
+ self,
+ schedule="discrete",
+ betas=None,
+ alphas_cumprod=None,
+ continuous_beta_0=0.1,
+ continuous_beta_1=20.0,
+ dtype=torch.float32,
+ ):
+ """Create a wrapper class for the forward SDE (VP type).
+
+ ***
+        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
+        We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
+ ***
+
+ The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
+ We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
+ Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
+
+ log_alpha_t = self.marginal_log_mean_coeff(t)
+ sigma_t = self.marginal_std(t)
+ lambda_t = self.marginal_lambda(t)
+
+ Moreover, as lambda(t) is an invertible function, we also support its inverse function:
+
+ t = self.inverse_lambda(lambda_t)
+
+ ===============================================================
+
+ We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
+
+ 1. For discrete-time DPMs:
+
+ For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
+ t_i = (i + 1) / N
+ e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
+ We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
+
+ Args:
+ betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
+ alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
+
+ Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
+
+        **Important**: Please pay special attention to the argument `alphas_cumprod`:
+            The `alphas_cumprod` is the \hat{alpha_n} array in the notation of DDPM. Specifically, DDPMs assume that
+ q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
+ Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
+ alpha_{t_n} = \sqrt{\hat{alpha_n}},
+ and
+ log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
+
+
+ 2. For continuous-time DPMs:
+
+ We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
+ schedule are the default settings in DDPM and improved-DDPM:
+
+ Args:
+ beta_min: A `float` number. The smallest beta for the linear schedule.
+ beta_max: A `float` number. The largest beta for the linear schedule.
+ cosine_s: A `float` number. The hyperparameter in the cosine schedule.
+ cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
+ T: A `float` number. The ending time of the forward process.
+
+ ===============================================================
+
+ Args:
+ schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
+ 'linear' or 'cosine' for continuous-time DPMs.
+ Returns:
+ A wrapper object of the forward SDE (VP type).
+
+ ===============================================================
+
+ Example:
+
+ # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
+ >>> ns = NoiseScheduleVP('discrete', betas=betas)
+
+ # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
+ >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
+
+ # For continuous-time DPMs (VPSDE), linear schedule:
+ >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
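+
+        # For continuous-time DPMs (VPSDE), cosine schedule (an illustrative sketch; it uses the built-in cosine hyperparameters):
+        >>> ns = NoiseScheduleVP('cosine')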
+
+ """
+
+ if schedule not in ["discrete", "linear", "cosine"]:
+ raise ValueError(
+ "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
+ schedule
+ )
+ )
+
+ self.schedule = schedule
+ if schedule == "discrete":
+ if betas is not None:
+ log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
+ else:
+ assert alphas_cumprod is not None
+ log_alphas = 0.5 * torch.log(alphas_cumprod)
+ self.total_N = len(log_alphas)
+ self.T = 1.0
+ self.t_array = torch.linspace(0.0, 1.0, self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
+ self.log_alpha_array = log_alphas.reshape(
+ (
+ 1,
+ -1,
+ )
+ ).to(dtype=dtype)
+ else:
+ self.total_N = 1000
+ self.beta_0 = continuous_beta_0
+ self.beta_1 = continuous_beta_1
+ self.cosine_s = 0.008
+ self.cosine_beta_max = 999.0
+ self.cosine_t_max = (
+ math.atan(self.cosine_beta_max * (1.0 + self.cosine_s) / math.pi)
+ * 2.0
+ * (1.0 + self.cosine_s)
+ / math.pi
+ - self.cosine_s
+ )
+ self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1.0 + self.cosine_s) * math.pi / 2.0))
+ self.schedule = schedule
+ if schedule == "cosine":
+ # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
+ # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
+ self.T = 0.9946
+ else:
+ self.T = 1.0
+
+ def marginal_log_mean_coeff(self, t):
+ """
+ Compute log(alpha_t) of a given continuous-time label t in [0, T].
+ """
+ if self.schedule == "discrete":
+ return interpolate_fn(
+ t.reshape((-1, 1)),
+ self.t_array.to(t.device),
+ self.log_alpha_array.to(t.device),
+ ).reshape((-1))
+ elif self.schedule == "linear":
+ return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
+ elif self.schedule == "cosine":
+
+ def log_alpha_fn(s):
+ return torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
+
+ log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
+ return log_alpha_t
+
+ def marginal_alpha(self, t):
+ """
+ Compute alpha_t of a given continuous-time label t in [0, T].
+ """
+ return torch.exp(self.marginal_log_mean_coeff(t))
+
+ def marginal_std(self, t):
+ """
+ Compute sigma_t of a given continuous-time label t in [0, T].
+ """
+ return torch.sqrt(1.0 - torch.exp(2.0 * self.marginal_log_mean_coeff(t)))
+
+ def marginal_lambda(self, t):
+ """
+ Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
+ """
+ log_mean_coeff = self.marginal_log_mean_coeff(t)
+ log_std = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_mean_coeff))
+ return log_mean_coeff - log_std
+
+ def inverse_lambda(self, lamb):
+ """
+ Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
+ """
+ if self.schedule == "linear":
+ tmp = 2.0 * (self.beta_1 - self.beta_0) * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
+ Delta = self.beta_0**2 + tmp
+ return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
+ elif self.schedule == "discrete":
+ log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2.0 * lamb)
+ t = interpolate_fn(
+ log_alpha.reshape((-1, 1)),
+ torch.flip(self.log_alpha_array.to(lamb.device), [1]),
+ torch.flip(self.t_array.to(lamb.device), [1]),
+ )
+ return t.reshape((-1,))
+ else:
+ log_alpha = -0.5 * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
+
+ def t_fn(log_alpha_t):
+ return (
+ torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0))
+ * 2.0
+ * (1.0 + self.cosine_s)
+ / math.pi
+ - self.cosine_s
+ )
+
+ t = t_fn(log_alpha)
+ return t
+
+
+def model_wrapper(
+ model,
+ noise_schedule,
+ model_type="noise",
+ model_kwargs={},
+ guidance_type="uncond",
+ condition=None,
+ unconditional_condition=None,
+ guidance_scale=1.0,
+ classifier_fn=None,
+ classifier_kwargs={},
+):
+ """Create a wrapper function for the noise prediction model.
+
+ DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
+ firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.
+
+ We support four types of the diffusion model by setting `model_type`:
+
+ 1. "noise": noise prediction model. (Trained by predicting noise).
+
+ 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
+
+ 3. "v": velocity prediction model. (Trained by predicting the velocity).
+        The "v" prediction is derived in detail in Appendix D of [1], and is used in Imagen Video [2].
+
+ [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
+ arXiv preprint arXiv:2202.00512 (2022).
+ [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
+ arXiv preprint arXiv:2210.02303 (2022).
+
+ 4. "score": marginal score function. (Trained by denoising score matching).
+        Note that the score function and the noise prediction model follow a simple relationship:
+ ```
+ noise(x_t, t) = -sigma_t * score(x_t, t)
+ ```
+
+ We support three types of guided sampling by DPMs by setting `guidance_type`:
+ 1. "uncond": unconditional sampling by DPMs.
+ The input `model` has the following format:
+ ``
+ model(x, t_input, **model_kwargs) -> noise | x_start | v | score
+ ``
+
+ 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
+ The input `model` has the following format:
+ ``
+ model(x, t_input, **model_kwargs) -> noise | x_start | v | score
+ ``
+
+ The input `classifier_fn` has the following format:
+ ``
+ classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
+ ``
+
+ [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
+ in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
+
+ 3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
+ The input `model` has the following format:
+ ``
+ model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
+ ``
+ And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
+
+ [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
+ arXiv preprint arXiv:2207.12598 (2022).
+
+
+    The `t_input` is the time label of the model, which may be a discrete-time label (i.e. 0 to 999)
+    or a continuous-time label (i.e. epsilon to T).
+
+    We wrap the model function to accept only `x` and `t_continuous` as inputs, and output the predicted noise:
+ ``
+ def model_fn(x, t_continuous) -> noise:
+ t_input = get_model_input_time(t_continuous)
+ return noise_pred(model, x, t_input, **model_kwargs)
+ ``
+    where `t_continuous` is the continuous time label (i.e. epsilon to T), and we use `model_fn` for DPM-Solver.
+
+ ===============================================================
+
+ Args:
+ model: A diffusion model with the corresponding format described above.
+ noise_schedule: A noise schedule object, such as NoiseScheduleVP.
+ model_type: A `str`. The parameterization type of the diffusion model.
+ "noise" or "x_start" or "v" or "score".
+ model_kwargs: A `dict`. A dict for the other inputs of the model function.
+ guidance_type: A `str`. The type of the guidance for sampling.
+ "uncond" or "classifier" or "classifier-free".
+ condition: A pytorch tensor. The condition for the guided sampling.
+ Only used for "classifier" or "classifier-free" guidance type.
+ unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
+ Only used for "classifier-free" guidance type.
+ guidance_scale: A `float`. The scale for the guided sampling.
+ classifier_fn: A classifier function. Only used for the classifier guidance.
+ classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
+ Returns:
+ A noise prediction model that accepts the noised data and the continuous time as the inputs.
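+
+    Example (an illustrative sketch only; `model`, `ns`, `cond` and `uncond` are assumed to be defined by the caller):
+
+        >>> model_fn = model_wrapper(model, ns, model_type="noise", guidance_type="classifier-free",
+                condition=cond, unconditional_condition=uncond, guidance_scale=3.0)
+        >>> # model_fn(x, t_continuous) returns the guided noise prediction used by DPM-Solver.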
+ """
+
+ def get_model_input_time(t_continuous):
+ """
+ Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
+ For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
+ For continuous-time DPMs, we just use `t_continuous`.
+ """
+ if noise_schedule.schedule == "discrete":
+ return (t_continuous - 1.0 / noise_schedule.total_N) * 1000.0
+ else:
+ return t_continuous
+
+ def noise_pred_fn(x, t_continuous, cond=None):
+ t_input = get_model_input_time(t_continuous)
+ if cond is None:
+ output = model(x, t_input, **model_kwargs)
+ else:
+ output = model(x, t_input, cond, **model_kwargs)
+ if model_type == "noise":
+ return output
+ elif model_type == "x_start":
+ alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
+ return (x - alpha_t * output) / sigma_t
+ elif model_type == "v":
+ alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
+ return alpha_t * output + sigma_t * x
+ elif model_type == "score":
+ sigma_t = noise_schedule.marginal_std(t_continuous)
+ return -sigma_t * output
+
+ def cond_grad_fn(x, t_input):
+ """
+ Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
+ """
+ with torch.enable_grad():
+ x_in = x.detach().requires_grad_(True)
+ log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
+ return torch.autograd.grad(log_prob.sum(), x_in)[0]
+
+ def model_fn(x, t_continuous):
+ """
+        The noise prediction model function that is used for DPM-Solver.
+ """
+ if guidance_type == "uncond":
+ return noise_pred_fn(x, t_continuous)
+ elif guidance_type == "classifier":
+ assert classifier_fn is not None
+ t_input = get_model_input_time(t_continuous)
+ cond_grad = cond_grad_fn(x, t_input)
+ sigma_t = noise_schedule.marginal_std(t_continuous)
+ noise = noise_pred_fn(x, t_continuous)
+ return noise - guidance_scale * sigma_t * cond_grad
+ elif guidance_type == "classifier-free":
+ if guidance_scale == 1.0 or unconditional_condition is None:
+ return noise_pred_fn(x, t_continuous, cond=condition)
+ else:
+ x_in = torch.cat([x] * 2)
+ t_in = torch.cat([t_continuous] * 2)
+ c_in = torch.cat([unconditional_condition, condition])
+ noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
+ return noise_uncond + guidance_scale * (noise - noise_uncond)
+
+ assert model_type in ["noise", "x_start", "v", "score"]
+ assert guidance_type in ["uncond", "classifier", "classifier-free"]
+ return model_fn
+
+
+class DPM_Solver:
+ def __init__(
+ self,
+ model_fn,
+ noise_schedule,
+ algorithm_type="dpmsolver++",
+ correcting_x0_fn=None,
+ correcting_xt_fn=None,
+ thresholding_max_val=1.0,
+ dynamic_thresholding_ratio=0.995,
+ ):
+ """Construct a DPM-Solver.
+
+ We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).
+
+ We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
+ can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
+ dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
+ DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
+ DPMs (such as stable-diffusion).
+
+ To support advanced algorithms in image-to-image applications, we also support corrector functions for
+ both x0 and xt.
+
+ Args:
+ model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
+ ``
+ def model_fn(x, t_continuous):
+ return noise
+ ``
+ The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
+ noise_schedule: A noise schedule object, such as NoiseScheduleVP.
+ algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
+ correcting_x0_fn: A `str` or a function with the following format:
+ ```
+ def correcting_x0_fn(x0, t):
+ x0_new = ...
+ return x0_new
+ ```
+                This function corrects the outputs of the data prediction model at each sampling step, e.g.:
+ ```
+ x0_pred = data_pred_model(xt, t)
+ if correcting_x0_fn is not None:
+ x0_pred = correcting_x0_fn(x0_pred, t)
+ xt_1 = update(x0_pred, xt, t)
+ ```
+ If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
+ correcting_xt_fn: A function with the following format:
+ ```
+ def correcting_xt_fn(xt, t, step):
+ x_new = ...
+ return x_new
+ ```
+                This function corrects the intermediate samples xt at each sampling step, e.g.:
+ ```
+ xt = ...
+ xt = correcting_xt_fn(xt, t, step)
+ ```
+ thresholding_max_val: A `float`. The max value for thresholding.
+                Valid only when using `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
+ dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
+                Valid only when using `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
+
+ [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
+ Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
+ with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
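+
+        Example (an illustrative sketch only; `model_fn` and `ns` are assumed to be defined by the caller):
+
+            >>> dpm_solver = DPM_Solver(model_fn, ns, algorithm_type="dpmsolver++",
+                    correcting_x0_fn="dynamic_thresholding")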
+ """
+ self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
+ self.noise_schedule = noise_schedule
+ assert algorithm_type in ["dpmsolver", "dpmsolver++"]
+ self.algorithm_type = algorithm_type
+ if correcting_x0_fn == "dynamic_thresholding":
+ self.correcting_x0_fn = self.dynamic_thresholding_fn
+ else:
+ self.correcting_x0_fn = correcting_x0_fn
+ self.correcting_xt_fn = correcting_xt_fn
+ self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
+ self.thresholding_max_val = thresholding_max_val
+
+ def dynamic_thresholding_fn(self, x0, t):
+ """
+ The dynamic thresholding method.
+ """
+ dims = x0.dim()
+ p = self.dynamic_thresholding_ratio
+ s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
+ s = expand_dims(
+ torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)),
+ dims,
+ )
+ x0 = torch.clamp(x0, -s, s) / s
+ return x0
+
+ def noise_prediction_fn(self, x, t):
+ """
+ Return the noise prediction model.
+ """
+ return self.model(x, t)
+
+ def data_prediction_fn(self, x, t):
+ """
+ Return the data prediction model (with corrector).
+ """
+ noise = self.noise_prediction_fn(x, t)
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
+ x0 = (x - sigma_t * noise) / alpha_t
+ if self.correcting_x0_fn is not None:
+ x0 = self.correcting_x0_fn(x0, t)
+ return x0
+
+ def model_fn(self, x, t):
+ """
+ Convert the model to the noise prediction model or the data prediction model.
+ """
+ if self.algorithm_type == "dpmsolver++":
+ return self.data_prediction_fn(x, t)
+ else:
+ return self.noise_prediction_fn(x, t)
+
+ def get_time_steps(self, skip_type, t_T, t_0, N, device):
+ """Compute the intermediate time steps for sampling.
+
+ Args:
+ skip_type: A `str`. The type for the spacing of the time steps. We support three types:
+ - 'logSNR': uniform logSNR for the time steps.
+                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolution data**.)
+                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolution data.)
+            t_T: A `float`. The starting time of the sampling (default is T).
+            t_0: A `float`. The ending time of the sampling (default is epsilon).
+            N: An `int`. The number of time step intervals (the returned tensor has N + 1 entries).
+ device: A torch device.
+ Returns:
+ A pytorch tensor of the time steps, with the shape (N + 1,).
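+
+        Example (an illustrative sketch only; `solver` is an already constructed DPM_Solver instance):
+
+            >>> # 10 uniformly spaced intervals from t_T = 1.0 down to t_0 = 1e-3 give an (11,)-shaped tensor.
+            >>> ts = solver.get_time_steps('time_uniform', t_T=1.0, t_0=1e-3, N=10, device='cpu')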
+ """
+ if skip_type == "logSNR":
+ lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
+ lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
+ logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
+ return self.noise_schedule.inverse_lambda(logSNR_steps)
+ elif skip_type == "time_uniform":
+ return torch.linspace(t_T, t_0, N + 1).to(device)
+ elif skip_type == "time_quadratic":
+ t_order = 2
+ t = torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
+ return t
+ else:
+ raise ValueError(
+ "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type)
+ )
+
+ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
+ """
+ Get the order of each step for sampling by the singlestep DPM-Solver.
+
+        We combine DPM-Solver-1, -2 and -3 to use up all the function evaluations; this scheme is called "DPM-Solver-fast".
+ Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
+ - If order == 1:
+ We take `steps` of DPM-Solver-1 (i.e. DDIM).
+ - If order == 2:
+ - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
+ - If steps % 2 == 0, we use K steps of DPM-Solver-2.
+ - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
+ - If order == 3:
+ - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
+ - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
+ - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
+ - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
+
+ ============================================
+ Args:
+            order: An `int`. The max order for the solver (2 or 3).
+            steps: An `int`. The total number of function evaluations (NFE).
+            skip_type: A `str`. The type for the spacing of the time steps. We support three types:
+                - 'logSNR': uniform logSNR for the time steps.
+                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolution data**.)
+                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolution data.)
+ t_T: A `float`. The starting time of the sampling (default is T).
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
+ device: A torch device.
+ Returns:
+ orders: A list of the solver order of each step.
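+
+        Example (an illustrative walk-through of the rules above):
+            For steps=6 and order=3: K = 6 // 3 + 1 = 3 and steps % 3 == 0, so orders = [3, 2, 1],
+            which uses exactly 3 + 2 + 1 = 6 function evaluations.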
+ """
+        if order == 3:
+            K = steps // 3 + 1
+            if steps % 3 == 0:
+                orders = [3] * (K - 2) + [2, 1]
+            elif steps % 3 == 1:
+                orders = [3] * (K - 1) + [1]
+            else:
+                orders = [3] * (K - 1) + [2]
+        elif order == 2:
+            if steps % 2 == 0:
+                K = steps // 2
+                orders = [2] * K
+            else:
+                K = steps // 2 + 1
+                orders = [2] * (K - 1) + [1]
+        elif order == 1:
+            K = 1
+            orders = [1] * steps
+        else:
+            raise ValueError("'order' must be '1' or '2' or '3'.")
+ if skip_type == "logSNR":
+ # To reproduce the results in DPM-Solver paper
+ timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
+ else:
+            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
+                torch.cumsum(torch.tensor([0] + orders), 0).to(device)
+            ]
+ return timesteps_outer, orders
+
+ def denoise_to_zero_fn(self, x, s):
+ """
+        Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infinity by first-order discretization.
+ """
+ return self.data_prediction_fn(x, s)
+
+ def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
+ """
+ DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ s: A pytorch tensor. The starting time, with the shape (1,).
+ t: A pytorch tensor. The ending time, with the shape (1,).
+ model_s: A pytorch tensor. The model function evaluated at time `s`.
+ If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
+ return_intermediate: A `bool`. If true, also return the model value at time `s`.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ ns = self.noise_schedule
+ dims = x.dim()
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
+ h = lambda_t - lambda_s
+ log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
+ sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
+ alpha_t = torch.exp(log_alpha_t)
+
+ if self.algorithm_type == "dpmsolver++":
+ phi_1 = torch.expm1(-h)
+ if model_s is None:
+ model_s = self.model_fn(x, s)
+ x_t = sigma_t / sigma_s * x - alpha_t * phi_1 * model_s
+ if return_intermediate:
+ return x_t, {"model_s": model_s}
+ else:
+ return x_t
+ else:
+ phi_1 = torch.expm1(h)
+ if model_s is None:
+ model_s = self.model_fn(x, s)
+ x_t = torch.exp(log_alpha_t - log_alpha_s) * x - (sigma_t * phi_1) * model_s
+ if return_intermediate:
+ return x_t, {"model_s": model_s}
+ else:
+ return x_t
+
+ def singlestep_dpm_solver_second_update(
+ self,
+ x,
+ s,
+ t,
+ r1=0.5,
+ model_s=None,
+ return_intermediate=False,
+ solver_type="dpmsolver",
+ ):
+ """
+ Singlestep solver DPM-Solver-2 from time `s` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ s: A pytorch tensor. The starting time, with the shape (1,).
+ t: A pytorch tensor. The ending time, with the shape (1,).
+ r1: A `float`. The hyperparameter of the second-order solver.
+ model_s: A pytorch tensor. The model function evaluated at time `s`.
+ If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
+ return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ if solver_type not in ["dpmsolver", "taylor"]:
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
+ if r1 is None:
+ r1 = 0.5
+ ns = self.noise_schedule
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
+ h = lambda_t - lambda_s
+ lambda_s1 = lambda_s + r1 * h
+ s1 = ns.inverse_lambda(lambda_s1)
+ log_alpha_s, log_alpha_s1, log_alpha_t = (
+ ns.marginal_log_mean_coeff(s),
+ ns.marginal_log_mean_coeff(s1),
+ ns.marginal_log_mean_coeff(t),
+ )
+ sigma_s, sigma_s1, sigma_t = (
+ ns.marginal_std(s),
+ ns.marginal_std(s1),
+ ns.marginal_std(t),
+ )
+ alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
+
+ if self.algorithm_type == "dpmsolver++":
+ phi_11 = torch.expm1(-r1 * h)
+ phi_1 = torch.expm1(-h)
+
+ if model_s is None:
+ model_s = self.model_fn(x, s)
+ x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
+ model_s1 = self.model_fn(x_s1, s1)
+ if solver_type == "dpmsolver":
+ x_t = (
+ (sigma_t / sigma_s) * x
+ - (alpha_t * phi_1) * model_s
+ - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
+ )
+ elif solver_type == "taylor":
+ x_t = (
+ (sigma_t / sigma_s) * x
+ - (alpha_t * phi_1) * model_s
+ + (1.0 / r1) * (alpha_t * (phi_1 / h + 1.0)) * (model_s1 - model_s)
+ )
+ else:
+ phi_11 = torch.expm1(r1 * h)
+ phi_1 = torch.expm1(h)
+
+ if model_s is None:
+ model_s = self.model_fn(x, s)
+ x_s1 = torch.exp(log_alpha_s1 - log_alpha_s) * x - (sigma_s1 * phi_11) * model_s
+ model_s1 = self.model_fn(x_s1, s1)
+ if solver_type == "dpmsolver":
+ x_t = (
+ torch.exp(log_alpha_t - log_alpha_s) * x
+ - (sigma_t * phi_1) * model_s
+ - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
+ )
+ elif solver_type == "taylor":
+ x_t = (
+ torch.exp(log_alpha_t - log_alpha_s) * x
+ - (sigma_t * phi_1) * model_s
+ - (1.0 / r1) * (sigma_t * (phi_1 / h - 1.0)) * (model_s1 - model_s)
+ )
+ if return_intermediate:
+ return x_t, {"model_s": model_s, "model_s1": model_s1}
+ else:
+ return x_t
+
+ def singlestep_dpm_solver_third_update(
+ self,
+ x,
+ s,
+ t,
+ r1=1.0 / 3.0,
+ r2=2.0 / 3.0,
+ model_s=None,
+ model_s1=None,
+ return_intermediate=False,
+ solver_type="dpmsolver",
+ ):
+ """
+ Singlestep solver DPM-Solver-3 from time `s` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ s: A pytorch tensor. The starting time, with the shape (1,).
+ t: A pytorch tensor. The ending time, with the shape (1,).
+ r1: A `float`. The hyperparameter of the third-order solver.
+ r2: A `float`. The hyperparameter of the third-order solver.
+ model_s: A pytorch tensor. The model function evaluated at time `s`.
+ If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
+ model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
+ If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
+ return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ if solver_type not in ["dpmsolver", "taylor"]:
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
+ if r1 is None:
+ r1 = 1.0 / 3.0
+ if r2 is None:
+ r2 = 2.0 / 3.0
+ ns = self.noise_schedule
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
+ h = lambda_t - lambda_s
+ lambda_s1 = lambda_s + r1 * h
+ lambda_s2 = lambda_s + r2 * h
+ s1 = ns.inverse_lambda(lambda_s1)
+ s2 = ns.inverse_lambda(lambda_s2)
+ log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = (
+ ns.marginal_log_mean_coeff(s),
+ ns.marginal_log_mean_coeff(s1),
+ ns.marginal_log_mean_coeff(s2),
+ ns.marginal_log_mean_coeff(t),
+ )
+ sigma_s, sigma_s1, sigma_s2, sigma_t = (
+ ns.marginal_std(s),
+ ns.marginal_std(s1),
+ ns.marginal_std(s2),
+ ns.marginal_std(t),
+ )
+ alpha_s1, alpha_s2, alpha_t = (
+ torch.exp(log_alpha_s1),
+ torch.exp(log_alpha_s2),
+ torch.exp(log_alpha_t),
+ )
+
+ if self.algorithm_type == "dpmsolver++":
+ phi_11 = torch.expm1(-r1 * h)
+ phi_12 = torch.expm1(-r2 * h)
+ phi_1 = torch.expm1(-h)
+ phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.0
+ phi_2 = phi_1 / h + 1.0
+ phi_3 = phi_2 / h - 0.5
+
+ if model_s is None:
+ model_s = self.model_fn(x, s)
+ if model_s1 is None:
+ x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
+ model_s1 = self.model_fn(x_s1, s1)
+ x_s2 = (
+ (sigma_s2 / sigma_s) * x
+ - (alpha_s2 * phi_12) * model_s
+ + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
+ )
+ model_s2 = self.model_fn(x_s2, s2)
+ if solver_type == "dpmsolver":
+ x_t = (
+ (sigma_t / sigma_s) * x
+ - (alpha_t * phi_1) * model_s
+ + (1.0 / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
+ )
+ elif solver_type == "taylor":
+ D1_0 = (1.0 / r1) * (model_s1 - model_s)
+ D1_1 = (1.0 / r2) * (model_s2 - model_s)
+ D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
+ D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1)
+ x_t = (
+ (sigma_t / sigma_s) * x
+ - (alpha_t * phi_1) * model_s
+ + (alpha_t * phi_2) * D1
+ - (alpha_t * phi_3) * D2
+ )
+ else:
+ phi_11 = torch.expm1(r1 * h)
+ phi_12 = torch.expm1(r2 * h)
+ phi_1 = torch.expm1(h)
+ phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.0
+ phi_2 = phi_1 / h - 1.0
+ phi_3 = phi_2 / h - 0.5
+
+ if model_s is None:
+ model_s = self.model_fn(x, s)
+ if model_s1 is None:
+ x_s1 = (torch.exp(log_alpha_s1 - log_alpha_s)) * x - (sigma_s1 * phi_11) * model_s
+ model_s1 = self.model_fn(x_s1, s1)
+ x_s2 = (
+ (torch.exp(log_alpha_s2 - log_alpha_s)) * x
+ - (sigma_s2 * phi_12) * model_s
+ - r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
+ )
+ model_s2 = self.model_fn(x_s2, s2)
+ if solver_type == "dpmsolver":
+ x_t = (
+ (torch.exp(log_alpha_t - log_alpha_s)) * x
+ - (sigma_t * phi_1) * model_s
+ - (1.0 / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
+ )
+ elif solver_type == "taylor":
+ D1_0 = (1.0 / r1) * (model_s1 - model_s)
+ D1_1 = (1.0 / r2) * (model_s2 - model_s)
+ D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
+ D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1)
+ x_t = (
+ (torch.exp(log_alpha_t - log_alpha_s)) * x
+ - (sigma_t * phi_1) * model_s
+ - (sigma_t * phi_2) * D1
+ - (sigma_t * phi_3) * D2
+ )
+
+ if return_intermediate:
+ return x_t, {"model_s": model_s, "model_s1": model_s1, "model_s2": model_s2}
+ else:
+ return x_t
+
+ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
+ """
+ Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ model_prev_list: A list of pytorch tensor. The previous computed model values.
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
+ t: A pytorch tensor. The ending time, with the shape (1,).
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ if solver_type not in ["dpmsolver", "taylor"]:
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
+ ns = self.noise_schedule
+ model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
+ t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
+ lambda_prev_1, lambda_prev_0, lambda_t = (
+ ns.marginal_lambda(t_prev_1),
+ ns.marginal_lambda(t_prev_0),
+ ns.marginal_lambda(t),
+ )
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
+ alpha_t = torch.exp(log_alpha_t)
+
+ h_0 = lambda_prev_0 - lambda_prev_1
+ h = lambda_t - lambda_prev_0
+ r0 = h_0 / h
+ D1_0 = (1.0 / r0) * (model_prev_0 - model_prev_1)
+ if self.algorithm_type == "dpmsolver++":
+ phi_1 = torch.expm1(-h)
+ if solver_type == "dpmsolver":
+ x_t = (sigma_t / sigma_prev_0) * x - (alpha_t * phi_1) * model_prev_0 - 0.5 * (alpha_t * phi_1) * D1_0
+ elif solver_type == "taylor":
+ x_t = (
+ (sigma_t / sigma_prev_0) * x
+ - (alpha_t * phi_1) * model_prev_0
+ + (alpha_t * (phi_1 / h + 1.0)) * D1_0
+ )
+ else:
+ phi_1 = torch.expm1(h)
+ if solver_type == "dpmsolver":
+ x_t = (
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
+ - (sigma_t * phi_1) * model_prev_0
+ - 0.5 * (sigma_t * phi_1) * D1_0
+ )
+ elif solver_type == "taylor":
+ x_t = (
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
+ - (sigma_t * phi_1) * model_prev_0
+ - (sigma_t * (phi_1 / h - 1.0)) * D1_0
+ )
+ return x_t
+
+ def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
+ """
+ Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ model_prev_list: A list of pytorch tensor. The previous computed model values.
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
+ t: A pytorch tensor. The ending time, with the shape (1,).
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ ns = self.noise_schedule
+ model_prev_2, model_prev_1, model_prev_0 = model_prev_list
+ t_prev_2, t_prev_1, t_prev_0 = t_prev_list
+ lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = (
+ ns.marginal_lambda(t_prev_2),
+ ns.marginal_lambda(t_prev_1),
+ ns.marginal_lambda(t_prev_0),
+ ns.marginal_lambda(t),
+ )
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
+ alpha_t = torch.exp(log_alpha_t)
+
+ h_1 = lambda_prev_1 - lambda_prev_2
+ h_0 = lambda_prev_0 - lambda_prev_1
+ h = lambda_t - lambda_prev_0
+ r0, r1 = h_0 / h, h_1 / h
+ D1_0 = (1.0 / r0) * (model_prev_0 - model_prev_1)
+ D1_1 = (1.0 / r1) * (model_prev_1 - model_prev_2)
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+ D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+ if self.algorithm_type == "dpmsolver++":
+ phi_1 = torch.expm1(-h)
+ phi_2 = phi_1 / h + 1.0
+ phi_3 = phi_2 / h - 0.5
+ x_t = (
+ (sigma_t / sigma_prev_0) * x
+ - (alpha_t * phi_1) * model_prev_0
+ + (alpha_t * phi_2) * D1
+ - (alpha_t * phi_3) * D2
+ )
+ else:
+ phi_1 = torch.expm1(h)
+ phi_2 = phi_1 / h - 1.0
+ phi_3 = phi_2 / h - 0.5
+ x_t = (
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
+ - (sigma_t * phi_1) * model_prev_0
+ - (sigma_t * phi_2) * D1
+ - (sigma_t * phi_3) * D2
+ )
+ return x_t
+
+ def singlestep_dpm_solver_update(
+ self,
+ x,
+ s,
+ t,
+ order,
+ return_intermediate=False,
+ solver_type="dpmsolver",
+ r1=None,
+ r2=None,
+ ):
+ """
+ Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ s: A pytorch tensor. The starting time, with the shape (1,).
+ t: A pytorch tensor. The ending time, with the shape (1,).
+            order: An `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
+            return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
+            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ r1: A `float`. The hyperparameter of the second-order or third-order solver.
+ r2: A `float`. The hyperparameter of the third-order solver.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ if order == 1:
+ return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
+ elif order == 2:
+ return self.singlestep_dpm_solver_second_update(
+ x,
+ s,
+ t,
+ return_intermediate=return_intermediate,
+ solver_type=solver_type,
+ r1=r1,
+ )
+ elif order == 3:
+ return self.singlestep_dpm_solver_third_update(
+ x,
+ s,
+ t,
+ return_intermediate=return_intermediate,
+ solver_type=solver_type,
+ r1=r1,
+ r2=r2,
+ )
+ else:
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
+
+ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"):
+ """
+ Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `s`.
+ model_prev_list: A list of pytorch tensor. The previous computed model values.
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
+ t: A pytorch tensor. The ending time, with the shape (1,).
+            order: An `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
+            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ Returns:
+ x_t: A pytorch tensor. The approximated solution at time `t`.
+ """
+ if order == 1:
+ return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1])
+ elif order == 2:
+ return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
+ elif order == 3:
+ return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
+ else:
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
+
+ def dpm_solver_adaptive(
+ self,
+ x,
+ order,
+ t_T,
+ t_0,
+ h_init=0.05,
+ atol=0.0078,
+ rtol=0.05,
+ theta=0.9,
+ t_err=1e-5,
+ solver_type="dpmsolver",
+ ):
+ """
+ The adaptive step size solver based on singlestep DPM-Solver.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `t_T`.
+            order: An `int`. The (higher) order of the solver. We only support order == 2 or 3.
+            t_T: A `float`. The starting time of the sampling (default is T).
+            t_0: A `float`. The ending time of the sampling (default is epsilon).
+            h_init: A `float`. The initial step size (for logSNR).
+            atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, following [1].
+            rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
+            theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, following [1].
+            t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
+                current time and `t_0` is less than `t_err`. The default setting is 1e-5.
+            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
+                The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
+ Returns:
+ x_0: A pytorch tensor. The approximated solution at time `t_0`.
+
+ [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
+ """
+ ns = self.noise_schedule
+ s = t_T * torch.ones((1,)).to(x)
+ lambda_s = ns.marginal_lambda(s)
+ lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
+ h = h_init * torch.ones_like(s).to(x)
+ x_prev = x
+ nfe = 0
+ if order == 2:
+ r1 = 0.5
+
+ def lower_update(x, s, t):
+ return self.dpm_solver_first_update(x, s, t, return_intermediate=True)
+
+ def higher_update(x, s, t, **kwargs):
+ return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
+
+ elif order == 3:
+ r1, r2 = 1.0 / 3.0, 2.0 / 3.0
+
+ def lower_update(x, s, t):
+ return self.singlestep_dpm_solver_second_update(
+ x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type
+ )
+
+ def higher_update(x, s, t, **kwargs):
+ return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
+
+ else:
+ raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
+ while torch.abs((s - t_0)).mean() > t_err:
+ t = ns.inverse_lambda(lambda_s + h)
+ x_lower, lower_noise_kwargs = lower_update(x, s, t)
+ x_higher = higher_update(x, s, t, **lower_noise_kwargs)
+ delta = torch.max(
+ torch.ones_like(x).to(x) * atol,
+ rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)),
+ )
+
+ def norm_fn(v):
+ return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
+
+ E = norm_fn((x_higher - x_lower) / delta).max()
+ if torch.all(E <= 1.0):
+ x = x_higher
+ s = t
+ x_prev = x_lower
+ lambda_s = ns.marginal_lambda(s)
+ h = torch.min(
+ theta * h * torch.float_power(E, -1.0 / order).float(),
+ lambda_0 - lambda_s,
+ )
+ nfe += order
+ print("adaptive solver nfe", nfe)
+ return x
+
+ def add_noise(self, x, t, noise=None):
+ """
+ Compute the noised input xt = alpha_t * x + sigma_t * noise.
+
+ Args:
+ x: A `torch.Tensor` with shape `(batch_size, *shape)`.
+ t: A `torch.Tensor` with shape `(t_size,)`.
+ Returns:
+ xt with shape `(t_size, batch_size, *shape)`.
+ """
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
+ if noise is None:
+ noise = torch.randn((t.shape[0], *x.shape), device=x.device)
+ x = x.reshape((-1, *x.shape))
+ xt = expand_dims(alpha_t, x.dim()) * x + expand_dims(sigma_t, x.dim()) * noise
+ if t.shape[0] == 1:
+ return xt.squeeze(0)
+ else:
+ return xt
+
+ def inverse(
+ self,
+ x,
+ steps=20,
+ t_start=None,
+ t_end=None,
+ order=2,
+ skip_type="time_uniform",
+ method="multistep",
+ lower_order_final=True,
+ denoise_to_zero=False,
+ solver_type="dpmsolver",
+ atol=0.0078,
+ rtol=0.05,
+ return_intermediate=False,
+ ):
+ """
+        Invert the sample `x` from time `t_start` to `t_end` by DPM-Solver.
+        For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total number of time steps used during training.
+ """
+ t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start
+ t_T = self.noise_schedule.T if t_end is None else t_end
+ assert (
+ t_0 > 0 and t_T > 0
+ ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+ return self.sample(
+ x,
+ steps=steps,
+ t_start=t_0,
+ t_end=t_T,
+ order=order,
+ skip_type=skip_type,
+ method=method,
+ lower_order_final=lower_order_final,
+ denoise_to_zero=denoise_to_zero,
+ solver_type=solver_type,
+ atol=atol,
+ rtol=rtol,
+ return_intermediate=return_intermediate,
+ )
+
+ def sample(
+ self,
+ x,
+ steps=20,
+ t_start=None,
+ t_end=None,
+ order=2,
+ skip_type="time_uniform",
+ method="multistep",
+ lower_order_final=True,
+ denoise_to_zero=False,
+ solver_type="dpmsolver",
+ atol=0.0078,
+ rtol=0.05,
+ return_intermediate=False,
+ ):
+ """
+ Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
+
+ =====================================================
+
+ We support the following algorithms for both noise prediction model and data prediction model:
+ - 'singlestep':
+ Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
+ We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
+ The total number of function evaluations (NFE) == `steps`.
+ Given a fixed NFE == `steps`, the sampling procedure is:
+ - If `order` == 1:
+ - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
+ - If `order` == 2:
+ - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
+ - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
+ - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
+ - If `order` == 3:
+ - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
+ - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
+ - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
+ - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
+ - 'multistep':
+ Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
+ We initialize the first `order` values by lower order multistep solvers.
+ Given a fixed NFE == `steps`, the sampling procedure is:
+ Denote K = steps.
+ - If `order` == 1:
+ - We use K steps of DPM-Solver-1 (i.e. DDIM).
+ - If `order` == 2:
+ - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2.
+ - If `order` == 3:
+ - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3.
+ - 'singlestep_fixed':
+ Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
+ We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
+ - 'adaptive':
+ Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
+ We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
+                You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
+ (NFE) and the sample quality.
+ - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
+ - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.
+
+ =====================================================
+
+        Some advice on choosing the algorithm:
+ - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
+ Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
+ e.g., DPM-Solver:
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
+ >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
+ skip_type='time_uniform', method='singlestep')
+ e.g., DPM-Solver++:
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+ >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
+ skip_type='time_uniform', method='singlestep')
+ - For **guided sampling with large guidance scale** by DPMs:
+ Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
+ e.g.
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+ >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
+ skip_type='time_uniform', method='multistep')
+
+ We support three types of `skip_type`:
+            - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolution images**.
+            - 'time_uniform': uniform time for the time steps. **Recommended for high-resolution images**.
+ - 'time_quadratic': quadratic time for the time steps.
+
+ =====================================================
+ Args:
+ x: A pytorch tensor. The initial value at time `t_start`
+ e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
+            steps: An `int`. The total number of function evaluations (NFE).
+            t_start: A `float`. The starting time of the sampling.
+                If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
+ t_end: A `float`. The ending time of the sampling.
+ If `t_end` is None, we use 1. / self.noise_schedule.total_N.
+ e.g. if total_N == 1000, we have `t_end` == 1e-3.
+ For discrete-time DPMs:
+ - We recommend `t_end` == 1. / self.noise_schedule.total_N.
+ For continuous-time DPMs:
+ - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
+            order: An `int`. The order of DPM-Solver.
+ skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
+ method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
+ denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
+ Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).
+
+                This trick was first proposed by DDPM (https://arxiv.org/abs/2006.11239) and
+                score_sde (https://arxiv.org/abs/2011.13456). It can improve the FID of diffusion
+                models sampled by diffusion SDEs for low-resolution images (such as CIFAR-10).
+                However, we observed that this trick does not matter for high-resolution images.
+                As it needs an additional NFE, we do not recommend it for high-resolution images.
+ lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
+                Only valid for `method=multistep` and `steps < 15`. We empirically find that
+                this trick is key to stabilizing sampling with DPM-Solver at very few steps
+                (especially for steps <= 10), so we recommend setting it to `True`.
+            solver_type: A `str`. The Taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`.
+ atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
+ rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
+ return_intermediate: A `bool`. Whether to save the xt at each step.
+ When set to `True`, method returns a tuple (x0, intermediates); when set to False, method returns only x0.
+ Returns:
+ x_end: A pytorch tensor. The approximated solution at time `t_end`.
+
+ """
+ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
+ t_T = self.noise_schedule.T if t_start is None else t_start
+ assert (
+ t_0 > 0 and t_T > 0
+ ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+ if return_intermediate:
+ assert method in [
+ "multistep",
+ "singlestep",
+ "singlestep_fixed",
+ ], "Cannot use adaptive solver when saving intermediate values"
+ if self.correcting_xt_fn is not None:
+ assert method in [
+ "multistep",
+ "singlestep",
+ "singlestep_fixed",
+ ], "Cannot use adaptive solver when correcting_xt_fn is not None"
+ device = x.device
+ intermediates = []
+ with torch.no_grad():
+ if method == "adaptive":
+ x = self.dpm_solver_adaptive(
+ x,
+ order=order,
+ t_T=t_T,
+ t_0=t_0,
+ atol=atol,
+ rtol=rtol,
+ solver_type=solver_type,
+ )
+ elif method == "multistep":
+ assert steps >= order
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
+ assert timesteps.shape[0] - 1 == steps
+ # Init the initial values.
+ step = 0
+ t = timesteps[step]
+ t_prev_list = [t]
+ model_prev_list = [self.model_fn(x, t)]
+ if self.correcting_xt_fn is not None:
+ x = self.correcting_xt_fn(x, t, step)
+ if return_intermediate:
+ intermediates.append(x)
+ # Init the first `order` values by lower order multistep DPM-Solver.
+ for step in range(1, order):
+ t = timesteps[step]
+ x = self.multistep_dpm_solver_update(
+ x,
+ model_prev_list,
+ t_prev_list,
+ t,
+ step,
+ solver_type=solver_type,
+ )
+ if self.correcting_xt_fn is not None:
+ x = self.correcting_xt_fn(x, t, step)
+ if return_intermediate:
+ intermediates.append(x)
+ t_prev_list.append(t)
+ model_prev_list.append(self.model_fn(x, t))
+ # Compute the remaining values by `order`-th order multistep DPM-Solver.
+ for step in range(order, steps + 1):
+ t = timesteps[step]
+ # We only use lower order for steps < 10
+ if lower_order_final and steps < 10:
+ step_order = min(order, steps + 1 - step)
+ else:
+ step_order = order
+ x = self.multistep_dpm_solver_update(
+ x,
+ model_prev_list,
+ t_prev_list,
+ t,
+ step_order,
+ solver_type=solver_type,
+ )
+ if self.correcting_xt_fn is not None:
+ x = self.correcting_xt_fn(x, t, step)
+ if return_intermediate:
+ intermediates.append(x)
+ for i in range(order - 1):
+ t_prev_list[i] = t_prev_list[i + 1]
+ model_prev_list[i] = model_prev_list[i + 1]
+ t_prev_list[-1] = t
+ # We do not need to evaluate the final model value.
+ if step < steps:
+ model_prev_list[-1] = self.model_fn(x, t)
+ elif method in ["singlestep", "singlestep_fixed"]:
+ if method == "singlestep":
+ (
+ timesteps_outer,
+ orders,
+ ) = self.get_orders_and_timesteps_for_singlestep_solver(
+ steps=steps,
+ order=order,
+ skip_type=skip_type,
+ t_T=t_T,
+ t_0=t_0,
+ device=device,
+ )
+ elif method == "singlestep_fixed":
+ K = steps // order
+                    orders = [order] * K
+ timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
+ for step, order in enumerate(orders):
+ s, t = timesteps_outer[step], timesteps_outer[step + 1]
+ timesteps_inner = self.get_time_steps(
+ skip_type=skip_type,
+ t_T=s.item(),
+ t_0=t.item(),
+ N=order,
+ device=device,
+ )
+ lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
+ h = lambda_inner[-1] - lambda_inner[0]
+ r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
+ r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
+ x = self.singlestep_dpm_solver_update(x, s, t, order, solver_type=solver_type, r1=r1, r2=r2)
+ if self.correcting_xt_fn is not None:
+ x = self.correcting_xt_fn(x, t, step)
+ if return_intermediate:
+ intermediates.append(x)
+ else:
+ raise ValueError("Got wrong method {}".format(method))
+ if denoise_to_zero:
+ t = torch.ones((1,)).to(device) * t_0
+ x = self.denoise_to_zero_fn(x, t)
+ if self.correcting_xt_fn is not None:
+ x = self.correcting_xt_fn(x, t, step + 1)
+ if return_intermediate:
+ intermediates.append(x)
+ if return_intermediate:
+ return x, intermediates
+ else:
+ return x
+
+
+#############################################################
+# other utility functions
+#############################################################
+
+
+def interpolate_fn(x, xp, yp):
+ """
+ A piecewise linear function y = f(x), using xp and yp as keypoints.
+ We implement f(x) in a differentiable way (i.e. applicable for autograd).
+    The function f(x) is well-defined for all x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)
+
+ Args:
+ x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
+ xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
+ yp: PyTorch tensor with shape [C, K].
+ Returns:
+ The function values f(x), with shape [N, C].
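+
+    Example (an illustrative sketch only):
+
+        >>> xp = torch.tensor([[0.0, 1.0, 2.0]])
+        >>> yp = torch.tensor([[0.0, 2.0, 4.0]])
+        >>> interpolate_fn(torch.tensor([[0.5]]), xp, yp)  # interpolates between (0, 0) and (1, 2) -> tensor([[1.]])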
+ """
+ N, K = x.shape[0], xp.shape[1]
+ all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
+ sorted_all_x, x_indices = torch.sort(all_x, dim=2)
+ x_idx = torch.argmin(x_indices, dim=2)
+ cand_start_idx = x_idx - 1
+ start_idx = torch.where(
+ torch.eq(x_idx, 0),
+ torch.tensor(1, device=x.device),
+ torch.where(
+ torch.eq(x_idx, K),
+ torch.tensor(K - 2, device=x.device),
+ cand_start_idx,
+ ),
+ )
+ end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
+ start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
+ end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
+ start_idx2 = torch.where(
+ torch.eq(x_idx, 0),
+ torch.tensor(0, device=x.device),
+ torch.where(
+ torch.eq(x_idx, K),
+ torch.tensor(K - 2, device=x.device),
+ cand_start_idx,
+ ),
+ )
+ y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
+ start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
+ end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
+ cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
+ return cand
+
+
+def expand_dims(v, dims):
+ """
+    Expand the tensor `v` to `dims` dimensions.
+
+    Args:
+        `v`: a PyTorch tensor with shape [N].
+        `dims`: an `int`.
+    Returns:
+        a PyTorch tensor with shape [N, 1, 1, ..., 1] with a total of `dims` dimensions.
+ """
+ return v[(...,) + (None,) * (dims - 1)]
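+
+
+if __name__ == "__main__":
+    # Illustrative sanity check (not part of the upstream DPM-Solver code): interpolate
+    # through the keypoints (0, 0), (1, 2), (2, 4) and broadcast a per-sample scalar.
+    xp = torch.tensor([[0.0, 1.0, 2.0]])
+    yp = torch.tensor([[0.0, 2.0, 4.0]])
+    x = torch.tensor([[0.5], [1.5]])
+    print(interpolate_fn(x, xp, yp))  # expected: [[1.0], [3.0]]
+    print(expand_dims(torch.tensor([1.0, 2.0]), 4).shape)  # torch.Size([2, 1, 1, 1])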
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/random_latent_generator.py b/submodules/TTS/TTS/tts/layers/tortoise/random_latent_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b39c1e4b22ee5a9ad84a1711a08a8530c4d76b7
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/random_latent_generator.py
@@ -0,0 +1,55 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5):
+ if bias is not None:
+ rest_dim = [1] * (input.ndim - bias.ndim - 1)
+ return (
+ F.leaky_relu(
+ input + bias.view(1, bias.shape[0], *rest_dim),
+ negative_slope=negative_slope,
+ )
+ * scale
+ )
+ else:
+        return F.leaky_relu(input, negative_slope=negative_slope) * scale
+
+
+class EqualLinear(nn.Module):
+ def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1):
+ super().__init__()
+ self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
+ if bias:
+ self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
+ else:
+ self.bias = None
+ self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+ self.lr_mul = lr_mul
+
+ def forward(self, input):
+ out = F.linear(input, self.weight * self.scale)
+        out = fused_leaky_relu(out, self.bias * self.lr_mul if self.bias is not None else None)
+ return out
+
+
+class RandomLatentConverter(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.layers = nn.Sequential(
+ *[EqualLinear(channels, channels, lr_mul=0.1) for _ in range(5)], nn.Linear(channels, channels)
+ )
+ self.channels = channels
+
+ def forward(self, ref):
+ r = torch.randn(ref.shape[0], self.channels, device=ref.device)
+ y = self.layers(r)
+ return y
+
+
+if __name__ == "__main__":
+ model = RandomLatentConverter(512)
+ model(torch.randn(5, 512))
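+    # (Illustrative) the converter only uses the batch size and device of `ref`,
+    # returning a freshly sampled latent per item:
+    print(model(torch.randn(5, 512)).shape)  # torch.Size([5, 512])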
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/tokenizer.py b/submodules/TTS/TTS/tts/layers/tortoise/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d243d6558d0dfcbfee59769f991ce5ad9a603678
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/tokenizer.py
@@ -0,0 +1,37 @@
+import os
+
+import torch
+from tokenizers import Tokenizer
+
+from TTS.tts.utils.text.cleaners import english_cleaners
+
+DEFAULT_VOCAB_FILE = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
+)
+
+
+class VoiceBpeTokenizer:
+ def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None):
+ self.tokenizer = None
+ if vocab_file is not None:
+ self.tokenizer = Tokenizer.from_file(vocab_file)
+ if vocab_str is not None:
+ self.tokenizer = Tokenizer.from_str(vocab_str)
+
+ def preprocess_text(self, txt):
+ txt = english_cleaners(txt)
+ return txt
+
+ def encode(self, txt):
+ txt = self.preprocess_text(txt)
+ txt = txt.replace(" ", "[SPACE]")
+ return self.tokenizer.encode(txt).ids
+
+ def decode(self, seq):
+ if isinstance(seq, torch.Tensor):
+ seq = seq.cpu().numpy()
+ txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
+ txt = txt.replace("[SPACE]", " ")
+ txt = txt.replace("[STOP]", "")
+ txt = txt.replace("[UNK]", "")
+ return txt
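+
+
+if __name__ == "__main__":
+    # Illustrative round trip (assumes the default tokenizer.json asset ships with the
+    # package); spaces are mapped to the [SPACE] token before BPE encoding.
+    tok = VoiceBpeTokenizer()
+    ids = tok.encode("Hello world.")
+    print(ids)
+    print(tok.decode(ids))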
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/transformer.py b/submodules/TTS/TTS/tts/layers/tortoise/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..70d46aa3e03626d8123700a5c2541d2d1a7314b4
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/transformer.py
@@ -0,0 +1,229 @@
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+
+# helpers
+
+
+def exists(val):
+ return val is not None
+
+
+def default(val, d):
+ return val if exists(val) else d
+
+
+def cast_tuple(val, depth=1):
+ if isinstance(val, list):
+ val = tuple(val)
+ return val if isinstance(val, tuple) else (val,) * depth
+
+
+def max_neg_value(t):
+ return -torch.finfo(t.dtype).max
+
+
+def stable_softmax(t, dim=-1, alpha=32**2):
+ t = t / alpha
+ t = t - torch.amax(t, dim=dim, keepdim=True).detach()
+ return (t * alpha).softmax(dim=dim)
+
+
+def route_args(router, args, depth):
+ routed_args = [(dict(), dict()) for _ in range(depth)]
+ matched_keys = [key for key in args.keys() if key in router]
+
+ for key in matched_keys:
+ val = args[key]
+ for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])):
+ new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes)
+ routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})
+ return routed_args
+
+
+# classes
+class SequentialSequence(nn.Module):
+ def __init__(self, layers, args_route={}, layer_dropout=0.0):
+ super().__init__()
+ assert all(
+ len(route) == len(layers) for route in args_route.values()
+ ), "each argument route map must have the same depth as the number of sequential layers"
+ self.layers = layers
+ self.args_route = args_route
+ self.layer_dropout = layer_dropout
+
+ def forward(self, x, **kwargs):
+ args = route_args(self.args_route, kwargs, len(self.layers))
+ layers_and_args = list(zip(self.layers, args))
+
+ for (f, g), (f_args, g_args) in layers_and_args:
+ x = x + f(x, **f_args)
+ x = x + g(x, **g_args)
+ return x
+
+
+class DivideMax(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x):
+ maxes = x.amax(dim=self.dim, keepdim=True).detach()
+ return x / maxes
+
+
+# https://arxiv.org/abs/2103.17239
+class LayerScale(nn.Module):
+ def __init__(self, dim, depth, fn):
+ super().__init__()
+ if depth <= 18:
+ init_eps = 0.1
+ elif depth > 18 and depth <= 24:
+ init_eps = 1e-5
+ else:
+ init_eps = 1e-6
+
+ scale = torch.zeros(1, 1, dim).fill_(init_eps)
+ self.scale = nn.Parameter(scale)
+ self.fn = fn
+
+ def forward(self, x, **kwargs):
+ return self.fn(x, **kwargs) * self.scale
+
+
+# layer norm
+
+
+class PreNorm(nn.Module):
+ def __init__(self, dim, fn, sandwich=False):
+ super().__init__()
+ self.norm = nn.LayerNorm(dim)
+ self.norm_out = nn.LayerNorm(dim) if sandwich else nn.Identity()
+ self.fn = fn
+
+ def forward(self, x, **kwargs):
+ x = self.norm(x)
+ x = self.fn(x, **kwargs)
+ return self.norm_out(x)
+
+
+# feed forward
+
+
+class GEGLU(nn.Module):
+ def forward(self, x):
+ x, gates = x.chunk(2, dim=-1)
+ return x * F.gelu(gates)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dropout=0.0, mult=4.0):
+        super().__init__()
+        inner_dim = int(dim * mult)  # cast so a float `mult` still produces valid integer layer sizes
+        self.net = nn.Sequential(
+            nn.Linear(dim, inner_dim * 2),
+            GEGLU(),
+            nn.Dropout(dropout),
+            nn.Linear(inner_dim, dim),
+ )
+
+ def forward(self, x):
+ return self.net(x)
+
+
+# Attention
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, seq_len, causal=True, heads=8, dim_head=64, dropout=0.0):
+ super().__init__()
+ inner_dim = dim_head * heads
+ self.heads = heads
+ self.seq_len = seq_len
+ self.scale = dim_head**-0.5
+
+ self.causal = causal
+
+ self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
+ self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+
+ def forward(self, x, mask=None):
+ b, n, _, h, device = *x.shape, self.heads, x.device
+ softmax = torch.softmax
+
+ qkv = self.to_qkv(x).chunk(3, dim=-1)
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)
+
+ q = q * self.scale
+
+ dots = torch.einsum("b h i d, b h j d -> b h i j", q, k)
+ mask_value = max_neg_value(dots)
+
+ if exists(mask):
+ mask = rearrange(mask, "b j -> b () () j")
+ dots.masked_fill_(~mask, mask_value)
+ del mask
+
+ if self.causal:
+ i, j = dots.shape[-2:]
+ mask = torch.ones(i, j, device=device).triu_(j - i + 1).bool()
+ dots.masked_fill_(mask, mask_value)
+
+ attn = softmax(dots, dim=-1)
+
+ out = torch.einsum("b h i j, b h j d -> b h i d", attn, v)
+ out = rearrange(out, "b h n d -> b n (h d)")
+ out = self.to_out(out)
+ return out
+
+
+# main transformer class
+class Transformer(nn.Module):
+ def __init__(
+ self,
+ *,
+ dim,
+ depth,
+ seq_len,
+ causal=True,
+ heads=8,
+ dim_head=64,
+ ff_mult=4,
+ attn_dropout=0.0,
+ ff_dropout=0.0,
+ sparse_attn=False,
+ sandwich_norm=False,
+ ):
+ super().__init__()
+ layers = nn.ModuleList([])
+ sparse_layer = cast_tuple(sparse_attn, depth)
+
+ for ind, sparse_attn in zip(range(depth), sparse_layer):
+ attn = Attention(
+ dim,
+ causal=causal,
+ seq_len=seq_len,
+ heads=heads,
+ dim_head=dim_head,
+ dropout=attn_dropout,
+ )
+
+ ff = FeedForward(dim, mult=ff_mult, dropout=ff_dropout)
+
+ layers.append(
+ nn.ModuleList(
+ [
+ LayerScale(dim, ind + 1, PreNorm(dim, attn, sandwich=sandwich_norm)),
+ LayerScale(dim, ind + 1, PreNorm(dim, ff, sandwich=sandwich_norm)),
+ ]
+ )
+ )
+
+ execute_type = SequentialSequence
+ route_attn = ((True, False),) * depth
+ attn_route_map = {"mask": route_attn}
+
+ self.layers = execute_type(layers, args_route=attn_route_map)
+
+ def forward(self, x, **kwargs):
+ return self.layers(x, **kwargs)
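+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only): a tiny causal transformer over random
+    # features; the boolean mask is routed to the attention blocks via `args_route`.
+    model = Transformer(dim=64, depth=2, seq_len=32, heads=4, dim_head=16)
+    x = torch.randn(2, 32, 64)
+    mask = torch.ones(2, 32, dtype=torch.bool)
+    print(model(x, mask=mask).shape)  # torch.Size([2, 32, 64])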
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/utils.py b/submodules/TTS/TTS/tts/layers/tortoise/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..810a9e7f7a8ab4a6a48974367020961f9a9967f4
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/utils.py
@@ -0,0 +1,46 @@
+import os
+from urllib import request
+
+from tqdm import tqdm
+
+DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models")
+MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR)
+MODELS_DIR = "/data/speech_synth/models/"
+MODELS = {
+ "autoregressive.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth",
+ "classifier.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/classifier.pth",
+ "clvp2.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clvp2.pth",
+ "diffusion_decoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth",
+ "vocoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth",
+ "rlg_auto.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth",
+ "rlg_diffuser.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth",
+}
+
+
+def download_models(specific_models=None):
+ """
+ Call to download all the models that Tortoise uses.
+ """
+ os.makedirs(MODELS_DIR, exist_ok=True)
+ for model_name, url in MODELS.items():
+ if specific_models is not None and model_name not in specific_models:
+ continue
+ model_path = os.path.join(MODELS_DIR, model_name)
+ if os.path.exists(model_path):
+ continue
+ print(f"Downloading {model_name} from {url}...")
+ with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
+ request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n))
+ print("Done.")
+
+
+def get_model_path(model_name, models_dir=MODELS_DIR):
+ """
+ Get path to given model, download it if it doesn't exist.
+ """
+ if model_name not in MODELS:
+ raise ValueError(f"Model {model_name} not found in available models.")
+ model_path = os.path.join(models_dir, model_name)
+ if not os.path.exists(model_path) and models_dir == MODELS_DIR:
+ download_models([model_name])
+ return model_path
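+
+
+if __name__ == "__main__":
+    # Illustrative: resolving against a custom `models_dir` only builds the path and
+    # never triggers a download (downloads are tied to the default MODELS_DIR).
+    print(get_model_path("vocoder.pth", models_dir="/tmp/tortoise_models"))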
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/vocoder.py b/submodules/TTS/TTS/tts/layers/tortoise/vocoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5200c26738b55273a74c86e4308a6bd6783f5d7
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/vocoder.py
@@ -0,0 +1,405 @@
+from dataclasses import dataclass
+from enum import Enum
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.utils.parametrize as parametrize
+
+MAX_WAV_VALUE = 32768.0
+
+
+class KernelPredictor(torch.nn.Module):
+ """Kernel predictor for the location-variable convolutions"""
+
+ def __init__(
+ self,
+ cond_channels,
+ conv_in_channels,
+ conv_out_channels,
+ conv_layers,
+ conv_kernel_size=3,
+ kpnet_hidden_channels=64,
+ kpnet_conv_size=3,
+ kpnet_dropout=0.0,
+ kpnet_nonlinear_activation="LeakyReLU",
+ kpnet_nonlinear_activation_params={"negative_slope": 0.1},
+ ):
+ """
+ Args:
+            cond_channels (int): number of channels of the conditioning sequence,
+            conv_in_channels (int): number of channels of the input sequence,
+            conv_out_channels (int): number of channels of the output sequence,
+            conv_layers (int): number of layers
+ """
+ super().__init__()
+
+ self.conv_in_channels = conv_in_channels
+ self.conv_out_channels = conv_out_channels
+ self.conv_kernel_size = conv_kernel_size
+ self.conv_layers = conv_layers
+
+ kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w
+ kpnet_bias_channels = conv_out_channels * conv_layers # l_b
+
+ self.input_conv = nn.Sequential(
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
+ ),
+ getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
+ )
+
+ self.residual_convs = nn.ModuleList()
+ padding = (kpnet_conv_size - 1) // 2
+ for _ in range(3):
+ self.residual_convs.append(
+ nn.Sequential(
+ nn.Dropout(kpnet_dropout),
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_hidden_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ ),
+ getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_hidden_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ ),
+ getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
+ )
+ )
+ self.kernel_conv = nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_kernel_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ )
+ self.bias_conv = nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ kpnet_hidden_channels,
+ kpnet_bias_channels,
+ kpnet_conv_size,
+ padding=padding,
+ bias=True,
+ )
+ )
+
+ def forward(self, c):
+ """
+ Args:
+ c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
+ """
+ batch, _, cond_length = c.shape
+ c = self.input_conv(c)
+ for residual_conv in self.residual_convs:
+ residual_conv.to(c.device)
+ c = c + residual_conv(c)
+ k = self.kernel_conv(c)
+ b = self.bias_conv(c)
+ kernels = k.contiguous().view(
+ batch,
+ self.conv_layers,
+ self.conv_in_channels,
+ self.conv_out_channels,
+ self.conv_kernel_size,
+ cond_length,
+ )
+ bias = b.contiguous().view(
+ batch,
+ self.conv_layers,
+ self.conv_out_channels,
+ cond_length,
+ )
+
+ return kernels, bias
+
+ def remove_weight_norm(self):
+ parametrize.remove_parametrizations(self.input_conv[0], "weight")
+ parametrize.remove_parametrizations(self.kernel_conv, "weight")
+        parametrize.remove_parametrizations(self.bias_conv, "weight")
+ for block in self.residual_convs:
+ parametrize.remove_parametrizations(block[1], "weight")
+ parametrize.remove_parametrizations(block[3], "weight")
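+
+# Shape note (illustrative): for conditioning input c of shape (B, cond_channels, T),
+# KernelPredictor.forward returns kernels of shape
+# (B, conv_layers, conv_in_channels, conv_out_channels, conv_kernel_size, T) and bias of
+# shape (B, conv_layers, conv_out_channels, T); LVCBlock below consumes one slice per layer.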
+
+
+class LVCBlock(torch.nn.Module):
+ """the location-variable convolutions"""
+
+ def __init__(
+ self,
+ in_channels,
+ cond_channels,
+ stride,
+ dilations=[1, 3, 9, 27],
+ lReLU_slope=0.2,
+ conv_kernel_size=3,
+ cond_hop_length=256,
+ kpnet_hidden_channels=64,
+ kpnet_conv_size=3,
+ kpnet_dropout=0.0,
+ ):
+ super().__init__()
+
+ self.cond_hop_length = cond_hop_length
+ self.conv_layers = len(dilations)
+ self.conv_kernel_size = conv_kernel_size
+
+ self.kernel_predictor = KernelPredictor(
+ cond_channels=cond_channels,
+ conv_in_channels=in_channels,
+ conv_out_channels=2 * in_channels,
+ conv_layers=len(dilations),
+ conv_kernel_size=conv_kernel_size,
+ kpnet_hidden_channels=kpnet_hidden_channels,
+ kpnet_conv_size=kpnet_conv_size,
+ kpnet_dropout=kpnet_dropout,
+ kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
+ )
+
+ self.convt_pre = nn.Sequential(
+ nn.LeakyReLU(lReLU_slope),
+ nn.utils.parametrizations.weight_norm(
+ nn.ConvTranspose1d(
+ in_channels,
+ in_channels,
+ 2 * stride,
+ stride=stride,
+ padding=stride // 2 + stride % 2,
+ output_padding=stride % 2,
+ )
+ ),
+ )
+
+ self.conv_blocks = nn.ModuleList()
+ for dilation in dilations:
+ self.conv_blocks.append(
+ nn.Sequential(
+ nn.LeakyReLU(lReLU_slope),
+ nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(
+ in_channels,
+ in_channels,
+ conv_kernel_size,
+ padding=dilation * (conv_kernel_size - 1) // 2,
+ dilation=dilation,
+ )
+ ),
+ nn.LeakyReLU(lReLU_slope),
+ )
+ )
+
+ def forward(self, x, c):
+ """forward propagation of the location-variable convolutions.
+ Args:
+ x (Tensor): the input sequence (batch, in_channels, in_length)
+ c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
+
+ Returns:
+ Tensor: the output sequence (batch, in_channels, in_length)
+ """
+ _, in_channels, _ = x.shape # (B, c_g, L')
+
+ x = self.convt_pre(x) # (B, c_g, stride * L')
+ kernels, bias = self.kernel_predictor(c)
+
+ for i, conv in enumerate(self.conv_blocks):
+ output = conv(x) # (B, c_g, stride * L')
+
+ k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length)
+ b = bias[:, i, :, :] # (B, 2 * c_g, cond_length)
+
+ output = self.location_variable_convolution(
+ output, k, b, hop_size=self.cond_hop_length
+ ) # (B, 2 * c_g, stride * L'): LVC
+ x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
+ output[:, in_channels:, :]
+ ) # (B, c_g, stride * L'): GAU
+
+ return x
+
+ def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256):
+ """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
+ Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
+ Args:
+ x (Tensor): the input sequence (batch, in_channels, in_length).
+ kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
+ bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
+ dilation (int): the dilation of convolution.
+ hop_size (int): the hop_size of the conditioning sequence.
+ Returns:
+ (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
+ """
+ batch, _, in_length = x.shape
+ batch, _, out_channels, kernel_size, kernel_length = kernel.shape
+ assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
+
+ padding = dilation * int((kernel_size - 1) / 2)
+ x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding)
+ x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
+
+ if hop_size < dilation:
+ x = F.pad(x, (0, dilation), "constant", 0)
+ x = x.unfold(
+ 3, dilation, dilation
+ ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
+ x = x[:, :, :, :, :hop_size]
+ x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
+ x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
+
+ o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
+ o = o.to(memory_format=torch.channels_last_3d)
+ bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
+ o = o + bias
+ o = o.contiguous().view(batch, out_channels, -1)
+
+ return o
+
+ def remove_weight_norm(self):
+ self.kernel_predictor.remove_weight_norm()
+ parametrize.remove_parametrizations(self.convt_pre[1], "weight")
+ for block in self.conv_blocks:
+ parametrize.remove_parametrizations(block[1], "weight")
+
+
+class UnivNetGenerator(nn.Module):
+ """
+ UnivNet Generator
+
+ Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py.
+ """
+
+ def __init__(
+ self,
+ noise_dim=64,
+ channel_size=32,
+ dilations=[1, 3, 9, 27],
+ strides=[8, 8, 4],
+ lReLU_slope=0.2,
+ kpnet_conv_size=3,
+ # Below are MEL configurations options that this generator requires.
+ hop_length=256,
+ n_mel_channels=100,
+ ):
+ super(UnivNetGenerator, self).__init__()
+ self.mel_channel = n_mel_channels
+ self.noise_dim = noise_dim
+ self.hop_length = hop_length
+ channel_size = channel_size
+ kpnet_conv_size = kpnet_conv_size
+
+ self.res_stack = nn.ModuleList()
+ hop_length = 1
+ for stride in strides:
+ hop_length = stride * hop_length
+ self.res_stack.append(
+ LVCBlock(
+ channel_size,
+ n_mel_channels,
+ stride=stride,
+ dilations=dilations,
+ lReLU_slope=lReLU_slope,
+ cond_hop_length=hop_length,
+ kpnet_conv_size=kpnet_conv_size,
+ )
+ )
+
+ self.conv_pre = nn.utils.parametrizations.weight_norm(
+ nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")
+ )
+
+ self.conv_post = nn.Sequential(
+ nn.LeakyReLU(lReLU_slope),
+ nn.utils.parametrizations.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")),
+ nn.Tanh(),
+ )
+
+ def forward(self, c, z):
+ """
+ Args:
+ c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length)
+ z (Tensor): the noise sequence (batch, noise_dim, in_length)
+
+ """
+ z = self.conv_pre(z) # (B, c_g, L)
+
+ for res_block in self.res_stack:
+ res_block.to(z.device)
+ z = res_block(z, c) # (B, c_g, L * s_0 * ... * s_i)
+
+ z = self.conv_post(z) # (B, 1, L * 256)
+
+ return z
+
+ def eval(self, inference=False):
+ super(UnivNetGenerator, self).eval()
+ # don't remove weight norm while validation in training loop
+ if inference:
+ self.remove_weight_norm()
+
+ def remove_weight_norm(self):
+ parametrize.remove_parametrizations(self.conv_pre, "weight")
+
+ for layer in self.conv_post:
+ if len(layer.state_dict()) != 0:
+ parametrize.remove_parametrizations(layer, "weight")
+
+ for res_block in self.res_stack:
+ res_block.remove_weight_norm()
+
+ def inference(self, c, z=None):
+ # pad input mel with zeros to cut artifact
+ # see https://github.com/seungwonpark/melgan/issues/8
+ zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device)
+ mel = torch.cat((c, zero), dim=2)
+
+ if z is None:
+ z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device)
+
+ audio = self.forward(mel, z)
+ audio = audio[:, :, : -(self.hop_length * 10)]
+ audio = audio.clamp(min=-1, max=1)
+ return audio
+
+
+@dataclass
+class VocType:
+ constructor: Callable[[], nn.Module]
+ model_path: str
+ subkey: Optional[str] = None
+
+ def optionally_index(self, model_dict):
+ if self.subkey is not None:
+ return model_dict[self.subkey]
+ return model_dict
+
+
+class VocConf(Enum):
+ Univnet = VocType(UnivNetGenerator, "vocoder.pth", "model_g")
+
+
+if __name__ == "__main__":
+ model = UnivNetGenerator()
+
+ c = torch.randn(3, 100, 10)
+ z = torch.randn(3, 64, 10)
+ print(c.shape)
+
+ y = model(c, z)
+ print(y.shape)
+ assert y.shape == torch.Size([3, 1, 2560])
+
+ pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ print(pytorch_total_params)
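+
+    # Also exercise the inference() helper (illustrative): it pads the mel, samples the
+    # noise internally when z is None, and trims the padded tail from the waveform.
+    audio = model.inference(c)
+    print(audio.shape)  # torch.Size([3, 1, 2560])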
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/wav2vec_alignment.py b/submodules/TTS/TTS/tts/layers/tortoise/wav2vec_alignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..47456cc5ac41b7ed9522fe543affc8482218730c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/wav2vec_alignment.py
@@ -0,0 +1,150 @@
+import torch
+import torchaudio
+from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
+
+
+def max_alignment(s1, s2, skip_character="~", record=None):
+ """
+ A clever function that aligns s1 to s2 as best it can. Wherever a character from s1 is not found in s2, a '~' is
+ used to replace that character.
+
+ Finally got to use my DP skills!
+ """
+ if record is None:
+ record = {}
+ assert skip_character not in s1, f"Found the skip character {skip_character} in the provided string, {s1}"
+ if len(s1) == 0:
+ return ""
+ if len(s2) == 0:
+ return skip_character * len(s1)
+ if s1 == s2:
+ return s1
+ if s1[0] == s2[0]:
+ return s1[0] + max_alignment(s1[1:], s2[1:], skip_character, record)
+
+ take_s1_key = (len(s1), len(s2) - 1)
+ if take_s1_key in record:
+ take_s1, take_s1_score = record[take_s1_key]
+ else:
+ take_s1 = max_alignment(s1, s2[1:], skip_character, record)
+ take_s1_score = len(take_s1.replace(skip_character, ""))
+ record[take_s1_key] = (take_s1, take_s1_score)
+
+ take_s2_key = (len(s1) - 1, len(s2))
+ if take_s2_key in record:
+ take_s2, take_s2_score = record[take_s2_key]
+ else:
+ take_s2 = max_alignment(s1[1:], s2, skip_character, record)
+ take_s2_score = len(take_s2.replace(skip_character, ""))
+ record[take_s2_key] = (take_s2, take_s2_score)
+
+ return take_s1 if take_s1_score > take_s2_score else skip_character + take_s2
+
+
+class Wav2VecAlignment:
+ """
+ Uses wav2vec2 to perform audio<->text alignment.
+ """
+
+ def __init__(self, device="cuda"):
+ self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
+ self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-960h")
+ self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("jbetker/tacotron-symbols")
+ self.device = device
+
+ def align(self, audio, expected_text, audio_sample_rate=24000):
+ orig_len = audio.shape[-1]
+
+ with torch.no_grad():
+ self.model = self.model.to(self.device)
+ audio = audio.to(self.device)
+ audio = torchaudio.functional.resample(audio, audio_sample_rate, 16000)
+ clip_norm = (audio - audio.mean()) / torch.sqrt(audio.var() + 1e-7)
+ logits = self.model(clip_norm).logits
+ self.model = self.model.cpu()
+
+ logits = logits[0]
+ pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
+
+ fixed_expectation = max_alignment(expected_text.lower(), pred_string)
+ w2v_compression = orig_len // logits.shape[0]
+ expected_tokens = self.tokenizer.encode(fixed_expectation)
+ expected_chars = list(fixed_expectation)
+ if len(expected_tokens) == 1:
+ return [0] # The alignment is simple; there is only one token.
+ expected_tokens.pop(0) # The first token is a given.
+ expected_chars.pop(0)
+
+ alignments = [0]
+
+ def pop_till_you_win():
+ if len(expected_tokens) == 0:
+ return None
+ popped = expected_tokens.pop(0)
+ popped_char = expected_chars.pop(0)
+ while popped_char == "~":
+ alignments.append(-1)
+ if len(expected_tokens) == 0:
+ return None
+ popped = expected_tokens.pop(0)
+ popped_char = expected_chars.pop(0)
+ return popped
+
+ next_expected_token = pop_till_you_win()
+ for i, logit in enumerate(logits):
+ top = logit.argmax()
+ if next_expected_token == top:
+ alignments.append(i * w2v_compression)
+ if len(expected_tokens) > 0:
+ next_expected_token = pop_till_you_win()
+ else:
+ break
+
+ pop_till_you_win()
+ if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+ torch.save([audio, expected_text], "alignment_debug.pth")
+ assert False, (
+ "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to"
+ "your current working directory. Please report this along with the file so it can get fixed."
+ )
+
+ # Now fix up alignments. Anything with -1 should be interpolated.
+ alignments.append(orig_len) # This'll get removed but makes the algorithm below more readable.
+ for i in range(len(alignments)):
+ if alignments[i] == -1:
+ for j in range(i + 1, len(alignments)):
+ if alignments[j] != -1:
+ next_found_token = j
+ break
+ for j in range(i, next_found_token):
+ gap = alignments[next_found_token] - alignments[i - 1]
+ alignments[j] = (j - i + 1) * gap // (next_found_token - i + 1) + alignments[i - 1]
+
+ return alignments[:-1]
+
+ def redact(self, audio, expected_text, audio_sample_rate=24000):
+ if "[" not in expected_text:
+ return audio
+ splitted = expected_text.split("[")
+ fully_split = [splitted[0]]
+ for spl in splitted[1:]:
+ assert "]" in spl, 'Every "[" character must be paired with a "]" with no nesting.'
+ fully_split.extend(spl.split("]"))
+
+ # At this point, fully_split is a list of strings, with every other string being something that should be redacted.
+ non_redacted_intervals = []
+ last_point = 0
+ for i in range(len(fully_split)):
+ if i % 2 == 0:
+ end_interval = max(0, last_point + len(fully_split[i]) - 1)
+ non_redacted_intervals.append((last_point, end_interval))
+ last_point += len(fully_split[i])
+
+ bare_text = "".join(fully_split)
+ alignments = self.align(audio, bare_text, audio_sample_rate)
+
+ output_audio = []
+ for nri in non_redacted_intervals:
+ start, stop = nri
+ output_audio.append(audio[:, alignments[start] : alignments[stop]])
+ return torch.cat(output_audio, dim=-1)
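+
+
+if __name__ == "__main__":
+    # Quick check of the alignment helper (illustrative): characters of the first string
+    # that cannot be matched in the second come back as the skip character '~'.
+    print(max_alignment("abc", "axc"))  # -> a~c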
diff --git a/submodules/TTS/TTS/tts/layers/tortoise/xtransformers.py b/submodules/TTS/TTS/tts/layers/tortoise/xtransformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eb3f77269c0e7b718d350217796ec704543c681
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/tortoise/xtransformers.py
@@ -0,0 +1,1259 @@
+import math
+from collections import namedtuple
+from functools import partial
+from inspect import isfunction
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import einsum, nn
+
+DEFAULT_DIM_HEAD = 64
+
+Intermediates = namedtuple("Intermediates", ["pre_softmax_attn", "post_softmax_attn"])
+
+LayerIntermediates = namedtuple(
+ "Intermediates",
+ [
+ "hiddens",
+ "attn_intermediates",
+ "past_key_values",
+ ],
+)
+
+
+# helpers
+
+
+def exists(val):
+ return val is not None
+
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if isfunction(d) else d
+
+
+def cast_tuple(val, depth):
+ return val if isinstance(val, tuple) else (val,) * depth
+
+
+class always:
+ def __init__(self, val):
+ self.val = val
+
+ def __call__(self, *args, **kwargs):
+ return self.val
+
+
+class not_equals:
+ def __init__(self, val):
+ self.val = val
+
+ def __call__(self, x, *args, **kwargs):
+ return x != self.val
+
+
+class equals:
+ def __init__(self, val):
+ self.val = val
+
+ def __call__(self, x, *args, **kwargs):
+ return x == self.val
+
+
+def max_neg_value(tensor):
+ return -torch.finfo(tensor.dtype).max
+
+
+def l2norm(t):
+ return F.normalize(t, p=2, dim=-1)
+
+
+# init helpers
+
+
+def init_zero_(layer):
+ nn.init.constant_(layer.weight, 0.0)
+ if exists(layer.bias):
+ nn.init.constant_(layer.bias, 0.0)
+
+
+# keyword argument helpers
+
+
+def pick_and_pop(keys, d):
+ values = list(map(lambda key: d.pop(key), keys))
+ return dict(zip(keys, values))
+
+
+def group_dict_by_key(cond, d):
+ return_val = [dict(), dict()]
+ for key in d.keys():
+ match = bool(cond(key))
+ ind = int(not match)
+ return_val[ind][key] = d[key]
+ return (*return_val,)
+
+
+def string_begins_with(prefix, str):
+ return str.startswith(prefix)
+
+
+def group_by_key_prefix(prefix, d):
+ return group_dict_by_key(partial(string_begins_with, prefix), d)
+
+
+def groupby_prefix_and_trim(prefix, d):
+ kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
+ kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix) :], x[1]), tuple(kwargs_with_prefix.items())))
+ return kwargs_without_prefix, kwargs
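+
+# Example (illustrative): groupby_prefix_and_trim("attn_", {"attn_dropout": 0.1, "ff_mult": 4})
+# returns ({"dropout": 0.1}, {"ff_mult": 4}): the prefixed kwargs with the prefix stripped,
+# plus the remaining kwargs untouched. AttentionLayers uses this to split its **kwargs.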
+
+
+# activations
+
+
+class ReluSquared(nn.Module):
+ def forward(self, x):
+ return F.relu(x) ** 2
+
+
+# positional embeddings
+
+
+class AbsolutePositionalEmbedding(nn.Module):
+ def __init__(self, dim, max_seq_len):
+ super().__init__()
+ self.scale = dim**-0.5
+ self.emb = nn.Embedding(max_seq_len, dim)
+
+ def forward(self, x):
+ n = torch.arange(x.shape[1], device=x.device)
+ pos_emb = self.emb(n)
+ pos_emb = rearrange(pos_emb, "n d -> () n d")
+ return pos_emb * self.scale
+
+
+class FixedPositionalEmbedding(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+ self.register_buffer("inv_freq", inv_freq)
+
+ def forward(self, x, seq_dim=1, offset=0):
+ t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
+ sinusoid_inp = torch.einsum("i , j -> i j", t, self.inv_freq)
+ emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
+ return rearrange(emb, "n d -> () n d")
+
+
+class RelativePositionBias(nn.Module):
+ def __init__(self, scale, causal=False, num_buckets=32, max_distance=128, heads=8):
+ super().__init__()
+ self.scale = scale
+ self.causal = causal
+ self.num_buckets = num_buckets
+ self.max_distance = max_distance
+ self.relative_attention_bias = nn.Embedding(num_buckets, heads)
+
+ @staticmethod
+ def _relative_position_bucket(relative_position, causal=True, num_buckets=32, max_distance=128):
+ ret = 0
+ n = -relative_position
+ if not causal:
+ num_buckets //= 2
+ ret += (n < 0).long() * num_buckets
+ n = torch.abs(n)
+ else:
+ n = torch.max(n, torch.zeros_like(n))
+
+ max_exact = num_buckets // 2
+ is_small = n < max_exact
+
+ val_if_large = (
+ max_exact
+ + (torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)).long()
+ )
+ val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+
+ ret += torch.where(is_small, n, val_if_large)
+ return ret
+
+ def forward(self, qk_dots):
+ i, j, device = *qk_dots.shape[-2:], qk_dots.device
+ q_pos = torch.arange(i, dtype=torch.long, device=device)
+ k_pos = torch.arange(j, dtype=torch.long, device=device)
+ rel_pos = k_pos[None, :] - q_pos[:, None]
+ rp_bucket = self._relative_position_bucket(
+ rel_pos, causal=self.causal, num_buckets=self.num_buckets, max_distance=self.max_distance
+ )
+ values = self.relative_attention_bias(rp_bucket)
+ bias = rearrange(values, "i j h -> () h i j")
+ return qk_dots + (bias * self.scale)
+
+
+class AlibiPositionalBias(nn.Module):
+ def __init__(self, heads, **kwargs):
+ super().__init__()
+ self.heads = heads
+ slopes = torch.Tensor(self._get_slopes(heads))
+ slopes = rearrange(slopes, "h -> () h () ()")
+ self.register_buffer("slopes", slopes, persistent=False)
+ self.register_buffer("bias", None, persistent=False)
+
+ @staticmethod
+ def _get_slopes(heads):
+ def get_slopes_power_of_2(n):
+ start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+ ratio = start
+ return [start * ratio**i for i in range(n)]
+
+ if math.log2(heads).is_integer():
+ return get_slopes_power_of_2(heads)
+
+ closest_power_of_2 = 2 ** math.floor(math.log2(heads))
+ return (
+ get_slopes_power_of_2(closest_power_of_2)
+ + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][: heads - closest_power_of_2]
+ )
+
+ def forward(self, qk_dots):
+ h, i, j, device = *qk_dots.shape[-3:], qk_dots.device
+
+ if exists(self.bias) and self.bias.shape[-1] >= j:
+ return qk_dots + self.bias[..., :j]
+
+ bias = torch.arange(j, device=device)
+ bias = rearrange(bias, "j -> () () () j")
+ bias = bias * self.slopes
+
+ num_heads_unalibied = h - bias.shape[1]
+ bias = F.pad(bias, (0, 0, 0, 0, 0, num_heads_unalibied))
+
+ self.register_buffer("bias", bias, persistent=False)
+ return qk_dots + self.bias
+
+
+class LearnedAlibiPositionalBias(AlibiPositionalBias):
+ def __init__(self, heads, bidirectional=False):
+ super().__init__(heads)
+ los_slopes = torch.log(self.slopes)
+ self.learned_logslopes = nn.Parameter(los_slopes)
+
+ self.bidirectional = bidirectional
+ if self.bidirectional:
+ self.learned_logslopes_future = nn.Parameter(los_slopes)
+
+ def forward(self, qk_dots):
+ h, i, j, device = *qk_dots.shape[-3:], qk_dots.device
+
+ def get_slopes(param):
+ return F.pad(param.exp(), (0, 0, 0, 0, 0, h - param.shape[1]))
+
+ if exists(self.bias) and self.bias.shape[-1] >= j:
+ bias = self.bias[..., :i, :j]
+ else:
+ i_arange = torch.arange(i, device=device)
+ j_arange = torch.arange(j, device=device)
+ bias = rearrange(j_arange, "j -> 1 1 1 j") - rearrange(i_arange, "i -> 1 1 i 1")
+ self.register_buffer("bias", bias, persistent=False)
+
+ if self.bidirectional:
+ past_slopes = get_slopes(self.learned_logslopes)
+ future_slopes = get_slopes(self.learned_logslopes_future)
+ bias = torch.tril(bias * past_slopes) + torch.triu(bias * future_slopes)
+ else:
+ slopes = get_slopes(self.learned_logslopes)
+ bias = bias * slopes
+
+ return qk_dots + bias
+
+
+class RotaryEmbedding(nn.Module):
+ def __init__(self, dim):
+ super().__init__()
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+ self.register_buffer("inv_freq", inv_freq)
+
+ def forward(self, max_seq_len, device):
+ t = torch.arange(max_seq_len, device=device).type_as(self.inv_freq)
+ freqs = torch.einsum("i , j -> i j", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ return rearrange(emb, "n d -> () () n d")
+
+
+def rotate_half(x):
+ x = rearrange(x, "... (j d) -> ... j d", j=2)
+ x1, x2 = x.unbind(dim=-2)
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(t, freqs):
+ seq_len = t.shape[-2]
+ freqs = freqs[:, :, -seq_len:]
+ return (t * freqs.cos()) + (rotate_half(t) * freqs.sin())
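+
+# Usage note (illustrative): RotaryEmbedding(dim) produces freqs of shape
+# (1, 1, max_seq_len, dim); apply_rotary_pos_emb(t, freqs) rotates pairs of channels of
+# t (shape (b, h, n, dim)) by position-dependent angles and preserves t's shape. In
+# Attention.forward only the first `rotary_dim` channels of q/k/v are rotated.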
+
+
+# norms
+
+
+class Scale(nn.Module):
+ def __init__(self, value, fn):
+ super().__init__()
+ self.value = value
+ self.fn = fn
+
+ def forward(self, x, **kwargs):
+ out = self.fn(x, **kwargs)
+ scale_fn = lambda t: t * self.value
+
+ if not isinstance(out, tuple):
+ return scale_fn(out)
+
+ return (scale_fn(out[0]), *out[1:])
+
+
+class Rezero(nn.Module):
+ def __init__(self, fn):
+ super().__init__()
+ self.fn = fn
+ self.g = nn.Parameter(torch.zeros(1))
+
+ def forward(self, x, **kwargs):
+ out = self.fn(x, **kwargs)
+ rezero_fn = lambda t: t * self.g
+
+ if not isinstance(out, tuple):
+ return rezero_fn(out)
+
+ return (rezero_fn(out[0]), *out[1:])
+
+
+class ScaleNorm(nn.Module):
+ def __init__(self, dim, eps=1e-5):
+ super().__init__()
+ self.scale = dim**-0.5
+ self.eps = eps
+ self.g = nn.Parameter(torch.ones(1))
+
+ def forward(self, x):
+ norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+ return x / norm.clamp(min=self.eps) * self.g
+
+
+class RMSNorm(nn.Module):
+ def __init__(self, dim, eps=1e-8):
+ super().__init__()
+ self.scale = dim**-0.5
+ self.eps = eps
+ self.g = nn.Parameter(torch.ones(dim))
+
+ def forward(self, x):
+ norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+ return x / norm.clamp(min=self.eps) * self.g
+
+
+class RMSScaleShiftNorm(nn.Module):
+ def __init__(self, dim, eps=1e-8):
+ super().__init__()
+ self.scale = dim**-0.5
+ self.eps = eps
+ self.g = nn.Parameter(torch.ones(dim))
+ self.scale_shift_process = nn.Linear(dim * 2, dim * 2)
+
+ def forward(self, x, norm_scale_shift_inp):
+ norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+ norm = x / norm.clamp(min=self.eps) * self.g
+
+ ss_emb = self.scale_shift_process(norm_scale_shift_inp)
+ scale, shift = torch.chunk(ss_emb, 2, dim=1)
+ h = norm * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+ return h
+
+
+# residual and residual gates
+
+
+class Residual(nn.Module):
+ def __init__(self, dim, scale_residual=False):
+ super().__init__()
+ self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None
+
+ def forward(self, x, residual):
+ if exists(self.residual_scale):
+ residual = residual * self.residual_scale
+
+ return x + residual
+
+
+class GRUGating(nn.Module):
+ def __init__(self, dim, scale_residual=False):
+ super().__init__()
+ self.gru = nn.GRUCell(dim, dim)
+ self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None
+
+ def forward(self, x, residual):
+ if exists(self.residual_scale):
+ residual = residual * self.residual_scale
+
+ gated_output = self.gru(rearrange(x, "b n d -> (b n) d"), rearrange(residual, "b n d -> (b n) d"))
+
+ return gated_output.reshape_as(x)
+
+
+# token shifting
+
+
+def shift(t, amount, mask=None):
+ if amount == 0:
+ return t
+
+ if exists(mask):
+ t = t.masked_fill(~mask[..., None], 0.0)
+
+ return F.pad(t, (0, 0, amount, -amount), value=0.0)
+
+
+class ShiftTokens(nn.Module):
+ def __init__(self, shifts, fn):
+ super().__init__()
+ self.fn = fn
+ self.shifts = tuple(shifts)
+
+ def forward(self, x, **kwargs):
+ mask = kwargs.get("mask", None)
+ shifts = self.shifts
+ segments = len(shifts)
+ feats_per_shift = x.shape[-1] // segments
+ splitted = x.split(feats_per_shift, dim=-1)
+ segments_to_shift, rest = splitted[:segments], splitted[segments:]
+ segments_to_shift = list(map(lambda args: shift(*args, mask=mask), zip(segments_to_shift, shifts)))
+ x = torch.cat((*segments_to_shift, *rest), dim=-1)
+ return self.fn(x, **kwargs)
+
+
+# feedforward
+
+
+class GLU(nn.Module):
+ def __init__(self, dim_in, dim_out, activation):
+ super().__init__()
+ self.act = activation
+ self.proj = nn.Linear(dim_in, dim_out * 2)
+
+ def forward(self, x):
+ x, gate = self.proj(x).chunk(2, dim=-1)
+ return x * self.act(gate)
+
+
+class FeedForward(nn.Module):
+ def __init__(
+ self,
+ dim,
+ dim_out=None,
+ mult=4,
+ glu=False,
+ relu_squared=False,
+ post_act_ln=False,
+ dropout=0.0,
+ zero_init_output=False,
+ ):
+ super().__init__()
+ inner_dim = int(dim * mult)
+ dim_out = default(dim_out, dim)
+ activation = ReluSquared() if relu_squared else nn.GELU()
+
+ project_in = (
+ nn.Sequential(nn.Linear(dim, inner_dim), activation) if not glu else GLU(dim, inner_dim, activation)
+ )
+
+ self.net = nn.Sequential(
+ project_in,
+ nn.LayerNorm(inner_dim) if post_act_ln else nn.Identity(),
+ nn.Dropout(dropout),
+ nn.Linear(inner_dim, dim_out),
+ )
+
+ # init last linear layer to 0
+ if zero_init_output:
+ init_zero_(self.net[-1])
+
+ def forward(self, x):
+ return self.net(x)
+
+
+# attention.
+
+
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim,
+ dim_head=DEFAULT_DIM_HEAD,
+ heads=8,
+ causal=False,
+ talking_heads=False,
+ head_scale=False,
+ collab_heads=False,
+ collab_compression=0.3,
+ sparse_topk=None,
+ use_entmax15=False,
+ num_mem_kv=0,
+ dropout=0.0,
+ on_attn=False,
+ gate_values=False,
+ zero_init_output=False,
+ max_attend_past=None,
+ qk_norm=False,
+ scale_init_value=None,
+ rel_pos_bias=False,
+ rel_pos_num_buckets=32,
+ rel_pos_max_distance=128,
+ ):
+ super().__init__()
+ self.scale = dim_head**-0.5
+
+ self.heads = heads
+ self.causal = causal
+ self.max_attend_past = max_attend_past
+
+ qk_dim = v_dim = dim_head * heads
+
+ # collaborative heads
+ self.collab_heads = collab_heads
+ if self.collab_heads:
+ qk_dim = int(collab_compression * qk_dim)
+ self.collab_mixing = nn.Parameter(torch.randn(heads, qk_dim))
+
+ self.to_q = nn.Linear(dim, qk_dim, bias=False)
+ self.to_k = nn.Linear(dim, qk_dim, bias=False)
+ self.to_v = nn.Linear(dim, v_dim, bias=False)
+
+ self.dropout = nn.Dropout(dropout)
+
+ # add GLU gating for aggregated values, from alphafold2
+ self.to_v_gate = None
+ if gate_values:
+ self.to_v_gate = nn.Linear(dim, v_dim)
+ nn.init.constant_(self.to_v_gate.weight, 0)
+ nn.init.constant_(self.to_v_gate.bias, 1)
+
+ # cosine sim attention
+ self.qk_norm = qk_norm
+ if qk_norm:
+ scale_init_value = default(
+ scale_init_value, -3
+ ) # if not provided, initialize as though it were sequence length of 1024
+ self.scale = nn.Parameter(torch.ones(1, heads, 1, 1) * scale_init_value)
+
+ # talking heads
+ self.talking_heads = talking_heads
+ if talking_heads:
+ self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads))
+ self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads))
+
+ # head scaling
+ self.head_scale = head_scale
+ if head_scale:
+ self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1))
+
+ # explicit topk sparse attention
+ self.sparse_topk = sparse_topk
+
+ # entmax
+ self.attn_fn = F.softmax
+
+ # add memory key / values
+ self.num_mem_kv = num_mem_kv
+ if num_mem_kv > 0:
+ self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+ self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+
+ # attention on attention
+ self.attn_on_attn = on_attn
+ self.to_out = nn.Sequential(nn.Linear(v_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(v_dim, dim)
+
+ self.rel_pos_bias = rel_pos_bias
+ if rel_pos_bias:
+ assert (
+ rel_pos_num_buckets <= rel_pos_max_distance
+ ), "number of relative position buckets must be less than the relative position max distance"
+ self.rel_pos = RelativePositionBias(
+ scale=dim_head**0.5,
+ causal=causal,
+ heads=heads,
+ num_buckets=rel_pos_num_buckets,
+ max_distance=rel_pos_max_distance,
+ )
+
+ # init output projection 0
+ if zero_init_output:
+ init_zero_(self.to_out)
+
+ def forward(
+ self,
+ x,
+ context=None,
+ mask=None,
+ context_mask=None,
+ attn_mask=None,
+ sinusoidal_emb=None,
+ rotary_pos_emb=None,
+ prev_attn=None,
+ mem=None,
+ layer_past=None,
+ ):
+ b, n, _, h, talking_heads, collab_heads, head_scale, scale, device, has_context = (
+ *x.shape,
+ self.heads,
+ self.talking_heads,
+ self.collab_heads,
+ self.head_scale,
+ self.scale,
+ x.device,
+ exists(context),
+ )
+ kv_input = default(context, x)
+
+ q_input = x
+ k_input = kv_input
+ v_input = kv_input
+
+ if exists(mem):
+ k_input = torch.cat((mem, k_input), dim=-2)
+ v_input = torch.cat((mem, v_input), dim=-2)
+
+ if exists(sinusoidal_emb):
+ # in shortformer, the query would start at a position offset depending on the past cached memory
+ offset = k_input.shape[-2] - q_input.shape[-2]
+ q_input = q_input + sinusoidal_emb(q_input, offset=offset)
+ k_input = k_input + sinusoidal_emb(k_input)
+
+ q = self.to_q(q_input)
+ k = self.to_k(k_input)
+ v = self.to_v(v_input)
+
+ if not collab_heads:
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
+ else:
+ q = einsum("b i d, h d -> b h i d", q, self.collab_mixing)
+ k = rearrange(k, "b n d -> b () n d")
+ v = rearrange(v, "b n (h d) -> b h n d", h=h)
+
+ if layer_past is not None:
+ past_key, past_value = layer_past
+ k = torch.cat([past_key, k], dim=-2)
+ v = torch.cat([past_value, v], dim=-2)
+ k_cache = k
+ v_cache = v
+
+ if exists(rotary_pos_emb) and not has_context:
+ l = rotary_pos_emb.shape[-1]
+ (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v))
+ ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl))
+ q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, qr), (kl, kr), (vl, vr)))
+
+ input_mask = None
+ if any(map(exists, (mask, context_mask))):
+ q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
+ k_mask = q_mask if not exists(context) else context_mask
+ k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
+ q_mask = rearrange(q_mask, "b i -> b () i ()")
+ k_mask = rearrange(k_mask, "b j -> b () () j")
+ input_mask = q_mask * k_mask
+
+ if self.num_mem_kv > 0:
+ mem_k, mem_v = map(lambda t: repeat(t, "h n d -> b h n d", b=b), (self.mem_k, self.mem_v))
+ k = torch.cat((mem_k, k), dim=-2)
+ v = torch.cat((mem_v, v), dim=-2)
+ if exists(input_mask):
+ input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
+
+ if collab_heads:
+ k = k.expand(-1, h, -1, -1)
+
+ if self.qk_norm:
+ q, k = map(l2norm, (q, k))
+ scale = 1 / (self.scale.exp().clamp(min=1e-2))
+
+ dots = einsum("b h i d, b h j d -> b h i j", q, k) * scale
+ mask_value = max_neg_value(dots)
+
+ if exists(prev_attn):
+ dots = dots + prev_attn
+
+ pre_softmax_attn = dots.clone()
+
+ if talking_heads:
+ dots = einsum("b h i j, h k -> b k i j", dots, self.pre_softmax_proj).contiguous()
+
+ if self.rel_pos_bias:
+ dots = self.rel_pos(dots)
+
+ if exists(input_mask):
+ dots.masked_fill_(~input_mask, mask_value)
+ del input_mask
+
+ if exists(attn_mask):
+ assert (
+ 2 <= attn_mask.ndim <= 4
+ ), "attention mask must have greater than 2 dimensions but less than or equal to 4"
+ if attn_mask.ndim == 2:
+ attn_mask = rearrange(attn_mask, "i j -> () () i j")
+ elif attn_mask.ndim == 3:
+ attn_mask = rearrange(attn_mask, "h i j -> () h i j")
+ dots.masked_fill_(~attn_mask, mask_value)
+
+ if exists(self.max_attend_past):
+ i, j = dots.shape[-2:]
+ range_q = torch.arange(j - i, j, device=device)
+ range_k = torch.arange(j, device=device)
+ dist = rearrange(range_q, "i -> () () i ()") - rearrange(range_k, "j -> () () () j")
+ mask = dist > self.max_attend_past
+ dots.masked_fill_(mask, mask_value)
+ del mask
+
+ if self.causal:
+ i, j = dots.shape[-2:]
+ r = torch.arange(i, device=device)
+ mask = rearrange(r, "i -> () () i ()") < rearrange(r, "j -> () () () j")
+ mask = F.pad(mask, (j - i, 0), value=False)
+ dots.masked_fill_(mask, mask_value)
+ del mask
+
+ if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]:
+ top, _ = dots.topk(self.sparse_topk, dim=-1)
+ vk = top[..., -1].unsqueeze(-1).expand_as(dots)
+ mask = dots < vk
+ dots.masked_fill_(mask, mask_value)
+ del mask
+
+ attn = self.attn_fn(dots, dim=-1)
+ post_softmax_attn = attn.clone()
+
+ attn = self.dropout(attn)
+
+ if talking_heads:
+ attn = einsum("b h i j, h k -> b k i j", attn, self.post_softmax_proj).contiguous()
+
+ out = einsum("b h i j, b h j d -> b h i d", attn, v)
+
+ if head_scale:
+ out = out * self.head_scale_params
+
+ out = rearrange(out, "b h n d -> b n (h d)")
+
+ if exists(self.to_v_gate):
+ gates = self.to_v_gate(x)
+ out = out * gates.sigmoid()
+
+ intermediates = Intermediates(pre_softmax_attn=pre_softmax_attn, post_softmax_attn=post_softmax_attn)
+
+ return self.to_out(out), intermediates, k_cache, v_cache
+
+
+class AttentionLayers(nn.Module):
+ def __init__(
+ self,
+ dim,
+ depth,
+ heads=8,
+ causal=False,
+ cross_attend=False,
+ only_cross=False,
+ use_scalenorm=False,
+ use_rms_scaleshift_norm=False,
+ use_rmsnorm=False,
+ use_rezero=False,
+ alibi_pos_bias=False,
+ alibi_num_heads=None,
+ alibi_learned=False,
+ position_infused_attn=False,
+ rotary_pos_emb=False,
+ rotary_emb_dim=None,
+ custom_layers=None,
+ sandwich_coef=None,
+ par_ratio=None,
+ residual_attn=False,
+ cross_residual_attn=False,
+ macaron=False,
+ pre_norm=True,
+ gate_residual=False,
+ scale_residual=False,
+ shift_tokens=0,
+ sandwich_norm=False,
+ use_qk_norm_attn=False,
+ qk_norm_attn_seq_len=None,
+ zero_init_branch_output=False,
+ **kwargs,
+ ):
+ super().__init__()
+ ff_kwargs, kwargs = groupby_prefix_and_trim("ff_", kwargs)
+ attn_kwargs, _ = groupby_prefix_and_trim("attn_", kwargs)
+
+ dim_head = attn_kwargs.get("dim_head", DEFAULT_DIM_HEAD)
+
+ self.dim = dim
+ self.depth = depth
+ self.layers = nn.ModuleList([])
+ self.causal = causal
+
+ rel_pos_bias = "rel_pos_bias" in attn_kwargs
+ self.has_pos_emb = position_infused_attn or rel_pos_bias or rotary_pos_emb
+ self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
+
+ rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32)
+ self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None
+
+ assert not (
+ alibi_pos_bias and rel_pos_bias
+ ), "you can only choose Alibi positional bias or T5 relative positional bias, not both"
+
+ if alibi_pos_bias:
+ alibi_num_heads = default(alibi_num_heads, heads)
+ assert alibi_num_heads <= heads, "number of ALiBi heads must be less than the total number of heads"
+ alibi_pos_klass = LearnedAlibiPositionalBias if alibi_learned or not causal else AlibiPositionalBias
+ self.rel_pos = alibi_pos_klass(heads=alibi_num_heads, bidirectional=not causal)
+ else:
+ self.rel_pos = None
+
+ assert not (not pre_norm and sandwich_norm), "sandwich norm cannot be used when not using prenorm"
+ self.pre_norm = pre_norm
+ self.sandwich_norm = sandwich_norm
+
+ self.residual_attn = residual_attn
+ self.cross_residual_attn = cross_residual_attn
+ self.cross_attend = cross_attend
+
+ norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm
+ norm_class = RMSNorm if use_rmsnorm else norm_class
+ norm_class = RMSScaleShiftNorm if use_rms_scaleshift_norm else norm_class
+ norm_fn = partial(norm_class, dim)
+
+ norm_fn = nn.Identity if use_rezero else norm_fn
+ branch_fn = Rezero if use_rezero else None
+
+ if cross_attend and not only_cross:
+ default_block = ("a", "c", "f")
+ elif cross_attend and only_cross:
+ default_block = ("c", "f")
+ else:
+ default_block = ("a", "f")
+
+ if macaron:
+ default_block = ("f",) + default_block
+
+ # qk normalization
+
+ if use_qk_norm_attn:
+ attn_scale_init_value = (
+ -math.log(math.log2(qk_norm_attn_seq_len**2 - qk_norm_attn_seq_len))
+ if exists(qk_norm_attn_seq_len)
+ else None
+ )
+ attn_kwargs = {**attn_kwargs, "qk_norm": True, "scale_init_value": attn_scale_init_value}
+
+ # zero init
+
+ if zero_init_branch_output:
+ attn_kwargs = {**attn_kwargs, "zero_init_output": True}
+ ff_kwargs = {**ff_kwargs, "zero_init_output": True}
+
+ # calculate layer block order
+
+ if exists(custom_layers):
+ layer_types = custom_layers
+ elif exists(par_ratio):
+ par_depth = depth * len(default_block)
+ assert 1 < par_ratio <= par_depth, "par ratio out of range"
+ default_block = tuple(filter(not_equals("f"), default_block))
+ par_attn = par_depth // par_ratio
+ depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper
+ par_width = (depth_cut + depth_cut // par_attn) // par_attn
+ assert len(default_block) <= par_width, "default block is too large for par_ratio"
+ par_block = default_block + ("f",) * (par_width - len(default_block))
+ par_head = par_block * par_attn
+ layer_types = par_head + ("f",) * (par_depth - len(par_head))
+ elif exists(sandwich_coef):
+            assert sandwich_coef > 0 and sandwich_coef <= depth, "sandwich coefficient should be positive and at most the depth"
+ layer_types = ("a",) * sandwich_coef + default_block * (depth - sandwich_coef) + ("f",) * sandwich_coef
+ else:
+ layer_types = default_block * depth
+
+ self.layer_types = layer_types
+ self.num_attn_layers = len(list(filter(equals("a"), layer_types)))
+
+ # calculate token shifting
+
+ shift_tokens = cast_tuple(shift_tokens, len(layer_types))
+
+ # iterate and construct layers
+
+ for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)):
+ is_last_layer = ind == (len(self.layer_types) - 1)
+
+ if layer_type == "a":
+ layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
+ elif layer_type == "c":
+ layer = Attention(dim, heads=heads, **attn_kwargs)
+ elif layer_type == "f":
+ layer = FeedForward(dim, **ff_kwargs)
+ layer = layer if not macaron else Scale(0.5, layer)
+ else:
+ raise Exception(f"invalid layer type {layer_type}")
+
+ if layer_shift_tokens > 0:
+ shift_range_upper = layer_shift_tokens + 1
+ shift_range_lower = -layer_shift_tokens if not causal else 0
+ layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer)
+
+ if exists(branch_fn):
+ layer = branch_fn(layer)
+
+ residual_fn = GRUGating if gate_residual else Residual
+ residual = residual_fn(dim, scale_residual=scale_residual)
+
+ layer_uses_qk_norm = use_qk_norm_attn and layer_type in ("a", "c")
+
+ pre_branch_norm = norm_fn() if pre_norm and not layer_uses_qk_norm else None
+ post_branch_norm = norm_fn() if sandwich_norm or layer_uses_qk_norm else None
+ post_main_norm = norm_fn() if not pre_norm and not is_last_layer else None
+
+ norms = nn.ModuleList([pre_branch_norm, post_branch_norm, post_main_norm])
+
+ self.layers.append(nn.ModuleList([norms, layer, residual]))
+
+ def forward(
+ self,
+ x,
+ context=None,
+ full_context=None, # for passing a list of hidden states from an encoder
+ mask=None,
+ context_mask=None,
+ attn_mask=None,
+ mems=None,
+ return_hiddens=False,
+ norm_scale_shift_inp=None,
+ past_key_values=None,
+ expected_seq_len=None,
+ ):
+ assert not (
+ self.cross_attend ^ (exists(context) or exists(full_context))
+ ), "context must be passed in if cross_attend is set to True"
+ assert context is None or full_context is None, "only one of full_context or context can be provided"
+
+ hiddens = []
+ intermediates = []
+ prev_attn = None
+ prev_cross_attn = None
+
+ mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
+ norm_args = {}
+ if exists(norm_scale_shift_inp):
+ norm_args["norm_scale_shift_inp"] = norm_scale_shift_inp
+
+ rotary_pos_emb = None
+ if exists(self.rotary_pos_emb):
+ if not self.training and self.causal:
+ assert (
+ expected_seq_len is not None
+ ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`"
+ elif expected_seq_len is None:
+ expected_seq_len = 0
+ seq_len = x.shape[1]
+ if past_key_values is not None:
+ seq_len += past_key_values[0][0].shape[-2]
+ max_rotary_emb_length = max(
+ list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len]
+ )
+ rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device)
+
+ present_key_values = []
+ cross_attn_count = 0
+ for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
+ if layer_type == "a":
+ layer_mem = mems.pop(0) if mems else None
+
+ residual = x
+
+ pre_branch_norm, post_branch_norm, post_main_norm = norm
+
+ if exists(pre_branch_norm):
+ x = pre_branch_norm(x, **norm_args)
+
+ if layer_type == "a" or layer_type == "c":
+ if past_key_values is not None:
+ layer_kv = past_key_values.pop(0)
+ layer_past = tuple(s.to(x.device) for s in layer_kv)
+ else:
+ layer_past = None
+
+ if layer_type == "a":
+ out, inter, k, v = block(
+ x, None, mask, None, attn_mask, self.pia_pos_emb, rotary_pos_emb, prev_attn, layer_mem, layer_past
+ )
+ elif layer_type == "c":
+ if exists(full_context):
+ out, inter, k, v = block(
+ x,
+ full_context[cross_attn_count],
+ mask,
+ context_mask,
+ None,
+ None,
+ None,
+ prev_attn,
+ None,
+ layer_past,
+ )
+ else:
+ out, inter, k, v = block(
+ x, context, mask, context_mask, None, None, None, prev_attn, None, layer_past
+ )
+ elif layer_type == "f":
+ out = block(x)
+
+ if layer_type in ("a", "c") and present_key_values is not None:
+ present_key_values.append((k.detach(), v.detach()))
+
+ if exists(post_branch_norm):
+ out = post_branch_norm(out, **norm_args)
+
+ x = residual_fn(out, residual)
+
+ if layer_type in ("a", "c"):
+ intermediates.append(inter)
+
+ if layer_type == "a" and self.residual_attn:
+ prev_attn = inter.pre_softmax_attn
+ elif layer_type == "c" and self.cross_residual_attn:
+ prev_cross_attn = inter.pre_softmax_attn
+
+ if exists(post_main_norm):
+ x = post_main_norm(x, **norm_args)
+
+ if layer_type == "c":
+ cross_attn_count += 1
+
+ if layer_type == "f":
+ hiddens.append(x)
+
+ if return_hiddens:
+ intermediates = LayerIntermediates(
+ hiddens=hiddens, attn_intermediates=intermediates, past_key_values=present_key_values
+ )
+
+ return x, intermediates
+
+ return x
+
+
+class Encoder(AttentionLayers):
+ def __init__(self, **kwargs):
+ assert "causal" not in kwargs, "cannot set causality on encoder"
+ super().__init__(causal=False, **kwargs)
+
+
+class Decoder(AttentionLayers):
+ def __init__(self, **kwargs):
+ assert "causal" not in kwargs, "cannot set causality on decoder"
+ super().__init__(causal=True, **kwargs)
+
+
+class CrossAttender(AttentionLayers):
+ def __init__(self, **kwargs):
+ super().__init__(cross_attend=True, only_cross=True, **kwargs)
+
+
+class ViTransformerWrapper(nn.Module):
+ def __init__(self, *, image_size, patch_size, attn_layers, num_classes=None, dropout=0.0, emb_dropout=0.0):
+ super().__init__()
+ assert isinstance(attn_layers, Encoder), "attention layers must be an Encoder"
+ assert image_size % patch_size == 0, "image dimensions must be divisible by the patch size"
+ dim = attn_layers.dim
+ num_patches = (image_size // patch_size) ** 2
+ patch_dim = 3 * patch_size**2
+
+ self.patch_size = patch_size
+
+ self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
+ self.patch_to_embedding = nn.Linear(patch_dim, dim)
+ self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
+ self.dropout = nn.Dropout(emb_dropout)
+
+ self.attn_layers = attn_layers
+ self.norm = nn.LayerNorm(dim)
+ self.mlp_head = FeedForward(dim, dim_out=num_classes, dropout=dropout) if exists(num_classes) else None
+
+ def forward(self, img, return_embeddings=False):
+ p = self.patch_size
+
+ x = rearrange(img, "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=p, p2=p)
+ x = self.patch_to_embedding(x)
+ b, n, _ = x.shape
+
+ cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)
+ x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embedding[:, : (n + 1)]
+ x = self.dropout(x)
+
+ x = self.attn_layers(x)
+ x = self.norm(x)
+
+ if not exists(self.mlp_head) or return_embeddings:
+ return x
+
+ return self.mlp_head(x[:, 0])
+
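+# Usage sketch for ViTransformerWrapper above (illustrative only; the image size, patch
+# size and encoder hyperparameters are assumptions, not taken from this repository's configs):
+#
+#     >>> vit = ViTransformerWrapper(
+#     ...     image_size=256,
+#     ...     patch_size=32,
+#     ...     attn_layers=Encoder(dim=512, depth=6, heads=8),
+#     ...     num_classes=1000,
+#     ... )
+#     >>> logits = vit(torch.randn(1, 3, 256, 256))  # (1, 1000) class logits from the CLS token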
+
+class TransformerWrapper(nn.Module):
+ def __init__(
+ self,
+ *,
+ num_tokens,
+ max_seq_len,
+ attn_layers,
+ emb_dim=None,
+ max_mem_len=0.0,
+ shift_mem_down=0,
+ emb_dropout=0.0,
+ num_memory_tokens=None,
+ tie_embedding=False,
+ use_pos_emb=True,
+ ):
+ super().__init__()
+ assert isinstance(attn_layers, AttentionLayers), "attention layers must be one of Encoder or Decoder"
+
+ dim = attn_layers.dim
+ emb_dim = default(emb_dim, dim)
+
+ self.max_seq_len = max_seq_len
+ self.max_mem_len = max_mem_len
+ self.shift_mem_down = shift_mem_down
+
+ self.token_emb = nn.Embedding(num_tokens, emb_dim)
+ self.pos_emb = (
+ AbsolutePositionalEmbedding(emb_dim, max_seq_len)
+ if (use_pos_emb and not attn_layers.has_pos_emb)
+ else always(0)
+ )
+ self.emb_dropout = nn.Dropout(emb_dropout)
+
+ self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+ self.attn_layers = attn_layers
+ self.norm = nn.LayerNorm(dim)
+
+ self.init_()
+
+ self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
+
+ # memory tokens (like [cls]) from Memory Transformers paper
+ num_memory_tokens = default(num_memory_tokens, 0)
+ self.num_memory_tokens = num_memory_tokens
+ if num_memory_tokens > 0:
+ self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
+
+ def init_(self):
+ nn.init.kaiming_normal_(self.token_emb.weight)
+
+ def forward(
+ self,
+ x,
+ return_embeddings=False,
+ mask=None,
+ return_hiddens=False,
+ return_attn=False,
+ mems=None,
+ use_cache=False,
+ **kwargs,
+ ):
+ b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
+ x = self.token_emb(x)
+ x = x + self.pos_emb(x)
+ x = self.emb_dropout(x)
+
+ x = self.project_emb(x)
+
+ if num_mem > 0:
+ mem = repeat(self.memory_tokens, "n d -> b n d", b=b)
+ x = torch.cat((mem, x), dim=1)
+
+ # auto-handle masking after appending memory tokens
+ if exists(mask):
+ mask = F.pad(mask, (num_mem, 0), value=True)
+
+ if self.shift_mem_down and exists(mems):
+ mems_l, mems_r = mems[: self.shift_mem_down], mems[self.shift_mem_down :]
+ mems = [*mems_r, *mems_l]
+
+ x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
+ x = self.norm(x)
+
+ mem, x = x[:, :num_mem], x[:, num_mem:]
+
+ out = self.to_logits(x) if not return_embeddings else x
+
+ if return_hiddens:
+ hiddens = intermediates.hiddens
+ return out, hiddens
+
+ res = [out]
+ if return_attn:
+ attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
+ res.append(attn_maps)
+ if use_cache:
+ res.append(intermediates.past_key_values)
+
+ if len(res) > 1:
+ return tuple(res)
+ return res[0]
+
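+# Usage sketch for TransformerWrapper above (illustrative only; the vocabulary size and
+# decoder dimensions are arbitrary assumptions):
+#
+#     >>> model = TransformerWrapper(
+#     ...     num_tokens=20000,
+#     ...     max_seq_len=1024,
+#     ...     attn_layers=Decoder(dim=512, depth=12, heads=8),
+#     ... )
+#     >>> tokens = torch.randint(0, 20000, (1, 128))
+#     >>> logits = model(tokens)                          # (1, 128, 20000)
+#     >>> logits, cache = model(tokens, use_cache=True)   # also returns the per-layer key/value cache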
+
+class ContinuousTransformerWrapper(nn.Module):
+ def __init__(
+ self, *, max_seq_len, attn_layers, dim_in=None, dim_out=None, emb_dim=None, emb_dropout=0.0, use_pos_emb=True
+ ):
+ super().__init__()
+ assert isinstance(attn_layers, AttentionLayers), "attention layers must be one of Encoder or Decoder"
+
+ dim = attn_layers.dim
+
+ self.max_seq_len = max_seq_len
+
+ self.pos_emb = (
+ AbsolutePositionalEmbedding(dim, max_seq_len)
+ if (use_pos_emb and not attn_layers.has_pos_emb)
+ else always(0)
+ )
+ self.emb_dropout = nn.Dropout(emb_dropout)
+
+ self.project_in = nn.Linear(dim_in, dim) if exists(dim_in) else nn.Identity()
+
+ self.attn_layers = attn_layers
+ self.norm = nn.LayerNorm(dim)
+
+ self.project_out = nn.Linear(dim, dim_out) if exists(dim_out) else nn.Identity()
+
+ def forward(self, x, return_embeddings=False, mask=None, return_attn=False, mems=None, use_cache=False, **kwargs):
+ b, n, _, device = *x.shape, x.device
+
+ x = self.project_in(x)
+ x = x + self.pos_emb(x)
+ x = self.emb_dropout(x)
+
+ x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
+ x = self.norm(x)
+
+ out = self.project_out(x) if not return_embeddings else x
+
+ res = [out]
+ if return_attn:
+ attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
+ res.append(attn_maps)
+ if use_cache:
+ res.append(intermediates.past_key_values)
+
+ if len(res) > 1:
+ return tuple(res)
+ return res[0]
diff --git a/submodules/TTS/TTS/tts/layers/vits/discriminator.py b/submodules/TTS/TTS/tts/layers/vits/discriminator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c27d11bef632d02169aba7db9a9b38eba32c76a5
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/vits/discriminator.py
@@ -0,0 +1,89 @@
+import torch
+from torch import nn
+from torch.nn.modules.conv import Conv1d
+
+from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP, MultiPeriodDiscriminator
+
+
+class DiscriminatorS(torch.nn.Module):
+ """HiFiGAN Scale Discriminator. Channel sizes are different from the original HiFiGAN.
+
+ Args:
+ use_spectral_norm (bool): if `True`, switch to spectral norm instead of weight norm.
+ """
+
+ def __init__(self, use_spectral_norm=False):
+ super().__init__()
+ norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm
+ self.convs = nn.ModuleList(
+ [
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+ ]
+ )
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): input waveform.
+
+ Returns:
+ Tensor: discriminator scores.
+ List[Tensor]: list of features from the convolutional layers.
+ """
+ feat = []
+ for l in self.convs:
+ x = l(x)
+ x = torch.nn.functional.leaky_relu(x, 0.1)
+ feat.append(x)
+ x = self.conv_post(x)
+ feat.append(x)
+ x = torch.flatten(x, 1, -1)
+ return x, feat
+
+
+class VitsDiscriminator(nn.Module):
+ """VITS discriminator wrapping one Scale Discriminator and a stack of Period Discriminator.
+
+ ::
+ waveform -> ScaleDiscriminator() -> scores_sd, feats_sd --> append() -> scores, feats
+ |--> MultiPeriodDiscriminator() -> scores_mpd, feats_mpd ^
+
+ Args:
+ use_spectral_norm (bool): if `True`, switch to spectral norm instead of weight norm.
+ """
+
+ def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
+ super().__init__()
+ self.nets = nn.ModuleList()
+ self.nets.append(DiscriminatorS(use_spectral_norm=use_spectral_norm))
+ self.nets.extend([DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods])
+
+ def forward(self, x, x_hat=None):
+ """
+ Args:
+ x (Tensor): ground truth waveform.
+ x_hat (Tensor): predicted waveform.
+
+ Returns:
+ List[Tensor]: discriminator scores.
+ List[List[Tensor]]: list of list of features from each layers of each discriminator.
+ """
+ x_scores = []
+ x_hat_scores = [] if x_hat is not None else None
+ x_feats = []
+ x_hat_feats = [] if x_hat is not None else None
+ for net in self.nets:
+ x_score, x_feat = net(x)
+ x_scores.append(x_score)
+ x_feats.append(x_feat)
+ if x_hat is not None:
+ x_hat_score, x_hat_feat = net(x_hat)
+ x_hat_scores.append(x_hat_score)
+ x_hat_feats.append(x_hat_feat)
+ return x_scores, x_feats, x_hat_scores, x_hat_feats
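+
+
+# Usage sketch for VitsDiscriminator above (illustrative only; batch size and waveform
+# length are arbitrary assumptions):
+#
+#     >>> disc = VitsDiscriminator()
+#     >>> real = torch.randn(2, 1, 8192)
+#     >>> fake = torch.randn(2, 1, 8192)
+#     >>> scores, feats, scores_hat, feats_hat = disc(real, fake)
+#     >>> len(scores)  # 1 scale discriminator + 5 period discriminators
+#     6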
diff --git a/submodules/TTS/TTS/tts/layers/vits/networks.py b/submodules/TTS/TTS/tts/layers/vits/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..f97b584fe6ed311127a8c01a089b159946219cb2
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/vits/networks.py
@@ -0,0 +1,288 @@
+import math
+
+import torch
+from torch import nn
+
+from TTS.tts.layers.glow_tts.glow import WN
+from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
+from TTS.tts.utils.helpers import sequence_mask
+
+LRELU_SLOPE = 0.1
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size * dilation - dilation) / 2)
+
+
+class TextEncoder(nn.Module):
+ def __init__(
+ self,
+ n_vocab: int,
+ out_channels: int,
+ hidden_channels: int,
+ hidden_channels_ffn: int,
+ num_heads: int,
+ num_layers: int,
+ kernel_size: int,
+ dropout_p: float,
+ language_emb_dim: int = None,
+ ):
+ """Text Encoder for VITS model.
+
+ Args:
+ n_vocab (int): Number of characters for the embedding layer.
+ out_channels (int): Number of channels for the output.
+ hidden_channels (int): Number of channels for the hidden layers.
+ hidden_channels_ffn (int): Number of channels for the convolutional layers.
+ num_heads (int): Number of attention heads for the Transformer layers.
+ num_layers (int): Number of Transformer layers.
+ kernel_size (int): Kernel size for the FFN layers in Transformer network.
+ dropout_p (float): Dropout rate for the Transformer layers.
+ """
+ super().__init__()
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+
+ self.emb = nn.Embedding(n_vocab, hidden_channels)
+
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+
+ if language_emb_dim:
+ hidden_channels += language_emb_dim
+
+ self.encoder = RelativePositionTransformer(
+ in_channels=hidden_channels,
+ out_channels=hidden_channels,
+ hidden_channels=hidden_channels,
+ hidden_channels_ffn=hidden_channels_ffn,
+ num_heads=num_heads,
+ num_layers=num_layers,
+ kernel_size=kernel_size,
+ dropout_p=dropout_p,
+ layer_norm_type="2",
+ rel_attn_window_size=4,
+ )
+
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, lang_emb=None):
+ """
+ Shapes:
+ - x: :math:`[B, T]`
+ - x_lengths: :math:`[B]`
+ """
+ assert x.shape[0] == x_lengths.shape[0]
+ x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
+
+ # concat the language embedding to the character embeddings
+ if lang_emb is not None:
+ x = torch.cat((x, lang_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1)
+
+ x = torch.transpose(x, 1, -1) # [b, h, t]
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) # [b, 1, t]
+
+ x = self.encoder(x * x_mask, x_mask)
+ stats = self.proj(x) * x_mask
+
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return x, m, logs, x_mask
+
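+# Usage sketch for TextEncoder above (illustrative only; the hyperparameters are
+# assumptions, not taken from this repository's configs):
+#
+#     >>> enc = TextEncoder(
+#     ...     n_vocab=100, out_channels=192, hidden_channels=192, hidden_channels_ffn=768,
+#     ...     num_heads=2, num_layers=6, kernel_size=3, dropout_p=0.1,
+#     ... )
+#     >>> tokens = torch.randint(0, 100, (2, 50))
+#     >>> lengths = torch.tensor([50, 30])
+#     >>> o, m, logs, mask = enc(tokens, lengths)  # o: (2, 192, 50); m/logs: prior stats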
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(
+ self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_layers,
+ dropout_p=0,
+ cond_channels=0,
+ mean_only=False,
+ ):
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+ # input layer
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ # coupling layers
+ self.enc = WN(
+ hidden_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_layers,
+ dropout_p=dropout_p,
+ c_in_channels=cond_channels,
+ )
+ # output layer
+ # Initializing last layer to 0 makes the affine coupling layers
+ # do nothing at first. This helps with training stability
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ """
+ Note:
+ Set `reverse` to True for inference.
+
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ - g: :math:`[B, C, 1]`
+ """
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, log_scale = torch.split(stats, [self.half_channels] * 2, 1)
+ else:
+ m = stats
+ log_scale = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(log_scale) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(log_scale, [1, 2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-log_scale) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
+
+
+class ResidualCouplingBlocks(nn.Module):
+ def __init__(
+ self,
+ channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ dilation_rate: int,
+ num_layers: int,
+ num_flows=4,
+ cond_channels=0,
+ ):
+ """Redisual Coupling blocks for VITS flow layers.
+
+ Args:
+ channels (int): Number of input and output tensor channels.
+ hidden_channels (int): Number of hidden network channels.
+ kernel_size (int): Kernel size of the WaveNet layers.
+ dilation_rate (int): Dilation rate of the WaveNet layers.
+ num_layers (int): Number of the WaveNet layers.
+ num_flows (int, optional): Number of Residual Coupling blocks. Defaults to 4.
+ cond_channels (int, optional): Number of channels of the conditioning tensor. Defaults to 0.
+ """
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.num_layers = num_layers
+ self.num_flows = num_flows
+ self.cond_channels = cond_channels
+
+ self.flows = nn.ModuleList()
+ for _ in range(num_flows):
+ self.flows.append(
+ ResidualCouplingBlock(
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ num_layers,
+ cond_channels=cond_channels,
+ mean_only=True,
+ )
+ )
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ """
+ Note:
+ Set `reverse` to True for inference.
+
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ - g: :math:`[B, C, 1]`
+ """
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ x = torch.flip(x, [1])
+ else:
+ for flow in reversed(self.flows):
+ x = torch.flip(x, [1])
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
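+# Usage sketch for ResidualCouplingBlocks above (illustrative only; shapes and
+# hyperparameters are assumptions). The reverse pass inverts the forward pass:
+#
+#     >>> flow = ResidualCouplingBlocks(channels=192, hidden_channels=192, kernel_size=5,
+#     ...                               dilation_rate=1, num_layers=4)
+#     >>> z = torch.randn(1, 192, 40)
+#     >>> mask = torch.ones(1, 1, 40)
+#     >>> y = flow(z, mask)                    # training direction
+#     >>> z_rec = flow(y, mask, reverse=True)  # inference direction, recovers z up to numerical error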
+
+class PosteriorEncoder(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ dilation_rate: int,
+ num_layers: int,
+ cond_channels=0,
+ ):
+ """Posterior Encoder of VITS model.
+
+ ::
+ x -> conv1x1() -> WaveNet() (non-causal) -> conv1x1() -> split() -> [m, s] -> sample(m, s) -> z
+
+ Args:
+ in_channels (int): Number of input tensor channels.
+ out_channels (int): Number of output tensor channels.
+ hidden_channels (int): Number of hidden channels.
+ kernel_size (int): Kernel size of the WaveNet convolution layers.
+ dilation_rate (int): Dilation rate of the WaveNet layers.
+ num_layers (int): Number of the WaveNet layers.
+ cond_channels (int, optional): Number of conditioning tensor channels. Defaults to 0.
+ """
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.num_layers = num_layers
+ self.cond_channels = cond_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = WN(
+ hidden_channels, hidden_channels, kernel_size, dilation_rate, num_layers, c_in_channels=cond_channels
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_lengths: :math:`[B, 1]`
+ - g: :math:`[B, C, 1]`
+ """
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ mean, log_scale = torch.split(stats, self.out_channels, dim=1)
+ z = (mean + torch.randn_like(mean) * torch.exp(log_scale)) * x_mask
+ return z, mean, log_scale, x_mask
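+
+
+# Usage sketch for PosteriorEncoder above (illustrative only; the 513-bin linear
+# spectrogram input is an assumption, not taken from this repository's configs):
+#
+#     >>> post_enc = PosteriorEncoder(in_channels=513, out_channels=192, hidden_channels=192,
+#     ...                             kernel_size=5, dilation_rate=1, num_layers=16)
+#     >>> spec = torch.randn(1, 513, 100)
+#     >>> lengths = torch.tensor([100])
+#     >>> z, m, logs, mask = post_enc(spec, lengths)  # z: (1, 192, 100) sampled latent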
diff --git a/submodules/TTS/TTS/tts/layers/vits/stochastic_duration_predictor.py b/submodules/TTS/TTS/tts/layers/vits/stochastic_duration_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..98dbf0935ca0f6cd6e92fe6ecf063dde2ee4138f
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/vits/stochastic_duration_predictor.py
@@ -0,0 +1,294 @@
+import math
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from TTS.tts.layers.generic.normalization import LayerNorm2
+from TTS.tts.layers.vits.transforms import piecewise_rational_quadratic_transform
+
+
+class DilatedDepthSeparableConv(nn.Module):
+ def __init__(self, channels, kernel_size, num_layers, dropout_p=0.0):
+ """Dilated Depth-wise Separable Convolution module.
+
+ ::
+ x |-> DDSConv(x) -> LayerNorm(x) -> GeLU(x) -> Conv1x1(x) -> LayerNorm(x) -> GeLU(x) -> + -> o
+ |-------------------------------------------------------------------------------------^
+
+ Args:
+ channels (int): Number of input and output tensor channels.
+ kernel_size (int): Kernel size of the depth-wise convolution layers.
+ num_layers (int): Number of depth-wise separable convolution layers.
+ dropout_p (float, optional): Dropout rate. Defaults to 0.0.
+
+ The forward pass returns the network output masked by the input sequence mask.
+ """
+ super().__init__()
+ self.num_layers = num_layers
+
+ self.convs_sep = nn.ModuleList()
+ self.convs_1x1 = nn.ModuleList()
+ self.norms_1 = nn.ModuleList()
+ self.norms_2 = nn.ModuleList()
+ for i in range(num_layers):
+ dilation = kernel_size**i
+ padding = (kernel_size * dilation - dilation) // 2
+ self.convs_sep.append(
+ nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding)
+ )
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+ self.norms_1.append(LayerNorm2(channels))
+ self.norms_2.append(LayerNorm2(channels))
+ self.dropout = nn.Dropout(dropout_p)
+
+ def forward(self, x, x_mask, g=None):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ """
+ if g is not None:
+ x = x + g
+ for i in range(self.num_layers):
+ y = self.convs_sep[i](x * x_mask)
+ y = self.norms_1[i](y)
+ y = F.gelu(y)
+ y = self.convs_1x1[i](y)
+ y = self.norms_2[i](y)
+ y = F.gelu(y)
+ y = self.dropout(y)
+ x = x + y
+ return x * x_mask
+
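+# Usage sketch for DilatedDepthSeparableConv above (illustrative only; shapes are assumptions):
+#
+#     >>> conv = DilatedDepthSeparableConv(channels=192, kernel_size=3, num_layers=3)
+#     >>> x = torch.randn(1, 192, 40)
+#     >>> mask = torch.ones(1, 1, 40)
+#     >>> y = conv(x, mask)  # same shape as x, zeroed outside the mask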
+
+class ElementwiseAffine(nn.Module):
+ """Element-wise affine transform like no-population stats BatchNorm alternative.
+
+ Args:
+ channels (int): Number of input tensor channels.
+ """
+
+ def __init__(self, channels):
+ super().__init__()
+ self.translation = nn.Parameter(torch.zeros(channels, 1))
+ self.log_scale = nn.Parameter(torch.zeros(channels, 1))
+
+ def forward(self, x, x_mask, reverse=False, **kwargs): # pylint: disable=unused-argument
+ if not reverse:
+ y = (x * torch.exp(self.log_scale) + self.translation) * x_mask
+ logdet = torch.sum(self.log_scale * x_mask, [1, 2])
+ return y, logdet
+ x = (x - self.translation) * torch.exp(-self.log_scale) * x_mask
+ return x
+
+
+class ConvFlow(nn.Module):
+ """Dilated depth separable convolutional based spline flow.
+
+ Args:
+ in_channels (int): Number of input tensor channels.
+ hidden_channels (int): Number of in network channels.
+ kernel_size (int): Convolutional kernel size.
+ num_layers (int): Number of convolutional layers.
+ num_bins (int, optional): Number of spline bins. Defaults to 10.
+ tail_bound (float, optional): Tail bound for PRQT. Defaults to 5.0.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ num_layers: int,
+ num_bins=10,
+ tail_bound=5.0,
+ ):
+ super().__init__()
+ self.num_bins = num_bins
+ self.tail_bound = tail_bound
+ self.hidden_channels = hidden_channels
+ self.half_channels = in_channels // 2
+
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers, dropout_p=0.0)
+ self.proj = nn.Conv1d(hidden_channels, self.half_channels * (num_bins * 3 - 1), 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+ h = self.pre(x0)
+ h = self.convs(h, x_mask, g=g)
+ h = self.proj(h) * x_mask
+
+ b, c, t = x0.shape
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
+
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.hidden_channels)
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.hidden_channels)
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
+
+ x1, logabsdet = piecewise_rational_quadratic_transform(
+ x1,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=reverse,
+ tails="linear",
+ tail_bound=self.tail_bound,
+ )
+
+ x = torch.cat([x0, x1], 1) * x_mask
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
+ if not reverse:
+ return x, logdet
+ return x
+
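+# Usage sketch for ConvFlow above (illustrative only; shapes are assumptions). As used in
+# the duration predictor below, it operates on 2-channel inputs:
+#
+#     >>> flow = ConvFlow(in_channels=2, hidden_channels=192, kernel_size=3, num_layers=3)
+#     >>> x = torch.randn(1, 2, 40)
+#     >>> mask = torch.ones(1, 1, 40)
+#     >>> y, logdet = flow(x, mask)            # forward: transformed tensor and log|det J|
+#     >>> x_rec = flow(y, mask, reverse=True)  # inverse: only the recovered tensor is returned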
+
+class StochasticDurationPredictor(nn.Module):
+ """Stochastic duration predictor with Spline Flows.
+
+ It applies Variational Dequantization and Variational Data Augmentation.
+
+ Paper:
+ SDP: https://arxiv.org/pdf/2106.06103.pdf
+ Spline Flow: https://arxiv.org/abs/1906.04032
+
+ ::
+ ## Inference
+
+ x -> TextCondEncoder() -> Flow() -> dr_hat
+ noise ----------------------^
+
+ ## Training
+ |---------------------|
+ x -> TextCondEncoder() -> + -> PosteriorEncoder() -> split() -> z_u, z_v -> (d - z_u) -> concat() -> Flow() -> noise
+ d -> DurCondEncoder() -> ^ |
+ |------------------------------------------------------------------------------|
+
+ Args:
+ in_channels (int): Number of input tensor channels.
+ hidden_channels (int): Number of hidden channels.
+ kernel_size (int): Kernel size of convolutional layers.
+ dropout_p (float): Dropout rate.
+ num_flows (int, optional): Number of flow blocks. Defaults to 4.
+ cond_channels (int, optional): Number of channels of conditioning tensor. Defaults to 0.
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ hidden_channels: int,
+ kernel_size: int,
+ dropout_p: float,
+ num_flows=4,
+ cond_channels=0,
+ language_emb_dim=0,
+ ):
+ super().__init__()
+
+ # add language embedding dim in the input
+ if language_emb_dim:
+ in_channels += language_emb_dim
+
+ # condition encoder text
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p)
+ self.proj = nn.Conv1d(hidden_channels, hidden_channels, 1)
+
+ # posterior encoder
+ self.flows = nn.ModuleList()
+ self.flows.append(ElementwiseAffine(2))
+ self.flows += [ConvFlow(2, hidden_channels, kernel_size, num_layers=3) for _ in range(num_flows)]
+
+ # condition encoder duration
+ self.post_pre = nn.Conv1d(1, hidden_channels, 1)
+ self.post_convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p)
+ self.post_proj = nn.Conv1d(hidden_channels, hidden_channels, 1)
+
+ # flow layers
+ self.post_flows = nn.ModuleList()
+ self.post_flows.append(ElementwiseAffine(2))
+ self.post_flows += [ConvFlow(2, hidden_channels, kernel_size, num_layers=3) for _ in range(num_flows)]
+
+ if cond_channels != 0 and cond_channels is not None:
+ self.cond = nn.Conv1d(cond_channels, hidden_channels, 1)
+
+ if language_emb_dim != 0 and language_emb_dim is not None:
+ self.cond_lang = nn.Conv1d(language_emb_dim, hidden_channels, 1)
+
+ def forward(self, x, x_mask, dr=None, g=None, lang_emb=None, reverse=False, noise_scale=1.0):
+ """
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_mask: :math:`[B, 1, T]`
+ - dr: :math:`[B, 1, T]`
+ - g: :math:`[B, C]`
+ """
+ # condition encoder text
+ x = self.pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ if lang_emb is not None:
+ x = x + self.cond_lang(lang_emb)
+
+ x = self.convs(x, x_mask)
+ x = self.proj(x) * x_mask
+
+ if not reverse:
+ flows = self.flows
+ assert dr is not None
+
+ # condition encoder duration
+ h = self.post_pre(dr)
+ h = self.post_convs(h, x_mask)
+ h = self.post_proj(h) * x_mask
+ noise = torch.randn(dr.size(0), 2, dr.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
+ z_q = noise
+
+ # posterior encoder
+ logdet_tot_q = 0.0
+ for idx, flow in enumerate(self.post_flows):
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h))
+ logdet_tot_q = logdet_tot_q + logdet_q
+ if idx > 0:
+ z_q = torch.flip(z_q, [1])
+
+ z_u, z_v = torch.split(z_q, [1, 1], 1)
+ u = torch.sigmoid(z_u) * x_mask
+ z0 = (dr - u) * x_mask
+
+ # posterior encoder - neg log likelihood
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
+ nll_posterior_encoder = (
+ torch.sum(-0.5 * (math.log(2 * math.pi) + (noise**2)) * x_mask, [1, 2]) - logdet_tot_q
+ )
+
+ z0 = torch.log(torch.clamp_min(z0, 1e-5)) * x_mask
+ logdet_tot = torch.sum(-z0, [1, 2])
+ z = torch.cat([z0, z_v], 1)
+
+ # flow layers
+ for idx, flow in enumerate(flows):
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
+ logdet_tot = logdet_tot + logdet
+ if idx > 0:
+ z = torch.flip(z, [1])
+
+ # flow layers - neg log likelihood
+ nll_flow_layers = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot
+ return nll_flow_layers + nll_posterior_encoder
+
+ flows = list(reversed(self.flows))
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
+ for flow in flows:
+ z = torch.flip(z, [1])
+ z = flow(z, x_mask, g=x, reverse=reverse)
+
+ z0, _ = torch.split(z, [1, 1], 1)
+ logw = z0
+ return logw
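+
+
+# Usage sketch for StochasticDurationPredictor above (illustrative only; shapes,
+# hyperparameters and the duration values are assumptions):
+#
+#     >>> sdp = StochasticDurationPredictor(in_channels=192, hidden_channels=192,
+#     ...                                   kernel_size=3, dropout_p=0.5)
+#     >>> x = torch.randn(2, 192, 50)                     # text-encoder hidden states
+#     >>> mask = torch.ones(2, 1, 50)
+#     >>> dr = torch.randint(1, 10, (2, 1, 50)).float()   # ground-truth durations in frames
+#     >>> nll = sdp(x, mask, dr=dr)                           # training: per-utterance NLL, shape (2,)
+#     >>> logw = sdp(x, mask, reverse=True, noise_scale=0.8)  # inference: predicted log-durations (2, 1, 50)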
diff --git a/submodules/TTS/TTS/tts/layers/vits/transforms.py b/submodules/TTS/TTS/tts/layers/vits/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cac1b8d6d12fe98123ca554899978782cf3b4c5
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/vits/transforms.py
@@ -0,0 +1,202 @@
+# adapted from https://github.com/bayesiains/nflows
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+def piecewise_rational_quadratic_transform(
+ inputs,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=False,
+ tails=None,
+ tail_bound=1.0,
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+ if tails is None:
+ spline_fn = rational_quadratic_spline
+ spline_kwargs = {}
+ else:
+ spline_fn = unconstrained_rational_quadratic_spline
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
+
+ outputs, logabsdet = spline_fn(
+ inputs=inputs,
+ unnormalized_widths=unnormalized_widths,
+ unnormalized_heights=unnormalized_heights,
+ unnormalized_derivatives=unnormalized_derivatives,
+ inverse=inverse,
+ min_bin_width=min_bin_width,
+ min_bin_height=min_bin_height,
+ min_derivative=min_derivative,
+ **spline_kwargs,
+ )
+ return outputs, logabsdet
+
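+# Usage sketch for piecewise_rational_quadratic_transform above (illustrative only; the
+# bin count and shapes are assumptions). With linear tails, applying the inverse to the
+# output recovers the input up to numerical error:
+#
+#     >>> x = torch.randn(2, 10)
+#     >>> w = torch.randn(2, 10, 8)   # 8 unnormalized bin widths
+#     >>> h = torch.randn(2, 10, 8)   # 8 unnormalized bin heights
+#     >>> d = torch.randn(2, 10, 7)   # num_bins - 1 unnormalized interior derivatives
+#     >>> y, logabsdet = piecewise_rational_quadratic_transform(
+#     ...     x, w, h, d, tails="linear", tail_bound=5.0)
+#     >>> x_rec, _ = piecewise_rational_quadratic_transform(
+#     ...     y, w, h, d, inverse=True, tails="linear", tail_bound=5.0)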
+
+def searchsorted(bin_locations, inputs, eps=1e-6):
+ bin_locations[..., -1] += eps
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
+
+
+def unconstrained_rational_quadratic_spline(
+ inputs,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=False,
+ tails="linear",
+ tail_bound=1.0,
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+ outside_interval_mask = ~inside_interval_mask
+
+ outputs = torch.zeros_like(inputs)
+ logabsdet = torch.zeros_like(inputs)
+
+ if tails == "linear":
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+ constant = np.log(np.exp(1 - min_derivative) - 1)
+ unnormalized_derivatives[..., 0] = constant
+ unnormalized_derivatives[..., -1] = constant
+
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
+ logabsdet[outside_interval_mask] = 0
+ else:
+ raise RuntimeError("{} tails are not implemented.".format(tails))
+
+ outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
+ inputs=inputs[inside_interval_mask],
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+ inverse=inverse,
+ left=-tail_bound,
+ right=tail_bound,
+ bottom=-tail_bound,
+ top=tail_bound,
+ min_bin_width=min_bin_width,
+ min_bin_height=min_bin_height,
+ min_derivative=min_derivative,
+ )
+
+ return outputs, logabsdet
+
+
+def rational_quadratic_spline(
+ inputs,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=False,
+ left=0.0,
+ right=1.0,
+ bottom=0.0,
+ top=1.0,
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+ if torch.min(inputs) < left or torch.max(inputs) > right:
+ raise ValueError("Input to a transform is not within its domain")
+
+ num_bins = unnormalized_widths.shape[-1]
+
+ if min_bin_width * num_bins > 1.0:
+ raise ValueError("Minimal bin width too large for the number of bins")
+ if min_bin_height * num_bins > 1.0:
+ raise ValueError("Minimal bin height too large for the number of bins")
+
+ widths = F.softmax(unnormalized_widths, dim=-1)
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+ cumwidths = torch.cumsum(widths, dim=-1)
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+ cumwidths = (right - left) * cumwidths + left
+ cumwidths[..., 0] = left
+ cumwidths[..., -1] = right
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+ heights = F.softmax(unnormalized_heights, dim=-1)
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+ cumheights = torch.cumsum(heights, dim=-1)
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+ cumheights = (top - bottom) * cumheights + bottom
+ cumheights[..., 0] = bottom
+ cumheights[..., -1] = top
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+ if inverse:
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
+ else:
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+ delta = heights / widths
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+ if inverse:
+ a = (inputs - input_cumheights) * (
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
+ ) + input_heights * (input_delta - input_derivatives)
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
+ )
+ c = -input_delta * (inputs - input_cumheights)
+
+ discriminant = b.pow(2) - 4 * a * c
+ assert (discriminant >= 0).all()
+
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
+ outputs = root * input_bin_widths + input_cumwidths
+
+ theta_one_minus_theta = root * (1 - root)
+ denominator = input_delta + (
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta
+ )
+ derivative_numerator = input_delta.pow(2) * (
+ input_derivatives_plus_one * root.pow(2)
+ + 2 * input_delta * theta_one_minus_theta
+ + input_derivatives * (1 - root).pow(2)
+ )
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+ return outputs, -logabsdet
+ else:
+ theta = (inputs - input_cumwidths) / input_bin_widths
+ theta_one_minus_theta = theta * (1 - theta)
+
+ numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
+ denominator = input_delta + (
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta
+ )
+ outputs = input_cumheights + numerator / denominator
+
+ derivative_numerator = input_delta.pow(2) * (
+ input_derivatives_plus_one * theta.pow(2)
+ + 2 * input_delta * theta_one_minus_theta
+ + input_derivatives * (1 - theta).pow(2)
+ )
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+ return outputs, logabsdet
diff --git a/submodules/TTS/TTS/tts/layers/xtts/dvae.py b/submodules/TTS/TTS/tts/layers/xtts/dvae.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdd7a9d09f44cc8dae102a053c365462dc416b6d
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/dvae.py
@@ -0,0 +1,393 @@
+import functools
+from math import sqrt
+
+import torch
+import torch.distributed as distributed
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+from einops import rearrange
+
+
+def default(val, d):
+ return val if val is not None else d
+
+
+def eval_decorator(fn):
+ def inner(model, *args, **kwargs):
+ was_training = model.training
+ model.eval()
+ out = fn(model, *args, **kwargs)
+ model.train(was_training)
+ return out
+
+ return inner
+
+
+def dvae_wav_to_mel(
+ wav, mel_norms_file="../experiments/clips_mel_norms.pth", mel_norms=None, device=torch.device("cpu")
+):
+ mel_stft = torchaudio.transforms.MelSpectrogram(
+ n_fft=1024,
+ hop_length=256,
+ win_length=1024,
+ power=2,
+ normalized=False,
+ sample_rate=22050,
+ f_min=0,
+ f_max=8000,
+ n_mels=80,
+ norm="slaney",
+ ).to(device)
+ wav = wav.to(device)
+ mel = mel_stft(wav)
+ mel = torch.log(torch.clamp(mel, min=1e-5))
+ if mel_norms is None:
+ mel_norms = torch.load(mel_norms_file, map_location=device)
+ mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1)
+ return mel
+
+
+class Quantize(nn.Module):
+ def __init__(self, dim, n_embed, decay=0.99, eps=1e-5, balancing_heuristic=False, new_return_order=False):
+ super().__init__()
+
+ self.dim = dim
+ self.n_embed = n_embed
+ self.decay = decay
+ self.eps = eps
+
+ self.balancing_heuristic = balancing_heuristic
+ self.codes = None
+ self.max_codes = 64000
+ self.codes_full = False
+ self.new_return_order = new_return_order
+
+ embed = torch.randn(dim, n_embed)
+ self.register_buffer("embed", embed)
+ self.register_buffer("cluster_size", torch.zeros(n_embed))
+ self.register_buffer("embed_avg", embed.clone())
+
+ def forward(self, input, return_soft_codes=False):
+ if self.balancing_heuristic and self.codes_full:
+ h = torch.histc(self.codes, bins=self.n_embed, min=0, max=self.n_embed) / len(self.codes)
+ mask = torch.logical_or(h > 0.9, h < 0.01).unsqueeze(1)
+ ep = self.embed.permute(1, 0)
+ ea = self.embed_avg.permute(1, 0)
+ rand_embed = torch.randn_like(ep) * mask
+ self.embed = (ep * ~mask + rand_embed).permute(1, 0)
+ self.embed_avg = (ea * ~mask + rand_embed).permute(1, 0)
+ self.cluster_size = self.cluster_size * ~mask.squeeze()
+ if torch.any(mask):
+ print(f"Reset {torch.sum(mask)} embedding codes.")
+ self.codes = None
+ self.codes_full = False
+
+ flatten = input.reshape(-1, self.dim)
+ dist = flatten.pow(2).sum(1, keepdim=True) - 2 * flatten @ self.embed + self.embed.pow(2).sum(0, keepdim=True)
+ soft_codes = -dist
+ _, embed_ind = soft_codes.max(1)
+ embed_onehot = F.one_hot(embed_ind, self.n_embed).type(flatten.dtype)
+ embed_ind = embed_ind.view(*input.shape[:-1])
+ quantize = self.embed_code(embed_ind)
+
+ if self.balancing_heuristic:
+ if self.codes is None:
+ self.codes = embed_ind.flatten()
+ else:
+ self.codes = torch.cat([self.codes, embed_ind.flatten()])
+ if len(self.codes) > self.max_codes:
+ self.codes = self.codes[-self.max_codes :]
+ self.codes_full = True
+
+ if self.training:
+ embed_onehot_sum = embed_onehot.sum(0)
+ embed_sum = flatten.transpose(0, 1) @ embed_onehot
+
+ if distributed.is_initialized() and distributed.get_world_size() > 1:
+ distributed.all_reduce(embed_onehot_sum)
+ distributed.all_reduce(embed_sum)
+
+ self.cluster_size.data.mul_(self.decay).add_(embed_onehot_sum, alpha=1 - self.decay)
+ self.embed_avg.data.mul_(self.decay).add_(embed_sum, alpha=1 - self.decay)
+ n = self.cluster_size.sum()
+ cluster_size = (self.cluster_size + self.eps) / (n + self.n_embed * self.eps) * n
+ embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
+ self.embed.data.copy_(embed_normalized)
+
+ diff = (quantize.detach() - input).pow(2).mean()
+ quantize = input + (quantize - input).detach()
+
+ if return_soft_codes:
+ return quantize, diff, embed_ind, soft_codes.view(input.shape[:-1] + (-1,))
+ elif self.new_return_order:
+ return quantize, embed_ind, diff
+ else:
+ return quantize, diff, embed_ind
+
+ def embed_code(self, embed_id):
+ return F.embedding(embed_id, self.embed.transpose(0, 1))
+
+
+# Fits a soft-discretized input to a normal-PDF across the specified dimension.
+# In other words, it attempts to force the discretization function to have an equal mean utilization across all
+# discrete values, with the specified expected variance.
+class DiscretizationLoss(nn.Module):
+ def __init__(self, discrete_bins, dim, expected_variance, store_past=0):
+ super().__init__()
+ self.discrete_bins = discrete_bins
+ self.dim = dim
+ self.dist = torch.distributions.Normal(0, scale=expected_variance)
+ if store_past > 0:
+ self.record_past = True
+ self.register_buffer("accumulator_index", torch.zeros(1, dtype=torch.long, device="cpu"))
+ self.register_buffer("accumulator_filled", torch.zeros(1, dtype=torch.long, device="cpu"))
+ self.register_buffer("accumulator", torch.zeros(store_past, discrete_bins))
+ else:
+ self.record_past = False
+
+ def forward(self, x):
+ other_dims = set(range(len(x.shape))) - set([self.dim])
+ averaged = x.sum(dim=tuple(other_dims)) / x.sum()
+ averaged = averaged - averaged.mean()
+
+ if self.record_past:
+ acc_count = self.accumulator.shape[0]
+ avg = averaged.detach().clone()
+ if self.accumulator_filled > 0:
+ averaged = torch.mean(self.accumulator, dim=0) * (acc_count - 1) / acc_count + averaged / acc_count
+
+ # Also push averaged into the accumulator.
+ self.accumulator[self.accumulator_index] = avg
+ self.accumulator_index += 1
+ if self.accumulator_index >= acc_count:
+ self.accumulator_index *= 0
+ if self.accumulator_filled <= 0:
+ self.accumulator_filled += 1
+
+ return torch.sum(-self.dist.log_prob(averaged))
+
+
+class ResBlock(nn.Module):
+ def __init__(self, chan, conv, activation):
+ super().__init__()
+ self.net = nn.Sequential(
+ conv(chan, chan, 3, padding=1),
+ activation(),
+ conv(chan, chan, 3, padding=1),
+ activation(),
+ conv(chan, chan, 1),
+ )
+
+ def forward(self, x):
+ return self.net(x) + x
+
+
+class UpsampledConv(nn.Module):
+ def __init__(self, conv, *args, **kwargs):
+ super().__init__()
+ assert "stride" in kwargs.keys()
+ self.stride = kwargs["stride"]
+ del kwargs["stride"]
+ self.conv = conv(*args, **kwargs)
+
+ def forward(self, x):
+ up = nn.functional.interpolate(x, scale_factor=self.stride, mode="nearest")
+ return self.conv(up)
+
+
+# DiscreteVAE partially derived from lucidrains DALLE implementation
+# Credit: https://github.com/lucidrains/DALLE-pytorch
+class DiscreteVAE(nn.Module):
+ def __init__(
+ self,
+ positional_dims=2,
+ num_tokens=512,
+ codebook_dim=512,
+ num_layers=3,
+ num_resnet_blocks=0,
+ hidden_dim=64,
+ channels=3,
+ stride=2,
+ kernel_size=4,
+ use_transposed_convs=True,
+ encoder_norm=False,
+ activation="relu",
+ smooth_l1_loss=False,
+ straight_through=False,
+ normalization=None, # ((0.5,) * 3, (0.5,) * 3),
+ record_codes=False,
+ discretization_loss_averaging_steps=100,
+ lr_quantizer_args={},
+ ):
+ super().__init__()
+ has_resblocks = num_resnet_blocks > 0
+
+ self.num_tokens = num_tokens
+ self.num_layers = num_layers
+ self.straight_through = straight_through
+ self.positional_dims = positional_dims
+ self.discrete_loss = DiscretizationLoss(
+ num_tokens, 2, 1 / (num_tokens * 2), discretization_loss_averaging_steps
+ )
+
+ assert positional_dims > 0 and positional_dims < 3 # This VAE only supports 1d and 2d inputs for now.
+ if positional_dims == 2:
+ conv = nn.Conv2d
+ conv_transpose = nn.ConvTranspose2d
+ else:
+ conv = nn.Conv1d
+ conv_transpose = nn.ConvTranspose1d
+ if not use_transposed_convs:
+ conv_transpose = functools.partial(UpsampledConv, conv)
+
+ if activation == "relu":
+ act = nn.ReLU
+ elif activation == "silu":
+ act = nn.SiLU
+ else:
+ raise NotImplementedError(f"Unsupported activation: {activation}")
+
+ enc_layers = []
+ dec_layers = []
+
+ if num_layers > 0:
+ enc_chans = [hidden_dim * 2**i for i in range(num_layers)]
+ dec_chans = list(reversed(enc_chans))
+
+ enc_chans = [channels, *enc_chans]
+
+ dec_init_chan = codebook_dim if not has_resblocks else dec_chans[0]
+ dec_chans = [dec_init_chan, *dec_chans]
+
+ enc_chans_io, dec_chans_io = map(lambda t: list(zip(t[:-1], t[1:])), (enc_chans, dec_chans))
+
+ pad = (kernel_size - 1) // 2
+ for (enc_in, enc_out), (dec_in, dec_out) in zip(enc_chans_io, dec_chans_io):
+ enc_layers.append(nn.Sequential(conv(enc_in, enc_out, kernel_size, stride=stride, padding=pad), act()))
+ if encoder_norm:
+ enc_layers.append(nn.GroupNorm(8, enc_out))
+ dec_layers.append(
+ nn.Sequential(conv_transpose(dec_in, dec_out, kernel_size, stride=stride, padding=pad), act())
+ )
+ dec_out_chans = dec_chans[-1]
+ innermost_dim = dec_chans[0]
+ else:
+ enc_layers.append(nn.Sequential(conv(channels, hidden_dim, 1), act()))
+ dec_out_chans = hidden_dim
+ innermost_dim = hidden_dim
+
+ for _ in range(num_resnet_blocks):
+ dec_layers.insert(0, ResBlock(innermost_dim, conv, act))
+ enc_layers.append(ResBlock(innermost_dim, conv, act))
+
+ if num_resnet_blocks > 0:
+ dec_layers.insert(0, conv(codebook_dim, innermost_dim, 1))
+
+ enc_layers.append(conv(innermost_dim, codebook_dim, 1))
+ dec_layers.append(conv(dec_out_chans, channels, 1))
+
+ self.encoder = nn.Sequential(*enc_layers)
+ self.decoder = nn.Sequential(*dec_layers)
+
+ self.loss_fn = F.smooth_l1_loss if smooth_l1_loss else F.mse_loss
+ self.codebook = Quantize(codebook_dim, num_tokens, new_return_order=True)
+
+ # take care of normalization within class
+ self.normalization = normalization
+ self.record_codes = record_codes
+ if record_codes:
+ self.codes = torch.zeros((1228800,), dtype=torch.long)
+ self.code_ind = 0
+ self.total_codes = 0
+ self.internal_step = 0
+
+ def norm(self, images):
+ if self.normalization is None:
+ return images
+
+ means, stds = map(lambda t: torch.as_tensor(t).to(images), self.normalization)
+ arrange = "c -> () c () ()" if self.positional_dims == 2 else "c -> () c ()"
+ means, stds = map(lambda t: rearrange(t, arrange), (means, stds))
+ images = images.clone()
+ images.sub_(means).div_(stds)
+ return images
+
+ def get_debug_values(self, step, __):
+ if self.record_codes and self.total_codes > 0:
+ # Report the histogram of codebook usage for debugging.
+ return {"histogram_codes": self.codes[: self.total_codes]}
+ else:
+ return {}
+
+ @torch.no_grad()
+ @eval_decorator
+ def get_codebook_indices(self, images):
+ img = self.norm(images)
+ logits = self.encoder(img).permute((0, 2, 3, 1) if len(img.shape) == 4 else (0, 2, 1))
+ sampled, codes, _ = self.codebook(logits)
+ self.log_codes(codes)
+ return codes
+
+ def decode(self, img_seq):
+ self.log_codes(img_seq)
+ if hasattr(self.codebook, "embed_code"):
+ image_embeds = self.codebook.embed_code(img_seq)
+ else:
+ image_embeds = F.embedding(img_seq, self.codebook.codebook)
+ b, n, d = image_embeds.shape
+
+ kwargs = {}
+ if self.positional_dims == 1:
+ arrange = "b n d -> b d n"
+ else:
+ h = w = int(sqrt(n))
+ arrange = "b (h w) d -> b d h w"
+ kwargs = {"h": h, "w": w}
+ image_embeds = rearrange(image_embeds, arrange, **kwargs)
+ images = [image_embeds]
+ for layer in self.decoder:
+ images.append(layer(images[-1]))
+ return images[-1], images[-2]
+
+ def infer(self, img):
+ img = self.norm(img)
+ logits = self.encoder(img).permute((0, 2, 3, 1) if len(img.shape) == 4 else (0, 2, 1))
+ sampled, codes, commitment_loss = self.codebook(logits)
+ return self.decode(codes)
+
+ # Note: forward() is only meant to be used while training. When the module detects that it is running in eval()
+ # mode, it instead decodes from the quantized codes, which is substantially more lossy (but useful for determining
+ # actual network performance).
+ def forward(self, img):
+ img = self.norm(img)
+ logits = self.encoder(img).permute((0, 2, 3, 1) if len(img.shape) == 4 else (0, 2, 1))
+ sampled, codes, commitment_loss = self.codebook(logits)
+ sampled = sampled.permute((0, 3, 1, 2) if len(img.shape) == 4 else (0, 2, 1))
+
+ if self.training:
+ out = sampled
+ for d in self.decoder:
+ out = d(out)
+ self.log_codes(codes)
+ else:
+ # This is non-differentiable, but gives a better idea of how the network is actually performing.
+ out, _ = self.decode(codes)
+
+ # reconstruction loss
+ recon_loss = self.loss_fn(img, out, reduction="none")
+
+ return recon_loss, commitment_loss, out
+
+ def log_codes(self, codes):
+ # This is so we can debug the distribution of codes being learned.
+ if self.record_codes and self.internal_step % 10 == 0:
+ codes = codes.flatten()
+ l = codes.shape[0]
+ i = self.code_ind if (self.codes.shape[0] - self.code_ind) > l else self.codes.shape[0] - l
+ self.codes[i : i + l] = codes.cpu()
+ self.code_ind = self.code_ind + l
+ if self.code_ind >= self.codes.shape[0]:
+ self.code_ind = 0
+ self.total_codes += 1
+ self.internal_step += 1
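+
+
+# Usage sketch for DiscreteVAE above on 1d mel-spectrogram input (illustrative only; the
+# 80-channel input, hyperparameters and sequence length are assumptions, not this
+# repository's DVAE config):
+#
+#     >>> dvae = DiscreteVAE(
+#     ...     positional_dims=1, channels=80, num_tokens=1024, codebook_dim=512,
+#     ...     hidden_dim=512, num_layers=2, num_resnet_blocks=3, kernel_size=3,
+#     ...     use_transposed_convs=False,
+#     ... )
+#     >>> mel = torch.randn(1, 80, 256)                  # length divisible by 2**num_layers
+#     >>> recon_loss, commitment_loss, out = dvae(mel)   # recon_loss is unreduced (same shape as mel)
+#     >>> codes = dvae.get_codebook_indices(mel)         # discrete token ids, (1, 64)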
diff --git a/submodules/TTS/TTS/tts/layers/xtts/gpt.py b/submodules/TTS/TTS/tts/layers/xtts/gpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7b186b858a4dd8baec24a7214ae7bc573097fed
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/gpt.py
@@ -0,0 +1,611 @@
+# ported from: https://github.com/neonbjb/tortoise-tts
+
+import functools
+import math
+import random
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import GPT2Config
+
+from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel
+from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder
+from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler
+
+
+def null_position_embeddings(range, dim):
+ return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)
+
+
+class LearnedPositionEmbeddings(nn.Module):
+ def __init__(self, seq_len, model_dim, init=0.02, relative=False):
+ super().__init__()
+ # nn.Embedding
+ self.emb = torch.nn.Embedding(seq_len, model_dim)
+ # Initializing this way is standard for GPT-2
+ self.emb.weight.data.normal_(mean=0.0, std=init)
+ self.relative = relative
+ self.seq_len = seq_len
+
+ def forward(self, x):
+ sl = x.shape[1]
+ if self.relative:
+ start = random.randint(sl, self.seq_len) - sl
+ return self.emb(torch.arange(start, start + sl, device=x.device))
+ else:
+ return self.emb(torch.arange(0, sl, device=x.device))
+
+ def get_fixed_embedding(self, ind, dev):
+ return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)
+
+
+def build_hf_gpt_transformer(
+ layers,
+ model_dim,
+ heads,
+ max_mel_seq_len,
+ max_text_seq_len,
+ max_prompt_len,
+ checkpointing,
+):
+ """
+ GPT-2 implemented by the HuggingFace library.
+ """
+ from transformers import GPT2Config, GPT2Model
+
+ gpt_config = GPT2Config(
+ vocab_size=256, # Unused.
+ n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len,
+ n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len,
+ n_embd=model_dim,
+ n_layer=layers,
+ n_head=heads,
+ gradient_checkpointing=checkpointing,
+ use_cache=not checkpointing,
+ )
+ gpt = GPT2Model(gpt_config)
+ # Override the built in positional embeddings
+ del gpt.wpe
+ gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim)
+ # Built-in token embeddings are unused.
+ del gpt.wte
+
+ mel_pos_emb = (
+ LearnedPositionEmbeddings(max_mel_seq_len, model_dim)
+ if max_mel_seq_len != -1
+ else functools.partial(null_position_embeddings, dim=model_dim)
+ )
+ text_pos_emb = (
+ LearnedPositionEmbeddings(max_text_seq_len, model_dim)
+ if max_text_seq_len != -1
+ else functools.partial(null_position_embeddings, dim=model_dim)
+ )
+ # gpt = torch.compile(gpt, mode="reduce-overhead", fullgraph=True)
+ return gpt, mel_pos_emb, text_pos_emb, None, None
+
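+# Usage sketch for build_hf_gpt_transformer above (illustrative only; the sequence-length
+# budgets are arbitrary assumptions). The returned GPT-2 is driven with pre-computed input
+# embeddings, as done in GPT.get_logits below:
+#
+#     >>> gpt, mel_pos_emb, text_pos_emb, _, _ = build_hf_gpt_transformer(
+#     ...     layers=8, model_dim=512, heads=8,
+#     ...     max_mel_seq_len=600, max_text_seq_len=400, max_prompt_len=70,
+#     ...     checkpointing=False,
+#     ... )
+#     >>> emb = torch.randn(1, 32, 512)
+#     >>> out = gpt(inputs_embeds=emb, return_dict=True).last_hidden_state  # (1, 32, 512)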
+
+class GPT(nn.Module):
+ def __init__(
+ self,
+ start_text_token=261,
+ stop_text_token=0,
+ layers=8,
+ model_dim=512,
+ heads=8,
+ max_text_tokens=120,
+ max_mel_tokens=250,
+ max_prompt_tokens=70,
+ max_conditioning_inputs=1,
+ code_stride_len=1024,
+ number_text_tokens=256,
+ num_audio_tokens=8194,
+ start_audio_token=8192,
+ stop_audio_token=8193,
+ train_solo_embeddings=False,
+ checkpointing=False,
+ average_conditioning_embeddings=False,
+ label_smoothing=0.0,
+ use_perceiver_resampler=False,
+ perceiver_cond_length_compression=256,
+ ):
+ """
+ Args:
+
+ """
+ super().__init__()
+
+ self.label_smoothing = label_smoothing
+ self.number_text_tokens = number_text_tokens
+ self.start_text_token = start_text_token
+ self.stop_text_token = stop_text_token
+ self.num_audio_tokens = num_audio_tokens
+ self.start_audio_token = start_audio_token
+ self.stop_audio_token = stop_audio_token
+ self.start_prompt_token = start_audio_token
+ self.stop_prompt_token = stop_audio_token
+ self.layers = layers
+ self.heads = heads
+ self.model_dim = model_dim
+ self.max_conditioning_inputs = max_conditioning_inputs
+ self.max_gen_mel_tokens = max_mel_tokens - self.max_conditioning_inputs - 2
+ self.max_mel_tokens = -1 if max_mel_tokens == -1 else max_mel_tokens + 2 + self.max_conditioning_inputs
+ self.max_text_tokens = -1 if max_text_tokens == -1 else max_text_tokens + 2
+ self.max_prompt_tokens = max_prompt_tokens
+ self.code_stride_len = code_stride_len
+ self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
+ self.conditioning_dropout = nn.Dropout1d(0.1)
+ self.average_conditioning_embeddings = average_conditioning_embeddings
+ self.use_perceiver_resampler = use_perceiver_resampler
+ self.perceiver_cond_length_compression = perceiver_cond_length_compression
+
+ self.text_embedding = nn.Embedding(self.number_text_tokens, model_dim)
+ self.mel_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
+
+ (
+ self.gpt,
+ self.mel_pos_embedding,
+ self.text_pos_embedding,
+ self.mel_layer_pos_embedding,
+ self.text_layer_pos_embedding,
+ ) = build_hf_gpt_transformer(
+ layers,
+ model_dim,
+ heads,
+ self.max_mel_tokens,
+ self.max_text_tokens,
+ self.max_prompt_tokens,
+ checkpointing,
+ )
+ if train_solo_embeddings:
+ self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True)
+ self.text_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True)
+ else:
+ self.mel_solo_embedding = 0
+ self.text_solo_embedding = 0
+
+ self.final_norm = nn.LayerNorm(model_dim)
+ self.text_head = nn.Linear(model_dim, self.number_text_tokens)
+ self.mel_head = nn.Linear(model_dim, self.num_audio_tokens)
+
+ if self.use_perceiver_resampler:
+ # XTTS v2
+ self.conditioning_perceiver = PerceiverResampler(
+ dim=model_dim,
+ depth=2,
+ dim_context=model_dim,
+ num_latents=32,
+ dim_head=64,
+ heads=8,
+ ff_mult=4,
+ use_flash_attn=False,
+ )
+ else:
+ # XTTS v1
+ self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
+ self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim)
+
+ def get_grad_norm_parameter_groups(self):
+ return {
+ "conditioning_encoder": list(self.conditioning_encoder.parameters()),
+ "conditioning_perceiver": list(self.conditioning_perceiver.parameters())
+ if self.use_perceiver_resampler
+ else None,
+ "gpt": list(self.gpt.parameters()),
+ "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
+ }
+
+ def init_gpt_for_inference(self, kv_cache=True, use_deepspeed=False):
+ seq_length = self.max_prompt_tokens + self.max_mel_tokens + self.max_text_tokens + 1
+ gpt_config = GPT2Config(
+ vocab_size=self.max_mel_tokens,
+ n_positions=seq_length,
+ n_ctx=seq_length,
+ n_embd=self.model_dim,
+ n_layer=self.layers,
+ n_head=self.heads,
+ gradient_checkpointing=False,
+ use_cache=True,
+ )
+ self.gpt_inference = GPT2InferenceModel(
+ gpt_config,
+ self.gpt,
+ self.mel_pos_embedding,
+ self.mel_embedding,
+ self.final_norm,
+ self.mel_head,
+ kv_cache=kv_cache,
+ )
+ self.gpt.wte = self.mel_embedding
+
+ if use_deepspeed:
+ import deepspeed
+
+ self.ds_engine = deepspeed.init_inference(
+ model=self.gpt_inference.half(), # Transformers models
+ mp_size=1, # Number of GPUs
+ dtype=torch.float32, # desired data type of output
+ replace_method="auto", # Lets DeepSpeed automatically identify the layers to replace
+ replace_with_kernel_inject=True, # replace the model with the kernel injector
+ )
+ self.gpt_inference = self.ds_engine.module.eval()
+
+ def set_inputs_and_targets(self, input, start_token, stop_token):
+ inp = F.pad(input, (1, 0), value=start_token)
+ tar = F.pad(input, (0, 1), value=stop_token)
+ return inp, tar
+
+ def set_mel_padding(self, mel_input_tokens, code_lengths):
+ """
+ Given mel tokens that are derived from a padded audio clip and the actual lengths of each batch element in
+ that audio clip, reformats the tokens with stop_audio_token in place of the zero padding. This is required
+ preformatting to create a working TTS model.
+ """
+ # Set the padded region of each MEL token sequence to the stop_audio_token.
+ for b in range(len(code_lengths)):
+ actual_end = code_lengths[b]
+ if actual_end < mel_input_tokens.shape[-1]:
+ mel_input_tokens[b, actual_end:] = self.stop_audio_token
+ return mel_input_tokens
+
+ def get_logits(
+ self,
+ first_inputs,
+ first_head,
+ second_inputs=None,
+ second_head=None,
+ prompt=None,
+ get_attns=False,
+ return_latent=False,
+ attn_mask_cond=None,
+ attn_mask_text=None,
+ attn_mask_mel=None,
+ ):
+ if prompt is not None:
+ offset = prompt.shape[1]
+ if second_inputs is not None:
+ emb = torch.cat([prompt, first_inputs, second_inputs], dim=1)
+ else:
+ emb = torch.cat([prompt, first_inputs], dim=1)
+
+ # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+ attn_mask = None
+ if attn_mask_text is not None:
+ attn_mask = torch.cat([attn_mask_text, attn_mask_mel], dim=1)
+ if prompt is not None:
+ attn_mask_cond = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device)
+ attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)
+
+ gpt_out = self.gpt(
+ inputs_embeds=emb,
+ return_dict=True,
+ output_attentions=get_attns,
+ attention_mask=attn_mask,
+ )
+
+ if get_attns:
+ return gpt_out.attentions
+
+ enc = gpt_out.last_hidden_state[:, offset:]
+ enc = self.final_norm(enc)
+
+ if return_latent:
+ return enc[:, : first_inputs.shape[1]], enc[:, -second_inputs.shape[1] :]
+
+ first_logits = enc[:, : first_inputs.shape[1]]
+ first_logits = first_head(first_logits)
+ first_logits = first_logits.permute(0, 2, 1)
+ if second_inputs is not None:
+ second_logits = enc[:, -second_inputs.shape[1] :]
+ second_logits = second_head(second_logits)
+ second_logits = second_logits.permute(0, 2, 1)
+ return first_logits, second_logits
+ else:
+ return first_logits
+
+ def get_conditioning(self, speech_conditioning_input):
+ speech_conditioning_input = (
+ speech_conditioning_input.unsqueeze(1)
+ if len(speech_conditioning_input.shape) == 3
+ else speech_conditioning_input
+ )
+ conds = []
+ for j in range(speech_conditioning_input.shape[1]):
+ conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
+ conds = torch.stack(conds, dim=1)
+ conds = conds.mean(dim=1)
+ return conds
+
+ def get_prompts(self, prompt_codes):
+ """
+ Create a prompt from the mel codes. This is used to condition the model on the mel codes.
+ Pad the prompt with start and stop mel tokens.
+ """
+ prompt = prompt_codes
+ if self.training:
+ lengths = []
+ # Compute the real prompt length based on the first occurrence of token 83, which is used for padding
+ for i in range(prompt_codes.shape[0]):
+ length = 0
+ for j in range(prompt_codes.shape[1]):
+ if prompt_codes[i, j] == 83:
+ break
+ else:
+ length += 1
+ lengths.append(length)
+
+ # prompt_len = random.randint(1, 9) # in secs
+ prompt_len = 3
+ prompt_len = prompt_len * 24 # in frames
+ if prompt_codes.shape[-1] >= prompt_len:
+ for i in range(prompt_codes.shape[0]):
+ if lengths[i] < prompt_len:
+ start = 0
+ else:
+ start = random.randint(0, lengths[i] - prompt_len)
+ prompt = prompt_codes[:, start : start + prompt_len]
+
+ # add start and stop tokens
+ prompt = F.pad(prompt, (1, 0), value=self.start_prompt_token)
+ prompt = F.pad(prompt, (0, 1), value=self.stop_prompt_token)
+ return prompt
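+ # During training this typically yields a random 3 s slice of the reference codes
+ # (3 * 24 = 72 frames), wrapped with the start/stop prompt tokens, i.e. 74 prompt tokens per item.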
+
+ def get_style_emb(self, cond_input, return_latent=False):
+ """
+ cond_input: (b, 80, s) or (b, 1, 80, s)
+ conds: (b, 1024, s)
+ """
+ conds = None
+ if not return_latent:
+ if cond_input.ndim == 4:
+ cond_input = cond_input.squeeze(1)
+ conds = self.conditioning_encoder(cond_input) # (b, d, s)
+ if self.use_perceiver_resampler:
+ conds = self.conditioning_perceiver(conds.permute(0, 2, 1)).transpose(1, 2) # (b, d, 32)
+ else:
+ # already computed
+ conds = cond_input.unsqueeze(1)
+ return conds
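+ # Shape flow: (b, 80, s) mels -> conditioning encoder (b, d, s) -> perceiver resampler (b, d, 32),
+ # so the GPT prompt has a fixed length of 32 latents when the resampler is enabled.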
+
+ def forward(
+ self,
+ text_inputs,
+ text_lengths,
+ audio_codes,
+ wav_lengths,
+ cond_mels=None,
+ cond_idxs=None,
+ cond_lens=None,
+ cond_latents=None,
+ return_attentions=False,
+ return_latent=False,
+ ):
+ """
+ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
+ (actuated by `text_first`).
+
+ text_inputs: long tensor, (b,t)
+ text_lengths: long tensor, (b,)
+ mel_inputs: long tensor, (b,m)
+ wav_lengths: long tensor, (b,)
+ cond_mels: MEL float tensor, (b, 1, 80,s)
+ cond_idxs: cond start and end indexs, (b, 2)
+
+ If return_attentions is specified, only logits are returned.
+ If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned.
+ """
+ # ❗ FIXIT
+ if self.max_conditioning_inputs == 0:
+ assert cond_mels is None, " ❗ cond_mels is not None, but max_conditioning_inputs == 0"
+
+ max_text_len = text_lengths.max()
+ code_lengths = torch.ceil(wav_lengths / self.code_stride_len).long() + 3
+
+ if cond_lens is not None:
+ if self.use_perceiver_resampler:
+ cond_lens = cond_lens // self.perceiver_cond_length_compression
+ else:
+ cond_lens = cond_lens // self.code_stride_len
+
+ if cond_idxs is not None:
+ # recompute cond idxs for mel lengths
+ for idx in range(cond_idxs.size(0)):
+ if self.use_perceiver_resampler:
+ cond_idxs[idx] = cond_idxs[idx] // self.perceiver_cond_length_compression
+ else:
+ cond_idxs[idx] = cond_idxs[idx] // self.code_stride_len
+
+ # ensure that the cond_mel does not have padding
+ # if cond_lens is not None and cond_idxs is None:
+ # min_cond_len = torch.min(cond_lens)
+ # cond_mels = cond_mels[:, :, :, :min_cond_len]
+
+ # If len(codes) + 3 is larger than the maximum allowed length, we truncate the codes.
+ max_mel_len = code_lengths.max()
+
+ if max_mel_len > audio_codes.shape[-1]:
+ audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1]))
+
+ # 💖 Lovely assertions
+ assert (
+ max_mel_len <= audio_codes.shape[-1]
+ ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})"
+ assert (
+ max_text_len <= text_inputs.shape[-1]
+ ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})"
+
+ # Append stop token to text inputs
+ text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token)
+
+ # Append silence token to mel codes
+ audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
+
+ # Pad mel codes with stop_audio_token
+ audio_codes = self.set_mel_padding(
+ audio_codes, code_lengths - 3
+ ) # -3 to get the real code lengths, excluding the start and stop tokens that have not been added yet
+
+ # Build input and target tensors
+ # Prepend start token to inputs and append stop token to targets
+ text_inputs, text_targets = self.set_inputs_and_targets(
+ text_inputs, self.start_text_token, self.stop_text_token
+ )
+ audio_codes, mel_targets = self.set_inputs_and_targets(
+ audio_codes, self.start_audio_token, self.stop_audio_token
+ )
+
+ # Set attn_mask
+ attn_mask_cond = None
+ attn_mask_text = None
+ attn_mask_mel = None
+ if not return_latent:
+ attn_mask_cond = torch.ones(
+ cond_mels.shape[0],
+ cond_mels.shape[-1],
+ dtype=torch.bool,
+ device=text_inputs.device,
+ )
+ attn_mask_text = torch.ones(
+ text_inputs.shape[0],
+ text_inputs.shape[1],
+ dtype=torch.bool,
+ device=text_inputs.device,
+ )
+ attn_mask_mel = torch.ones(
+ audio_codes.shape[0],
+ audio_codes.shape[1],
+ dtype=torch.bool,
+ device=audio_codes.device,
+ )
+
+ if cond_idxs is not None:
+ # use masking approach
+ for idx, r in enumerate(cond_idxs):
+ l = r[1] - r[0]
+ attn_mask_cond[idx, l:] = 0.0
+ elif cond_lens is not None:
+ for idx, l in enumerate(cond_lens):
+ attn_mask_cond[idx, l:] = 0.0
+
+ for idx, l in enumerate(text_lengths):
+ attn_mask_text[idx, l + 1 :] = 0.0
+
+ for idx, l in enumerate(code_lengths):
+ attn_mask_mel[idx, l + 1 :] = 0.0
+
+ # Compute text embeddings + positional embeddings
+ text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
+
+ # Compute mel embeddings + positional embeddings
+ mel_emb = self.mel_embedding(audio_codes) + self.mel_pos_embedding(audio_codes)
+
+ # Compute speech conditioning input
+ if cond_latents is None:
+ cond_latents = self.get_style_emb(cond_mels).transpose(1, 2)
+
+ # Get logits
+ sub = -5 # number of trailing latent frames to drop when returning latents (empirical value kept from the original implementation)
+ if self.training:
+ sub = -1
+
+ text_logits, mel_logits = self.get_logits(
+ text_emb,
+ self.text_head,
+ mel_emb,
+ self.mel_head,
+ prompt=cond_latents,
+ get_attns=return_attentions,
+ return_latent=return_latent,
+ attn_mask_cond=attn_mask_cond,
+ attn_mask_text=attn_mask_text,
+ attn_mask_mel=attn_mask_mel,
+ )
+ if return_latent:
+ return mel_logits[:, :sub] # drop the trailing latent frames (see sub above)
+
+ if return_attentions:
+ return mel_logits
+
+ # Set paddings to -1 to ignore them in loss
+ for idx, l in enumerate(text_lengths):
+ text_targets[idx, l + 1 :] = -1
+
+ for idx, l in enumerate(code_lengths):
+ mel_targets[idx, l + 1 :] = -1
+
+ # check that the stop token appears in every row of mel_targets
+ assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[
+ 0
+ ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row."
+
+ # ignore the loss for the segments used for conditioning
+ if cond_idxs is not None:
+ for idx in range(cond_idxs.size(0)):
+ cond_start = cond_idxs[idx, 0]
+ cond_end = cond_idxs[idx, 1]
+ mel_targets[idx, cond_start:cond_end] = -1
+
+ # Compute losses
+ loss_text = F.cross_entropy(
+ text_logits, text_targets.long(), ignore_index=-1, label_smoothing=self.label_smoothing
+ )
+ loss_mel = F.cross_entropy(
+ mel_logits, mel_targets.long(), ignore_index=-1, label_smoothing=self.label_smoothing
+ )
+ return loss_text.mean(), loss_mel.mean(), mel_logits
+
+ def inference(self, cond_latents, text_inputs, **hf_generate_kwargs):
+ self.compute_embeddings(cond_latents, text_inputs)
+ return self.generate(cond_latents, text_inputs, **hf_generate_kwargs)
+
+ def compute_embeddings(
+ self,
+ cond_latents,
+ text_inputs,
+ ):
+ text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
+ text_inputs = F.pad(text_inputs, (1, 0), value=self.start_text_token)
+ emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
+ emb = torch.cat([cond_latents, emb], dim=1)
+ self.gpt_inference.store_prefix_emb(emb)
+ gpt_inputs = torch.full(
+ (
+ emb.shape[0],
+ emb.shape[1] + 1, # +1 for the start_audio_token
+ ),
+ fill_value=1,
+ dtype=torch.long,
+ device=text_inputs.device,
+ )
+ gpt_inputs[:, -1] = self.start_audio_token
+ return gpt_inputs
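+ # gpt_inputs is a placeholder id sequence as long as the cached prefix (conditioning latents plus
+ # text embeddings) with one extra trailing start_audio_token; generation then continues from that
+ # position using the stored prefix embeddings instead of the placeholder ids.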
+
+ def generate(
+ self,
+ cond_latents,
+ text_inputs,
+ **hf_generate_kwargs,
+ ):
+ gpt_inputs = self.compute_embeddings(cond_latents, text_inputs)
+ gen = self.gpt_inference.generate(
+ gpt_inputs,
+ bos_token_id=self.start_audio_token,
+ pad_token_id=self.stop_audio_token,
+ eos_token_id=self.stop_audio_token,
+ max_length=self.max_gen_mel_tokens + gpt_inputs.shape[-1],
+ **hf_generate_kwargs,
+ )
+ if "return_dict_in_generate" in hf_generate_kwargs:
+ return gen.sequences[:, gpt_inputs.shape[1] :], gen
+ return gen[:, gpt_inputs.shape[1] :]
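+ # Illustrative usage (parameter values are only examples): gpt.generate(cond_latents, text_tokens,
+ # do_sample=True, top_p=0.85, temperature=0.75) returns only the newly generated audio codes;
+ # the conditioning and text prefix positions are stripped before returning.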
+
+ def get_generator(self, fake_inputs, **hf_generate_kwargs):
+ return self.gpt_inference.generate_stream(
+ fake_inputs,
+ bos_token_id=self.start_audio_token,
+ pad_token_id=self.stop_audio_token,
+ eos_token_id=self.stop_audio_token,
+ max_length=self.max_gen_mel_tokens + fake_inputs.shape[-1],
+ do_stream=True,
+ **hf_generate_kwargs,
+ )
diff --git a/submodules/TTS/TTS/tts/layers/xtts/gpt_inference.py b/submodules/TTS/TTS/tts/layers/xtts/gpt_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..d44bd3decd2eb14a5bed14e5d2a8232386ef7076
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/gpt_inference.py
@@ -0,0 +1,136 @@
+import math
+
+import torch
+from torch import nn
+from transformers import GPT2PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+
+class GPT2InferenceModel(GPT2PreTrainedModel):
+ """Override GPT2LMHeadModel to allow for prefix conditioning."""
+
+ def __init__(self, config, gpt, pos_emb, embeddings, norm, linear, kv_cache):
+ super().__init__(config)
+ self.transformer = gpt
+ self.pos_embedding = pos_emb
+ self.embeddings = embeddings
+ self.final_norm = norm
+ self.lm_head = nn.Sequential(norm, linear)
+ self.kv_cache = kv_cache
+
+ def store_prefix_emb(self, prefix_emb):
+ self.cached_prefix_emb = prefix_emb
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+ token_type_ids = kwargs.get("token_type_ids", None) # usually None
+ if not self.kv_cache:
+ past_key_values = None
+
+ # only last token for inputs_ids if past is defined in kwargs
+ if past_key_values is not None:
+ input_ids = input_ids[:, -1].unsqueeze(-1)
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
+
+ attention_mask = kwargs.get("attention_mask", None)
+ position_ids = kwargs.get("position_ids", None)
+
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values is not None:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+ else:
+ position_ids = None
+ return {
+ "input_ids": input_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "position_ids": position_ids,
+ "attention_mask": attention_mask,
+ "token_type_ids": token_type_ids,
+ }
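+ # With kv_cache enabled, only the most recent token id (and its position id) is passed on each
+ # step; everything earlier is reused from past_key_values.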
+
+ def forward(
+ self,
+ input_ids=None,
+ past_key_values=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ assert self.cached_prefix_emb is not None
+ assert inputs_embeds is None # Not supported by this inference model.
+ assert labels is None # Training not supported by this inference model.
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # assert len(past_key_values) + len(input_ids) == attention_mask.shape[1]
+
+ # Create embedding
+ prefix_len = self.cached_prefix_emb.shape[1]
+ if input_ids.shape[1] != 1:
+ gen_inputs = input_ids[:, prefix_len:]
+ gen_emb = self.embeddings(gen_inputs)
+ gen_emb = gen_emb + self.pos_embedding(gen_emb)
+ if self.cached_prefix_emb.shape[0] != gen_emb.shape[0]:
+ prefix_emb = self.cached_prefix_emb.repeat_interleave(
+ gen_emb.shape[0] // self.cached_prefix_emb.shape[0], 0
+ )
+ else:
+ prefix_emb = self.cached_prefix_emb.to(gen_emb.dtype)
+ emb = torch.cat([prefix_emb, gen_emb], dim=1)
+ else:
+ emb = self.embeddings(input_ids)
+ emb = emb + self.pos_embedding.get_fixed_embedding(
+ attention_mask.shape[1] - (prefix_len + 1), attention_mask.device
+ )
+ transformer_outputs = self.transformer(
+ inputs_embeds=emb,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ lm_logits = self.lm_head(hidden_states)
+
+ if not return_dict:
+ return (lm_logits,) + transformer_outputs[1:]
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=None,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ cross_attentions=transformer_outputs.cross_attentions,
+ )
+
+ @staticmethod
+ def _reorder_cache(past, beam_idx):
+ """
+ This function is used to re-order the :obj:`past_key_values` cache if
+ :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
+ called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
+ """
+ return tuple(
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+ for layer_past in past
+ )
diff --git a/submodules/TTS/TTS/tts/layers/xtts/hifigan_decoder.py b/submodules/TTS/TTS/tts/layers/xtts/hifigan_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9add7826e694e40296493b0830a6920ca9b36f1a
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/hifigan_decoder.py
@@ -0,0 +1,732 @@
+import torch
+import torchaudio
+from torch import nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn import functional as F
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
+
+from TTS.utils.io import load_fsspec
+
+LRELU_SLOPE = 0.1
+
+
+def get_padding(k, d):
+ return int((k * d - d) / 2)
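+# E.g. get_padding(3, 1) == 1 and get_padding(3, 5) == 5: this is "same" padding for the stride-1
+# dilated convolutions, so the residual blocks preserve the temporal length.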
+
+
+class ResBlock1(torch.nn.Module):
+ """Residual Block Type 1. It has 3 convolutional layers in each convolutional block.
+
+ Network::
+
+ x -> lrelu -> conv1_1 -> conv1_2 -> conv1_3 -> z -> lrelu -> conv2_1 -> conv2_2 -> conv2_3 -> o -> + -> o
+ |--------------------------------------------------------------------------------------------------|
+
+
+ Args:
+ channels (int): number of hidden channels for the convolutional layers.
+ kernel_size (int): size of the convolution filter in each layer.
+ dilations (list): list of dilation value for each conv layer in a block.
+ """
+
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+ super().__init__()
+ self.convs1 = nn.ModuleList(
+ [
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2]),
+ )
+ ),
+ ]
+ )
+
+ self.convs2 = nn.ModuleList(
+ [
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=1,
+ padding=get_padding(kernel_size, 1),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=1,
+ padding=get_padding(kernel_size, 1),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=1,
+ padding=get_padding(kernel_size, 1),
+ )
+ ),
+ ]
+ )
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): input tensor.
+ Returns:
+ Tensor: output tensor.
+ Shapes:
+ x: [B, C, T]
+ """
+ for c1, c2 in zip(self.convs1, self.convs2):
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ xt = c1(xt)
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
+ xt = c2(xt)
+ x = xt + x
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_parametrizations(l, "weight")
+ for l in self.convs2:
+ remove_parametrizations(l, "weight")
+
+
+class ResBlock2(torch.nn.Module):
+ """Residual Block Type 2. It has 1 convolutional layers in each convolutional block.
+
+ Network::
+
+ x -> lrelu -> conv1-> -> z -> lrelu -> conv2-> o -> + -> o
+ |---------------------------------------------------|
+
+
+ Args:
+ channels (int): number of hidden channels for the convolutional layers.
+ kernel_size (int): size of the convolution filter in each layer.
+ dilations (list): list of dilation value for each conv layer in a block.
+ """
+
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+ super().__init__()
+ self.convs = nn.ModuleList(
+ [
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]),
+ )
+ ),
+ ]
+ )
+
+ def forward(self, x):
+ for c in self.convs:
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ xt = c(xt)
+ x = xt + x
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs:
+ remove_parametrizations(l, "weight")
+
+
+class HifiganGenerator(torch.nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ resblock_type,
+ resblock_dilation_sizes,
+ resblock_kernel_sizes,
+ upsample_kernel_sizes,
+ upsample_initial_channel,
+ upsample_factors,
+ inference_padding=5,
+ cond_channels=0,
+ conv_pre_weight_norm=True,
+ conv_post_weight_norm=True,
+ conv_post_bias=True,
+ cond_in_each_up_layer=False,
+ ):
+ r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF)
+
+ Network:
+ x -> lrelu -> upsampling_layer -> resblock1_k1x1 -> z1 -> + -> z_sum / #resblocks -> lrelu -> conv_post_7x1 -> tanh -> o
+ .. -> zI ---|
+ resblockN_kNx1 -> zN ---'
+
+ Args:
+ in_channels (int): number of input tensor channels.
+ out_channels (int): number of output tensor channels.
+ resblock_type (str): type of the `ResBlock`. '1' or '2'.
+ resblock_dilation_sizes (List[List[int]]): list of dilation values in each layer of a `ResBlock`.
+ resblock_kernel_sizes (List[int]): list of kernel sizes for each `ResBlock`.
+ upsample_kernel_sizes (List[int]): list of kernel sizes for each transposed convolution.
+ upsample_initial_channel (int): number of channels for the first upsampling layer. This is divided by 2
+ for each consecutive upsampling layer.
+ upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer.
+ inference_padding (int): constant padding applied to the input at inference time. Defaults to 5.
+ """
+ super().__init__()
+ self.inference_padding = inference_padding
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_factors)
+ self.cond_in_each_up_layer = cond_in_each_up_layer
+
+ # initial upsampling layers
+ self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3))
+ resblock = ResBlock1 if resblock_type == "1" else ResBlock2
+ # upsampling layers
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_factors, upsample_kernel_sizes)):
+ self.ups.append(
+ weight_norm(
+ ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+ )
+ # MRF blocks
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch, k, d))
+ # post convolution layer
+ self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias))
+ if cond_channels > 0:
+ self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1)
+
+ if not conv_pre_weight_norm:
+ remove_parametrizations(self.conv_pre, "weight")
+
+ if not conv_post_weight_norm:
+ remove_parametrizations(self.conv_post, "weight")
+
+ if self.cond_in_each_up_layer:
+ self.conds = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ self.conds.append(nn.Conv1d(cond_channels, ch, 1))
+
+ def forward(self, x, g=None):
+ """
+ Args:
+ x (Tensor): feature input tensor.
+ g (Tensor): global conditioning input tensor.
+
+ Returns:
+ Tensor: output waveform.
+
+ Shapes:
+ x: [B, C, T]
+ Tensor: [B, 1, T]
+ """
+ o = self.conv_pre(x)
+ if hasattr(self, "cond_layer"):
+ o = o + self.cond_layer(g)
+ for i in range(self.num_upsamples):
+ o = F.leaky_relu(o, LRELU_SLOPE)
+ o = self.ups[i](o)
+
+ if self.cond_in_each_up_layer:
+ o = o + self.conds[i](g)
+
+ z_sum = None
+ for j in range(self.num_kernels):
+ if z_sum is None:
+ z_sum = self.resblocks[i * self.num_kernels + j](o)
+ else:
+ z_sum += self.resblocks[i * self.num_kernels + j](o)
+ o = z_sum / self.num_kernels
+ o = F.leaky_relu(o)
+ o = self.conv_post(o)
+ o = torch.tanh(o)
+ return o
+
+ @torch.no_grad()
+ def inference(self, c):
+ """
+ Args:
+ x (Tensor): conditioning input tensor.
+
+ Returns:
+ Tensor: output waveform.
+
+ Shapes:
+ x: [B, C, T]
+ Tensor: [B, 1, T]
+ """
+ c = c.to(self.conv_pre.weight.device)
+ c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate")
+ return self.forward(c)
+
+ def remove_weight_norm(self):
+ print("Removing weight norm...")
+ for l in self.ups:
+ remove_parametrizations(l, "weight")
+ for l in self.resblocks:
+ l.remove_weight_norm()
+ remove_parametrizations(self.conv_pre, "weight")
+ remove_parametrizations(self.conv_post, "weight")
+
+ def load_checkpoint(
+ self, config, checkpoint_path, eval=False, cache=False
+ ): # pylint: disable=unused-argument, redefined-builtin
+ state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+ self.load_state_dict(state["model"])
+ if eval:
+ self.eval()
+ assert not self.training
+ self.remove_weight_norm()
+
+
+class SELayer(nn.Module):
+ def __init__(self, channel, reduction=8):
+ super(SELayer, self).__init__()
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ self.fc = nn.Sequential(
+ nn.Linear(channel, channel // reduction),
+ nn.ReLU(inplace=True),
+ nn.Linear(channel // reduction, channel),
+ nn.Sigmoid(),
+ )
+
+ def forward(self, x):
+ b, c, _, _ = x.size()
+ y = self.avg_pool(x).view(b, c)
+ y = self.fc(y).view(b, c, 1, 1)
+ return x * y
+
+
+class SEBasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
+ super(SEBasicBlock, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.se = SELayer(planes, reduction)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.relu(out)
+ out = self.bn1(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.se(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+ return out
+
+
+def set_init_dict(model_dict, checkpoint_state, c=None):
+ # Partial initialization: layers that are missing or mismatched between the checkpoint and the model definition are skipped.
+ for k, v in checkpoint_state.items():
+ if k not in model_dict:
+ print(" | > Layer missing in the model definition: {}".format(k))
+ # 1. filter out unnecessary keys
+ pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict}
+ # 2. filter out different size layers
+ pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()}
+ # 3. skip reinit layers
+ if c is not None and c.has("reinit_layers") and c.reinit_layers is not None:
+ for reinit_layer_name in c.reinit_layers:
+ pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k}
+ # 4. overwrite entries in the existing state dict
+ model_dict.update(pretrained_dict)
+ print(" | > {} / {} layers are restored.".format(len(pretrained_dict), len(model_dict)))
+ return model_dict
+
+
+class PreEmphasis(nn.Module):
+ def __init__(self, coefficient=0.97):
+ super().__init__()
+ self.coefficient = coefficient
+ self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
+
+ def forward(self, x):
+ assert len(x.size()) == 2
+
+ x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
+ return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
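+ # This applies y[t] = x[t] - coefficient * x[t-1] as a 1-D convolution over a reflect-padded input.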
+
+
+class ResNetSpeakerEncoder(nn.Module):
+ """This is copied from 🐸TTS to remove it from the dependencies."""
+
+ # pylint: disable=W0102
+ def __init__(
+ self,
+ input_dim=64,
+ proj_dim=512,
+ layers=[3, 4, 6, 3],
+ num_filters=[32, 64, 128, 256],
+ encoder_type="ASP",
+ log_input=False,
+ use_torch_spec=False,
+ audio_config=None,
+ ):
+ super(ResNetSpeakerEncoder, self).__init__()
+
+ self.encoder_type = encoder_type
+ self.input_dim = input_dim
+ self.log_input = log_input
+ self.use_torch_spec = use_torch_spec
+ self.audio_config = audio_config
+ self.proj_dim = proj_dim
+
+ self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
+ self.relu = nn.ReLU(inplace=True)
+ self.bn1 = nn.BatchNorm2d(num_filters[0])
+
+ self.inplanes = num_filters[0]
+ self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
+ self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
+ self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
+ self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
+
+ self.instancenorm = nn.InstanceNorm1d(input_dim)
+
+ if self.use_torch_spec:
+ self.torch_spec = torch.nn.Sequential(
+ PreEmphasis(audio_config["preemphasis"]),
+ torchaudio.transforms.MelSpectrogram(
+ sample_rate=audio_config["sample_rate"],
+ n_fft=audio_config["fft_size"],
+ win_length=audio_config["win_length"],
+ hop_length=audio_config["hop_length"],
+ window_fn=torch.hamming_window,
+ n_mels=audio_config["num_mels"],
+ ),
+ )
+
+ else:
+ self.torch_spec = None
+
+ outmap_size = int(self.input_dim / 8)
+
+ self.attention = nn.Sequential(
+ nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
+ nn.ReLU(),
+ nn.BatchNorm1d(128),
+ nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
+ nn.Softmax(dim=2),
+ )
+
+ if self.encoder_type == "SAP":
+ out_dim = num_filters[3] * outmap_size
+ elif self.encoder_type == "ASP":
+ out_dim = num_filters[3] * outmap_size * 2
+ else:
+ raise ValueError("Undefined encoder")
+
+ self.fc = nn.Linear(out_dim, proj_dim)
+
+ self._init_layers()
+
+ def _init_layers(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+
+ def create_layer(self, block, planes, blocks, stride=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(block(self.inplanes, planes, stride, downsample))
+ self.inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(block(self.inplanes, planes))
+
+ return nn.Sequential(*layers)
+
+ # pylint: disable=R0201
+ def new_parameter(self, *size):
+ out = nn.Parameter(torch.FloatTensor(*size))
+ nn.init.xavier_normal_(out)
+ return out
+
+ def forward(self, x, l2_norm=False):
+ """Forward pass of the model.
+
+ Args:
+ x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
+ to compute the spectrogram on-the-fly.
+ l2_norm (bool): Whether to L2-normalize the outputs.
+
+ Shapes:
+ - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
+ """
+ x.squeeze_(1)
+ # if use_torch_spec is set, compute the spectrogram here; otherwise the input is expected to be a mel spectrogram computed by the audio processor
+ if self.use_torch_spec:
+ x = self.torch_spec(x)
+
+ if self.log_input:
+ x = (x + 1e-6).log()
+ x = self.instancenorm(x).unsqueeze(1)
+
+ x = self.conv1(x)
+ x = self.relu(x)
+ x = self.bn1(x)
+
+ x = self.layer1(x)
+ x = self.layer2(x)
+ x = self.layer3(x)
+ x = self.layer4(x)
+
+ x = x.reshape(x.size()[0], -1, x.size()[-1])
+
+ w = self.attention(x)
+
+ if self.encoder_type == "SAP":
+ x = torch.sum(x * w, dim=2)
+ elif self.encoder_type == "ASP":
+ mu = torch.sum(x * w, dim=2)
+ sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
+ x = torch.cat((mu, sg), 1)
+
+ x = x.view(x.size()[0], -1)
+ x = self.fc(x)
+
+ if l2_norm:
+ x = torch.nn.functional.normalize(x, p=2, dim=1)
+ return x
+
+ def load_checkpoint(
+ self,
+ checkpoint_path: str,
+ eval: bool = False,
+ use_cuda: bool = False,
+ criterion=None,
+ cache=False,
+ ):
+ state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
+ try:
+ self.load_state_dict(state["model"])
+ print(" > Model fully restored. ")
+ except (KeyError, RuntimeError) as error:
+ # If eval raise the error
+ if eval:
+ raise error
+
+ print(" > Partial model initialization.")
+ model_dict = self.state_dict()
+ model_dict = set_init_dict(model_dict, state["model"])
+ self.load_state_dict(model_dict)
+ del model_dict
+
+ # load the criterion for restore_path
+ if criterion is not None and "criterion" in state:
+ try:
+ criterion.load_state_dict(state["criterion"])
+ except (KeyError, RuntimeError) as error:
+ print(" > Criterion load ignored because of:", error)
+
+ if use_cuda:
+ self.cuda()
+ if criterion is not None:
+ criterion = criterion.cuda()
+
+ if eval:
+ self.eval()
+ assert not self.training
+
+ if not eval:
+ return criterion, state["step"]
+ return criterion
+
+
+class HifiDecoder(torch.nn.Module):
+ def __init__(
+ self,
+ input_sample_rate=22050,
+ output_sample_rate=24000,
+ output_hop_length=256,
+ ar_mel_length_compression=1024,
+ decoder_input_dim=1024,
+ resblock_type_decoder="1",
+ resblock_dilation_sizes_decoder=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+ resblock_kernel_sizes_decoder=[3, 7, 11],
+ upsample_rates_decoder=[8, 8, 2, 2],
+ upsample_initial_channel_decoder=512,
+ upsample_kernel_sizes_decoder=[16, 16, 4, 4],
+ d_vector_dim=512,
+ cond_d_vector_in_each_upsampling_layer=True,
+ speaker_encoder_audio_config={
+ "fft_size": 512,
+ "win_length": 400,
+ "hop_length": 160,
+ "sample_rate": 16000,
+ "preemphasis": 0.97,
+ "num_mels": 64,
+ },
+ ):
+ super().__init__()
+ self.input_sample_rate = input_sample_rate
+ self.output_sample_rate = output_sample_rate
+ self.output_hop_length = output_hop_length
+ self.ar_mel_length_compression = ar_mel_length_compression
+ self.speaker_encoder_audio_config = speaker_encoder_audio_config
+ self.waveform_decoder = HifiganGenerator(
+ decoder_input_dim,
+ 1,
+ resblock_type_decoder,
+ resblock_dilation_sizes_decoder,
+ resblock_kernel_sizes_decoder,
+ upsample_kernel_sizes_decoder,
+ upsample_initial_channel_decoder,
+ upsample_rates_decoder,
+ inference_padding=0,
+ cond_channels=d_vector_dim,
+ conv_pre_weight_norm=False,
+ conv_post_weight_norm=False,
+ conv_post_bias=False,
+ cond_in_each_up_layer=cond_d_vector_in_each_upsampling_layer,
+ )
+ self.speaker_encoder = ResNetSpeakerEncoder(
+ input_dim=64,
+ proj_dim=512,
+ log_input=True,
+ use_torch_spec=True,
+ audio_config=speaker_encoder_audio_config,
+ )
+
+ @property
+ def device(self):
+ return next(self.parameters()).device
+
+ def forward(self, latents, g=None):
+ """
+ Args:
+ x (Tensor): feature input tensor (GPT latent).
+ g (Tensor): global conditioning input tensor.
+
+ Returns:
+ Tensor: output waveform.
+
+ Shapes:
+ x: [B, C, T]
+ Tensor: [B, 1, T]
+ """
+
+ z = torch.nn.functional.interpolate(
+ latents.transpose(1, 2),
+ scale_factor=[self.ar_mel_length_compression / self.output_hop_length],
+ mode="linear",
+ ).squeeze(1)
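+ # With the default settings this stretches each GPT latent by 1024 / 256 = 4 frames, matching the
+ # AR model's code stride to the HiFi-GAN hop length before decoding.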
+ # upsample to the right sr
+ if self.output_sample_rate != self.input_sample_rate:
+ z = torch.nn.functional.interpolate(
+ z,
+ scale_factor=[self.output_sample_rate / self.input_sample_rate],
+ mode="linear",
+ ).squeeze(0)
+ o = self.waveform_decoder(z, g=g)
+ return o
+
+ @torch.no_grad()
+ def inference(self, c, g):
+ """
+ Args:
+ x (Tensor): feature input tensor (GPT latent).
+ g (Tensor): global conditioning input tensor.
+
+ Returns:
+ Tensor: output waveform.
+
+ Shapes:
+ x: [B, C, T]
+ Tensor: [B, 1, T]
+ """
+ return self.forward(c, g=g)
+
+ def load_checkpoint(self, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin
+ state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+ # remove unused keys
+ state = state["model"]
+ states_keys = list(state.keys())
+ for key in states_keys:
+ if "waveform_decoder." not in key and "speaker_encoder." not in key:
+ del state[key]
+
+ self.load_state_dict(state)
+ if eval:
+ self.eval()
+ assert not self.training
+ self.waveform_decoder.remove_weight_norm()
diff --git a/submodules/TTS/TTS/tts/layers/xtts/latent_encoder.py b/submodules/TTS/TTS/tts/layers/xtts/latent_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9d62a36f1529ddd1e9e6fdd92afc5c9f224f827
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/latent_encoder.py
@@ -0,0 +1,141 @@
+# Originally ported from: https://github.com/neonbjb/tortoise-tts
+
+import math
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+class GroupNorm32(nn.GroupNorm):
+ def forward(self, x):
+ return super().forward(x.float()).type(x.dtype)
+
+
+def conv_nd(dims, *args, **kwargs):
+ if dims == 1:
+ return nn.Conv1d(*args, **kwargs)
+ elif dims == 2:
+ return nn.Conv2d(*args, **kwargs)
+ elif dims == 3:
+ return nn.Conv3d(*args, **kwargs)
+ raise ValueError(f"unsupported dimensions: {dims}")
+
+
+def normalization(channels):
+ groups = 32
+ if channels <= 16:
+ groups = 8
+ elif channels <= 64:
+ groups = 16
+ while channels % groups != 0:
+ groups = int(groups / 2)
+ assert groups > 2
+ return GroupNorm32(groups, channels)
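+# For example, normalization(1024) yields GroupNorm32(32, 1024) and normalization(80) yields
+# GroupNorm32(16, 80); the loop only shrinks the group count until it divides the channel count.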
+
+
+def zero_module(module):
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
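+# Zero-initializing the output projection means the attention branch contributes nothing at the
+# start of training; each block initially just passes its (normalized) input through.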
+
+
+class QKVAttention(nn.Module):
+ def __init__(self, n_heads):
+ super().__init__()
+ self.n_heads = n_heads
+
+ def forward(self, qkv, mask=None, qk_bias=0):
+ """
+ Apply QKV attention.
+
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+ :param mask: an optional boolean attention mask; positions where it is False are excluded.
+ :param qk_bias: an optional additive bias applied to the attention logits.
+ :return: an [N x (H * C) x T] tensor after attention.
+ """
+ bs, width, length = qkv.shape
+ assert width % (3 * self.n_heads) == 0
+ ch = width // (3 * self.n_heads)
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+ scale = 1 / math.sqrt(math.sqrt(ch))
+ weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards
+ weight = weight + qk_bias
+ if mask is not None:
+ mask = mask.repeat(self.n_heads, 1, 1)
+ weight[mask.logical_not()] = -torch.inf
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+ a = torch.einsum("bts,bcs->bct", weight, v)
+
+ return a.reshape(bs, -1, length)
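+ # Scaling both q and k by 1/sqrt(sqrt(ch)) is equivalent to the usual 1/sqrt(ch) attention
+ # scaling, but applying it before the matmul is more numerically stable in float16.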
+
+
+class AttentionBlock(nn.Module):
+ """An attention block that allows spatial positions to attend to each other."""
+
+ def __init__(
+ self,
+ channels,
+ num_heads=1,
+ num_head_channels=-1,
+ out_channels=None,
+ do_activation=False,
+ ):
+ super().__init__()
+ self.channels = channels
+ out_channels = channels if out_channels is None else out_channels
+ self.do_activation = do_activation
+ if num_head_channels == -1:
+ self.num_heads = num_heads
+ else:
+ assert (
+ channels % num_head_channels == 0
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+ self.num_heads = channels // num_head_channels
+ self.norm = normalization(channels)
+ self.qkv = conv_nd(1, channels, out_channels * 3, 1)
+ self.attention = QKVAttention(self.num_heads)
+
+ self.x_proj = nn.Identity() if out_channels == channels else conv_nd(1, channels, out_channels, 1)
+ self.proj_out = zero_module(conv_nd(1, out_channels, out_channels, 1))
+
+ def forward(self, x, mask=None, qk_bias=0):
+ b, c, *spatial = x.shape
+ if mask is not None:
+ if len(mask.shape) == 2:
+ mask = mask.unsqueeze(0).repeat(x.shape[0], 1, 1)
+ if mask.shape[1] != x.shape[-1]:
+ mask = mask[:, : x.shape[-1], : x.shape[-1]]
+
+ x = x.reshape(b, c, -1)
+ x = self.norm(x)
+ if self.do_activation:
+ x = F.silu(x, inplace=True)
+ qkv = self.qkv(x)
+ h = self.attention(qkv, mask=mask, qk_bias=qk_bias)
+ h = self.proj_out(h)
+ xp = self.x_proj(x)
+ return (xp + h).reshape(b, xp.shape[1], *spatial)
+
+
+class ConditioningEncoder(nn.Module):
+ def __init__(
+ self,
+ spec_dim,
+ embedding_dim,
+ attn_blocks=6,
+ num_attn_heads=4,
+ ):
+ super().__init__()
+ attn = []
+ self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1)
+ for a in range(attn_blocks):
+ attn.append(AttentionBlock(embedding_dim, num_attn_heads))
+ self.attn = nn.Sequential(*attn)
+ self.dim = embedding_dim
+
+ def forward(self, x):
+ """
+ x: (b, 80, s)
+ """
+ h = self.init(x)
+ h = self.attn(h)
+ return h
diff --git a/submodules/TTS/TTS/tts/layers/xtts/perceiver_encoder.py b/submodules/TTS/TTS/tts/layers/xtts/perceiver_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b7ee79b5018c80ad04c5766e7cd446862097c09
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/perceiver_encoder.py
@@ -0,0 +1,319 @@
+# Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
+
+from collections import namedtuple
+from functools import wraps
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from einops.layers.torch import Rearrange
+from packaging import version
+from torch import einsum, nn
+
+
+def exists(val):
+ return val is not None
+
+
+def once(fn):
+ called = False
+
+ @wraps(fn)
+ def inner(x):
+ nonlocal called
+ if called:
+ return
+ called = True
+ return fn(x)
+
+ return inner
+
+
+print_once = once(print)
+
+# main class
+
+
+class Attend(nn.Module):
+ def __init__(self, dropout=0.0, causal=False, use_flash=False):
+ super().__init__()
+ self.dropout = dropout
+ self.attn_dropout = nn.Dropout(dropout)
+
+ self.causal = causal
+ self.register_buffer("mask", None, persistent=False)
+
+ self.use_flash = use_flash
+ assert not (
+ use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
+ ), "in order to use flash attention, you must be using pytorch 2.0 or above"
+
+ # determine efficient attention configs for cuda and cpu
+ self.config = namedtuple("EfficientAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"])
+ self.cpu_config = self.config(True, True, True)
+ self.cuda_config = None
+
+ if not torch.cuda.is_available() or not use_flash:
+ return
+
+ device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
+
+ if device_properties.major == 8 and device_properties.minor == 0:
+ print_once("A100 GPU detected, using flash attention if input tensor is on cuda")
+ self.cuda_config = self.config(True, False, False)
+ else:
+ print_once("Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda")
+ self.cuda_config = self.config(False, True, True)
+
+ def get_mask(self, n, device):
+ if exists(self.mask) and self.mask.shape[-1] >= n:
+ return self.mask[:n, :n]
+
+ mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1)
+ self.register_buffer("mask", mask, persistent=False)
+ return mask
+
+ def flash_attn(self, q, k, v, mask=None):
+ _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda
+
+ # Recommended for multi-query single-key-value attention by Tri Dao
+ # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
+
+ if k.ndim == 3:
+ k = rearrange(k, "b ... -> b 1 ...").expand_as(q)
+
+ if v.ndim == 3:
+ v = rearrange(v, "b ... -> b 1 ...").expand_as(q)
+
+ # Check if mask exists and expand to compatible shape
+ # The mask is B L, so it would have to be expanded to B H N L
+
+ if exists(mask):
+ mask = rearrange(mask, "b j -> b 1 1 j")
+ mask = mask.expand(-1, heads, q_len, -1)
+
+ # Check if there is a compatible device for flash attention
+
+ config = self.cuda_config if is_cuda else self.cpu_config
+
+ # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
+
+ with torch.backends.cuda.sdp_kernel(**config._asdict()):
+ out = F.scaled_dot_product_attention(
+ q, k, v, attn_mask=mask, dropout_p=self.dropout if self.training else 0.0, is_causal=self.causal
+ )
+
+ return out
+
+ def forward(self, q, k, v, mask=None):
+ """
+ einstein notation
+ b - batch
+ h - heads
+ n, i, j - sequence length (base sequence length, source, target)
+ d - feature dimension
+ """
+
+ n, device = q.shape[-2], q.device
+
+ scale = q.shape[-1] ** -0.5
+
+ if self.use_flash:
+ return self.flash_attn(q, k, v, mask=mask)
+
+ kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d"
+
+ # similarity
+
+ sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale
+
+ # key padding mask
+
+ if exists(mask):
+ mask = rearrange(mask, "b j -> b 1 1 j")
+ sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)
+
+ # causal mask
+
+ if self.causal:
+ causal_mask = self.get_mask(n, device)
+ sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)
+
+ # attention
+
+ attn = sim.softmax(dim=-1)
+ attn = self.attn_dropout(attn)
+
+ # aggregate values
+
+ out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v)
+
+ return out
+
+
+def Sequential(*mods):
+ return nn.Sequential(*filter(exists, mods))
+
+
+def exists(x):
+ return x is not None
+
+
+def default(val, d):
+ if exists(val):
+ return val
+ return d() if callable(d) else d
+
+
+class RMSNorm(nn.Module):
+ def __init__(self, dim, scale=True, dim_cond=None):
+ super().__init__()
+ self.cond = exists(dim_cond)
+ self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None
+
+ self.scale = dim**0.5
+ self.gamma = nn.Parameter(torch.ones(dim)) if scale else None
+
+ def forward(self, x, cond=None):
+ gamma = default(self.gamma, 1)
+ out = F.normalize(x, dim=-1) * self.scale * gamma
+
+ if not self.cond:
+ return out
+
+ assert exists(cond)
+ gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=-1)
+ gamma, beta = map(lambda t: rearrange(t, "b d -> b 1 d"), (gamma, beta))
+ return out * gamma + beta
+
+
+class CausalConv1d(nn.Conv1d):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ (kernel_size,) = self.kernel_size
+ (dilation,) = self.dilation
+ (stride,) = self.stride
+
+ assert stride == 1
+ self.causal_padding = dilation * (kernel_size - 1)
+
+ def forward(self, x):
+ causal_padded_x = F.pad(x, (self.causal_padding, 0), value=0.0)
+ return super().forward(causal_padded_x)
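+ # E.g. with kernel_size=3 and dilation=1, causal_padding is 2 and the input is padded on the left
+ # only, so each output frame depends solely on current and past frames.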
+
+
+class GEGLU(nn.Module):
+ def forward(self, x):
+ x, gate = x.chunk(2, dim=-1)
+ return F.gelu(gate) * x
+
+
+def FeedForward(dim, mult=4, causal_conv=False):
+ dim_inner = int(dim * mult * 2 / 3)
+
+ conv = None
+ if causal_conv:
+ conv = nn.Sequential(
+ Rearrange("b n d -> b d n"),
+ CausalConv1d(dim_inner, dim_inner, 3),
+ Rearrange("b d n -> b n d"),
+ )
+
+ return Sequential(nn.Linear(dim, dim_inner * 2), GEGLU(), conv, nn.Linear(dim_inner, dim))
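+# The first linear projects to 2 * dim_inner and GEGLU gates one half with the GELU of the other,
+# so the effective hidden width is int(dim * mult * 2 / 3), e.g. 2730 for dim=1024 and mult=4.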
+
+
+class PerceiverResampler(nn.Module):
+ def __init__(
+ self,
+ *,
+ dim,
+ depth=2,
+ dim_context=None,
+ num_latents=32,
+ dim_head=64,
+ heads=8,
+ ff_mult=4,
+ use_flash_attn=False,
+ ):
+ super().__init__()
+ dim_context = default(dim_context, dim)
+
+ self.proj_context = nn.Linear(dim_context, dim) if dim_context != dim else nn.Identity()
+
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
+ nn.init.normal_(self.latents, std=0.02)
+
+ self.layers = nn.ModuleList([])
+ for _ in range(depth):
+ self.layers.append(
+ nn.ModuleList(
+ [
+ Attention(
+ dim=dim,
+ dim_head=dim_head,
+ heads=heads,
+ use_flash=use_flash_attn,
+ cross_attn_include_queries=True,
+ ),
+ FeedForward(dim=dim, mult=ff_mult),
+ ]
+ )
+ )
+
+ self.norm = RMSNorm(dim)
+
+ def forward(self, x, mask=None):
+ batch = x.shape[0]
+
+ x = self.proj_context(x)
+
+ latents = repeat(self.latents, "n d -> b n d", b=batch)
+
+ for attn, ff in self.layers:
+ latents = attn(latents, x, mask=mask) + latents
+ latents = ff(latents) + latents
+
+ return self.norm(latents)
+
+
+class Attention(nn.Module):
+ def __init__(
+ self,
+ dim,
+ *,
+ dim_context=None,
+ causal=False,
+ dim_head=64,
+ heads=8,
+ dropout=0.0,
+ use_flash=False,
+ cross_attn_include_queries=False,
+ ):
+ super().__init__()
+ self.scale = dim_head**-0.5
+ self.heads = heads
+ self.cross_attn_include_queries = cross_attn_include_queries
+
+ dim_inner = dim_head * heads
+ dim_context = default(dim_context, dim)
+
+ self.attend = Attend(causal=causal, dropout=dropout, use_flash=use_flash)
+ self.to_q = nn.Linear(dim, dim_inner, bias=False)
+ self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False)
+ self.to_out = nn.Linear(dim_inner, dim, bias=False)
+
+ def forward(self, x, context=None, mask=None):
+ h, has_context = self.heads, exists(context)
+
+ context = default(context, x)
+
+ if has_context and self.cross_attn_include_queries:
+ context = torch.cat((x, context), dim=-2)
+
+ q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1))
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
+
+ out = self.attend(q, k, v, mask=mask)
+
+ out = rearrange(out, "b h n d -> b n (h d)")
+ return self.to_out(out)
diff --git a/submodules/TTS/TTS/tts/layers/xtts/stream_generator.py b/submodules/TTS/TTS/tts/layers/xtts/stream_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e12f8995cf16de94b971036f6b88d255cd42ec6e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/stream_generator.py
@@ -0,0 +1,930 @@
+# Adapted from: https://github.com/LowinLi/transformers-stream-generator
+
+import copy
+import inspect
+import random
+import warnings
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch import nn
+from transformers import (
+ BeamSearchScorer,
+ ConstrainedBeamSearchScorer,
+ DisjunctiveConstraint,
+ GenerationConfig,
+ GenerationMixin,
+ LogitsProcessorList,
+ PhrasalConstraint,
+ PreTrainedModel,
+ StoppingCriteriaList,
+)
+from transformers.generation.utils import GenerateOutput, SampleOutput, logger
+
+
+def setup_seed(seed):
+ if seed == -1:
+ return
+ torch.manual_seed(seed)
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed_all(seed)
+ np.random.seed(seed)
+ random.seed(seed)
+ torch.backends.cudnn.deterministic = True
+
+
+class StreamGenerationConfig(GenerationConfig):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.do_stream = kwargs.pop("do_stream", False)
+
+
+class NewGenerationMixin(GenerationMixin):
+ @torch.no_grad()
+ def generate(
+ self,
+ inputs: Optional[torch.Tensor] = None,
+ generation_config: Optional[StreamGenerationConfig] = None,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+ synced_gpus: Optional[bool] = False,
+ seed=0,
+ **kwargs,
+ ) -> Union[GenerateOutput, torch.LongTensor]:
+ r"""
+
+ Generates sequences of token ids for models with a language modeling head.
+
+
+
+ Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+ model's default generation configuration. You can override any `generation_config` by passing the corresponding
+ parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+ For an overview of generation strategies and code examples, check out the [following
+ guide](./generation_strategies).
+
+
+
+ Parameters:
+ inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
+ The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+ method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
+ should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
+ `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+ generation_config (`~generation.GenerationConfig`, *optional*):
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+ passed to generate matching the attributes of `generation_config` will override them. If
+ `generation_config` is not provided, the default will be used, which has the following loading
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+ default values, whose documentation should be checked to parameterize generation.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ Custom logits processors that complement the default logits processors built from arguments and
+ generation config. If a logit processor is passed that is already created with the arguments or a
+ generation config an error is thrown. This feature is intended for advanced users.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ Custom stopping criteria that complement the default stopping criteria built from arguments and a
+ generation config. If a stopping criteria is passed that is already created with the arguments or a
+ generation config an error is thrown. This feature is intended for advanced users.
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+ If provided, this function constrains the beam search to allowed tokens only at each step. If not
+ provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+ `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+ on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+ for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+ Retrieval](https://arxiv.org/abs/2010.00904).
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+ kwargs:
+ Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+ forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+ specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+
+ Return:
+ [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+ or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
+
+ If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
+ [`~utils.ModelOutput`] types are:
+
+ - [`~generation.GreedySearchDecoderOnlyOutput`],
+ - [`~generation.SampleDecoderOnlyOutput`],
+ - [`~generation.BeamSearchDecoderOnlyOutput`],
+ - [`~generation.BeamSampleDecoderOnlyOutput`]
+
+ If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+ [`~utils.ModelOutput`] types are:
+
+ - [`~generation.GreedySearchEncoderDecoderOutput`],
+ - [`~generation.SampleEncoderDecoderOutput`],
+ - [`~generation.BeamSearchEncoderDecoderOutput`],
+ - [`~generation.BeamSampleEncoderDecoderOutput`]
+ """
+ # setup_seed(seed)
+ # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+ self._validate_model_class()
+
+ # priority: `generation_config` argument > `model.generation_config` (the default generation config)
+ if generation_config is None:
+ # legacy: users may modify the model configuration to control generation -- update the generation config
+ # model attribute accordingly, if it was created from the model config
+ if self.generation_config._from_model_config:
+ new_generation_config = StreamGenerationConfig.from_model_config(self.config)
+ if new_generation_config != self.generation_config:
+ warnings.warn(
+ "You have modified the pretrained model configuration to control generation. This is a"
+ " deprecated strategy to control generation and will be removed soon, in a future version."
+ " Please use a generation configuration file (see"
+ " https://huggingface.co/docs/transformers/main_classes/text_generation)"
+ )
+ self.generation_config = new_generation_config
+ generation_config = self.generation_config
+
+ generation_config = copy.deepcopy(generation_config)
+ model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
+ # self._validate_model_kwargs(model_kwargs.copy())
+
+ # 2. Set generation parameters if not already defined
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+ if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
+ if model_kwargs.get("attention_mask", None) is None:
+ logger.warning(
+ "The attention mask and the pad token id were not set. As a consequence, you may observe "
+ "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
+ )
+ eos_token_id = generation_config.eos_token_id
+ if isinstance(eos_token_id, list):
+ eos_token_id = eos_token_id[0]
+ logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
+ generation_config.pad_token_id = eos_token_id
+
+ # 3. Define model inputs
+ # inputs_tensor has to be defined
+ # model_input_name is defined if model-specific keyword input is passed
+ # otherwise model_input_name is None
+ # all model-specific keyword inputs are removed from `model_kwargs`
+ inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
+ inputs, generation_config.bos_token_id, model_kwargs
+ )
+ batch_size = inputs_tensor.shape[0]
+
+ # 4. Define other model kwargs
+ model_kwargs["output_attentions"] = generation_config.output_attentions
+ model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
+ model_kwargs["use_cache"] = generation_config.use_cache
+
+ accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
+ requires_attention_mask = "encoder_outputs" not in model_kwargs
+
+ if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask:
+ model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
+ inputs_tensor,
+ generation_config.pad_token_id,
+ generation_config.eos_token_id,
+ )
+
+ # decoder-only models should use left-padding for generation
+ if not self.config.is_encoder_decoder:
+ if (
+ generation_config.pad_token_id is not None
+ and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id) > 0
+ ):
+ logger.warning(
+ "A decoder-only architecture is being used, but right-padding was detected! For correct "
+ "generation results, please set `padding_side='left'` when initializing the tokenizer."
+ )
+
+ if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
+ # if model is encoder decoder encoder_outputs are created
+ # and added to `model_kwargs`
+ model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+ inputs_tensor, model_kwargs, model_input_name
+ )
+
+ # 5. Prepare `input_ids` which will be used for auto-regressive generation
+ if self.config.is_encoder_decoder:
+ input_ids = self._prepare_decoder_input_ids_for_generation(
+ batch_size,
+ decoder_start_token_id=generation_config.decoder_start_token_id,
+ bos_token_id=generation_config.bos_token_id,
+ model_kwargs=model_kwargs,
+ device=inputs_tensor.device,
+ )
+ else:
+ # if decoder-only then inputs_tensor has to be `input_ids`
+ input_ids = inputs_tensor
+
+ # 6. Prepare `max_length` depending on other stopping criteria.
+ input_ids_seq_length = input_ids.shape[-1]
+ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+ if has_default_max_length and generation_config.max_new_tokens is None:
+ warnings.warn(
+ "Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to"
+ f" {generation_config.max_length} (`generation_config.max_length`). Controlling `max_length` via the"
+ " config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we"
+ " recommend using `max_new_tokens` to control the maximum length of the generation.",
+ UserWarning,
+ )
+ elif has_default_max_length and generation_config.max_new_tokens is not None:
+ generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+ elif not has_default_max_length and generation_config.max_new_tokens is not None:
+ raise ValueError(
+ "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a"
+ " limit to the generated output length. Remove one of those arguments. Please refer to the"
+ " documentation for more information. "
+ "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+ )
+
+ if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
+ raise ValueError(
+ f"Unfeasible length constraints: the minimum length ({generation_config.min_length}) is larger than"
+ f" the maximum length ({generation_config.max_length})"
+ )
+ if input_ids_seq_length >= generation_config.max_length:
+ input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+ logger.warning(
+ f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+ f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+ " increasing `max_new_tokens`."
+ )
+
+ # 7. determine generation mode
+ is_constraint_gen_mode = (
+ generation_config.constraints is not None or generation_config.force_words_ids is not None
+ )
+
+ is_contrastive_search_gen_mode = (
+ generation_config.top_k is not None
+ and generation_config.top_k > 1
+ and generation_config.do_sample is False
+ and generation_config.penalty_alpha is not None
+ and generation_config.penalty_alpha > 0
+ )
+
+ is_greedy_gen_mode = (
+ (generation_config.num_beams == 1)
+ and (generation_config.num_beam_groups == 1)
+ and generation_config.do_sample is False
+ and not is_constraint_gen_mode
+ and not is_contrastive_search_gen_mode
+ )
+ is_sample_gen_mode = (
+ (generation_config.num_beams == 1)
+ and (generation_config.num_beam_groups == 1)
+ and generation_config.do_sample is True
+ and generation_config.do_stream is False
+ and not is_constraint_gen_mode
+ and not is_contrastive_search_gen_mode
+ )
+ is_sample_gen_stream_mode = (
+ (generation_config.num_beams == 1)
+ and (generation_config.num_beam_groups == 1)
+ and generation_config.do_stream is True
+ and not is_constraint_gen_mode
+ and not is_contrastive_search_gen_mode
+ )
+ is_beam_gen_mode = (
+ (generation_config.num_beams > 1)
+ and (generation_config.num_beam_groups == 1)
+ and generation_config.do_sample is False
+ and not is_constraint_gen_mode
+ and not is_contrastive_search_gen_mode
+ )
+ is_beam_sample_gen_mode = (
+ (generation_config.num_beams > 1)
+ and (generation_config.num_beam_groups == 1)
+ and generation_config.do_sample is True
+ and not is_constraint_gen_mode
+ and not is_contrastive_search_gen_mode
+ )
+ is_group_beam_gen_mode = (
+ (generation_config.num_beams > 1)
+ and (generation_config.num_beam_groups > 1)
+ and not is_constraint_gen_mode
+ and not is_contrastive_search_gen_mode
+ )
+
+ if generation_config.num_beam_groups > generation_config.num_beams:
+ raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`")
+ if is_group_beam_gen_mode and generation_config.do_sample is True:
+ raise ValueError(
+ "Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is set to `False`."
+ )
+
+ if self.device.type != input_ids.device.type:
+ warnings.warn(
+ "You are calling .generate() with the `input_ids` being on a device type different"
+ f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
+ f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
+ " Please make sure that you have put `input_ids` to the"
+ f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
+ " running `.generate()`.",
+ UserWarning,
+ )
+ # 8. prepare distribution pre_processing samplers
+ logits_processor = self._get_logits_processor(
+ generation_config=generation_config,
+ input_ids_seq_length=input_ids_seq_length,
+ encoder_input_ids=inputs_tensor,
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+ logits_processor=logits_processor,
+ )
+
+ # 9. prepare stopping criteria
+ stopping_criteria = self._get_stopping_criteria(
+ generation_config=generation_config, stopping_criteria=stopping_criteria
+ )
+ # 10. go into different generation modes
+ if is_greedy_gen_mode:
+ if generation_config.num_return_sequences > 1:
+ raise ValueError(
+ f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
+ " greedy search."
+ )
+
+ # 11. run greedy search
+ return self.greedy_search(
+ input_ids,
+ logits_processor=logits_processor,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+
+ elif is_contrastive_search_gen_mode:
+ if generation_config.num_return_sequences > 1:
+ raise ValueError(
+ f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
+ " contrastive search."
+ )
+
+ return self.contrastive_search(
+ input_ids,
+ top_k=generation_config.top_k,
+ penalty_alpha=generation_config.penalty_alpha,
+ logits_processor=logits_processor,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+
+ elif is_sample_gen_mode:
+ # 11. prepare logits warper
+ logits_warper = self._get_logits_warper(generation_config)
+
+ # 12. expand input_ids with `num_return_sequences` additional sequences per batch
+ input_ids, model_kwargs = self._expand_inputs_for_generation(
+ input_ids=input_ids,
+ expand_size=generation_config.num_return_sequences,
+ is_encoder_decoder=self.config.is_encoder_decoder,
+ **model_kwargs,
+ )
+
+ # 13. run sample
+ return self.sample(
+ input_ids,
+ logits_processor=logits_processor,
+ logits_warper=logits_warper,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+ elif is_sample_gen_stream_mode:
+ # 11. prepare logits warper
+ logits_warper = self._get_logits_warper(generation_config)
+
+ # 12. expand input_ids with `num_return_sequences` additional sequences per batch
+ input_ids, model_kwargs = self._expand_inputs_for_generation(
+ input_ids=input_ids,
+ expand_size=generation_config.num_return_sequences,
+ is_encoder_decoder=self.config.is_encoder_decoder,
+ **model_kwargs,
+ )
+
+ # 13. run sample
+ return self.sample_stream(
+ input_ids,
+ logits_processor=logits_processor,
+ logits_warper=logits_warper,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+ elif is_beam_gen_mode:
+ if generation_config.num_return_sequences > generation_config.num_beams:
+ raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
+
+ if stopping_criteria.max_length is None:
+ raise ValueError("`max_length` needs to be a stopping_criteria for now.")
+
+ # 11. prepare beam search scorer
+ beam_scorer = BeamSearchScorer(
+ batch_size=batch_size,
+ num_beams=generation_config.num_beams,
+ device=inputs_tensor.device,
+ length_penalty=generation_config.length_penalty,
+ do_early_stopping=generation_config.early_stopping,
+ num_beam_hyps_to_keep=generation_config.num_return_sequences,
+ )
+ # 12. interleave input_ids with `num_beams` additional sequences per batch
+ input_ids, model_kwargs = self._expand_inputs_for_generation(
+ input_ids=input_ids,
+ expand_size=generation_config.num_beams,
+ is_encoder_decoder=self.config.is_encoder_decoder,
+ **model_kwargs,
+ )
+ # 13. run beam search
+ return self.beam_search(
+ input_ids,
+ beam_scorer,
+ logits_processor=logits_processor,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+
+ elif is_beam_sample_gen_mode:
+ # 11. prepare logits warper
+ logits_warper = self._get_logits_warper(generation_config)
+
+ if stopping_criteria.max_length is None:
+ raise ValueError("`max_length` needs to be a stopping_criteria for now.")
+ # 12. prepare beam search scorer
+ beam_scorer = BeamSearchScorer(
+ batch_size=batch_size * generation_config.num_return_sequences,
+ num_beams=generation_config.num_beams,
+ device=inputs_tensor.device,
+ length_penalty=generation_config.length_penalty,
+ do_early_stopping=generation_config.early_stopping,
+ )
+
+ # 13. interleave input_ids with `num_beams` additional sequences per batch
+ input_ids, model_kwargs = self._expand_inputs_for_generation(
+ input_ids=input_ids,
+ expand_size=generation_config.num_beams * generation_config.num_return_sequences,
+ is_encoder_decoder=self.config.is_encoder_decoder,
+ **model_kwargs,
+ )
+
+ # 14. run beam sample
+ return self.beam_sample(
+ input_ids,
+ beam_scorer,
+ logits_processor=logits_processor,
+ logits_warper=logits_warper,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+
+ elif is_group_beam_gen_mode:
+ if generation_config.num_return_sequences > generation_config.num_beams:
+ raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
+
+ if generation_config.num_beams % generation_config.num_beam_groups != 0:
+ raise ValueError("`num_beams` should be divisible by `num_beam_groups` for group beam search.")
+
+ if stopping_criteria.max_length is None:
+ raise ValueError("`max_length` needs to be a stopping_criteria for now.")
+
+ has_default_typical_p = kwargs.get("typical_p") is None and generation_config.typical_p == 1.0
+ if not has_default_typical_p:
+ raise ValueError("Decoder argument `typical_p` is not supported with beam groups.")
+
+ # 11. prepare beam search scorer
+ beam_scorer = BeamSearchScorer(
+ batch_size=batch_size,
+ num_beams=generation_config.num_beams,
+ max_length=stopping_criteria.max_length,
+ device=inputs_tensor.device,
+ length_penalty=generation_config.length_penalty,
+ do_early_stopping=generation_config.early_stopping,
+ num_beam_hyps_to_keep=generation_config.num_return_sequences,
+ num_beam_groups=generation_config.num_beam_groups,
+ )
+ # 12. interleave input_ids with `num_beams` additional sequences per batch
+ input_ids, model_kwargs = self._expand_inputs_for_generation(
+ input_ids=input_ids,
+ expand_size=generation_config.num_beams,
+ is_encoder_decoder=self.config.is_encoder_decoder,
+ **model_kwargs,
+ )
+ # 13. run beam search
+ return self.group_beam_search(
+ input_ids,
+ beam_scorer,
+ logits_processor=logits_processor,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+
+ elif is_constraint_gen_mode:
+ if generation_config.num_return_sequences > generation_config.num_beams:
+ raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
+
+ if stopping_criteria.max_length is None:
+ raise ValueError("`max_length` needs to be a stopping_criteria for now.")
+
+ if generation_config.num_beams <= 1:
+ raise ValueError("`num_beams` needs to be greater than 1 for constrained generation.")
+
+ if generation_config.do_sample:
+ raise ValueError("`do_sample` needs to be false for constrained generation.")
+
+ if generation_config.num_beam_groups is not None and generation_config.num_beam_groups > 1:
+ raise ValueError("`num_beam_groups` not supported yet for constrained generation.")
+
+ final_constraints = []
+ if generation_config.constraints is not None:
+ final_constraints = generation_config.constraints
+
+ if generation_config.force_words_ids is not None:
+
+ def typeerror():
+ raise ValueError(
+ "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]`"
+ f" of positive integers, but is {generation_config.force_words_ids}."
+ )
+
+ if (
+ not isinstance(generation_config.force_words_ids, list)
+ or len(generation_config.force_words_ids) == 0
+ ):
+ typeerror()
+
+ for word_ids in generation_config.force_words_ids:
+ if isinstance(word_ids[0], list):
+ if not isinstance(word_ids, list) or len(word_ids) == 0:
+ typeerror()
+ if any(not isinstance(token_ids, list) for token_ids in word_ids):
+ typeerror()
+ if any(
+ any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids)
+ for token_ids in word_ids
+ ):
+ typeerror()
+
+ constraint = DisjunctiveConstraint(word_ids)
+ else:
+ if not isinstance(word_ids, list) or len(word_ids) == 0:
+ typeerror()
+ if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids):
+ typeerror()
+
+ constraint = PhrasalConstraint(word_ids)
+ final_constraints.append(constraint)
+
+ # 11. prepare beam search scorer
+ constrained_beam_scorer = ConstrainedBeamSearchScorer(
+ constraints=final_constraints,
+ batch_size=batch_size,
+ num_beams=generation_config.num_beams,
+ device=inputs_tensor.device,
+ length_penalty=generation_config.length_penalty,
+ do_early_stopping=generation_config.early_stopping,
+ num_beam_hyps_to_keep=generation_config.num_return_sequences,
+ )
+ # 12. interleave input_ids with `num_beams` additional sequences per batch
+ input_ids, model_kwargs = self._expand_inputs_for_generation(
+ input_ids=input_ids,
+ expand_size=generation_config.num_beams,
+ is_encoder_decoder=self.config.is_encoder_decoder,
+ **model_kwargs,
+ )
+ # 13. run beam search
+ return self.constrained_beam_search(
+ input_ids,
+ constrained_beam_scorer=constrained_beam_scorer,
+ logits_processor=logits_processor,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=generation_config.pad_token_id,
+ eos_token_id=generation_config.eos_token_id,
+ output_scores=generation_config.output_scores,
+ return_dict_in_generate=generation_config.return_dict_in_generate,
+ synced_gpus=synced_gpus,
+ **model_kwargs,
+ )
+
+ @torch.no_grad()
+ def sample_stream(
+ self,
+ input_ids: torch.LongTensor,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ logits_warper: Optional[LogitsProcessorList] = None,
+ max_length: Optional[int] = None,
+ pad_token_id: Optional[int] = None,
+ eos_token_id: Optional[Union[int, List[int]]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_scores: Optional[bool] = None,
+ return_dict_in_generate: Optional[bool] = None,
+ synced_gpus: Optional[bool] = False,
+ **model_kwargs,
+ ) -> Union[SampleOutput, torch.LongTensor]:
+ r"""
+ Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+
+
+ In most cases, you do not need to call `sample_stream` directly. Use `generate()` with `do_stream=True` instead.
+ For an overview of generation strategies and code examples, check the [following
+ guide](./generation_strategies).
+
+
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ logits_warper (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
+ to warp the prediction score distribution of the language modeling head applied before multinomial
+ sampling at each generation step.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`int`, *optional*):
+ The id of the *end-of-sequence* token.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+ model_kwargs:
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+ Return:
+ [`~generation.SampleDecoderOnlyOutput`], [`~generation.SampleEncoderDecoderOutput`] or `torch.LongTensor`:
+ A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation.SampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a [`~generation.SampleEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
+
+ Examples:
+
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForCausalLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... TopKLogitsWarper,
+ ... TemperatureLogitsWarper,
+ ... StoppingCriteriaList,
+ ... MaxLengthCriteria,
+ ... )
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+ >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+ >>> model.config.pad_token_id = model.config.eos_token_id
+ >>> model.generation_config.pad_token_id = model.config.eos_token_id
+
+ >>> input_prompt = "Today is a beautiful day, and"
+ >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList(
+ ... [
+ ... MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
+ ... ]
+ ... )
+ >>> # instantiate logits processors
+ >>> logits_warper = LogitsProcessorList(
+ ... [
+ ... TopKLogitsWarper(50),
+ ... TemperatureLogitsWarper(0.7),
+ ... ]
+ ... )
+
+ >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
+
+ >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
+ >>> outputs = model.sample(
+ ... input_ids,
+ ... logits_processor=logits_processor,
+ ... logits_warper=logits_warper,
+ ... stopping_criteria=stopping_criteria,
+ ... )
+
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+ ['Today is a beautiful day, and a wonderful day.\n\nI was lucky enough to meet the']
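+
+ # Note: unlike `sample`, `sample_stream` is a generator that yields `(next_tokens, hidden)`
+ # pairs step by step. A hypothetical consumption loop (it assumes the concrete model
+ # defines the `final_norm` attribute used below and is run with `output_hidden_states=True`):
+ #
+ # for tokens, hidden in model.sample_stream(
+ #     input_ids,
+ #     logits_warper=logits_warper,
+ #     stopping_criteria=stopping_criteria,
+ #     output_hidden_states=True,
+ # ):
+ #     print(tokenizer.decode(tokens, skip_special_tokens=True))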
+ ```"""
+ # init values
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+ if max_length is not None:
+ warnings.warn(
+ "`max_length` is deprecated in this function, use"
+ " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+ UserWarning,
+ )
+ stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+ logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
+ pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+ eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+ if isinstance(eos_token_id, int):
+ eos_token_id = [eos_token_id]
+ output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+ output_attentions = (
+ output_attentions if output_attentions is not None else self.generation_config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+ )
+ return_dict_in_generate = (
+ return_dict_in_generate
+ if return_dict_in_generate is not None
+ else self.generation_config.return_dict_in_generate
+ )
+
+ # init attention / hidden states / scores tuples
+ scores = () if (return_dict_in_generate and output_scores) else None
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+ # keep track of which sequences are already finished
+ unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+
+ this_peer_finished = False # used by synced_gpus only
+ # auto-regressive generation
+ while True:
+ if synced_gpus:
+ # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+ # The following logic allows an early break if all peers finished generating their sequence
+ this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+ # send 0.0 if we finished, 1.0 otherwise
+ dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+ # did all peers finish? the reduced sum will be 0.0 then
+ if this_peer_finished_flag.item() == 0.0:
+ break
+
+ # prepare model inputs
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+ # forward pass to get next token
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ )
+
+ if synced_gpus and this_peer_finished:
+ continue # don't waste resources running the code we don't need
+
+ next_token_logits = outputs.logits[:, -1, :]
+
+ # pre-process distribution
+ next_token_scores = logits_processor(input_ids, next_token_logits)
+ next_token_scores = logits_warper(input_ids, next_token_scores)
+
+ # Store scores, attentions and hidden_states when required
+ if return_dict_in_generate:
+ if output_scores:
+ scores += (next_token_scores,)
+ if output_attentions:
+ decoder_attentions += (
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+ )
+ if self.config.is_encoder_decoder:
+ cross_attentions += (outputs.cross_attentions,)
+
+ if output_hidden_states:
+ decoder_hidden_states += (
+ (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,)
+ )
+
+ # sample
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+
+ # finished sentences should have their next token be a padding token
+ if eos_token_id is not None:
+ if pad_token_id is None:
+ raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
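+ # stream out the tokens sampled at this step, together with the final-norm projection of the
+ # last hidden state (this relies on the concrete model exposing `final_norm` and on
+ # `output_hidden_states=True`; otherwise `outputs.hidden_states` is None)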
+ yield next_tokens, self.final_norm(outputs.hidden_states[-1][:, -1])
+ # update generated ids, model inputs, and length for next step
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+ model_kwargs = self._update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+ )
+
+ # if eos_token was found in one sentence, set sentence to finished
+ if eos_token_id is not None:
+ unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+
+ # stop when each sentence is finished, or if we exceed the maximum length
+ if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+ if not synced_gpus:
+ break
+ else:
+ this_peer_finished = True
+
+
+def init_stream_support():
+ """Overload PreTrainedModel for streaming."""
+ PreTrainedModel.generate_stream = NewGenerationMixin.generate
+ PreTrainedModel.sample_stream = NewGenerationMixin.sample_stream
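+
+
+# Rough usage sketch (see also the `__main__` demo below). It assumes the model's generation
+# config is the `StreamGenerationConfig` used by `generate` above and that the model defines
+# the `final_norm` attribute required by `sample_stream`:
+#
+#   init_stream_support()
+#   generator = model.generate_stream(input_ids, do_sample=True, do_stream=True, max_new_tokens=64)
+#   for tokens, hidden in generator:
+#       print(tokenizer.decode(tokens, skip_special_tokens=True))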
+
+
+if __name__ == "__main__":
+ from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
+
+ PreTrainedModel.generate = NewGenerationMixin.generate
+ PreTrainedModel.sample_stream = NewGenerationMixin.sample_stream
+ model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.float16)
+
+ tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+ model = model.to("cuda:0")
+ model = model.eval()
+ prompt_text = "hello? \n"
+ input_ids = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).input_ids
+ input_ids = input_ids.to("cuda:0")
+
+ with torch.no_grad():
+ result = model.generate(
+ input_ids,
+ max_new_tokens=200,
+ do_sample=True,
+ top_k=30,
+ top_p=0.85,
+ temperature=0.35,
+ repetition_penalty=1.2,
+ early_stopping=True,
+ seed=0,
+ )
+ print(tokenizer.decode(result[0], skip_special_tokens=True))
+ generator = model.generate(
+ input_ids,
+ max_new_tokens=200,
+ do_sample=True,
+ top_k=30,
+ top_p=0.85,
+ temperature=0.35,
+ repetition_penalty=1.2,
+ early_stopping=True,
+ seed=0,
+ do_stream=True,
+ )
+ stream_result = ""
+ for x, _ in generator:
+ chunk = tokenizer.decode(x, skip_special_tokens=True)
+ stream_result += chunk
+ print(stream_result)
diff --git a/submodules/TTS/TTS/tts/layers/xtts/tokenizer.py b/submodules/TTS/TTS/tts/layers/xtts/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a3cc47aafa0536031cd453ce500c797df28a02b
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/tokenizer.py
@@ -0,0 +1,843 @@
+import os
+import re
+import textwrap
+from functools import cached_property
+
+import pypinyin
+import torch
+from hangul_romanize import Transliter
+from hangul_romanize.rule import academic
+from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
+from tokenizers import Tokenizer
+
+from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
+
+
+def get_spacy_lang(lang):
+ if lang == "zh":
+ return Chinese()
+ elif lang == "ja":
+ return Japanese()
+ elif lang == "ar":
+ return Arabic()
+ elif lang == "es":
+ return Spanish()
+ else:
+ # For most languages, English does the job
+ return English()
+
+
+def split_sentence(text, lang, text_split_length=250):
+ """Preprocess the input text"""
+ text_splits = []
+ if text_split_length is not None and len(text) >= text_split_length:
+ text_splits.append("")
+ nlp = get_spacy_lang(lang)
+ nlp.add_pipe("sentencizer")
+ doc = nlp(text)
+ for sentence in doc.sents:
+ if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
+ # if the last sentence + the current sentence is less than the text_split_length
+ # then add the current sentence to the last sentence
+ text_splits[-1] += " " + str(sentence)
+ text_splits[-1] = text_splits[-1].lstrip()
+ elif len(str(sentence)) > text_split_length:
+ # if the current sentence is greater than the text_split_length
+ for line in textwrap.wrap(
+ str(sentence),
+ width=text_split_length,
+ drop_whitespace=True,
+ break_on_hyphens=False,
+ tabsize=1,
+ ):
+ text_splits.append(str(line))
+ else:
+ text_splits.append(str(sentence))
+
+ if len(text_splits) > 1:
+ if text_splits[0] == "":
+ del text_splits[0]
+ else:
+ text_splits = [text.lstrip()]
+
+ return text_splits
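+
+
+# Illustration (indicative): with text_split_length=20,
+#   split_sentence("First sentence. Second sentence.", "en", 20)
+# returns ["First sentence.", "Second sentence."], while inputs shorter than the limit
+# are returned as a single-element list.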
+
+
+_whitespace_re = re.compile(r"\s+")
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = {
+ "en": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("mrs", "misess"),
+ ("mr", "mister"),
+ ("dr", "doctor"),
+ ("st", "saint"),
+ ("co", "company"),
+ ("jr", "junior"),
+ ("maj", "major"),
+ ("gen", "general"),
+ ("drs", "doctors"),
+ ("rev", "reverend"),
+ ("lt", "lieutenant"),
+ ("hon", "honorable"),
+ ("sgt", "sergeant"),
+ ("capt", "captain"),
+ ("esq", "esquire"),
+ ("ltd", "limited"),
+ ("col", "colonel"),
+ ("ft", "fort"),
+ ]
+ ],
+ "es": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("sra", "señora"),
+ ("sr", "señor"),
+ ("dr", "doctor"),
+ ("dra", "doctora"),
+ ("st", "santo"),
+ ("co", "compañía"),
+ ("jr", "junior"),
+ ("ltd", "limitada"),
+ ]
+ ],
+ "fr": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("mme", "madame"),
+ ("mr", "monsieur"),
+ ("dr", "docteur"),
+ ("st", "saint"),
+ ("co", "compagnie"),
+ ("jr", "junior"),
+ ("ltd", "limitée"),
+ ]
+ ],
+ "de": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("fr", "frau"),
+ ("dr", "doktor"),
+ ("st", "sankt"),
+ ("co", "firma"),
+ ("jr", "junior"),
+ ]
+ ],
+ "pt": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("sra", "senhora"),
+ ("sr", "senhor"),
+ ("dr", "doutor"),
+ ("dra", "doutora"),
+ ("st", "santo"),
+ ("co", "companhia"),
+ ("jr", "júnior"),
+ ("ltd", "limitada"),
+ ]
+ ],
+ "it": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ # ("sig.ra", "signora"),
+ ("sig", "signore"),
+ ("dr", "dottore"),
+ ("st", "santo"),
+ ("co", "compagnia"),
+ ("jr", "junior"),
+ ("ltd", "limitata"),
+ ]
+ ],
+ "pl": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("p", "pani"),
+ ("m", "pan"),
+ ("dr", "doktor"),
+ ("sw", "święty"),
+ ("jr", "junior"),
+ ]
+ ],
+ "ar": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ # There are not as many common abbreviations in Arabic as in English.
+ ]
+ ],
+ "zh": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
+ ]
+ ],
+ "cs": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("dr", "doktor"), # doctor
+ ("ing", "inženýr"), # engineer
+ ("p", "pan"), # Could also map to pani for woman but no easy way to do it
+ # Other abbreviations would be specialized and not as common.
+ ]
+ ],
+ "ru": [
+ (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("г-жа", "госпожа"), # Mrs.
+ ("г-н", "господин"), # Mr.
+ ("д-р", "доктор"), # doctor
+ # Other abbreviations are less common or specialized.
+ ]
+ ],
+ "nl": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("dhr", "de heer"), # Mr.
+ ("mevr", "mevrouw"), # Mrs.
+ ("dr", "dokter"), # doctor
+ ("jhr", "jonkheer"), # young lord or nobleman
+ # Dutch uses more abbreviations, but these are the most common ones.
+ ]
+ ],
+ "tr": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("b", "bay"), # Mr.
+ ("byk", "büyük"), # büyük
+ ("dr", "doktor"), # doctor
+ # Add other Turkish abbreviations here if needed.
+ ]
+ ],
+ "hu": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ ("dr", "doktor"), # doctor
+ ("b", "bácsi"), # Mr.
+ ("nőv", "nővér"), # nurse
+ # Add other Hungarian abbreviations here if needed.
+ ]
+ ],
+ "ko": [
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+ for x in [
+ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
+ ]
+ ],
+}
+
+
+def expand_abbreviations_multilingual(text, lang="en"):
+ for regex, replacement in _abbreviations[lang]:
+ text = re.sub(regex, replacement, text)
+ return text
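+
+
+# Example: expand_abbreviations_multilingual("Hello Mr. Smith.", "en") -> "Hello mister Smith."
+# (more cases are covered by test_abbreviations_multilingual at the bottom of this file).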
+
+
+_symbols_multilingual = {
+ "en": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " and "),
+ ("@", " at "),
+ ("%", " percent "),
+ ("#", " hash "),
+ ("$", " dollar "),
+ ("£", " pound "),
+ ("°", " degree "),
+ ]
+ ],
+ "es": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " y "),
+ ("@", " arroba "),
+ ("%", " por ciento "),
+ ("#", " numeral "),
+ ("$", " dolar "),
+ ("£", " libra "),
+ ("°", " grados "),
+ ]
+ ],
+ "fr": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " et "),
+ ("@", " arobase "),
+ ("%", " pour cent "),
+ ("#", " dièse "),
+ ("$", " dollar "),
+ ("£", " livre "),
+ ("°", " degrés "),
+ ]
+ ],
+ "de": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " und "),
+ ("@", " at "),
+ ("%", " prozent "),
+ ("#", " raute "),
+ ("$", " dollar "),
+ ("£", " pfund "),
+ ("°", " grad "),
+ ]
+ ],
+ "pt": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " e "),
+ ("@", " arroba "),
+ ("%", " por cento "),
+ ("#", " cardinal "),
+ ("$", " dólar "),
+ ("£", " libra "),
+ ("°", " graus "),
+ ]
+ ],
+ "it": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " e "),
+ ("@", " chiocciola "),
+ ("%", " per cento "),
+ ("#", " cancelletto "),
+ ("$", " dollaro "),
+ ("£", " sterlina "),
+ ("°", " gradi "),
+ ]
+ ],
+ "pl": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " i "),
+ ("@", " małpa "),
+ ("%", " procent "),
+ ("#", " krzyżyk "),
+ ("$", " dolar "),
+ ("£", " funt "),
+ ("°", " stopnie "),
+ ]
+ ],
+ "ar": [
+ # Arabic
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " و "),
+ ("@", " على "),
+ ("%", " في المئة "),
+ ("#", " رقم "),
+ ("$", " دولار "),
+ ("£", " جنيه "),
+ ("°", " درجة "),
+ ]
+ ],
+ "zh": [
+ # Chinese
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " 和 "),
+ ("@", " 在 "),
+ ("%", " 百分之 "),
+ ("#", " 号 "),
+ ("$", " 美元 "),
+ ("£", " 英镑 "),
+ ("°", " 度 "),
+ ]
+ ],
+ "cs": [
+ # Czech
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " a "),
+ ("@", " na "),
+ ("%", " procento "),
+ ("#", " křížek "),
+ ("$", " dolar "),
+ ("£", " libra "),
+ ("°", " stupně "),
+ ]
+ ],
+ "ru": [
+ # Russian
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " и "),
+ ("@", " собака "),
+ ("%", " процентов "),
+ ("#", " номер "),
+ ("$", " доллар "),
+ ("£", " фунт "),
+ ("°", " градус "),
+ ]
+ ],
+ "nl": [
+ # Dutch
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " en "),
+ ("@", " bij "),
+ ("%", " procent "),
+ ("#", " hekje "),
+ ("$", " dollar "),
+ ("£", " pond "),
+ ("°", " graden "),
+ ]
+ ],
+ "tr": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " ve "),
+ ("@", " at "),
+ ("%", " yüzde "),
+ ("#", " diyez "),
+ ("$", " dolar "),
+ ("£", " sterlin "),
+ ("°", " derece "),
+ ]
+ ],
+ "hu": [
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " és "),
+ ("@", " kukac "),
+ ("%", " százalék "),
+ ("#", " kettőskereszt "),
+ ("$", " dollár "),
+ ("£", " font "),
+ ("°", " fok "),
+ ]
+ ],
+ "ko": [
+ # Korean
+ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+ for x in [
+ ("&", " 그리고 "),
+ ("@", " 에 "),
+ ("%", " 퍼센트 "),
+ ("#", " 번호 "),
+ ("$", " 달러 "),
+ ("£", " 파운드 "),
+ ("°", " 도 "),
+ ]
+ ],
+}
+
+
+def expand_symbols_multilingual(text, lang="en"):
+ for regex, replacement in _symbols_multilingual[lang]:
+ text = re.sub(regex, replacement, text)
+ text = text.replace("  ", " ")  # Ensure there are no double spaces
+ return text.strip()
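+
+
+# Example: expand_symbols_multilingual("I have 14% battery", "en") -> "I have 14 percent battery"
+# (the digits themselves are expanded separately by expand_numbers_multilingual below).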
+
+
+_ordinal_re = {
+ "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
+ "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
+ "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
+ "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
+ "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
+ "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
+ "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
+ "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
+ "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals.
+ "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
+ "nl": re.compile(r"([0-9]+)(de|ste|e)"),
+ "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
+ "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
+ "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
+}
+_number_re = re.compile(r"[0-9]+")
+_currency_re = {
+ "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
+ "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
+ "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
+}
+
+_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
+_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
+_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
+
+
+def _remove_commas(m):
+ text = m.group(0)
+ if "," in text:
+ text = text.replace(",", "")
+ return text
+
+
+def _remove_dots(m):
+ text = m.group(0)
+ if "." in text:
+ text = text.replace(".", "")
+ return text
+
+
+def _expand_decimal_point(m, lang="en"):
+ amount = m.group(1).replace(",", ".")
+ return num2words(float(amount), lang=lang if lang != "cs" else "cz")
+
+
+def _expand_currency(m, lang="en", currency="USD"):
+ amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
+ full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz")
+
+ and_equivalents = {
+ "en": ", ",
+ "es": " con ",
+ "fr": " et ",
+ "de": " und ",
+ "pt": " e ",
+ "it": " e ",
+ "pl": ", ",
+ "cs": ", ",
+ "ru": ", ",
+ "nl": ", ",
+ "ar": ", ",
+ "tr": ", ",
+ "hu": ", ",
+ "ko": ", ",
+ }
+
+ if amount.is_integer():
+ last_and = full_amount.rfind(and_equivalents[lang])
+ if last_and != -1:
+ full_amount = full_amount[:last_and]
+
+ return full_amount
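+
+
+# Example (en): a match on "20.15€" expands to "twenty euro, fifteen cents", while an integer
+# amount such as "$20" is trimmed at the last ", " separator to just "twenty dollars"
+# (see test_expand_numbers_multilingual below).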
+
+
+def _expand_ordinal(m, lang="en"):
+ return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")
+
+
+def _expand_number(m, lang="en"):
+ return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
+
+
+def expand_numbers_multilingual(text, lang="en"):
+ if lang == "zh":
+ text = zh_num2words()(text)
+ else:
+ if lang in ["en", "ru"]:
+ text = re.sub(_comma_number_re, _remove_commas, text)
+ else:
+ text = re.sub(_dot_number_re, _remove_dots, text)
+ try:
+ text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
+ text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
+ text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
+ except:
+ pass
+ if lang != "tr":
+ text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
+ text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
+ text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
+ return text
+
+
+def lowercase(text):
+ return text.lower()
+
+
+def collapse_whitespace(text):
+ return re.sub(_whitespace_re, " ", text)
+
+
+def multilingual_cleaners(text, lang):
+ text = text.replace('"', "")
+ if lang == "tr":
+ text = text.replace("İ", "i")
+ text = text.replace("Ö", "ö")
+ text = text.replace("Ü", "ü")
+ text = lowercase(text)
+ text = expand_numbers_multilingual(text, lang)
+ text = expand_abbreviations_multilingual(text, lang)
+ text = expand_symbols_multilingual(text, lang=lang)
+ text = collapse_whitespace(text)
+ return text
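+
+
+# Putting the steps together, multilingual_cleaners("Dr. Smith has $20.", "en") should produce
+# roughly "doctor smith has twenty dollars." (lowercased, currency and numbers expanded,
+# abbreviations spelled out, whitespace collapsed).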
+
+
+def basic_cleaners(text):
+ """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+ text = lowercase(text)
+ text = collapse_whitespace(text)
+ return text
+
+
+def chinese_transliterate(text):
+ return "".join(
+ [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
+ )
+
+
+def japanese_cleaners(text, katsu):
+ text = katsu.romaji(text)
+ text = lowercase(text)
+ return text
+
+
+def korean_transliterate(text):
+ r = Transliter(academic)
+ return r.translit(text)
+
+
+DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json")
+
+
+class VoiceBpeTokenizer:
+ def __init__(self, vocab_file=None):
+ self.tokenizer = None
+ if vocab_file is not None:
+ self.tokenizer = Tokenizer.from_file(vocab_file)
+ self.char_limits = {
+ "en": 250,
+ "de": 253,
+ "fr": 273,
+ "es": 239,
+ "it": 213,
+ "pt": 203,
+ "pl": 224,
+ "zh": 82,
+ "ar": 166,
+ "cs": 186,
+ "ru": 182,
+ "nl": 251,
+ "tr": 226,
+ "ja": 71,
+ "hu": 224,
+ "ko": 95,
+ }
+
+ @cached_property
+ def katsu(self):
+ import cutlet
+
+ return cutlet.Cutlet()
+
+ def check_input_length(self, txt, lang):
+ lang = lang.split("-")[0] # remove the region
+ limit = self.char_limits.get(lang, 250)
+ if len(txt) > limit:
+ print(
+ f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
+ )
+
+ def preprocess_text(self, txt, lang):
+ if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+ txt = multilingual_cleaners(txt, lang)
+ if lang == "zh":
+ txt = chinese_transliterate(txt)
+ if lang == "ko":
+ txt = korean_transliterate(txt)
+ elif lang == "ja":
+ txt = japanese_cleaners(txt, self.katsu)
+ elif lang == "hi":
+ # @manmay will implement this
+ txt = basic_cleaners(txt)
+ else:
+ raise NotImplementedError(f"Language '{lang}' is not supported.")
+ return txt
+
+ def encode(self, txt, lang):
+ lang = lang.split("-")[0] # remove the region
+ self.check_input_length(txt, lang)
+ txt = self.preprocess_text(txt, lang)
+ lang = "zh-cn" if lang == "zh" else lang
+ txt = f"[{lang}]{txt}"
+ txt = txt.replace(" ", "[SPACE]")
+ return self.tokenizer.encode(txt).ids
+
+ def decode(self, seq):
+ if isinstance(seq, torch.Tensor):
+ seq = seq.cpu().numpy()
+ txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
+ txt = txt.replace("[SPACE]", " ")
+ txt = txt.replace("[STOP]", "")
+ txt = txt.replace("[UNK]", "")
+ return txt
+
+ def __len__(self):
+ return self.tokenizer.get_vocab_size()
+
+ def get_number_tokens(self):
+ return max(self.tokenizer.get_vocab().values()) + 1
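+
+
+# Minimal usage sketch (the vocab path below is hypothetical; XTTS normally ships a
+# tokenizer.json alongside the model checkpoint):
+#
+#   tok = VoiceBpeTokenizer(vocab_file="/path/to/tokenizer.json")
+#   ids = tok.encode("Hello world", lang="en")  # cleaned, prefixed with "[en]", spaces -> "[SPACE]"
+#   text = tok.decode(ids)                      # "[SPACE]" back to spaces, "[STOP]"/"[UNK]" removed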
+
+
+def test_expand_numbers_multilingual():
+ test_cases = [
+ # English
+ ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
+ ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
+ ("This is a 1st test", "This is a first test", "en"),
+ ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
+ ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
+ ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"),
+ ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
+ # French
+ ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
+ ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
+ ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
+ ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"),
+ ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
+ ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"),
+ ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
+ # German
+ ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
+ ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
+ ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"), # Issue with gender
+ ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
+ ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
+ ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"),
+ # Spanish
+ ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
+ ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
+ ("Este es un 1er test", "Este es un primero test", "es"),
+ ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
+ ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
+ ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"),
+ # Italian
+ ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
+ ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
+ ("Questo è un 1° test", "Questo è un primo test", "it"),
+ ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
+ ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
+ ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"),
+ # Portuguese
+ ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
+ ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
+ ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
+ ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
+ ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
+ (
+ "Isso custará 20,15€ senhor.",
+ "Isso custará vinte euros e quinze cêntimos senhor.",
+ "pt",
+ ), # "cêntimos" should be "centavos" num2words issue
+ # Polish
+ ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
+ ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
+ ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"),
+ ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"),
+ # Arabic
+ ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"),
+ ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
+ # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words
+ # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
+ # Czech
+ ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
+ ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
+ ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
+ ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
+ # Russian
+ ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
+ ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
+ ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"),
+ ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"),
+ # Dutch
+ ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
+ ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
+ ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
+ ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
+ # Chinese (Simplified)
+ ("在12.5秒内", "在十二点五秒内", "zh"),
+ ("有50名士兵", "有五十名士兵", "zh"),
+ # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
+ # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
+ # Turkish
+ # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
+ ("50 asker vardı.", "elli asker vardı.", "tr"),
+ ("Bu 1. test", "Bu birinci test", "tr"),
+ # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
+ # Hungarian
+ ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
+ ("50 katona volt.", "ötven katona volt.", "hu"),
+ ("Ez az 1. teszt", "Ez az első teszt", "hu"),
+ # Korean
+ ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
+ ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
+ ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
+ ]
+ for a, b, lang in test_cases:
+ out = expand_numbers_multilingual(a, lang=lang)
+ assert out == b, f"'{out}' vs '{b}'"
+
+
+def test_abbreviations_multilingual():
+ test_cases = [
+ # English
+ ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
+ ("Dr. Jones is here.", "doctor Jones is here.", "en"),
+ # Spanish
+ ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
+ ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
+ # French
+ ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
+ ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"),
+ # German
+ ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
+ # Portuguese
+ ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
+ ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"),
+ # Italian
+ ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
+ # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
+ # Polish
+ ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
+ ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"),
+ # Czech
+ ("P. Novák", "pan Novák", "cs"),
+ ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
+ # Dutch
+ ("Dhr. Jansen", "de heer Jansen", "nl"),
+ ("Mevr. de Vries", "mevrouw de Vries", "nl"),
+ # Russian
+ ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
+ ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"),
+ # Turkish
+ ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
+ ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
+ # Hungarian
+ ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
+ ]
+
+ for a, b, lang in test_cases:
+ out = expand_abbreviations_multilingual(a, lang=lang)
+ assert out == b, f"'{out}' vs '{b}'"
+
+
+def test_symbols_multilingual():
+ test_cases = [
+ ("I have 14% battery", "I have 14 percent battery", "en"),
+ ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
+ ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
+ ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
+ ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"),
+ ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"),
+ ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
+ ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
+ ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
+ ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
+ ("Я буду @ дома", "Я буду собака дома", "ru"),
+ ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
+ ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
+ ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
+ ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
+ ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
+ ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
+ ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
+ ]
+
+ for a, b, lang in test_cases:
+ out = expand_symbols_multilingual(a, lang=lang)
+ assert out == b, f"'{out}' vs '{b}'"
+
+
+if __name__ == "__main__":
+ test_expand_numbers_multilingual()
+ test_abbreviations_multilingual()
+ test_symbols_multilingual()
diff --git a/submodules/TTS/TTS/tts/layers/xtts/trainer/dataset.py b/submodules/TTS/TTS/tts/layers/xtts/trainer/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f958cb5a5a66e1b7714887f1784a549200e479b
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/trainer/dataset.py
@@ -0,0 +1,239 @@
+import os
+import random
+import sys
+
+import torch
+import torch.nn.functional as F
+import torch.utils.data
+
+from TTS.tts.models.xtts import load_audio
+
+torch.set_num_threads(1)
+
+
+def key_samples_by_col(samples, col):
+ """Returns a dictionary of samples keyed by the given column (here: language)."""
+ samples_by_col = {}
+ for sample in samples:
+ col_val = sample[col]
+ assert isinstance(col_val, str)
+ if col_val not in samples_by_col:
+ samples_by_col[col_val] = []
+ samples_by_col[col_val].append(sample)
+ return samples_by_col
+
+
+def get_prompt_slice(gt_path, max_sample_length, min_sample_length, sample_rate, is_eval=False):
+ rel_clip = load_audio(gt_path, sample_rate)
+ # for eval, use a mid-sized sample when possible, for better reproducibility
+ if is_eval:
+ sample_length = int((min_sample_length + max_sample_length) / 2)
+ else:
+ sample_length = random.randint(min_sample_length, max_sample_length)
+ gap = rel_clip.shape[-1] - sample_length
+ if gap < 0:
+ sample_length = rel_clip.shape[-1] // 2
+ gap = rel_clip.shape[-1] - sample_length
+
+ # for eval, always start from position 0 for reproducibility
+ if is_eval:
+ rand_start = 0
+ else:
+ rand_start = random.randint(0, gap)
+
+ rand_end = rand_start + sample_length
+ rel_clip = rel_clip[:, rand_start:rand_end]
+ rel_clip = F.pad(rel_clip, pad=(0, max_sample_length - rel_clip.shape[-1]))
+ cond_idxs = [rand_start, rand_end]
+ return rel_clip, rel_clip.shape[-1], cond_idxs
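+
+
+# get_prompt_slice returns:
+#   - the selected clip, zero-padded on the right to max_sample_length samples,
+#   - the length of that (already padded) clip,
+#   - [start, end]: the sample indices of the slice within the original audio.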
+
+
+class XTTSDataset(torch.utils.data.Dataset):
+ def __init__(self, config, samples, tokenizer, sample_rate, is_eval=False):
+ self.config = config
+ model_args = config.model_args
+ self.failed_samples = set()
+ self.debug_failures = model_args.debug_loading_failures
+ self.max_conditioning_length = model_args.max_conditioning_length
+ self.min_conditioning_length = model_args.min_conditioning_length
+ self.is_eval = is_eval
+ self.tokenizer = tokenizer
+ self.sample_rate = sample_rate
+ self.max_wav_len = model_args.max_wav_length
+ self.max_text_len = model_args.max_text_length
+ self.use_masking_gt_prompt_approach = model_args.gpt_use_masking_gt_prompt_approach
+ assert self.max_wav_len is not None and self.max_text_len is not None
+
+ self.samples = samples
+ if not is_eval:
+ random.seed(config.training_seed)
+ # random.shuffle(self.samples)
+ random.shuffle(self.samples)
+ # order by language
+ self.samples = key_samples_by_col(self.samples, "language")
+ print(" > Sampling by language:", self.samples.keys())
+ else:
+ # for evaluation, pre-load the samples and drop corrupted ones to keep evaluation reproducible
+ self.check_eval_samples()
+
+ def check_eval_samples(self):
+ print(" > Filtering invalid eval samples!!")
+ new_samples = []
+ for sample in self.samples:
+ try:
+ tseq, _, wav, _, _, _ = self.load_item(sample)
+ except:
+ continue
+ # Basically, this audio file is nonexistent or too long to be supported by the dataset.
+ if (
+ wav is None
+ or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
+ or (self.max_text_len is not None and tseq.shape[0] > self.max_text_len)
+ ):
+ continue
+ new_samples.append(sample)
+ self.samples = new_samples
+ print(" > Total eval samples after filtering:", len(self.samples))
+
+ def get_text(self, text, lang):
+ tokens = self.tokenizer.encode(text, lang)
+ tokens = torch.IntTensor(tokens)
+ assert not torch.any(tokens == 1), f"UNK token found in {text} -> {self.tokenizer.decode(tokens)}"
+ # The stop token should always be sacred.
+ assert not torch.any(tokens == 0), f"Stop token found in {text}"
+ return tokens
+
+ def load_item(self, sample):
+ text = str(sample["text"])
+ tseq = self.get_text(text, sample["language"])
+ audiopath = sample["audio_file"]
+ wav = load_audio(audiopath, self.sample_rate)
+ if text is None or len(text.strip()) == 0:
+ raise ValueError
+ if wav is None or wav.shape[-1] < (0.5 * self.sample_rate):
+ # Ultra short clips are also useless (and can cause problems within some models).
+ raise ValueError
+
+ if self.use_masking_gt_prompt_approach:
+ # get a slice from GT to condition the model
+ cond, _, cond_idxs = get_prompt_slice(
+ audiopath, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
+ )
+ # when the masking approach is used, cond_len is not needed
+ cond_len = torch.nan
+ else:
+ ref_sample = (
+ sample["reference_path"]
+ if "reference_path" in sample and sample["reference_path"] is not None
+ else audiopath
+ )
+ cond, cond_len, _ = get_prompt_slice(
+ ref_sample, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
+ )
+ # when masking is not used, use cond_len instead of cond_idxs
+ cond_idxs = torch.nan
+
+ return tseq, audiopath, wav, cond, cond_len, cond_idxs
+
+ def __getitem__(self, index):
+ if self.is_eval:
+ sample = self.samples[index]
+ sample_id = str(index)
+ else:
+ # select a random language
+ lang = random.choice(list(self.samples.keys()))
+ # select random sample
+ index = random.randint(0, len(self.samples[lang]) - 1)
+ sample = self.samples[lang][index]
+ # a unique id for each sample, used to track loading failures
+ sample_id = lang + "_" + str(index)
+
+ # ignore samples that are already known to be invalid
+ if sample_id in self.failed_samples:
+ if self.debug_failures:
+ print(f"Ignoring sample {sample['audio_file']} because it was already ignored before !!")
+ # call __getitem__ again to fetch another sample
+ return self[1]
+
+ # try to load the sample; if it fails, add it to the failed samples list
+ try:
+ tseq, audiopath, wav, cond, cond_len, cond_idxs = self.load_item(sample)
+ except:
+ if self.debug_failures:
+ print(f"error loading {sample['audio_file']} {sys.exc_info()}")
+ self.failed_samples.add(sample_id)
+ return self[1]
+
+ # check the audio and text size limits; if the sample is out of bounds, add it to failed_samples
+ if (
+ wav is None
+ or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
+ or (self.max_text_len is not None and tseq.shape[0] > self.max_text_len)
+ ):
+ # Basically, this audio file is nonexistent or too long to be supported by the dataset.
+ # It's hard to handle this situation properly. Best bet is to return a random valid item and skew the dataset somewhat as a result.
+ if self.debug_failures and wav is not None and tseq is not None:
+ print(
+ f"error loading {sample['audio_file']}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}"
+ )
+ self.failed_samples.add(sample_id)
+ return self[1]
+
+ res = {
+ # 'real_text': text,
+ "text": tseq,
+ "text_lengths": torch.tensor(tseq.shape[0], dtype=torch.long),
+ "wav": wav,
+ "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
+ "filenames": audiopath,
+ "conditioning": cond.unsqueeze(1),
+ "cond_lens": torch.tensor(cond_len, dtype=torch.long)
+ if cond_len is not torch.nan
+ else torch.tensor([cond_len]),
+ "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]),
+ }
+ return res
+
+ def __len__(self):
+ if self.is_eval:
+ return len(self.samples)
+ return sum([len(v) for v in self.samples.values()])
+
+ def collate_fn(self, batch):
+ # convert list of dicts to dict of lists
+ B = len(batch)
+
+ batch = {k: [dic[k] for dic in batch] for k in batch[0]}
+
+ # stack for features that already have the same shape
+ batch["wav_lengths"] = torch.stack(batch["wav_lengths"])
+ batch["text_lengths"] = torch.stack(batch["text_lengths"])
+ batch["conditioning"] = torch.stack(batch["conditioning"])
+ batch["cond_lens"] = torch.stack(batch["cond_lens"])
+ batch["cond_idxs"] = torch.stack(batch["cond_idxs"])
+
+ if torch.any(batch["cond_idxs"].isnan()):
+ batch["cond_idxs"] = None
+
+ if torch.any(batch["cond_lens"].isnan()):
+ batch["cond_lens"] = None
+
+ max_text_len = batch["text_lengths"].max()
+ max_wav_len = batch["wav_lengths"].max()
+
+ # create padding tensors
+ text_padded = torch.IntTensor(B, max_text_len)
+ wav_padded = torch.FloatTensor(B, 1, max_wav_len)
+
+ # initialize tensors for zero padding
+ text_padded = text_padded.zero_()
+ wav_padded = wav_padded.zero_()
+ for i in range(B):
+ text = batch["text"][i]
+ text_padded[i, : batch["text_lengths"][i]] = torch.IntTensor(text)
+ wav = batch["wav"][i]
+ wav_padded[i, :, : batch["wav_lengths"][i]] = torch.FloatTensor(wav)
+
+ batch["wav"] = wav_padded
+ batch["padded_text"] = text_padded
+ return batch
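+
+
+# A minimal usage sketch (illustrative only): `config`, `samples` and `tokenizer` stand for a
+# GPTTrainerConfig, a list of Coqui-style sample dicts and a VoiceBpeTokenizer; batch size and
+# sample rate below are placeholder values.
+#
+#   dataset = XTTSDataset(config, samples, tokenizer, sample_rate=22050, is_eval=True)
+#   loader = torch.utils.data.DataLoader(
+#       dataset, batch_size=4, shuffle=False, collate_fn=dataset.collate_fn
+#   )
+#   batch = next(iter(loader))
+#   # batch keys: text, padded_text, text_lengths, wav, wav_lengths, filenames,
+#   #             conditioning, cond_lens, cond_idxs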
diff --git a/submodules/TTS/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/submodules/TTS/TTS/tts/layers/xtts/trainer/gpt_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a7a1d77835e87fa92b59b18e8d8439a50550d8b
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -0,0 +1,504 @@
+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torchaudio
+from coqpit import Coqpit
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+from trainer.torch import DistributedSampler
+from trainer.trainer_utils import get_optimizer, get_scheduler
+
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.datasets.dataset import TTSDataset
+from TTS.tts.layers.tortoise.arch_utils import TorchMelSpectrogram
+from TTS.tts.layers.xtts.dvae import DiscreteVAE
+from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer
+from TTS.tts.layers.xtts.trainer.dataset import XTTSDataset
+from TTS.tts.models.base_tts import BaseTTS
+from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig
+from TTS.utils.io import load_fsspec
+
+
+@dataclass
+class GPTTrainerConfig(XttsConfig):
+ lr: float = 5e-06
+ training_seed: int = 1
+ optimizer_wd_only_on_weights: bool = False
+ weighted_loss_attrs: dict = field(default_factory=lambda: {})
+ weighted_loss_multipliers: dict = field(default_factory=lambda: {})
+ test_sentences: List[dict] = field(default_factory=lambda: [])
+
+
+@dataclass
+class XttsAudioConfig(XttsAudioConfig):
+ dvae_sample_rate: int = 22050
+
+
+@dataclass
+class GPTArgs(XttsArgs):
+ min_conditioning_length: int = 66150
+ max_conditioning_length: int = 132300
+ gpt_loss_text_ce_weight: float = 0.01
+ gpt_loss_mel_ce_weight: float = 1.0
+ gpt_num_audio_tokens: int = 8194
+ debug_loading_failures: bool = False
+ max_wav_length: int = 255995 # ~11.6 seconds
+ max_text_length: int = 200
+ tokenizer_file: str = ""
+ mel_norm_file: str = "https://coqui.gateway.scarf.sh/v0.14.0_models/mel_norms.pth"
+ dvae_checkpoint: str = ""
+ xtts_checkpoint: str = ""
+ gpt_checkpoint: str = "" # if defined it will replace the gpt weights on xtts model
+ vocoder: str = "" # overide vocoder key on the config to avoid json write issues
+
+
+def callback_clearml_load_save(operation_type, model_info):
+ # return None means skip the file upload/log, returning model_info will continue with the log/upload
+ # you can also change the upload destination file name model_info.upload_filename or check the local file size with Path(model_info.local_model_path).stat().st_size
+ assert operation_type in ("load", "save")
+ # print(operation_type, model_info.__dict__)
+
+ if "similarities.pth" in model_info.__dict__["local_model_path"]:
+ return None
+
+ return model_info
+
+
+class GPTTrainer(BaseTTS):
+ def __init__(self, config: Coqpit):
+ """
+ GPT training wrapper for the XTTS model (GPT decoder adapted from Tortoise)
+ """
+ super().__init__(config, ap=None, tokenizer=None)
+ self.config = config
+ # init XTTS model
+ self.xtts = Xtts(self.config)
+ # create the tokenizer with the target vocabulary
+ self.xtts.tokenizer = VoiceBpeTokenizer(self.args.tokenizer_file)
+ # init gpt encoder and hifigan decoder
+ self.xtts.init_models()
+
+ if self.args.xtts_checkpoint:
+ self.load_checkpoint(self.config, self.args.xtts_checkpoint, eval=False, strict=False)
+
+ # set mel stats
+ if self.args.mel_norm_file:
+ self.xtts.mel_stats = load_fsspec(self.args.mel_norm_file)
+
+ # load GPT if available
+ if self.args.gpt_checkpoint:
+ gpt_checkpoint = torch.load(self.args.gpt_checkpoint, map_location=torch.device("cpu"))
+ # deal with coqui Trainer exported model
+ if "model" in gpt_checkpoint.keys() and "config" in gpt_checkpoint.keys():
+ print("Coqui Trainer checkpoint detected! Converting it!")
+ gpt_checkpoint = gpt_checkpoint["model"]
+ states_keys = list(gpt_checkpoint.keys())
+ for key in states_keys:
+ if "gpt." in key:
+ new_key = key.replace("gpt.", "")
+ gpt_checkpoint[new_key] = gpt_checkpoint[key]
+ del gpt_checkpoint[key]
+ else:
+ del gpt_checkpoint[key]
+
+ # edit the checkpoint if the number of text tokens has changed, so transfer learning works as well as possible
+ if (
+ "text_embedding.weight" in gpt_checkpoint
+ and gpt_checkpoint["text_embedding.weight"].shape != self.xtts.gpt.text_embedding.weight.shape
+ ):
+ num_new_tokens = (
+ self.xtts.gpt.text_embedding.weight.shape[0] - gpt_checkpoint["text_embedding.weight"].shape[0]
+ )
+ print(f" > Loading checkpoint with {num_new_tokens} additional tokens.")
+
+ # add rows for the new tokens to the text embedding (text_embedding.weight)
+ emb_g = gpt_checkpoint["text_embedding.weight"]
+ new_row = torch.randn(num_new_tokens, emb_g.shape[1])
+ start_token_row = emb_g[-1, :]
+ emb_g = torch.cat([emb_g, new_row], axis=0)
+ emb_g[-1, :] = start_token_row
+ gpt_checkpoint["text_embedding.weight"] = emb_g
+
+ # add new weights to the linear layer (text_head)
+ text_head_weight = gpt_checkpoint["text_head.weight"]
+ start_token_row = text_head_weight[-1, :]
+ new_entry = torch.randn(num_new_tokens, self.xtts.gpt.text_head.weight.shape[1])
+ text_head_weight = torch.cat([text_head_weight, new_entry], axis=0)
+ text_head_weight[-1, :] = start_token_row
+ gpt_checkpoint["text_head.weight"] = text_head_weight
+
+ # add new biases to the linear layer (text_head)
+ text_head_bias = gpt_checkpoint["text_head.bias"]
+ start_token_row = text_head_bias[-1]
+ new_bias_entry = torch.zeros(num_new_tokens)
+ text_head_bias = torch.cat([text_head_bias, new_bias_entry], axis=0)
+ text_head_bias[-1] = start_token_row
+ gpt_checkpoint["text_head.bias"] = text_head_bias
+
+ self.xtts.gpt.load_state_dict(gpt_checkpoint, strict=True)
+ print(">> GPT weights restored from:", self.args.gpt_checkpoint)
+
+ # Mel spectrogram extractor for conditioning
+ if self.args.gpt_use_perceiver_resampler:
+ self.torch_mel_spectrogram_style_encoder = TorchMelSpectrogram(
+ filter_length=2048,
+ hop_length=256,
+ win_length=1024,
+ normalize=False,
+ sampling_rate=config.audio.sample_rate,
+ mel_fmin=0,
+ mel_fmax=8000,
+ n_mel_channels=80,
+ mel_norm_file=self.args.mel_norm_file,
+ )
+ else:
+ self.torch_mel_spectrogram_style_encoder = TorchMelSpectrogram(
+ filter_length=4096,
+ hop_length=1024,
+ win_length=4096,
+ normalize=False,
+ sampling_rate=config.audio.sample_rate,
+ mel_fmin=0,
+ mel_fmax=8000,
+ n_mel_channels=80,
+ mel_norm_file=self.args.mel_norm_file,
+ )
+
+ # Load DVAE
+ self.dvae = DiscreteVAE(
+ channels=80,
+ normalization=None,
+ positional_dims=1,
+ num_tokens=self.args.gpt_num_audio_tokens - 2,
+ codebook_dim=512,
+ hidden_dim=512,
+ num_resnet_blocks=3,
+ kernel_size=3,
+ num_layers=2,
+ use_transposed_convs=False,
+ )
+
+ self.dvae.eval()
+ if self.args.dvae_checkpoint:
+ dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
+ self.dvae.load_state_dict(dvae_checkpoint, strict=False)
+ print(">> DVAE weights restored from:", self.args.dvae_checkpoint)
+ else:
+ raise RuntimeError(
+ "You need to specify config.model_args.dvae_checkpoint path to be able to train the GPT decoder!!"
+ )
+
+ # Mel spectrogram extractor for DVAE
+ self.torch_mel_spectrogram_dvae = TorchMelSpectrogram(
+ mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate
+ )
+
+ @property
+ def device(self):
+ return next(self.parameters()).device
+
+ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens):
+ """
+ Forward pass through the XTTS GPT decoder, conditioned on both the input text and the
+ reference-voice mel spectrograms.
+
+ text_inputs: long tensor, (b,t)
+ text_lengths: long tensor, (b,)
+ audio_codes: long tensor, (b,m)
+ wav_lengths: long tensor, (b,)
+ cond_mels: MEL float tensor, (b, num_samples, 80, t_m)
+ cond_idxs: cond start and end indices, (b, 2)
+ cond_lens: long tensor, (b,)
+ """
+ losses = self.xtts.gpt(
+ text_inputs,
+ text_lengths,
+ audio_codes,
+ wav_lengths,
+ cond_mels=cond_mels,
+ cond_idxs=cond_idxs,
+ cond_lens=cond_lens,
+ )
+ return losses
+
+ @torch.no_grad()
+ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613
+ test_audios = {}
+ if self.config.test_sentences:
+ # init gpt for inference mode
+ self.xtts.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=False)
+ self.xtts.gpt.eval()
+ print(" | > Synthesizing test sentences.")
+ for idx, s_info in enumerate(self.config.test_sentences):
+ wav = self.xtts.synthesize(
+ s_info["text"],
+ self.config,
+ s_info["speaker_wav"],
+ s_info["language"],
+ gpt_cond_len=3,
+ )["wav"]
+ test_audios["{}-audio".format(idx)] = wav
+
+ # delete inference layers
+ del self.xtts.gpt.gpt_inference
+ del self.xtts.gpt.gpt.wte
+ return {"audios": test_audios}
+
+ def test_log(
+ self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument
+ ) -> None:
+ logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate)
+
+ def format_batch(self, batch: Dict) -> Dict:
+ return batch
+
+ @torch.no_grad() # torch no grad to avoid gradients from the pre-processing and DVAE codes extraction
+ def format_batch_on_device(self, batch):
+ """Compute spectrograms on the device."""
+ batch["text_lengths"] = batch["text_lengths"]
+ batch["wav_lengths"] = batch["wav_lengths"]
+ batch["text_inputs"] = batch["padded_text"]
+ batch["cond_idxs"] = batch["cond_idxs"]
+ # compute conditioning mel specs
+ # reshape waves from torch.Size([B, num_cond_samples, 1, T]) to torch.Size([B * num_cond_samples, 1, T]) because it is faster than iterating over the tensor
+ B, num_cond_samples, C, T = batch["conditioning"].size()
+ conditioning_reshaped = batch["conditioning"].view(B * num_cond_samples, C, T)
+ paired_conditioning_mel = self.torch_mel_spectrogram_style_encoder(conditioning_reshaped)
+ # reshape torch.Size([B * num_cond_samples, n_mel, T_mel]) into torch.Size([B, num_cond_samples, n_mel, T_mel])
+ n_mel = self.torch_mel_spectrogram_style_encoder.n_mel_channels # paired_conditioning_mel.size(1)
+ T_mel = paired_conditioning_mel.size(2)
+ paired_conditioning_mel = paired_conditioning_mel.view(B, num_cond_samples, n_mel, T_mel)
+ # get the conditioning embeddings
+ batch["cond_mels"] = paired_conditioning_mel
+ # compute codes using DVAE
+ if self.config.audio.sample_rate != self.config.audio.dvae_sample_rate:
+ dvae_wav = torchaudio.functional.resample(
+ batch["wav"],
+ orig_freq=self.config.audio.sample_rate,
+ new_freq=self.config.audio.dvae_sample_rate,
+ lowpass_filter_width=64,
+ rolloff=0.9475937167399596,
+ resampling_method="kaiser_window",
+ beta=14.769656459379492,
+ )
+ else:
+ dvae_wav = batch["wav"]
+ dvae_mel_spec = self.torch_mel_spectrogram_dvae(dvae_wav)
+ codes = self.dvae.get_codebook_indices(dvae_mel_spec)
+
+ batch["audio_codes"] = codes
+ # delete useless batch tensors
+ del batch["padded_text"]
+ del batch["wav"]
+ del batch["conditioning"]
+ return batch
+
+ def train_step(self, batch, criterion):
+ loss_dict = {}
+ cond_mels = batch["cond_mels"]
+ text_inputs = batch["text_inputs"]
+ text_lengths = batch["text_lengths"]
+ audio_codes = batch["audio_codes"]
+ wav_lengths = batch["wav_lengths"]
+ cond_idxs = batch["cond_idxs"]
+ cond_lens = batch["cond_lens"]
+
+ loss_text, loss_mel, _ = self.forward(
+ text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens
+ )
+ loss_dict["loss_text_ce"] = loss_text * self.args.gpt_loss_text_ce_weight
+ loss_dict["loss_mel_ce"] = loss_mel * self.args.gpt_loss_mel_ce_weight
+ loss_dict["loss"] = loss_dict["loss_text_ce"] + loss_dict["loss_mel_ce"]
+ return {"model_outputs": None}, loss_dict
+
+ def eval_step(self, batch, criterion):
+ # ignore masking for more consistent evaluation
+ batch["cond_idxs"] = None
+ return self.train_step(batch, criterion)
+
+ def on_train_epoch_start(self, trainer):
+ trainer.model.eval() # the whole model to eval
+ # put gpt model in training mode
+ if hasattr(trainer.model, "module") and hasattr(trainer.model.module, "xtts"):
+ trainer.model.module.xtts.gpt.train()
+ else:
+ trainer.model.xtts.gpt.train()
+
+ def on_init_end(self, trainer): # pylint: disable=W0613
+ # ignore similarities.pth on clearml save/upload
+ if self.config.dashboard_logger.lower() == "clearml":
+ from clearml.binding.frameworks import WeightsFileHandler
+
+ WeightsFileHandler.add_pre_callback(callback_clearml_load_save)
+
+ @torch.no_grad()
+ def inference(
+ self,
+ x,
+ aux_input=None,
+ ): # pylint: disable=dangerous-default-value
+ return None
+
+ @staticmethod
+ def get_criterion():
+ return None
+
+ def get_sampler(self, dataset: TTSDataset, num_gpus=1):
+ # sampler for DDP
+ batch_sampler = DistributedSampler(dataset) if num_gpus > 1 else None
+ return batch_sampler
+
+ def get_data_loader(
+ self,
+ config: Coqpit,
+ assets: Dict,
+ is_eval: bool,
+ samples: Union[List[Dict], List[List]],
+ verbose: bool,
+ num_gpus: int,
+ rank: int = None,
+ ) -> "DataLoader": # pylint: disable=W0613
+ if is_eval and not config.run_eval:
+ loader = None
+ else:
+ # init dataloader
+ dataset = XTTSDataset(self.config, samples, self.xtts.tokenizer, config.audio.sample_rate, is_eval)
+
+ # wait all the DDP process to be ready
+ if num_gpus > 1:
+ torch.distributed.barrier()
+
+ # sort input sequences from short to long
+ # dataset.preprocess_samples()
+
+ # get samplers
+ sampler = self.get_sampler(dataset, num_gpus)
+
+ # ignore the sampler in eval mode; changing sampler parameters would otherwise make runs incomparable with previous ones
+ if sampler is None or is_eval:
+ loader = DataLoader(
+ dataset,
+ batch_size=config.eval_batch_size if is_eval else config.batch_size,
+ shuffle=False,
+ drop_last=False,
+ collate_fn=dataset.collate_fn,
+ num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
+ pin_memory=False,
+ )
+ else:
+ loader = DataLoader(
+ dataset,
+ sampler=sampler,
+ batch_size=config.eval_batch_size if is_eval else config.batch_size,
+ collate_fn=dataset.collate_fn,
+ num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
+ pin_memory=False,
+ )
+ return loader
+
+ def get_optimizer(self) -> List:
+ """Initiate and return the optimizer based on the config parameters."""
+ # ToDo: deal with multi GPU training
+ if self.config.optimizer_wd_only_on_weights:
+ # restrict parameters to the GPT model only
+ net = self.xtts.gpt
+
+ # normalizations
+ norm_modules = (
+ nn.BatchNorm2d,
+ nn.InstanceNorm2d,
+ nn.BatchNorm1d,
+ nn.InstanceNorm1d,
+ nn.BatchNorm3d,
+ nn.InstanceNorm3d,
+ nn.GroupNorm,
+ nn.LayerNorm,
+ )
+ # nn.Embedding
+ emb_modules = (nn.Embedding, nn.EmbeddingBag)
+
+ param_names_notweights = set()
+ all_param_names = set()
+ param_map = {}
+ for mn, m in net.named_modules():
+ for k, v in m.named_parameters():
+ v.is_bias = k.endswith(".bias")
+ v.is_weight = k.endswith(".weight")
+ v.is_norm = isinstance(m, norm_modules)
+ v.is_emb = isinstance(m, emb_modules)
+
+ fpn = "%s.%s" % (mn, k) if mn else k # full param name
+ all_param_names.add(fpn)
+ param_map[fpn] = v
+ if v.is_bias or v.is_norm or v.is_emb:
+ param_names_notweights.add(fpn)
+
+ params_names_notweights = sorted(list(param_names_notweights))
+ params_notweights = [param_map[k] for k in params_names_notweights]
+ params_names_weights = sorted(list(all_param_names ^ param_names_notweights))
+ params_weights = [param_map[k] for k in params_names_weights]
+
+ groups = [
+ {"params": params_weights, "weight_decay": self.config.optimizer_params["weight_decay"]},
+ {"params": params_notweights, "weight_decay": 0},
+ ]
+ # torch.optim.AdamW
+ opt = get_optimizer(
+ self.config.optimizer,
+ self.config.optimizer_params,
+ self.config.lr,
+ parameters=groups,
+ )
+ opt._group_names = [params_names_weights, params_names_notweights]
+ return opt
+
+ return get_optimizer(
+ self.config.optimizer,
+ self.config.optimizer_params,
+ self.config.lr,
+ # optimize only for the GPT model
+ parameters=self.xtts.gpt.parameters(),
+ )
+
+ def get_scheduler(self, optimizer) -> List:
+ """Set the scheduler for the optimizer.
+
+ Args:
+ optimizer: `torch.optim.Optimizer`.
+ """
+ return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, optimizer)
+
+ def load_checkpoint(
+ self,
+ config,
+ checkpoint_path,
+ eval=False,
+ strict=True,
+ cache_storage="/tmp/tts_cache",
+ target_protocol="s3",
+ target_options={"anon": True},
+ ): # pylint: disable=unused-argument, disable=W0201, disable=W0102, redefined-builtin
+ """Load the model checkpoint and setup for training or inference"""
+
+ state = self.xtts.get_compatible_checkpoint_state_dict(checkpoint_path)
+
+ # load the model weights
+ self.xtts.load_state_dict(state, strict=strict)
+
+ if eval:
+ self.xtts.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=False)
+ self.eval()
+ assert not self.training
+
+ @staticmethod
+ def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None):
+ """Initiate model from config
+
+ Args:
+ config (GPTTrainerConfig): Model config.
+ samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
+ Defaults to None.
+ """
+ return GPTTrainer(config)
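+
+
+# A minimal training sketch, assuming the coqui-ai `trainer` package API; the config values,
+# dataset definitions and output path below are placeholders, not part of this module.
+#
+#   from trainer import Trainer, TrainerArgs
+#   from TTS.tts.datasets import load_tts_samples
+#
+#   config = GPTTrainerConfig(...)  # model_args=GPTArgs(...), audio=XttsAudioConfig(...), datasets=[...]
+#   train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)
+#   model = GPTTrainer.init_from_config(config)
+#   trainer = Trainer(
+#       TrainerArgs(), config, output_path="run_dir", model=model,
+#       train_samples=train_samples, eval_samples=eval_samples,
+#   )
+#   trainer.fit()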
diff --git a/submodules/TTS/TTS/tts/layers/xtts/xtts_manager.py b/submodules/TTS/TTS/tts/layers/xtts/xtts_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e7d0f6c914fa3a4e706a5e28bbd745afcaa4d67
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/xtts_manager.py
@@ -0,0 +1,34 @@
+import torch
+
+class SpeakerManager():
+ def __init__(self, speaker_file_path=None):
+ self.speakers = torch.load(speaker_file_path)
+
+ @property
+ def name_to_id(self):
+ return self.speakers.keys()
+
+ @property
+ def num_speakers(self):
+ return len(self.name_to_id)
+
+ @property
+ def speaker_names(self):
+ return list(self.name_to_id.keys())
+
+
+class LanguageManager():
+ def __init__(self, config):
+ self.langs = config["languages"]
+
+ @property
+ def name_to_id(self):
+ return self.langs
+
+ @property
+ def num_languages(self):
+ return len(self.name_to_id)
+
+ @property
+ def language_names(self):
+ return list(self.name_to_id)
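+
+
+# Usage sketch (illustrative): `speakers_xtts.pth` stands for the speaker file shipped with an
+# XTTS checkpoint, and `config` for the loaded model config exposing config["languages"].
+#
+#   speaker_manager = SpeakerManager("speakers_xtts.pth")
+#   print(speaker_manager.num_speakers, speaker_manager.speaker_names[:3])
+#
+#   language_manager = LanguageManager(config)
+#   print(language_manager.num_languages, language_manager.language_names)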
diff --git a/submodules/TTS/TTS/tts/layers/xtts/zh_num2words.py b/submodules/TTS/TTS/tts/layers/xtts/zh_num2words.py
new file mode 100644
index 0000000000000000000000000000000000000000..e59ccb66309aaebf67f4db972fc83058421d5ed8
--- /dev/null
+++ b/submodules/TTS/TTS/tts/layers/xtts/zh_num2words.py
@@ -0,0 +1,1209 @@
+# Authors:
+# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
+# 2019.9 - 2022 Jiayu DU
+
+import argparse
+import csv
+import os
+import re
+import string
+import sys
+
+# fmt: off
+
+# ================================================================================ #
+# basic constant
+# ================================================================================ #
+CHINESE_DIGIS = "零一二三四五六七八九"
+BIG_CHINESE_DIGIS_SIMPLIFIED = "零壹贰叁肆伍陆柒捌玖"
+BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖"
+SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万"
+SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬"
+LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载"
+LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載"
+SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万"
+SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬"
+
+ZERO_ALT = "〇"
+ONE_ALT = "幺"
+TWO_ALTS = ["两", "兩"]
+
+POSITIVE = ["正", "正"]
+NEGATIVE = ["负", "負"]
+POINT = ["点", "點"]
+# PLUS = [u'加', u'加']
+# SIL = [u'杠', u'槓']
+
+FILLER_CHARS = ["呃", "啊"]
+
+ER_WHITELIST = (
+ "(儿女|儿子|儿孙|女儿|儿媳|妻儿|"
+ "胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|"
+ "儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|"
+ "佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)"
+)
+ER_WHITELIST_PATTERN = re.compile(ER_WHITELIST)
+
+# Chinese numbering-system types
+NUMBERING_TYPES = ["low", "mid", "high"]
+
+CURRENCY_NAMES = "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|" "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)"
+CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)"
+COM_QUANTIFIERS = (
+ "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|"
+ "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|"
+ "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
+ "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|"
+ "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|"
+ "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)"
+)
+
+
+# Punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git)
+CN_PUNCS_STOP = "！？｡。"
+CN_PUNCS_NONSTOP = "＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏·〈〉－"
+CN_PUNCS = CN_PUNCS_STOP + CN_PUNCS_NONSTOP
+
+PUNCS = CN_PUNCS + string.punctuation
+PUNCS_TRANSFORM = str.maketrans(PUNCS, "," * len(PUNCS), "") # replace puncs with English comma
+
+
+# https://zh.wikipedia.org/wiki/全行和半行
+# full-width (quanjiao) -> half-width (banjiao) mapping: the ideographic space plus the
+# full-width forms U+FF01-U+FF5E, which correspond one-to-one to ASCII '!'-'~'
+QJ2BJ = {"\u3000": " "}
+QJ2BJ.update({chr(0xFF01 + i): chr(0x21 + i) for i in range(0x7E - 0x21 + 1)})
+QJ2BJ_TRANSFORM = str.maketrans("".join(QJ2BJ.keys()), "".join(QJ2BJ.values()), "")
+
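+# For instance, quanjiao -> banjiao conversion via str.translate:
+#   "ＴＴＳ：１２３".translate(QJ2BJ_TRANSFORM) == "TTS:123"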
+
+# 2013 China National Standard: https://zh.wikipedia.org/wiki/通用规范汉字表, raw resources:
+# https://github.com/mozillazg/pinyin-data/blob/master/kMandarin_8105.txt with 8105 chinese chars in total
+CN_CHARS_COMMON = (
+ "一丁七万丈三上下不与丏丐丑专且丕世丘丙业丛东丝丞丢两严丧个丫中丰串临丸丹为主丽举"
+ "乂乃久么义之乌乍乎乏乐乒乓乔乖乘乙乜九乞也习乡书乩买乱乳乸乾了予争事二亍于亏云互"
+ "亓五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仂仃仄仅仆仇仉今介仍从"
+ "仑仓仔仕他仗付仙仝仞仟仡代令以仨仪仫们仰仲仳仵件价任份仿企伈伉伊伋伍伎伏伐休众优"
+ "伙会伛伞伟传伢伣伤伥伦伧伪伫伭伯估伲伴伶伸伺似伽伾佁佃但位低住佐佑体何佖佗佘余佚"
+ "佛作佝佞佟你佣佤佥佩佬佯佰佳佴佶佸佺佻佼佽佾使侁侂侃侄侈侉例侍侏侑侔侗侘供依侠侣"
+ "侥侦侧侨侩侪侬侮侯侴侵侹便促俄俅俊俍俎俏俐俑俗俘俙俚俜保俞俟信俣俦俨俩俪俫俭修俯"
+ "俱俳俵俶俸俺俾倌倍倏倒倓倔倕倘候倚倜倞借倡倥倦倧倨倩倪倬倭倮倴债倻值倾偁偃假偈偌"
+ "偎偏偓偕做停偡健偬偭偰偲偶偷偻偾偿傀傃傅傈傉傍傒傕傣傥傧储傩催傲傺傻僇僎像僔僖僚"
+ "僦僧僬僭僮僰僳僵僻儆儇儋儒儡儦儳儴儿兀允元兄充兆先光克免兑兔兕兖党兜兢入全八公六"
+ "兮兰共关兴兵其具典兹养兼兽冀冁内冈冉册再冏冒冔冕冗写军农冠冢冤冥冬冮冯冰冱冲决况"
+ "冶冷冻冼冽净凄准凇凉凋凌减凑凓凘凛凝几凡凤凫凭凯凰凳凶凸凹出击凼函凿刀刁刃分切刈"
+ "刊刍刎刑划刖列刘则刚创初删判刨利别刬刭刮到刳制刷券刹刺刻刽刿剀剁剂剃剅削剋剌前剐"
+ "剑剔剕剖剜剞剟剡剥剧剩剪副割剽剿劁劂劄劈劐劓力劝办功加务劢劣动助努劫劬劭励劲劳劼"
+ "劾势勃勇勉勋勍勐勒勔勖勘勚募勠勤勰勺勾勿匀包匆匈匍匏匐匕化北匙匜匝匠匡匣匦匪匮匹"
+ "区医匼匾匿十千卅升午卉半华协卑卒卓单卖南博卜卞卟占卡卢卣卤卦卧卫卬卮卯印危即却卵"
+ "卷卸卺卿厂厄厅历厉压厌厍厕厖厘厚厝原厢厣厥厦厨厩厮去厾县叁参叆叇又叉及友双反发叔"
+ "叕取受变叙叚叛叟叠口古句另叨叩只叫召叭叮可台叱史右叵叶号司叹叻叼叽吁吃各吆合吉吊"
+ "同名后吏吐向吒吓吕吖吗君吝吞吟吠吡吣否吧吨吩含听吭吮启吱吲吴吵吸吹吻吼吽吾呀呃呆"
+ "呇呈告呋呐呒呓呔呕呖呗员呙呛呜呢呣呤呦周呱呲味呵呶呷呸呻呼命咀咂咄咆咇咉咋和咍咎"
+ "咏咐咒咔咕咖咙咚咛咝咡咣咤咥咦咧咨咩咪咫咬咯咱咳咴咸咺咻咽咿哀品哂哃哄哆哇哈哉哌"
+ "响哎哏哐哑哒哓哔哕哗哙哚哝哞哟哢哥哦哧哨哩哪哭哮哱哲哳哺哼哽哿唁唆唇唉唏唐唑唔唛"
+ "唝唠唢唣唤唧唪唬售唯唰唱唳唵唷唼唾唿啁啃啄商啉啊啐啕啖啜啡啤啥啦啧啪啫啬啭啮啰啴"
+ "啵啶啷啸啻啼啾喀喁喂喃善喆喇喈喉喊喋喏喑喔喘喙喜喝喟喤喧喱喳喵喷喹喻喽喾嗄嗅嗉嗌"
+ "嗍嗐嗑嗒嗓嗔嗖嗜嗝嗞嗟嗡嗣嗤嗥嗦嗨嗪嗫嗬嗯嗲嗳嗵嗷嗽嗾嘀嘁嘈嘉嘌嘎嘏嘘嘚嘛嘞嘟嘡"
+ "嘣嘤嘧嘬嘭嘱嘲嘴嘶嘹嘻嘿噀噂噇噌噍噎噔噗噘噙噜噢噤器噩噪噫噬噱噶噻噼嚄嚅嚆嚎嚏嚓"
+ "嚚嚣嚭嚯嚷嚼囊囔囚四回囟因囡团囤囫园困囱围囵囷囹固国图囿圃圄圆圈圉圊圌圐圙圜土圢"
+ "圣在圩圪圫圬圭圮圯地圲圳圹场圻圾址坂均坉坊坋坌坍坎坏坐坑坒块坚坛坜坝坞坟坠坡坤坥"
+ "坦坨坩坪坫坬坭坯坰坳坷坻坼坽垂垃垄垆垈型垌垍垎垏垒垓垕垙垚垛垞垟垠垡垢垣垤垦垧垩"
+ "垫垭垮垯垱垲垴垵垸垺垾垿埂埃埆埇埋埌城埏埒埔埕埗埘埙埚埝域埠埤埪埫埭埯埴埵埸培基"
+ "埼埽堂堃堆堇堉堋堌堍堎堐堑堕堙堞堠堡堤堧堨堪堰堲堵堼堽堾塄塅塆塌塍塑塔塘塝塞塥填"
+ "塬塱塾墀墁境墅墈墉墐墒墓墕墘墙墚增墟墡墣墦墨墩墼壁壅壑壕壤士壬壮声壳壶壸壹处备复"
+ "夏夐夔夕外夙多夜够夤夥大天太夫夬夭央夯失头夷夸夹夺夼奁奂奄奇奈奉奋奎奏契奓奔奕奖"
+ "套奘奚奠奡奢奥奭女奴奶奸她好妁如妃妄妆妇妈妊妍妒妓妖妗妘妙妞妣妤妥妧妨妩妪妫妭妮"
+ "妯妲妹妻妾姆姈姊始姐姑姒姓委姗姘姚姜姝姞姣姤姥姨姬姮姱姶姹姻姽姿娀威娃娄娅娆娇娈"
+ "娉娌娑娓娘娜娟娠娣娥娩娱娲娴娵娶娼婀婆婉婊婌婍婕婘婚婞婠婢婤婧婪婫婳婴婵婶婷婺婻"
+ "婼婿媂媄媆媒媓媖媚媛媞媪媭媱媲媳媵媸媾嫁嫂嫄嫉嫌嫒嫔嫕嫖嫘嫚嫜嫠嫡嫣嫦嫩嫪嫫嫭嫱"
+ "嫽嬉嬖嬗嬛嬥嬬嬴嬷嬿孀孅子孑孓孔孕孖字存孙孚孛孜孝孟孢季孤孥学孩孪孬孰孱孳孵孺孽"
+ "宁它宄宅宇守安宋完宏宓宕宗官宙定宛宜宝实宠审客宣室宥宦宧宪宫宬宰害宴宵家宸容宽宾"
+ "宿寁寂寄寅密寇富寐寒寓寝寞察寡寤寥寨寮寰寸对寺寻导寿封射将尉尊小少尔尕尖尘尚尜尝"
+ "尢尤尥尧尨尪尬就尴尸尹尺尻尼尽尾尿局屁层屃居屈屉届屋屎屏屐屑展屙属屠屡屣履屦屯山"
+ "屹屺屼屾屿岁岂岈岊岌岍岐岑岔岖岗岘岙岚岛岜岞岠岢岣岨岩岫岬岭岱岳岵岷岸岽岿峁峂峃"
+ "峄峋峒峗峘峙峛峡峣峤峥峦峧峨峪峭峰峱峻峿崀崁崂崃崄崆崇崌崎崒崔崖崚崛崞崟崡崤崦崧"
+ "崩崭崮崴崶崽崾崿嵁嵅嵇嵊嵋嵌嵎嵖嵘嵚嵛嵝嵩嵫嵬嵯嵲嵴嶂嶅嶍嶒嶓嶙嶝嶟嶦嶲嶷巅巇巉"
+ "巍川州巡巢工左巧巨巩巫差巯己已巳巴巷巽巾币市布帅帆师希帏帐帑帔帕帖帘帙帚帛帜帝帡"
+ "带帧帨席帮帱帷常帻帼帽幂幄幅幌幔幕幖幛幞幡幢幪干平年并幸幺幻幼幽广庄庆庇床庋序庐"
+ "庑库应底庖店庙庚府庞废庠庤庥度座庭庱庳庵庶康庸庹庼庾廆廉廊廋廑廒廓廖廙廛廨廪延廷"
+ "建廿开弁异弃弄弆弇弈弊弋式弑弓引弗弘弛弟张弢弥弦弧弨弩弭弯弱弶弸弹强弼彀归当录彖"
+ "彗彘彝彟形彤彦彧彩彪彬彭彰影彳彷役彻彼往征徂径待徇很徉徊律徐徒徕得徘徙徛徜御徨循"
+ "徭微徵德徼徽心必忆忉忌忍忏忐忑忒忖志忘忙忝忞忠忡忤忧忪快忭忮忱忳念忸忺忻忽忾忿怀"
+ "态怂怃怄怅怆怊怍怎怏怒怔怕怖怙怛怜思怠怡急怦性怨怩怪怫怯怵总怼怿恁恂恃恋恍恐恒恓"
+ "恔恕恙恚恝恢恣恤恧恨恩恪恫恬恭息恰恳恶恸恹恺恻恼恽恿悃悄悆悈悉悌悍悒悔悖悚悛悝悟"
+ "悠悢患悦您悫悬悭悯悰悱悲悴悸悻悼情惆惇惊惋惎惑惔惕惘惙惚惛惜惝惟惠惦惧惨惩惫惬惭"
+ "惮惯惰想惴惶惹惺愀愁愃愆愈愉愍愎意愐愔愕愚感愠愣愤愦愧愫愭愿慆慈慊慌慎慑慕慝慢慥"
+ "慧慨慬慭慰慵慷憋憎憔憕憙憧憨憩憬憭憷憺憾懂懈懊懋懑懒懔懦懵懿戆戈戊戋戌戍戎戏成我"
+ "戒戕或戗战戚戛戟戡戢戣戤戥截戬戭戮戳戴户戽戾房所扁扂扃扅扆扇扈扉扊手才扎扑扒打扔"
+ "托扛扞扣扦执扩扪扫扬扭扮扯扰扳扶批扺扼扽找承技抃抄抉把抑抒抓抔投抖抗折抚抛抟抠抡"
+ "抢护报抨披抬抱抵抹抻押抽抿拂拃拄担拆拇拈拉拊拌拍拎拐拒拓拔拖拗拘拙招拜拟拢拣拤拥"
+ "拦拧拨择括拭拮拯拱拳拴拶拷拼拽拾拿持挂指挈按挎挑挓挖挚挛挝挞挟挠挡挣挤挥挦挨挪挫"
+ "振挲挹挺挽捂捃捅捆捉捋捌捍捎捏捐捕捞损捡换捣捧捩捭据捯捶捷捺捻捽掀掂掇授掉掊掌掎"
+ "掏掐排掖掘掞掠探掣接控推掩措掬掭掮掰掳掴掷掸掺掼掾揄揆揉揍描提插揕揖揠握揣揩揪揭"
+ "揳援揶揸揽揿搀搁搂搅搋搌搏搐搒搓搔搛搜搞搠搡搦搪搬搭搴携搽摁摄摅摆摇摈摊摏摒摔摘"
+ "摛摞摧摩摭摴摸摹摽撂撄撅撇撑撒撕撖撙撞撤撩撬播撮撰撵撷撸撺撼擀擂擅操擎擐擒擘擞擢"
+ "擤擦擿攀攉攒攘攥攫攮支收攸改攻攽放政故效敉敌敏救敔敕敖教敛敝敞敢散敦敩敫敬数敲整"
+ "敷文斋斌斐斑斓斗料斛斜斝斟斠斡斤斥斧斩斫断斯新斶方於施旁旃旄旅旆旋旌旎族旐旒旖旗"
+ "旞无既日旦旧旨早旬旭旮旯旰旱旴旵时旷旸旺旻旿昀昂昃昄昆昇昈昉昊昌明昏昒易昔昕昙昝"
+ "星映昡昣昤春昧昨昪昫昭是昱昳昴昵昶昺昼昽显晁晃晅晊晋晌晏晐晒晓晔晕晖晗晙晚晞晟晡"
+ "晢晤晦晨晪晫普景晰晱晴晶晷智晾暂暄暅暇暌暑暕暖暗暝暧暨暮暲暴暵暶暹暾暿曈曌曙曛曜"
+ "曝曦曩曰曲曳更曷曹曼曾替最月有朋服朏朐朓朔朕朗望朝期朦木未末本札术朱朳朴朵朸机朽"
+ "杀杂权杄杆杈杉杌李杏材村杓杕杖杙杜杞束杠条来杧杨杩杪杭杯杰杲杳杵杷杻杼松板极构枅"
+ "枇枉枋枍析枕林枘枚果枝枞枢枣枥枧枨枪枫枭枯枰枲枳枵架枷枸枹柁柃柄柈柊柏某柑柒染柔"
+ "柖柘柙柚柜柝柞柠柢查柩柬柯柰柱柳柴柷柽柿栀栅标栈栉栊栋栌栎栏栐树栒栓栖栗栝栟校栩"
+ "株栲栳栴样核根栻格栽栾桀桁桂桃桄桅框案桉桊桌桎桐桑桓桔桕桠桡桢档桤桥桦桧桨桩桫桯"
+ "桲桴桶桷桹梁梃梅梆梌梏梓梗梠梢梣梦梧梨梭梯械梳梴梵梼梽梾梿检棁棂棉棋棍棐棒棓棕棘"
+ "棚棠棣棤棨棪棫棬森棰棱棵棹棺棻棼棽椀椁椅椆椋植椎椐椑椒椓椟椠椤椪椭椰椴椸椹椽椿楂"
+ "楒楔楗楙楚楝楞楠楣楦楩楪楫楮楯楷楸楹楼概榃榄榅榆榇榈榉榍榑榔榕榖榛榜榧榨榫榭榰榱"
+ "榴榷榻槁槃槊槌槎槐槔槚槛槜槟槠槭槱槲槽槿樊樗樘樟模樨横樯樱樵樽樾橄橇橐橑橘橙橛橞"
+ "橡橥橦橱橹橼檀檄檎檐檑檗檞檠檩檫檬櫆欂欠次欢欣欤欧欲欸欹欺欻款歃歅歆歇歉歌歙止正"
+ "此步武歧歪歹死歼殁殂殃殄殆殇殉殊残殍殒殓殖殚殛殡殣殪殳殴段殷殿毁毂毅毋毌母每毐毒"
+ "毓比毕毖毗毙毛毡毪毫毯毳毵毹毽氅氆氇氍氏氐民氓气氕氖氘氙氚氛氟氡氢氤氦氧氨氩氪氮"
+ "氯氰氲水永氾氿汀汁求汆汇汈汉汊汋汐汔汕汗汛汜汝汞江池污汤汧汨汩汪汫汭汰汲汴汶汹汽"
+ "汾沁沂沃沄沅沆沇沈沉沌沏沐沓沔沘沙沚沛沟没沣沤沥沦沧沨沩沪沫沭沮沱河沸油沺治沼沽"
+ "沾沿泂泃泄泅泇泉泊泌泐泓泔法泖泗泙泚泛泜泞泠泡波泣泥注泪泫泮泯泰泱泳泵泷泸泺泻泼"
+ "泽泾洁洄洇洈洋洌洎洑洒洓洗洘洙洚洛洞洢洣津洧洨洪洫洭洮洱洲洳洴洵洸洹洺活洼洽派洿"
+ "流浃浅浆浇浈浉浊测浍济浏浐浑浒浓浔浕浙浚浛浜浞浟浠浡浣浥浦浩浪浬浭浮浯浰浲浴海浸"
+ "浼涂涄涅消涉涌涍涎涐涑涓涔涕涘涛涝涞涟涠涡涢涣涤润涧涨涩涪涫涮涯液涴涵涸涿淀淄淅"
+ "淆淇淋淌淏淑淖淘淙淜淝淞淟淠淡淤淦淫淬淮淯深淳淴混淹添淼清渊渌渍渎渐渑渔渗渚渝渟"
+ "渠渡渣渤渥温渫渭港渰渲渴游渺渼湃湄湉湍湎湑湓湔湖湘湛湜湝湟湣湫湮湲湴湾湿溁溃溅溆"
+ "溇溉溍溏源溘溚溜溞溟溠溢溥溦溧溪溯溱溲溴溵溶溷溹溺溻溽滁滂滃滆滇滉滋滍滏滑滓滔滕"
+ "滗滘滚滞滟滠满滢滤滥滦滧滨滩滪滫滴滹漂漆漈漉漋漏漓演漕漖漠漤漦漩漪漫漭漯漱漳漴漶"
+ "漷漹漻漼漾潆潇潋潍潏潖潘潜潞潟潢潦潩潭潮潲潴潵潸潺潼潽潾澂澄澈澉澌澍澎澛澜澡澥澧"
+ "澪澭澳澴澶澹澼澽激濂濉濋濑濒濞濠濡濩濮濯瀌瀍瀑瀔瀚瀛瀣瀱瀵瀹瀼灈灌灏灞火灭灯灰灵"
+ "灶灸灼灾灿炀炅炆炉炊炌炎炒炔炕炖炘炙炜炝炟炣炫炬炭炮炯炱炳炷炸点炻炼炽烀烁烂烃烈"
+ "烊烔烘烙烛烜烝烟烠烤烦烧烨烩烫烬热烯烶烷烹烺烻烽焆焉焊焌焐焓焕焖焗焘焙焚焜焞焦焯"
+ "焰焱然煁煃煅煊煋煌煎煓煜煞煟煤煦照煨煮煲煳煴煸煺煽熄熇熊熏熔熘熙熛熜熟熠熥熨熬熵"
+ "熹熻燃燊燋燎燏燔燕燚燠燥燧燮燹爆爇爔爚爝爟爨爪爬爰爱爵父爷爸爹爻爽爿牁牂片版牌牍"
+ "牒牖牙牚牛牝牟牡牢牤牥牦牧物牮牯牲牵特牺牻牾牿犀犁犄犇犊犋犍犏犒犟犨犬犯犰犴状犷"
+ "犸犹狁狂狃狄狈狉狍狎狐狒狗狙狝狞狠狡狨狩独狭狮狯狰狱狲狳狴狷狸狺狻狼猁猃猄猇猊猎"
+ "猕猖猗猛猜猝猞猡猢猥猩猪猫猬献猯猰猱猴猷猹猺猾猿獍獐獒獗獠獬獭獯獴獾玃玄率玉王玎"
+ "玑玒玓玕玖玘玙玚玛玞玟玠玡玢玤玥玦玩玫玭玮环现玱玲玳玶玷玹玺玻玼玿珀珂珅珇珈珉珊"
+ "珋珌珍珏珐珑珒珕珖珙珛珝珞珠珢珣珥珦珧珩珪珫班珰珲珵珷珸珹珺珽琀球琄琅理琇琈琉琊"
+ "琎琏琐琔琚琛琟琡琢琤琥琦琨琪琫琬琭琮琯琰琲琳琴琵琶琼瑀瑁瑂瑃瑄瑅瑆瑑瑓瑔瑕瑖瑗瑙"
+ "瑚瑛瑜瑝瑞瑟瑢瑧瑨瑬瑭瑰瑱瑳瑶瑷瑾璀璁璃璆璇璈璋璎璐璒璘璜璞璟璠璥璧璨璩璪璬璮璱"
+ "璲璺瓀瓒瓖瓘瓜瓞瓠瓢瓣瓤瓦瓮瓯瓴瓶瓷瓻瓿甄甍甏甑甓甗甘甚甜生甡甥甦用甩甪甫甬甭甯"
+ "田由甲申电男甸町画甾畀畅畈畋界畎畏畔畖留畚畛畜畤略畦番畬畯畲畴畸畹畿疁疃疆疍疏疐"
+ "疑疔疖疗疙疚疝疟疠疡疢疣疤疥疫疬疭疮疯疰疱疲疳疴疵疸疹疼疽疾痂痃痄病症痈痉痊痍痒"
+ "痓痔痕痘痛痞痢痣痤痦痧痨痪痫痰痱痴痹痼痿瘀瘁瘃瘅瘆瘊瘌瘐瘕瘗瘘瘙瘛瘟瘠瘢瘤瘥瘦瘩"
+ "瘪瘫瘭瘰瘳瘴瘵瘸瘼瘾瘿癀癃癌癍癔癖癗癜癞癣癫癯癸登白百癿皂的皆皇皈皋皎皑皓皕皖皙"
+ "皛皞皤皦皭皮皱皲皴皿盂盅盆盈盉益盍盎盏盐监盒盔盖盗盘盛盟盥盦目盯盱盲直盷相盹盼盾"
+ "省眄眇眈眉眊看眍眙眚真眠眢眦眨眩眬眭眯眵眶眷眸眺眼着睁睃睄睇睎睐睑睚睛睡睢督睥睦"
+ "睨睫睬睹睽睾睿瞀瞄瞅瞋瞌瞍瞎瞑瞒瞟瞠瞢瞥瞧瞩瞪瞫瞬瞭瞰瞳瞵瞻瞽瞿矍矗矛矜矞矢矣知"
+ "矧矩矫矬短矮矰石矶矸矻矼矾矿砀码砂砄砆砉砌砍砑砒研砖砗砘砚砜砝砟砠砣砥砧砫砬砭砮"
+ "砰破砵砷砸砹砺砻砼砾础硁硅硇硊硌硍硎硐硒硔硕硖硗硙硚硝硪硫硬硭确硼硿碃碇碈碉碌碍"
+ "碎碏碑碓碗碘碚碛碜碟碡碣碥碧碨碰碱碲碳碴碶碹碾磁磅磉磊磋磏磐磔磕磙磜磡磨磬磲磴磷"
+ "磹磻礁礅礌礓礞礴礵示礼社祀祁祃祆祇祈祉祊祋祎祏祐祓祕祖祗祚祛祜祝神祟祠祢祥祧票祭"
+ "祯祲祷祸祺祼祾禀禁禄禅禊禋福禒禔禘禚禛禤禧禳禹禺离禽禾秀私秃秆秉秋种科秒秕秘租秣"
+ "秤秦秧秩秫秬秭积称秸移秽秾稀稂稃稆程稌稍税稑稔稗稙稚稞稠稣稳稷稹稻稼稽稿穄穆穑穗"
+ "穙穜穟穰穴究穷穸穹空穿窀突窃窄窅窈窊窍窎窑窒窕窖窗窘窜窝窟窠窣窥窦窨窬窭窳窸窿立"
+ "竑竖竘站竞竟章竣童竦竫竭端竹竺竽竿笃笄笆笈笊笋笏笑笔笕笙笛笞笠笤笥符笨笪笫第笮笯"
+ "笱笳笸笺笼笾筀筅筇等筋筌筏筐筑筒答策筘筚筛筜筝筠筢筤筥筦筮筱筲筵筶筷筹筻筼签简箅"
+ "箍箐箓箔箕箖算箜管箢箦箧箨箩箪箫箬箭箱箴箸篁篆篇篌篑篓篙篚篝篡篥篦篪篮篯篱篷篼篾"
+ "簃簇簉簋簌簏簕簖簝簟簠簧簪簰簸簿籀籁籍籥米籴类籼籽粉粑粒粕粗粘粜粝粞粟粢粤粥粪粮"
+ "粱粲粳粹粼粽精粿糁糅糇糈糊糌糍糒糕糖糗糙糜糟糠糨糯糵系紊素索紧紫累絜絮絷綦綮縠縢"
+ "縻繁繄繇纂纛纠纡红纣纤纥约级纨纩纪纫纬纭纮纯纰纱纲纳纴纵纶纷纸纹纺纻纼纽纾线绀绁"
+ "绂练组绅细织终绉绊绋绌绍绎经绐绑绒结绔绕绖绗绘给绚绛络绝绞统绠绡绢绣绤绥绦继绨绩"
+ "绪绫续绮绯绰绱绲绳维绵绶绷绸绹绺绻综绽绾绿缀缁缂缃缄缅缆缇缈缉缊缌缎缐缑缒缓缔缕"
+ "编缗缘缙缚缛缜缝缞缟缠缡缢缣缤缥缦缧缨缩缪缫缬缭缮缯缰缱缲缳缴缵缶缸缺罂罄罅罍罐"
+ "网罔罕罗罘罚罟罡罢罨罩罪置罱署罴罶罹罽罾羁羊羌美羑羓羔羕羖羚羝羞羟羡群羧羯羰羱羲"
+ "羸羹羼羽羿翀翁翂翃翅翈翊翌翎翔翕翘翙翚翛翟翠翡翥翦翩翮翯翰翱翳翷翻翼翾耀老考耄者"
+ "耆耇耋而耍耏耐耑耒耔耕耖耗耘耙耜耠耢耤耥耦耧耨耩耪耰耱耳耵耶耷耸耻耽耿聂聃聆聊聋"
+ "职聍聒联聘聚聩聪聱聿肃肄肆肇肉肋肌肓肖肘肚肛肝肟肠股肢肤肥肩肪肫肭肮肯肱育肴肷肸"
+ "肺肼肽肾肿胀胁胂胃胄胆胈背胍胎胖胗胙胚胛胜胝胞胠胡胣胤胥胧胨胩胪胫胬胭胯胰胱胲胳"
+ "胴胶胸胺胼能脂脆脉脊脍脎脏脐脑脒脓脔脖脘脚脞脟脩脬脯脱脲脶脸脾脿腆腈腊腋腌腐腑腒"
+ "腓腔腕腘腙腚腠腥腧腨腩腭腮腯腰腱腴腹腺腻腼腽腾腿膀膂膈膊膏膑膘膙膛膜膝膦膨膳膺膻"
+ "臀臂臃臆臊臌臑臜臣臧自臬臭至致臻臼臾舀舁舂舄舅舆舌舍舐舒舔舛舜舞舟舠舢舣舥航舫般"
+ "舭舯舰舱舲舳舴舵舶舷舸船舻舾艄艅艇艉艋艎艏艘艚艟艨艮良艰色艳艴艺艽艾艿节芃芄芈芊"
+ "芋芍芎芏芑芒芗芘芙芜芝芟芠芡芣芤芥芦芨芩芪芫芬芭芮芯芰花芳芴芷芸芹芼芽芾苁苄苇苈"
+ "苉苊苋苌苍苎苏苑苒苓苔苕苗苘苛苜苞苟苠苡苣苤若苦苧苫苯英苴苷苹苻苾茀茁茂范茄茅茆"
+ "茈茉茋茌茎茏茑茓茔茕茗茚茛茜茝茧茨茫茬茭茯茱茳茴茵茶茸茹茺茼茽荀荁荃荄荆荇草荏荐"
+ "荑荒荓荔荖荙荚荛荜荞荟荠荡荣荤荥荦荧荨荩荪荫荬荭荮药荷荸荻荼荽莅莆莉莎莒莓莘莙莛"
+ "莜莝莞莠莨莩莪莫莰莱莲莳莴莶获莸莹莺莼莽莿菀菁菂菅菇菉菊菌菍菏菔菖菘菜菝菟菠菡菥"
+ "菩菪菰菱菲菹菼菽萁萃萄萆萋萌萍萎萏萑萘萚萜萝萣萤营萦萧萨萩萱萳萸萹萼落葆葎葑葖著"
+ "葙葚葛葜葡董葩葫葬葭葰葱葳葴葵葶葸葺蒂蒄蒇蒈蒉蒋蒌蒎蒐蒗蒙蒜蒟蒡蒨蒯蒱蒲蒴蒸蒹蒺"
+ "蒻蒽蒿蓁蓂蓄蓇蓉蓊蓍蓏蓐蓑蓓蓖蓝蓟蓠蓢蓣蓥蓦蓬蓰蓼蓿蔀蔃蔈蔊蔌蔑蔓蔗蔚蔟蔡蔫蔬蔷"
+ "蔸蔹蔺蔻蔼蔽蕃蕈蕉蕊蕖蕗蕙蕞蕤蕨蕰蕲蕴蕹蕺蕻蕾薁薄薅薇薏薛薜薢薤薨薪薮薯薰薳薷薸"
+ "薹薿藁藉藏藐藓藕藜藟藠藤藦藨藩藻藿蘅蘑蘖蘘蘧蘩蘸蘼虎虏虐虑虒虓虔虚虞虢虤虫虬虮虱"
+ "虷虸虹虺虻虼虽虾虿蚀蚁蚂蚄蚆蚊蚋蚌蚍蚓蚕蚜蚝蚣蚤蚧蚨蚩蚪蚬蚯蚰蚱蚲蚴蚶蚺蛀蛃蛄蛆"
+ "蛇蛉蛊蛋蛎蛏蛐蛑蛔蛘蛙蛛蛞蛟蛤蛩蛭蛮蛰蛱蛲蛳蛴蛸蛹蛾蜀蜂蜃蜇蜈蜉蜊蜍蜎蜐蜒蜓蜕蜗"
+ "蜘蜚蜜蜞蜡蜢蜣蜥蜩蜮蜱蜴蜷蜻蜾蜿蝇蝈蝉蝌蝎蝓蝗蝘蝙蝠蝣蝤蝥蝮蝰蝲蝴蝶蝻蝼蝽蝾螂螃"
+ "螅螈螋融螗螟螠螣螨螫螬螭螯螱螳螵螺螽蟀蟆蟊蟋蟏蟑蟒蟛蟠蟥蟪蟫蟮蟹蟾蠃蠊蠋蠓蠕蠖蠡"
+ "蠢蠲蠹蠼血衃衄衅行衍衎衒衔街衙衠衡衢衣补表衩衫衬衮衰衲衷衽衾衿袁袂袄袅袆袈袋袍袒"
+ "袖袗袜袢袤袪被袭袯袱袷袼裁裂装裆裈裉裎裒裔裕裘裙裛裟裢裣裤裥裨裰裱裳裴裸裹裼裾褂"
+ "褊褐褒褓褕褙褚褛褟褡褥褪褫褯褰褴褶襁襄襕襚襜襞襟襦襫襻西要覃覆见观觃规觅视觇览觉"
+ "觊觋觌觎觏觐觑角觖觚觜觞觟解觥触觫觭觯觱觳觿言訄訇訚訾詈詟詹誉誊誓謇警譬计订讣认"
+ "讥讦讧讨让讪讫训议讯记讱讲讳讴讵讶讷许讹论讻讼讽设访诀证诂诃评诅识诇诈诉诊诋诌词"
+ "诎诏诐译诒诓诔试诖诗诘诙诚诛诜话诞诟诠诡询诣诤该详诧诨诩诫诬语诮误诰诱诲诳说诵请"
+ "诸诹诺读诼诽课诿谀谁谂调谄谅谆谇谈谊谋谌谍谎谏谐谑谒谓谔谕谖谗谙谚谛谜谝谞谟谠谡"
+ "谢谣谤谥谦谧谨谩谪谫谬谭谮谯谰谱谲谳谴谵谶谷谼谿豁豆豇豉豌豕豚象豢豨豪豫豮豳豸豹"
+ "豺貂貅貆貉貊貌貔貘贝贞负贡财责贤败账货质贩贪贫贬购贮贯贰贱贲贳贴贵贶贷贸费贺贻贼"
+ "贽贾贿赀赁赂赃资赅赆赇赈赉赊赋赌赍赎赏赐赑赒赓赔赕赖赗赘赙赚赛赜赝赞赟赠赡赢赣赤"
+ "赦赧赪赫赭走赳赴赵赶起趁趄超越趋趑趔趟趣趯趱足趴趵趸趺趼趾趿跂跃跄跆跋跌跎跏跐跑"
+ "跖跗跚跛距跞跟跣跤跨跪跬路跱跳践跶跷跸跹跺跻跽踅踉踊踌踏踒踔踝踞踟踢踣踦踩踪踬踮"
+ "踯踱踵踶踹踺踽蹀蹁蹂蹄蹅蹇蹈蹉蹊蹋蹐蹑蹒蹙蹚蹜蹢蹦蹩蹬蹭蹯蹰蹲蹴蹶蹼蹽蹾蹿躁躅躇"
+ "躏躐躔躜躞身躬躯躲躺车轧轨轩轪轫转轭轮软轰轱轲轳轴轵轶轷轸轹轺轻轼载轾轿辀辁辂较"
+ "辄辅辆辇辈辉辊辋辌辍辎辏辐辑辒输辔辕辖辗辘辙辚辛辜辞辟辣辨辩辫辰辱边辽达辿迁迂迄"
+ "迅过迈迎运近迓返迕还这进远违连迟迢迤迥迦迨迩迪迫迭迮述迳迷迸迹迺追退送适逃逄逅逆"
+ "选逊逋逍透逐逑递途逖逗通逛逝逞速造逡逢逦逭逮逯逴逵逶逸逻逼逾遁遂遄遆遇遍遏遐遑遒"
+ "道遗遘遛遢遣遥遨遭遮遴遵遹遽避邀邂邃邈邋邑邓邕邗邘邙邛邝邠邡邢那邦邨邪邬邮邯邰邱"
+ "邲邳邴邵邶邸邹邺邻邽邾邿郁郃郄郅郇郈郊郎郏郐郑郓郗郚郛郜郝郡郢郤郦郧部郪郫郭郯郴"
+ "郸都郾郿鄀鄂鄃鄄鄅鄌鄑鄗鄘鄙鄚鄜鄞鄠鄢鄣鄫鄯鄱鄹酂酃酅酆酉酊酋酌配酎酏酐酒酗酚酝"
+ "酞酡酢酣酤酥酦酩酪酬酮酯酰酱酲酴酵酶酷酸酹酺酽酾酿醅醇醉醋醌醍醐醑醒醚醛醢醨醪醭"
+ "醮醯醴醵醺醾采釉释里重野量釐金釜鉴銎銮鋆鋈錾鍪鎏鏊鏖鐾鑫钆钇针钉钊钋钌钍钎钏钐钒"
+ "钓钔钕钖钗钘钙钚钛钜钝钞钟钠钡钢钣钤钥钦钧钨钩钪钫钬钭钮钯钰钱钲钳钴钵钷钹钺钻钼"
+ "钽钾钿铀铁铂铃铄铅铆铈铉铊铋铌铍铎铏铐铑铒铕铖铗铘铙铚铛铜铝铞铟铠铡铢铣铤铥铧铨"
+ "铩铪铫铬铭铮铯铰铱铲铳铴铵银铷铸铹铺铻铼铽链铿销锁锂锃锄锅锆锇锈锉锊锋锌锍锎锏锐"
+ "锑锒锓锔锕锖锗锘错锚锛锜锝锞锟锡锢锣锤锥锦锧锨锩锪锫锬锭键锯锰锱锲锳锴锵锶锷锸锹"
+ "锺锻锼锽锾锿镀镁镂镃镄镅镆镇镈镉镊镋镌镍镎镏镐镑镒镓镔镕镖镗镘镚镛镜镝镞镠镡镢镣"
+ "镤镥镦镧镨镩镪镫镬镭镮镯镰镱镲镳镴镵镶长门闩闪闫闭问闯闰闱闲闳间闵闶闷闸闹闺闻闼"
+ "闽闾闿阀阁阂阃阄阅阆阇阈阉阊阋阌阍阎阏阐阑阒阔阕阖阗阘阙阚阜队阡阪阮阱防阳阴阵阶"
+ "阻阼阽阿陀陂附际陆陇陈陉陋陌降陎限陑陔陕陛陞陟陡院除陧陨险陪陬陲陴陵陶陷隃隅隆隈"
+ "隋隍随隐隔隗隘隙障隧隩隰隳隶隹隺隼隽难雀雁雄雅集雇雉雊雌雍雎雏雒雕雠雨雩雪雯雱雳"
+ "零雷雹雾需霁霄霅霆震霈霉霍霎霏霓霖霜霞霨霪霭霰露霸霹霾青靓靖静靛非靠靡面靥革靬靰"
+ "靳靴靶靸靺靼靽靿鞁鞅鞋鞍鞑鞒鞔鞘鞠鞡鞣鞧鞨鞫鞬鞭鞮鞯鞲鞳鞴韂韦韧韨韩韪韫韬韭音韵"
+ "韶页顶顷顸项顺须顼顽顾顿颀颁颂颃预颅领颇颈颉颊颋颌颍颎颏颐频颓颔颖颗题颙颚颛颜额"
+ "颞颟颠颡颢颤颥颦颧风飏飐飑飒飓飔飕飗飘飙飞食飧飨餍餐餮饔饕饥饧饨饩饪饫饬饭饮饯饰"
+ "饱饲饳饴饵饶饷饸饹饺饻饼饽饿馁馃馄馅馆馇馈馉馊馋馌馍馏馐馑馒馓馔馕首馗馘香馝馞馥"
+ "馧馨马驭驮驯驰驱驲驳驴驵驶驷驸驹驺驻驼驽驾驿骀骁骂骃骄骅骆骇骈骉骊骋验骍骎骏骐骑"
+ "骒骓骕骖骗骘骙骚骛骜骝骞骟骠骡骢骣骤骥骦骧骨骰骱骶骷骸骺骼髀髁髂髃髅髋髌髎髑髓高"
+ "髡髢髦髫髭髯髹髻髽鬃鬈鬏鬒鬓鬘鬟鬣鬯鬲鬶鬷鬻鬼魁魂魃魄魅魆魇魈魉魋魍魏魑魔鱼鱽鱾"
+ "鱿鲀鲁鲂鲃鲅鲆鲇鲈鲉鲊鲋鲌鲍鲎鲏鲐鲑鲒鲔鲕鲖鲗鲘鲙鲚鲛鲜鲝鲞鲟鲠鲡鲢鲣鲤鲥鲦鲧鲨"
+ "鲩鲪鲫鲬鲭鲮鲯鲰鲱鲲鲳鲴鲵鲷鲸鲹鲺鲻鲼鲽鲾鲿鳀鳁鳂鳃鳄鳅鳇鳈鳉鳊鳌鳍鳎鳏鳐鳑鳒鳓"
+ "鳔鳕鳖鳗鳘鳙鳚鳛鳜鳝鳞鳟鳠鳡鳢鳣鳤鸟鸠鸡鸢鸣鸤鸥鸦鸧鸨鸩鸪鸫鸬鸭鸮鸯鸰鸱鸲鸳鸵鸶"
+ "鸷鸸鸹鸺鸻鸼鸽鸾鸿鹀鹁鹂鹃鹄鹅鹆鹇鹈鹉鹊鹋鹌鹍鹎鹏鹐鹑鹒鹔鹕鹖鹗鹘鹙鹚鹛鹜鹝鹞鹟"
+ "鹠鹡鹢鹣鹤鹦鹧鹨鹩鹪鹫鹬鹭鹮鹯鹰鹱鹲鹳鹴鹾鹿麀麂麇麈麋麑麒麓麖麝麟麦麸麹麻麽麾黄"
+ "黇黉黍黎黏黑黔默黛黜黝黟黠黡黢黥黧黩黪黯黹黻黼黾鼋鼍鼎鼐鼒鼓鼗鼙鼠鼢鼩鼫鼬鼯鼱鼷"
+ "鼹鼻鼽鼾齁齇齉齐齑齿龀龁龂龃龄龅龆龇龈龉龊龋龌龙龚龛龟龠龢鿍鿎鿏㑇㑊㕮㘎㙍㙘㙦㛃"
+ "㛚㛹㟃㠇㠓㤘㥄㧐㧑㧟㫰㬊㬎㬚㭎㭕㮾㰀㳇㳘㳚㴔㵐㶲㸆㸌㺄㻬㽏㿠䁖䂮䃅䃎䅟䌹䎃䎖䏝䏡"
+ "䏲䐃䓖䓛䓨䓫䓬䗖䗛䗪䗴䜣䝙䢺䢼䣘䥽䦃䲟䲠䲢䴓䴔䴕䴖䴗䴘䴙䶮𠅤𠙶𠳐𡎚𡐓𣗋𣲗𣲘𣸣𤧛𤩽"
+ "𤫉𥔲𥕢𥖨𥻗𦈡𦒍𦙶𦝼𦭜𦰡𧿹𨐈𨙸𨚕𨟠𨭉𨱇𨱏𨱑𨱔𨺙𩽾𩾃𩾌𪟝𪣻𪤗𪨰𪨶𪩘𪾢𫄧𫄨𫄷𫄸𫇭𫌀𫍣𫍯"
+ "𫍲𫍽𫐄𫐐𫐓𫑡𫓧𫓯𫓶𫓹𫔍𫔎𫔶𫖮𫖯𫖳𫗧𫗴𫘜𫘝𫘦𫘧𫘨𫘪𫘬𫚕𫚖𫚭𫛭𫞩𫟅𫟦𫟹𫟼𫠆𫠊𫠜𫢸𫫇𫭟"
+ "𫭢𫭼𫮃𫰛𫵷𫶇𫷷𫸩𬀩𬀪𬂩𬃊𬇕𬇙𬇹𬉼𬊈𬊤𬌗𬍛𬍡𬍤𬒈𬒔𬒗𬕂𬘓𬘘𬘡𬘩𬘫𬘬𬘭𬘯𬙂𬙊𬙋𬜬𬜯𬞟"
+ "𬟁𬟽𬣙𬣞𬣡𬣳𬤇𬤊𬤝𬨂𬨎𬩽𬪩𬬩𬬭𬬮𬬱𬬸𬬹𬬻𬬿𬭁𬭊𬭎𬭚𬭛𬭤𬭩𬭬𬭯𬭳𬭶𬭸𬭼𬮱𬮿𬯀𬯎𬱖𬱟"
+ "𬳵𬳶𬳽𬳿𬴂𬴃𬴊𬶋𬶍𬶏𬶐𬶟𬶠𬶨𬶭𬶮𬷕𬸘𬸚𬸣𬸦𬸪𬹼𬺈𬺓"
+)
+CN_CHARS_EXT = "吶诶屌囧飚屄"
+
+CN_CHARS = CN_CHARS_COMMON + CN_CHARS_EXT
+IN_CH_CHARS = {c: True for c in CN_CHARS}
+
+EN_CHARS = string.ascii_letters + string.digits
+IN_EN_CHARS = {c: True for c in EN_CHARS}
+
+VALID_CHARS = CN_CHARS + EN_CHARS + " "
+IN_VALID_CHARS = {c: True for c in VALID_CHARS}
+
+
+# ================================================================================ #
+# basic class
+# ================================================================================ #
+class ChineseChar(object):
+ """
+ A Chinese character.
+ Each character has a simplified and a traditional form,
+ e.g. simplified = '负', traditional = '負'.
+ It can be rendered in either form when converting.
+ """
+
+ def __init__(self, simplified, traditional):
+ self.simplified = simplified
+ self.traditional = traditional
+ # self.__repr__ = self.__str__
+
+ def __str__(self):
+ return self.simplified or self.traditional or None
+
+ def __repr__(self):
+ return self.__str__()
+
+
+class ChineseNumberUnit(ChineseChar):
+ """
+ A Chinese numeral/positional-unit character.
+ Besides the simplified and traditional forms, each character also has an extra "banker's" (uppercase) form,
+ e.g. '陆' and '陸'.
+ """
+
+ def __init__(self, power, simplified, traditional, big_s, big_t):
+ super(ChineseNumberUnit, self).__init__(simplified, traditional)
+ self.power = power
+ self.big_s = big_s
+ self.big_t = big_t
+
+ def __str__(self):
+ return "10^{}".format(self.power)
+
+ @classmethod
+ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
+ if small_unit:
+ return ChineseNumberUnit(
+ power=index + 1, simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1]
+ )
+ elif numbering_type == NUMBERING_TYPES[0]:
+ return ChineseNumberUnit(
+ power=index + 8, simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]
+ )
+ elif numbering_type == NUMBERING_TYPES[1]:
+ return ChineseNumberUnit(
+ power=(index + 2) * 4, simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]
+ )
+ elif numbering_type == NUMBERING_TYPES[2]:
+ return ChineseNumberUnit(
+ power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]
+ )
+ else:
+ raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type))
+
+
+class ChineseNumberDigit(ChineseChar):
+ """
+ A Chinese digit character.
+ """
+
+ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
+ super(ChineseNumberDigit, self).__init__(simplified, traditional)
+ self.value = value
+ self.big_s = big_s
+ self.big_t = big_t
+ self.alt_s = alt_s
+ self.alt_t = alt_t
+
+ def __str__(self):
+ return str(self.value)
+
+ @classmethod
+ def create(cls, i, v):
+ return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
+
+
+class ChineseMath(ChineseChar):
+ """
+ A Chinese math-symbol character (sign or decimal point).
+ """
+
+ def __init__(self, simplified, traditional, symbol, expression=None):
+ super(ChineseMath, self).__init__(simplified, traditional)
+ self.symbol = symbol
+ self.expression = expression
+ self.big_s = simplified
+ self.big_t = traditional
+
+
+CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
+
+
+class NumberSystem(object):
+ """
+ The Chinese number system.
+ """
+
+ pass
+
+
+class MathSymbol(object):
+ """
+ Math symbols used by the Chinese number system (traditional/simplified forms), e.g.
+ positive = ['正', '正']
+ negative = ['负', '負']
+ point = ['点', '點']
+ """
+
+ def __init__(self, positive, negative, point):
+ self.positive = positive
+ self.negative = negative
+ self.point = point
+
+ def __iter__(self):
+ for v in self.__dict__.values():
+ yield v
+
+
+# class OtherSymbol(object):
+# """
+# Other symbols
+# """
+#
+# def __init__(self, sil):
+# self.sil = sil
+#
+# def __iter__(self):
+# for v in self.__dict__.values():
+# yield v
+
+
+# ================================================================================ #
+# basic utils
+# ================================================================================ #
+def create_system(numbering_type=NUMBERING_TYPES[1]):
+ """
+ Create the number system for the given numbering type (default: mid).
+ NUMBERING_TYPES = ['low', 'mid', 'high'] are the Chinese numbering-system variants:
+ low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc.
+ mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
+ high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
+ Returns the corresponding NumberSystem instance.
+ """
+
+ # chinese number units of '亿' and larger
+ all_larger_units = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
+ larger_units = [CNU.create(i, v, numbering_type, False) for i, v in enumerate(all_larger_units)]
+ # chinese number units of '十, 百, 千, 万'
+ all_smaller_units = zip(SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
+ smaller_units = [CNU.create(i, v, small_unit=True) for i, v in enumerate(all_smaller_units)]
+ # digis
+ chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
+ digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
+ digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
+ digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
+ digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
+
+ # symbols
+ positive_cn = CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x)
+ negative_cn = CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x)
+ point_cn = CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." + str(y)))
+ # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
+ system = NumberSystem()
+ system.units = smaller_units + larger_units
+ system.digits = digits
+ system.math = MathSymbol(positive_cn, negative_cn, point_cn)
+ # system.symbols = OtherSymbol(sil_cn)
+ return system
+
+
+def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
+ def get_symbol(char, system):
+ for u in system.units:
+ if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
+ return u
+ for d in system.digits:
+ if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
+ return d
+ for m in system.math:
+ if char in [m.traditional, m.simplified]:
+ return m
+
+ def string2symbols(chinese_string, system):
+ int_string, dec_string = chinese_string, ""
+ for p in [system.math.point.simplified, system.math.point.traditional]:
+ if p in chinese_string:
+ int_string, dec_string = chinese_string.split(p)
+ break
+ return [get_symbol(c, system) for c in int_string], [get_symbol(c, system) for c in dec_string]
+
+ def correct_symbols(integer_symbols, system):
+ """
+ Expand shorthand unit forms, e.g.:
+ 一百八 to 一百八十
+ 一亿一千三百万 to 一亿 一千万 三百万
+ """
+
+ if integer_symbols and isinstance(integer_symbols[0], CNU):
+ if integer_symbols[0].power == 1:
+ integer_symbols = [system.digits[1]] + integer_symbols
+
+ if len(integer_symbols) > 1:
+ if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
+ integer_symbols.append(CNU(integer_symbols[-2].power - 1, None, None, None, None))
+
+ result = []
+ unit_count = 0
+ for s in integer_symbols:
+ if isinstance(s, CND):
+ result.append(s)
+ unit_count = 0
+ elif isinstance(s, CNU):
+ current_unit = CNU(s.power, None, None, None, None)
+ unit_count += 1
+
+ if unit_count == 1:
+ result.append(current_unit)
+ elif unit_count > 1:
+ for i in range(len(result)):
+ if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
+ result[-i - 1] = CNU(result[-i - 1].power + current_unit.power, None, None, None, None)
+ return result
+
+ def compute_value(integer_symbols):
+ """
+ Compute the numeric value.
+ When the current unit is larger than the previous one, it multiplies everything accumulated so far
+ rather than being added to it, e.g. '两千万' = 2000 * 10000, not 2000 + 10000.
+ """
+ value = [0]
+ last_power = 0
+ for s in integer_symbols:
+ if isinstance(s, CND):
+ value[-1] = s.value
+ elif isinstance(s, CNU):
+ value[-1] *= pow(10, s.power)
+ if s.power > last_power:
+ value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1]))
+ last_power = s.power
+ value.append(0)
+ return sum(value)
+
+ system = create_system(numbering_type)
+ int_part, dec_part = string2symbols(chinese_string, system)
+ int_part = correct_symbols(int_part, system)
+ int_str = str(compute_value(int_part))
+ dec_str = "".join([str(d.value) for d in dec_part])
+ if dec_part:
+ return "{0}.{1}".format(int_str, dec_str)
+ else:
+ return int_str
+
+
+def num2chn(
+ number_string,
+ numbering_type=NUMBERING_TYPES[1],
+ big=False,
+ traditional=False,
+ alt_zero=False,
+ alt_one=False,
+ alt_two=True,
+ use_zeros=True,
+ use_units=True,
+):
+ def get_value(value_string, use_zeros=True):
+ striped_string = value_string.lstrip("0")
+
+ # record nothing if all zeros
+ if not striped_string:
+ return []
+
+ # record a single digit
+ elif len(striped_string) == 1:
+ if use_zeros and len(value_string) != len(striped_string):
+ return [system.digits[0], system.digits[int(striped_string)]]
+ else:
+ return [system.digits[int(striped_string)]]
+
+ # recursively record multiple digits
+ else:
+ result_unit = next(u for u in reversed(system.units) if u.power < len(striped_string))
+ result_string = value_string[: -result_unit.power]
+ return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power :])
+
+ system = create_system(numbering_type)
+
+ int_dec = number_string.split(".")
+ if len(int_dec) == 1:
+ int_string = int_dec[0]
+ dec_string = ""
+ elif len(int_dec) == 2:
+ int_string = int_dec[0]
+ dec_string = int_dec[1]
+ else:
+ raise ValueError("invalid input num string with more than one dot: {}".format(number_string))
+
+ if use_units and len(int_string) > 1:
+ result_symbols = get_value(int_string)
+ else:
+ result_symbols = [system.digits[int(c)] for c in int_string]
+ dec_symbols = [system.digits[int(c)] for c in dec_string]
+ if dec_string:
+ result_symbols += [system.math.point] + dec_symbols
+
+ if alt_two:
+ liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, system.digits[2].big_s, system.digits[2].big_t)
+ for i, v in enumerate(result_symbols):
+ if isinstance(v, CND) and v.value == 2:
+ next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None
+ previous_symbol = result_symbols[i - 1] if i > 0 else None
+ if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
+ if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
+ result_symbols[i] = liang
+
+ # if big is True, '两' will not be used and `alt_two` has no impact on output
+ if big:
+ attr_name = "big_"
+ if traditional:
+ attr_name += "t"
+ else:
+ attr_name += "s"
+ else:
+ if traditional:
+ attr_name = "traditional"
+ else:
+ attr_name = "simplified"
+
+ result = "".join([getattr(s, attr_name) for s in result_symbols])
+
+ # if not use_zeros:
+ # result = result.strip(getattr(system.digits[0], attr_name))
+
+ if alt_zero:
+ result = result.replace(getattr(system.digits[0], attr_name), system.digits[0].alt_s)
+
+ if alt_one:
+ result = result.replace(getattr(system.digits[1], attr_name), system.digits[1].alt_s)
+
+ for i, p in enumerate(POINT):
+ if result.startswith(p):
+ return CHINESE_DIGIS[0] + result
+
+ # ^10, 11, .., 19
+ if (
+ len(result) >= 2
+ and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]]
+ and result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]
+ ):
+ result = result[1:]
+
+ return result
+
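+# Illustrative round trip between Arabic and Chinese numerals (mid numbering system):
+#   num2chn("1998")           -> '一千九百九十八'
+#   chn2num('一千九百九十八')  -> '1998'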
+
+# ================================================================================ #
+# different types of rewriters
+# ================================================================================ #
+class Cardinal:
+ """
+ CARDINAL class
+ """
+
+ def __init__(self, cardinal=None, chntext=None):
+ self.cardinal = cardinal
+ self.chntext = chntext
+
+ def chntext2cardinal(self):
+ return chn2num(self.chntext)
+
+ def cardinal2chntext(self):
+ return num2chn(self.cardinal)
+
+
+class Digit:
+ """
+ DIGIT class
+ """
+
+ def __init__(self, digit=None, chntext=None):
+ self.digit = digit
+ self.chntext = chntext
+
+ # def chntext2digit(self):
+ # return chn2num(self.chntext)
+
+ def digit2chntext(self):
+ return num2chn(self.digit, alt_two=False, use_units=False)
+
+
+class TelePhone:
+ """
+ TELEPHONE class
+ """
+
+ def __init__(self, telephone=None, raw_chntext=None, chntext=None):
+ self.telephone = telephone
+ self.raw_chntext = raw_chntext
+ self.chntext = chntext
+
+ # def chntext2telephone(self):
+ # sil_parts = self.raw_chntext.split('')
+ # self.telephone = '-'.join([
+ # str(chn2num(p)) for p in sil_parts
+ # ])
+ # return self.telephone
+
+ def telephone2chntext(self, fixed=False):
+ if fixed:
+ sil_parts = self.telephone.split("-")
+ self.raw_chntext = "".join([num2chn(part, alt_two=False, use_units=False) for part in sil_parts])
+ self.chntext = self.raw_chntext.replace("", "")
+ else:
+ sp_parts = self.telephone.strip("+").split()
+ self.raw_chntext = "".join([num2chn(part, alt_two=False, use_units=False) for part in sp_parts])
+ self.chntext = self.raw_chntext.replace("", "")
+ return self.chntext
+
+
+class Fraction:
+ """
+ FRACTION class
+ """
+
+ def __init__(self, fraction=None, chntext=None):
+ self.fraction = fraction
+ self.chntext = chntext
+
+ def chntext2fraction(self):
+ denominator, numerator = self.chntext.split("分之")
+ return chn2num(numerator) + "/" + chn2num(denominator)
+
+ def fraction2chntext(self):
+ numerator, denominator = self.fraction.split("/")
+ return num2chn(denominator) + "分之" + num2chn(numerator)
+
+
+class Date:
+ """
+ DATE class
+ """
+
+ def __init__(self, date=None, chntext=None):
+ self.date = date
+ self.chntext = chntext
+
+ # def chntext2date(self):
+ # chntext = self.chntext
+ # try:
+ # year, other = chntext.strip().split('年', maxsplit=1)
+ # year = Digit(chntext=year).digit2chntext() + '年'
+ # except ValueError:
+ # other = chntext
+ # year = ''
+ # if other:
+ # try:
+ # month, day = other.strip().split('月', maxsplit=1)
+ # month = Cardinal(chntext=month).chntext2cardinal() + '月'
+ # except ValueError:
+ # day = chntext
+ # month = ''
+ # if day:
+ # day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
+ # else:
+ # month = ''
+ # day = ''
+ # date = year + month + day
+ # self.date = date
+ # return self.date
+
+ def date2chntext(self):
+ date = self.date
+ try:
+ year, other = date.strip().split("年", 1)
+ year = Digit(digit=year).digit2chntext() + "年"
+ except ValueError:
+ other = date
+ year = ""
+ if other:
+ try:
+ month, day = other.strip().split("月", 1)
+ month = Cardinal(cardinal=month).cardinal2chntext() + "月"
+ except ValueError:
+ day = date
+ month = ""
+ if day:
+ day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
+ else:
+ month = ""
+ day = ""
+ chntext = year + month + day
+ self.chntext = chntext
+ return self.chntext
+
+
+class Money:
+ """
+ MONEY class
+ """
+
+ def __init__(self, money=None, chntext=None):
+ self.money = money
+ self.chntext = chntext
+
+ # def chntext2money(self):
+ # return self.money
+
+ def money2chntext(self):
+ money = self.money
+ pattern = re.compile(r"(\d+(\.\d+)?)")
+ matchers = pattern.findall(money)
+ if matchers:
+ for matcher in matchers:
+ money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
+ self.chntext = money
+ return self.chntext
+
+
+class Percentage:
+ """
+ PERCENTAGE class
+ """
+
+ def __init__(self, percentage=None, chntext=None):
+ self.percentage = percentage
+ self.chntext = chntext
+
+ def chntext2percentage(self):
+ return chn2num(self.chntext.strip().strip("百分之")) + "%"
+
+ def percentage2chntext(self):
+ return "百分之" + num2chn(self.percentage.strip().strip("%"))
+
+
+def normalize_nsw(raw_text):
+ text = "^" + raw_text + "$"
+
+ # normalize dates
+ pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('date')
+ for matcher in matchers:
+ text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
+
+ # normalize money amounts
+ pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('money')
+ for matcher in matchers:
+ text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
+
+ # normalize landline / mobile phone numbers
+ # mobile numbers
+ # http://www.jihaoba.com/news/show/13680
+ # China Mobile: 139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+ # China Unicom: 130、131、132、156、155、186、185、176
+ # China Telecom: 133、153、189、180、181、177
+ pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('telephone')
+ for matcher in matchers:
+ text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
+ # landline numbers
+ pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('fixed telephone')
+ for matcher in matchers:
+ text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
+
+ # normalize fractions
+ pattern = re.compile(r"(\d+/\d+)")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('fraction')
+ for matcher in matchers:
+ text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
+
+ # normalize percentages
+ text = text.replace("％", "%")
+ pattern = re.compile(r"(\d+(\.\d+)?%)")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('percentage')
+ for matcher in matchers:
+ text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
+
+ # normalize plain numbers followed by quantifiers
+ pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('cardinal+quantifier')
+ for matcher in matchers:
+ text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
+
+ # normalize digit strings (serial numbers / IDs)
+ pattern = re.compile(r"(\d{4,32})")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('digit')
+ for matcher in matchers:
+ text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
+
+ # normalize plain numbers
+ pattern = re.compile(r"(\d+(\.\d+)?)")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('cardinal')
+ for matcher in matchers:
+ text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
+
+ # restore P2P, O2O, B2C, B2B etc
+ pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
+ matchers = pattern.findall(text)
+ if matchers:
+ # print('particular')
+ for matcher in matchers:
+ text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1)
+
+ return text.lstrip("^").rstrip("$")
+
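+# For example, the percentage rule above expands
+#   normalize_nsw("涨幅达到6.3%")  ->  "涨幅达到百分之六点三"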
+
+def remove_erhua(text):
+ """
+ Remove the erhua 儿 from r-suffixed words (unless whitelisted):
+ 他女儿在那边儿 -> 他女儿在那边
+ """
+
+ new_str = ""
+ while re.search("儿", text):
+ a = re.search("儿", text).span()
+ remove_er_flag = 0
+
+ if ER_WHITELIST_PATTERN.search(text):
+ b = ER_WHITELIST_PATTERN.search(text).span()
+ if b[0] <= a[0]:
+ remove_er_flag = 1
+
+ if remove_er_flag == 0:
+ new_str = new_str + text[0 : a[0]]
+ text = text[a[1] :]
+ else:
+ new_str = new_str + text[0 : b[1]]
+ text = text[b[1] :]
+
+ text = new_str + text
+ return text
+
+
+def remove_space(text):
+ tokens = text.split()
+ new = []
+ for k, t in enumerate(tokens):
+ if k != 0:
+ if IN_EN_CHARS.get(tokens[k - 1][-1]) and IN_EN_CHARS.get(t[0]):
+ new.append(" ")
+ new.append(t)
+ return "".join(new)
+
+
+class TextNorm:
+ def __init__(
+ self,
+ to_banjiao: bool = False,
+ to_upper: bool = False,
+ to_lower: bool = False,
+ remove_fillers: bool = False,
+ remove_erhua: bool = False,
+ check_chars: bool = False,
+ remove_space: bool = False,
+ cc_mode: str = "",
+ ):
+ self.to_banjiao = to_banjiao
+ self.to_upper = to_upper
+ self.to_lower = to_lower
+ self.remove_fillers = remove_fillers
+ self.remove_erhua = remove_erhua
+ self.check_chars = check_chars
+ self.remove_space = remove_space
+
+ self.cc = None
+ if cc_mode:
+ from opencc import OpenCC # Open Chinese Convert: pip install opencc
+
+ self.cc = OpenCC(cc_mode)
+
+ def __call__(self, text):
+ if self.cc:
+ text = self.cc.convert(text)
+
+ if self.to_banjiao:
+ text = text.translate(QJ2BJ_TRANSFORM)
+
+ if self.to_upper:
+ text = text.upper()
+
+ if self.to_lower:
+ text = text.lower()
+
+ if self.remove_fillers:
+ for c in FILLER_CHARS:
+ text = text.replace(c, "")
+
+ if self.remove_erhua:
+ text = remove_erhua(text)
+
+ text = normalize_nsw(text)
+
+ text = text.translate(PUNCS_TRANSFORM)
+
+ if self.check_chars:
+ for c in text:
+ if not IN_VALID_CHARS.get(c):
+ print(f"WARNING: illegal char {c} in: {text}", file=sys.stderr)
+ return ""
+
+ if self.remove_space:
+ text = remove_space(text)
+
+ return text
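+
+ # Illustrative usage sketch (not part of the original file):
+ #   tn = TextNorm(to_lower=True, remove_erhua=True)
+ #   normalized = tn("小明2023年5月涨了20%")  # dates, numbers and percentages are verbalized in Chinese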
+
+
+if __name__ == "__main__":
+ p = argparse.ArgumentParser()
+
+ # normalizer options
+ p.add_argument("--to_banjiao", action="store_true", help="convert quanjiao chars to banjiao")
+ p.add_argument("--to_upper", action="store_true", help="convert to upper case")
+ p.add_argument("--to_lower", action="store_true", help="convert to lower case")
+ p.add_argument("--remove_fillers", action="store_true", help='remove filler chars such as "呃, 啊"')
+ p.add_argument("--remove_erhua", action="store_true", help='remove erhua chars such as "他女儿在那边儿 -> 他女儿在那边"')
+ p.add_argument("--check_chars", action="store_true", help="skip sentences containing illegal chars")
+ p.add_argument("--remove_space", action="store_true", help="remove whitespace")
+ p.add_argument(
+ "--cc_mode", choices=["", "t2s", "s2t"], default="", help="convert between traditional to simplified"
+ )
+
+ # I/O options
+ p.add_argument("--log_interval", type=int, default=10000, help="log interval in number of processed lines")
+ p.add_argument("--has_key", action="store_true", help="will be deprecated, set --format ark instead")
+ p.add_argument("--format", type=str, choices=["txt", "ark", "tsv"], default="txt", help="input format")
+ p.add_argument("ifile", help="input filename, assume utf-8 encoding")
+ p.add_argument("ofile", help="output filename")
+
+ args = p.parse_args()
+
+ if args.has_key:
+ args.format = "ark"
+
+ normalizer = TextNorm(
+ to_banjiao=args.to_banjiao,
+ to_upper=args.to_upper,
+ to_lower=args.to_lower,
+ remove_fillers=args.remove_fillers,
+ remove_erhua=args.remove_erhua,
+ check_chars=args.check_chars,
+ remove_space=args.remove_space,
+ cc_mode=args.cc_mode,
+ )
+
+ ndone = 0
+ with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream:
+ if args.format == "tsv":
+ reader = csv.DictReader(istream, delimiter="\t")
+ assert "TEXT" in reader.fieldnames
+ print("\t".join(reader.fieldnames), file=ostream)
+
+ for item in reader:
+ text = item["TEXT"]
+
+ if text:
+ text = normalizer(text)
+
+ if text:
+ item["TEXT"] = text
+ print("\t".join([item[f] for f in reader.fieldnames]), file=ostream)
+
+ ndone += 1
+ if ndone % args.log_interval == 0:
+ print(f"text norm: {ndone} lines done.", file=sys.stderr, flush=True)
+ else:
+ for l in istream:
+ key, text = "", ""
+ if args.format == "ark": # KALDI archive, line format: "key text"
+ cols = l.strip().split(maxsplit=1)
+ key, text = cols[0], cols[1] if len(cols) == 2 else ""
+ else:
+ text = l.strip()
+
+ if text:
+ text = normalizer(text)
+
+ if text:
+ if args.format == "ark":
+ print(key + "\t" + text, file=ostream)
+ else:
+ print(text, file=ostream)
+
+ ndone += 1
+ if ndone % args.log_interval == 0:
+ print(f"text norm: {ndone} lines done.", file=sys.stderr, flush=True)
+ print(f"text norm: {ndone} lines done in total.", file=sys.stderr, flush=True)
diff --git a/submodules/TTS/TTS/tts/utils/__init__.py b/submodules/TTS/TTS/tts/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/utils/assets/tortoise/tokenizer.json b/submodules/TTS/TTS/tts/utils/assets/tortoise/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..a128f273053e465a15c488e48d8106e0c8b0898e
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/assets/tortoise/tokenizer.json
@@ -0,0 +1 @@
+{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
\ No newline at end of file
diff --git a/submodules/TTS/TTS/tts/utils/data.py b/submodules/TTS/TTS/tts/utils/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..22e46b683adfc7f6c7c8a57fb5b697e422cd915c
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/data.py
@@ -0,0 +1,79 @@
+import bisect
+
+import numpy as np
+import torch
+
+
+def _pad_data(x, length):
+ _pad = 0
+ assert x.ndim == 1
+ return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)
+
+
+def prepare_data(inputs):
+ max_len = max((len(x) for x in inputs))
+ return np.stack([_pad_data(x, max_len) for x in inputs])
+
+
+def _pad_tensor(x, length):
+ _pad = 0.0
+ assert x.ndim == 2
+ x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad)
+ return x
+
+
+def prepare_tensor(inputs, out_steps):
+ max_len = max((x.shape[1] for x in inputs))
+ remainder = max_len % out_steps
+ pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
+ return np.stack([_pad_tensor(x, pad_len) for x in inputs])
+
+
+def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
+ """Pad stop target array.
+
+ Args:
+ x (np.ndarray): Stop target array.
+ length (int): Length after padding.
+ pad_val (int, optional): Padding value. Defaults to 1.
+
+ Returns:
+ np.ndarray: Padded stop target array.
+ """
+ assert x.ndim == 1
+ return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)
+
+
+def prepare_stop_target(inputs, out_steps):
+ """Pad row vectors with 1."""
+ max_len = max((x.shape[0] for x in inputs))
+ remainder = max_len % out_steps
+ pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
+ return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
+
+
+def pad_per_step(inputs, pad_len):
+ return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
+
+
+def get_length_balancer_weights(items: list, num_buckets=10):
+ # get all durations
+ audio_lengths = np.array([item["audio_length"] for item in items])
+ # create $num_buckets bucket classes based on the dataset min and max lengths
+ max_length = int(max(audio_lengths))
+ min_length = int(min(audio_lengths))
+ step = int((max_length - min_length) / num_buckets) + 1
+ buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
+ # add each sample in their respective length bucket
+ buckets_names = np.array(
+ [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
+ )
+ # count and compute the weights_bucket for each sample
+ unique_buckets_names = np.unique(buckets_names).tolist()
+ bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
+ bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
+ weight_bucket = 1.0 / bucket_count
+ dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
+ # normalize
+ dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
+ return torch.from_numpy(dataset_samples_weight).float()
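+
+
+# Illustrative usage sketch (not part of the original file): the returned weights are typically
+# fed to a torch.utils.data.WeightedRandomSampler, e.g.
+#   weights = get_length_balancer_weights(samples, num_buckets=10)
+#   sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=len(weights))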
diff --git a/submodules/TTS/TTS/tts/utils/fairseq.py b/submodules/TTS/TTS/tts/utils/fairseq.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d8eec2b4ee0d7b0c79e368616d4b75fb2e551d4
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/fairseq.py
@@ -0,0 +1,48 @@
+import torch
+
+
+def rehash_fairseq_vits_checkpoint(checkpoint_file):
+ chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
+ new_chk = {}
+ for k, v in chk.items():
+ if "enc_p." in k:
+ new_chk[k.replace("enc_p.", "text_encoder.")] = v
+ elif "dec." in k:
+ new_chk[k.replace("dec.", "waveform_decoder.")] = v
+ elif "enc_q." in k:
+ new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
+ elif "flow.flows.2." in k:
+ new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
+ elif "flow.flows.4." in k:
+ new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
+ elif "flow.flows.6." in k:
+ new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
+ elif "dp.flows.0.m" in k:
+ new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
+ elif "dp.flows.0.logs" in k:
+ new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
+ elif "dp.flows.1" in k:
+ new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
+ elif "dp.flows.3" in k:
+ new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
+ elif "dp.flows.5" in k:
+ new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
+ elif "dp.flows.7" in k:
+ new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
+ elif "dp.post_flows.0.m" in k:
+ new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
+ elif "dp.post_flows.0.logs" in k:
+ new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
+ elif "dp.post_flows.1" in k:
+ new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
+ elif "dp.post_flows.3" in k:
+ new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
+ elif "dp.post_flows.5" in k:
+ new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
+ elif "dp.post_flows.7" in k:
+ new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
+ elif "dp." in k:
+ new_chk[k.replace("dp.", "duration_predictor.")] = v
+ else:
+ new_chk[k] = v
+ return new_chk
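+
+
+# Illustrative usage sketch (not part of the original file); `vits_model` is a placeholder for a
+# Coqui VITS instance:
+#   state_dict = rehash_fairseq_vits_checkpoint("fairseq_vits_checkpoint.pth")
+#   vits_model.load_state_dict(state_dict, strict=False)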
diff --git a/submodules/TTS/TTS/tts/utils/helpers.py b/submodules/TTS/TTS/tts/utils/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b37201f8410eb34300d8bb2b1a595d5c5cfc42f
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/helpers.py
@@ -0,0 +1,258 @@
+import numpy as np
+import torch
+from scipy.stats import betabinom
+from torch.nn import functional as F
+
+try:
+ from TTS.tts.utils.monotonic_align.core import maximum_path_c
+
+ CYTHON = True
+except ModuleNotFoundError:
+ CYTHON = False
+
+
+class StandardScaler:
+ """StandardScaler for mean-scale normalization with the given mean and scale values."""
+
+ def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
+ self.mean_ = mean
+ self.scale_ = scale
+
+ def set_stats(self, mean, scale):
+ self.mean_ = mean
+ self.scale_ = scale
+
+ def reset_stats(self):
+ delattr(self, "mean_")
+ delattr(self, "scale_")
+
+ def transform(self, X):
+ X = np.asarray(X)
+ X -= self.mean_
+ X /= self.scale_
+ return X
+
+ def inverse_transform(self, X):
+ X = np.asarray(X)
+ X *= self.scale_
+ X += self.mean_
+ return X
+
+
+# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
+def sequence_mask(sequence_length, max_len=None):
+ """Create a sequence mask for filtering padding in a sequence tensor.
+
+ Args:
+ sequence_length (torch.tensor): Sequence lengths.
+ max_len (int, Optional): Maximum sequence length. Defaults to None.
+
+ Shapes:
+ - mask: :math:`[B, T_max]`
+ """
+ if max_len is None:
+ max_len = sequence_length.max()
+ seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
+ # B x T_max
+ return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
+
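+# Illustrative example (not from the original source): for sequence_length [2, 4] and max_len 4,
+# sequence_mask returns
+#   [[True, True, False, False],
+#    [True, True, True,  True ]]
+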
+
+def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_short=False):
+ """Segment each sample in a batch based on the provided segment indices
+
+ Args:
+ x (torch.tensor): Input tensor.
+ segment_indices (torch.tensor): Segment indices.
+ segment_size (int): Expected output segment size.
+ pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size.
+ """
+ # pad the input tensor if it is shorter than the segment size
+ if pad_short and x.shape[-1] < segment_size:
+ x = torch.nn.functional.pad(x, (0, segment_size - x.size(2)))
+
+ segments = torch.zeros_like(x[:, :, :segment_size])
+
+ for i in range(x.size(0)):
+ index_start = segment_indices[i]
+ index_end = index_start + segment_size
+ x_i = x[i]
+ if pad_short and index_end >= x.size(2):
+ # pad the sample if it is shorter than the segment size
+ x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2)))
+ segments[i] = x_i[:, index_start:index_end]
+ return segments
+
+
+def rand_segments(
+ x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False
+):
+ """Create random segments based on the input lengths.
+
+ Args:
+ x (torch.tensor): Input tensor.
+ x_lengths (torch.tensor): Input lengths.
+ segment_size (int): Expected output segment size.
+ let_short_samples (bool): Allow shorter samples than the segment size.
+ pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size.
+
+ Shapes:
+ - x: :math:`[B, C, T]`
+ - x_lengths: :math:`[B]`
+ """
+ _x_lengths = x_lengths.clone() if x_lengths is not None else None
+ B, _, T = x.size()
+ if pad_short:
+ if T < segment_size:
+ x = torch.nn.functional.pad(x, (0, segment_size - T))
+ T = segment_size
+ if _x_lengths is None:
+ _x_lengths = torch.full((B,), T, dtype=torch.long, device=x.device)
+ len_diff = _x_lengths - segment_size
+ if let_short_samples:
+ _x_lengths[len_diff < 0] = segment_size
+ len_diff = _x_lengths - segment_size
+ else:
+ assert all(
+ len_diff > 0
+ ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lengths}"
+ segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long()
+ ret = segment(x, segment_indices, segment_size, pad_short=pad_short)
+ return ret, segment_indices
+
+
+def average_over_durations(values, durs):
+ """Average values over durations.
+
+ Shapes:
+ - values: :math:`[B, 1, T_de]`
+ - durs: :math:`[B, T_en]`
+ - avg: :math:`[B, 1, T_en]`
+ """
+ durs_cums_ends = torch.cumsum(durs, dim=1).long()
+ durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
+ values_nonzero_cums = torch.nn.functional.pad(torch.cumsum(values != 0.0, dim=2), (1, 0))
+ values_cums = torch.nn.functional.pad(torch.cumsum(values, dim=2), (1, 0))
+
+ bs, l = durs_cums_ends.size()
+ n_formants = values.size(1)
+ dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
+ dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)
+
+ values_sums = (torch.gather(values_cums, 2, dce) - torch.gather(values_cums, 2, dcs)).float()
+ values_nelems = (torch.gather(values_nonzero_cums, 2, dce) - torch.gather(values_nonzero_cums, 2, dcs)).float()
+
+ avg = torch.where(values_nelems == 0.0, values_nelems, values_sums / values_nelems)
+ return avg
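+
+ # Illustrative example (not from the original source): with values [[[1., 2., 3., 4., 5.]]] and
+ # durs [[2, 3]], the result is [[[1.5, 4.0]]] (the mean of the first 2 and the last 3 frames).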
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def generate_path(duration, mask):
+ """
+ Shapes:
+ - duration: :math:`[B, T_en]`
+ - mask: :math:`[B, T_en, T_de]`
+ - path: :math:`[B, T_en, T_de]`
+ """
+ b, t_x, t_y = mask.shape
+ cum_duration = torch.cumsum(duration, 1)
+
+ cum_duration_flat = cum_duration.view(b * t_x)
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+ path = path.view(b, t_x, t_y)
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+ path = path * mask
+ return path
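+
+ # Illustrative example (not from the original source): for duration [[2, 3]] and an all-ones mask
+ # of shape [1, 2, 5], the generated path is
+ #   [[[1, 1, 0, 0, 0],
+ #     [0, 0, 1, 1, 1]]]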
+
+
+def maximum_path(value, mask):
+ if CYTHON:
+ return maximum_path_cython(value, mask)
+ return maximum_path_numpy(value, mask)
+
+
+def maximum_path_cython(value, mask):
+ """Cython optimised version.
+ Shapes:
+ - value: :math:`[B, T_en, T_de]`
+ - mask: :math:`[B, T_en, T_de]`
+ """
+ value = value * mask
+ device = value.device
+ dtype = value.dtype
+ value = value.data.cpu().numpy().astype(np.float32)
+ path = np.zeros_like(value).astype(np.int32)
+ mask = mask.data.cpu().numpy()
+
+ t_x_max = mask.sum(1)[:, 0].astype(np.int32)
+ t_y_max = mask.sum(2)[:, 0].astype(np.int32)
+ maximum_path_c(path, value, t_x_max, t_y_max)
+ return torch.from_numpy(path).to(device=device, dtype=dtype)
+
+
+def maximum_path_numpy(value, mask, max_neg_val=None):
+ """
+ Monotonic alignment search algorithm
+ Numpy-friendly version. It's about 4 times faster than the torch version.
+ value: [b, t_x, t_y]
+ mask: [b, t_x, t_y]
+ """
+ if max_neg_val is None:
+ max_neg_val = -np.inf # Patch for Sphinx complaint
+ value = value * mask
+
+ device = value.device
+ dtype = value.dtype
+ value = value.cpu().detach().numpy()
+ mask = mask.cpu().detach().numpy().astype(bool)
+
+ b, t_x, t_y = value.shape
+ direction = np.zeros(value.shape, dtype=np.int64)
+ v = np.zeros((b, t_x), dtype=np.float32)
+ x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
+ for j in range(t_y):
+ v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
+ v1 = v
+ max_mask = v1 >= v0
+ v_max = np.where(max_mask, v1, v0)
+ direction[:, :, j] = max_mask
+
+ index_mask = x_range <= j
+ v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
+ direction = np.where(mask, direction, 1)
+
+ path = np.zeros(value.shape, dtype=np.float32)
+ index = mask[:, :, 0].sum(1).astype(np.int64) - 1
+ index_range = np.arange(b)
+ for j in reversed(range(t_y)):
+ path[index_range, index, j] = 1
+ index = index + direction[index_range, index, j] - 1
+ path = path * mask.astype(np.float32)
+ path = torch.from_numpy(path).to(device=device, dtype=dtype)
+ return path
+
+
+def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
+ P, M = phoneme_count, mel_count
+ x = np.arange(0, P)
+ mel_text_probs = []
+ for i in range(1, M + 1):
+ a, b = scaling_factor * i, scaling_factor * (M + 1 - i)
+ rv = betabinom(P, a, b)
+ mel_i_prob = rv.pmf(x)
+ mel_text_probs.append(mel_i_prob)
+ return np.array(mel_text_probs)
+
+
+def compute_attn_prior(x_len, y_len, scaling_factor=1.0):
+ """Compute attention priors for the alignment network."""
+ attn_prior = beta_binomial_prior_distribution(
+ x_len,
+ y_len,
+ scaling_factor,
+ )
+ return attn_prior # [y_len, x_len]
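+
+
+# Illustrative sketch (not part of the original file):
+#   prior = compute_attn_prior(x_len=10, y_len=50)  # np.ndarray of shape (50, 10), i.e. [y_len, x_len]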
diff --git a/submodules/TTS/TTS/tts/utils/languages.py b/submodules/TTS/TTS/tts/utils/languages.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1836b32ce2010ad55a0253849f2e59c61dad82
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/languages.py
@@ -0,0 +1,125 @@
+import os
+from typing import Any, Dict, List
+
+import fsspec
+import numpy as np
+import torch
+from coqpit import Coqpit
+
+from TTS.config import check_config_and_model_args
+from TTS.tts.utils.managers import BaseIDManager
+
+
+class LanguageManager(BaseIDManager):
+ """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
+ in a way that can be queried by language.
+
+ Args:
+ language_ids_file_path (str, optional): Path to the metafile that maps language names to ids used by
+ TTS models. Defaults to "".
+ config (Coqpit, optional): Coqpit config that contains the language information in the datasets field.
+ Defaults to None.
+
+ Examples:
+ >>> manager = LanguageManager(language_ids_file_path=language_ids_file_path)
+ >>> language_id_mapper = manager.language_ids
+ """
+
+ def __init__(
+ self,
+ language_ids_file_path: str = "",
+ config: Coqpit = None,
+ ):
+ super().__init__(id_file_path=language_ids_file_path)
+
+ if config:
+ self.set_language_ids_from_config(config)
+
+ @property
+ def num_languages(self) -> int:
+ return len(list(self.name_to_id.keys()))
+
+ @property
+ def language_names(self) -> List:
+ return list(self.name_to_id.keys())
+
+ @staticmethod
+ def parse_language_ids_from_config(c: Coqpit) -> Dict:
+ """Set language id from config.
+
+ Args:
+ c (Coqpit): Config
+
+ Returns:
+ Dict: Language ID mapping.
+ """
+ languages = set({})
+ for dataset in c.datasets:
+ if "language" in dataset:
+ languages.add(dataset["language"])
+ else:
+ raise ValueError(f"Dataset {dataset['name']} has no language specified.")
+ return {name: i for i, name in enumerate(sorted(list(languages)))}
+
+ def set_language_ids_from_config(self, c: Coqpit) -> None:
+ """Set language IDs from config samples.
+
+ Args:
+ c (Coqpit): Config.
+ """
+ self.name_to_id = self.parse_language_ids_from_config(c)
+
+ @staticmethod
+ def parse_ids_from_data(items: List, parse_key: str) -> Any:
+ raise NotImplementedError
+
+ def set_ids_from_data(self, items: List, parse_key: str) -> Any:
+ raise NotImplementedError
+
+ def save_ids_to_file(self, file_path: str) -> None:
+ """Save language IDs to a json file.
+
+ Args:
+ file_path (str): Path to the output file.
+ """
+ self._save_json(file_path, self.name_to_id)
+
+ @staticmethod
+ def init_from_config(config: Coqpit) -> "LanguageManager":
+ """Initialize the language manager from a Coqpit config.
+
+ Args:
+ config (Coqpit): Coqpit config.
+ """
+ language_manager = None
+ if check_config_and_model_args(config, "use_language_embedding", True):
+ if config.get("language_ids_file", None):
+ language_manager = LanguageManager(language_ids_file_path=config.language_ids_file)
+ language_manager = LanguageManager(config=config)
+ return language_manager
+
+
+def _set_file_path(path):
+ """Find the language_ids.json under the given path or the above it.
+ Intended to band aid the different paths returned in restored and continued training."""
+ path_restore = os.path.join(os.path.dirname(path), "language_ids.json")
+ path_continue = os.path.join(path, "language_ids.json")
+ fs = fsspec.get_mapper(path).fs
+ if fs.exists(path_restore):
+ return path_restore
+ if fs.exists(path_continue):
+ return path_continue
+ return None
+
+
+def get_language_balancer_weights(items: list):
+ language_names = np.array([item["language"] for item in items])
+ unique_language_names = np.unique(language_names).tolist()
+ language_ids = [unique_language_names.index(l) for l in language_names]
+ language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names])
+ weight_language = 1.0 / language_count
+ # get weight for each sample
+ dataset_samples_weight = np.array([weight_language[l] for l in language_ids])
+ # normalize
+ dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
+ return torch.from_numpy(dataset_samples_weight).float()
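+
+
+# Illustrative usage sketch (not part of the original file): analogous to the length balancer in
+# data.py, these weights can drive a torch.utils.data.WeightedRandomSampler so that samples from
+# under-represented languages are drawn more often:
+#   sampler = torch.utils.data.WeightedRandomSampler(get_language_balancer_weights(samples), len(samples))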
diff --git a/submodules/TTS/TTS/tts/utils/managers.py b/submodules/TTS/TTS/tts/utils/managers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f94c5332df1e2774955eb263c3b688c5ad6e827
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/managers.py
@@ -0,0 +1,383 @@
+import json
+import random
+from typing import Any, Dict, List, Tuple, Union
+
+import fsspec
+import numpy as np
+import torch
+
+from TTS.config import load_config
+from TTS.encoder.utils.generic_utils import setup_encoder_model
+from TTS.utils.audio import AudioProcessor
+
+
+def load_file(path: str):
+ if path.endswith(".json"):
+ with fsspec.open(path, "r") as f:
+ return json.load(f)
+ elif path.endswith(".pth"):
+ with fsspec.open(path, "rb") as f:
+ return torch.load(f, map_location="cpu")
+ else:
+ raise ValueError("Unsupported file type")
+
+
+def save_file(obj: Any, path: str):
+ if path.endswith(".json"):
+ with fsspec.open(path, "w") as f:
+ json.dump(obj, f, indent=4)
+ elif path.endswith(".pth"):
+ with fsspec.open(path, "wb") as f:
+ torch.save(obj, f)
+ else:
+ raise ValueError("Unsupported file type")
+
+
+class BaseIDManager:
+ """Base `ID` Manager class. Every new `ID` manager must inherit this.
+ It defines common `ID` manager specific functions.
+ """
+
+ def __init__(self, id_file_path: str = ""):
+ self.name_to_id = {}
+
+ if id_file_path:
+ self.load_ids_from_file(id_file_path)
+
+ @staticmethod
+ def _load_json(json_file_path: str) -> Dict:
+ with fsspec.open(json_file_path, "r") as f:
+ return json.load(f)
+
+ @staticmethod
+ def _save_json(json_file_path: str, data: dict) -> None:
+ with fsspec.open(json_file_path, "w") as f:
+ json.dump(data, f, indent=4)
+
+ def set_ids_from_data(self, items: List, parse_key: str) -> None:
+ """Set IDs from data samples.
+
+ Args:
+ items (List): Data samples returned by `load_tts_samples()`.
+ parse_key (str): The key used to parse the data.
+ """
+ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
+
+ def load_ids_from_file(self, file_path: str) -> None:
+ """Set IDs from a file.
+
+ Args:
+ file_path (str): Path to the file.
+ """
+ self.name_to_id = load_file(file_path)
+
+ def save_ids_to_file(self, file_path: str) -> None:
+ """Save IDs to a json file.
+
+ Args:
+ file_path (str): Path to the output file.
+ """
+ save_file(self.name_to_id, file_path)
+
+ def get_random_id(self) -> Any:
+ """Get a random embedding.
+
+ Args:
+
+ Returns:
+ np.ndarray: embedding.
+ """
+ if self.name_to_id:
+ return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
+
+ return None
+
+ @staticmethod
+ def parse_ids_from_data(items: List, parse_key: str) -> Dict:
+ """Parse IDs from data samples returned by `load_tts_samples()`.
+
+ Args:
+ items (list): Data samples returned by `load_tts_samples()`.
+ parse_key (str): The key used to parse the data.
+ Returns:
+ Dict: name-to-ID mapping.
+ """
+ classes = sorted({item[parse_key] for item in items})
+ ids = {name: i for i, name in enumerate(classes)}
+ return ids
+
+
+class EmbeddingManager(BaseIDManager):
+ """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
+ It defines common `Embedding` manager specific functions.
+
+ It expects embeddings files in the following format:
+
+ ::
+
+ {
+ 'audio_file_key':{
+ 'name': 'category_name',
+ 'embedding': []
+ },
+ ...
+ }
+
+ `audio_file_key` is a unique key to the audio file in the dataset. It can be the path to the file or any other unique key.
+ `embedding` is the embedding vector of the audio file.
+ `name` can be the name of the speaker of the audio file.
+ """
+
+ def __init__(
+ self,
+ embedding_file_path: Union[str, List[str]] = "",
+ id_file_path: str = "",
+ encoder_model_path: str = "",
+ encoder_config_path: str = "",
+ use_cuda: bool = False,
+ ):
+ super().__init__(id_file_path=id_file_path)
+
+ self.embeddings = {}
+ self.embeddings_by_names = {}
+ self.clip_ids = []
+ self.encoder = None
+ self.encoder_ap = None
+ self.use_cuda = use_cuda
+
+ if embedding_file_path:
+ if isinstance(embedding_file_path, list):
+ self.load_embeddings_from_list_of_files(embedding_file_path)
+ else:
+ self.load_embeddings_from_file(embedding_file_path)
+
+ if encoder_model_path and encoder_config_path:
+ self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
+
+ @property
+ def num_embeddings(self):
+ """Get number of embeddings."""
+ return len(self.embeddings)
+
+ @property
+ def num_names(self):
+ """Get number of embeddings."""
+ return len(self.embeddings_by_names)
+
+ @property
+ def embedding_dim(self):
+ """Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
+ if self.embeddings:
+ return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
+ return 0
+
+ @property
+ def embedding_names(self):
+ """Get embedding names."""
+ return list(self.embeddings_by_names.keys())
+
+ def save_embeddings_to_file(self, file_path: str) -> None:
+ """Save embeddings to a json file.
+
+ Args:
+ file_path (str): Path to the output file.
+ """
+ save_file(self.embeddings, file_path)
+
+ @staticmethod
+ def read_embeddings_from_file(file_path: str):
+ """Load embeddings from a json file.
+
+ Args:
+ file_path (str): Path to the file.
+ """
+ embeddings = load_file(file_path)
+ speakers = sorted({x["name"] for x in embeddings.values()})
+ name_to_id = {name: i for i, name in enumerate(speakers)}
+ clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
+ # cache embeddings_by_names for fast inference using a bigger speakers.json
+ embeddings_by_names = {}
+ for x in embeddings.values():
+ if x["name"] not in embeddings_by_names.keys():
+ embeddings_by_names[x["name"]] = [x["embedding"]]
+ else:
+ embeddings_by_names[x["name"]].append(x["embedding"])
+ return name_to_id, clip_ids, embeddings, embeddings_by_names
+
+ def load_embeddings_from_file(self, file_path: str) -> None:
+ """Load embeddings from a json file.
+
+ Args:
+ file_path (str): Path to the target json file.
+ """
+ self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
+ file_path
+ )
+
+ def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
+ """Load embeddings from a list of json files and don't allow duplicate keys.
+
+ Args:
+ file_paths (List[str]): List of paths to the target json files.
+ """
+ self.name_to_id = {}
+ self.clip_ids = []
+ self.embeddings_by_names = {}
+ self.embeddings = {}
+ for file_path in file_paths:
+ ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
+ # check colliding keys
+ duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
+ if duplicates:
+ raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
+ # store values
+ self.name_to_id.update(ids)
+ self.clip_ids.extend(clip_ids)
+ self.embeddings_by_names.update(embeddings_by_names)
+ self.embeddings.update(embeddings)
+
+ # reset name_to_id to get the right speaker ids
+ self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
+
+ def get_embedding_by_clip(self, clip_idx: str) -> List:
+ """Get embedding by clip ID.
+
+ Args:
+ clip_idx (str): Target clip ID.
+
+ Returns:
+ List: embedding as a list.
+ """
+ return self.embeddings[clip_idx]["embedding"]
+
+ def get_embeddings_by_name(self, idx: str) -> List[List]:
+ """Get all embeddings of a speaker.
+
+ Args:
+ idx (str): Target name.
+
+ Returns:
+ List[List]: all the embeddings of the given speaker.
+ """
+ return self.embeddings_by_names[idx]
+
+ def get_embeddings_by_names(self) -> Dict:
+ """Get all embeddings by names.
+
+ Returns:
+ Dict: all the embeddings of each speaker.
+ """
+ embeddings_by_names = {}
+ for x in self.embeddings.values():
+ if x["name"] not in embeddings_by_names.keys():
+ embeddings_by_names[x["name"]] = [x["embedding"]]
+ else:
+ embeddings_by_names[x["name"]].append(x["embedding"])
+ return embeddings_by_names
+
+ def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
+ """Get mean embedding of a idx.
+
+ Args:
+ idx (str): Target name.
+ num_samples (int, optional): Number of samples to be averaged. Defaults to None.
+ randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False.
+
+ Returns:
+ np.ndarray: Mean embedding.
+ """
+ embeddings = self.get_embeddings_by_name(idx)
+ if num_samples is None:
+ embeddings = np.stack(embeddings).mean(0)
+ else:
+ assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}"
+ if randomize:
+ embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0)
+ else:
+ embeddings = np.stack(embeddings[:num_samples]).mean(0)
+ return embeddings
+
+ def get_random_embedding(self) -> Any:
+ """Get a random embedding.
+
+ Args:
+
+ Returns:
+ np.ndarray: embedding.
+ """
+ if self.embeddings:
+ return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"]
+
+ return None
+
+ def get_clips(self) -> List:
+ return sorted(self.embeddings.keys())
+
+ def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
+ """Initialize a speaker encoder model.
+
+ Args:
+ model_path (str): Model file path.
+ config_path (str): Model config file path.
+ use_cuda (bool, optional): Use CUDA. Defaults to False.
+ """
+ self.use_cuda = use_cuda
+ self.encoder_config = load_config(config_path)
+ self.encoder = setup_encoder_model(self.encoder_config)
+ self.encoder_criterion = self.encoder.load_checkpoint(
+ self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True
+ )
+ self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
+
+ def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
+ """Compute a embedding from a given audio file.
+
+ Args:
+ wav_file (Union[str, List[str]]): Target file path.
+
+ Returns:
+ list: Computed embedding.
+ """
+
+ def _compute(wav_file: str):
+ waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)
+ if not self.encoder_config.model_params.get("use_torch_spec", False):
+ m_input = self.encoder_ap.melspectrogram(waveform)
+ m_input = torch.from_numpy(m_input)
+ else:
+ m_input = torch.from_numpy(waveform)
+
+ if self.use_cuda:
+ m_input = m_input.cuda()
+ m_input = m_input.unsqueeze(0)
+ embedding = self.encoder.compute_embedding(m_input)
+ return embedding
+
+ if isinstance(wav_file, list):
+ # compute the mean embedding
+ embeddings = None
+ for wf in wav_file:
+ embedding = _compute(wf)
+ if embeddings is None:
+ embeddings = embedding
+ else:
+ embeddings += embedding
+ return (embeddings / len(wav_file))[0].tolist()
+ embedding = _compute(wav_file)
+ return embedding[0].tolist()
+
+ def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
+ """Compute embedding from features.
+
+ Args:
+ feats (Union[torch.Tensor, np.ndarray]): Input features.
+
+ Returns:
+ List: computed embedding.
+ """
+ if isinstance(feats, np.ndarray):
+ feats = torch.from_numpy(feats)
+ if feats.ndim == 2:
+ feats = feats.unsqueeze(0)
+ if self.use_cuda:
+ feats = feats.cuda()
+ return self.encoder.compute_embedding(feats)
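+
+
+# Illustrative usage sketch (not part of the original file); the json path and speaker name are
+# placeholders:
+#   manager = EmbeddingManager(embedding_file_path="speakers.json")
+#   d_vector = manager.get_mean_embedding("speaker_name", num_samples=5, randomize=True)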
diff --git a/submodules/TTS/TTS/tts/utils/measures.py b/submodules/TTS/TTS/tts/utils/measures.py
new file mode 100644
index 0000000000000000000000000000000000000000..90e862e1190bdb8443933580b3ff47321f70cecd
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/measures.py
@@ -0,0 +1,15 @@
+def alignment_diagonal_score(alignments, binary=False):
+ """
+ Compute how diagonal alignment predictions are. It is useful
+ to measure the alignment consistency of a model
+ Args:
+ alignments (torch.Tensor): batch of alignments.
+ binary (bool): if True, ignore scores and consider attention
+ as a binary mask.
+ Shape:
+ - alignments : :math:`[B, T_de, T_en]`
+ """
+ maxs = alignments.max(dim=1)[0]
+ if binary:
+ maxs[maxs > 0] = 1
+ return maxs.mean(dim=1).mean(dim=0).item()
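+
+
+# Illustrative sketch (not part of the original file; requires `import torch` in the caller):
+# a perfectly sharp alignment scores 1.0, e.g.
+#   alignment_diagonal_score(torch.eye(4).unsqueeze(0))  # -> 1.0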
diff --git a/submodules/TTS/TTS/tts/utils/monotonic_align/__init__.py b/submodules/TTS/TTS/tts/utils/monotonic_align/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/submodules/TTS/TTS/tts/utils/monotonic_align/core.c b/submodules/TTS/TTS/tts/utils/monotonic_align/core.c
new file mode 100644
index 0000000000000000000000000000000000000000..44718a3e333e809aefc413726d854a044e0bb3fa
--- /dev/null
+++ b/submodules/TTS/TTS/tts/utils/monotonic_align/core.c
@@ -0,0 +1,23798 @@
+/* Generated by Cython 0.29.37 */
+
+/* BEGIN: Cython Metadata
+{
+ "distutils": {
+ "depends": [],
+ "name": "TTS.tts.utils.monotonic_align.core",
+ "sources": [
+ "TTS/tts/utils/monotonic_align/core.pyx"
+ ]
+ },
+ "module_name": "TTS.tts.utils.monotonic_align.core"
+}
+END: Cython Metadata */
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN
+#endif /* PY_SSIZE_T_CLEAN */
+#include "Python.h"
+#ifndef Py_PYTHON_H
+ #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
+ #error Cython requires Python 2.6+ or Python 3.3+.
+#else
+#define CYTHON_ABI "0_29_37"
+#define CYTHON_HEX_VERSION 0x001D25F0
+#define CYTHON_FUTURE_DIVISION 1
+#include <stddef.h>
+#ifndef offsetof
+ #define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+#if !defined(WIN32) && !defined(MS_WINDOWS)
+ #ifndef __stdcall
+ #define __stdcall
+ #endif
+ #ifndef __cdecl
+ #define __cdecl
+ #endif
+ #ifndef __fastcall
+ #define __fastcall
+ #endif
+#endif
+#ifndef DL_IMPORT
+ #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+ #define DL_EXPORT(t) t
+#endif
+#define __PYX_COMMA ,
+#ifndef HAVE_LONG_LONG
+ #if PY_VERSION_HEX >= 0x02070000
+ #define HAVE_LONG_LONG
+ #endif
+#endif
+#ifndef PY_LONG_LONG
+ #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+ #define Py_HUGE_VAL HUGE_VAL
+#endif
+#ifdef PYPY_VERSION
+ #define CYTHON_COMPILING_IN_PYPY 1
+ #define CYTHON_COMPILING_IN_PYSTON 0
+ #define CYTHON_COMPILING_IN_CPYTHON 0
+ #define CYTHON_COMPILING_IN_NOGIL 0
+ #undef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 0
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #if PY_VERSION_HEX < 0x03050000
+ #undef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 0
+ #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+ #define CYTHON_USE_ASYNC_SLOTS 1
+ #endif
+ #undef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 0
+ #undef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 0
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #undef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 1
+ #undef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 0
+ #undef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 0
+ #undef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 0
+ #undef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL 0
+ #if PY_VERSION_HEX < 0x03090000
+ #undef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+ #elif !defined(CYTHON_PEP489_MULTI_PHASE_INIT)
+ #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+ #endif
+ #undef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1 && PYPY_VERSION_NUM >= 0x07030C00)
+ #undef CYTHON_USE_DICT_VERSIONS
+ #define CYTHON_USE_DICT_VERSIONS 0
+ #undef CYTHON_USE_EXC_INFO_STACK
+ #define CYTHON_USE_EXC_INFO_STACK 0
+ #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+ #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+ #endif
+#elif defined(PYSTON_VERSION)
+ #define CYTHON_COMPILING_IN_PYPY 0
+ #define CYTHON_COMPILING_IN_PYSTON 1
+ #define CYTHON_COMPILING_IN_CPYTHON 0
+ #define CYTHON_COMPILING_IN_NOGIL 0
+ #ifndef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 1
+ #endif
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #undef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 0
+ #undef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 0
+ #ifndef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 1
+ #endif
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #ifndef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 0
+ #endif
+ #ifndef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 1
+ #endif
+ #ifndef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 1
+ #endif
+ #undef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 0
+ #undef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL 0
+ #undef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+ #undef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE 0
+ #undef CYTHON_USE_DICT_VERSIONS
+ #define CYTHON_USE_DICT_VERSIONS 0
+ #undef CYTHON_USE_EXC_INFO_STACK
+ #define CYTHON_USE_EXC_INFO_STACK 0
+ #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+ #define CYTHON_UPDATE_DESCRIPTOR_DOC 0
+ #endif
+#elif defined(PY_NOGIL)
+ #define CYTHON_COMPILING_IN_PYPY 0
+ #define CYTHON_COMPILING_IN_PYSTON 0
+ #define CYTHON_COMPILING_IN_CPYTHON 0
+ #define CYTHON_COMPILING_IN_NOGIL 1
+ #ifndef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 1
+ #endif
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #ifndef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 1
+ #endif
+ #undef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 0
+ #ifndef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 1
+ #endif
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #ifndef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 0
+ #endif
+ #ifndef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 1
+ #endif
+ #ifndef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 1
+ #endif
+ #undef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 0
+ #undef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL 0
+ #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT 1
+ #endif
+ #ifndef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE 1
+ #endif
+ #undef CYTHON_USE_DICT_VERSIONS
+ #define CYTHON_USE_DICT_VERSIONS 0
+ #undef CYTHON_USE_EXC_INFO_STACK
+ #define CYTHON_USE_EXC_INFO_STACK 0
+#else
+ #define CYTHON_COMPILING_IN_PYPY 0
+ #define CYTHON_COMPILING_IN_PYSTON 0
+ #define CYTHON_COMPILING_IN_CPYTHON 1
+ #define CYTHON_COMPILING_IN_NOGIL 0
+ #ifndef CYTHON_USE_TYPE_SLOTS
+ #define CYTHON_USE_TYPE_SLOTS 1
+ #endif
+ #if PY_VERSION_HEX < 0x02070000
+ #undef CYTHON_USE_PYTYPE_LOOKUP
+ #define CYTHON_USE_PYTYPE_LOOKUP 0
+ #elif !defined(CYTHON_USE_PYTYPE_LOOKUP)
+ #define CYTHON_USE_PYTYPE_LOOKUP 1
+ #endif
+ #if PY_MAJOR_VERSION < 3
+ #undef CYTHON_USE_ASYNC_SLOTS
+ #define CYTHON_USE_ASYNC_SLOTS 0
+ #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+ #define CYTHON_USE_ASYNC_SLOTS 1
+ #endif
+ #if PY_VERSION_HEX < 0x02070000
+ #undef CYTHON_USE_PYLONG_INTERNALS
+ #define CYTHON_USE_PYLONG_INTERNALS 0
+ #elif !defined(CYTHON_USE_PYLONG_INTERNALS)
+ #define CYTHON_USE_PYLONG_INTERNALS (PY_VERSION_HEX < 0x030C00A5)
+ #endif
+ #ifndef CYTHON_USE_PYLIST_INTERNALS
+ #define CYTHON_USE_PYLIST_INTERNALS 1
+ #endif
+ #ifndef CYTHON_USE_UNICODE_INTERNALS
+ #define CYTHON_USE_UNICODE_INTERNALS 1
+ #endif
+ #if PY_VERSION_HEX < 0x030300F0 || PY_VERSION_HEX >= 0x030B00A2
+ #undef CYTHON_USE_UNICODE_WRITER
+ #define CYTHON_USE_UNICODE_WRITER 0
+ #elif !defined(CYTHON_USE_UNICODE_WRITER)
+ #define CYTHON_USE_UNICODE_WRITER 1
+ #endif
+ #ifndef CYTHON_AVOID_BORROWED_REFS
+ #define CYTHON_AVOID_BORROWED_REFS 0
+ #endif
+ #ifndef CYTHON_ASSUME_SAFE_MACROS
+ #define CYTHON_ASSUME_SAFE_MACROS 1
+ #endif
+ #ifndef CYTHON_UNPACK_METHODS
+ #define CYTHON_UNPACK_METHODS 1
+ #endif
+ #if PY_VERSION_HEX >= 0x030B00A4
+ #undef CYTHON_FAST_THREAD_STATE
+ #define CYTHON_FAST_THREAD_STATE 0
+ #elif !defined(CYTHON_FAST_THREAD_STATE)
+ #define CYTHON_FAST_THREAD_STATE 1
+ #endif
+ #ifndef CYTHON_FAST_PYCALL
+ #define CYTHON_FAST_PYCALL (PY_VERSION_HEX < 0x030A0000)
+ #endif
+ #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
+ #define CYTHON_PEP489_MULTI_PHASE_INIT (PY_VERSION_HEX >= 0x03050000)
+ #endif
+ #ifndef CYTHON_USE_TP_FINALIZE
+ #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1)
+ #endif
+ #ifndef CYTHON_USE_DICT_VERSIONS
+ #define CYTHON_USE_DICT_VERSIONS ((PY_VERSION_HEX >= 0x030600B1) && (PY_VERSION_HEX < 0x030C00A5))
+ #endif
+ #if PY_VERSION_HEX >= 0x030B00A4
+ #undef CYTHON_USE_EXC_INFO_STACK
+ #define CYTHON_USE_EXC_INFO_STACK 0
+ #elif !defined(CYTHON_USE_EXC_INFO_STACK)
+ #define CYTHON_USE_EXC_INFO_STACK (PY_VERSION_HEX >= 0x030700A3)
+ #endif
+ #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC
+ #define CYTHON_UPDATE_DESCRIPTOR_DOC 1
+ #endif
+#endif
+#if !defined(CYTHON_FAST_PYCCALL)
+#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1)
+#endif
+#if CYTHON_USE_PYLONG_INTERNALS
+ #if PY_MAJOR_VERSION < 3
+ #include "longintrepr.h"
+ #endif
+ #undef SHIFT
+ #undef BASE
+ #undef MASK
+ #ifdef SIZEOF_VOID_P
+ enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) };
+ #endif
+#endif
+#ifndef __has_attribute
+ #define __has_attribute(x) 0
+#endif
+#ifndef __has_cpp_attribute
+ #define __has_cpp_attribute(x) 0
+#endif
+#ifndef CYTHON_RESTRICT
+ #if defined(__GNUC__)
+ #define CYTHON_RESTRICT __restrict__
+ #elif defined(_MSC_VER) && _MSC_VER >= 1400
+ #define CYTHON_RESTRICT __restrict
+ #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ #define CYTHON_RESTRICT restrict
+ #else
+ #define CYTHON_RESTRICT
+ #endif
+#endif
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+# define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+# define CYTHON_UNUSED
+# endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+# define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+# define CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_MAYBE_UNUSED_VAR
+# if defined(__cplusplus)
+ template<class T> void CYTHON_MAYBE_UNUSED_VAR( const T& ) { }
+# else
+# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x)
+# endif
+#endif
+#ifndef CYTHON_NCP_UNUSED
+# if CYTHON_COMPILING_IN_CPYTHON
+# define CYTHON_NCP_UNUSED
+# else
+# define CYTHON_NCP_UNUSED CYTHON_UNUSED
+# endif
+#endif
+#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None)
+#ifdef _MSC_VER
+ #ifndef _MSC_STDINT_H_
+ #if _MSC_VER < 1300
+ typedef unsigned char uint8_t;
+ typedef unsigned int uint32_t;
+ #else
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int32 uint32_t;
+ #endif
+ #endif
+#else
+ #include <stdint.h>
+#endif
+#ifndef CYTHON_FALLTHROUGH
+ #if defined(__cplusplus) && __cplusplus >= 201103L
+ #if __has_cpp_attribute(fallthrough)
+ #define CYTHON_FALLTHROUGH [[fallthrough]]
+ #elif __has_cpp_attribute(clang::fallthrough)
+ #define CYTHON_FALLTHROUGH [[clang::fallthrough]]
+ #elif __has_cpp_attribute(gnu::fallthrough)
+ #define CYTHON_FALLTHROUGH [[gnu::fallthrough]]
+ #endif
+ #endif
+ #ifndef CYTHON_FALLTHROUGH
+ #if __has_attribute(fallthrough)
+ #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
+ #else
+ #define CYTHON_FALLTHROUGH
+ #endif
+ #endif
+ #if defined(__clang__ ) && defined(__apple_build_version__)
+ #if __apple_build_version__ < 7000000
+ #undef CYTHON_FALLTHROUGH
+ #define CYTHON_FALLTHROUGH
+ #endif
+ #endif
+#endif
+
+#ifndef CYTHON_INLINE
+ #if defined(__clang__)
+ #define CYTHON_INLINE __inline__ __attribute__ ((__unused__))
+ #elif defined(__GNUC__)
+ #define CYTHON_INLINE __inline__
+ #elif defined(_MSC_VER)
+ #define CYTHON_INLINE __inline
+ #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ #define CYTHON_INLINE inline
+ #else
+ #define CYTHON_INLINE
+ #endif
+#endif
+
+#define __PYX_BUILD_PY_SSIZE_T "n"
+#define CYTHON_FORMAT_SSIZE_T "z"
+#if PY_MAJOR_VERSION < 3
+ #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+ #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+ PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+ #define __Pyx_DefaultClassType PyClass_Type
+#else
+ #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+ #define __Pyx_DefaultClassType PyType_Type
+#if PY_VERSION_HEX >= 0x030B00A1
+ static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int k, int l, int s, int f,
+ PyObject *code, PyObject *c, PyObject* n, PyObject *v,
+ PyObject *fv, PyObject *cell, PyObject* fn,
+ PyObject *name, int fline, PyObject *lnos) {
+ PyObject *kwds=NULL, *argcount=NULL, *posonlyargcount=NULL, *kwonlyargcount=NULL;
+ PyObject *nlocals=NULL, *stacksize=NULL, *flags=NULL, *replace=NULL, *call_result=NULL, *empty=NULL;
+ const char *fn_cstr=NULL;
+ const char *name_cstr=NULL;
+ PyCodeObject* co=NULL;
+ PyObject *type, *value, *traceback;
+ PyErr_Fetch(&type, &value, &traceback);
+ if (!(kwds=PyDict_New())) goto end;
+ if (!(argcount=PyLong_FromLong(a))) goto end;
+ if (PyDict_SetItemString(kwds, "co_argcount", argcount) != 0) goto end;
+ if (!(posonlyargcount=PyLong_FromLong(0))) goto end;
+ if (PyDict_SetItemString(kwds, "co_posonlyargcount", posonlyargcount) != 0) goto end;
+ if (!(kwonlyargcount=PyLong_FromLong(k))) goto end;
+ if (PyDict_SetItemString(kwds, "co_kwonlyargcount", kwonlyargcount) != 0) goto end;
+ if (!(nlocals=PyLong_FromLong(l))) goto end;
+ if (PyDict_SetItemString(kwds, "co_nlocals", nlocals) != 0) goto end;
+ if (!(stacksize=PyLong_FromLong(s))) goto end;
+ if (PyDict_SetItemString(kwds, "co_stacksize", stacksize) != 0) goto end;
+ if (!(flags=PyLong_FromLong(f))) goto end;
+ if (PyDict_SetItemString(kwds, "co_flags", flags) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_code", code) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_consts", c) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_names", n) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_varnames", v) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_freevars", fv) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_cellvars", cell) != 0) goto end;
+ if (PyDict_SetItemString(kwds, "co_linetable", lnos) != 0) goto end;
+ if (!(fn_cstr=PyUnicode_AsUTF8AndSize(fn, NULL))) goto end;
+ if (!(name_cstr=PyUnicode_AsUTF8AndSize(name, NULL))) goto end;
+ if (!(co = PyCode_NewEmpty(fn_cstr, name_cstr, fline))) goto end;
+ if (!(replace = PyObject_GetAttrString((PyObject*)co, "replace"))) goto cleanup_code_too;
+ if (!(empty = PyTuple_New(0))) goto cleanup_code_too; // unfortunately __pyx_empty_tuple isn't available here
+ if (!(call_result = PyObject_Call(replace, empty, kwds))) goto cleanup_code_too;
+ Py_XDECREF((PyObject*)co);
+ co = (PyCodeObject*)call_result;
+ call_result = NULL;
+ if (0) {
+ cleanup_code_too:
+ Py_XDECREF((PyObject*)co);
+ co = NULL;
+ }
+ end:
+ Py_XDECREF(kwds);
+ Py_XDECREF(argcount);
+ Py_XDECREF(posonlyargcount);
+ Py_XDECREF(kwonlyargcount);
+ Py_XDECREF(nlocals);
+ Py_XDECREF(stacksize);
+ Py_XDECREF(replace);
+ Py_XDECREF(call_result);
+ Py_XDECREF(empty);
+ if (type) {
+ PyErr_Restore(type, value, traceback);
+ }
+ return co;
+ }
+#else
+ #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+ PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+#endif
+ #define __Pyx_DefaultClassType PyType_Type
+#endif
+#if PY_VERSION_HEX >= 0x030900F0 && !CYTHON_COMPILING_IN_PYPY
+ #define __Pyx_PyObject_GC_IsFinalized(o) PyObject_GC_IsFinalized(o)
+#else
+ #define __Pyx_PyObject_GC_IsFinalized(o) _PyGC_FINALIZED(o)
+#endif
+#ifndef Py_TPFLAGS_CHECKTYPES
+ #define Py_TPFLAGS_CHECKTYPES 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_INDEX
+ #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
+ #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_FINALIZE
+ #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+#ifndef METH_STACKLESS
+ #define METH_STACKLESS 0
+#endif
+#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL)
+ #ifndef METH_FASTCALL
+ #define METH_FASTCALL 0x80
+ #endif
+ typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs);
+ typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args,
+ Py_ssize_t nargs, PyObject *kwnames);
+#else
+ #define __Pyx_PyCFunctionFast _PyCFunctionFast
+ #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords
+#endif
+#if CYTHON_FAST_PYCCALL
+#define __Pyx_PyFastCFunction_Check(func)\
+ ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS | METH_STACKLESS)))))
+#else
+#define __Pyx_PyFastCFunction_Check(func) 0
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc)
+ #define PyObject_Malloc(s) PyMem_Malloc(s)
+ #define PyObject_Free(p) PyMem_Free(p)
+ #define PyObject_Realloc(p) PyMem_Realloc(p)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030400A1
+ #define PyMem_RawMalloc(n) PyMem_Malloc(n)
+ #define PyMem_RawRealloc(p, n) PyMem_Realloc(p, n)
+ #define PyMem_RawFree(p) PyMem_Free(p)
+#endif
+#if CYTHON_COMPILING_IN_PYSTON
+ #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co)
+ #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno)
+#else
+ #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0)
+ #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno)
+#endif
+#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000
+ #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#elif PY_VERSION_HEX >= 0x03060000
+ #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet()
+#elif PY_VERSION_HEX >= 0x03000000
+ #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#else
+ #define __Pyx_PyThreadState_Current _PyThreadState_Current
+#endif
+#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT)
+#include "pythread.h"
+#define Py_tss_NEEDS_INIT 0
+typedef int Py_tss_t;
+static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {
+ *key = PyThread_create_key();
+ return 0;
+}
+static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {
+ Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t));
+ *key = Py_tss_NEEDS_INIT;
+ return key;
+}
+static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {
+ PyObject_Free(key);
+}
+static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {
+ return *key != Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {
+ PyThread_delete_key(*key);
+ *key = Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {
+ return PyThread_set_key_value(*key, value);
+}
+static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
+ return PyThread_get_key_value(*key);
+}
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized)
+#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? PyDict_New() : _PyDict_NewPresized(n))
+#else
+#define __Pyx_PyDict_NewPresized(n) PyDict_New()
+#endif
+#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION
+ #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y)
+ #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y)
+#else
+ #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y)
+ #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS
+#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash)
+#else
+#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name)
+#endif
+#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
+ #define CYTHON_PEP393_ENABLED 1
+ #if PY_VERSION_HEX >= 0x030C0000
+ #define __Pyx_PyUnicode_READY(op) (0)
+ #else
+ #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\
+ 0 : _PyUnicode_Ready((PyObject *)(op)))
+ #endif
+ #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u)
+ #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+ #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u)
+ #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u)
+ #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u)
+ #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i)
+ #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch)
+ #if PY_VERSION_HEX >= 0x030C0000
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u))
+ #else
+ #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03090000
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : ((PyCompactUnicodeObject *)(u))->wstr_length))
+ #else
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+ #endif
+ #endif
+#else
+ #define CYTHON_PEP393_ENABLED 0
+ #define PyUnicode_1BYTE_KIND 1
+ #define PyUnicode_2BYTE_KIND 2
+ #define PyUnicode_4BYTE_KIND 4
+ #define __Pyx_PyUnicode_READY(op) (0)
+ #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u)
+ #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+ #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111)
+ #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE))
+ #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u))
+ #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+ #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch)
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u))
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+ #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b)
+ #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b)
+#else
+ #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b)
+ #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\
+ PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains)
+ #define PyUnicode_Contains(u, s) PySequence_Contains(u, s)
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check)
+ #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type)
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format)
+ #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt)
+#endif
+#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
+#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
+#if PY_MAJOR_VERSION >= 3
+ #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b)
+#else
+ #define __Pyx_PyString_Format(a, b) PyString_Format(a, b)
+#endif
+#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII)
+ #define PyObject_ASCII(o) PyObject_Repr(o)
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define PyBaseString_Type PyUnicode_Type
+ #define PyStringObject PyUnicodeObject
+ #define PyString_Type PyUnicode_Type
+ #define PyString_Check PyUnicode_Check
+ #define PyString_CheckExact PyUnicode_CheckExact
+#ifndef PyObject_Unicode
+ #define PyObject_Unicode PyObject_Str
+#endif
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+ #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+ #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj))
+ #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#ifndef PySet_CheckExact
+ #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type)
+#endif
+#if PY_VERSION_HEX >= 0x030900A4
+ #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt)
+ #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size)
+#else
+ #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt)
+ #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size)
+#endif
+#if CYTHON_ASSUME_SAFE_MACROS
+ #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq)
+#else
+ #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq)
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define PyIntObject PyLongObject
+ #define PyInt_Type PyLong_Type
+ #define PyInt_Check(op) PyLong_Check(op)
+ #define PyInt_CheckExact(op) PyLong_CheckExact(op)
+ #define PyInt_FromString PyLong_FromString
+ #define PyInt_FromUnicode PyLong_FromUnicode
+ #define PyInt_FromLong PyLong_FromLong
+ #define PyInt_FromSize_t PyLong_FromSize_t
+ #define PyInt_FromSsize_t PyLong_FromSsize_t
+ #define PyInt_AsLong PyLong_AsLong
+ #define PyInt_AS_LONG PyLong_AS_LONG
+ #define PyInt_AsSsize_t PyLong_AsSsize_t
+ #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask
+ #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+ #define PyNumber_Int PyNumber_Long
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define PyBoolObject PyLongObject
+#endif
+#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY
+ #ifndef PyUnicode_InternFromString
+ #define PyUnicode_InternFromString(s) PyUnicode_FromString(s)
+ #endif
+#endif
+#if PY_VERSION_HEX < 0x030200A4
+ typedef long Py_hash_t;
+ #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+ #define __Pyx_PyInt_AsHash_t __Pyx_PyIndex_AsHash_t
+#else
+ #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+ #define __Pyx_PyInt_AsHash_t __Pyx_PyIndex_AsSsize_t
+#endif
+#if PY_MAJOR_VERSION >= 3
+ #define __Pyx_PyMethod_New(func, self, klass) ((self) ? ((void)(klass), PyMethod_New(func, self)) : __Pyx_NewRef(func))
+#else
+ #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass)
+#endif
+#if CYTHON_USE_ASYNC_SLOTS
+ #if PY_VERSION_HEX >= 0x030500B1
+ #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods
+ #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async)
+ #else
+ #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved))
+ #endif
+#else
+ #define __Pyx_PyType_AsAsync(obj) NULL
+#endif
+#ifndef __Pyx_PyAsyncMethodsStruct
+ typedef struct {
+ unaryfunc am_await;
+ unaryfunc am_aiter;
+ unaryfunc am_anext;
+ } __Pyx_PyAsyncMethodsStruct;
+#endif
+
+#if defined(_WIN32) || defined(WIN32) || defined(MS_WINDOWS)
+ #if !defined(_USE_MATH_DEFINES)
+ #define _USE_MATH_DEFINES
+ #endif
+#endif
+#include <math.h>
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() {
+ float value;
+ memset(&value, 0xFF, sizeof(value));
+ return value;
+}
+#endif
+#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
+#define __Pyx_truncl trunc
+#else
+#define __Pyx_truncl truncl
+#endif
+
+#define __PYX_MARK_ERR_POS(f_index, lineno) \
+ { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; }
+#define __PYX_ERR(f_index, lineno, Ln_error) \
+ { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; }
+
+#ifndef __PYX_EXTERN_C
+ #ifdef __cplusplus
+ #define __PYX_EXTERN_C extern "C"
+ #else
+ #define __PYX_EXTERN_C extern
+ #endif
+#endif
+
+#define __PYX_HAVE__TTS__tts__utils__monotonic_align__core
+#define __PYX_HAVE_API__TTS__tts__utils__monotonic_align__core
+/* Early includes */
+#include <string.h>
+#include <stdio.h>
+
+ /* Using NumPy API declarations from "numpy/__init__.pxd" */
+
+#include "numpy/arrayobject.h"
+#include "numpy/ndarrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/arrayscalars.h"
+#include "numpy/ufuncobject.h"
+#include "pythread.h"
+#include <stdlib.h>
+#include "pystate.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS)
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding;
+ const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry;
+
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8)
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#define __Pyx_uchar_cast(c) ((unsigned char)c)
+#define __Pyx_long_cast(x) ((long)x)
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\
+ (sizeof(type) < sizeof(Py_ssize_t)) ||\
+ (sizeof(type) > sizeof(Py_ssize_t) &&\
+ likely(v < (type)PY_SSIZE_T_MAX ||\
+ v == (type)PY_SSIZE_T_MAX) &&\
+ (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\
+ v == (type)PY_SSIZE_T_MIN))) ||\
+ (sizeof(type) == sizeof(Py_ssize_t) &&\
+ (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\
+ v == (type)PY_SSIZE_T_MAX))) )
+static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) {
+ return (size_t) i < (size_t) limit;
+}
+#if defined (__cplusplus) && __cplusplus >= 201103L
+ #include <cstdlib>
+ #define __Pyx_sst_abs(value) std::abs(value)
+#elif SIZEOF_INT >= SIZEOF_SIZE_T
+ #define __Pyx_sst_abs(value) abs(value)
+#elif SIZEOF_LONG >= SIZEOF_SIZE_T
+ #define __Pyx_sst_abs(value) labs(value)
+#elif defined (_MSC_VER)
+ #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value))
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ #define __Pyx_sst_abs(value) llabs(value)
+#elif defined (__GNUC__)
+ #define __Pyx_sst_abs(value) __builtin_llabs(value)
+#else
+ #define __Pyx_sst_abs(value) ((value<0) ? -value : value)
+#endif
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s))
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*);
+#if PY_MAJOR_VERSION < 3
+ #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString
+ #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+ #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString
+ #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s)
+#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s)
+#define __Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s)
+#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s)
+static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) {
+ const Py_UNICODE *u_end = u;
+ while (*u_end++) ;
+ return (size_t)(u_end - u - 1);
+}
+#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
+#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
+#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode
+#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj)
+#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None)
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x);
+#define __Pyx_PySequence_Tuple(obj)\
+ (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj))
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject*);
+#if CYTHON_ASSUME_SAFE_MACROS
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x))
+#else
+#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x))
+#endif
+#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Float(x))
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+static int __Pyx_sys_getdefaultencoding_not_ascii;
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+ PyObject* sys;
+ PyObject* default_encoding = NULL;
+ PyObject* ascii_chars_u = NULL;
+ PyObject* ascii_chars_b = NULL;
+ const char* default_encoding_c;
+ sys = PyImport_ImportModule("sys");
+ if (!sys) goto bad;
+ default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL);
+ Py_DECREF(sys);
+ if (!default_encoding) goto bad;
+ default_encoding_c = PyBytes_AsString(default_encoding);
+ if (!default_encoding_c) goto bad;
+ if (strcmp(default_encoding_c, "ascii") == 0) {
+ __Pyx_sys_getdefaultencoding_not_ascii = 0;
+ } else {
+ char ascii_chars[128];
+ int c;
+ for (c = 0; c < 128; c++) {
+ ascii_chars[c] = c;
+ }
+ __Pyx_sys_getdefaultencoding_not_ascii = 1;
+ ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
+ if (!ascii_chars_u) goto bad;
+ ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);
+ if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) {
+ PyErr_Format(
+ PyExc_ValueError,
+ "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+ default_encoding_c);
+ goto bad;
+ }
+ Py_DECREF(ascii_chars_u);
+ Py_DECREF(ascii_chars_b);
+ }
+ Py_DECREF(default_encoding);
+ return 0;
+bad:
+ Py_XDECREF(default_encoding);
+ Py_XDECREF(ascii_chars_u);
+ Py_XDECREF(ascii_chars_b);
+ return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+static char* __PYX_DEFAULT_STRING_ENCODING;
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+ PyObject* sys;
+ PyObject* default_encoding = NULL;
+ char* default_encoding_c;
+ sys = PyImport_ImportModule("sys");
+ if (!sys) goto bad;
+ default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+ Py_DECREF(sys);
+ if (!default_encoding) goto bad;
+ default_encoding_c = PyBytes_AsString(default_encoding);
+ if (!default_encoding_c) goto bad;
+ __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1);
+ if (!__PYX_DEFAULT_STRING_ENCODING) goto bad;
+ strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+ Py_DECREF(default_encoding);
+ return 0;
+bad:
+ Py_XDECREF(default_encoding);
+ return -1;
+}
+#endif
+#endif
+
+
+/* Test for GCC > 2.95 */
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
+ #define likely(x) __builtin_expect(!!(x), 1)
+ #define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* !__GNUC__ or GCC < 2.95 */
+ #define likely(x) (x)
+ #define unlikely(x) (x)
+#endif /* __GNUC__ */
+static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; }
+
+static PyObject *__pyx_m = NULL;
+static PyObject *__pyx_d;
+static PyObject *__pyx_b;
+static PyObject *__pyx_cython_runtime = NULL;
+static PyObject *__pyx_empty_tuple;
+static PyObject *__pyx_empty_bytes;
+static PyObject *__pyx_empty_unicode;
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm= __FILE__;
+static const char *__pyx_filename;
+
+/* Header.proto */
+#if !defined(CYTHON_CCOMPLEX)
+ #if defined(__cplusplus)
+ #define CYTHON_CCOMPLEX 1
+ #elif (defined(_Complex_I) && !defined(_MSC_VER))
+ #define CYTHON_CCOMPLEX 1
+ #else
+ #define CYTHON_CCOMPLEX 0
+ #endif
+#endif
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ #include <complex>
+ #else
+ #include <complex.h>
+ #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__)
+ #undef _Complex_I
+ #define _Complex_I 1.0fj
+#endif
+
+
+static const char *__pyx_f[] = {
+ "TTS/tts/utils/monotonic_align/core.pyx",
+ "__init__.pxd",
+ "stringsource",
+ "type.pxd",
+};
+/* NoFastGil.proto */
+#define __Pyx_PyGILState_Ensure PyGILState_Ensure
+#define __Pyx_PyGILState_Release PyGILState_Release
+#define __Pyx_FastGIL_Remember()
+#define __Pyx_FastGIL_Forget()
+#define __Pyx_FastGilFuncInit()
+
+/* MemviewSliceStruct.proto */
+struct __pyx_memoryview_obj;
+typedef struct {
+ struct __pyx_memoryview_obj *memview;
+ char *data;
+ Py_ssize_t shape[8];
+ Py_ssize_t strides[8];
+ Py_ssize_t suboffsets[8];
+} __Pyx_memviewslice;
+#define __Pyx_MemoryView_Len(m) (m.shape[0])
+
+/* Atomics.proto */
+#include <pythread.h>
+#ifndef CYTHON_ATOMICS
+ #define CYTHON_ATOMICS 1
+#endif
+#define __PYX_CYTHON_ATOMICS_ENABLED() CYTHON_ATOMICS
+#define __pyx_atomic_int_type int
+#if CYTHON_ATOMICS && (__GNUC__ >= 5 || (__GNUC__ == 4 &&\
+ (__GNUC_MINOR__ > 1 ||\
+ (__GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ >= 2))))
+ #define __pyx_atomic_incr_aligned(value) __sync_fetch_and_add(value, 1)
+ #define __pyx_atomic_decr_aligned(value) __sync_fetch_and_sub(value, 1)
+ #ifdef __PYX_DEBUG_ATOMICS
+ #warning "Using GNU atomics"
+ #endif
+#elif CYTHON_ATOMICS && defined(_MSC_VER) && CYTHON_COMPILING_IN_NOGIL
+ #include <intrin.h>
+ #undef __pyx_atomic_int_type
+ #define __pyx_atomic_int_type long
+ #pragma intrinsic (_InterlockedExchangeAdd)
+ #define __pyx_atomic_incr_aligned(value) _InterlockedExchangeAdd(value, 1)
+ #define __pyx_atomic_decr_aligned(value) _InterlockedExchangeAdd(value, -1)
+ #ifdef __PYX_DEBUG_ATOMICS
+ #pragma message ("Using MSVC atomics")
+ #endif
+#else
+ #undef CYTHON_ATOMICS
+ #define CYTHON_ATOMICS 0
+ #ifdef __PYX_DEBUG_ATOMICS
+ #warning "Not using atomics"
+ #endif
+#endif
+typedef volatile __pyx_atomic_int_type __pyx_atomic_int;
+#if CYTHON_ATOMICS
+ #define __pyx_add_acquisition_count(memview)\
+ __pyx_atomic_incr_aligned(__pyx_get_slice_count_pointer(memview))
+ #define __pyx_sub_acquisition_count(memview)\
+ __pyx_atomic_decr_aligned(__pyx_get_slice_count_pointer(memview))
+#else
+ #define __pyx_add_acquisition_count(memview)\
+ __pyx_add_acquisition_count_locked(__pyx_get_slice_count_pointer(memview), memview->lock)
+ #define __pyx_sub_acquisition_count(memview)\
+ __pyx_sub_acquisition_count_locked(__pyx_get_slice_count_pointer(memview), memview->lock)
+#endif
+
+/* ForceInitThreads.proto */
+#ifndef __PYX_FORCE_INIT_THREADS
+ #define __PYX_FORCE_INIT_THREADS 0
+#endif
+
+/* BufferFormatStructs.proto */
+#define IS_UNSIGNED(type) (((type) -1) > 0)
+struct __Pyx_StructField_;
+#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0)
+typedef struct {
+ const char* name;
+ struct __Pyx_StructField_* fields;
+ size_t size;
+ size_t arraysize[8];
+ int ndim;
+ char typegroup;
+ char is_unsigned;
+ int flags;
+} __Pyx_TypeInfo;
+typedef struct __Pyx_StructField_ {
+ __Pyx_TypeInfo* type;
+ const char* name;
+ size_t offset;
+} __Pyx_StructField;
+typedef struct {
+ __Pyx_StructField* field;
+ size_t parent_offset;
+} __Pyx_BufFmt_StackElem;
+typedef struct {
+ __Pyx_StructField root;
+ __Pyx_BufFmt_StackElem* head;
+ size_t fmt_offset;
+ size_t new_count, enc_count;
+ size_t struct_alignment;
+ int is_complex;
+ char enc_type;
+ char new_packmode;
+ char enc_packmode;
+ char is_valid_array;
+} __Pyx_BufFmt_Context;
+
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":702
+ * # in Cython to enable them only on the right systems.
+ *
+ * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<<
+ * ctypedef npy_int16 int16_t
+ * ctypedef npy_int32 int32_t
+ */
+typedef npy_int8 __pyx_t_5numpy_int8_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":703
+ *
+ * ctypedef npy_int8 int8_t
+ * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<<
+ * ctypedef npy_int32 int32_t
+ * ctypedef npy_int64 int64_t
+ */
+typedef npy_int16 __pyx_t_5numpy_int16_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":704
+ * ctypedef npy_int8 int8_t
+ * ctypedef npy_int16 int16_t
+ * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<<
+ * ctypedef npy_int64 int64_t
+ * #ctypedef npy_int96 int96_t
+ */
+typedef npy_int32 __pyx_t_5numpy_int32_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":705
+ * ctypedef npy_int16 int16_t
+ * ctypedef npy_int32 int32_t
+ * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<<
+ * #ctypedef npy_int96 int96_t
+ * #ctypedef npy_int128 int128_t
+ */
+typedef npy_int64 __pyx_t_5numpy_int64_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":709
+ * #ctypedef npy_int128 int128_t
+ *
+ * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uint16 uint16_t
+ * ctypedef npy_uint32 uint32_t
+ */
+typedef npy_uint8 __pyx_t_5numpy_uint8_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":710
+ *
+ * ctypedef npy_uint8 uint8_t
+ * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uint32 uint32_t
+ * ctypedef npy_uint64 uint64_t
+ */
+typedef npy_uint16 __pyx_t_5numpy_uint16_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":711
+ * ctypedef npy_uint8 uint8_t
+ * ctypedef npy_uint16 uint16_t
+ * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uint64 uint64_t
+ * #ctypedef npy_uint96 uint96_t
+ */
+typedef npy_uint32 __pyx_t_5numpy_uint32_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":712
+ * ctypedef npy_uint16 uint16_t
+ * ctypedef npy_uint32 uint32_t
+ * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<<
+ * #ctypedef npy_uint96 uint96_t
+ * #ctypedef npy_uint128 uint128_t
+ */
+typedef npy_uint64 __pyx_t_5numpy_uint64_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":716
+ * #ctypedef npy_uint128 uint128_t
+ *
+ * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<<
+ * ctypedef npy_float64 float64_t
+ * #ctypedef npy_float80 float80_t
+ */
+typedef npy_float32 __pyx_t_5numpy_float32_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":717
+ *
+ * ctypedef npy_float32 float32_t
+ * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<<
+ * #ctypedef npy_float80 float80_t
+ * #ctypedef npy_float128 float128_t
+ */
+typedef npy_float64 __pyx_t_5numpy_float64_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":724
+ * ctypedef double complex complex128_t
+ *
+ * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong ulonglong_t
+ *
+ */
+typedef npy_longlong __pyx_t_5numpy_longlong_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":725
+ *
+ * ctypedef npy_longlong longlong_t
+ * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_intp intp_t
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":727
+ * ctypedef npy_ulonglong ulonglong_t
+ *
+ * ctypedef npy_intp intp_t # <<<<<<<<<<<<<<
+ * ctypedef npy_uintp uintp_t
+ *
+ */
+typedef npy_intp __pyx_t_5numpy_intp_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":728
+ *
+ * ctypedef npy_intp intp_t
+ * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef npy_double float_t
+ */
+typedef npy_uintp __pyx_t_5numpy_uintp_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":730
+ * ctypedef npy_uintp uintp_t
+ *
+ * ctypedef npy_double float_t # <<<<<<<<<<<<<<
+ * ctypedef npy_double double_t
+ * ctypedef npy_longdouble longdouble_t
+ */
+typedef npy_double __pyx_t_5numpy_float_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":731
+ *
+ * ctypedef npy_double float_t
+ * ctypedef npy_double double_t # <<<<<<<<<<<<<<
+ * ctypedef npy_longdouble longdouble_t
+ *
+ */
+typedef npy_double __pyx_t_5numpy_double_t;
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":732
+ * ctypedef npy_double float_t
+ * ctypedef npy_double double_t
+ * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<<
+ *
+ * ctypedef float complex cfloat_t
+ */
+typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ typedef ::std::complex< float > __pyx_t_float_complex;
+ #else
+ typedef float _Complex __pyx_t_float_complex;
+ #endif
+#else
+ typedef struct { float real, imag; } __pyx_t_float_complex;
+#endif
+static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float);
+
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ typedef ::std::complex< double > __pyx_t_double_complex;
+ #else
+ typedef double _Complex __pyx_t_double_complex;
+ #endif
+#else
+ typedef struct { double real, imag; } __pyx_t_double_complex;
+#endif
+static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double);
+
+/* Declarations.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ typedef ::std::complex< long double > __pyx_t_long_double_complex;
+ #else
+ typedef long double _Complex __pyx_t_long_double_complex;
+ #endif
+#else
+ typedef struct { long double real, imag; } __pyx_t_long_double_complex;
+#endif
+static CYTHON_INLINE __pyx_t_long_double_complex __pyx_t_long_double_complex_from_parts(long double, long double);
+
+
+/*--- Type declarations ---*/
+struct __pyx_array_obj;
+struct __pyx_MemviewEnum_obj;
+struct __pyx_memoryview_obj;
+struct __pyx_memoryviewslice_obj;
+struct __pyx_opt_args_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c;
+
+/* "TTS/tts/utils/monotonic_align/core.pyx":42
+ * @cython.boundscheck(False)
+ * @cython.wraparound(False)
+ * cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: # <<<<<<<<<<<<<<
+ * cdef int b = values.shape[0]
+ *
+ */
+struct __pyx_opt_args_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c {
+ int __pyx_n;
+ float max_neg_val;
+};
+
+/* "View.MemoryView":106
+ *
+ * @cname("__pyx_array")
+ * cdef class array: # <<<<<<<<<<<<<<
+ *
+ * cdef:
+ */
+struct __pyx_array_obj {
+ PyObject_HEAD
+ struct __pyx_vtabstruct_array *__pyx_vtab;
+ char *data;
+ Py_ssize_t len;
+ char *format;
+ int ndim;
+ Py_ssize_t *_shape;
+ Py_ssize_t *_strides;
+ Py_ssize_t itemsize;
+ PyObject *mode;
+ PyObject *_format;
+ void (*callback_free_data)(void *);
+ int free_data;
+ int dtype_is_object;
+};
+
+
+/* "View.MemoryView":280
+ *
+ * @cname('__pyx_MemviewEnum')
+ * cdef class Enum(object): # <<<<<<<<<<<<<<
+ * cdef object name
+ * def __init__(self, name):
+ */
+struct __pyx_MemviewEnum_obj {
+ PyObject_HEAD
+ PyObject *name;
+};
+
+
+/* "View.MemoryView":331
+ *
+ * @cname('__pyx_memoryview')
+ * cdef class memoryview(object): # <<<<<<<<<<<<<<
+ *
+ * cdef object obj
+ */
+struct __pyx_memoryview_obj {
+ PyObject_HEAD
+ struct __pyx_vtabstruct_memoryview *__pyx_vtab;
+ PyObject *obj;
+ PyObject *_size;
+ PyObject *_array_interface;
+ PyThread_type_lock lock;
+ __pyx_atomic_int acquisition_count[2];
+ __pyx_atomic_int *acquisition_count_aligned_p;
+ Py_buffer view;
+ int flags;
+ int dtype_is_object;
+ __Pyx_TypeInfo *typeinfo;
+};
+
+
+/* "View.MemoryView":967
+ *
+ * @cname('__pyx_memoryviewslice')
+ * cdef class _memoryviewslice(memoryview): # <<<<<<<<<<<<<<
+ * "Internal class for passing memoryview slices to Python"
+ *
+ */
+struct __pyx_memoryviewslice_obj {
+ struct __pyx_memoryview_obj __pyx_base;
+ __Pyx_memviewslice from_slice;
+ PyObject *from_object;
+ PyObject *(*to_object_func)(char *);
+ int (*to_dtype_func)(char *, PyObject *);
+};
+
+
+
+/* "View.MemoryView":106
+ *
+ * @cname("__pyx_array")
+ * cdef class array: # <<<<<<<<<<<<<<
+ *
+ * cdef:
+ */
+
+struct __pyx_vtabstruct_array {
+ PyObject *(*get_memview)(struct __pyx_array_obj *);
+};
+static struct __pyx_vtabstruct_array *__pyx_vtabptr_array;
+
+
+/* "View.MemoryView":331
+ *
+ * @cname('__pyx_memoryview')
+ * cdef class memoryview(object): # <<<<<<<<<<<<<<
+ *
+ * cdef object obj
+ */
+
+struct __pyx_vtabstruct_memoryview {
+ char *(*get_item_pointer)(struct __pyx_memoryview_obj *, PyObject *);
+ PyObject *(*is_slice)(struct __pyx_memoryview_obj *, PyObject *);
+ PyObject *(*setitem_slice_assignment)(struct __pyx_memoryview_obj *, PyObject *, PyObject *);
+ PyObject *(*setitem_slice_assign_scalar)(struct __pyx_memoryview_obj *, struct __pyx_memoryview_obj *, PyObject *);
+ PyObject *(*setitem_indexed)(struct __pyx_memoryview_obj *, PyObject *, PyObject *);
+ PyObject *(*convert_item_to_object)(struct __pyx_memoryview_obj *, char *);
+ PyObject *(*assign_item_from_object)(struct __pyx_memoryview_obj *, char *, PyObject *);
+};
+static struct __pyx_vtabstruct_memoryview *__pyx_vtabptr_memoryview;
+
+
+/* "View.MemoryView":967
+ *
+ * @cname('__pyx_memoryviewslice')
+ * cdef class _memoryviewslice(memoryview): # <<<<<<<<<<<<<<
+ * "Internal class for passing memoryview slices to Python"
+ *
+ */
+
+struct __pyx_vtabstruct__memoryviewslice {
+ struct __pyx_vtabstruct_memoryview __pyx_base;
+};
+static struct __pyx_vtabstruct__memoryviewslice *__pyx_vtabptr__memoryviewslice;
+
+/* --- Runtime support code (head) --- */
+/* Refnanny.proto */
+#ifndef CYTHON_REFNANNY
+ #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY
+ typedef struct {
+ void (*INCREF)(void*, PyObject*, int);
+ void (*DECREF)(void*, PyObject*, int);
+ void (*GOTREF)(void*, PyObject*, int);
+ void (*GIVEREF)(void*, PyObject*, int);
+ void* (*SetupContext)(const char*, int, const char*);
+ void (*FinishContext)(void**);
+ } __Pyx_RefNannyAPIStruct;
+ static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+ static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname);
+ #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
+#ifdef WITH_THREAD
+ #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+ if (acquire_gil) {\
+ PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+ __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
+ PyGILState_Release(__pyx_gilstate_save);\
+ } else {\
+ __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
+ }
+#else
+ #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+ __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__)
+#endif
+ #define __Pyx_RefNannyFinishContext()\
+ __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+ #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+ #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0)
+ #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0)
+ #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0)
+ #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0)
+#else
+ #define __Pyx_RefNannyDeclarations
+ #define __Pyx_RefNannySetupContext(name, acquire_gil)
+ #define __Pyx_RefNannyFinishContext()
+ #define __Pyx_INCREF(r) Py_INCREF(r)
+ #define __Pyx_DECREF(r) Py_DECREF(r)
+ #define __Pyx_GOTREF(r)
+ #define __Pyx_GIVEREF(r)
+ #define __Pyx_XINCREF(r) Py_XINCREF(r)
+ #define __Pyx_XDECREF(r) Py_XDECREF(r)
+ #define __Pyx_XGOTREF(r)
+ #define __Pyx_XGIVEREF(r)
+#endif
+#define __Pyx_XDECREF_SET(r, v) do {\
+ PyObject *tmp = (PyObject *) r;\
+ r = v; __Pyx_XDECREF(tmp);\
+ } while (0)
+#define __Pyx_DECREF_SET(r, v) do {\
+ PyObject *tmp = (PyObject *) r;\
+ r = v; __Pyx_DECREF(tmp);\
+ } while (0)
+#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)
+#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)
+
+/* PyObjectGetAttrStr.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+/* GetBuiltinName.proto */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name);
+
+/* MemviewSliceInit.proto */
+#define __Pyx_BUF_MAX_NDIMS %(BUF_MAX_NDIMS)d
+#define __Pyx_MEMVIEW_DIRECT 1
+#define __Pyx_MEMVIEW_PTR 2
+#define __Pyx_MEMVIEW_FULL 4
+#define __Pyx_MEMVIEW_CONTIG 8
+#define __Pyx_MEMVIEW_STRIDED 16
+#define __Pyx_MEMVIEW_FOLLOW 32
+#define __Pyx_IS_C_CONTIG 1
+#define __Pyx_IS_F_CONTIG 2
+static int __Pyx_init_memviewslice(
+ struct __pyx_memoryview_obj *memview,
+ int ndim,
+ __Pyx_memviewslice *memviewslice,
+ int memview_is_new_reference);
+static CYTHON_INLINE int __pyx_add_acquisition_count_locked(
+ __pyx_atomic_int *acquisition_count, PyThread_type_lock lock);
+static CYTHON_INLINE int __pyx_sub_acquisition_count_locked(
+ __pyx_atomic_int *acquisition_count, PyThread_type_lock lock);
+#define __pyx_get_slice_count_pointer(memview) (memview->acquisition_count_aligned_p)
+#define __pyx_get_slice_count(memview) (*__pyx_get_slice_count_pointer(memview))
+#define __PYX_INC_MEMVIEW(slice, have_gil) __Pyx_INC_MEMVIEW(slice, have_gil, __LINE__)
+#define __PYX_XDEC_MEMVIEW(slice, have_gil) __Pyx_XDEC_MEMVIEW(slice, have_gil, __LINE__)
+static CYTHON_INLINE void __Pyx_INC_MEMVIEW(__Pyx_memviewslice *, int, int);
+static CYTHON_INLINE void __Pyx_XDEC_MEMVIEW(__Pyx_memviewslice *, int, int);
+
+/* RaiseArgTupleInvalid.proto */
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+ Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found);
+
+/* RaiseDoubleKeywords.proto */
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name);
+
+/* ParseKeywords.proto */
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\
+ PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\
+ const char* function_name);
+
+/* None.proto */
+static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname);
+
+/* PyThreadStateGet.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate;
+#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current;
+#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type
+#else
+#define __Pyx_PyThreadState_declare
+#define __Pyx_PyThreadState_assign
+#define __Pyx_PyErr_Occurred() PyErr_Occurred()
+#endif
+
+/* PyErrFetchRestore.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL)
+#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL))
+#else
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#endif
+#else
+#define __Pyx_PyErr_Clear() PyErr_Clear()
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb)
+#endif
+
+/* WriteUnraisableException.proto */
+static void __Pyx_WriteUnraisable(const char *name, int clineno,
+ int lineno, const char *filename,
+ int full_traceback, int nogil);
+
+/* GetTopmostException.proto */
+#if CYTHON_USE_EXC_INFO_STACK
+static _PyErr_StackItem * __Pyx_PyErr_GetTopmostException(PyThreadState *tstate);
+#endif
+
+/* SaveResetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+#else
+#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb)
+#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb)
+#endif
+
+/* PyErrExceptionMatches.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err)
+static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err);
+#else
+#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err)
+#endif
+
+/* GetException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb)
+static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* PyObjectCall.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw);
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+/* RaiseException.proto */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause);
+
+/* ArgTypeTest.proto */
+#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\
+ ((likely((Py_TYPE(obj) == type) | (none_allowed && (obj == Py_None)))) ? 1 :\
+ __Pyx__ArgTypeTest(obj, type, name, exact))
+static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact);
+
+/* PyCFunctionFastCall.proto */
+#if CYTHON_FAST_PYCCALL
+static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs);
+#else
+#define __Pyx_PyCFunction_FastCall(func, args, nargs) (assert(0), NULL)
+#endif
+
+/* PyFunctionFastCall.proto */
+#if CYTHON_FAST_PYCALL
+#define __Pyx_PyFunction_FastCall(func, args, nargs)\
+ __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL)
+#if 1 || PY_VERSION_HEX < 0x030600B1
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, Py_ssize_t nargs, PyObject *kwargs);
+#else
+#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs)
+#endif
+#define __Pyx_BUILD_ASSERT_EXPR(cond)\
+ (sizeof(char [1 - 2*!(cond)]) - 1)
+#ifndef Py_MEMBER_SIZE
+#define Py_MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
+#endif
+#if CYTHON_FAST_PYCALL
+ static size_t __pyx_pyframe_localsplus_offset = 0;
+ #include "frameobject.h"
+#if PY_VERSION_HEX >= 0x030b00a6
+ #ifndef Py_BUILD_CORE
+ #define Py_BUILD_CORE 1
+ #endif
+ #include "internal/pycore_frame.h"
+#endif
+ #define __Pxy_PyFrame_Initialize_Offsets()\
+ ((void)__Pyx_BUILD_ASSERT_EXPR(sizeof(PyFrameObject) == offsetof(PyFrameObject, f_localsplus) + Py_MEMBER_SIZE(PyFrameObject, f_localsplus)),\
+ (void)(__pyx_pyframe_localsplus_offset = ((size_t)PyFrame_Type.tp_basicsize) - Py_MEMBER_SIZE(PyFrameObject, f_localsplus)))
+ #define __Pyx_PyFrame_GetLocalsplus(frame)\
+ (assert(__pyx_pyframe_localsplus_offset), (PyObject **)(((char *)(frame)) + __pyx_pyframe_localsplus_offset))
+#endif // CYTHON_FAST_PYCALL
+#endif
+
+/* PyObjectCall2Args.proto */
+static CYTHON_UNUSED PyObject* __Pyx_PyObject_Call2Args(PyObject* function, PyObject* arg1, PyObject* arg2);
+
+/* PyObjectCallMethO.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg);
+#endif
+
+/* PyObjectCallOneArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+/* IncludeStringH.proto */
+#include <string.h>
+
+/* BytesEquals.proto */
+static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals);
+
+/* UnicodeEquals.proto */
+static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals);
+
+/* StrEquals.proto */
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyString_Equals __Pyx_PyUnicode_Equals
+#else
+#define __Pyx_PyString_Equals __Pyx_PyBytes_Equals
+#endif
+
+/* DivInt[Py_ssize_t].proto */
+static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t, Py_ssize_t);
+
+/* UnaryNegOverflows.proto */
+#define UNARY_NEG_WOULD_OVERFLOW(x)\
+ (((x) < 0) & ((unsigned long)(x) == 0-(unsigned long)(x)))
+
+static CYTHON_UNUSED int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *); /*proto*/
+/* GetAttr.proto */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *, PyObject *);
+
+/* GetItemInt.proto */
+#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+ (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+ __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\
+ (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\
+ __Pyx_GetItemInt_Generic(o, to_py_func(i))))
+#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+ (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+ __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+ (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+ int wraparound, int boundscheck);
+#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+ (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+ __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+ (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+ int wraparound, int boundscheck);
+static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j);
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i,
+ int is_list, int wraparound, int boundscheck);
+
+/* ObjectGetItem.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key);
+#else
+#define __Pyx_PyObject_GetItem(obj, key) PyObject_GetItem(obj, key)
+#endif
+
+/* decode_c_string_utf16.proto */
+static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16(const char *s, Py_ssize_t size, const char *errors) {
+ int byteorder = 0;
+ return PyUnicode_DecodeUTF16(s, size, errors, &byteorder);
+}
+static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16LE(const char *s, Py_ssize_t size, const char *errors) {
+ int byteorder = -1;
+ return PyUnicode_DecodeUTF16(s, size, errors, &byteorder);
+}
+static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16BE(const char *s, Py_ssize_t size, const char *errors) {
+ int byteorder = 1;
+ return PyUnicode_DecodeUTF16(s, size, errors, &byteorder);
+}
+
+/* decode_c_string.proto */
+static CYTHON_INLINE PyObject* __Pyx_decode_c_string(
+ const char* cstring, Py_ssize_t start, Py_ssize_t stop,
+ const char* encoding, const char* errors,
+ PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors));
+
+/* GetAttr3.proto */
+static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *, PyObject *, PyObject *);
+
+/* PyDictVersioning.proto */
+#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS
+#define __PYX_DICT_VERSION_INIT ((PY_UINT64_T) -1)
+#define __PYX_GET_DICT_VERSION(dict) (((PyDictObject*)(dict))->ma_version_tag)
+#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\
+ (version_var) = __PYX_GET_DICT_VERSION(dict);\
+ (cache_var) = (value);
+#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\
+ static PY_UINT64_T __pyx_dict_version = 0;\
+ static PyObject *__pyx_dict_cached_value = NULL;\
+ if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\
+ (VAR) = __pyx_dict_cached_value;\
+ } else {\
+ (VAR) = __pyx_dict_cached_value = (LOOKUP);\
+ __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\
+ }\
+}
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj);
+static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj);
+static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version);
+#else
+#define __PYX_GET_DICT_VERSION(dict) (0)
+#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)
+#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) (VAR) = (LOOKUP);
+#endif
+
+/* GetModuleGlobalName.proto */
+#if CYTHON_USE_DICT_VERSIONS
+#define __Pyx_GetModuleGlobalName(var, name) do {\
+ static PY_UINT64_T __pyx_dict_version = 0;\
+ static PyObject *__pyx_dict_cached_value = NULL;\
+ (var) = (likely(__pyx_dict_version == __PYX_GET_DICT_VERSION(__pyx_d))) ?\
+ (likely(__pyx_dict_cached_value) ? __Pyx_NewRef(__pyx_dict_cached_value) : __Pyx_GetBuiltinName(name)) :\
+ __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\
+} while(0)
+#define __Pyx_GetModuleGlobalNameUncached(var, name) do {\
+ PY_UINT64_T __pyx_dict_version;\
+ PyObject *__pyx_dict_cached_value;\
+ (var) = __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\
+} while(0)
+static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value);
+#else
+#define __Pyx_GetModuleGlobalName(var, name) (var) = __Pyx__GetModuleGlobalName(name)
+#define __Pyx_GetModuleGlobalNameUncached(var, name) (var) = __Pyx__GetModuleGlobalName(name)
+static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name);
+#endif
+
+/* RaiseTooManyValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected);
+
+/* RaiseNeedMoreValuesToUnpack.proto */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index);
+
+/* RaiseNoneIterError.proto */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void);
+
+/* ExtTypeTest.proto */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type);
+
+/* SwapException.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_ExceptionSwap(type, value, tb) __Pyx__ExceptionSwap(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#else
+static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, PyObject **tb);
+#endif
+
+/* Import.proto */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
+
+/* FastTypeChecks.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type)
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2);
+#else
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type)
+#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2))
+#endif
+#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception)
+
+static CYTHON_UNUSED int __pyx_memoryview_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+/* ListCompAppend.proto */
+#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS
+static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) {
+ PyListObject* L = (PyListObject*) list;
+ Py_ssize_t len = Py_SIZE(list);
+ if (likely(L->allocated > len)) {
+ Py_INCREF(x);
+ PyList_SET_ITEM(list, len, x);
+ __Pyx_SET_SIZE(list, len + 1);
+ return 0;
+ }
+ return PyList_Append(list, x);
+}
+#else
+#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x)
+#endif
+
+/* PyIntBinop.proto */
+#if !CYTHON_COMPILING_IN_PYPY
+static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, long intval, int inplace, int zerodivision_check);
+#else
+#define __Pyx_PyInt_AddObjC(op1, op2, intval, inplace, zerodivision_check)\
+ (inplace ? PyNumber_InPlaceAdd(op1, op2) : PyNumber_Add(op1, op2))
+#endif
+
+/* ListExtend.proto */
+static CYTHON_INLINE int __Pyx_PyList_Extend(PyObject* L, PyObject* v) {
+#if CYTHON_COMPILING_IN_CPYTHON
+ PyObject* none = _PyList_Extend((PyListObject*)L, v);
+ if (unlikely(!none))
+ return -1;
+ Py_DECREF(none);
+ return 0;
+#else
+ return PyList_SetSlice(L, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, v);
+#endif
+}
+
+/* ListAppend.proto */
+#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS
+static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) {
+ PyListObject* L = (PyListObject*) list;
+ Py_ssize_t len = Py_SIZE(list);
+ if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) {
+ Py_INCREF(x);
+ PyList_SET_ITEM(list, len, x);
+ __Pyx_SET_SIZE(list, len + 1);
+ return 0;
+ }
+ return PyList_Append(list, x);
+}
+#else
+#define __Pyx_PyList_Append(L,x) PyList_Append(L,x)
+#endif
+
+/* AssertionsEnabled.proto */
+#define __Pyx_init_assertions_enabled()
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag)
+ #define __pyx_assertions_enabled() (1)
+#elif PY_VERSION_HEX < 0x03080000 || CYTHON_COMPILING_IN_PYPY || defined(Py_LIMITED_API)
+ #define __pyx_assertions_enabled() (!Py_OptimizeFlag)
+#elif CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030900A6
+ static int __pyx_assertions_enabled_flag;
+ #define __pyx_assertions_enabled() (__pyx_assertions_enabled_flag)
+ #undef __Pyx_init_assertions_enabled
+ static void __Pyx_init_assertions_enabled(void) {
+ __pyx_assertions_enabled_flag = ! _PyInterpreterState_GetConfig(__Pyx_PyThreadState_Current->interp)->optimization_level;
+ }
+#else
+ #define __pyx_assertions_enabled() (!Py_OptimizeFlag)
+#endif
+
+/* DivInt[long].proto */
+static CYTHON_INLINE long __Pyx_div_long(long, long);
+
+/* PySequenceContains.proto */
+static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyObject* seq, int eq) {
+ int result = PySequence_Contains(seq, item);
+ return unlikely(result < 0) ? result : (result == (eq == Py_EQ));
+}
+
+/* ImportFrom.proto */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
+
+/* HasAttr.proto */
+static CYTHON_INLINE int __Pyx_HasAttr(PyObject *, PyObject *);
+
+/* PyObject_GenericGetAttrNoDict.proto */
+#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GenericGetAttrNoDict PyObject_GenericGetAttr
+#endif
+
+/* PyObject_GenericGetAttr.proto */
+#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000
+static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GenericGetAttr PyObject_GenericGetAttr
+#endif
+
+/* SetVTable.proto */
+static int __Pyx_SetVtable(PyObject *dict, void *vtable);
+
+/* PyObjectGetAttrStrNoError.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject* obj, PyObject* attr_name);
+
+/* SetupReduce.proto */
+static int __Pyx_setup_reduce(PyObject* type_obj);
+
+/* TypeImport.proto */
+#ifndef __PYX_HAVE_RT_ImportType_proto_0_29_37
+#define __PYX_HAVE_RT_ImportType_proto_0_29_37
+#if __STDC_VERSION__ >= 201112L
+#include <stdalign.h>
+#endif
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#define __PYX_GET_STRUCT_ALIGNMENT_0_29_37(s) alignof(s)
+#else
+#define __PYX_GET_STRUCT_ALIGNMENT_0_29_37(s) sizeof(void*)
+#endif
+enum __Pyx_ImportType_CheckSize_0_29_37 {
+ __Pyx_ImportType_CheckSize_Error_0_29_37 = 0,
+ __Pyx_ImportType_CheckSize_Warn_0_29_37 = 1,
+ __Pyx_ImportType_CheckSize_Ignore_0_29_37 = 2
+};
+static PyTypeObject *__Pyx_ImportType_0_29_37(PyObject* module, const char *module_name, const char *class_name, size_t size, size_t alignment, enum __Pyx_ImportType_CheckSize_0_29_37 check_size);
+#endif
+
+/* CLineInTraceback.proto */
+#ifdef CYTHON_CLINE_IN_TRACEBACK
+#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0)
+#else
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line);
+#endif
+
+/* CodeObjectCache.proto */
+typedef struct {
+ PyCodeObject* code_object;
+ int code_line;
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache {
+ int count;
+ int max_count;
+ __Pyx_CodeObjectCacheEntry* entries;
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+
+/* AddTraceback.proto */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+ int py_line, const char *filename);
+
+#if PY_MAJOR_VERSION < 3
+ static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags);
+ static void __Pyx_ReleaseBuffer(Py_buffer *view);
+#else
+ #define __Pyx_GetBuffer PyObject_GetBuffer
+ #define __Pyx_ReleaseBuffer PyBuffer_Release
+#endif
+
+
+/* BufferStructDeclare.proto */
+typedef struct {
+ Py_ssize_t shape, strides, suboffsets;
+} __Pyx_Buf_DimInfo;
+typedef struct {
+ size_t refcount;
+ Py_buffer pybuffer;
+} __Pyx_Buffer;
+typedef struct {
+ __Pyx_Buffer *rcbuffer;
+ char *data;
+ __Pyx_Buf_DimInfo diminfo[8];
+} __Pyx_LocalBuf_ND;
+
+/* MemviewSliceIsContig.proto */
+static int __pyx_memviewslice_is_contig(const __Pyx_memviewslice mvs, char order, int ndim);
+
+/* OverlappingSlices.proto */
+static int __pyx_slices_overlap(__Pyx_memviewslice *slice1,
+ __Pyx_memviewslice *slice2,
+ int ndim, size_t itemsize);
+
+/* Capsule.proto */
+static CYTHON_INLINE PyObject *__pyx_capsule_create(void *p, const char *sig);
+
+/* IsLittleEndian.proto */
+static CYTHON_INLINE int __Pyx_Is_Little_Endian(void);
+
+/* BufferFormatCheck.proto */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts);
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+ __Pyx_BufFmt_StackElem* stack,
+ __Pyx_TypeInfo* type);
+
+/* TypeInfoCompare.proto */
+static int __pyx_typeinfo_cmp(__Pyx_TypeInfo *a, __Pyx_TypeInfo *b);
+
+/* MemviewSliceValidateAndInit.proto */
+static int __Pyx_ValidateAndInit_memviewslice(
+ int *axes_specs,
+ int c_or_f_flag,
+ int buf_flags,
+ int ndim,
+ __Pyx_TypeInfo *dtype,
+ __Pyx_BufFmt_StackElem stack[],
+ __Pyx_memviewslice *memviewslice,
+ PyObject *original_obj);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_d_d_dc_int(PyObject *, int writable_flag);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_d_d_dc_float(PyObject *, int writable_flag);
+
+/* ObjectToMemviewSlice.proto */
+static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlice_dc_int(PyObject *, int writable_flag);
+
+/* GCCDiagnostics.proto */
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+#define __Pyx_HAS_GCC_DIAGNOSTIC
+#endif
+
+/* RealImag.proto */
+#if CYTHON_CCOMPLEX
+ #ifdef __cplusplus
+ #define __Pyx_CREAL(z) ((z).real())
+ #define __Pyx_CIMAG(z) ((z).imag())
+ #else
+ #define __Pyx_CREAL(z) (__real__(z))
+ #define __Pyx_CIMAG(z) (__imag__(z))
+ #endif
+#else
+ #define __Pyx_CREAL(z) ((z).real)
+ #define __Pyx_CIMAG(z) ((z).imag)
+#endif
+#if defined(__cplusplus) && CYTHON_CCOMPLEX\
+ && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103)
+ #define __Pyx_SET_CREAL(z,x) ((z).real(x))
+ #define __Pyx_SET_CIMAG(z,y) ((z).imag(y))
+#else
+ #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x)
+ #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y)
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX
+ #define __Pyx_c_eq_float(a, b) ((a)==(b))
+ #define __Pyx_c_sum_float(a, b) ((a)+(b))
+ #define __Pyx_c_diff_float(a, b) ((a)-(b))
+ #define __Pyx_c_prod_float(a, b) ((a)*(b))
+ #define __Pyx_c_quot_float(a, b) ((a)/(b))
+ #define __Pyx_c_neg_float(a) (-(a))
+ #ifdef __cplusplus
+ #define __Pyx_c_is_zero_float(z) ((z)==(float)0)
+ #define __Pyx_c_conj_float(z) (::std::conj(z))
+ #if 1
+ #define __Pyx_c_abs_float(z) (::std::abs(z))
+ #define __Pyx_c_pow_float(a, b) (::std::pow(a, b))
+ #endif
+ #else
+ #define __Pyx_c_is_zero_float(z) ((z)==0)
+ #define __Pyx_c_conj_float(z) (conjf(z))
+ #if 1
+ #define __Pyx_c_abs_float(z) (cabsf(z))
+ #define __Pyx_c_pow_float(a, b) (cpowf(a, b))
+ #endif
+ #endif
+#else
+ static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex);
+ static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex);
+ #if 1
+ static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex);
+ static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex);
+ #endif
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX
+ #define __Pyx_c_eq_double(a, b) ((a)==(b))
+ #define __Pyx_c_sum_double(a, b) ((a)+(b))
+ #define __Pyx_c_diff_double(a, b) ((a)-(b))
+ #define __Pyx_c_prod_double(a, b) ((a)*(b))
+ #define __Pyx_c_quot_double(a, b) ((a)/(b))
+ #define __Pyx_c_neg_double(a) (-(a))
+ #ifdef __cplusplus
+ #define __Pyx_c_is_zero_double(z) ((z)==(double)0)
+ #define __Pyx_c_conj_double(z) (::std::conj(z))
+ #if 1
+ #define __Pyx_c_abs_double(z) (::std::abs(z))
+ #define __Pyx_c_pow_double(a, b) (::std::pow(a, b))
+ #endif
+ #else
+ #define __Pyx_c_is_zero_double(z) ((z)==0)
+ #define __Pyx_c_conj_double(z) (conj(z))
+ #if 1
+ #define __Pyx_c_abs_double(z) (cabs(z))
+ #define __Pyx_c_pow_double(a, b) (cpow(a, b))
+ #endif
+ #endif
+#else
+ static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex);
+ static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex);
+ #if 1
+ static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex);
+ static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex);
+ #endif
+#endif
+
+/* Arithmetic.proto */
+#if CYTHON_CCOMPLEX
+ #define __Pyx_c_eq_long__double(a, b) ((a)==(b))
+ #define __Pyx_c_sum_long__double(a, b) ((a)+(b))
+ #define __Pyx_c_diff_long__double(a, b) ((a)-(b))
+ #define __Pyx_c_prod_long__double(a, b) ((a)*(b))
+ #define __Pyx_c_quot_long__double(a, b) ((a)/(b))
+ #define __Pyx_c_neg_long__double(a) (-(a))
+ #ifdef __cplusplus
+ #define __Pyx_c_is_zero_long__double(z) ((z)==(long double)0)
+ #define __Pyx_c_conj_long__double(z) (::std::conj(z))
+ #if 1
+ #define __Pyx_c_abs_long__double(z) (::std::abs(z))
+ #define __Pyx_c_pow_long__double(a, b) (::std::pow(a, b))
+ #endif
+ #else
+ #define __Pyx_c_is_zero_long__double(z) ((z)==0)
+ #define __Pyx_c_conj_long__double(z) (conjl(z))
+ #if 1
+ #define __Pyx_c_abs_long__double(z) (cabsl(z))
+ #define __Pyx_c_pow_long__double(a, b) (cpowl(a, b))
+ #endif
+ #endif
+#else
+ static CYTHON_INLINE int __Pyx_c_eq_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_sum_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_diff_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_prod_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_quot_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_neg_long__double(__pyx_t_long_double_complex);
+ static CYTHON_INLINE int __Pyx_c_is_zero_long__double(__pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_conj_long__double(__pyx_t_long_double_complex);
+ #if 1
+ static CYTHON_INLINE long double __Pyx_c_abs_long__double(__pyx_t_long_double_complex);
+ static CYTHON_INLINE __pyx_t_long_double_complex __Pyx_c_pow_long__double(__pyx_t_long_double_complex, __pyx_t_long_double_complex);
+ #endif
+#endif
+
+/* MemviewSliceCopyTemplate.proto */
+static __Pyx_memviewslice
+__pyx_memoryview_copy_new_contig(const __Pyx_memviewslice *from_mvs,
+ const char *mode, int ndim,
+ size_t sizeof_dtype, int contig_flag,
+ int dtype_is_object);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE char __Pyx_PyInt_As_char(PyObject *);
+
+/* CheckBinaryVersion.proto */
+static int __Pyx_check_binary_version(void);
+
+/* InitStrings.proto */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t);
+
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v_self); /* proto*/
+static char *__pyx_memoryview_get_item_pointer(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index); /* proto*/
+static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj); /* proto*/
+static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_dst, PyObject *__pyx_v_src); /* proto*/
+static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __pyx_memoryview_obj *__pyx_v_self, struct __pyx_memoryview_obj *__pyx_v_dst, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp); /* proto*/
+static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_memoryview_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value); /* proto*/
+static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp); /* proto*/
+static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __pyx_memoryviewslice_obj *__pyx_v_self, char *__pyx_v_itemp, PyObject *__pyx_v_value); /* proto*/
+
+/* Module declarations from 'cython.view' */
+
+/* Module declarations from 'cython' */
+
+/* Module declarations from 'cpython.buffer' */
+
+/* Module declarations from 'libc.string' */
+
+/* Module declarations from 'libc.stdio' */
+
+/* Module declarations from '__builtin__' */
+
+/* Module declarations from 'cpython.type' */
+static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0;
+
+/* Module declarations from 'cpython' */
+
+/* Module declarations from 'cpython.object' */
+
+/* Module declarations from 'cpython.ref' */
+
+/* Module declarations from 'cpython.mem' */
+
+/* Module declarations from 'numpy' */
+
+/* Module declarations from 'numpy' */
+static PyTypeObject *__pyx_ptype_5numpy_dtype = 0;
+static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0;
+static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0;
+static PyTypeObject *__pyx_ptype_5numpy_generic = 0;
+static PyTypeObject *__pyx_ptype_5numpy_number = 0;
+static PyTypeObject *__pyx_ptype_5numpy_integer = 0;
+static PyTypeObject *__pyx_ptype_5numpy_signedinteger = 0;
+static PyTypeObject *__pyx_ptype_5numpy_unsignedinteger = 0;
+static PyTypeObject *__pyx_ptype_5numpy_inexact = 0;
+static PyTypeObject *__pyx_ptype_5numpy_floating = 0;
+static PyTypeObject *__pyx_ptype_5numpy_complexfloating = 0;
+static PyTypeObject *__pyx_ptype_5numpy_flexible = 0;
+static PyTypeObject *__pyx_ptype_5numpy_character = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0;
+
+/* Module declarations from 'TTS.tts.utils.monotonic_align.core' */
+static PyTypeObject *__pyx_array_type = 0;
+static PyTypeObject *__pyx_MemviewEnum_type = 0;
+static PyTypeObject *__pyx_memoryview_type = 0;
+static PyTypeObject *__pyx_memoryviewslice_type = 0;
+static PyObject *generic = 0;
+static PyObject *strided = 0;
+static PyObject *indirect = 0;
+static PyObject *contiguous = 0;
+static PyObject *indirect_contiguous = 0;
+static int __pyx_memoryview_thread_locks_used;
+static PyThread_type_lock __pyx_memoryview_thread_locks[8];
+static void __pyx_f_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_each(__Pyx_memviewslice, __Pyx_memviewslice, int, int, float); /*proto*/
+static void __pyx_f_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c(__Pyx_memviewslice, __Pyx_memviewslice, __Pyx_memviewslice, __Pyx_memviewslice, int __pyx_skip_dispatch, struct __pyx_opt_args_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c *__pyx_optional_args); /*proto*/
+static struct __pyx_array_obj *__pyx_array_new(PyObject *, Py_ssize_t, char *, char *, char *); /*proto*/
+static void *__pyx_align_pointer(void *, size_t); /*proto*/
+static PyObject *__pyx_memoryview_new(PyObject *, int, int, __Pyx_TypeInfo *); /*proto*/
+static CYTHON_INLINE int __pyx_memoryview_check(PyObject *); /*proto*/
+static PyObject *_unellipsify(PyObject *, int); /*proto*/
+static PyObject *assert_direct_dimensions(Py_ssize_t *, int); /*proto*/
+static struct __pyx_memoryview_obj *__pyx_memview_slice(struct __pyx_memoryview_obj *, PyObject *); /*proto*/
+static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *, Py_ssize_t, Py_ssize_t, Py_ssize_t, int, int, int *, Py_ssize_t, Py_ssize_t, Py_ssize_t, int, int, int, int); /*proto*/
+static char *__pyx_pybuffer_index(Py_buffer *, char *, Py_ssize_t, Py_ssize_t); /*proto*/
+static int __pyx_memslice_transpose(__Pyx_memviewslice *); /*proto*/
+static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice, int, PyObject *(*)(char *), int (*)(char *, PyObject *), int); /*proto*/
+static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_obj *); /*proto*/
+static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_memoryview_obj *, __Pyx_memviewslice *); /*proto*/
+static Py_ssize_t abs_py_ssize_t(Py_ssize_t); /*proto*/
+static char __pyx_get_best_slice_order(__Pyx_memviewslice *, int); /*proto*/
+static void _copy_strided_to_strided(char *, Py_ssize_t *, char *, Py_ssize_t *, Py_ssize_t *, Py_ssize_t *, int, size_t); /*proto*/
+static void copy_strided_to_strided(__Pyx_memviewslice *, __Pyx_memviewslice *, int, size_t); /*proto*/
+static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *, int); /*proto*/
+static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *, Py_ssize_t *, Py_ssize_t, int, char); /*proto*/
+static void *__pyx_memoryview_copy_data_to_temp(__Pyx_memviewslice *, __Pyx_memviewslice *, char, int); /*proto*/
+static int __pyx_memoryview_err_extents(int, Py_ssize_t, Py_ssize_t); /*proto*/
+static int __pyx_memoryview_err_dim(PyObject *, char *, int); /*proto*/
+static int __pyx_memoryview_err(PyObject *, char *); /*proto*/
+static int __pyx_memoryview_copy_contents(__Pyx_memviewslice, __Pyx_memviewslice, int, int, int); /*proto*/
+static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *, int, int); /*proto*/
+static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *, int, int, int); /*proto*/
+static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/
+static void __pyx_memoryview_refcount_objects_in_slice(char *, Py_ssize_t *, Py_ssize_t *, int, int); /*proto*/
+static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *, int, size_t, void *, int); /*proto*/
+static void __pyx_memoryview__slice_assign_scalar(char *, Py_ssize_t *, Py_ssize_t *, int, size_t, void *); /*proto*/
+static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum_obj *, PyObject *); /*proto*/
+static __Pyx_TypeInfo __Pyx_TypeInfo_int = { "int", NULL, sizeof(int), { 0 }, 0, IS_UNSIGNED(int) ? 'U' : 'I', IS_UNSIGNED(int), 0 };
+static __Pyx_TypeInfo __Pyx_TypeInfo_float = { "float", NULL, sizeof(float), { 0 }, 0, 'R', 0, 0 };
+#define __Pyx_MODULE_NAME "TTS.tts.utils.monotonic_align.core"
+extern int __pyx_module_is_main_TTS__tts__utils__monotonic_align__core;
+int __pyx_module_is_main_TTS__tts__utils__monotonic_align__core = 0;
+
+/* Implementation of 'TTS.tts.utils.monotonic_align.core' */
+static PyObject *__pyx_builtin_range;
+static PyObject *__pyx_builtin_ImportError;
+static PyObject *__pyx_builtin_ValueError;
+static PyObject *__pyx_builtin_MemoryError;
+static PyObject *__pyx_builtin_enumerate;
+static PyObject *__pyx_builtin_TypeError;
+static PyObject *__pyx_builtin_Ellipsis;
+static PyObject *__pyx_builtin_id;
+static PyObject *__pyx_builtin_IndexError;
+static const char __pyx_k_O[] = "O";
+static const char __pyx_k_c[] = "c";
+static const char __pyx_k_id[] = "id";
+static const char __pyx_k_np[] = "np";
+static const char __pyx_k_new[] = "__new__";
+static const char __pyx_k_obj[] = "obj";
+static const char __pyx_k_base[] = "base";
+static const char __pyx_k_dict[] = "__dict__";
+static const char __pyx_k_main[] = "__main__";
+static const char __pyx_k_mode[] = "mode";
+static const char __pyx_k_name[] = "name";
+static const char __pyx_k_ndim[] = "ndim";
+static const char __pyx_k_pack[] = "pack";
+static const char __pyx_k_size[] = "size";
+static const char __pyx_k_step[] = "step";
+static const char __pyx_k_stop[] = "stop";
+static const char __pyx_k_t_xs[] = "t_xs";
+static const char __pyx_k_t_ys[] = "t_ys";
+static const char __pyx_k_test[] = "__test__";
+static const char __pyx_k_ASCII[] = "ASCII";
+static const char __pyx_k_class[] = "__class__";
+static const char __pyx_k_error[] = "error";
+static const char __pyx_k_flags[] = "flags";
+static const char __pyx_k_numpy[] = "numpy";
+static const char __pyx_k_paths[] = "paths";
+static const char __pyx_k_range[] = "range";
+static const char __pyx_k_shape[] = "shape";
+static const char __pyx_k_start[] = "start";
+static const char __pyx_k_encode[] = "encode";
+static const char __pyx_k_format[] = "format";
+static const char __pyx_k_import[] = "__import__";
+static const char __pyx_k_name_2[] = "__name__";
+static const char __pyx_k_pickle[] = "pickle";
+static const char __pyx_k_reduce[] = "__reduce__";
+static const char __pyx_k_struct[] = "struct";
+static const char __pyx_k_unpack[] = "unpack";
+static const char __pyx_k_update[] = "update";
+static const char __pyx_k_values[] = "values";
+static const char __pyx_k_fortran[] = "fortran";
+static const char __pyx_k_memview[] = "memview";
+static const char __pyx_k_Ellipsis[] = "Ellipsis";
+static const char __pyx_k_getstate[] = "__getstate__";
+static const char __pyx_k_itemsize[] = "itemsize";
+static const char __pyx_k_pyx_type[] = "__pyx_type";
+static const char __pyx_k_setstate[] = "__setstate__";
+static const char __pyx_k_subarray[] = "subarray";
+static const char __pyx_k_TypeError[] = "TypeError";
+static const char __pyx_k_enumerate[] = "enumerate";
+static const char __pyx_k_pyx_state[] = "__pyx_state";
+static const char __pyx_k_reduce_ex[] = "__reduce_ex__";
+static const char __pyx_k_IndexError[] = "IndexError";
+static const char __pyx_k_ValueError[] = "ValueError";
+static const char __pyx_k_pyx_result[] = "__pyx_result";
+static const char __pyx_k_pyx_vtable[] = "__pyx_vtable__";
+static const char __pyx_k_ImportError[] = "ImportError";
+static const char __pyx_k_MemoryError[] = "MemoryError";
+static const char __pyx_k_PickleError[] = "PickleError";
+static const char __pyx_k_max_neg_val[] = "max_neg_val";
+static const char __pyx_k_pyx_checksum[] = "__pyx_checksum";
+static const char __pyx_k_stringsource[] = "stringsource";
+static const char __pyx_k_pyx_getbuffer[] = "__pyx_getbuffer";
+static const char __pyx_k_reduce_cython[] = "__reduce_cython__";
+static const char __pyx_k_View_MemoryView[] = "View.MemoryView";
+static const char __pyx_k_allocate_buffer[] = "allocate_buffer";
+static const char __pyx_k_dtype_is_object[] = "dtype_is_object";
+static const char __pyx_k_pyx_PickleError[] = "__pyx_PickleError";
+static const char __pyx_k_setstate_cython[] = "__setstate_cython__";
+static const char __pyx_k_pyx_unpickle_Enum[] = "__pyx_unpickle_Enum";
+static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback";
+static const char __pyx_k_strided_and_direct[] = "<strided and direct>";
+static const char __pyx_k_strided_and_indirect[] = "<strided and indirect>";
+static const char __pyx_k_contiguous_and_direct[] = "<contiguous and direct>";
+static const char __pyx_k_MemoryView_of_r_object[] = "<MemoryView of %r object>";
+static const char __pyx_k_MemoryView_of_r_at_0x_x[] = "<MemoryView of %r at 0x%x>";
+static const char __pyx_k_contiguous_and_indirect[] = "<contiguous and indirect>";
+static const char __pyx_k_Cannot_index_with_type_s[] = "Cannot index with type '%s'";
+static const char __pyx_k_Invalid_shape_in_axis_d_d[] = "Invalid shape in axis %d: %d.";
+static const char __pyx_k_itemsize_0_for_cython_array[] = "itemsize <= 0 for cython.array";
+static const char __pyx_k_unable_to_allocate_array_data[] = "unable to allocate array data.";
+static const char __pyx_k_strided_and_direct_or_indirect[] = "<strided and direct or indirect>";
+static const char __pyx_k_Buffer_view_does_not_expose_stri[] = "Buffer view does not expose strides";
+static const char __pyx_k_Can_only_create_a_buffer_that_is[] = "Can only create a buffer that is contiguous in memory.";
+static const char __pyx_k_Cannot_assign_to_read_only_memor[] = "Cannot assign to read-only memoryview";
+static const char __pyx_k_Cannot_create_writable_memory_vi[] = "Cannot create writable memory view from read-only memoryview";
+static const char __pyx_k_Empty_shape_tuple_for_cython_arr[] = "Empty shape tuple for cython.array";
+static const char __pyx_k_Incompatible_checksums_0x_x_vs_0[] = "Incompatible checksums (0x%x vs (0xb068931, 0x82a3537, 0x6ae9995) = (name))";
+static const char __pyx_k_Indirect_dimensions_not_supporte[] = "Indirect dimensions not supported";
+static const char __pyx_k_Invalid_mode_expected_c_or_fortr[] = "Invalid mode, expected 'c' or 'fortran', got %s";
+static const char __pyx_k_Out_of_bounds_on_buffer_access_a[] = "Out of bounds on buffer access (axis %d)";
+static const char __pyx_k_Unable_to_convert_item_to_object[] = "Unable to convert item to object";
+static const char __pyx_k_got_differing_extents_in_dimensi[] = "got differing extents in dimension %d (got %d and %d)";
+static const char __pyx_k_no_default___reduce___due_to_non[] = "no default __reduce__ due to non-trivial __cinit__";
+static const char __pyx_k_numpy__core_multiarray_failed_to[] = "numpy._core.multiarray failed to import";
+static const char __pyx_k_numpy__core_umath_failed_to_impo[] = "numpy._core.umath failed to import";
+static const char __pyx_k_unable_to_allocate_shape_and_str[] = "unable to allocate shape and strides.";
+static PyObject *__pyx_n_s_ASCII;
+static PyObject *__pyx_kp_s_Buffer_view_does_not_expose_stri;
+static PyObject *__pyx_kp_s_Can_only_create_a_buffer_that_is;
+static PyObject *__pyx_kp_s_Cannot_assign_to_read_only_memor;
+static PyObject *__pyx_kp_s_Cannot_create_writable_memory_vi;
+static PyObject *__pyx_kp_s_Cannot_index_with_type_s;
+static PyObject *__pyx_n_s_Ellipsis;
+static PyObject *__pyx_kp_s_Empty_shape_tuple_for_cython_arr;
+static PyObject *__pyx_n_s_ImportError;
+static PyObject *__pyx_kp_s_Incompatible_checksums_0x_x_vs_0;
+static PyObject *__pyx_n_s_IndexError;
+static PyObject *__pyx_kp_s_Indirect_dimensions_not_supporte;
+static PyObject *__pyx_kp_s_Invalid_mode_expected_c_or_fortr;
+static PyObject *__pyx_kp_s_Invalid_shape_in_axis_d_d;
+static PyObject *__pyx_n_s_MemoryError;
+static PyObject *__pyx_kp_s_MemoryView_of_r_at_0x_x;
+static PyObject *__pyx_kp_s_MemoryView_of_r_object;
+static PyObject *__pyx_n_b_O;
+static PyObject *__pyx_kp_s_Out_of_bounds_on_buffer_access_a;
+static PyObject *__pyx_n_s_PickleError;
+static PyObject *__pyx_n_s_TypeError;
+static PyObject *__pyx_kp_s_Unable_to_convert_item_to_object;
+static PyObject *__pyx_n_s_ValueError;
+static PyObject *__pyx_n_s_View_MemoryView;
+static PyObject *__pyx_n_s_allocate_buffer;
+static PyObject *__pyx_n_s_base;
+static PyObject *__pyx_n_s_c;
+static PyObject *__pyx_n_u_c;
+static PyObject *__pyx_n_s_class;
+static PyObject *__pyx_n_s_cline_in_traceback;
+static PyObject *__pyx_kp_s_contiguous_and_direct;
+static PyObject *__pyx_kp_s_contiguous_and_indirect;
+static PyObject *__pyx_n_s_dict;
+static PyObject *__pyx_n_s_dtype_is_object;
+static PyObject *__pyx_n_s_encode;
+static PyObject *__pyx_n_s_enumerate;
+static PyObject *__pyx_n_s_error;
+static PyObject *__pyx_n_s_flags;
+static PyObject *__pyx_n_s_format;
+static PyObject *__pyx_n_s_fortran;
+static PyObject *__pyx_n_u_fortran;
+static PyObject *__pyx_n_s_getstate;
+static PyObject *__pyx_kp_s_got_differing_extents_in_dimensi;
+static PyObject *__pyx_n_s_id;
+static PyObject *__pyx_n_s_import;
+static PyObject *__pyx_n_s_itemsize;
+static PyObject *__pyx_kp_s_itemsize_0_for_cython_array;
+static PyObject *__pyx_n_s_main;
+static PyObject *__pyx_n_s_max_neg_val;
+static PyObject *__pyx_n_s_memview;
+static PyObject *__pyx_n_s_mode;
+static PyObject *__pyx_n_s_name;
+static PyObject *__pyx_n_s_name_2;
+static PyObject *__pyx_n_s_ndim;
+static PyObject *__pyx_n_s_new;
+static PyObject *__pyx_kp_s_no_default___reduce___due_to_non;
+static PyObject *__pyx_n_s_np;
+static PyObject *__pyx_n_s_numpy;
+static PyObject *__pyx_kp_u_numpy__core_multiarray_failed_to;
+static PyObject *__pyx_kp_u_numpy__core_umath_failed_to_impo;
+static PyObject *__pyx_n_s_obj;
+static PyObject *__pyx_n_s_pack;
+static PyObject *__pyx_n_s_paths;
+static PyObject *__pyx_n_s_pickle;
+static PyObject *__pyx_n_s_pyx_PickleError;
+static PyObject *__pyx_n_s_pyx_checksum;
+static PyObject *__pyx_n_s_pyx_getbuffer;
+static PyObject *__pyx_n_s_pyx_result;
+static PyObject *__pyx_n_s_pyx_state;
+static PyObject *__pyx_n_s_pyx_type;
+static PyObject *__pyx_n_s_pyx_unpickle_Enum;
+static PyObject *__pyx_n_s_pyx_vtable;
+static PyObject *__pyx_n_s_range;
+static PyObject *__pyx_n_s_reduce;
+static PyObject *__pyx_n_s_reduce_cython;
+static PyObject *__pyx_n_s_reduce_ex;
+static PyObject *__pyx_n_s_setstate;
+static PyObject *__pyx_n_s_setstate_cython;
+static PyObject *__pyx_n_s_shape;
+static PyObject *__pyx_n_s_size;
+static PyObject *__pyx_n_s_start;
+static PyObject *__pyx_n_s_step;
+static PyObject *__pyx_n_s_stop;
+static PyObject *__pyx_kp_s_strided_and_direct;
+static PyObject *__pyx_kp_s_strided_and_direct_or_indirect;
+static PyObject *__pyx_kp_s_strided_and_indirect;
+static PyObject *__pyx_kp_s_stringsource;
+static PyObject *__pyx_n_s_struct;
+static PyObject *__pyx_n_s_subarray;
+static PyObject *__pyx_n_s_t_xs;
+static PyObject *__pyx_n_s_t_ys;
+static PyObject *__pyx_n_s_test;
+static PyObject *__pyx_kp_s_unable_to_allocate_array_data;
+static PyObject *__pyx_kp_s_unable_to_allocate_shape_and_str;
+static PyObject *__pyx_n_s_unpack;
+static PyObject *__pyx_n_s_update;
+static PyObject *__pyx_n_s_values;
+static PyObject *__pyx_pf_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_paths, __Pyx_memviewslice __pyx_v_values, __Pyx_memviewslice __pyx_v_t_xs, __Pyx_memviewslice __pyx_v_t_ys, float __pyx_v_max_neg_val); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(struct __pyx_array_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_attr); /* proto */
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item); /* proto */
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value); /* proto */
+static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v_name); /* proto */
+static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(struct __pyx_MemviewEnum_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v___pyx_state); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj, int __pyx_v_flags, int __pyx_v_dtype_is_object); /* proto */
+static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_4__getitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index, PyObject *__pyx_v_value); /* proto */
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__(struct __pyx_memoryview_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffsets___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_10__len__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_12__repr__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_14__str__(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_16is_c_contig(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_18is_f_contig(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_20copy(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_22copy_fortran(struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryview_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView_16_memoryviewslice_4base___get__(struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYTHON_UNUSED struct __pyx_memoryviewslice_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v___pyx_type, long __pyx_v___pyx_checksum, PyObject *__pyx_v___pyx_state); /* proto */
+static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/
+static PyObject *__pyx_int_0;
+static PyObject *__pyx_int_1;
+static PyObject *__pyx_int_112105877;
+static PyObject *__pyx_int_136983863;
+static PyObject *__pyx_int_184977713;
+static PyObject *__pyx_int_neg_1;
+static float __pyx_k_;
+static PyObject *__pyx_tuple__2;
+static PyObject *__pyx_tuple__3;
+static PyObject *__pyx_tuple__4;
+static PyObject *__pyx_tuple__5;
+static PyObject *__pyx_tuple__6;
+static PyObject *__pyx_tuple__7;
+static PyObject *__pyx_tuple__8;
+static PyObject *__pyx_tuple__9;
+static PyObject *__pyx_slice__18;
+static PyObject *__pyx_tuple__10;
+static PyObject *__pyx_tuple__11;
+static PyObject *__pyx_tuple__12;
+static PyObject *__pyx_tuple__13;
+static PyObject *__pyx_tuple__14;
+static PyObject *__pyx_tuple__15;
+static PyObject *__pyx_tuple__16;
+static PyObject *__pyx_tuple__17;
+static PyObject *__pyx_tuple__19;
+static PyObject *__pyx_tuple__20;
+static PyObject *__pyx_tuple__21;
+static PyObject *__pyx_tuple__22;
+static PyObject *__pyx_tuple__23;
+static PyObject *__pyx_tuple__24;
+static PyObject *__pyx_tuple__25;
+static PyObject *__pyx_tuple__26;
+static PyObject *__pyx_tuple__27;
+static PyObject *__pyx_tuple__28;
+static PyObject *__pyx_codeobj__29;
+/* Late includes */
+
+/* "TTS/tts/utils/monotonic_align/core.pyx":11
+ * @cython.boundscheck(False)
+ * @cython.wraparound(False)
+ * cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: # <<<<<<<<<<<<<<
+ * cdef int x
+ * cdef int y
+ */
+
+static void __pyx_f_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_each(__Pyx_memviewslice __pyx_v_path, __Pyx_memviewslice __pyx_v_value, int __pyx_v_t_x, int __pyx_v_t_y, float __pyx_v_max_neg_val) {
+ int __pyx_v_x;
+ int __pyx_v_y;
+ float __pyx_v_v_prev;
+ float __pyx_v_v_cur;
+ int __pyx_v_index;
+ int __pyx_t_1;
+ int __pyx_t_2;
+ int __pyx_t_3;
+ long __pyx_t_4;
+ int __pyx_t_5;
+ long __pyx_t_6;
+ long __pyx_t_7;
+ int __pyx_t_8;
+ Py_ssize_t __pyx_t_9;
+ Py_ssize_t __pyx_t_10;
+ float __pyx_t_11;
+ float __pyx_t_12;
+ float __pyx_t_13;
+ Py_ssize_t __pyx_t_14;
+ Py_ssize_t __pyx_t_15;
+ int __pyx_t_16;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":17
+ * cdef float v_cur
+ * cdef float tmp
+ * cdef int index = t_x - 1 # <<<<<<<<<<<<<<
+ *
+ * for y in range(t_y):
+ */
+ __pyx_v_index = (__pyx_v_t_x - 1);
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":19
+ * cdef int index = t_x - 1
+ *
+ * for y in range(t_y): # <<<<<<<<<<<<<<
+ * for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+ * if x == y:
+ */
+ __pyx_t_1 = __pyx_v_t_y;
+ __pyx_t_2 = __pyx_t_1;
+ for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {
+ __pyx_v_y = __pyx_t_3;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":20
+ *
+ * for y in range(t_y):
+ * for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): # <<<<<<<<<<<<<<
+ * if x == y:
+ * v_cur = max_neg_val
+ */
+ __pyx_t_4 = (__pyx_v_y + 1);
+ __pyx_t_5 = __pyx_v_t_x;
+ if (((__pyx_t_4 < __pyx_t_5) != 0)) {
+ __pyx_t_6 = __pyx_t_4;
+ } else {
+ __pyx_t_6 = __pyx_t_5;
+ }
+ __pyx_t_4 = __pyx_t_6;
+ __pyx_t_5 = ((__pyx_v_t_x + __pyx_v_y) - __pyx_v_t_y);
+ __pyx_t_6 = 0;
+ if (((__pyx_t_5 > __pyx_t_6) != 0)) {
+ __pyx_t_7 = __pyx_t_5;
+ } else {
+ __pyx_t_7 = __pyx_t_6;
+ }
+ __pyx_t_6 = __pyx_t_4;
+ for (__pyx_t_5 = __pyx_t_7; __pyx_t_5 < __pyx_t_6; __pyx_t_5+=1) {
+ __pyx_v_x = __pyx_t_5;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":21
+ * for y in range(t_y):
+ * for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+ * if x == y: # <<<<<<<<<<<<<<
+ * v_cur = max_neg_val
+ * else:
+ */
+ __pyx_t_8 = ((__pyx_v_x == __pyx_v_y) != 0);
+ if (__pyx_t_8) {
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":22
+ * for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+ * if x == y:
+ * v_cur = max_neg_val # <<<<<<<<<<<<<<
+ * else:
+ * v_cur = value[x, y-1]
+ */
+ __pyx_v_v_cur = __pyx_v_max_neg_val;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":21
+ * for y in range(t_y):
+ * for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+ * if x == y: # <<<<<<<<<<<<<<
+ * v_cur = max_neg_val
+ * else:
+ */
+ goto __pyx_L7;
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":24
+ * v_cur = max_neg_val
+ * else:
+ * v_cur = value[x, y-1] # <<<<<<<<<<<<<<
+ * if x == 0:
+ * if y == 0:
+ */
+ /*else*/ {
+ __pyx_t_9 = __pyx_v_x;
+ __pyx_t_10 = (__pyx_v_y - 1);
+ __pyx_v_v_cur = (*((float *) ( /* dim=1 */ ((char *) (((float *) ( /* dim=0 */ (__pyx_v_value.data + __pyx_t_9 * __pyx_v_value.strides[0]) )) + __pyx_t_10)) )));
+ }
+ __pyx_L7:;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":25
+ * else:
+ * v_cur = value[x, y-1]
+ * if x == 0: # <<<<<<<<<<<<<<
+ * if y == 0:
+ * v_prev = 0.
+ */
+ __pyx_t_8 = ((__pyx_v_x == 0) != 0);
+ if (__pyx_t_8) {
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":26
+ * v_cur = value[x, y-1]
+ * if x == 0:
+ * if y == 0: # <<<<<<<<<<<<<<
+ * v_prev = 0.
+ * else:
+ */
+ __pyx_t_8 = ((__pyx_v_y == 0) != 0);
+ if (__pyx_t_8) {
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":27
+ * if x == 0:
+ * if y == 0:
+ * v_prev = 0. # <<<<<<<<<<<<<<
+ * else:
+ * v_prev = max_neg_val
+ */
+ __pyx_v_v_prev = 0.;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":26
+ * v_cur = value[x, y-1]
+ * if x == 0:
+ * if y == 0: # <<<<<<<<<<<<<<
+ * v_prev = 0.
+ * else:
+ */
+ goto __pyx_L9;
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":29
+ * v_prev = 0.
+ * else:
+ * v_prev = max_neg_val # <<<<<<<<<<<<<<
+ * else:
+ * v_prev = value[x-1, y-1]
+ */
+ /*else*/ {
+ __pyx_v_v_prev = __pyx_v_max_neg_val;
+ }
+ __pyx_L9:;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":25
+ * else:
+ * v_cur = value[x, y-1]
+ * if x == 0: # <<<<<<<<<<<<<<
+ * if y == 0:
+ * v_prev = 0.
+ */
+ goto __pyx_L8;
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":31
+ * v_prev = max_neg_val
+ * else:
+ * v_prev = value[x-1, y-1] # <<<<<<<<<<<<<<
+ * value[x, y] = max(v_cur, v_prev) + value[x, y]
+ *
+ */
+ /*else*/ {
+ __pyx_t_10 = (__pyx_v_x - 1);
+ __pyx_t_9 = (__pyx_v_y - 1);
+ __pyx_v_v_prev = (*((float *) ( /* dim=1 */ ((char *) (((float *) ( /* dim=0 */ (__pyx_v_value.data + __pyx_t_10 * __pyx_v_value.strides[0]) )) + __pyx_t_9)) )));
+ }
+ __pyx_L8:;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":32
+ * else:
+ * v_prev = value[x-1, y-1]
+ * value[x, y] = max(v_cur, v_prev) + value[x, y] # <<<<<<<<<<<<<<
+ *
+ * for y in range(t_y - 1, -1, -1):
+ */
+ __pyx_t_11 = __pyx_v_v_prev;
+ __pyx_t_12 = __pyx_v_v_cur;
+ if (((__pyx_t_11 > __pyx_t_12) != 0)) {
+ __pyx_t_13 = __pyx_t_11;
+ } else {
+ __pyx_t_13 = __pyx_t_12;
+ }
+ __pyx_t_9 = __pyx_v_x;
+ __pyx_t_10 = __pyx_v_y;
+ __pyx_t_14 = __pyx_v_x;
+ __pyx_t_15 = __pyx_v_y;
+ *((float *) ( /* dim=1 */ ((char *) (((float *) ( /* dim=0 */ (__pyx_v_value.data + __pyx_t_14 * __pyx_v_value.strides[0]) )) + __pyx_t_15)) )) = (__pyx_t_13 + (*((float *) ( /* dim=1 */ ((char *) (((float *) ( /* dim=0 */ (__pyx_v_value.data + __pyx_t_9 * __pyx_v_value.strides[0]) )) + __pyx_t_10)) ))));
+ }
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":34
+ * value[x, y] = max(v_cur, v_prev) + value[x, y]
+ *
+ * for y in range(t_y - 1, -1, -1): # <<<<<<<<<<<<<<
+ * path[index, y] = 1
+ * if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
+ */
+ for (__pyx_t_1 = (__pyx_v_t_y - 1); __pyx_t_1 > -1; __pyx_t_1-=1) {
+ __pyx_v_y = __pyx_t_1;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":35
+ *
+ * for y in range(t_y - 1, -1, -1):
+ * path[index, y] = 1 # <<<<<<<<<<<<<<
+ * if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
+ * index = index - 1
+ */
+ __pyx_t_10 = __pyx_v_index;
+ __pyx_t_9 = __pyx_v_y;
+ *((int *) ( /* dim=1 */ ((char *) (((int *) ( /* dim=0 */ (__pyx_v_path.data + __pyx_t_10 * __pyx_v_path.strides[0]) )) + __pyx_t_9)) )) = 1;
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":36
+ * for y in range(t_y - 1, -1, -1):
+ * path[index, y] = 1
+ * if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): # <<<<<<<<<<<<<<
+ * index = index - 1
+ *
+ */
+ __pyx_t_16 = ((__pyx_v_index != 0) != 0);
+ if (__pyx_t_16) {
+ } else {
+ __pyx_t_8 = __pyx_t_16;
+ goto __pyx_L13_bool_binop_done;
+ }
+ __pyx_t_16 = ((__pyx_v_index == __pyx_v_y) != 0);
+ if (!__pyx_t_16) {
+ } else {
+ __pyx_t_8 = __pyx_t_16;
+ goto __pyx_L13_bool_binop_done;
+ }
+ __pyx_t_9 = __pyx_v_index;
+ __pyx_t_10 = (__pyx_v_y - 1);
+ __pyx_t_15 = (__pyx_v_index - 1);
+ __pyx_t_14 = (__pyx_v_y - 1);
+ __pyx_t_16 = (((*((float *) ( /* dim=1 */ ((char *) (((float *) ( /* dim=0 */ (__pyx_v_value.data + __pyx_t_9 * __pyx_v_value.strides[0]) )) + __pyx_t_10)) ))) < (*((float *) ( /* dim=1 */ ((char *) (((float *) ( /* dim=0 */ (__pyx_v_value.data + __pyx_t_15 * __pyx_v_value.strides[0]) )) + __pyx_t_14)) )))) != 0);
+ __pyx_t_8 = __pyx_t_16;
+ __pyx_L13_bool_binop_done:;
+ if (__pyx_t_8) {
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":37
+ * path[index, y] = 1
+ * if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
+ * index = index - 1 # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_v_index = (__pyx_v_index - 1);
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":36
+ * for y in range(t_y - 1, -1, -1):
+ * path[index, y] = 1
+ * if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): # <<<<<<<<<<<<<<
+ * index = index - 1
+ *
+ */
+ }
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":11
+ * @cython.boundscheck(False)
+ * @cython.wraparound(False)
+ * cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: # <<<<<<<<<<<<<<
+ * cdef int x
+ * cdef int y
+ */
+
+ /* function exit code */
+}
+
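+/* Note on the function above: it is the generated body of maximum_path_each()
+ * from core.pyx, the per-sample monotonic-alignment dynamic program. Per the
+ * interleaved source comments, the forward pass applies
+ *
+ *     value[x, y] = max(v_cur, v_prev) + value[x, y]
+ *
+ * where v_cur is value[x, y-1] (or max_neg_val on the diagonal x == y) and
+ * v_prev is value[x-1, y-1] (or 0./max_neg_val at the x == 0 boundary). The
+ * backward pass starts from index = t_x - 1, writes path[index, y] = 1 for
+ * y = t_y-1 down to 0, and decrements index whenever the diagonal predecessor
+ * scores higher. A rough Python-level sketch of the forward pass, restating
+ * the .pyx source shown in the comments (illustrative only):
+ *
+ *     for y in range(t_y):
+ *         for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+ *             v_cur = max_neg_val if x == y else value[x, y - 1]
+ *             if x == 0:
+ *                 v_prev = 0.0 if y == 0 else max_neg_val
+ *             else:
+ *                 v_prev = value[x - 1, y - 1]
+ *             value[x, y] = max(v_cur, v_prev) + value[x, y]
+ */
+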
+/* "TTS/tts/utils/monotonic_align/core.pyx":42
+ * @cython.boundscheck(False)
+ * @cython.wraparound(False)
+ * cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: # <<<<<<<<<<<<<<
+ * cdef int b = values.shape[0]
+ *
+ */
+
+static PyObject *__pyx_pw_3TTS_3tts_5utils_15monotonic_align_4core_1maximum_path_c(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static void __pyx_f_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c(__Pyx_memviewslice __pyx_v_paths, __Pyx_memviewslice __pyx_v_values, __Pyx_memviewslice __pyx_v_t_xs, __Pyx_memviewslice __pyx_v_t_ys, CYTHON_UNUSED int __pyx_skip_dispatch, struct __pyx_opt_args_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c *__pyx_optional_args) {
+ float __pyx_v_max_neg_val = __pyx_k_;
+ CYTHON_UNUSED int __pyx_v_b;
+ int __pyx_v_i;
+ int __pyx_t_1;
+ int __pyx_t_2;
+ int __pyx_t_3;
+ __Pyx_memviewslice __pyx_t_4 = { 0, 0, { 0 }, { 0 }, { 0 } };
+ __Pyx_memviewslice __pyx_t_5 = { 0, 0, { 0 }, { 0 }, { 0 } };
+ Py_ssize_t __pyx_t_6;
+ Py_ssize_t __pyx_t_7;
+ if (__pyx_optional_args) {
+ if (__pyx_optional_args->__pyx_n > 0) {
+ __pyx_v_max_neg_val = __pyx_optional_args->max_neg_val;
+ }
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":43
+ * @cython.wraparound(False)
+ * cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
+ * cdef int b = values.shape[0] # <<<<<<<<<<<<<<
+ *
+ * cdef int i
+ */
+ __pyx_v_b = (__pyx_v_values.shape[0]);
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":46
+ *
+ * cdef int i
+ * for i in prange(b, nogil=True): # <<<<<<<<<<<<<<
+ * maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
+ */
+ {
+ #ifdef WITH_THREAD
+ PyThreadState *_save;
+ Py_UNBLOCK_THREADS
+ __Pyx_FastGIL_Remember();
+ #endif
+ /*try:*/ {
+ __pyx_t_1 = __pyx_v_b;
+ if ((1 == 0)) abort();
+ {
+ #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
+ #undef likely
+ #undef unlikely
+ #define likely(x) (x)
+ #define unlikely(x) (x)
+ #endif
+ __pyx_t_3 = (__pyx_t_1 - 0 + 1 - 1/abs(1)) / 1;
+ if (__pyx_t_3 > 0)
+ {
+ #ifdef _OPENMP
+ #pragma omp parallel private(__pyx_t_6, __pyx_t_7) firstprivate(__pyx_t_4, __pyx_t_5)
+ #endif /* _OPENMP */
+ {
+ #ifdef _OPENMP
+ #pragma omp for firstprivate(__pyx_v_i) lastprivate(__pyx_v_i)
+ #endif /* _OPENMP */
+ for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_3; __pyx_t_2++){
+ {
+ __pyx_v_i = (int)(0 + 1 * __pyx_t_2);
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":47
+ * cdef int i
+ * for i in prange(b, nogil=True):
+ * maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) # <<<<<<<<<<<<<<
+ */
+ __pyx_t_4.data = __pyx_v_paths.data;
+ __pyx_t_4.memview = __pyx_v_paths.memview;
+ __PYX_INC_MEMVIEW(&__pyx_t_4, 0);
+ {
+ Py_ssize_t __pyx_tmp_idx = __pyx_v_i;
+ Py_ssize_t __pyx_tmp_stride = __pyx_v_paths.strides[0];
+ __pyx_t_4.data += __pyx_tmp_idx * __pyx_tmp_stride;
+}
+
+__pyx_t_4.shape[0] = __pyx_v_paths.shape[1];
+__pyx_t_4.strides[0] = __pyx_v_paths.strides[1];
+ __pyx_t_4.suboffsets[0] = -1;
+
+__pyx_t_4.shape[1] = __pyx_v_paths.shape[2];
+__pyx_t_4.strides[1] = __pyx_v_paths.strides[2];
+ __pyx_t_4.suboffsets[1] = -1;
+
+__pyx_t_5.data = __pyx_v_values.data;
+ __pyx_t_5.memview = __pyx_v_values.memview;
+ __PYX_INC_MEMVIEW(&__pyx_t_5, 0);
+ {
+ Py_ssize_t __pyx_tmp_idx = __pyx_v_i;
+ Py_ssize_t __pyx_tmp_stride = __pyx_v_values.strides[0];
+ __pyx_t_5.data += __pyx_tmp_idx * __pyx_tmp_stride;
+}
+
+__pyx_t_5.shape[0] = __pyx_v_values.shape[1];
+__pyx_t_5.strides[0] = __pyx_v_values.strides[1];
+ __pyx_t_5.suboffsets[0] = -1;
+
+__pyx_t_5.shape[1] = __pyx_v_values.shape[2];
+__pyx_t_5.strides[1] = __pyx_v_values.strides[2];
+ __pyx_t_5.suboffsets[1] = -1;
+
+__pyx_t_6 = __pyx_v_i;
+ __pyx_t_7 = __pyx_v_i;
+ __pyx_f_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_each(__pyx_t_4, __pyx_t_5, (*((int *) ( /* dim=0 */ ((char *) (((int *) __pyx_v_t_xs.data) + __pyx_t_6)) ))), (*((int *) ( /* dim=0 */ ((char *) (((int *) __pyx_v_t_ys.data) + __pyx_t_7)) ))), __pyx_v_max_neg_val);
+ __PYX_XDEC_MEMVIEW(&__pyx_t_4, 0);
+ __pyx_t_4.memview = NULL;
+ __pyx_t_4.data = NULL;
+ __PYX_XDEC_MEMVIEW(&__pyx_t_5, 0);
+ __pyx_t_5.memview = NULL;
+ __pyx_t_5.data = NULL;
+ }
+ }
+ }
+ }
+ }
+ #if ((defined(__APPLE__) || defined(__OSX__)) && (defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))))
+ #undef likely
+ #undef unlikely
+ #define likely(x) __builtin_expect(!!(x), 1)
+ #define unlikely(x) __builtin_expect(!!(x), 0)
+ #endif
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":46
+ *
+ * cdef int i
+ * for i in prange(b, nogil=True): # <<<<<<<<<<<<<<
+ * maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
+ */
+ /*finally:*/ {
+ /*normal exit:*/{
+ #ifdef WITH_THREAD
+ __Pyx_FastGIL_Forget();
+ Py_BLOCK_THREADS
+ #endif
+ goto __pyx_L5;
+ }
+ __pyx_L5:;
+ }
+ }
+
+ /* "TTS/tts/utils/monotonic_align/core.pyx":42
+ * @cython.boundscheck(False)
+ * @cython.wraparound(False)
+ * cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: # <<<<<<<<<<<<<<
+ * cdef int b = values.shape[0]
+ *
+ */
+
+ /* function exit code */
+}
+
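+/* Note on the function above: this is the generated C entry point for the
+ * cpdef maximum_path_c() in core.pyx. It reads the batch size from
+ * values.shape[0] and runs the per-sample DP over the batch via
+ * "for i in prange(b, nogil=True)": when the extension is compiled with
+ * OpenMP (_OPENMP defined) the loop body runs under "#pragma omp parallel for";
+ * otherwise it degrades to a plain serial loop. Each iteration slices
+ * paths[i] and values[i] by hand (adjusting the data pointer, shape and
+ * strides of a temporary memoryview slice) and forwards t_xs[i], t_ys[i] and
+ * max_neg_val, which falls back to the module constant __pyx_k_ (-1e9 in the
+ * .pyx signature) when the optional-argument struct does not supply it.
+ */
+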
+/* Python wrapper */
+static PyObject *__pyx_pw_3TTS_3tts_5utils_15monotonic_align_4core_1maximum_path_c(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static PyObject *__pyx_pw_3TTS_3tts_5utils_15monotonic_align_4core_1maximum_path_c(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ __Pyx_memviewslice __pyx_v_paths = { 0, 0, { 0 }, { 0 }, { 0 } };
+ __Pyx_memviewslice __pyx_v_values = { 0, 0, { 0 }, { 0 }, { 0 } };
+ __Pyx_memviewslice __pyx_v_t_xs = { 0, 0, { 0 }, { 0 }, { 0 } };
+ __Pyx_memviewslice __pyx_v_t_ys = { 0, 0, { 0 }, { 0 }, { 0 } };
+ float __pyx_v_max_neg_val;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("maximum_path_c (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_paths,&__pyx_n_s_values,&__pyx_n_s_t_xs,&__pyx_n_s_t_ys,&__pyx_n_s_max_neg_val,0};
+ PyObject* values[5] = {0,0,0,0,0};
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ CYTHON_FALLTHROUGH;
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ CYTHON_FALLTHROUGH;
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_paths)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ CYTHON_FALLTHROUGH;
+ case 1:
+ if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_values)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("maximum_path_c", 0, 4, 5, 1); __PYX_ERR(0, 42, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 2:
+ if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_t_xs)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("maximum_path_c", 0, 4, 5, 2); __PYX_ERR(0, 42, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 3:
+ if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_t_ys)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("maximum_path_c", 0, 4, 5, 3); __PYX_ERR(0, 42, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 4:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_max_neg_val);
+ if (value) { values[4] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "maximum_path_c") < 0)) __PYX_ERR(0, 42, __pyx_L3_error)
+ }
+ } else {
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ }
+ __pyx_v_paths = __Pyx_PyObject_to_MemoryviewSlice_d_d_dc_int(values[0], PyBUF_WRITABLE); if (unlikely(!__pyx_v_paths.memview)) __PYX_ERR(0, 42, __pyx_L3_error)
+ __pyx_v_values = __Pyx_PyObject_to_MemoryviewSlice_d_d_dc_float(values[1], PyBUF_WRITABLE); if (unlikely(!__pyx_v_values.memview)) __PYX_ERR(0, 42, __pyx_L3_error)
+ __pyx_v_t_xs = __Pyx_PyObject_to_MemoryviewSlice_dc_int(values[2], PyBUF_WRITABLE); if (unlikely(!__pyx_v_t_xs.memview)) __PYX_ERR(0, 42, __pyx_L3_error)
+ __pyx_v_t_ys = __Pyx_PyObject_to_MemoryviewSlice_dc_int(values[3], PyBUF_WRITABLE); if (unlikely(!__pyx_v_t_ys.memview)) __PYX_ERR(0, 42, __pyx_L3_error)
+ if (values[4]) {
+ __pyx_v_max_neg_val = __pyx_PyFloat_AsFloat(values[4]); if (unlikely((__pyx_v_max_neg_val == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 42, __pyx_L3_error)
+ } else {
+ __pyx_v_max_neg_val = __pyx_k_;
+ }
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("maximum_path_c", 0, 4, 5, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 42, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("TTS.tts.utils.monotonic_align.core.maximum_path_c", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return NULL;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_pf_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c(__pyx_self, __pyx_v_paths, __pyx_v_values, __pyx_v_t_xs, __pyx_v_t_ys, __pyx_v_max_neg_val);
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_paths, __Pyx_memviewslice __pyx_v_values, __Pyx_memviewslice __pyx_v_t_xs, __Pyx_memviewslice __pyx_v_t_ys, float __pyx_v_max_neg_val) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ struct __pyx_opt_args_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("maximum_path_c", 0);
+ __Pyx_XDECREF(__pyx_r);
+ if (unlikely(!__pyx_v_paths.memview)) { __Pyx_RaiseUnboundLocalError("paths"); __PYX_ERR(0, 42, __pyx_L1_error) }
+ if (unlikely(!__pyx_v_values.memview)) { __Pyx_RaiseUnboundLocalError("values"); __PYX_ERR(0, 42, __pyx_L1_error) }
+ if (unlikely(!__pyx_v_t_xs.memview)) { __Pyx_RaiseUnboundLocalError("t_xs"); __PYX_ERR(0, 42, __pyx_L1_error) }
+ if (unlikely(!__pyx_v_t_ys.memview)) { __Pyx_RaiseUnboundLocalError("t_ys"); __PYX_ERR(0, 42, __pyx_L1_error) }
+ __pyx_t_1.__pyx_n = 1;
+ __pyx_t_1.max_neg_val = __pyx_v_max_neg_val;
+ __pyx_f_3TTS_3tts_5utils_15monotonic_align_4core_maximum_path_c(__pyx_v_paths, __pyx_v_values, __pyx_v_t_xs, __pyx_v_t_ys, 0, &__pyx_t_1);
+ __pyx_t_2 = __Pyx_void_to_None(NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 42, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_r = __pyx_t_2;
+ __pyx_t_2 = 0;
+ goto __pyx_L0;
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_AddTraceback("TTS.tts.utils.monotonic_align.core.maximum_path_c", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __PYX_XDEC_MEMVIEW(&__pyx_v_paths, 1);
+ __PYX_XDEC_MEMVIEW(&__pyx_v_values, 1);
+ __PYX_XDEC_MEMVIEW(&__pyx_v_t_xs, 1);
+ __PYX_XDEC_MEMVIEW(&__pyx_v_t_ys, 1);
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
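+/* The two functions above are the Cython-generated Python wrapper and
+ * argument-unpacking shim for maximum_path_c(paths, values, t_xs, t_ys,
+ * max_neg_val=...), the batched monotonic-alignment search kernel exposed by
+ * TTS.tts.utils.monotonic_align.core. Judging from the memoryview conversions
+ * above, paths and values are 3-D int/float buffers contiguous in the last
+ * dimension, t_xs and t_ys are 1-D int length vectors, and max_neg_val is an
+ * optional float that falls back to the module-level default (__pyx_k_). */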
+
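+/* The inline helpers that follow, up to the View.MemoryView section, are not
+ * project code: they come from numpy's __init__.pxd (PyArray_MultiIterNew1..5,
+ * PyDataType_SHAPE, set_array_base/get_array_base, the import_* wrappers and
+ * the datetime64/timedelta64 accessors) and are copied by Cython into every
+ * extension module that cimports numpy. */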
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":739
+ * ctypedef long double complex clongdouble_t
+ *
+ * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(1, <void*>a)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":740
+ *
+ * cdef inline object PyArray_MultiIterNew1(a):
+ * return PyArray_MultiIterNew(1, <void*>a) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 740, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":739
+ * ctypedef long double complex clongdouble_t
+ *
+ * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(1, <void*>a)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":742
+ * return PyArray_MultiIterNew(1, <void*>a)
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":743
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ * return PyArray_MultiIterNew(2, <void*>a, <void*>b) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 743, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":742
+ * return PyArray_MultiIterNew(1, <void*>a)
+ *
+ * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":745
+ * return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*>c)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":746
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ * return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*>c) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 746, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":745
+ * return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ *
+ * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*>c)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":748
+ * return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*>c)
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*>d)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":749
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ * return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*>d) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 749, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":748
+ * return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*>c)
+ *
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*>d)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":751
+ * return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*>d)
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*>d, <void*>e)
+ *
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":752
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ * return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*>d, <void*>e) # <<<<<<<<<<<<<<
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 752, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":751
+ * return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*>d)
+ *
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<<
+ * return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*>d, <void*>e)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":754
+ * return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*>d, <void*>e)
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<<
+ * if PyDataType_HASSUBARRAY(d):
+ * return <tuple>d.subarray.shape
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":755
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<<
+ * return <tuple>d.subarray.shape
+ * else:
+ */
+ __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":756
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ * if PyDataType_HASSUBARRAY(d):
+ * return <tuple>d.subarray.shape # <<<<<<<<<<<<<<
+ * else:
+ * return ()
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_2 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_d), __pyx_n_s_subarray); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 756, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 756, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __Pyx_INCREF(((PyObject*)__pyx_t_3));
+ __pyx_r = ((PyObject*)__pyx_t_3);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":755
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d):
+ * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<<
+ * return <tuple>d.subarray.shape
+ * else:
+ */
+ }
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":758
+ * return <tuple>d.subarray.shape
+ * else:
+ * return () # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ /*else*/ {
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_INCREF(__pyx_empty_tuple);
+ __pyx_r = __pyx_empty_tuple;
+ goto __pyx_L0;
+ }
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":754
+ * return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*>d, <void*>e)
+ *
+ * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<<
+ * if PyDataType_HASSUBARRAY(d):
+ * return <tuple>d.subarray.shape
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_AddTraceback("numpy.PyDataType_SHAPE", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":949
+ * int _import_umath() except -1
+ *
+ * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<<
+ * Py_INCREF(base) # important to do this before stealing the reference below!
+ * PyArray_SetBaseObject(arr, base)
+ */
+
+static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) {
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("set_array_base", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":950
+ *
+ * cdef inline void set_array_base(ndarray arr, object base):
+ * Py_INCREF(base) # important to do this before stealing the reference below! # <<<<<<<<<<<<<<
+ * PyArray_SetBaseObject(arr, base)
+ *
+ */
+ Py_INCREF(__pyx_v_base);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":951
+ * cdef inline void set_array_base(ndarray arr, object base):
+ * Py_INCREF(base) # important to do this before stealing the reference below!
+ * PyArray_SetBaseObject(arr, base) # <<<<<<<<<<<<<<
+ *
+ * cdef inline object get_array_base(ndarray arr):
+ */
+ __pyx_t_1 = PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_base); if (unlikely(__pyx_t_1 == ((int)-1))) __PYX_ERR(1, 951, __pyx_L1_error)
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":949
+ * int _import_umath() except -1
+ *
+ * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<<
+ * Py_INCREF(base) # important to do this before stealing the reference below!
+ * PyArray_SetBaseObject(arr, base)
+ */
+
+ /* function exit code */
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_WriteUnraisable("numpy.set_array_base", __pyx_clineno, __pyx_lineno, __pyx_filename, 1, 0);
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":953
+ * PyArray_SetBaseObject(arr, base)
+ *
+ * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<<
+ * base = PyArray_BASE(arr)
+ * if base is NULL:
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) {
+ PyObject *__pyx_v_base;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ __Pyx_RefNannySetupContext("get_array_base", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":954
+ *
+ * cdef inline object get_array_base(ndarray arr):
+ * base = PyArray_BASE(arr) # <<<<<<<<<<<<<<
+ * if base is NULL:
+ * return None
+ */
+ __pyx_v_base = PyArray_BASE(__pyx_v_arr);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":955
+ * cdef inline object get_array_base(ndarray arr):
+ * base = PyArray_BASE(arr)
+ * if base is NULL: # <<<<<<<<<<<<<<
+ * return None
+ * return <object>base
+ */
+ __pyx_t_1 = ((__pyx_v_base == NULL) != 0);
+ if (__pyx_t_1) {
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":956
+ * base = PyArray_BASE(arr)
+ * if base is NULL:
+ * return None # <<<<<<<<<<<<<<
+ * return <object>base
+ *
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":955
+ * cdef inline object get_array_base(ndarray arr):
+ * base = PyArray_BASE(arr)
+ * if base is NULL: # <<<<<<<<<<<<<<
+ * return None
+ * return <object>base
+ */
+ }
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":957
+ * if base is NULL:
+ * return None
+ * return <object>base # <<<<<<<<<<<<<<
+ *
+ * # Versions of the import_* functions which are more suitable for
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_INCREF(((PyObject *)__pyx_v_base));
+ __pyx_r = ((PyObject *)__pyx_v_base);
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":953
+ * PyArray_SetBaseObject(arr, base)
+ *
+ * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<<
+ * base = PyArray_BASE(arr)
+ * if base is NULL:
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
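+/* import_array/import_umath/import_ufunc below wrap numpy's C-API
+ * initialisation: each calls the underlying _import_array()/_import_umath()
+ * and converts a failure into an ImportError, so module initialisation can
+ * abort cleanly when numpy is missing or was built against a different ABI. */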
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":961
+ * # Versions of the import_* functions which are more suitable for
+ * # Cython code.
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<<
+ * try:
+ * __pyx_import_array()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ PyObject *__pyx_t_2 = NULL;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_t_4;
+ PyObject *__pyx_t_5 = NULL;
+ PyObject *__pyx_t_6 = NULL;
+ PyObject *__pyx_t_7 = NULL;
+ PyObject *__pyx_t_8 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("import_array", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":962
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * __pyx_import_array()
+ * except Exception:
+ */
+ {
+ __Pyx_PyThreadState_declare
+ __Pyx_PyThreadState_assign
+ __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+ __Pyx_XGOTREF(__pyx_t_1);
+ __Pyx_XGOTREF(__pyx_t_2);
+ __Pyx_XGOTREF(__pyx_t_3);
+ /*try:*/ {
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":963
+ * cdef inline int import_array() except -1:
+ * try:
+ * __pyx_import_array() # <<<<<<<<<<<<<<
+ * except Exception:
+ * raise ImportError("numpy._core.multiarray failed to import")
+ */
+ __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 963, __pyx_L3_error)
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":962
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * __pyx_import_array()
+ * except Exception:
+ */
+ }
+ __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+ goto __pyx_L8_try_end;
+ __pyx_L3_error:;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":964
+ * try:
+ * __pyx_import_array()
+ * except Exception: # <<<<<<<<<<<<<<
+ * raise ImportError("numpy._core.multiarray failed to import")
+ *
+ */
+ __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+ if (__pyx_t_4) {
+ __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 964, __pyx_L5_except_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __Pyx_GOTREF(__pyx_t_6);
+ __Pyx_GOTREF(__pyx_t_7);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":965
+ * __pyx_import_array()
+ * except Exception:
+ * raise ImportError("numpy._core.multiarray failed to import") # <<<<<<<<<<<<<<
+ *
+ * cdef inline int import_umath() except -1:
+ */
+ __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 965, __pyx_L5_except_error)
+ __Pyx_GOTREF(__pyx_t_8);
+ __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+ __PYX_ERR(1, 965, __pyx_L5_except_error)
+ }
+ goto __pyx_L5_except_error;
+ __pyx_L5_except_error:;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":962
+ * # Cython code.
+ * cdef inline int import_array() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * __pyx_import_array()
+ * except Exception:
+ */
+ __Pyx_XGIVEREF(__pyx_t_1);
+ __Pyx_XGIVEREF(__pyx_t_2);
+ __Pyx_XGIVEREF(__pyx_t_3);
+ __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+ goto __pyx_L1_error;
+ __pyx_L8_try_end:;
+ }
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":961
+ * # Versions of the import_* functions which are more suitable for
+ * # Cython code.
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<<
+ * try:
+ * __pyx_import_array()
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_5);
+ __Pyx_XDECREF(__pyx_t_6);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_XDECREF(__pyx_t_8);
+ __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":967
+ * raise ImportError("numpy._core.multiarray failed to import")
+ *
+ * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<<
+ * try:
+ * _import_umath()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ PyObject *__pyx_t_2 = NULL;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_t_4;
+ PyObject *__pyx_t_5 = NULL;
+ PyObject *__pyx_t_6 = NULL;
+ PyObject *__pyx_t_7 = NULL;
+ PyObject *__pyx_t_8 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("import_umath", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":968
+ *
+ * cdef inline int import_umath() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * _import_umath()
+ * except Exception:
+ */
+ {
+ __Pyx_PyThreadState_declare
+ __Pyx_PyThreadState_assign
+ __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+ __Pyx_XGOTREF(__pyx_t_1);
+ __Pyx_XGOTREF(__pyx_t_2);
+ __Pyx_XGOTREF(__pyx_t_3);
+ /*try:*/ {
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":969
+ * cdef inline int import_umath() except -1:
+ * try:
+ * _import_umath() # <<<<<<<<<<<<<<
+ * except Exception:
+ * raise ImportError("numpy._core.umath failed to import")
+ */
+ __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 969, __pyx_L3_error)
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":968
+ *
+ * cdef inline int import_umath() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * _import_umath()
+ * except Exception:
+ */
+ }
+ __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+ goto __pyx_L8_try_end;
+ __pyx_L3_error:;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":970
+ * try:
+ * _import_umath()
+ * except Exception: # <<<<<<<<<<<<<<
+ * raise ImportError("numpy._core.umath failed to import")
+ *
+ */
+ __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+ if (__pyx_t_4) {
+ __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 970, __pyx_L5_except_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __Pyx_GOTREF(__pyx_t_6);
+ __Pyx_GOTREF(__pyx_t_7);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":971
+ * _import_umath()
+ * except Exception:
+ * raise ImportError("numpy._core.umath failed to import") # <<<<<<<<<<<<<<
+ *
+ * cdef inline int import_ufunc() except -1:
+ */
+ __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 971, __pyx_L5_except_error)
+ __Pyx_GOTREF(__pyx_t_8);
+ __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+ __PYX_ERR(1, 971, __pyx_L5_except_error)
+ }
+ goto __pyx_L5_except_error;
+ __pyx_L5_except_error:;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":968
+ *
+ * cdef inline int import_umath() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * _import_umath()
+ * except Exception:
+ */
+ __Pyx_XGIVEREF(__pyx_t_1);
+ __Pyx_XGIVEREF(__pyx_t_2);
+ __Pyx_XGIVEREF(__pyx_t_3);
+ __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+ goto __pyx_L1_error;
+ __pyx_L8_try_end:;
+ }
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":967
+ * raise ImportError("numpy._core.multiarray failed to import")
+ *
+ * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<<
+ * try:
+ * _import_umath()
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_5);
+ __Pyx_XDECREF(__pyx_t_6);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_XDECREF(__pyx_t_8);
+ __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":973
+ * raise ImportError("numpy._core.umath failed to import")
+ *
+ * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<<
+ * try:
+ * _import_umath()
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ PyObject *__pyx_t_2 = NULL;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_t_4;
+ PyObject *__pyx_t_5 = NULL;
+ PyObject *__pyx_t_6 = NULL;
+ PyObject *__pyx_t_7 = NULL;
+ PyObject *__pyx_t_8 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("import_ufunc", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":974
+ *
+ * cdef inline int import_ufunc() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * _import_umath()
+ * except Exception:
+ */
+ {
+ __Pyx_PyThreadState_declare
+ __Pyx_PyThreadState_assign
+ __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3);
+ __Pyx_XGOTREF(__pyx_t_1);
+ __Pyx_XGOTREF(__pyx_t_2);
+ __Pyx_XGOTREF(__pyx_t_3);
+ /*try:*/ {
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":975
+ * cdef inline int import_ufunc() except -1:
+ * try:
+ * _import_umath() # <<<<<<<<<<<<<<
+ * except Exception:
+ * raise ImportError("numpy._core.umath failed to import")
+ */
+ __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 975, __pyx_L3_error)
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":974
+ *
+ * cdef inline int import_ufunc() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * _import_umath()
+ * except Exception:
+ */
+ }
+ __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+ goto __pyx_L8_try_end;
+ __pyx_L3_error:;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":976
+ * try:
+ * _import_umath()
+ * except Exception: # <<<<<<<<<<<<<<
+ * raise ImportError("numpy._core.umath failed to import")
+ *
+ */
+ __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])));
+ if (__pyx_t_4) {
+ __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 976, __pyx_L5_except_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __Pyx_GOTREF(__pyx_t_6);
+ __Pyx_GOTREF(__pyx_t_7);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":977
+ * _import_umath()
+ * except Exception:
+ * raise ImportError("numpy._core.umath failed to import") # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 977, __pyx_L5_except_error)
+ __Pyx_GOTREF(__pyx_t_8);
+ __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+ __PYX_ERR(1, 977, __pyx_L5_except_error)
+ }
+ goto __pyx_L5_except_error;
+ __pyx_L5_except_error:;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":974
+ *
+ * cdef inline int import_ufunc() except -1:
+ * try: # <<<<<<<<<<<<<<
+ * _import_umath()
+ * except Exception:
+ */
+ __Pyx_XGIVEREF(__pyx_t_1);
+ __Pyx_XGIVEREF(__pyx_t_2);
+ __Pyx_XGIVEREF(__pyx_t_3);
+ __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3);
+ goto __pyx_L1_error;
+ __pyx_L8_try_end:;
+ }
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":973
+ * raise ImportError("numpy._core.umath failed to import")
+ *
+ * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<<
+ * try:
+ * _import_umath()
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_5);
+ __Pyx_XDECREF(__pyx_t_6);
+ __Pyx_XDECREF(__pyx_t_7);
+ __Pyx_XDECREF(__pyx_t_8);
+ __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":980
+ *
+ *
+ * cdef inline bint is_timedelta64_object(object obj): # <<<<<<<<<<<<<<
+ * """
+ * Cython equivalent of `isinstance(obj, np.timedelta64)`
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_is_timedelta64_object(PyObject *__pyx_v_obj) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("is_timedelta64_object", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":992
+ * bool
+ * """
+ * return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type) # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = PyObject_TypeCheck(__pyx_v_obj, (&PyTimedeltaArrType_Type));
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":980
+ *
+ *
+ * cdef inline bint is_timedelta64_object(object obj): # <<<<<<<<<<<<<<
+ * """
+ * Cython equivalent of `isinstance(obj, np.timedelta64)`
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":995
+ *
+ *
+ * cdef inline bint is_datetime64_object(object obj): # <<<<<<<<<<<<<<
+ * """
+ * Cython equivalent of `isinstance(obj, np.datetime64)`
+ */
+
+static CYTHON_INLINE int __pyx_f_5numpy_is_datetime64_object(PyObject *__pyx_v_obj) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("is_datetime64_object", 0);
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1007
+ * bool
+ * """
+ * return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type) # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = PyObject_TypeCheck(__pyx_v_obj, (&PyDatetimeArrType_Type));
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":995
+ *
+ *
+ * cdef inline bint is_datetime64_object(object obj): # <<<<<<<<<<<<<<
+ * """
+ * Cython equivalent of `isinstance(obj, np.datetime64)`
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1010
+ *
+ *
+ * cdef inline npy_datetime get_datetime64_value(object obj) nogil: # <<<<<<<<<<<<<<
+ * """
+ * returns the int64 value underlying scalar numpy datetime64 object
+ */
+
+static CYTHON_INLINE npy_datetime __pyx_f_5numpy_get_datetime64_value(PyObject *__pyx_v_obj) {
+ npy_datetime __pyx_r;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1017
+ * also needed. That can be found using `get_datetime64_unit`.
+ * """
+ * return (<PyDatetimeScalarObject*>obj).obval # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = ((PyDatetimeScalarObject *)__pyx_v_obj)->obval;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1010
+ *
+ *
+ * cdef inline npy_datetime get_datetime64_value(object obj) nogil: # <<<<<<<<<<<<<<
+ * """
+ * returns the int64 value underlying scalar numpy datetime64 object
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1020
+ *
+ *
+ * cdef inline npy_timedelta get_timedelta64_value(object obj) nogil: # <<<<<<<<<<<<<<
+ * """
+ * returns the int64 value underlying scalar numpy timedelta64 object
+ */
+
+static CYTHON_INLINE npy_timedelta __pyx_f_5numpy_get_timedelta64_value(PyObject *__pyx_v_obj) {
+ npy_timedelta __pyx_r;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1024
+ * returns the int64 value underlying scalar numpy timedelta64 object
+ * """
+ * return (<PyTimedeltaScalarObject*>obj).obval # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = ((PyTimedeltaScalarObject *)__pyx_v_obj)->obval;
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1020
+ *
+ *
+ * cdef inline npy_timedelta get_timedelta64_value(object obj) nogil: # <<<<<<<<<<<<<<
+ * """
+ * returns the int64 value underlying scalar numpy timedelta64 object
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1027
+ *
+ *
+ * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil: # <<<<<<<<<<<<<<
+ * """
+ * returns the unit part of the dtype for a numpy datetime64 object.
+ */
+
+static CYTHON_INLINE NPY_DATETIMEUNIT __pyx_f_5numpy_get_datetime64_unit(PyObject *__pyx_v_obj) {
+ NPY_DATETIMEUNIT __pyx_r;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1031
+ * returns the unit part of the dtype for a numpy datetime64 object.
+ * """
+ * return <NPY_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = ((NPY_DATETIMEUNIT)((PyDatetimeScalarObject *)__pyx_v_obj)->obmeta.base);
+ goto __pyx_L0;
+
+ /* "../../../../../../private/var/folders/w1/36c6k6fx2bx5_d7s_9z_nnhr0000gn/T/pip-build-env-05vrrlbr/overlay/lib/python3.10/site-packages/numpy/__init__.pxd":1027
+ *
+ *
+ * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil: # <<<<<<<<<<<<<<
+ * """
+ * returns the unit part of the dtype for a numpy datetime64 object.
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
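+/* From here on the file contains Cython's bundled View.MemoryView support
+ * code (the cython.array type and the typed-memoryview machinery). It is
+ * emitted into every module that uses typed memoryviews and is independent of
+ * the monotonic_align kernel itself. */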
+/* "View.MemoryView":123
+ * cdef bint dtype_is_object
+ *
+ * def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None, # <<<<<<<<<<<<<<
+ * mode="c", bint allocate_buffer=True):
+ *
+ */
+
+/* Python wrapper */
+static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_shape = 0;
+ Py_ssize_t __pyx_v_itemsize;
+ PyObject *__pyx_v_format = 0;
+ PyObject *__pyx_v_mode = 0;
+ int __pyx_v_allocate_buffer;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_shape,&__pyx_n_s_itemsize,&__pyx_n_s_format,&__pyx_n_s_mode,&__pyx_n_s_allocate_buffer,0};
+ PyObject* values[5] = {0,0,0,0,0};
+ values[3] = ((PyObject *)__pyx_n_s_c);
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ CYTHON_FALLTHROUGH;
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ CYTHON_FALLTHROUGH;
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_shape)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ CYTHON_FALLTHROUGH;
+ case 1:
+ if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_itemsize)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, 1); __PYX_ERR(2, 123, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 2:
+ if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_format)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, 2); __PYX_ERR(2, 123, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 3:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_mode);
+ if (value) { values[3] = value; kw_args--; }
+ }
+ CYTHON_FALLTHROUGH;
+ case 4:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_allocate_buffer);
+ if (value) { values[4] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(2, 123, __pyx_L3_error)
+ }
+ } else {
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ CYTHON_FALLTHROUGH;
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ CYTHON_FALLTHROUGH;
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ }
+ __pyx_v_shape = ((PyObject*)values[0]);
+ __pyx_v_itemsize = __Pyx_PyIndex_AsSsize_t(values[1]); if (unlikely((__pyx_v_itemsize == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(2, 123, __pyx_L3_error)
+ __pyx_v_format = values[2];
+ __pyx_v_mode = values[3];
+ if (values[4]) {
+ __pyx_v_allocate_buffer = __Pyx_PyObject_IsTrue(values[4]); if (unlikely((__pyx_v_allocate_buffer == (int)-1) && PyErr_Occurred())) __PYX_ERR(2, 124, __pyx_L3_error)
+ } else {
+
+ /* "View.MemoryView":124
+ *
+ * def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None,
+ * mode="c", bint allocate_buffer=True): # <<<<<<<<<<<<<<
+ *
+ * cdef int idx
+ */
+ __pyx_v_allocate_buffer = ((int)1);
+ }
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 3, 5, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(2, 123, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("View.MemoryView.array.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return -1;
+ __pyx_L4_argument_unpacking_done:;
+ if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_shape), (&PyTuple_Type), 1, "shape", 1))) __PYX_ERR(2, 123, __pyx_L1_error)
+ if (unlikely(((PyObject *)__pyx_v_format) == Py_None)) {
+ PyErr_Format(PyExc_TypeError, "Argument '%.200s' must not be None", "format"); __PYX_ERR(2, 123, __pyx_L1_error)
+ }
+ __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(((struct __pyx_array_obj *)__pyx_v_self), __pyx_v_shape, __pyx_v_itemsize, __pyx_v_format, __pyx_v_mode, __pyx_v_allocate_buffer);
+
+ /* "View.MemoryView":123
+ * cdef bint dtype_is_object
+ *
+ * def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None, # <<<<<<<<<<<<<<
+ * mode="c", bint allocate_buffer=True):
+ *
+ */
+
+ /* function exit code */
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
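+/* __pyx_array___cinit__ above is the argument-unpacking wrapper for
+ * cython.array.__cinit__(shape, itemsize, format, mode="c",
+ * allocate_buffer=True); the implementation below validates those arguments
+ * and allocates the combined _shape/_strides buffer with PyObject_Malloc. */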
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, PyObject *__pyx_v_format, PyObject *__pyx_v_mode, int __pyx_v_allocate_buffer) {
+ int __pyx_v_idx;
+ Py_ssize_t __pyx_v_i;
+ Py_ssize_t __pyx_v_dim;
+ PyObject **__pyx_v_p;
+ char __pyx_v_order;
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ Py_ssize_t __pyx_t_1;
+ int __pyx_t_2;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_t_4;
+ PyObject *__pyx_t_5 = NULL;
+ PyObject *__pyx_t_6 = NULL;
+ char *__pyx_t_7;
+ int __pyx_t_8;
+ Py_ssize_t __pyx_t_9;
+ PyObject *__pyx_t_10 = NULL;
+ Py_ssize_t __pyx_t_11;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__cinit__", 0);
+ __Pyx_INCREF(__pyx_v_format);
+
+ /* "View.MemoryView":130
+ * cdef PyObject **p
+ *
+ * self.ndim = <int> len(shape) # <<<<<<<<<<<<<<
+ * self.itemsize = itemsize
+ *
+ */
+ if (unlikely(__pyx_v_shape == Py_None)) {
+ PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()");
+ __PYX_ERR(2, 130, __pyx_L1_error)
+ }
+ __pyx_t_1 = PyTuple_GET_SIZE(__pyx_v_shape); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(2, 130, __pyx_L1_error)
+ __pyx_v_self->ndim = ((int)__pyx_t_1);
+
+ /* "View.MemoryView":131
+ *
+ * self.ndim = <int> len(shape)
+ * self.itemsize = itemsize # <<<<<<<<<<<<<<
+ *
+ * if not self.ndim:
+ */
+ __pyx_v_self->itemsize = __pyx_v_itemsize;
+
+ /* "View.MemoryView":133
+ * self.itemsize = itemsize
+ *
+ * if not self.ndim: # <<<<<<<<<<<<<<
+ * raise ValueError("Empty shape tuple for cython.array")
+ *
+ */
+ __pyx_t_2 = ((!(__pyx_v_self->ndim != 0)) != 0);
+ if (unlikely(__pyx_t_2)) {
+
+ /* "View.MemoryView":134
+ *
+ * if not self.ndim:
+ * raise ValueError("Empty shape tuple for cython.array") # <<<<<<<<<<<<<<
+ *
+ * if itemsize <= 0:
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 134, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(2, 134, __pyx_L1_error)
+
+ /* "View.MemoryView":133
+ * self.itemsize = itemsize
+ *
+ * if not self.ndim: # <<<<<<<<<<<<<<
+ * raise ValueError("Empty shape tuple for cython.array")
+ *
+ */
+ }
+
+ /* "View.MemoryView":136
+ * raise ValueError("Empty shape tuple for cython.array")
+ *
+ * if itemsize <= 0: # <<<<<<<<<<<<<<
+ * raise ValueError("itemsize <= 0 for cython.array")
+ *
+ */
+ __pyx_t_2 = ((__pyx_v_itemsize <= 0) != 0);
+ if (unlikely(__pyx_t_2)) {
+
+ /* "View.MemoryView":137
+ *
+ * if itemsize <= 0:
+ * raise ValueError("itemsize <= 0 for cython.array") # <<<<<<<<<<<<<<
+ *
+ * if not isinstance(format, bytes):
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 137, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(2, 137, __pyx_L1_error)
+
+ /* "View.MemoryView":136
+ * raise ValueError("Empty shape tuple for cython.array")
+ *
+ * if itemsize <= 0: # <<<<<<<<<<<<<<
+ * raise ValueError("itemsize <= 0 for cython.array")
+ *
+ */
+ }
+
+ /* "View.MemoryView":139
+ * raise ValueError("itemsize <= 0 for cython.array")
+ *
+ * if not isinstance(format, bytes): # <<<<<<<<<<<<<<
+ * format = format.encode('ASCII')
+ * self._format = format # keep a reference to the byte string
+ */
+ __pyx_t_2 = PyBytes_Check(__pyx_v_format);
+ __pyx_t_4 = ((!(__pyx_t_2 != 0)) != 0);
+ if (__pyx_t_4) {
+
+ /* "View.MemoryView":140
+ *
+ * if not isinstance(format, bytes):
+ * format = format.encode('ASCII') # <<<<<<<<<<<<<<
+ * self._format = format # keep a reference to the byte string
+ * self.format = self._format
+ */
+ __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_format, __pyx_n_s_encode); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 140, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __pyx_t_6 = NULL;
+ if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_5))) {
+ __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_5);
+ if (likely(__pyx_t_6)) {
+ PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5);
+ __Pyx_INCREF(__pyx_t_6);
+ __Pyx_INCREF(function);
+ __Pyx_DECREF_SET(__pyx_t_5, function);
+ }
+ }
+ __pyx_t_3 = (__pyx_t_6) ? __Pyx_PyObject_Call2Args(__pyx_t_5, __pyx_t_6, __pyx_n_s_ASCII) : __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_n_s_ASCII);
+ __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0;
+ if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 140, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+ __Pyx_DECREF_SET(__pyx_v_format, __pyx_t_3);
+ __pyx_t_3 = 0;
+
+ /* "View.MemoryView":139
+ * raise ValueError("itemsize <= 0 for cython.array")
+ *
+ * if not isinstance(format, bytes): # <<<<<<<<<<<<<<
+ * format = format.encode('ASCII')
+ * self._format = format # keep a reference to the byte string
+ */
+ }
+
+ /* "View.MemoryView":141
+ * if not isinstance(format, bytes):
+ * format = format.encode('ASCII')
+ * self._format = format # keep a reference to the byte string # <<<<<<<<<<<<<<
+ * self.format = self._format
+ *
+ */
+ if (!(likely(PyBytes_CheckExact(__pyx_v_format))||((__pyx_v_format) == Py_None)||((void)PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_v_format)->tp_name), 0))) __PYX_ERR(2, 141, __pyx_L1_error)
+ __pyx_t_3 = __pyx_v_format;
+ __Pyx_INCREF(__pyx_t_3);
+ __Pyx_GIVEREF(__pyx_t_3);
+ __Pyx_GOTREF(__pyx_v_self->_format);
+ __Pyx_DECREF(__pyx_v_self->_format);
+ __pyx_v_self->_format = ((PyObject*)__pyx_t_3);
+ __pyx_t_3 = 0;
+
+ /* "View.MemoryView":142
+ * format = format.encode('ASCII')
+ * self._format = format # keep a reference to the byte string
+ * self.format = self._format # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ if (unlikely(__pyx_v_self->_format == Py_None)) {
+ PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found");
+ __PYX_ERR(2, 142, __pyx_L1_error)
+ }
+ __pyx_t_7 = __Pyx_PyBytes_AsWritableString(__pyx_v_self->_format); if (unlikely((!__pyx_t_7) && PyErr_Occurred())) __PYX_ERR(2, 142, __pyx_L1_error)
+ __pyx_v_self->format = __pyx_t_7;
+
+ /* "View.MemoryView":145
+ *
+ *
+ * self._shape = <Py_ssize_t *> PyObject_Malloc(sizeof(Py_ssize_t)*self.ndim*2) # <<<<<<<<<<<<<<
+ * self._strides = self._shape + self.ndim
+ *
+ */
+ __pyx_v_self->_shape = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * __pyx_v_self->ndim) * 2)));
+
+ /* "View.MemoryView":146
+ *
+ * self._shape = <Py_ssize_t *> PyObject_Malloc(sizeof(Py_ssize_t)*self.ndim*2)
+ * self._strides = self._shape + self.ndim # <<<<<<<<<<<<<<
+ *
+ * if not self._shape:
+ */
+ __pyx_v_self->_strides = (__pyx_v_self->_shape + __pyx_v_self->ndim);
+
+ /* "View.MemoryView":148
+ * self._strides = self._shape + self.ndim
+ *
+ * if not self._shape: # <<<<<<<<<<<<<<
+ * raise MemoryError("unable to allocate shape and strides.")
+ *
+ */
+ __pyx_t_4 = ((!(__pyx_v_self->_shape != 0)) != 0);
+ if (unlikely(__pyx_t_4)) {
+
+ /* "View.MemoryView":149
+ *
+ * if not self._shape:
+ * raise MemoryError("unable to allocate shape and strides.") # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_MemoryError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 149, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(2, 149, __pyx_L1_error)
+
+ /* "View.MemoryView":148
+ * self._strides = self._shape + self.ndim
+ *
+ * if not self._shape: # <<<<<<<<<<<<<<
+ * raise MemoryError("unable to allocate shape and strides.")
+ *
+ */
+ }
+
+ /* "View.MemoryView":152
+ *
+ *
+ * for idx, dim in enumerate(shape): # <<<<<<<<<<<<<<
+ * if dim <= 0:
+ * raise ValueError("Invalid shape in axis %d: %d." % (idx, dim))
+ */
+ __pyx_t_8 = 0;
+ __pyx_t_3 = __pyx_v_shape; __Pyx_INCREF(__pyx_t_3); __pyx_t_1 = 0;
+ for (;;) {
+ if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_3)) break;
+ #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+ __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(2, 152, __pyx_L1_error)
+ #else
+ __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 152, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ #endif
+ __pyx_t_9 = __Pyx_PyIndex_AsSsize_t(__pyx_t_5); if (unlikely((__pyx_t_9 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(2, 152, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+ __pyx_v_dim = __pyx_t_9;
+ __pyx_v_idx = __pyx_t_8;
+ __pyx_t_8 = (__pyx_t_8 + 1);
+
+ /* "View.MemoryView":153
+ *
+ * for idx, dim in enumerate(shape):
+ * if dim <= 0: # <<<<<<<<<<<<<<
+ * raise ValueError("Invalid shape in axis %d: %d." % (idx, dim))
+ * self._shape[idx] = dim
+ */
+ __pyx_t_4 = ((__pyx_v_dim <= 0) != 0);
+ if (unlikely(__pyx_t_4)) {
+
+ /* "View.MemoryView":154
+ * for idx, dim in enumerate(shape):
+ * if dim <= 0:
+ * raise ValueError("Invalid shape in axis %d: %d." % (idx, dim)) # <<<<<<<<<<<<<<
+ * self._shape[idx] = dim
+ *
+ */
+ __pyx_t_5 = __Pyx_PyInt_From_int(__pyx_v_idx); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 154, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __pyx_t_6 = PyInt_FromSsize_t(__pyx_v_dim); if (unlikely(!__pyx_t_6)) __PYX_ERR(2, 154, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_6);
+ __pyx_t_10 = PyTuple_New(2); if (unlikely(!__pyx_t_10)) __PYX_ERR(2, 154, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_10);
+ __Pyx_GIVEREF(__pyx_t_5);
+ PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_5);
+ __Pyx_GIVEREF(__pyx_t_6);
+ PyTuple_SET_ITEM(__pyx_t_10, 1, __pyx_t_6);
+ __pyx_t_5 = 0;
+ __pyx_t_6 = 0;
+ __pyx_t_6 = __Pyx_PyString_Format(__pyx_kp_s_Invalid_shape_in_axis_d_d, __pyx_t_10); if (unlikely(!__pyx_t_6)) __PYX_ERR(2, 154, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_6);
+ __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+ __pyx_t_10 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_6); if (unlikely(!__pyx_t_10)) __PYX_ERR(2, 154, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_10);
+ __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+ __Pyx_Raise(__pyx_t_10, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+ __PYX_ERR(2, 154, __pyx_L1_error)
+
+ /* "View.MemoryView":153
+ *
+ * for idx, dim in enumerate(shape):
+ * if dim <= 0: # <<<<<<<<<<<<<<
+ * raise ValueError("Invalid shape in axis %d: %d." % (idx, dim))
+ * self._shape[idx] = dim
+ */
+ }
+
+ /* "View.MemoryView":155
+ * if dim <= 0:
+ * raise ValueError("Invalid shape in axis %d: %d." % (idx, dim))
+ * self._shape[idx] = dim # <<<<<<<<<<<<<<
+ *
+ * cdef char order
+ */
+ (__pyx_v_self->_shape[__pyx_v_idx]) = __pyx_v_dim;
+
+ /* "View.MemoryView":152
+ *
+ *
+ * for idx, dim in enumerate(shape): # <<<<<<<<<<<<<<
+ * if dim <= 0:
+ * raise ValueError("Invalid shape in axis %d: %d." % (idx, dim))
+ */
+ }
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+
+ /* "View.MemoryView":158
+ *
+ * cdef char order
+ * if mode == 'fortran': # <<<<<<<<<<<<<<
+ * order = b'F'
+ * self.mode = u'fortran'
+ */
+ __pyx_t_4 = (__Pyx_PyString_Equals(__pyx_v_mode, __pyx_n_s_fortran, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(2, 158, __pyx_L1_error)
+ if (__pyx_t_4) {
+
+ /* "View.MemoryView":159
+ * cdef char order
+ * if mode == 'fortran':
+ * order = b'F' # <<<<<<<<<<<<<<
+ * self.mode = u'fortran'
+ * elif mode == 'c':
+ */
+ __pyx_v_order = 'F';
+
+ /* "View.MemoryView":160
+ * if mode == 'fortran':
+ * order = b'F'
+ * self.mode = u'fortran' # <<<<<<<<<<<<<<
+ * elif mode == 'c':
+ * order = b'C'
+ */
+ __Pyx_INCREF(__pyx_n_u_fortran);
+ __Pyx_GIVEREF(__pyx_n_u_fortran);
+ __Pyx_GOTREF(__pyx_v_self->mode);
+ __Pyx_DECREF(__pyx_v_self->mode);
+ __pyx_v_self->mode = __pyx_n_u_fortran;
+
+ /* "View.MemoryView":158
+ *
+ * cdef char order
+ * if mode == 'fortran': # <<<<<<<<<<<<<<
+ * order = b'F'
+ * self.mode = u'fortran'
+ */
+ goto __pyx_L10;
+ }
+
+ /* "View.MemoryView":161
+ * order = b'F'
+ * self.mode = u'fortran'
+ * elif mode == 'c': # <<<<<<<<<<<<<<
+ * order = b'C'
+ * self.mode = u'c'
+ */
+ __pyx_t_4 = (__Pyx_PyString_Equals(__pyx_v_mode, __pyx_n_s_c, Py_EQ)); if (unlikely(__pyx_t_4 < 0)) __PYX_ERR(2, 161, __pyx_L1_error)
+ if (likely(__pyx_t_4)) {
+
+ /* "View.MemoryView":162
+ * self.mode = u'fortran'
+ * elif mode == 'c':
+ * order = b'C' # <<<<<<<<<<<<<<
+ * self.mode = u'c'
+ * else:
+ */
+ __pyx_v_order = 'C';
+
+ /* "View.MemoryView":163
+ * elif mode == 'c':
+ * order = b'C'
+ * self.mode = u'c' # <<<<<<<<<<<<<<
+ * else:
+ * raise ValueError("Invalid mode, expected 'c' or 'fortran', got %s" % mode)
+ */
+ __Pyx_INCREF(__pyx_n_u_c);
+ __Pyx_GIVEREF(__pyx_n_u_c);
+ __Pyx_GOTREF(__pyx_v_self->mode);
+ __Pyx_DECREF(__pyx_v_self->mode);
+ __pyx_v_self->mode = __pyx_n_u_c;
+
+ /* "View.MemoryView":161
+ * order = b'F'
+ * self.mode = u'fortran'
+ * elif mode == 'c': # <<<<<<<<<<<<<<
+ * order = b'C'
+ * self.mode = u'c'
+ */
+ goto __pyx_L10;
+ }
+
+ /* "View.MemoryView":165
+ * self.mode = u'c'
+ * else:
+ * raise ValueError("Invalid mode, expected 'c' or 'fortran', got %s" % mode) # <<<<<<<<<<<<<<
+ *
+ * self.len = fill_contig_strides_array(self._shape, self._strides,
+ */
+ /*else*/ {
+ __pyx_t_3 = __Pyx_PyString_FormatSafe(__pyx_kp_s_Invalid_mode_expected_c_or_fortr, __pyx_v_mode); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 165, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_10 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_3); if (unlikely(!__pyx_t_10)) __PYX_ERR(2, 165, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_10);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __Pyx_Raise(__pyx_t_10, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+ __PYX_ERR(2, 165, __pyx_L1_error)
+ }
+ __pyx_L10:;
+
+ /* "View.MemoryView":167
+ * raise ValueError("Invalid mode, expected 'c' or 'fortran', got %s" % mode)
+ *
+ * self.len = fill_contig_strides_array(self._shape, self._strides, # <<<<<<<<<<<<<<
+ * itemsize, self.ndim, order)
+ *
+ */
+ __pyx_v_self->len = __pyx_fill_contig_strides_array(__pyx_v_self->_shape, __pyx_v_self->_strides, __pyx_v_itemsize, __pyx_v_self->ndim, __pyx_v_order);
+
+ /* "View.MemoryView":170
+ * itemsize, self.ndim, order)
+ *
+ * self.free_data = allocate_buffer # <<<<<<<<<<<<<<
+ * self.dtype_is_object = format == b'O'
+ * if allocate_buffer:
+ */
+ __pyx_v_self->free_data = __pyx_v_allocate_buffer;
+
+ /* "View.MemoryView":171
+ *
+ * self.free_data = allocate_buffer
+ * self.dtype_is_object = format == b'O' # <<<<<<<<<<<<<<
+ * if allocate_buffer:
+ *
+ */
+ __pyx_t_10 = PyObject_RichCompare(__pyx_v_format, __pyx_n_b_O, Py_EQ); __Pyx_XGOTREF(__pyx_t_10); if (unlikely(!__pyx_t_10)) __PYX_ERR(2, 171, __pyx_L1_error)
+ __pyx_t_4 = __Pyx_PyObject_IsTrue(__pyx_t_10); if (unlikely((__pyx_t_4 == (int)-1) && PyErr_Occurred())) __PYX_ERR(2, 171, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+ __pyx_v_self->dtype_is_object = __pyx_t_4;
+
+ /* "View.MemoryView":172
+ * self.free_data = allocate_buffer
+ * self.dtype_is_object = format == b'O'
+ * if allocate_buffer: # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_t_4 = (__pyx_v_allocate_buffer != 0);
+ if (__pyx_t_4) {
+
+ /* "View.MemoryView":175
+ *
+ *
+ * self.data = malloc(self.len) # <<<<<<<<<<<<<<
+ * if not self.data:
+ * raise MemoryError("unable to allocate array data.")
+ */
+ __pyx_v_self->data = ((char *)malloc(__pyx_v_self->len));
+
+ /* "View.MemoryView":176
+ *
+ * self.data = malloc(self.len)
+ * if not self.data: # <<<<<<<<<<<<<<
+ * raise MemoryError("unable to allocate array data.")
+ *
+ */
+ __pyx_t_4 = ((!(__pyx_v_self->data != 0)) != 0);
+ if (unlikely(__pyx_t_4)) {
+
+ /* "View.MemoryView":177
+ * self.data = malloc(self.len)
+ * if not self.data:
+ * raise MemoryError("unable to allocate array data.") # <<<<<<<<<<<<<<
+ *
+ * if self.dtype_is_object:
+ */
+ __pyx_t_10 = __Pyx_PyObject_Call(__pyx_builtin_MemoryError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_10)) __PYX_ERR(2, 177, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_10);
+ __Pyx_Raise(__pyx_t_10, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+ __PYX_ERR(2, 177, __pyx_L1_error)
+
+ /* "View.MemoryView":176
+ *
+ * self.data = malloc(self.len)
+ * if not self.data: # <<<<<<<<<<<<<<
+ * raise MemoryError("unable to allocate array data.")
+ *
+ */
+ }
+
+ /* "View.MemoryView":179
+ * raise MemoryError("unable to allocate array data.")
+ *
+ * if self.dtype_is_object: # <<<<<<<<<<<<<<
+ * p = self.data
+ * for i in range(self.len / itemsize):
+ */
+ __pyx_t_4 = (__pyx_v_self->dtype_is_object != 0);
+ if (__pyx_t_4) {
+
+ /* "View.MemoryView":180
+ *
+ * if self.dtype_is_object:
+ * p = self.data # <<<<<<<<<<<<<<
+ * for i in range(self.len / itemsize):
+ * p[i] = Py_None
+ */
+ __pyx_v_p = ((PyObject **)__pyx_v_self->data);
+
+ /* "View.MemoryView":181
+ * if self.dtype_is_object:
+ * p = self.data
+ * for i in range(self.len / itemsize): # <<<<<<<<<<<<<<
+ * p[i] = Py_None
+ * Py_INCREF(Py_None)
+ */
+ if (unlikely(__pyx_v_itemsize == 0)) {
+ PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+ __PYX_ERR(2, 181, __pyx_L1_error)
+ }
+ else if (sizeof(Py_ssize_t) == sizeof(long) && (!(((Py_ssize_t)-1) > 0)) && unlikely(__pyx_v_itemsize == (Py_ssize_t)-1) && unlikely(UNARY_NEG_WOULD_OVERFLOW(__pyx_v_self->len))) {
+ PyErr_SetString(PyExc_OverflowError, "value too large to perform division");
+ __PYX_ERR(2, 181, __pyx_L1_error)
+ }
+ __pyx_t_1 = __Pyx_div_Py_ssize_t(__pyx_v_self->len, __pyx_v_itemsize);
+ __pyx_t_9 = __pyx_t_1;
+ for (__pyx_t_11 = 0; __pyx_t_11 < __pyx_t_9; __pyx_t_11+=1) {
+ __pyx_v_i = __pyx_t_11;
+
+ /* "View.MemoryView":182
+ * p = self.data
+ * for i in range(self.len / itemsize):
+ * p[i] = Py_None # <<<<<<<<<<<<<<
+ * Py_INCREF(Py_None)
+ *
+ */
+ (__pyx_v_p[__pyx_v_i]) = Py_None;
+
+ /* "View.MemoryView":183
+ * for i in range(self.len / itemsize):
+ * p[i] = Py_None
+ * Py_INCREF(Py_None) # <<<<<<<<<<<<<<
+ *
+ * @cname('getbuffer')
+ */
+ Py_INCREF(Py_None);
+ }
+
+ /* "View.MemoryView":179
+ * raise MemoryError("unable to allocate array data.")
+ *
+ * if self.dtype_is_object: # <<<<<<<<<<<<<<
+ * p = self.data
+ * for i in range(self.len / itemsize):
+ */
+ }
+
+ /* "View.MemoryView":172
+ * self.free_data = allocate_buffer
+ * self.dtype_is_object = format == b'O'
+ * if allocate_buffer: # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ }
+
+ /* "View.MemoryView":123
+ * cdef bint dtype_is_object
+ *
+ * def __cinit__(array self, tuple shape, Py_ssize_t itemsize, format not None, # <<<<<<<<<<<<<<
+ * mode="c", bint allocate_buffer=True):
+ *
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_XDECREF(__pyx_t_5);
+ __Pyx_XDECREF(__pyx_t_6);
+ __Pyx_XDECREF(__pyx_t_10);
+ __Pyx_AddTraceback("View.MemoryView.array.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_XDECREF(__pyx_v_format);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":186
+ *
+ * @cname('getbuffer')
+ * def __getbuffer__(self, Py_buffer *info, int flags): # <<<<<<<<<<<<<<
+ * cdef int bufmode = -1
+ * if self.mode == u"c":
+ */
+
+/* Python wrapper */
+static CYTHON_UNUSED int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+static CYTHON_UNUSED int __pyx_array_getbuffer(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+ __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(((struct __pyx_array_obj *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__(struct __pyx_array_obj *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+ int __pyx_v_bufmode;
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ int __pyx_t_2;
+ PyObject *__pyx_t_3 = NULL;
+ char *__pyx_t_4;
+ Py_ssize_t __pyx_t_5;
+ int __pyx_t_6;
+ Py_ssize_t *__pyx_t_7;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ if (__pyx_v_info == NULL) {
+ PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete");
+ return -1;
+ }
+ __Pyx_RefNannySetupContext("__getbuffer__", 0);
+ __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None);
+ __Pyx_GIVEREF(__pyx_v_info->obj);
+
+ /* "View.MemoryView":187
+ * @cname('getbuffer')
+ * def __getbuffer__(self, Py_buffer *info, int flags):
+ * cdef int bufmode = -1 # <<<<<<<<<<<<<<
+ * if self.mode == u"c":
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+ __pyx_v_bufmode = -1;
+
+ /* "View.MemoryView":188
+ * def __getbuffer__(self, Py_buffer *info, int flags):
+ * cdef int bufmode = -1
+ * if self.mode == u"c": # <<<<<<<<<<<<<<
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * elif self.mode == u"fortran":
+ */
+ __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_self->mode, __pyx_n_u_c, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(2, 188, __pyx_L1_error)
+ __pyx_t_2 = (__pyx_t_1 != 0);
+ if (__pyx_t_2) {
+
+ /* "View.MemoryView":189
+ * cdef int bufmode = -1
+ * if self.mode == u"c":
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS # <<<<<<<<<<<<<<
+ * elif self.mode == u"fortran":
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ */
+ __pyx_v_bufmode = (PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS);
+
+ /* "View.MemoryView":188
+ * def __getbuffer__(self, Py_buffer *info, int flags):
+ * cdef int bufmode = -1
+ * if self.mode == u"c": # <<<<<<<<<<<<<<
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * elif self.mode == u"fortran":
+ */
+ goto __pyx_L3;
+ }
+
+ /* "View.MemoryView":190
+ * if self.mode == u"c":
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * elif self.mode == u"fortran": # <<<<<<<<<<<<<<
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * if not (flags & bufmode):
+ */
+ __pyx_t_2 = (__Pyx_PyUnicode_Equals(__pyx_v_self->mode, __pyx_n_u_fortran, Py_EQ)); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(2, 190, __pyx_L1_error)
+ __pyx_t_1 = (__pyx_t_2 != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":191
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * elif self.mode == u"fortran":
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS # <<<<<<<<<<<<<<
+ * if not (flags & bufmode):
+ * raise ValueError("Can only create a buffer that is contiguous in memory.")
+ */
+ __pyx_v_bufmode = (PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS);
+
+ /* "View.MemoryView":190
+ * if self.mode == u"c":
+ * bufmode = PyBUF_C_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * elif self.mode == u"fortran": # <<<<<<<<<<<<<<
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * if not (flags & bufmode):
+ */
+ }
+ __pyx_L3:;
+
+ /* "View.MemoryView":192
+ * elif self.mode == u"fortran":
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * if not (flags & bufmode): # <<<<<<<<<<<<<<
+ * raise ValueError("Can only create a buffer that is contiguous in memory.")
+ * info.buf = self.data
+ */
+ __pyx_t_1 = ((!((__pyx_v_flags & __pyx_v_bufmode) != 0)) != 0);
+ if (unlikely(__pyx_t_1)) {
+
+ /* "View.MemoryView":193
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * if not (flags & bufmode):
+ * raise ValueError("Can only create a buffer that is contiguous in memory.") # <<<<<<<<<<<<<<
+ * info.buf = self.data
+ * info.len = self.len
+ */
+ __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 193, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __PYX_ERR(2, 193, __pyx_L1_error)
+
+ /* "View.MemoryView":192
+ * elif self.mode == u"fortran":
+ * bufmode = PyBUF_F_CONTIGUOUS | PyBUF_ANY_CONTIGUOUS
+ * if not (flags & bufmode): # <<<<<<<<<<<<<<
+ * raise ValueError("Can only create a buffer that is contiguous in memory.")
+ * info.buf = self.data
+ */
+ }
+
+ /* "View.MemoryView":194
+ * if not (flags & bufmode):
+ * raise ValueError("Can only create a buffer that is contiguous in memory.")
+ * info.buf = self.data # <<<<<<<<<<<<<<
+ * info.len = self.len
+ * info.ndim = self.ndim
+ */
+ __pyx_t_4 = __pyx_v_self->data;
+ __pyx_v_info->buf = __pyx_t_4;
+
+ /* "View.MemoryView":195
+ * raise ValueError("Can only create a buffer that is contiguous in memory.")
+ * info.buf = self.data
+ * info.len = self.len # <<<<<<<<<<<<<<
+ * info.ndim = self.ndim
+ * info.shape = self._shape
+ */
+ __pyx_t_5 = __pyx_v_self->len;
+ __pyx_v_info->len = __pyx_t_5;
+
+ /* "View.MemoryView":196
+ * info.buf = self.data
+ * info.len = self.len
+ * info.ndim = self.ndim # <<<<<<<<<<<<<<
+ * info.shape = self._shape
+ * info.strides = self._strides
+ */
+ __pyx_t_6 = __pyx_v_self->ndim;
+ __pyx_v_info->ndim = __pyx_t_6;
+
+ /* "View.MemoryView":197
+ * info.len = self.len
+ * info.ndim = self.ndim
+ * info.shape = self._shape # <<<<<<<<<<<<<<
+ * info.strides = self._strides
+ * info.suboffsets = NULL
+ */
+ __pyx_t_7 = __pyx_v_self->_shape;
+ __pyx_v_info->shape = __pyx_t_7;
+
+ /* "View.MemoryView":198
+ * info.ndim = self.ndim
+ * info.shape = self._shape
+ * info.strides = self._strides # <<<<<<<<<<<<<<
+ * info.suboffsets = NULL
+ * info.itemsize = self.itemsize
+ */
+ __pyx_t_7 = __pyx_v_self->_strides;
+ __pyx_v_info->strides = __pyx_t_7;
+
+ /* "View.MemoryView":199
+ * info.shape = self._shape
+ * info.strides = self._strides
+ * info.suboffsets = NULL # <<<<<<<<<<<<<<
+ * info.itemsize = self.itemsize
+ * info.readonly = 0
+ */
+ __pyx_v_info->suboffsets = NULL;
+
+ /* "View.MemoryView":200
+ * info.strides = self._strides
+ * info.suboffsets = NULL
+ * info.itemsize = self.itemsize # <<<<<<<<<<<<<<
+ * info.readonly = 0
+ *
+ */
+ __pyx_t_5 = __pyx_v_self->itemsize;
+ __pyx_v_info->itemsize = __pyx_t_5;
+
+ /* "View.MemoryView":201
+ * info.suboffsets = NULL
+ * info.itemsize = self.itemsize
+ * info.readonly = 0 # <<<<<<<<<<<<<<
+ *
+ * if flags & PyBUF_FORMAT:
+ */
+ __pyx_v_info->readonly = 0;
+
+ /* "View.MemoryView":203
+ * info.readonly = 0
+ *
+ * if flags & PyBUF_FORMAT: # <<<<<<<<<<<<<<
+ * info.format = self.format
+ * else:
+ */
+ __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":204
+ *
+ * if flags & PyBUF_FORMAT:
+ * info.format = self.format # <<<<<<<<<<<<<<
+ * else:
+ * info.format = NULL
+ */
+ __pyx_t_4 = __pyx_v_self->format;
+ __pyx_v_info->format = __pyx_t_4;
+
+ /* "View.MemoryView":203
+ * info.readonly = 0
+ *
+ * if flags & PyBUF_FORMAT: # <<<<<<<<<<<<<<
+ * info.format = self.format
+ * else:
+ */
+ goto __pyx_L5;
+ }
+
+ /* "View.MemoryView":206
+ * info.format = self.format
+ * else:
+ * info.format = NULL # <<<<<<<<<<<<<<
+ *
+ * info.obj = self
+ */
+ /*else*/ {
+ __pyx_v_info->format = NULL;
+ }
+ __pyx_L5:;
+
+ /* "View.MemoryView":208
+ * info.format = NULL
+ *
+ * info.obj = self # <<<<<<<<<<<<<<
+ *
+ * __pyx_getbuffer = capsule( &__pyx_array_getbuffer, "getbuffer(obj, view, flags)")
+ */
+ __Pyx_INCREF(((PyObject *)__pyx_v_self));
+ __Pyx_GIVEREF(((PyObject *)__pyx_v_self));
+ __Pyx_GOTREF(__pyx_v_info->obj);
+ __Pyx_DECREF(__pyx_v_info->obj);
+ __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+
+ /* "View.MemoryView":186
+ *
+ * @cname('getbuffer')
+ * def __getbuffer__(self, Py_buffer *info, int flags): # <<<<<<<<<<<<<<
+ * cdef int bufmode = -1
+ * if self.mode == u"c":
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_AddTraceback("View.MemoryView.array.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ if (__pyx_v_info->obj != NULL) {
+ __Pyx_GOTREF(__pyx_v_info->obj);
+ __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+ }
+ goto __pyx_L2;
+ __pyx_L0:;
+ if (__pyx_v_info->obj == Py_None) {
+ __Pyx_GOTREF(__pyx_v_info->obj);
+ __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0;
+ }
+ __pyx_L2:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":212
+ * __pyx_getbuffer = capsule( &__pyx_array_getbuffer, "getbuffer(obj, view, flags)")
+ *
+ * def __dealloc__(array self): # <<<<<<<<<<<<<<
+ * if self.callback_free_data != NULL:
+ * self.callback_free_data(self.data)
+ */
+
+/* Python wrapper */
+static void __pyx_array___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_array___dealloc__(PyObject *__pyx_v_self) {
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+ __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(((struct __pyx_array_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__(struct __pyx_array_obj *__pyx_v_self) {
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ __Pyx_RefNannySetupContext("__dealloc__", 0);
+
+ /* "View.MemoryView":213
+ *
+ * def __dealloc__(array self):
+ * if self.callback_free_data != NULL: # <<<<<<<<<<<<<<
+ * self.callback_free_data(self.data)
+ * elif self.free_data:
+ */
+ __pyx_t_1 = ((__pyx_v_self->callback_free_data != NULL) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":214
+ * def __dealloc__(array self):
+ * if self.callback_free_data != NULL:
+ * self.callback_free_data(self.data) # <<<<<<<<<<<<<<
+ * elif self.free_data:
+ * if self.dtype_is_object:
+ */
+ __pyx_v_self->callback_free_data(__pyx_v_self->data);
+
+ /* "View.MemoryView":213
+ *
+ * def __dealloc__(array self):
+ * if self.callback_free_data != NULL: # <<<<<<<<<<<<<<
+ * self.callback_free_data(self.data)
+ * elif self.free_data:
+ */
+ goto __pyx_L3;
+ }
+
+ /* "View.MemoryView":215
+ * if self.callback_free_data != NULL:
+ * self.callback_free_data(self.data)
+ * elif self.free_data: # <<<<<<<<<<<<<<
+ * if self.dtype_is_object:
+ * refcount_objects_in_slice(self.data, self._shape,
+ */
+ __pyx_t_1 = (__pyx_v_self->free_data != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":216
+ * self.callback_free_data(self.data)
+ * elif self.free_data:
+ * if self.dtype_is_object: # <<<<<<<<<<<<<<
+ * refcount_objects_in_slice(self.data, self._shape,
+ * self._strides, self.ndim, False)
+ */
+ __pyx_t_1 = (__pyx_v_self->dtype_is_object != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":217
+ * elif self.free_data:
+ * if self.dtype_is_object:
+ * refcount_objects_in_slice(self.data, self._shape, # <<<<<<<<<<<<<<
+ * self._strides, self.ndim, False)
+ * free(self.data)
+ */
+ __pyx_memoryview_refcount_objects_in_slice(__pyx_v_self->data, __pyx_v_self->_shape, __pyx_v_self->_strides, __pyx_v_self->ndim, 0);
+
+ /* "View.MemoryView":216
+ * self.callback_free_data(self.data)
+ * elif self.free_data:
+ * if self.dtype_is_object: # <<<<<<<<<<<<<<
+ * refcount_objects_in_slice(self.data, self._shape,
+ * self._strides, self.ndim, False)
+ */
+ }
+
+ /* "View.MemoryView":219
+ * refcount_objects_in_slice(self.data, self._shape,
+ * self._strides, self.ndim, False)
+ * free(self.data) # <<<<<<<<<<<<<<
+ * PyObject_Free(self._shape)
+ *
+ */
+ free(__pyx_v_self->data);
+
+ /* "View.MemoryView":215
+ * if self.callback_free_data != NULL:
+ * self.callback_free_data(self.data)
+ * elif self.free_data: # <<<<<<<<<<<<<<
+ * if self.dtype_is_object:
+ * refcount_objects_in_slice(self.data, self._shape,
+ */
+ }
+ __pyx_L3:;
+
+ /* "View.MemoryView":220
+ * self._strides, self.ndim, False)
+ * free(self.data)
+ * PyObject_Free(self._shape) # <<<<<<<<<<<<<<
+ *
+ * @property
+ */
+ PyObject_Free(__pyx_v_self->_shape);
+
+ /* "View.MemoryView":212
+ * __pyx_getbuffer = capsule( &__pyx_array_getbuffer, "getbuffer(obj, view, flags)")
+ *
+ * def __dealloc__(array self): # <<<<<<<<<<<<<<
+ * if self.callback_free_data != NULL:
+ * self.callback_free_data(self.data)
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+/* "View.MemoryView":223
+ *
+ * @property
+ * def memview(self): # <<<<<<<<<<<<<<
+ * return self.get_memview()
+ *
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__(PyObject *__pyx_v_self) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__get__ (wrapper)", 0);
+ __pyx_r = __pyx_pf_15View_dot_MemoryView_5array_7memview___get__(((struct __pyx_array_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(struct __pyx_array_obj *__pyx_v_self) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__get__", 0);
+
+ /* "View.MemoryView":224
+ * @property
+ * def memview(self):
+ * return self.get_memview() # <<<<<<<<<<<<<<
+ *
+ * @cname('get_memview')
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = ((struct __pyx_vtabstruct_array *)__pyx_v_self->__pyx_vtab)->get_memview(__pyx_v_self); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 224, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_r = __pyx_t_1;
+ __pyx_t_1 = 0;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":223
+ *
+ * @property
+ * def memview(self): # <<<<<<<<<<<<<<
+ * return self.get_memview()
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("View.MemoryView.array.memview.__get__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":227
+ *
+ * @cname('get_memview')
+ * cdef get_memview(self): # <<<<<<<<<<<<<<
+ * flags = PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ * return memoryview(self, flags, self.dtype_is_object)
+ */
+
+static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v_self) {
+ int __pyx_v_flags;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ PyObject *__pyx_t_2 = NULL;
+ PyObject *__pyx_t_3 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("get_memview", 0);
+
+ /* "View.MemoryView":228
+ * @cname('get_memview')
+ * cdef get_memview(self):
+ * flags = PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE # <<<<<<<<<<<<<<
+ * return memoryview(self, flags, self.dtype_is_object)
+ *
+ */
+ __pyx_v_flags = ((PyBUF_ANY_CONTIGUOUS | PyBUF_FORMAT) | PyBUF_WRITABLE);
+
+ /* "View.MemoryView":229
+ * cdef get_memview(self):
+ * flags = PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ * return memoryview(self, flags, self.dtype_is_object) # <<<<<<<<<<<<<<
+ *
+ * def __len__(self):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_flags); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 229, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_PyBool_FromLong(__pyx_v_self->dtype_is_object); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 229, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 229, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __Pyx_INCREF(((PyObject *)__pyx_v_self));
+ __Pyx_GIVEREF(((PyObject *)__pyx_v_self));
+ PyTuple_SET_ITEM(__pyx_t_3, 0, ((PyObject *)__pyx_v_self));
+ __Pyx_GIVEREF(__pyx_t_1);
+ PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_1);
+ __Pyx_GIVEREF(__pyx_t_2);
+ PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_2);
+ __pyx_t_1 = 0;
+ __pyx_t_2 = 0;
+ __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_memoryview_type), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 229, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_r = __pyx_t_2;
+ __pyx_t_2 = 0;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":227
+ *
+ * @cname('get_memview')
+ * cdef get_memview(self): # <<<<<<<<<<<<<<
+ * flags = PyBUF_ANY_CONTIGUOUS|PyBUF_FORMAT|PyBUF_WRITABLE
+ * return memoryview(self, flags, self.dtype_is_object)
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_AddTraceback("View.MemoryView.array.get_memview", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":231
+ * return memoryview(self, flags, self.dtype_is_object)
+ *
+ * def __len__(self): # <<<<<<<<<<<<<<
+ * return self._shape[0]
+ *
+ */
+
+/* Python wrapper */
+static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self); /*proto*/
+static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self) {
+ Py_ssize_t __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__len__ (wrapper)", 0);
+ __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(((struct __pyx_array_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__len__(struct __pyx_array_obj *__pyx_v_self) {
+ Py_ssize_t __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__len__", 0);
+
+ /* "View.MemoryView":232
+ *
+ * def __len__(self):
+ * return self._shape[0] # <<<<<<<<<<<<<<
+ *
+ * def __getattr__(self, attr):
+ */
+ __pyx_r = (__pyx_v_self->_shape[0]);
+ goto __pyx_L0;
+
+ /* "View.MemoryView":231
+ * return memoryview(self, flags, self.dtype_is_object)
+ *
+ * def __len__(self): # <<<<<<<<<<<<<<
+ * return self._shape[0]
+ *
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":234
+ * return self._shape[0]
+ *
+ * def __getattr__(self, attr): # <<<<<<<<<<<<<<
+ * return getattr(self.memview, attr)
+ *
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_attr); /*proto*/
+static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_attr) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__getattr__ (wrapper)", 0);
+ __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_attr));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__getattr__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_attr) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ PyObject *__pyx_t_2 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__getattr__", 0);
+
+ /* "View.MemoryView":235
+ *
+ * def __getattr__(self, attr):
+ * return getattr(self.memview, attr) # <<<<<<<<<<<<<<
+ *
+ * def __getitem__(self, item):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 235, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_GetAttr(__pyx_t_1, __pyx_v_attr); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 235, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_r = __pyx_t_2;
+ __pyx_t_2 = 0;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":234
+ * return self._shape[0]
+ *
+ * def __getattr__(self, attr): # <<<<<<<<<<<<<<
+ * return getattr(self.memview, attr)
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_AddTraceback("View.MemoryView.array.__getattr__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":237
+ * return getattr(self.memview, attr)
+ *
+ * def __getitem__(self, item): # <<<<<<<<<<<<<<
+ * return self.memview[item]
+ *
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item); /*proto*/
+static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__getitem__ (wrapper)", 0);
+ __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_item));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__getitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ PyObject *__pyx_t_2 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__getitem__", 0);
+
+ /* "View.MemoryView":238
+ *
+ * def __getitem__(self, item):
+ * return self.memview[item] # <<<<<<<<<<<<<<
+ *
+ * def __setitem__(self, item, value):
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 238, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_t_1, __pyx_v_item); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 238, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_r = __pyx_t_2;
+ __pyx_t_2 = 0;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":237
+ * return getattr(self.memview, attr)
+ *
+ * def __getitem__(self, item): # <<<<<<<<<<<<<<
+ * return self.memview[item]
+ *
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_AddTraceback("View.MemoryView.array.__getitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":240
+ * return self.memview[item]
+ *
+ * def __setitem__(self, item, value): # <<<<<<<<<<<<<<
+ * self.memview[item] = value
+ *
+ */
+
+/* Python wrapper */
+static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value); /*proto*/
+static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__setitem__ (wrapper)", 0);
+ __pyx_r = __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v_item), ((PyObject *)__pyx_v_value));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__(struct __pyx_array_obj *__pyx_v_self, PyObject *__pyx_v_item, PyObject *__pyx_v_value) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__setitem__", 0);
+
+ /* "View.MemoryView":241
+ *
+ * def __setitem__(self, item, value):
+ * self.memview[item] = value # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_self), __pyx_n_s_memview); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 241, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ if (unlikely(PyObject_SetItem(__pyx_t_1, __pyx_v_item, __pyx_v_value) < 0)) __PYX_ERR(2, 241, __pyx_L1_error)
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+ /* "View.MemoryView":240
+ * return self.memview[item]
+ *
+ * def __setitem__(self, item, value): # <<<<<<<<<<<<<<
+ * self.memview[item] = value
+ *
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("View.MemoryView.array.__setitem__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self): # <<<<<<<<<<<<<<
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/
+static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+ __pyx_r = __pyx_pf___pyx_array___reduce_cython__(((struct __pyx_array_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__reduce_cython__", 0);
+
+ /* "(tree fragment)":2
+ * def __reduce_cython__(self):
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ */
+ __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 2, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_Raise(__pyx_t_1, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __PYX_ERR(2, 2, __pyx_L1_error)
+
+ /* "(tree fragment)":1
+ * def __reduce_cython__(self): # <<<<<<<<<<<<<<
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ * def __setstate_cython__(self, __pyx_state):
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("View.MemoryView.array.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<<
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/
+static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+ __pyx_r = __pyx_pf___pyx_array_2__setstate_cython__(((struct __pyx_array_obj *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED struct __pyx_array_obj *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__setstate_cython__", 0);
+
+ /* "(tree fragment)":4
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ * def __setstate_cython__(self, __pyx_state):
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<<
+ */
+ __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 4, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_Raise(__pyx_t_1, 0, 0, 0);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __PYX_ERR(2, 4, __pyx_L1_error)
+
+ /* "(tree fragment)":3
+ * def __reduce_cython__(self):
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<<
+ * raise TypeError("no default __reduce__ due to non-trivial __cinit__")
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("View.MemoryView.array.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":245
+ *
+ * @cname("__pyx_array_new")
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, # <<<<<<<<<<<<<<
+ * char *mode, char *buf):
+ * cdef array result
+ */
+
+static struct __pyx_array_obj *__pyx_array_new(PyObject *__pyx_v_shape, Py_ssize_t __pyx_v_itemsize, char *__pyx_v_format, char *__pyx_v_mode, char *__pyx_v_buf) {
+ struct __pyx_array_obj *__pyx_v_result = 0;
+ struct __pyx_array_obj *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ PyObject *__pyx_t_3 = NULL;
+ PyObject *__pyx_t_4 = NULL;
+ PyObject *__pyx_t_5 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("array_cwrapper", 0);
+
+ /* "View.MemoryView":249
+ * cdef array result
+ *
+ * if buf == NULL: # <<<<<<<<<<<<<<
+ * result = array(shape, itemsize, format, mode.decode('ASCII'))
+ * else:
+ */
+ __pyx_t_1 = ((__pyx_v_buf == NULL) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":250
+ *
+ * if buf == NULL:
+ * result = array(shape, itemsize, format, mode.decode('ASCII')) # <<<<<<<<<<<<<<
+ * else:
+ * result = array(shape, itemsize, format, mode.decode('ASCII'),
+ */
+ __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_itemsize); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 250, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_3 = __Pyx_PyBytes_FromString(__pyx_v_format); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 250, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_4 = __Pyx_decode_c_string(__pyx_v_mode, 0, strlen(__pyx_v_mode), NULL, NULL, PyUnicode_DecodeASCII); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 250, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_5 = PyTuple_New(4); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 250, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __Pyx_INCREF(__pyx_v_shape);
+ __Pyx_GIVEREF(__pyx_v_shape);
+ PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_v_shape);
+ __Pyx_GIVEREF(__pyx_t_2);
+ PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_2);
+ __Pyx_GIVEREF(__pyx_t_3);
+ PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_t_3);
+ __Pyx_GIVEREF(__pyx_t_4);
+ PyTuple_SET_ITEM(__pyx_t_5, 3, __pyx_t_4);
+ __pyx_t_2 = 0;
+ __pyx_t_3 = 0;
+ __pyx_t_4 = 0;
+ __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)__pyx_array_type), __pyx_t_5, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 250, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+ __pyx_v_result = ((struct __pyx_array_obj *)__pyx_t_4);
+ __pyx_t_4 = 0;
+
+ /* "View.MemoryView":249
+ * cdef array result
+ *
+ * if buf == NULL: # <<<<<<<<<<<<<<
+ * result = array(shape, itemsize, format, mode.decode('ASCII'))
+ * else:
+ */
+ goto __pyx_L3;
+ }
+
+ /* "View.MemoryView":252
+ * result = array(shape, itemsize, format, mode.decode('ASCII'))
+ * else:
+ * result = array(shape, itemsize, format, mode.decode('ASCII'), # <<<<<<<<<<<<<<
+ * allocate_buffer=False)
+ * result.data = buf
+ */
+ /*else*/ {
+ __pyx_t_4 = PyInt_FromSsize_t(__pyx_v_itemsize); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 252, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_5 = __Pyx_PyBytes_FromString(__pyx_v_format); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 252, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __pyx_t_3 = __Pyx_decode_c_string(__pyx_v_mode, 0, strlen(__pyx_v_mode), NULL, NULL, PyUnicode_DecodeASCII); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 252, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ __pyx_t_2 = PyTuple_New(4); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 252, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __Pyx_INCREF(__pyx_v_shape);
+ __Pyx_GIVEREF(__pyx_v_shape);
+ PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_shape);
+ __Pyx_GIVEREF(__pyx_t_4);
+ PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_4);
+ __Pyx_GIVEREF(__pyx_t_5);
+ PyTuple_SET_ITEM(__pyx_t_2, 2, __pyx_t_5);
+ __Pyx_GIVEREF(__pyx_t_3);
+ PyTuple_SET_ITEM(__pyx_t_2, 3, __pyx_t_3);
+ __pyx_t_4 = 0;
+ __pyx_t_5 = 0;
+ __pyx_t_3 = 0;
+
+ /* "View.MemoryView":253
+ * else:
+ * result = array(shape, itemsize, format, mode.decode('ASCII'),
+ * allocate_buffer=False) # <<<<<<<<<<<<<<
+ * result.data = buf
+ *
+ */
+ __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 253, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_3);
+ if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_allocate_buffer, Py_False) < 0) __PYX_ERR(2, 253, __pyx_L1_error)
+
+ /* "View.MemoryView":252
+ * result = array(shape, itemsize, format, mode.decode('ASCII'))
+ * else:
+ * result = array(shape, itemsize, format, mode.decode('ASCII'), # <<<<<<<<<<<<<<
+ * allocate_buffer=False)
+ * result.data = buf
+ */
+ __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)__pyx_array_type), __pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 252, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+ __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+ __pyx_v_result = ((struct __pyx_array_obj *)__pyx_t_5);
+ __pyx_t_5 = 0;
+
+ /* "View.MemoryView":254
+ * result = array(shape, itemsize, format, mode.decode('ASCII'),
+ * allocate_buffer=False)
+ * result.data = buf # <<<<<<<<<<<<<<
+ *
+ * return result
+ */
+ __pyx_v_result->data = __pyx_v_buf;
+ }
+ __pyx_L3:;
+
+ /* "View.MemoryView":256
+ * result.data = buf
+ *
+ * return result # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __Pyx_XDECREF(((PyObject *)__pyx_r));
+ __Pyx_INCREF(((PyObject *)__pyx_v_result));
+ __pyx_r = __pyx_v_result;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":245
+ *
+ * @cname("__pyx_array_new")
+ * cdef array array_cwrapper(tuple shape, Py_ssize_t itemsize, char *format, # <<<<<<<<<<<<<<
+ * char *mode, char *buf):
+ * cdef array result
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_2);
+ __Pyx_XDECREF(__pyx_t_3);
+ __Pyx_XDECREF(__pyx_t_4);
+ __Pyx_XDECREF(__pyx_t_5);
+ __Pyx_AddTraceback("View.MemoryView.array_cwrapper", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = 0;
+ __pyx_L0:;
+ __Pyx_XDECREF((PyObject *)__pyx_v_result);
+ __Pyx_XGIVEREF((PyObject *)__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":282
+ * cdef class Enum(object):
+ * cdef object name
+ * def __init__(self, name): # <<<<<<<<<<<<<<
+ * self.name = name
+ * def __repr__(self):
+ */
+
+/* Python wrapper */
+static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_name = 0;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__init__ (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_name,0};
+ PyObject* values[1] = {0};
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_name)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__init__") < 0)) __PYX_ERR(2, 282, __pyx_L3_error)
+ }
+ } else if (PyTuple_GET_SIZE(__pyx_args) != 1) {
+ goto __pyx_L5_argtuple_error;
+ } else {
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ }
+ __pyx_v_name = values[0];
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("__init__", 1, 1, 1, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(2, 282, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("View.MemoryView.Enum.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return -1;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self), __pyx_v_name);
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v_name) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__init__", 0);
+
+ /* "View.MemoryView":283
+ * cdef object name
+ * def __init__(self, name):
+ * self.name = name # <<<<<<<<<<<<<<
+ * def __repr__(self):
+ * return self.name
+ */
+ __Pyx_INCREF(__pyx_v_name);
+ __Pyx_GIVEREF(__pyx_v_name);
+ __Pyx_GOTREF(__pyx_v_self->name);
+ __Pyx_DECREF(__pyx_v_self->name);
+ __pyx_v_self->name = __pyx_v_name;
+
+ /* "View.MemoryView":282
+ * cdef class Enum(object):
+ * cdef object name
+ * def __init__(self, name): # <<<<<<<<<<<<<<
+ * self.name = name
+ * def __repr__(self):
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":284
+ * def __init__(self, name):
+ * self.name = name
+ * def __repr__(self): # <<<<<<<<<<<<<<
+ * return self.name
+ *
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self); /*proto*/
+static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__repr__ (wrapper)", 0);
+ __pyx_r = __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_2__repr__(struct __pyx_MemviewEnum_obj *__pyx_v_self) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__repr__", 0);
+
+ /* "View.MemoryView":285
+ * self.name = name
+ * def __repr__(self):
+ * return self.name # <<<<<<<<<<<<<<
+ *
+ * cdef generic = Enum("")
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_INCREF(__pyx_v_self->name);
+ __pyx_r = __pyx_v_self->name;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":284
+ * def __init__(self, name):
+ * self.name = name
+ * def __repr__(self): # <<<<<<<<<<<<<<
+ * return self.name
+ *
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "(tree fragment)":1
+ * def __reduce_cython__(self): # <<<<<<<<<<<<<<
+ * cdef tuple state
+ * cdef object _dict
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/
+static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0);
+ __pyx_r = __pyx_pf___pyx_MemviewEnum___reduce_cython__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self) {
+ PyObject *__pyx_v_state = 0;
+ PyObject *__pyx_v__dict = 0;
+ int __pyx_v_use_setstate;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_t_2;
+ int __pyx_t_3;
+ PyObject *__pyx_t_4 = NULL;
+ PyObject *__pyx_t_5 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__reduce_cython__", 0);
+
+ /* "(tree fragment)":5
+ * cdef object _dict
+ * cdef bint use_setstate
+ * state = (self.name,) # <<<<<<<<<<<<<<
+ * _dict = getattr(self, '__dict__', None)
+ * if _dict is not None:
+ */
+ __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 5, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_INCREF(__pyx_v_self->name);
+ __Pyx_GIVEREF(__pyx_v_self->name);
+ PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_self->name);
+ __pyx_v_state = ((PyObject*)__pyx_t_1);
+ __pyx_t_1 = 0;
+
+ /* "(tree fragment)":6
+ * cdef bint use_setstate
+ * state = (self.name,)
+ * _dict = getattr(self, '__dict__', None) # <<<<<<<<<<<<<<
+ * if _dict is not None:
+ * state += (_dict,)
+ */
+ __pyx_t_1 = __Pyx_GetAttr3(((PyObject *)__pyx_v_self), __pyx_n_s_dict, Py_None); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 6, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_v__dict = __pyx_t_1;
+ __pyx_t_1 = 0;
+
+ /* "(tree fragment)":7
+ * state = (self.name,)
+ * _dict = getattr(self, '__dict__', None)
+ * if _dict is not None: # <<<<<<<<<<<<<<
+ * state += (_dict,)
+ * use_setstate = True
+ */
+ __pyx_t_2 = (__pyx_v__dict != Py_None);
+ __pyx_t_3 = (__pyx_t_2 != 0);
+ if (__pyx_t_3) {
+
+ /* "(tree fragment)":8
+ * _dict = getattr(self, '__dict__', None)
+ * if _dict is not None:
+ * state += (_dict,) # <<<<<<<<<<<<<<
+ * use_setstate = True
+ * else:
+ */
+ __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 8, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_INCREF(__pyx_v__dict);
+ __Pyx_GIVEREF(__pyx_v__dict);
+ PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v__dict);
+ __pyx_t_4 = PyNumber_InPlaceAdd(__pyx_v_state, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 8, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __Pyx_DECREF_SET(__pyx_v_state, ((PyObject*)__pyx_t_4));
+ __pyx_t_4 = 0;
+
+ /* "(tree fragment)":9
+ * if _dict is not None:
+ * state += (_dict,)
+ * use_setstate = True # <<<<<<<<<<<<<<
+ * else:
+ * use_setstate = self.name is not None
+ */
+ __pyx_v_use_setstate = 1;
+
+ /* "(tree fragment)":7
+ * state = (self.name,)
+ * _dict = getattr(self, '__dict__', None)
+ * if _dict is not None: # <<<<<<<<<<<<<<
+ * state += (_dict,)
+ * use_setstate = True
+ */
+ goto __pyx_L3;
+ }
+
+ /* "(tree fragment)":11
+ * use_setstate = True
+ * else:
+ * use_setstate = self.name is not None # <<<<<<<<<<<<<<
+ * if use_setstate:
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, None), state
+ */
+ /*else*/ {
+ __pyx_t_3 = (__pyx_v_self->name != Py_None);
+ __pyx_v_use_setstate = __pyx_t_3;
+ }
+ __pyx_L3:;
+
+ /* "(tree fragment)":12
+ * else:
+ * use_setstate = self.name is not None
+ * if use_setstate: # <<<<<<<<<<<<<<
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, None), state
+ * else:
+ */
+ __pyx_t_3 = (__pyx_v_use_setstate != 0);
+ if (__pyx_t_3) {
+
+ /* "(tree fragment)":13
+ * use_setstate = self.name is not None
+ * if use_setstate:
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, None), state # <<<<<<<<<<<<<<
+ * else:
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, state)
+ */
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_pyx_unpickle_Enum); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 13, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 13, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+ __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+ PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+ __Pyx_INCREF(__pyx_int_184977713);
+ __Pyx_GIVEREF(__pyx_int_184977713);
+ PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_int_184977713);
+ __Pyx_INCREF(Py_None);
+ __Pyx_GIVEREF(Py_None);
+ PyTuple_SET_ITEM(__pyx_t_1, 2, Py_None);
+ __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 13, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __Pyx_GIVEREF(__pyx_t_4);
+ PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4);
+ __Pyx_GIVEREF(__pyx_t_1);
+ PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_1);
+ __Pyx_INCREF(__pyx_v_state);
+ __Pyx_GIVEREF(__pyx_v_state);
+ PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_v_state);
+ __pyx_t_4 = 0;
+ __pyx_t_1 = 0;
+ __pyx_r = __pyx_t_5;
+ __pyx_t_5 = 0;
+ goto __pyx_L0;
+
+ /* "(tree fragment)":12
+ * else:
+ * use_setstate = self.name is not None
+ * if use_setstate: # <<<<<<<<<<<<<<
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, None), state
+ * else:
+ */
+ }
+
+ /* "(tree fragment)":15
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, None), state
+ * else:
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, state) # <<<<<<<<<<<<<<
+ * def __setstate_cython__(self, __pyx_state):
+ * __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+ /*else*/ {
+ __Pyx_XDECREF(__pyx_r);
+ __Pyx_GetModuleGlobalName(__pyx_t_5, __pyx_n_s_pyx_unpickle_Enum); if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 15, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 15, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_INCREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+ __Pyx_GIVEREF(((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+ PyTuple_SET_ITEM(__pyx_t_1, 0, ((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))));
+ __Pyx_INCREF(__pyx_int_184977713);
+ __Pyx_GIVEREF(__pyx_int_184977713);
+ PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_int_184977713);
+ __Pyx_INCREF(__pyx_v_state);
+ __Pyx_GIVEREF(__pyx_v_state);
+ PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_v_state);
+ __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 15, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_4);
+ __Pyx_GIVEREF(__pyx_t_5);
+ PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_5);
+ __Pyx_GIVEREF(__pyx_t_1);
+ PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1);
+ __pyx_t_5 = 0;
+ __pyx_t_1 = 0;
+ __pyx_r = __pyx_t_4;
+ __pyx_t_4 = 0;
+ goto __pyx_L0;
+ }
+
+ /* "(tree fragment)":1
+ * def __reduce_cython__(self): # <<<<<<<<<<<<<<
+ * cdef tuple state
+ * cdef object _dict
+ */
+
+ /* function exit code */
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_XDECREF(__pyx_t_4);
+ __Pyx_XDECREF(__pyx_t_5);
+ __Pyx_AddTraceback("View.MemoryView.Enum.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XDECREF(__pyx_v_state);
+ __Pyx_XDECREF(__pyx_v__dict);
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "(tree fragment)":16
+ * else:
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, state)
+ * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<<
+ * __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+
+/* Python wrapper */
+static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/
+static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) {
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0);
+ __pyx_r = __pyx_pf___pyx_MemviewEnum_2__setstate_cython__(((struct __pyx_MemviewEnum_obj *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct __pyx_MemviewEnum_obj *__pyx_v_self, PyObject *__pyx_v___pyx_state) {
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__setstate_cython__", 0);
+
+ /* "(tree fragment)":17
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, state)
+ * def __setstate_cython__(self, __pyx_state):
+ * __pyx_unpickle_Enum__set_state(self, __pyx_state) # <<<<<<<<<<<<<<
+ */
+ if (!(likely(PyTuple_CheckExact(__pyx_v___pyx_state))||((__pyx_v___pyx_state) == Py_None)||((void)PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_v___pyx_state)->tp_name), 0))) __PYX_ERR(2, 17, __pyx_L1_error)
+ __pyx_t_1 = __pyx_unpickle_Enum__set_state(__pyx_v_self, ((PyObject*)__pyx_v___pyx_state)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 17, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+ /* "(tree fragment)":16
+ * else:
+ * return __pyx_unpickle_Enum, (type(self), 0xb068931, state)
+ * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<<
+ * __pyx_unpickle_Enum__set_state(self, __pyx_state)
+ */
+
+ /* function exit code */
+ __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_XDECREF(__pyx_t_1);
+ __Pyx_AddTraceback("View.MemoryView.Enum.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = NULL;
+ __pyx_L0:;
+ __Pyx_XGIVEREF(__pyx_r);
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":299
+ *
+ * @cname('__pyx_align_pointer')
+ * cdef void *align_pointer(void *memory, size_t alignment) nogil: # <<<<<<<<<<<<<<
+ * "Align pointer memory on a given boundary"
+ * cdef Py_intptr_t aligned_p = memory
+ */
+
+static void *__pyx_align_pointer(void *__pyx_v_memory, size_t __pyx_v_alignment) {
+ Py_intptr_t __pyx_v_aligned_p;
+ size_t __pyx_v_offset;
+ void *__pyx_r;
+ int __pyx_t_1;
+
+ /* "View.MemoryView":301
+ * cdef void *align_pointer(void *memory, size_t alignment) nogil:
+ * "Align pointer memory on a given boundary"
+ * cdef Py_intptr_t aligned_p = memory # <<<<<<<<<<<<<<
+ * cdef size_t offset
+ *
+ */
+ __pyx_v_aligned_p = ((Py_intptr_t)__pyx_v_memory);
+
+ /* "View.MemoryView":305
+ *
+ * with cython.cdivision(True):
+ * offset = aligned_p % alignment # <<<<<<<<<<<<<<
+ *
+ * if offset > 0:
+ */
+ __pyx_v_offset = (__pyx_v_aligned_p % __pyx_v_alignment);
+
+ /* "View.MemoryView":307
+ * offset = aligned_p % alignment
+ *
+ * if offset > 0: # <<<<<<<<<<<<<<
+ * aligned_p += alignment - offset
+ *
+ */
+ __pyx_t_1 = ((__pyx_v_offset > 0) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":308
+ *
+ * if offset > 0:
+ * aligned_p += alignment - offset # <<<<<<<<<<<<<<
+ *
+ * return aligned_p
+ */
+ __pyx_v_aligned_p = (__pyx_v_aligned_p + (__pyx_v_alignment - __pyx_v_offset));
+
+ /* "View.MemoryView":307
+ * offset = aligned_p % alignment
+ *
+ * if offset > 0: # <<<<<<<<<<<<<<
+ * aligned_p += alignment - offset
+ *
+ */
+ }
+
+ /* "View.MemoryView":310
+ * aligned_p += alignment - offset
+ *
+ * return aligned_p # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = ((void *)__pyx_v_aligned_p);
+ goto __pyx_L0;
+
+ /* "View.MemoryView":299
+ *
+ * @cname('__pyx_align_pointer')
+ * cdef void *align_pointer(void *memory, size_t alignment) nogil: # <<<<<<<<<<<<<<
+ * "Align pointer memory on a given boundary"
+ * cdef Py_intptr_t aligned_p = memory
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "View.MemoryView":346
+ * cdef __Pyx_TypeInfo *typeinfo
+ *
+ * def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False): # <<<<<<<<<<<<<<
+ * self.obj = obj
+ * self.flags = flags
+ */
+
+/* Python wrapper */
+static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_obj = 0;
+ int __pyx_v_flags;
+ int __pyx_v_dtype_is_object;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_obj,&__pyx_n_s_flags,&__pyx_n_s_dtype_is_object,0};
+ PyObject* values[3] = {0,0,0};
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ CYTHON_FALLTHROUGH;
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ CYTHON_FALLTHROUGH;
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_obj)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ CYTHON_FALLTHROUGH;
+ case 1:
+ if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_flags)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, 1); __PYX_ERR(2, 346, __pyx_L3_error)
+ }
+ CYTHON_FALLTHROUGH;
+ case 2:
+ if (kw_args > 0) {
+ PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_dtype_is_object);
+ if (value) { values[2] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(2, 346, __pyx_L3_error)
+ }
+ } else {
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ CYTHON_FALLTHROUGH;
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ }
+ __pyx_v_obj = values[0];
+ __pyx_v_flags = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_flags == (int)-1) && PyErr_Occurred())) __PYX_ERR(2, 346, __pyx_L3_error)
+ if (values[2]) {
+ __pyx_v_dtype_is_object = __Pyx_PyObject_IsTrue(values[2]); if (unlikely((__pyx_v_dtype_is_object == (int)-1) && PyErr_Occurred())) __PYX_ERR(2, 346, __pyx_L3_error)
+ } else {
+ __pyx_v_dtype_is_object = ((int)0);
+ }
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(2, 346, __pyx_L3_error)
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("View.MemoryView.memoryview.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return -1;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(((struct __pyx_memoryview_obj *)__pyx_v_self), __pyx_v_obj, __pyx_v_flags, __pyx_v_dtype_is_object);
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_obj, int __pyx_v_flags, int __pyx_v_dtype_is_object) {
+ int __pyx_r;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ int __pyx_t_2;
+ int __pyx_t_3;
+ int __pyx_t_4;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("__cinit__", 0);
+
+ /* "View.MemoryView":347
+ *
+ * def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):
+ * self.obj = obj # <<<<<<<<<<<<<<
+ * self.flags = flags
+ * if type(self) is memoryview or obj is not None:
+ */
+ __Pyx_INCREF(__pyx_v_obj);
+ __Pyx_GIVEREF(__pyx_v_obj);
+ __Pyx_GOTREF(__pyx_v_self->obj);
+ __Pyx_DECREF(__pyx_v_self->obj);
+ __pyx_v_self->obj = __pyx_v_obj;
+
+ /* "View.MemoryView":348
+ * def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False):
+ * self.obj = obj
+ * self.flags = flags # <<<<<<<<<<<<<<
+ * if type(self) is memoryview or obj is not None:
+ * __Pyx_GetBuffer(obj, &self.view, flags)
+ */
+ __pyx_v_self->flags = __pyx_v_flags;
+
+ /* "View.MemoryView":349
+ * self.obj = obj
+ * self.flags = flags
+ * if type(self) is memoryview or obj is not None: # <<<<<<<<<<<<<<
+ * __Pyx_GetBuffer(obj, &self.view, flags)
+ * if self.view.obj == NULL:
+ */
+ __pyx_t_2 = (((PyObject *)Py_TYPE(((PyObject *)__pyx_v_self))) == ((PyObject *)__pyx_memoryview_type));
+ __pyx_t_3 = (__pyx_t_2 != 0);
+ if (!__pyx_t_3) {
+ } else {
+ __pyx_t_1 = __pyx_t_3;
+ goto __pyx_L4_bool_binop_done;
+ }
+ __pyx_t_3 = (__pyx_v_obj != Py_None);
+ __pyx_t_2 = (__pyx_t_3 != 0);
+ __pyx_t_1 = __pyx_t_2;
+ __pyx_L4_bool_binop_done:;
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":350
+ * self.flags = flags
+ * if type(self) is memoryview or obj is not None:
+ * __Pyx_GetBuffer(obj, &self.view, flags) # <<<<<<<<<<<<<<
+ * if self.view.obj == NULL:
+ * (<__pyx_buffer *> &self.view).obj = Py_None
+ */
+ __pyx_t_4 = __Pyx_GetBuffer(__pyx_v_obj, (&__pyx_v_self->view), __pyx_v_flags); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 350, __pyx_L1_error)
+
+ /* "View.MemoryView":351
+ * if type(self) is memoryview or obj is not None:
+ * __Pyx_GetBuffer(obj, &self.view, flags)
+ * if self.view.obj == NULL: # <<<<<<<<<<<<<<
+ * (<__pyx_buffer *> &self.view).obj = Py_None
+ * Py_INCREF(Py_None)
+ */
+ __pyx_t_1 = ((((PyObject *)__pyx_v_self->view.obj) == NULL) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":352
+ * __Pyx_GetBuffer(obj, &self.view, flags)
+ * if self.view.obj == NULL:
+ * (<__pyx_buffer *> &self.view).obj = Py_None # <<<<<<<<<<<<<<
+ * Py_INCREF(Py_None)
+ *
+ */
+ ((Py_buffer *)(&__pyx_v_self->view))->obj = Py_None;
+
+ /* "View.MemoryView":353
+ * if self.view.obj == NULL:
+ * (<__pyx_buffer *> &self.view).obj = Py_None
+ * Py_INCREF(Py_None) # <<<<<<<<<<<<<<
+ *
+ * if not __PYX_CYTHON_ATOMICS_ENABLED():
+ */
+ Py_INCREF(Py_None);
+
+ /* "View.MemoryView":351
+ * if type(self) is memoryview or obj is not None:
+ * __Pyx_GetBuffer(obj, &self.view, flags)
+ * if self.view.obj == NULL: # <<<<<<<<<<<<<<
+ * (<__pyx_buffer *> &self.view).obj = Py_None
+ * Py_INCREF(Py_None)
+ */
+ }
+
+ /* "View.MemoryView":349
+ * self.obj = obj
+ * self.flags = flags
+ * if type(self) is memoryview or obj is not None: # <<<<<<<<<<<<<<
+ * __Pyx_GetBuffer(obj, &self.view, flags)
+ * if self.view.obj == NULL:
+ */
+ }
+
+ /* "View.MemoryView":355
+ * Py_INCREF(Py_None)
+ *
+ * if not __PYX_CYTHON_ATOMICS_ENABLED(): # <<<<<<<<<<<<<<
+ * global __pyx_memoryview_thread_locks_used
+ * if __pyx_memoryview_thread_locks_used < THREAD_LOCKS_PREALLOCATED:
+ */
+ __pyx_t_1 = ((!(__PYX_CYTHON_ATOMICS_ENABLED() != 0)) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":357
+ * if not __PYX_CYTHON_ATOMICS_ENABLED():
+ * global __pyx_memoryview_thread_locks_used
+ * if __pyx_memoryview_thread_locks_used < THREAD_LOCKS_PREALLOCATED: # <<<<<<<<<<<<<<
+ * self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ * __pyx_memoryview_thread_locks_used += 1
+ */
+ __pyx_t_1 = ((__pyx_memoryview_thread_locks_used < 8) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":358
+ * global __pyx_memoryview_thread_locks_used
+ * if __pyx_memoryview_thread_locks_used < THREAD_LOCKS_PREALLOCATED:
+ * self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] # <<<<<<<<<<<<<<
+ * __pyx_memoryview_thread_locks_used += 1
+ * if self.lock is NULL:
+ */
+ __pyx_v_self->lock = (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]);
+
+ /* "View.MemoryView":359
+ * if __pyx_memoryview_thread_locks_used < THREAD_LOCKS_PREALLOCATED:
+ * self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ * __pyx_memoryview_thread_locks_used += 1 # <<<<<<<<<<<<<<
+ * if self.lock is NULL:
+ * self.lock = PyThread_allocate_lock()
+ */
+ __pyx_memoryview_thread_locks_used = (__pyx_memoryview_thread_locks_used + 1);
+
+ /* "View.MemoryView":357
+ * if not __PYX_CYTHON_ATOMICS_ENABLED():
+ * global __pyx_memoryview_thread_locks_used
+ * if __pyx_memoryview_thread_locks_used < THREAD_LOCKS_PREALLOCATED: # <<<<<<<<<<<<<<
+ * self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ * __pyx_memoryview_thread_locks_used += 1
+ */
+ }
+
+ /* "View.MemoryView":360
+ * self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ * __pyx_memoryview_thread_locks_used += 1
+ * if self.lock is NULL: # <<<<<<<<<<<<<<
+ * self.lock = PyThread_allocate_lock()
+ * if self.lock is NULL:
+ */
+ __pyx_t_1 = ((__pyx_v_self->lock == NULL) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":361
+ * __pyx_memoryview_thread_locks_used += 1
+ * if self.lock is NULL:
+ * self.lock = PyThread_allocate_lock() # <<<<<<<<<<<<<<
+ * if self.lock is NULL:
+ * raise MemoryError
+ */
+ __pyx_v_self->lock = PyThread_allocate_lock();
+
+ /* "View.MemoryView":362
+ * if self.lock is NULL:
+ * self.lock = PyThread_allocate_lock()
+ * if self.lock is NULL: # <<<<<<<<<<<<<<
+ * raise MemoryError
+ *
+ */
+ __pyx_t_1 = ((__pyx_v_self->lock == NULL) != 0);
+ if (unlikely(__pyx_t_1)) {
+
+ /* "View.MemoryView":363
+ * self.lock = PyThread_allocate_lock()
+ * if self.lock is NULL:
+ * raise MemoryError # <<<<<<<<<<<<<<
+ *
+ * if flags & PyBUF_FORMAT:
+ */
+ PyErr_NoMemory(); __PYX_ERR(2, 363, __pyx_L1_error)
+
+ /* "View.MemoryView":362
+ * if self.lock is NULL:
+ * self.lock = PyThread_allocate_lock()
+ * if self.lock is NULL: # <<<<<<<<<<<<<<
+ * raise MemoryError
+ *
+ */
+ }
+
+ /* "View.MemoryView":360
+ * self.lock = __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]
+ * __pyx_memoryview_thread_locks_used += 1
+ * if self.lock is NULL: # <<<<<<<<<<<<<<
+ * self.lock = PyThread_allocate_lock()
+ * if self.lock is NULL:
+ */
+ }
+
+ /* "View.MemoryView":355
+ * Py_INCREF(Py_None)
+ *
+ * if not __PYX_CYTHON_ATOMICS_ENABLED(): # <<<<<<<<<<<<<<
+ * global __pyx_memoryview_thread_locks_used
+ * if __pyx_memoryview_thread_locks_used < THREAD_LOCKS_PREALLOCATED:
+ */
+ }
+
+ /* "View.MemoryView":365
+ * raise MemoryError
+ *
+ * if flags & PyBUF_FORMAT: # <<<<<<<<<<<<<<
+ * self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ * else:
+ */
+ __pyx_t_1 = ((__pyx_v_flags & PyBUF_FORMAT) != 0);
+ if (__pyx_t_1) {
+
+ /* "View.MemoryView":366
+ *
+ * if flags & PyBUF_FORMAT:
+ * self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0') # <<<<<<<<<<<<<<
+ * else:
+ * self.dtype_is_object = dtype_is_object
+ */
+ __pyx_t_2 = (((__pyx_v_self->view.format[0]) == 'O') != 0);
+ if (__pyx_t_2) {
+ } else {
+ __pyx_t_1 = __pyx_t_2;
+ goto __pyx_L12_bool_binop_done;
+ }
+ __pyx_t_2 = (((__pyx_v_self->view.format[1]) == '\x00') != 0);
+ __pyx_t_1 = __pyx_t_2;
+ __pyx_L12_bool_binop_done:;
+ __pyx_v_self->dtype_is_object = __pyx_t_1;
+
+ /* "View.MemoryView":365
+ * raise MemoryError
+ *
+ * if flags & PyBUF_FORMAT: # <<<<<<<<<<<<<<
+ * self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ * else:
+ */
+ goto __pyx_L11;
+ }
+
+ /* "View.MemoryView":368
+ * self.dtype_is_object = (self.view.format[0] == b'O' and self.view.format[1] == b'\0')
+ * else:
+ * self.dtype_is_object = dtype_is_object # <<<<<<<<<<<<<<
+ *
+ * self.acquisition_count_aligned_p = <__pyx_atomic_int *> align_pointer(
+ */
+ /*else*/ {
+ __pyx_v_self->dtype_is_object = __pyx_v_dtype_is_object;
+ }
+ __pyx_L11:;
+
+ /* "View.MemoryView":370
+ * self.dtype_is_object = dtype_is_object
+ *
+ * self.acquisition_count_aligned_p = <__pyx_atomic_int *> align_pointer( # <<<<<<<<<<<<<<
+ * &self.acquisition_count[0], sizeof(__pyx_atomic_int))
+ * self.typeinfo = NULL
+ */
+ __pyx_v_self->acquisition_count_aligned_p = ((__pyx_atomic_int *)__pyx_align_pointer(((void *)(&(__pyx_v_self->acquisition_count[0]))), (sizeof(__pyx_atomic_int))));
+
+ /* "View.MemoryView":372
+ * self.acquisition_count_aligned_p = <__pyx_atomic_int *> align_pointer(
+ * &self.acquisition_count[0], sizeof(__pyx_atomic_int))
+ * self.typeinfo = NULL # <<<<<<<<<<<<<<
+ *
+ * def __dealloc__(memoryview self):
+ */
+ __pyx_v_self->typeinfo = NULL;
+
+ /* "View.MemoryView":346
+ * cdef __Pyx_TypeInfo *typeinfo
+ *
+ * def __cinit__(memoryview self, object obj, int flags, bint dtype_is_object=False): # <<<<<<<<<<<<<<
+ * self.obj = obj
+ * self.flags = flags
+ */
+
+ /* function exit code */
+ __pyx_r = 0;
+ goto __pyx_L0;
+ __pyx_L1_error:;
+ __Pyx_AddTraceback("View.MemoryView.memoryview.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __pyx_r = -1;
+ __pyx_L0:;
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
+
+/* "View.MemoryView":374
+ * self.typeinfo = NULL
+ *
+ * def __dealloc__(memoryview self): # <<<<<<<<<<<<<<
+ * if self.obj is not None:
+ * __Pyx_ReleaseBuffer(&self.view)
+ */
+
+/* Python wrapper */
+static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self); /*proto*/
+static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self) {
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0);
+ __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(((struct __pyx_memoryview_obj *)__pyx_v_self));
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__(struct __pyx_memoryview_obj *__pyx_v_self) {
+ int __pyx_v_i;
+ __Pyx_RefNannyDeclarations
+ int __pyx_t_1;
+ int __pyx_t_2;
+ int __pyx_t_3;
+ int __pyx_t_4;
+ int __pyx_t_5;
+ PyThread_type_lock __pyx_t_6;
+ PyThread_type_lock __pyx_t_7;
+ __Pyx_RefNannySetupContext("__dealloc__", 0);
+
+ /* "View.MemoryView":375
+ *
+ * def __dealloc__(memoryview self):
+ * if self.obj is not None: # <<<<<<<<<<<<<<
+ * __Pyx_ReleaseBuffer(&self.view)
+ * elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ */
+ __pyx_t_1 = (__pyx_v_self->obj != Py_None);
+ __pyx_t_2 = (__pyx_t_1 != 0);
+ if (__pyx_t_2) {
+
+ /* "View.MemoryView":376
+ * def __dealloc__(memoryview self):
+ * if self.obj is not None:
+ * __Pyx_ReleaseBuffer(&self.view) # <<<<<<<<<<<<<<
+ * elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ *
+ */
+ __Pyx_ReleaseBuffer((&__pyx_v_self->view));
+
+ /* "View.MemoryView":375
+ *
+ * def __dealloc__(memoryview self):
+ * if self.obj is not None: # <<<<<<<<<<<<<<
+ * __Pyx_ReleaseBuffer(&self.view)
+ * elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ */
+ goto __pyx_L3;
+ }
+
+ /* "View.MemoryView":377
+ * if self.obj is not None:
+ * __Pyx_ReleaseBuffer(&self.view)
+ * elif (<__pyx_buffer *> &self.view).obj == Py_None: # <<<<<<<<<<<<<<
+ *
+ * (<__pyx_buffer *> &self.view).obj = NULL
+ */
+ __pyx_t_2 = ((((Py_buffer *)(&__pyx_v_self->view))->obj == Py_None) != 0);
+ if (__pyx_t_2) {
+
+ /* "View.MemoryView":379
+ * elif (<__pyx_buffer *> &self.view).obj == Py_None:
+ *
+ * (<__pyx_buffer *> &self.view).obj = NULL # <<<<<<<<<<<<<<
+ * Py_DECREF(Py_None)
+ *
+ */
+ ((Py_buffer *)(&__pyx_v_self->view))->obj = NULL;
+
+ /* "View.MemoryView":380
+ *
+ * (<__pyx_buffer *> &self.view).obj = NULL
+ * Py_DECREF(Py_None) # <<<<<<<<<<<<<<
+ *
+ * cdef int i
+ */
+ Py_DECREF(Py_None);
+
+ /* "View.MemoryView":377
+ * if self.obj is not None:
+ * __Pyx_ReleaseBuffer(&self.view)
+ * elif (<__pyx_buffer *> &self.view).obj == Py_None: # <<<<<<<<<<<<<<
+ *
+ * (<__pyx_buffer *> &self.view).obj = NULL
+ */
+ }
+ __pyx_L3:;
+
+ /* "View.MemoryView":384
+ * cdef int i
+ * global __pyx_memoryview_thread_locks_used
+ * if self.lock != NULL: # <<<<<<<<<<<<<<
+ * for i in range(__pyx_memoryview_thread_locks_used):
+ * if __pyx_memoryview_thread_locks[i] is self.lock:
+ */
+ __pyx_t_2 = ((__pyx_v_self->lock != NULL) != 0);
+ if (__pyx_t_2) {
+
+ /* "View.MemoryView":385
+ * global __pyx_memoryview_thread_locks_used
+ * if self.lock != NULL:
+ * for i in range(__pyx_memoryview_thread_locks_used): # <<<<<<<<<<<<<<
+ * if __pyx_memoryview_thread_locks[i] is self.lock:
+ * __pyx_memoryview_thread_locks_used -= 1
+ */
+ __pyx_t_3 = __pyx_memoryview_thread_locks_used;
+ __pyx_t_4 = __pyx_t_3;
+ for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) {
+ __pyx_v_i = __pyx_t_5;
+
+ /* "View.MemoryView":386
+ * if self.lock != NULL:
+ * for i in range(__pyx_memoryview_thread_locks_used):
+ * if __pyx_memoryview_thread_locks[i] is self.lock: # <<<<<<<<<<<<<<
+ * __pyx_memoryview_thread_locks_used -= 1
+ * if i != __pyx_memoryview_thread_locks_used:
+ */
+ __pyx_t_2 = (((__pyx_memoryview_thread_locks[__pyx_v_i]) == __pyx_v_self->lock) != 0);
+ if (__pyx_t_2) {
+
+ /* "View.MemoryView":387
+ * for i in range(__pyx_memoryview_thread_locks_used):
+ * if __pyx_memoryview_thread_locks[i] is self.lock:
+ * __pyx_memoryview_thread_locks_used -= 1 # <<<<<<<<<<<<<<
+ * if i != __pyx_memoryview_thread_locks_used:
+ * __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ */
+ __pyx_memoryview_thread_locks_used = (__pyx_memoryview_thread_locks_used - 1);
+
+ /* "View.MemoryView":388
+ * if __pyx_memoryview_thread_locks[i] is self.lock:
+ * __pyx_memoryview_thread_locks_used -= 1
+ * if i != __pyx_memoryview_thread_locks_used: # <<<<<<<<<<<<<<
+ * __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ * __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ */
+ __pyx_t_2 = ((__pyx_v_i != __pyx_memoryview_thread_locks_used) != 0);
+ if (__pyx_t_2) {
+
+ /* "View.MemoryView":390
+ * if i != __pyx_memoryview_thread_locks_used:
+ * __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ * __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i]) # <<<<<<<<<<<<<<
+ * break
+ * else:
+ */
+ __pyx_t_6 = (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]);
+ __pyx_t_7 = (__pyx_memoryview_thread_locks[__pyx_v_i]);
+
+ /* "View.MemoryView":389
+ * __pyx_memoryview_thread_locks_used -= 1
+ * if i != __pyx_memoryview_thread_locks_used:
+ * __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = ( # <<<<<<<<<<<<<<
+ * __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ * break
+ */
+ (__pyx_memoryview_thread_locks[__pyx_v_i]) = __pyx_t_6;
+ (__pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used]) = __pyx_t_7;
+
+ /* "View.MemoryView":388
+ * if __pyx_memoryview_thread_locks[i] is self.lock:
+ * __pyx_memoryview_thread_locks_used -= 1
+ * if i != __pyx_memoryview_thread_locks_used: # <<<<<<<<<<<<<<
+ * __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ * __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ */
+ }
+
+ /* "View.MemoryView":391
+ * __pyx_memoryview_thread_locks[i], __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used] = (
+ * __pyx_memoryview_thread_locks[__pyx_memoryview_thread_locks_used], __pyx_memoryview_thread_locks[i])
+ * break # <<<<<<<<<<<<<<
+ * else:
+ * PyThread_free_lock(self.lock)
+ */
+ goto __pyx_L6_break;
+
+ /* "View.MemoryView":386
+ * if self.lock != NULL:
+ * for i in range(__pyx_memoryview_thread_locks_used):
+ * if __pyx_memoryview_thread_locks[i] is self.lock: # <<<<<<<<<<<<<<
+ * __pyx_memoryview_thread_locks_used -= 1
+ * if i != __pyx_memoryview_thread_locks_used:
+ */
+ }
+ }
+ /*else*/ {
+
+ /* "View.MemoryView":393
+ * break
+ * else:
+ * PyThread_free_lock(self.lock) # <<<<<<<<<<<<<<
+ *
+ * cdef char *get_item_pointer(memoryview self, object index) except NULL:
+ */
+ PyThread_free_lock(__pyx_v_self->lock);
+ }
+ __pyx_L6_break:;
+
+ /* "View.MemoryView":384
+ * cdef int i
+ * global __pyx_memoryview_thread_locks_used
+ * if self.lock != NULL: # <<<<<<<<<<<<<<
+ * for i in range(__pyx_memoryview_thread_locks_used):
+ * if __pyx_memoryview_thread_locks[i] is self.lock:
+ */
+ }
+
+ /* "View.MemoryView":374
+ * self.typeinfo = NULL
+ *
+ * def __dealloc__(memoryview self): # <<<<<<<<<<<<<<
+ * if self.obj is not None:
+ * __Pyx_ReleaseBuffer(&self.view)
+ */
+
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+}
+
+/* "View.MemoryView":395
+ * PyThread_free_lock(self.lock)
+ *
+ * cdef char *get_item_pointer(memoryview self, object index) except NULL: # <<<<<<<<<<<<<<
+ * cdef Py_ssize_t dim
+ * cdef char *itemp = self.view.buf
+ */
+
+static char *__pyx_memoryview_get_item_pointer(struct __pyx_memoryview_obj *__pyx_v_self, PyObject *__pyx_v_index) {
+ Py_ssize_t __pyx_v_dim;
+ char *__pyx_v_itemp;
+ PyObject *__pyx_v_idx = NULL;
+ char *__pyx_r;
+ __Pyx_RefNannyDeclarations
+ Py_ssize_t __pyx_t_1;
+ PyObject *__pyx_t_2 = NULL;
+ Py_ssize_t __pyx_t_3;
+ PyObject *(*__pyx_t_4)(PyObject *);
+ PyObject *__pyx_t_5 = NULL;
+ Py_ssize_t __pyx_t_6;
+ char *__pyx_t_7;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("get_item_pointer", 0);
+
+ /* "View.MemoryView":397
+ * cdef char *get_item_pointer(memoryview self, object index) except NULL:
+ * cdef Py_ssize_t dim
+ * cdef char *itemp = self.view.buf # <<<<<<<<<<<<<<
+ *
+ * for dim, idx in enumerate(index):
+ */
+ __pyx_v_itemp = ((char *)__pyx_v_self->view.buf);
+
+ /* "View.MemoryView":399
+ * cdef char *itemp = self.view.buf
+ *
+ * for dim, idx in enumerate(index): # <<<<<<<<<<<<<<
+ * itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ *
+ */
+ __pyx_t_1 = 0;
+ if (likely(PyList_CheckExact(__pyx_v_index)) || PyTuple_CheckExact(__pyx_v_index)) {
+ __pyx_t_2 = __pyx_v_index; __Pyx_INCREF(__pyx_t_2); __pyx_t_3 = 0;
+ __pyx_t_4 = NULL;
+ } else {
+ __pyx_t_3 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_index); if (unlikely(!__pyx_t_2)) __PYX_ERR(2, 399, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_2);
+ __pyx_t_4 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 399, __pyx_L1_error)
+ }
+ for (;;) {
+ if (likely(!__pyx_t_4)) {
+ if (likely(PyList_CheckExact(__pyx_t_2))) {
+ if (__pyx_t_3 >= PyList_GET_SIZE(__pyx_t_2)) break;
+ #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+ __pyx_t_5 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_5); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(2, 399, __pyx_L1_error)
+ #else
+ __pyx_t_5 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 399, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ #endif
+ } else {
+ if (__pyx_t_3 >= PyTuple_GET_SIZE(__pyx_t_2)) break;
+ #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+ __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_3); __Pyx_INCREF(__pyx_t_5); __pyx_t_3++; if (unlikely(0 < 0)) __PYX_ERR(2, 399, __pyx_L1_error)
+ #else
+ __pyx_t_5 = PySequence_ITEM(__pyx_t_2, __pyx_t_3); __pyx_t_3++; if (unlikely(!__pyx_t_5)) __PYX_ERR(2, 399, __pyx_L1_error)
+ __Pyx_GOTREF(__pyx_t_5);
+ #endif
+ }
+ } else {
+ __pyx_t_5 = __pyx_t_4(__pyx_t_2);
+ if (unlikely(!__pyx_t_5)) {
+ PyObject* exc_type = PyErr_Occurred();
+ if (exc_type) {
+ if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
+ else __PYX_ERR(2, 399, __pyx_L1_error)
+ }
+ break;
+ }
+ __Pyx_GOTREF(__pyx_t_5);
+ }
+ __Pyx_XDECREF_SET(__pyx_v_idx, __pyx_t_5);
+ __pyx_t_5 = 0;
+ __pyx_v_dim = __pyx_t_1;
+ __pyx_t_1 = (__pyx_t_1 + 1);
+
+ /* "View.MemoryView":400
+ *
+ * for dim, idx in enumerate(index):
+ * itemp = pybuffer_index(&self.view, itemp, idx, dim) # <<<<<<<<<<<<<<
+ *
+ * return itemp
+ */
+ __pyx_t_6 = __Pyx_PyIndex_AsSsize_t(__pyx_v_idx); if (unlikely((__pyx_t_6 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(2, 400, __pyx_L1_error)
+ __pyx_t_7 = __pyx_pybuffer_index((&__pyx_v_self->view), __pyx_v_itemp, __pyx_t_6, __pyx_v_dim); if (unlikely(__pyx_t_7 == ((char *)NULL))) __PYX_ERR(2, 400, __pyx_L1_error)
+ __pyx_v_itemp = __pyx_t_7;
+
+ /* "View.MemoryView":399
+ * cdef char *itemp = self.view.buf
+ *
+ * for dim, idx in enumerate(index): # <<<<<<<<<<<<<<
+ * itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ *
+ */
+ }
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+ /* "View.MemoryView":402
+ * itemp = pybuffer_index(&self.view, itemp, idx, dim)
+ *
+ * return itemp # <<<<<<<<<<<<<<
+ *
+ *
+ */
+ __pyx_r = __pyx_v_itemp;
+ goto __pyx_L0;
+
+ /* "View.MemoryView":395
+ * PyThread_free_lock(self.lock)
+ *
+ * cdef char *get_item_pointer(memoryview self, object index) except NULL: # <<<<<<<<<<<<<<
+ * cdef Py_ssize_t dim
+ * cdef char *itemp =